diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py index 79e5a3dff4..59e23a74ba 100644 --- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py +++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py @@ -1,9 +1,7 @@ -import json from logging import Logger from typing import Any from flask import current_app as app -from json_repair import repair_json from unstract.core.flask.exceptions import APIError from unstract.prompt_service.constants import ExecutionSource, FileStorageKeys, RunLevel @@ -11,6 +9,9 @@ from unstract.prompt_service.exceptions import RateLimitError from unstract.prompt_service.helpers.plugin import PluginManager from unstract.prompt_service.utils.env_loader import get_env_or_die +from unstract.prompt_service.utils.json_repair_helper import ( + repair_json_with_best_structure, +) from unstract.prompt_service.utils.log import publish_log from unstract.sdk.constants import LogLevel from unstract.sdk.exceptions import RateLimitError as SdkRateLimitError @@ -245,20 +246,8 @@ def handle_json( if answer.lower() == "na": structured_output[prompt_key] = None else: - # Attempt parsing as-is (could be a valid object, array, or partial JSON) - a = repair_json(json_str=answer, return_objects=True, ensure_ascii=False) - - # Attempt parsing with array wrap (useful for multiple comma-separated objects like {}, {}, {}) - b = repair_json( - json_str="[" + answer, return_objects=True, ensure_ascii=False - ) - - # Heuristic: if wrapping only added '[' and ']', len(b) - len(a) >= 2 → original was valid, use 'a' - # Otherwise, fallback to 'b' which likely fixed multiple items or invalid top-level structure - dump_a = json.dumps(a, ensure_ascii=False) - dump_b = json.dumps(b, ensure_ascii=False) - ARRAY_WRAP_DELTA = 2 # '[' and ']' - parsed_data = a if len(dump_b) - len(dump_a) >= ARRAY_WRAP_DELTA else b + # Use the utility function 
"""JSON repair utility functions."""

from typing import Any

from json_repair import repair_json


def repair_json_with_best_structure(json_str: str) -> Any:
    """Repair a JSON string and pick the better of two candidate parses.

    ``repair_json`` is run twice: once on ``json_str`` unchanged and once on
    ``json_str`` with a ``[`` prepended (the wrapped form is intended for
    input such as ``{...}, {...}`` where several top-level values are
    separated by commas). The candidate to return is chosen from the types
    and shapes of the two results, not from their serialized lengths.

    Selection rules, applied in order:

    1. If the unwrapped result is a string, return the wrapped result unless
       it is a string too, in which case return the unwrapped one.
    2. If only the wrapped result is a string, return the unwrapped result.
    3. If the wrapped result is a one-element list whose element equals the
       unwrapped result, return the unwrapped result (the wrap added nothing).
    4. If the unwrapped result is a dict or list, return the wrapped result
       only when it is a list with more than one element; otherwise return
       the unwrapped result.
    5. In every remaining case return the wrapped result.

    Args:
        json_str: The JSON (or JSON-like) text to repair.

    Returns:
        Whichever of the two repaired objects the rules above select.
    """
    direct = repair_json(json_str=json_str, return_objects=True, ensure_ascii=False)
    wrapped = repair_json(
        json_str="[" + json_str, return_objects=True, ensure_ascii=False
    )

    direct_is_str = isinstance(direct, str)
    wrapped_is_str = isinstance(wrapped, str)

    # Rules 1 and 2: a non-string candidate always wins over a string one;
    # when both are strings the unwrapped candidate is returned.
    if direct_is_str:
        return direct if wrapped_is_str else wrapped
    if wrapped_is_str:
        return direct

    wrapped_is_list = isinstance(wrapped, list)

    # Rule 3: the wrap merely boxed the same value in a one-element list.
    if wrapped_is_list and len(wrapped) == 1 and wrapped[0] == direct:
        return direct

    # Rule 5: the unwrapped candidate is neither a dict nor a list.
    if not isinstance(direct, (dict, list)):
        return wrapped

    # Rule 4: prefer the wrapped list only when it carries several elements,
    # e.g. input shaped like ``{}, {}, {}``.
    if wrapped_is_list and len(wrapped) > 1:
        return wrapped
    return direct