Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
import json
from logging import Logger
from typing import Any

from flask import current_app as app
from json_repair import repair_json

from unstract.core.flask.exceptions import APIError
from unstract.prompt_service.constants import ExecutionSource, FileStorageKeys, RunLevel
from unstract.prompt_service.constants import PromptServiceConstants as PSKeys
from unstract.prompt_service.exceptions import RateLimitError
from unstract.prompt_service.helpers.plugin import PluginManager
from unstract.prompt_service.utils.env_loader import get_env_or_die
from unstract.prompt_service.utils.json_repair_helper import (
repair_json_with_best_structure,
)
from unstract.prompt_service.utils.log import publish_log
from unstract.sdk.constants import LogLevel
from unstract.sdk.exceptions import RateLimitError as SdkRateLimitError
Expand Down Expand Up @@ -245,20 +246,8 @@ def handle_json(
if answer.lower() == "na":
structured_output[prompt_key] = None
else:
# Attempt parsing as-is (could be a valid object, array, or partial JSON)
a = repair_json(json_str=answer, return_objects=True, ensure_ascii=False)

# Attempt parsing with array wrap (useful for multiple comma-separated objects like {}, {}, {})
b = repair_json(
json_str="[" + answer, return_objects=True, ensure_ascii=False
)

# Heuristic: if wrapping only added '[' and ']', len(b) - len(a) >= 2 → original was valid, use 'a'
# Otherwise, fallback to 'b' which likely fixed multiple items or invalid top-level structure
dump_a = json.dumps(a, ensure_ascii=False)
dump_b = json.dumps(b, ensure_ascii=False)
ARRAY_WRAP_DELTA = 2 # '[' and ']'
parsed_data = a if len(dump_b) - len(dump_a) >= ARRAY_WRAP_DELTA else b
# Use the utility function to repair JSON with the best structure
parsed_data = repair_json_with_best_structure(answer)

if isinstance(parsed_data, str):
err_msg = "Error parsing response (to json)\n" f"Candidate JSON: {answer}"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""JSON repair utility functions."""

from typing import Any

from json_repair import repair_json


def repair_json_with_best_structure(json_str: str) -> Any:
    """Repair a JSON string and pick the better of two candidate parses.

    Two candidates are produced with ``repair_json``:

    * the input exactly as given, and
    * the input prefixed with ``"["`` so that a run of comma-separated
      top-level values (e.g. ``{}, {}, {}``) can be collected into one list.

    The choice between them is made from the shape of the parsed results,
    not from the length of their serialized form:

    1. If either candidate is a ``str``, the non-string candidate wins; if
       both are strings, the as-is candidate is returned.
    2. If the wrapped candidate is a one-element list whose element equals
       the as-is candidate, the wrap added nothing and the as-is candidate
       is returned.
    3. If the as-is candidate is a ``dict`` or ``list``, it is returned
       unless the wrapped candidate is a list with more than one element,
       in which case the wrapped list is returned.
    4. Otherwise the wrapped candidate is returned.

    Args:
        json_str: The JSON string to repair.

    Returns:
        The parsed candidate selected by the rules above.
    """
    plain = repair_json(json_str=json_str, return_objects=True, ensure_ascii=False)
    wrapped = repair_json(
        json_str="[" + json_str, return_objects=True, ensure_ascii=False
    )

    plain_is_text = isinstance(plain, str)
    wrapped_is_text = isinstance(wrapped, str)
    if plain_is_text or wrapped_is_text:
        # Only when the as-is parse alone is a string does the wrapped parse
        # win; both-strings and wrapped-only-string fall back to as-is.
        if plain_is_text and not wrapped_is_text:
            return wrapped
        return plain

    # From here on neither candidate is a string.
    wrapped_is_list = isinstance(wrapped, list)

    # The wrap merely boxed the same value: keep the unboxed original.
    if wrapped_is_list and len(wrapped) == 1 and wrapped[0] == plain:
        return plain

    # A non-container as-is parse has no structure worth preferring.
    if not isinstance(plain, (dict, list)):
        return wrapped

    # Several top-level items appeared only after wrapping, so the input
    # likely held multiple comma-separated values.
    if wrapped_is_list and len(wrapped) > 1:
        return wrapped

    return plain