Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1347,6 +1347,7 @@ def dynamic_extractor(
document_id=document_id,
profile_manager=profile_manager,
doc_id=doc_id,
enable_highlight=enable_highlight,
)
if is_extracted:
fs_instance = EnvHelper.get_storage(
Expand Down Expand Up @@ -1388,6 +1389,7 @@ def dynamic_extractor(
document_id=document_id,
profile_manager=profile_manager,
doc_id=doc_id,
enable_highlight=enable_highlight,
)
except SdkError as e:
msg = str(e)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ class IndexManager(BaseModel):
null=False,
blank=False,
)
# TODO: Consider making this an enum instead
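# Nested JSON keyed by doc_id, of the form
# {doc_id: {"extracted": True, "enable_highlight": <bool>}}.
# Rows written before highlight tracking may still hold a bare boolean
# ({doc_id: True}); readers must accept both shapes.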
Comment thread
chandrasekharan-zipstack marked this conversation as resolved.
extraction_status = models.JSONField(
db_comment="Extraction status for documents",
null=False,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,18 @@ def handle_index_manager(

@staticmethod
def mark_extraction_status(
document_id: str, profile_manager: ProfileManager, doc_id: str
document_id: str,
profile_manager: ProfileManager,
doc_id: str,
enable_highlight: bool = False,
) -> bool:
"""Marks the extraction status for a given document.
"""Marks the extraction status for a given document with highlight metadata.

Args:
document_id (str): ID of the document in DocumentManager.
profile_manager (ProfileManager): ProfileManager instance for context.
doc_id (str): Unique identifier for the document within extraction status.
enable_highlight (bool): Whether highlight metadata was used during extraction.

Returns:
bool: True if the status is successfully updated, False otherwise.
Expand All @@ -95,7 +99,10 @@ def mark_extraction_status(

index_manager.extraction_status = index_manager.extraction_status or {}

index_manager.extraction_status[doc_id] = True
index_manager.extraction_status[doc_id] = {
"extracted": True,
"enable_highlight": enable_highlight,
}
logger.info(
f"Index manager {index_manager} {index_manager.index_ids_history}"
)
Expand All @@ -104,12 +111,14 @@ def mark_extraction_status(
if created:
logger.info(
f"IndexManager entry created "
f"for document: {document_id} with {doc_id}"
f"for document: {document_id} with {doc_id} "
f"(highlight={enable_highlight})"
)
else:
logger.info(
f"Updated extraction status "
f"for document: {document_id} with {doc_id}"
f"for document: {document_id} with {doc_id} "
f"(highlight={enable_highlight})"
)
return True

Expand All @@ -125,18 +134,23 @@ def mark_extraction_status(

@staticmethod
def check_extraction_status(
document_id: str, profile_manager: ProfileManager, doc_id: str
document_id: str,
profile_manager: ProfileManager,
doc_id: str,
enable_highlight: bool = False,
) -> bool:
"""Checks if the extraction status is already marked as complete
for the given document and doc_id.
for the given document and doc_id with matching highlight setting.

Args:
document_id (str): ID of the document in DocumentManager.
profile_manager (ProfileManager): ProfileManager instance for context.
doc_id (str): Unique identifier for the document within extraction status.
enable_highlight (bool): Whether highlight metadata is required.

Returns:
bool: True if extraction is complete, False otherwise.
bool: True if extraction is complete with matching highlight setting,
False otherwise.
"""
try:
index_manager = IndexManager.objects.filter(
Expand All @@ -148,20 +162,46 @@ def check_extraction_status(
return False

extraction_status = index_manager.extraction_status or {}
is_extracted = extraction_status.get(doc_id, False)
status_entry = extraction_status.get(doc_id)

if is_extracted:
if not status_entry:
logger.info(
f"Extraction is already marked as complete "
f"Extraction is NOT yet marked as complete "
f"for document: {document_id} with {doc_id}"
)
return False

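# Backward compatibility: legacy entries are a bare boolean that was stored
# before highlight tracking existed, so treat them as extracted WITHOUT
# highlight metadata (a highlight-enabled request will trigger re-extraction).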
if isinstance(status_entry, bool):
is_extracted = status_entry
is_highlight_handled = False
Comment thread
chandrasekharan-zipstack marked this conversation as resolved.
else:
# New format: {"extracted": True, "enable_highlight": <bool>}
Comment thread
chandrasekharan-zipstack marked this conversation as resolved.
is_extracted = status_entry.get("extracted", False)
is_highlight_handled = status_entry.get("enable_highlight", False)

# Check if extraction exists AND highlight setting matches
if is_extracted and is_highlight_handled == enable_highlight:
logger.info(
f"Extraction is already marked as complete "
f"for document: {document_id} with {doc_id} "
f"(highlight={enable_highlight})"
)
return True
elif is_extracted and is_highlight_handled != enable_highlight:
logger.info(
f"Extraction exists but highlight mismatch "
f"for document: {document_id} with {doc_id}. "
f"Stored: {is_highlight_handled}, Requested: {enable_highlight}. "
f"Re-extraction needed."
)
return False
else:
logger.info(
f"Extraction is NOT yet marked as complete "
f"for document: {document_id} with {doc_id}"
)

return is_extracted
return False

except Exception as e:
logger.error(f"Unexpected error while checking extraction status: {e}")
Expand Down