From 77cbd1d146465bcae0a0bed6ab2ce165291f3a2e Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Tue, 4 Nov 2025 17:29:27 +0530 Subject: [PATCH 1/2] UN-2947: Fix highlight extraction tracking regression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhance extraction_status to track highlight metadata separately. When enable_highlight is toggled, the system now correctly detects the mismatch and triggers re-extraction instead of reusing cached non-highlighted text. Changes: - check_extraction_status() now verifies highlight flag matches - mark_extraction_status() stores highlight metadata in new format - Backward compatible: treats old boolean values as non-highlighted Fixes issue where: - Documents extracted without highlight were reused when highlight enabled - Caused by PR #1605 removing reindex parameter - Index key doesn't include enable_highlight (by design for minimal impact) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../prompt_studio_helper.py | 2 + .../prompt_studio_index_helper.py | 66 +++++++++++++++---- 2 files changed, 55 insertions(+), 13 deletions(-) diff --git a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py index f6bf7086bd..7dd2fc8be3 100644 --- a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py +++ b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py @@ -1347,6 +1347,7 @@ def dynamic_extractor( document_id=document_id, profile_manager=profile_manager, doc_id=doc_id, + enable_highlight=enable_highlight, ) if is_extracted: fs_instance = EnvHelper.get_storage( @@ -1388,6 +1389,7 @@ def dynamic_extractor( document_id=document_id, profile_manager=profile_manager, doc_id=doc_id, + enable_highlight=enable_highlight, ) except SdkError as e: msg = str(e) diff --git 
a/backend/prompt_studio/prompt_studio_index_manager_v2/prompt_studio_index_helper.py b/backend/prompt_studio/prompt_studio_index_manager_v2/prompt_studio_index_helper.py index 739f3fb3ff..edf5be2d4c 100644 --- a/backend/prompt_studio/prompt_studio_index_manager_v2/prompt_studio_index_helper.py +++ b/backend/prompt_studio/prompt_studio_index_manager_v2/prompt_studio_index_helper.py @@ -69,14 +69,18 @@ def handle_index_manager( @staticmethod def mark_extraction_status( - document_id: str, profile_manager: ProfileManager, doc_id: str + document_id: str, + profile_manager: ProfileManager, + doc_id: str, + enable_highlight: bool = False, ) -> bool: - """Marks the extraction status for a given document. + """Marks the extraction status for a given document with highlight metadata. Args: document_id (str): ID of the document in DocumentManager. profile_manager (ProfileManager): ProfileManager instance for context. doc_id (str): Unique identifier for the document within extraction status. + enable_highlight (bool): Whether highlight metadata was used during extraction. Returns: bool: True if the status is successfully updated, False otherwise. 
@@ -95,7 +99,10 @@ def mark_extraction_status( index_manager.extraction_status = index_manager.extraction_status or {} - index_manager.extraction_status[doc_id] = True + index_manager.extraction_status[doc_id] = { + "extracted": True, + "enable_highlight": enable_highlight, + } logger.info( f"Index manager {index_manager} {index_manager.index_ids_history}" ) @@ -104,12 +111,14 @@ def mark_extraction_status( if created: logger.info( f"IndexManager entry created " - f"for document: {document_id} with {doc_id}" + f"for document: {document_id} with {doc_id} " + f"(highlight={enable_highlight})" ) else: logger.info( f"Updated extraction status " - f"for document: {document_id} with {doc_id}" + f"for document: {document_id} with {doc_id} " + f"(highlight={enable_highlight})" ) return True @@ -125,18 +134,23 @@ def mark_extraction_status( @staticmethod def check_extraction_status( - document_id: str, profile_manager: ProfileManager, doc_id: str + document_id: str, + profile_manager: ProfileManager, + doc_id: str, + enable_highlight: bool = False, ) -> bool: """Checks if the extraction status is already marked as complete - for the given document and doc_id. + for the given document and doc_id with matching highlight setting. Args: document_id (str): ID of the document in DocumentManager. profile_manager (ProfileManager): ProfileManager instance for context. doc_id (str): Unique identifier for the document within extraction status. + enable_highlight (bool): Whether highlight metadata is required. Returns: - bool: True if extraction is complete, False otherwise. + bool: True if extraction is complete with matching highlight setting, + False otherwise. 
""" try: index_manager = IndexManager.objects.filter( @@ -148,20 +162,46 @@ def check_extraction_status( return False extraction_status = index_manager.extraction_status or {} - is_extracted = extraction_status.get(doc_id, False) + status_entry = extraction_status.get(doc_id) - if is_extracted: + if not status_entry: logger.info( - f"Extraction is already marked as complete " + f"Extraction is NOT yet marked as complete " f"for document: {document_id} with {doc_id}" ) + return False + + # Backward compatibility: treat boolean True as non-highlighted + if isinstance(status_entry, bool): + is_extracted = status_entry + is_highlight_handled = False + else: + # New format: {"extracted": True, "enable_highlight": } + is_extracted = status_entry.get("extracted", False) + is_highlight_handled = status_entry.get("enable_highlight", False) + + # Check if extraction exists AND highlight setting matches + if is_extracted and is_highlight_handled == enable_highlight: + logger.info( + f"Extraction is already marked as complete " + f"for document: {document_id} with {doc_id} " + f"(highlight={enable_highlight})" + ) + return True + elif is_extracted and is_highlight_handled != enable_highlight: + logger.info( + f"Extraction exists but highlight mismatch " + f"for document: {document_id} with {doc_id}. " + f"Stored: {is_highlight_handled}, Requested: {enable_highlight}. " + f"Re-extraction needed." 
+ ) + return False else: logger.info( f"Extraction is NOT yet marked as complete " f"for document: {document_id} with {doc_id}" ) - - return is_extracted + return False except Exception as e: logger.error(f"Unexpected error while checking extraction status: {e}") From 11842a93c495d30a53cb4daa8e207efd7652424a Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Wed, 5 Nov 2025 11:09:58 +0530 Subject: [PATCH 2/2] minor: Added comment to highlight JSON structure for extraction status --- backend/prompt_studio/prompt_studio_index_manager_v2/models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/prompt_studio/prompt_studio_index_manager_v2/models.py b/backend/prompt_studio/prompt_studio_index_manager_v2/models.py index 9573757dcf..14109f7c11 100644 --- a/backend/prompt_studio/prompt_studio_index_manager_v2/models.py +++ b/backend/prompt_studio/prompt_studio_index_manager_v2/models.py @@ -69,6 +69,8 @@ class IndexManager(BaseModel): null=False, blank=False, ) + # TODO: Consider making this an enum instead + # A nested JSON of the form {doc_id: {"extracted": True, "enable_highlight": bool}} extraction_status = models.JSONField( db_comment="Extraction status for documents", null=False,