From 77cbd1d146465bcae0a0bed6ab2ce165291f3a2e Mon Sep 17 00:00:00 2001
From: Chandrasekharan M <chandrasekharan@zipstack.com>
Date: Tue, 4 Nov 2025 17:29:27 +0530
Subject: [PATCH 1/2] UN-2947: Fix highlight extraction tracking regression
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

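Enhance extraction_status to record, per doc_id, whether extraction
was run with highlight metadata enabled. When enable_highlight is
toggled in either direction, check_extraction_status() now detects
the mismatch and reports the document as not extracted, so it is
re-extracted instead of reusing text cached under the other setting
(e.g. non-highlighted text after highlight has been enabled).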

Changes:
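- check_extraction_status() now verifies that the stored highlight
  flag matches the requested enable_highlight value
- mark_extraction_status() stores each entry in the new format
  {"extracted": True, "enable_highlight": <bool>}
- Backward compatible: old boolean entries are treated as extracted
  without highlight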

Fixes an issue where documents extracted without highlight were
reused after highlight was enabled:
- The regression was caused by PR #1605 removing the reindex parameter
- The index key doesn't include enable_highlight (by design, for
  minimal impact), so it cannot distinguish the two extractions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../prompt_studio_helper.py                   |  2 +
 .../prompt_studio_index_helper.py             | 66 +++++++++++++++----
 2 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
index f6bf7086bd..7dd2fc8be3 100644
--- a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
+++ b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
@@ -1347,6 +1347,7 @@ def dynamic_extractor(
             document_id=document_id,
             profile_manager=profile_manager,
             doc_id=doc_id,
+            enable_highlight=enable_highlight,
         )
         if is_extracted:
             fs_instance = EnvHelper.get_storage(
@@ -1388,6 +1389,7 @@ def dynamic_extractor(
                 document_id=document_id,
                 profile_manager=profile_manager,
                 doc_id=doc_id,
+                enable_highlight=enable_highlight,
             )
         except SdkError as e:
             msg = str(e)
diff --git a/backend/prompt_studio/prompt_studio_index_manager_v2/prompt_studio_index_helper.py b/backend/prompt_studio/prompt_studio_index_manager_v2/prompt_studio_index_helper.py
index 739f3fb3ff..edf5be2d4c 100644
--- a/backend/prompt_studio/prompt_studio_index_manager_v2/prompt_studio_index_helper.py
+++ b/backend/prompt_studio/prompt_studio_index_manager_v2/prompt_studio_index_helper.py
@@ -69,14 +69,18 @@ def handle_index_manager(
 
     @staticmethod
     def mark_extraction_status(
-        document_id: str, profile_manager: ProfileManager, doc_id: str
+        document_id: str,
+        profile_manager: ProfileManager,
+        doc_id: str,
+        enable_highlight: bool = False,
     ) -> bool:
-        """Marks the extraction status for a given document.
+        """Marks the extraction status for a given document with highlight metadata.
 
         Args:
             document_id (str): ID of the document in DocumentManager.
             profile_manager (ProfileManager): ProfileManager instance for context.
             doc_id (str): Unique identifier for the document within extraction status.
+            enable_highlight (bool): Whether highlight metadata was used during extraction.
 
         Returns:
             bool: True if the status is successfully updated, False otherwise.
@@ -95,7 +99,10 @@ def mark_extraction_status(
 
                 index_manager.extraction_status = index_manager.extraction_status or {}
 
-                index_manager.extraction_status[doc_id] = True
+                index_manager.extraction_status[doc_id] = {
+                    "extracted": True,
+                    "enable_highlight": enable_highlight,
+                }
                 logger.info(
                     f"Index manager {index_manager} {index_manager.index_ids_history}"
                 )
@@ -104,12 +111,14 @@ def mark_extraction_status(
                 if created:
                     logger.info(
                         f"IndexManager entry created "
-                        f"for document: {document_id} with {doc_id}"
+                        f"for document: {document_id} with {doc_id} "
+                        f"(highlight={enable_highlight})"
                     )
                 else:
                     logger.info(
                         f"Updated extraction status "
-                        f"for document: {document_id} with {doc_id}"
+                        f"for document: {document_id} with {doc_id} "
+                        f"(highlight={enable_highlight})"
                     )
             return True
 
@@ -125,18 +134,23 @@ def mark_extraction_status(
 
     @staticmethod
     def check_extraction_status(
-        document_id: str, profile_manager: ProfileManager, doc_id: str
+        document_id: str,
+        profile_manager: ProfileManager,
+        doc_id: str,
+        enable_highlight: bool = False,
     ) -> bool:
         """Checks if the extraction status is already marked as complete
-        for the given document and doc_id.
+        for the given document and doc_id with matching highlight setting.
 
         Args:
             document_id (str): ID of the document in DocumentManager.
             profile_manager (ProfileManager): ProfileManager instance for context.
             doc_id (str): Unique identifier for the document within extraction status.
+            enable_highlight (bool): Whether highlight metadata is required.
 
         Returns:
-            bool: True if extraction is complete, False otherwise.
+            bool: True if extraction is complete with matching highlight setting,
+                  False otherwise.
         """
         try:
             index_manager = IndexManager.objects.filter(
@@ -148,20 +162,46 @@ def check_extraction_status(
                 return False
 
             extraction_status = index_manager.extraction_status or {}
-            is_extracted = extraction_status.get(doc_id, False)
+            status_entry = extraction_status.get(doc_id)
 
-            if is_extracted:
+            if not status_entry:
                 logger.info(
-                    f"Extraction is already marked as complete "
+                    f"Extraction is NOT yet marked as complete "
                     f"for document: {document_id} with {doc_id}"
                 )
+                return False
+
+            # Backward compatibility: treat boolean True as non-highlighted
+            if isinstance(status_entry, bool):
+                is_extracted = status_entry
+                is_highlight_handled = False
+            else:
+                # New format: {"extracted": True, "enable_highlight": <bool>}
+                is_extracted = status_entry.get("extracted", False)
+                is_highlight_handled = status_entry.get("enable_highlight", False)
+
+            # Check if extraction exists AND highlight setting matches
+            if is_extracted and is_highlight_handled == enable_highlight:
+                logger.info(
+                    f"Extraction is already marked as complete "
+                    f"for document: {document_id} with {doc_id} "
+                    f"(highlight={enable_highlight})"
+                )
+                return True
+            elif is_extracted and is_highlight_handled != enable_highlight:
+                logger.info(
+                    f"Extraction exists but highlight mismatch "
+                    f"for document: {document_id} with {doc_id}. "
+                    f"Stored: {is_highlight_handled}, Requested: {enable_highlight}. "
+                    f"Re-extraction needed."
+                )
+                return False
             else:
                 logger.info(
                     f"Extraction is NOT yet marked as complete "
                     f"for document: {document_id} with {doc_id}"
                 )
-
-            return is_extracted
+                return False
 
         except Exception as e:
             logger.error(f"Unexpected error while checking extraction status: {e}")

From 11842a93c495d30a53cb4daa8e207efd7652424a Mon Sep 17 00:00:00 2001
From: Chandrasekharan M <chandrasekharan@zipstack.com>
Date: Wed, 5 Nov 2025 11:09:58 +0530
Subject: [PATCH 2/2] minor: Add comment documenting the JSON structure of
 extraction_status

---
 backend/prompt_studio/prompt_studio_index_manager_v2/models.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/prompt_studio/prompt_studio_index_manager_v2/models.py b/backend/prompt_studio/prompt_studio_index_manager_v2/models.py
index 9573757dcf..14109f7c11 100644
--- a/backend/prompt_studio/prompt_studio_index_manager_v2/models.py
+++ b/backend/prompt_studio/prompt_studio_index_manager_v2/models.py
@@ -69,6 +69,8 @@ class IndexManager(BaseModel):
         null=False,
         blank=False,
     )
+    # TODO: Consider making this an enum instead
+    # A nested JSON of the form {doc_id: {"extracted": True, "enable_highlight": <bool>}}
     extraction_status = models.JSONField(
         db_comment="Extraction status for documents",
         null=False,