Skip to content

Commit aebe25c

Browse files
authored
fix(html): prevent hierarchy reset in rich table cells (#2716)
* fix(html): restore parents after rich cell walking
  Signed-off-by: Matvei Smirnov <[email protected]>
* fix(html): add table cell context manager, update tests
  Signed-off-by: Matvei Smirnov <[email protected]>
* fix(html): table with heading test data
  Signed-off-by: Matvei Smirnov <[email protected]>
---------
Signed-off-by: Matvei Smirnov <[email protected]>
1 parent c97715f commit aebe25c

33 files changed

+373
-32
lines changed

docling/backend/html_backend.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -459,10 +459,8 @@ def parse_table_data(
459459
rich_table_cell = self._is_rich_table_cell(html_cell)
460460
if rich_table_cell:
461461
# Parse table cell sub-tree for Rich Cells content:
462-
table_level = self.level
463-
provs_in_cell = self._walk(html_cell, doc)
464-
# After walking sub-tree in cell, restore previously set level
465-
self.level = table_level
462+
with self._use_table_cell_context():
463+
provs_in_cell = self._walk(html_cell, doc)
466464

467465
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
468466
rich_table_cell, ref_for_rich_cell = (
@@ -829,6 +827,21 @@ def _use_footer(self, tag: Tag, doc: DoclingDocument):
829827
self.level -= 1
830828
self.content_layer = current_layer
831829

830+
@contextmanager
831+
def _use_table_cell_context(self):
832+
"""Preserve the hierarchy level and parents during table cell processing.
833+
834+
While the context manager is active, the hierarchy level and parents can be modified.
835+
When exiting, the original level and parents are restored.
836+
"""
837+
original_level = self.level
838+
original_parents = self.parents.copy()
839+
try:
840+
yield
841+
finally:
842+
self.level = original_level
843+
self.parents = original_parents
844+
832845
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
833846
added_ref = []
834847
tag_name = tag.name.lower()

tests/data/groundtruth/docling_v2/example_01.html.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
22
"schema_name": "DoclingDocument",
3-
"version": "1.7.0",
3+
"version": "1.8.0",
44
"name": "example_01",
55
"origin": {
66
"mimetype": "text/html",
7-
"binary_hash": 13726679883013609282,
7+
"binary_hash": 3245959421868226348,
88
"filename": "example_01.html"
99
},
1010
"furniture": {

tests/data/groundtruth/docling_v2/example_02.html.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"schema_name": "DoclingDocument",
3-
"version": "1.7.0",
3+
"version": "1.8.0",
44
"name": "example_02",
55
"origin": {
66
"mimetype": "text/html",

tests/data/groundtruth/docling_v2/example_03.html.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"schema_name": "DoclingDocument",
3-
"version": "1.7.0",
3+
"version": "1.8.0",
44
"name": "example_03",
55
"origin": {
66
"mimetype": "text/html",

tests/data/groundtruth/docling_v2/example_04.html.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"schema_name": "DoclingDocument",
3-
"version": "1.7.0",
3+
"version": "1.8.0",
44
"name": "example_04",
55
"origin": {
66
"mimetype": "text/html",

tests/data/groundtruth/docling_v2/example_05.html.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"schema_name": "DoclingDocument",
3-
"version": "1.7.0",
3+
"version": "1.8.0",
44
"name": "example_05",
55
"origin": {
66
"mimetype": "text/html",

tests/data/groundtruth/docling_v2/example_06.html.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"schema_name": "DoclingDocument",
3-
"version": "1.7.0",
3+
"version": "1.8.0",
44
"name": "example_06",
55
"origin": {
66
"mimetype": "text/html",

tests/data/groundtruth/docling_v2/example_07.html.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"schema_name": "DoclingDocument",
3-
"version": "1.7.0",
3+
"version": "1.8.0",
44
"name": "example_07",
55
"origin": {
66
"mimetype": "text/html",

tests/data/groundtruth/docling_v2/example_08.html.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"schema_name": "DoclingDocument",
3-
"version": "1.7.0",
3+
"version": "1.8.0",
44
"name": "example_08",
55
"origin": {
66
"mimetype": "text/html",

tests/data/groundtruth/docling_v2/formatting.html.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"schema_name": "DoclingDocument",
3-
"version": "1.7.0",
3+
"version": "1.8.0",
44
"name": "formatting",
55
"origin": {
66
"mimetype": "text/html",

0 commit comments

Comments
 (0)