Skip to content

Commit 054c4a6

Browse files
authored
fix(docx): parse page headers and footers (#2599)
* fix(docx): parse page headers and footers
  Signed-off-by: Cesar Berrospi Ramis <[email protected]>
* chore(docx): rename _add_header to _add_heading
  To avoid confusion, rename the _add_header function to _add_heading, since the function is about adding section headings.
  Signed-off-by: Cesar Berrospi Ramis <[email protected]>
* chore(docx): extend the page header and footer parsing to any content type
  Signed-off-by: Cesar Berrospi Ramis <[email protected]>
* chore(docx): fix _add_header_footer function
  Signed-off-by: Cesar Berrospi Ramis <[email protected]>
---------
Signed-off-by: Cesar Berrospi Ramis <[email protected]>
1 parent 463051b commit 054c4a6

File tree

6 files changed

+516
-22
lines changed

6 files changed

+516
-22
lines changed

docling/backend/msword_backend.py

Lines changed: 130 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Any, Callable, Final, Optional, Union
77

88
from docling_core.types.doc import (
9+
ContentLayer,
910
DocItemLabel,
1011
DoclingDocument,
1112
DocumentOrigin,
@@ -95,6 +96,8 @@ def __init__(
9596
self.listIter = 0
9697
# Track list counters per numId and ilvl
9798
self.list_counters: dict[tuple[int, int], int] = {}
99+
# Set starting content layer
100+
self.content_layer = ContentLayer.BODY
98101

99102
self.history: dict[str, Any] = {
100103
"names": [None],
@@ -148,6 +151,7 @@ def convert(self) -> DoclingDocument:
148151
if self.is_valid():
149152
assert self.docx_obj is not None
150153
doc, _ = self._walk_linear(self.docx_obj.element.body, doc)
154+
self._add_header_footer(self.docx_obj, doc)
151155

152156
return doc
153157
else:
@@ -258,12 +262,14 @@ def _walk_linear(
258262
label=GroupLabel.SECTION,
259263
parent=self.parents[level - 1],
260264
name="shape-text",
265+
content_layer=self.content_layer,
261266
)
262267
added_elements.append(shape_group.get_ref())
263268
doc.add_text(
264269
label=DocItemLabel.TEXT,
265270
parent=shape_group,
266271
text=text_content,
272+
content_layer=self.content_layer,
267273
)
268274

269275
if textbox_elements:
@@ -750,7 +756,10 @@ def _handle_textbox_content(
750756
level = self._get_level()
751757
# Create a textbox group to contain all text from the textbox
752758
textbox_group = doc.add_group(
753-
label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
759+
label=GroupLabel.SECTION,
760+
parent=self.parents[level - 1],
761+
name="textbox",
762+
content_layer=self.content_layer,
754763
)
755764
elem_ref.append(textbox_group.get_ref())
756765
# Set this as the current parent to ensure textbox content
@@ -864,7 +873,7 @@ def _create_or_reuse_parent(
864873
paragraph_elements: list,
865874
) -> Optional[NodeItem]:
866875
return (
867-
doc.add_inline_group(parent=prev_parent)
876+
doc.add_inline_group(parent=prev_parent, content_layer=self.content_layer)
868877
if len(paragraph_elements) > 1
869878
else prev_parent
870879
)
@@ -932,7 +941,12 @@ def _handle_text_elements(
932941
if p_style_id in ["Title"]:
933942
for key in range(len(self.parents)):
934943
self.parents[key] = None
935-
te = doc.add_text(parent=None, label=DocItemLabel.TITLE, text=text)
944+
te = doc.add_text(
945+
parent=None,
946+
label=DocItemLabel.TITLE,
947+
text=text,
948+
content_layer=self.content_layer,
949+
)
936950
self.parents[0] = te
937951
elem_ref.append(te.get_ref())
938952
elif "Heading" in p_style_id:
@@ -943,7 +957,7 @@ def _handle_text_elements(
943957
)
944958
else:
945959
is_numbered_style = False
946-
h1 = self._add_header(doc, p_level, text, is_numbered_style)
960+
h1 = self._add_heading(doc, p_level, text, is_numbered_style)
947961
elem_ref.extend(h1)
948962

949963
elif len(equations) > 0:
@@ -956,12 +970,15 @@ def _handle_text_elements(
956970
label=DocItemLabel.FORMULA,
957971
parent=self.parents[level - 1],
958972
text=text.replace("<eq>", "").replace("</eq>", ""),
973+
content_layer=self.content_layer,
959974
)
960975
elem_ref.append(t1.get_ref())
961976
else:
962977
# Inline equation
963978
level = self._get_level()
964-
inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
979+
inline_equation = doc.add_inline_group(
980+
parent=self.parents[level - 1], content_layer=self.content_layer
981+
)
965982
elem_ref.append(inline_equation.get_ref())
966983
text_tmp = text
967984
for eq in equations:
@@ -978,12 +995,14 @@ def _handle_text_elements(
978995
label=DocItemLabel.TEXT,
979996
parent=inline_equation,
980997
text=pre_eq_text,
998+
content_layer=self.content_layer,
981999
)
9821000
elem_ref.append(e1.get_ref())
9831001
e2 = doc.add_text(
9841002
label=DocItemLabel.FORMULA,
9851003
parent=inline_equation,
9861004
text=eq.replace("<eq>", "").replace("</eq>", ""),
1005+
content_layer=self.content_layer,
9871006
)
9881007
elem_ref.append(e2.get_ref())
9891008

@@ -992,6 +1011,7 @@ def _handle_text_elements(
9921011
label=DocItemLabel.TEXT,
9931012
parent=inline_equation,
9941013
text=text_tmp.strip(),
1014+
content_layer=self.content_layer,
9951015
)
9961016
elem_ref.append(e3.get_ref())
9971017

@@ -1018,6 +1038,7 @@ def _handle_text_elements(
10181038
text=text,
10191039
formatting=format,
10201040
hyperlink=hyperlink,
1041+
content_layer=self.content_layer,
10211042
)
10221043
elem_ref.append(t2.get_ref())
10231044

@@ -1037,13 +1058,14 @@ def _handle_text_elements(
10371058
text=text,
10381059
formatting=format,
10391060
hyperlink=hyperlink,
1061+
content_layer=self.content_layer,
10401062
)
10411063
elem_ref.append(t3.get_ref())
10421064

10431065
self._update_history(p_style_id, p_level, numid, ilevel)
10441066
return elem_ref
10451067

1046-
def _add_header(
1068+
def _add_heading(
10471069
self,
10481070
doc: DoclingDocument,
10491071
curr_level: Optional[int],
@@ -1154,6 +1176,7 @@ def _add_formatted_list_item(
11541176
text=text,
11551177
formatting=format,
11561178
hyperlink=hyperlink,
1179+
content_layer=self.content_layer,
11571180
)
11581181
return elem_ref
11591182

@@ -1180,7 +1203,11 @@ def _add_list_item(
11801203
# Reset counters for the new numbering sequence
11811204
self._reset_list_counters_for_new_sequence(numid)
11821205

1183-
list_gr = doc.add_list_group(name="list", parent=self.parents[level - 1])
1206+
list_gr = doc.add_list_group(
1207+
name="list",
1208+
parent=self.parents[level - 1],
1209+
content_layer=self.content_layer,
1210+
)
11841211
self.parents[level] = list_gr
11851212
elem_ref.append(list_gr.get_ref())
11861213

@@ -1203,7 +1230,11 @@ def _add_list_item(
12031230
self.level_at_new_list + prev_indent + 1,
12041231
self.level_at_new_list + ilevel + 1,
12051232
):
1206-
list_gr1 = doc.add_list_group(name="list", parent=self.parents[i - 1])
1233+
list_gr1 = doc.add_list_group(
1234+
name="list",
1235+
parent=self.parents[i - 1],
1236+
content_layer=self.content_layer,
1237+
)
12071238
self.parents[i] = list_gr1
12081239
elem_ref.append(list_gr1.get_ref())
12091240

@@ -1262,11 +1293,13 @@ def _group_cell_elements(
12621293
doc: DoclingDocument,
12631294
provs_in_cell: list[RefItem],
12641295
docling_table: TableItem,
1296+
content_layer: ContentLayer = ContentLayer.BODY,
12651297
) -> RefItem:
12661298
group_element = doc.add_group(
12671299
label=GroupLabel.UNSPECIFIED,
12681300
name=group_name,
12691301
parent=docling_table,
1302+
content_layer=content_layer,
12701303
)
12711304
for prov in provs_in_cell:
12721305
group_element.children.append(prov)
@@ -1298,7 +1331,9 @@ def _handle_tables(
12981331

12991332
data = TableData(num_rows=num_rows, num_cols=num_cols)
13001333
level = self._get_level()
1301-
docling_table = doc.add_table(data=data, parent=self.parents[level - 1])
1334+
docling_table = doc.add_table(
1335+
data=data, parent=self.parents[level - 1], content_layer=self.content_layer
1336+
)
13021337
elem_ref.append(docling_table.get_ref())
13031338

13041339
cell_set: set[CT_Tc] = set()
@@ -1349,7 +1384,11 @@ def _handle_tables(
13491384
rich_table_cell = True
13501385
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
13511386
ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements(
1352-
group_name, doc, provs_in_cell, docling_table
1387+
group_name,
1388+
doc,
1389+
provs_in_cell,
1390+
docling_table,
1391+
content_layer=self.content_layer,
13531392
)
13541393

13551394
if rich_table_cell:
@@ -1383,6 +1422,26 @@ def _handle_tables(
13831422
col_idx += cell.grid_span
13841423
return elem_ref
13851424

1425+
def _has_blip(self, element: BaseOxmlElement) -> bool:
    """Tell whether any child of a docx element carries picture content.

    Every direct child of ``element`` is inspected in document order. A child
    counts as a hit when ``self.blip_xpath_expr`` yields a truthy result for it
    or when a ``w:drawing`` element is found anywhere below it.

    Args:
        element: a docx element whose children are inspected

    Returns:
        True as soon as one child matches, False when no child matches
        (including when ``element`` has no children).
    """
    drawing_path = ".//w:drawing"
    return any(
        # Same order and short-circuiting as a per-child if/if cascade:
        # the XPath check runs first, the drawing lookup only if it fails.
        self.blip_xpath_expr(child)
        or child.findall(
            drawing_path, namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
        )
        for child in element
    )
1444+
13861445
def _is_rich_table_cell(self, cell: _Cell) -> bool:
13871446
"""Determine whether a docx cell should be parsed as a Docling RichTableCell.
13881447
@@ -1420,13 +1479,8 @@ def _is_rich_table_cell(self, cell: _Cell) -> bool:
14201479
tag = child.tag.split("}")[-1]
14211480
if tag not in allowed_tags:
14221481
return True
1423-
for elem in tc:
1424-
if self.blip_xpath_expr(elem):
1425-
return True
1426-
if elem.findall(
1427-
".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
1428-
):
1429-
return True
1482+
if self._has_blip(tc):
1483+
return True
14301484

14311485
# paragraph must contain runs with no run-properties
14321486
for para in paragraphs:
@@ -1468,6 +1522,7 @@ def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
14681522
p1 = doc.add_picture(
14691523
parent=self.parents[level - 1],
14701524
caption=None,
1525+
content_layer=self.content_layer,
14711526
)
14721527
elem_ref.append(p1.get_ref())
14731528
else:
@@ -1478,13 +1533,15 @@ def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
14781533
parent=self.parents[level - 1],
14791534
image=ImageRef.from_pil(image=pil_image, dpi=72),
14801535
caption=None,
1536+
content_layer=self.content_layer,
14811537
)
14821538
elem_ref.append(p2.get_ref())
14831539
except (UnidentifiedImageError, OSError):
14841540
_log.warning("Warning: image cannot be loaded by Pillow")
14851541
p3 = doc.add_picture(
14861542
parent=self.parents[level - 1],
14871543
caption=None,
1544+
content_layer=self.content_layer,
14881545
)
14891546
elem_ref.append(p3.get_ref())
14901547
return elem_ref
@@ -1515,12 +1572,68 @@ def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
15151572
parent=self.parents[level - 1],
15161573
image=ImageRef.from_pil(image=pil_image, dpi=72),
15171574
caption=None,
1575+
content_layer=self.content_layer,
15181576
)
15191577
except (UnidentifiedImageError, OSError):
15201578
_log.warning("Warning: DrawingML image cannot be loaded by Pillow")
15211579
doc.add_picture(
15221580
parent=self.parents[level - 1],
15231581
caption=None,
1582+
content_layer=self.content_layer,
15241583
)
15251584

15261585
return
1586+
1587+
def _add_header_footer(self, docx_obj: DocxDocument, doc: DoclingDocument) -> None:
    """Add page headers and footers to the document as furniture content.

    Each header or footer that has content is attached to its own group item
    named "page header" or "page footer", created in the furniture content
    layer, and its XML element is then processed with ``_walk_linear`` (the
    same walker used for the document body).

    Section selection:
        - The first section is always inspected.
        - A later section is inspected only when its
          ``different_first_page_header_footer`` flag is set.
        - When that flag is set, the first-page header/footer is used instead
          of the section's primary header/footer.

    NOTE(review): as a consequence, the primary header/footer of a section
    with the first-page flag set, and new primary headers/footers introduced
    by later sections without that flag, are not parsed. Confirm whether this
    is intended.

    The backend's ``content_layer`` and ``parents[0]`` are restored to their
    previous values when the method exits, even if parsing raises.

    Args:
        docx_obj: A docx Document object to be parsed.
        doc: A DoclingDocument object to add the header and footer from docx_obj.
    """
    current_layer = self.content_layer
    base_parent = self.parents[0]
    self.content_layer = ContentLayer.FURNITURE
    try:
        for sec_idx, section in enumerate(docx_obj.sections):
            use_first_page = section.different_first_page_header_footer
            if sec_idx > 0 and not use_first_page:
                continue

            self._add_header_footer_part(
                doc,
                section.first_page_header if use_first_page else section.header,
                "page header",
            )
            self._add_header_footer_part(
                doc,
                section.first_page_footer if use_first_page else section.footer,
                "page footer",
            )
    finally:
        # Always hand the backend back in the state the caller left it in.
        self.content_layer = current_layer
        self.parents[0] = base_parent


def _add_header_footer_part(self, doc: DoclingDocument, part: Any, name: str) -> None:
    """Parse one header or footer into a new group if it has any content.

    Args:
        doc: The DoclingDocument receiving the parsed items.
        part: A docx header or footer object exposing ``paragraphs``,
            ``tables`` and ``_element``.
        name: Name given to the created group ("page header" or "page footer").
    """
    has_text = any(par.text.strip() for par in part.paragraphs)
    if not (has_text or part.tables or self._has_blip(part._element)):
        # Nothing but blank paragraphs: do not create an empty group.
        return

    # The new group becomes the level-0 parent while this part is walked.
    self.parents[0] = doc.add_group(
        label=GroupLabel.SECTION,
        name=name,
        content_layer=self.content_layer,
    )
    self._walk_linear(part._element, doc)
9.15 KB
Binary file not shown.

tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,7 @@ item-0 at level 0: unspecified: group _root_
2929
item-28 at level 5: text: Nested
3030
item-29 at level 5: text: italic
3131
item-30 at level 5: text: bold
32-
item-31 at level 1: text:
32+
item-31 at level 1: text:
33+
item-32 at level 1: text: The second page of the document with same header and footer
34+
item-33 at level 1: text:
35+
item-34 at level 1: text: The third page of the document with different header and footer

0 commit comments

Comments
 (0)