66from typing import Any , Callable , Final , Optional , Union
77
88from docling_core .types .doc import (
9+ ContentLayer ,
910 DocItemLabel ,
1011 DoclingDocument ,
1112 DocumentOrigin ,
@@ -95,6 +96,8 @@ def __init__(
9596 self .listIter = 0
9697 # Track list counters per numId and ilvl
9798 self .list_counters : dict [tuple [int , int ], int ] = {}
99+ # Set starting content layer
100+ self .content_layer = ContentLayer .BODY
98101
99102 self .history : dict [str , Any ] = {
100103 "names" : [None ],
@@ -148,6 +151,7 @@ def convert(self) -> DoclingDocument:
148151 if self .is_valid ():
149152 assert self .docx_obj is not None
150153 doc , _ = self ._walk_linear (self .docx_obj .element .body , doc )
154+ self ._add_header_footer (self .docx_obj , doc )
151155
152156 return doc
153157 else :
@@ -258,12 +262,14 @@ def _walk_linear(
258262 label = GroupLabel .SECTION ,
259263 parent = self .parents [level - 1 ],
260264 name = "shape-text" ,
265+ content_layer = self .content_layer ,
261266 )
262267 added_elements .append (shape_group .get_ref ())
263268 doc .add_text (
264269 label = DocItemLabel .TEXT ,
265270 parent = shape_group ,
266271 text = text_content ,
272+ content_layer = self .content_layer ,
267273 )
268274
269275 if textbox_elements :
@@ -750,7 +756,10 @@ def _handle_textbox_content(
750756 level = self ._get_level ()
751757 # Create a textbox group to contain all text from the textbox
752758 textbox_group = doc .add_group (
753- label = GroupLabel .SECTION , parent = self .parents [level - 1 ], name = "textbox"
759+ label = GroupLabel .SECTION ,
760+ parent = self .parents [level - 1 ],
761+ name = "textbox" ,
762+ content_layer = self .content_layer ,
754763 )
755764 elem_ref .append (textbox_group .get_ref ())
756765 # Set this as the current parent to ensure textbox content
@@ -864,7 +873,7 @@ def _create_or_reuse_parent(
864873 paragraph_elements : list ,
865874 ) -> Optional [NodeItem ]:
866875 return (
867- doc .add_inline_group (parent = prev_parent )
876+ doc .add_inline_group (parent = prev_parent , content_layer = self . content_layer )
868877 if len (paragraph_elements ) > 1
869878 else prev_parent
870879 )
@@ -932,7 +941,12 @@ def _handle_text_elements(
932941 if p_style_id in ["Title" ]:
933942 for key in range (len (self .parents )):
934943 self .parents [key ] = None
935- te = doc .add_text (parent = None , label = DocItemLabel .TITLE , text = text )
944+ te = doc .add_text (
945+ parent = None ,
946+ label = DocItemLabel .TITLE ,
947+ text = text ,
948+ content_layer = self .content_layer ,
949+ )
936950 self .parents [0 ] = te
937951 elem_ref .append (te .get_ref ())
938952 elif "Heading" in p_style_id :
@@ -943,7 +957,7 @@ def _handle_text_elements(
943957 )
944958 else :
945959 is_numbered_style = False
946- h1 = self ._add_header (doc , p_level , text , is_numbered_style )
960+ h1 = self ._add_heading (doc , p_level , text , is_numbered_style )
947961 elem_ref .extend (h1 )
948962
949963 elif len (equations ) > 0 :
@@ -956,12 +970,15 @@ def _handle_text_elements(
956970 label = DocItemLabel .FORMULA ,
957971 parent = self .parents [level - 1 ],
958972 text = text .replace ("<eq>" , "" ).replace ("</eq>" , "" ),
973+ content_layer = self .content_layer ,
959974 )
960975 elem_ref .append (t1 .get_ref ())
961976 else :
962977 # Inline equation
963978 level = self ._get_level ()
964- inline_equation = doc .add_inline_group (parent = self .parents [level - 1 ])
979+ inline_equation = doc .add_inline_group (
980+ parent = self .parents [level - 1 ], content_layer = self .content_layer
981+ )
965982 elem_ref .append (inline_equation .get_ref ())
966983 text_tmp = text
967984 for eq in equations :
@@ -978,12 +995,14 @@ def _handle_text_elements(
978995 label = DocItemLabel .TEXT ,
979996 parent = inline_equation ,
980997 text = pre_eq_text ,
998+ content_layer = self .content_layer ,
981999 )
9821000 elem_ref .append (e1 .get_ref ())
9831001 e2 = doc .add_text (
9841002 label = DocItemLabel .FORMULA ,
9851003 parent = inline_equation ,
9861004 text = eq .replace ("<eq>" , "" ).replace ("</eq>" , "" ),
1005+ content_layer = self .content_layer ,
9871006 )
9881007 elem_ref .append (e2 .get_ref ())
9891008
@@ -992,6 +1011,7 @@ def _handle_text_elements(
9921011 label = DocItemLabel .TEXT ,
9931012 parent = inline_equation ,
9941013 text = text_tmp .strip (),
1014+ content_layer = self .content_layer ,
9951015 )
9961016 elem_ref .append (e3 .get_ref ())
9971017
@@ -1018,6 +1038,7 @@ def _handle_text_elements(
10181038 text = text ,
10191039 formatting = format ,
10201040 hyperlink = hyperlink ,
1041+ content_layer = self .content_layer ,
10211042 )
10221043 elem_ref .append (t2 .get_ref ())
10231044
@@ -1037,13 +1058,14 @@ def _handle_text_elements(
10371058 text = text ,
10381059 formatting = format ,
10391060 hyperlink = hyperlink ,
1061+ content_layer = self .content_layer ,
10401062 )
10411063 elem_ref .append (t3 .get_ref ())
10421064
10431065 self ._update_history (p_style_id , p_level , numid , ilevel )
10441066 return elem_ref
10451067
1046- def _add_header (
1068+ def _add_heading (
10471069 self ,
10481070 doc : DoclingDocument ,
10491071 curr_level : Optional [int ],
@@ -1154,6 +1176,7 @@ def _add_formatted_list_item(
11541176 text = text ,
11551177 formatting = format ,
11561178 hyperlink = hyperlink ,
1179+ content_layer = self .content_layer ,
11571180 )
11581181 return elem_ref
11591182
@@ -1180,7 +1203,11 @@ def _add_list_item(
11801203 # Reset counters for the new numbering sequence
11811204 self ._reset_list_counters_for_new_sequence (numid )
11821205
1183- list_gr = doc .add_list_group (name = "list" , parent = self .parents [level - 1 ])
1206+ list_gr = doc .add_list_group (
1207+ name = "list" ,
1208+ parent = self .parents [level - 1 ],
1209+ content_layer = self .content_layer ,
1210+ )
11841211 self .parents [level ] = list_gr
11851212 elem_ref .append (list_gr .get_ref ())
11861213
@@ -1203,7 +1230,11 @@ def _add_list_item(
12031230 self .level_at_new_list + prev_indent + 1 ,
12041231 self .level_at_new_list + ilevel + 1 ,
12051232 ):
1206- list_gr1 = doc .add_list_group (name = "list" , parent = self .parents [i - 1 ])
1233+ list_gr1 = doc .add_list_group (
1234+ name = "list" ,
1235+ parent = self .parents [i - 1 ],
1236+ content_layer = self .content_layer ,
1237+ )
12071238 self .parents [i ] = list_gr1
12081239 elem_ref .append (list_gr1 .get_ref ())
12091240
@@ -1262,11 +1293,13 @@ def _group_cell_elements(
12621293 doc : DoclingDocument ,
12631294 provs_in_cell : list [RefItem ],
12641295 docling_table : TableItem ,
1296+ content_layer : ContentLayer = ContentLayer .BODY ,
12651297 ) -> RefItem :
12661298 group_element = doc .add_group (
12671299 label = GroupLabel .UNSPECIFIED ,
12681300 name = group_name ,
12691301 parent = docling_table ,
1302+ content_layer = content_layer ,
12701303 )
12711304 for prov in provs_in_cell :
12721305 group_element .children .append (prov )
@@ -1298,7 +1331,9 @@ def _handle_tables(
12981331
12991332 data = TableData (num_rows = num_rows , num_cols = num_cols )
13001333 level = self ._get_level ()
1301- docling_table = doc .add_table (data = data , parent = self .parents [level - 1 ])
1334+ docling_table = doc .add_table (
1335+ data = data , parent = self .parents [level - 1 ], content_layer = self .content_layer
1336+ )
13021337 elem_ref .append (docling_table .get_ref ())
13031338
13041339 cell_set : set [CT_Tc ] = set ()
@@ -1349,7 +1384,11 @@ def _handle_tables(
13491384 rich_table_cell = True
13501385 group_name = f"rich_cell_group_{ len (doc .tables )} _{ col_idx } _{ row .grid_cols_before + row_idx } "
13511386 ref_for_rich_cell = MsWordDocumentBackend ._group_cell_elements (
1352- group_name , doc , provs_in_cell , docling_table
1387+ group_name ,
1388+ doc ,
1389+ provs_in_cell ,
1390+ docling_table ,
1391+ content_layer = self .content_layer ,
13531392 )
13541393
13551394 if rich_table_cell :
@@ -1383,6 +1422,26 @@ def _handle_tables(
13831422 col_idx += cell .grid_span
13841423 return elem_ref
13851424
def _has_blip(self, element: BaseOxmlElement) -> bool:
    """Check whether a docx element holds picture content.

    Each direct child of ``element`` is tested in two ways: by evaluating
    ``self.blip_xpath_expr`` on it, and by searching its descendants for a
    ``w:drawing`` node. The match therefore does not have to be a direct
    child of ``element``; anything nested below one of its children counts.

    Args:
        element: a docx element

    Returns:
        True if any child matches one of the two checks, False otherwise
        (including when ``element`` has no children).
    """
    return any(
        # NOTE(review): blip_xpath_expr is defined outside this block;
        # presumably a compiled XPath selecting blip nodes - confirm.
        self.blip_xpath_expr(item)
        or item.findall(
            ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
        )
        for item in element
    )
1444+
13861445 def _is_rich_table_cell (self , cell : _Cell ) -> bool :
13871446 """Determine whether a docx cell should be parsed as a Docling RichTableCell.
13881447
@@ -1420,13 +1479,8 @@ def _is_rich_table_cell(self, cell: _Cell) -> bool:
14201479 tag = child .tag .split ("}" )[- 1 ]
14211480 if tag not in allowed_tags :
14221481 return True
1423- for elem in tc :
1424- if self .blip_xpath_expr (elem ):
1425- return True
1426- if elem .findall (
1427- ".//w:drawing" , namespaces = MsWordDocumentBackend ._BLIP_NAMESPACES
1428- ):
1429- return True
1482+ if self ._has_blip (tc ):
1483+ return True
14301484
14311485 # paragraph must contain runs with no run-properties
14321486 for para in paragraphs :
@@ -1468,6 +1522,7 @@ def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
14681522 p1 = doc .add_picture (
14691523 parent = self .parents [level - 1 ],
14701524 caption = None ,
1525+ content_layer = self .content_layer ,
14711526 )
14721527 elem_ref .append (p1 .get_ref ())
14731528 else :
@@ -1478,13 +1533,15 @@ def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
14781533 parent = self .parents [level - 1 ],
14791534 image = ImageRef .from_pil (image = pil_image , dpi = 72 ),
14801535 caption = None ,
1536+ content_layer = self .content_layer ,
14811537 )
14821538 elem_ref .append (p2 .get_ref ())
14831539 except (UnidentifiedImageError , OSError ):
14841540 _log .warning ("Warning: image cannot be loaded by Pillow" )
14851541 p3 = doc .add_picture (
14861542 parent = self .parents [level - 1 ],
14871543 caption = None ,
1544+ content_layer = self .content_layer ,
14881545 )
14891546 elem_ref .append (p3 .get_ref ())
14901547 return elem_ref
@@ -1515,12 +1572,68 @@ def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
15151572 parent = self .parents [level - 1 ],
15161573 image = ImageRef .from_pil (image = pil_image , dpi = 72 ),
15171574 caption = None ,
1575+ content_layer = self .content_layer ,
15181576 )
15191577 except (UnidentifiedImageError , OSError ):
15201578 _log .warning ("Warning: DrawingML image cannot be loaded by Pillow" )
15211579 doc .add_picture (
15221580 parent = self .parents [level - 1 ],
15231581 caption = None ,
1582+ content_layer = self .content_layer ,
15241583 )
15251584
15261585 return
1586+
def _add_header_footer(self, docx_obj: DocxDocument, doc: DoclingDocument) -> None:
    """Add section headers and footers to the document as furniture.

    For every processed section, the header and then the footer are inspected.
    A header or footer is added only if it has at least one non-blank
    paragraph, at least one table, or picture content (see ``_has_blip``).
    Each one added gets its own top-level group named "page header" or
    "page footer" in the furniture content layer, and its XML element is
    walked with ``_walk_linear`` so the parsed items are attached to that group.

    When a section declares a different first-page header/footer, the
    first-page variants are used instead of the regular ones.

    ``self.content_layer`` and ``self.parents[0]`` are changed temporarily
    and always restored, even if parsing raises.

    Args:
        docx_obj: A docx Document object to be parsed.
        doc: A DoclingDocument object to add the header and footer from docx_obj.
    """
    current_layer = self.content_layer
    base_parent = self.parents[0]
    self.content_layer = ContentLayer.FURNITURE
    try:
        for sec_idx, section in enumerate(docx_obj.sections):
            first_page = section.different_first_page_header_footer
            # NOTE(review): sections after the first are only processed when
            # they declare a different first-page header/footer; a later
            # section that only redefines its regular header/footer is
            # skipped - confirm this is intended.
            if sec_idx > 0 and not first_page:
                continue

            parts = (
                (
                    "page header",
                    section.first_page_header if first_page else section.header,
                ),
                (
                    "page footer",
                    section.first_page_footer if first_page else section.footer,
                ),
            )
            for group_name, part in parts:
                has_text = any(par.text.strip() for par in part.paragraphs)
                if not (has_text or part.tables or self._has_blip(part._element)):
                    # Nothing worth emitting: avoid creating an empty group.
                    continue

                # Make the new furniture group the root parent so that
                # everything found by the walk is attached below it.
                self.parents[0] = doc.add_group(
                    label=GroupLabel.SECTION,
                    name=group_name,
                    content_layer=self.content_layer,
                )
                self._walk_linear(part._element, doc)
    finally:
        # Restore the state used for body parsing, even on failure.
        self.content_layer = current_layer
        self.parents[0] = base_parent
0 commit comments