Skip to content

Commit c97715f

Browse files
authored
fix(docx): parse integrals as n-ary objects without chr element (#2712)
Signed-off-by: Cesar Berrospi Ramis <[email protected]>
1 parent f80c903 commit c97715f

File tree

6 files changed: +248 additions, -5 deletions

docling/backend/docx/latex/latex_dict.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@
     "\u2210": "\\coprod",
     "\u2211": "\\sum",
     "\u222b": "\\int",
+    "\u222c": "\\iint",
+    "\u222d": "\\iiint",
+    "\u222e": "\\oint",
+    "\u222f": "\\oiint",
+    "\u2230": "\\oiiint",
     "\u22c0": "\\bigwedge",
     "\u22c1": "\\bigvee",
     "\u22c2": "\\bigcap",

docling/backend/docx/latex/omml.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,8 @@ def do_nary(self, elm):
         bo = ""
         for stag, t, e in self.process_children_list(elm):
             if stag == "naryPr":
-                bo = get_val(t.chr, store=CHR_BO)
+                # if <m:naryPr> contains no <m:chr>, the n-ary represents an integral
+                bo = get_val(t.chr, default="\\int", store=CHR_BO)
             else:
                 res.append(t)
         return bo + BLANK.join(res)

tests/data/docx/equations.docx

797 Bytes
Binary file not shown.

tests/data/groundtruth/docling_v2/equations.docx.itxt

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,17 @@ item-0 at level 0: unspecified: group _root_
3737
item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty < x < \infty
3838
item-37 at level 1: text:
3939
item-38 at level 1: text: And that is an equation by itself. Cheers!
40-
item-39 at level 1: text:
40+
item-39 at level 1: text:
41+
item-40 at level 1: text: Large operators and integrals ar ... sented with n-ary objects in OMML XML:
42+
item-41 at level 1: text:
43+
item-42 at level 1: formula: \sum_{0}^{2}x
44+
item-43 at level 1: formula: \bigcup_{n=1}^{m}\left(X_{n} \cap Y_{n}\right)
45+
item-44 at level 1: formula: \prod_{k=1}^{n}A_{k}
46+
item-45 at level 1: formula: \bigwedge_{}^{}x
47+
item-46 at level 1: formula: \int_{}^{}(2x+1)dx
48+
item-47 at level 1: formula: \iint_{0}^{1}xdx
49+
item-48 at level 1: formula: \iiint_{}^{}ydy
50+
item-49 at level 1: formula: \oint_{}^{}\frac{dy}{dx}
51+
item-50 at level 1: formula: \oiint_{0}^{2 \pi }idt
52+
item-51 at level 1: formula: \oiiint_{C}^{}\frac{1}{z}dz
53+
item-52 at level 1: text:

tests/data/groundtruth/docling_v2/equations.docx.json

Lines changed: 204 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
22
"schema_name": "DoclingDocument",
3-
"version": "1.7.0",
3+
"version": "1.8.0",
44
"name": "equations",
55
"origin": {
66
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
7-
"binary_hash": 11121138535595486899,
7+
"binary_hash": 8638432756089077257,
88
"filename": "equations.docx"
99
},
1010
"furniture": {
@@ -106,6 +106,45 @@
106106
},
107107
{
108108
"$ref": "#/texts/35"
109+
},
110+
{
111+
"$ref": "#/texts/36"
112+
},
113+
{
114+
"$ref": "#/texts/37"
115+
},
116+
{
117+
"$ref": "#/texts/38"
118+
},
119+
{
120+
"$ref": "#/texts/39"
121+
},
122+
{
123+
"$ref": "#/texts/40"
124+
},
125+
{
126+
"$ref": "#/texts/41"
127+
},
128+
{
129+
"$ref": "#/texts/42"
130+
},
131+
{
132+
"$ref": "#/texts/43"
133+
},
134+
{
135+
"$ref": "#/texts/44"
136+
},
137+
{
138+
"$ref": "#/texts/45"
139+
},
140+
{
141+
"$ref": "#/texts/46"
142+
},
143+
{
144+
"$ref": "#/texts/47"
145+
},
146+
{
147+
"$ref": "#/texts/48"
109148
}
110149
],
111150
"content_layer": "body",
@@ -655,6 +694,169 @@
655694
"prov": [],
656695
"orig": "",
657696
"text": ""
697+
},
698+
{
699+
"self_ref": "#/texts/36",
700+
"parent": {
701+
"$ref": "#/body"
702+
},
703+
"children": [],
704+
"content_layer": "body",
705+
"label": "text",
706+
"prov": [],
707+
"orig": "Large operators and integrals are represented with n-ary objects in OMML XML:",
708+
"text": "Large operators and integrals are represented with n-ary objects in OMML XML:",
709+
"formatting": {
710+
"bold": false,
711+
"italic": false,
712+
"underline": false,
713+
"strikethrough": false,
714+
"script": "baseline"
715+
}
716+
},
717+
{
718+
"self_ref": "#/texts/37",
719+
"parent": {
720+
"$ref": "#/body"
721+
},
722+
"children": [],
723+
"content_layer": "body",
724+
"label": "text",
725+
"prov": [],
726+
"orig": "",
727+
"text": ""
728+
},
729+
{
730+
"self_ref": "#/texts/38",
731+
"parent": {
732+
"$ref": "#/body"
733+
},
734+
"children": [],
735+
"content_layer": "body",
736+
"label": "formula",
737+
"prov": [],
738+
"orig": "\\sum_{0}^{2}x",
739+
"text": "\\sum_{0}^{2}x"
740+
},
741+
{
742+
"self_ref": "#/texts/39",
743+
"parent": {
744+
"$ref": "#/body"
745+
},
746+
"children": [],
747+
"content_layer": "body",
748+
"label": "formula",
749+
"prov": [],
750+
"orig": "\\bigcup_{n=1}^{m}\\left(X_{n} \\cap Y_{n}\\right)",
751+
"text": "\\bigcup_{n=1}^{m}\\left(X_{n} \\cap Y_{n}\\right)"
752+
},
753+
{
754+
"self_ref": "#/texts/40",
755+
"parent": {
756+
"$ref": "#/body"
757+
},
758+
"children": [],
759+
"content_layer": "body",
760+
"label": "formula",
761+
"prov": [],
762+
"orig": "\\prod_{k=1}^{n}A_{k}",
763+
"text": "\\prod_{k=1}^{n}A_{k}"
764+
},
765+
{
766+
"self_ref": "#/texts/41",
767+
"parent": {
768+
"$ref": "#/body"
769+
},
770+
"children": [],
771+
"content_layer": "body",
772+
"label": "formula",
773+
"prov": [],
774+
"orig": "\\bigwedge_{}^{}x",
775+
"text": "\\bigwedge_{}^{}x"
776+
},
777+
{
778+
"self_ref": "#/texts/42",
779+
"parent": {
780+
"$ref": "#/body"
781+
},
782+
"children": [],
783+
"content_layer": "body",
784+
"label": "formula",
785+
"prov": [],
786+
"orig": "\\int_{}^{}(2x+1)dx",
787+
"text": "\\int_{}^{}(2x+1)dx"
788+
},
789+
{
790+
"self_ref": "#/texts/43",
791+
"parent": {
792+
"$ref": "#/body"
793+
},
794+
"children": [],
795+
"content_layer": "body",
796+
"label": "formula",
797+
"prov": [],
798+
"orig": "\\iint_{0}^{1}xdx",
799+
"text": "\\iint_{0}^{1}xdx"
800+
},
801+
{
802+
"self_ref": "#/texts/44",
803+
"parent": {
804+
"$ref": "#/body"
805+
},
806+
"children": [],
807+
"content_layer": "body",
808+
"label": "formula",
809+
"prov": [],
810+
"orig": "\\iiint_{}^{}ydy",
811+
"text": "\\iiint_{}^{}ydy"
812+
},
813+
{
814+
"self_ref": "#/texts/45",
815+
"parent": {
816+
"$ref": "#/body"
817+
},
818+
"children": [],
819+
"content_layer": "body",
820+
"label": "formula",
821+
"prov": [],
822+
"orig": "\\oint_{}^{}\\frac{dy}{dx}",
823+
"text": "\\oint_{}^{}\\frac{dy}{dx}"
824+
},
825+
{
826+
"self_ref": "#/texts/46",
827+
"parent": {
828+
"$ref": "#/body"
829+
},
830+
"children": [],
831+
"content_layer": "body",
832+
"label": "formula",
833+
"prov": [],
834+
"orig": "\\oiint_{0}^{2 \\pi }idt",
835+
"text": "\\oiint_{0}^{2 \\pi }idt"
836+
},
837+
{
838+
"self_ref": "#/texts/47",
839+
"parent": {
840+
"$ref": "#/body"
841+
},
842+
"children": [],
843+
"content_layer": "body",
844+
"label": "formula",
845+
"prov": [],
846+
"orig": "\\oiiint_{C}^{}\\frac{1}{z}dz",
847+
"text": "\\oiiint_{C}^{}\\frac{1}{z}dz"
848+
},
849+
{
850+
"self_ref": "#/texts/48",
851+
"parent": {
852+
"$ref": "#/body"
853+
},
854+
"children": [],
855+
"content_layer": "body",
856+
"label": "text",
857+
"prov": [],
858+
"orig": "",
859+
"text": ""
658860
}
659861
],
660862
"pictures": [],

tests/data/groundtruth/docling_v2/equations.docx.md

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,26 @@ This is a word document and this is an inline equation: $A= \pi r^{2}$ . If ins
2626

2727
$$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty$$
2828

29-
And that is an equation by itself. Cheers!
29+
And that is an equation by itself. Cheers!
30+
31+
Large operators and integrals are represented with n-ary objects in OMML XML:
32+
33+
$$\sum_{0}^{2}x$$
34+
35+
$$\bigcup_{n=1}^{m}\left(X_{n} \cap Y_{n}\right)$$
36+
37+
$$\prod_{k=1}^{n}A_{k}$$
38+
39+
$$\bigwedge_{}^{}x$$
40+
41+
$$\int_{}^{}(2x+1)dx$$
42+
43+
$$\iint_{0}^{1}xdx$$
44+
45+
$$\iiint_{}^{}ydy$$
46+
47+
$$\oint_{}^{}\frac{dy}{dx}$$
48+
49+
$$\oiint_{0}^{2 \pi }idt$$
50+
51+
$$\oiiint_{C}^{}\frac{1}{z}dz$$

0 commit comments

Comments
 (0)