Skip to content

Commit ed4ce8e

Browse files
committed
refactor: standardize parameter naming from generateIDs to generate_ids and update documentation
1 parent 2b0a245 commit ed4ce8e

4 files changed

Lines changed: 42 additions & 39 deletions

File tree

Readme.md

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -134,13 +134,13 @@ grobid_client [OPTIONS] SERVICE
134134

135135
| Option | Description |
136136
|------------------------------|-------------------------------------------|
137-
| `--generateIDs` | Generate random XML IDs |
137+
| `--generate_ids` | Generate random XML IDs |
138138
| `--consolidate_header` | Consolidate header metadata |
139139
| `--consolidate_citations` | Consolidate bibliographic references |
140140
| `--include_raw_citations` | Include raw citation text |
141141
| `--include_raw_affiliations` | Include raw affiliation text |
142-
| `--teiCoordinates` | Add PDF coordinates to XML |
143-
| `--segmentSentences` | Segment sentences with coordinates |
142+
| `--tei_coordinates` | Add PDF coordinates to XML |
143+
| `--segment_sentences` | Segment sentences with coordinates |
144144
| `--flavor` | Processing flavor for fulltext extraction |
145145
| `--json` | Convert TEI output to JSON format |
146146
| `--markdown` | Convert TEI output to Markdown format |
@@ -153,7 +153,7 @@ grobid_client [OPTIONS] SERVICE
153153
grobid_client --input ~/documents --output ~/results processFulltextDocument
154154

155155
# High concurrency with coordinates
156-
grobid_client --input ~/pdfs --output ~/tei --n 20 --teiCoordinates processFulltextDocument
156+
grobid_client --input ~/pdfs --output ~/tei --n 20 --tei_coordinates processFulltextDocument
157157

158158
# Process with JSON output
159159
grobid_client --input ~/pdfs --output ~/results --json processFulltextDocument
@@ -165,7 +165,7 @@ grobid_client --input ~/pdfs --output ~/results --markdown processFulltextDocume
165165
grobid_client --server https://grobid.example.com --input ~/citations.txt processCitationList
166166

167167
# Force reprocessing with sentence segmentation and JSON output
168-
grobid_client --input ~/docs --force --segmentSentences --json processFulltextDocument
168+
grobid_client --input ~/docs --force --segment_sentences --json processFulltextDocument
169169
```
170170

171171
### Python Library
@@ -202,10 +202,10 @@ client.process(
202202
input_path="/path/to/pdfs",
203203
output="/path/to/output",
204204
n=10,
205-
generateIDs=True,
205+
generate_ids=True,
206206
consolidate_header=True,
207-
teiCoordinates=True,
208-
segmentSentences=True
207+
tei_coordinates=True,
208+
segment_sentences=True
209209
)
210210

211211
# Process with JSON output
@@ -454,7 +454,7 @@ When using the `--json` flag, the client converts TEI XML output to a structured
454454
grobid_client --input pdfs/ --output results/ --json processFulltextDocument
455455

456456
# JSON output with coordinates and sentence segmentation
457-
grobid_client --input pdfs/ --output results/ --json --teiCoordinates --segmentSentences processFulltextDocument
457+
grobid_client --input pdfs/ --output results/ --json --tei_coordinates --segment_sentences processFulltextDocument
458458
```
459459

460460
```python
@@ -535,7 +535,7 @@ Competing interests statement...
535535
grobid_client --input pdfs/ --output results/ --markdown processFulltextDocument
536536

537537
# Markdown output with coordinates and sentence segmentation
538-
grobid_client --input pdfs/ --output results/ --markdown --teiCoordinates --segmentSentences processFulltextDocument
538+
grobid_client --input pdfs/ --output results/ --markdown --tei_coordinates --segment_sentences processFulltextDocument
539539
```
540540

541541
```python

grobid_client/grobid_client.py

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import time
2121
import concurrent.futures
2222
import ntpath
23+
import re
2324
import requests
2425
import pathlib
2526
import logging
@@ -209,7 +210,6 @@ def _parse_file_size(self, size_str):
209210
size_str = str(size_str).upper().strip()
210211

211212
# Extract number and unit
212-
import re
213213
match = re.match(r'(\d+(?:\.\d+)?)\s*([KMGT]?B?)', size_str)
214214
if not match:
215215
return 10 * 1024 * 1024 # Default 10MB
@@ -329,7 +329,7 @@ def process(
329329
input_path,
330330
output=None,
331331
n=10,
332-
generateIDs=False,
332+
generate_ids=False,
333333
consolidate_header=True,
334334
consolidate_citations=False,
335335
include_raw_citations=False,
@@ -391,7 +391,7 @@ def process(
391391
input_path,
392392
output,
393393
n,
394-
generateIDs,
394+
generate_ids,
395395
consolidate_header,
396396
consolidate_citations,
397397
include_raw_citations,
@@ -417,7 +417,7 @@ def process(
417417
input_path,
418418
output,
419419
n,
420-
generateIDs,
420+
generate_ids,
421421
consolidate_header,
422422
consolidate_citations,
423423
include_raw_citations,
@@ -455,7 +455,7 @@ def process_batch(
455455
input_path,
456456
output,
457457
n,
458-
generateIDs,
458+
generate_ids,
459459
consolidate_header,
460460
consolidate_citations,
461461
include_raw_citations,
@@ -542,7 +542,7 @@ def process_batch(
542542
selected_process,
543543
service,
544544
input_file,
545-
generateIDs,
545+
generate_ids,
546546
consolidate_header,
547547
consolidate_citations,
548548
include_raw_citations,
@@ -639,7 +639,7 @@ def process_pdf(
639639
self,
640640
service,
641641
pdf_file,
642-
generateIDs,
642+
generate_ids,
643643
consolidate_header,
644644
consolidate_citations,
645645
include_raw_citations,
@@ -667,7 +667,7 @@ def process_pdf(
667667

668668
# set the GROBID parameters
669669
the_data = {}
670-
if generateIDs:
670+
if generate_ids:
671671
the_data["generateIDs"] = "1"
672672
if consolidate_header:
673673
the_data["consolidateHeader"] = "1"
@@ -699,7 +699,7 @@ def process_pdf(
699699
self.process_pdf,
700700
service,
701701
pdf_file,
702-
generateIDs,
702+
generate_ids,
703703
consolidate_header,
704704
consolidate_citations,
705705
include_raw_citations,
@@ -734,7 +734,7 @@ def process_txt(
734734
self,
735735
service,
736736
txt_file,
737-
generateIDs,
737+
generate_ids,
738738
consolidate_header,
739739
consolidate_citations,
740740
include_raw_citations,
@@ -777,7 +777,7 @@ def process_txt(
777777
self.process_txt,
778778
service,
779779
txt_file,
780-
generateIDs,
780+
generate_ids,
781781
consolidate_header,
782782
consolidate_citations,
783783
include_raw_citations,
@@ -840,7 +840,8 @@ def main():
840840
help="concurrency for service usage"
841841
)
842842
parser.add_argument(
843-
"--generateIDs",
843+
"--generate_ids", "--generateIDs",
844+
dest="generate_ids",
844845
action="store_true",
845846
help="generate random xml:id to textual XML elements of the result files",
846847
)
@@ -870,12 +871,14 @@ def main():
870871
help="force re-processing pdf input files when tei output files already exist",
871872
)
872873
parser.add_argument(
873-
"--teiCoordinates",
874+
"--tei_coordinates", "--teiCoordinates",
875+
dest="tei_coordinates",
874876
action="store_true",
875877
help="add the original PDF coordinates (bounding boxes) to the extracted elements",
876878
)
877879
parser.add_argument(
878-
"--segmentSentences",
880+
"--segment_sentences", "--segmentSentences",
881+
dest="segment_sentences",
879882
action="store_true",
880883
help="segment sentences in the text content of the document with additional <s> elements",
881884
)
@@ -951,14 +954,14 @@ def main():
951954
exit(1)
952955

953956
service = args.service
954-
generateIDs = args.generateIDs
957+
generate_ids = args.generate_ids
955958
consolidate_header = args.consolidate_header
956959
consolidate_citations = args.consolidate_citations
957960
include_raw_citations = args.include_raw_citations
958961
include_raw_affiliations = args.include_raw_affiliations
959962
force = args.force
960-
tei_coordinates = args.teiCoordinates
961-
segment_sentences = args.segmentSentences
963+
tei_coordinates = args.tei_coordinates
964+
segment_sentences = args.segment_sentences
962965
verbose = args.verbose
963966

964967
if service is None or service not in valid_services:
@@ -973,7 +976,7 @@ def main():
973976
input_path,
974977
output=output_path,
975978
n=n,
976-
generateIDs=generateIDs,
979+
generate_ids=generate_ids,
977980
consolidate_header=consolidate_header,
978981
consolidate_citations=consolidate_citations,
979982
include_raw_citations=include_raw_citations,

tests/test_grobid_client.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def test_process_pdf_success(self, mock_post, mock_file):
284284
result = client.process_pdf(
285285
'processFulltextDocument',
286286
'/test/document.pdf',
287-
generateIDs=True,
287+
generate_ids=True,
288288
consolidate_header=True,
289289
consolidate_citations=False,
290290
include_raw_citations=False,
@@ -308,7 +308,7 @@ def test_process_pdf_file_not_found(self, mock_file):
308308
result = client.process_pdf(
309309
'processFulltextDocument',
310310
'/nonexistent/document.pdf',
311-
generateIDs=False,
311+
generate_ids=False,
312312
consolidate_header=False,
313313
consolidate_citations=False,
314314
include_raw_citations=False,
@@ -335,7 +335,7 @@ def test_process_txt_success(self, mock_post, mock_file):
335335
result = client.process_txt(
336336
'processCitationList',
337337
'/test/references.txt',
338-
generateIDs=False,
338+
generate_ids=False,
339339
consolidate_header=False,
340340
consolidate_citations=True,
341341
include_raw_citations=True,
@@ -371,7 +371,7 @@ def test_process_pdf_server_busy_retry(self, mock_post):
371371
result = client.process_pdf(
372372
'processFulltextDocument',
373373
'/test/document.pdf',
374-
generateIDs=False,
374+
generate_ids=False,
375375
consolidate_header=False,
376376
consolidate_citations=False,
377377
include_raw_citations=False,
@@ -412,7 +412,7 @@ def test_process_batch(self, mock_isfile, mock_executor):
412412
'/test',
413413
'/output',
414414
n=2,
415-
generateIDs=False,
415+
generate_ids=False,
416416
consolidate_header=False,
417417
consolidate_citations=False,
418418
include_raw_citations=False,
@@ -493,7 +493,7 @@ def test_process_batch_empty_input_files(self, mock_configure_logging, mock_test
493493
input_path='/test',
494494
output='/output',
495495
n=1,
496-
generateIDs=False,
496+
generate_ids=False,
497497
consolidate_header=False,
498498
consolidate_citations=False,
499499
include_raw_citations=False,
@@ -535,7 +535,7 @@ def test_process_txt_unicode_error(self, mock_configure_logging, mock_test_serve
535535
result = client.process_txt(
536536
'processCitationList',
537537
'/test/references.txt',
538-
generateIDs=False,
538+
generate_ids=False,
539539
consolidate_header=False,
540540
consolidate_citations=False,
541541
include_raw_citations=False,
@@ -585,7 +585,7 @@ def test_process_pdf_timeout_error(self, mock_configure_logging, mock_test_serve
585585
result = client.process_pdf(
586586
'processFulltextDocument',
587587
'/test/document.pdf',
588-
generateIDs=False,
588+
generate_ids=False,
589589
consolidate_header=False,
590590
consolidate_citations=False,
591591
include_raw_citations=False,
@@ -600,7 +600,7 @@ def test_process_pdf_timeout_error(self, mock_configure_logging, mock_test_serve
600600
result = client.process_pdf(
601601
'processFulltextDocument',
602602
'/test/document.pdf',
603-
generateIDs=False,
603+
generate_ids=False,
604604
consolidate_header=False,
605605
consolidate_citations=False,
606606
include_raw_citations=False,

tests/test_integration.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def test_batch_processing(self):
171171
'/test',
172172
'/output',
173173
n=2,
174-
generateIDs=False,
174+
generate_ids=False,
175175
consolidate_header=False,
176176
consolidate_citations=False,
177177
include_raw_citations=False,
@@ -332,7 +332,7 @@ def test_concurrent_processing_stress(self):
332332
'/test',
333333
'/output',
334334
n=5, # 5 concurrent threads
335-
generateIDs=False,
335+
generate_ids=False,
336336
consolidate_header=False,
337337
consolidate_citations=False,
338338
include_raw_citations=False,

0 commit comments

Comments
 (0)