Skip to content

Commit dbf7718

Browse files
committed
fixup! Drop disallowed control characters and strip blank characters
1 parent 6ffd72c commit dbf7718

File tree

2 files changed

+10
-13
lines changed

2 files changed

+10
-13
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
### Added
1111

1212
- Add utility function to compute ZIM Tags #164, including deduplication #156
13+
- Metadata now automatically drops disallowed control characters and strips leading/trailing blank characters #159
1314

1415
### Fixed
1516

src/zimscraperlib/zim/creator.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@
6767
)
6868

6969
# All control characters are disallowed in str metadata except \n, \r and \t
70-
CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}")
70+
UNWANTED_CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}")
7171

7272

7373
def mimetype_for(
@@ -256,10 +256,9 @@ def add_metadata(
256256
):
257257
# drop unwanted control characters and strip blank characters before passing content to libzim
258258
if isinstance(content, str):
259-
if CONTROL_CHARACTERS_REGEX.search(content):
260-
content = CONTROL_CHARACTERS_REGEX.sub("", content).strip(" \r\n\t")
261-
else:
262-
content = content.strip(" \r\n\t")
259+
content = UNWANTED_CONTROL_CHARACTERS_REGEX.sub("", content).strip(
260+
" \r\n\t"
261+
)
263262
if not self.disable_metadata_checks:
264263
self.validate_metadata(name, content)
265264
if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)):
@@ -314,16 +313,13 @@ def config_metadata(
314313
}
315314
)
316315
self._metadata.update(extras)
317-
for k, v in self._metadata.items():
316+
for metadata_key, metadata_value in self._metadata.items():
318317
# drop control characters so that proper value is stored in memory and
319318
# logged in DEBUG mode ; also strip blank characters
320-
if isinstance(v, str):
321-
if CONTROL_CHARACTERS_REGEX.search(v):
322-
self._metadata[k] = CONTROL_CHARACTERS_REGEX.sub("", v).strip(
323-
" \r\n\t"
324-
)
325-
else:
326-
self._metadata[k] = v.strip(" \r\n\t")
319+
if isinstance(metadata_value, str):
320+
self._metadata[metadata_key] = UNWANTED_CONTROL_CHARACTERS_REGEX.sub(
321+
"", metadata_value
322+
).strip(" \r\n\t")
327323
return self
328324

329325
def config_dev_metadata(self, **extras: str):

0 commit comments

Comments
 (0)