Skip to content

Commit 6c791ff

Browse files
author
ashariyar
committed
Broaden exception handling in FontInfo extraction
1 parent 59d882d commit 6c791ff

File tree

5 files changed

+23
-5
lines changed

5 files changed

+23
-5
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# NEXT RELEASE
2+
* Broaden exception handling in `FontInfo` extraction
23

34
### 1.17.8
45
* Handle `AttributeError` in `FontInfo` extraction

pdfalyzer/decorators/pdf_file.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,8 +173,8 @@ def extract_text(
173173
except EmptyFileError:
174174
log.warning("Skipping empty file!")
175175
except PdfStreamError as e:
176-
print_error(f"Error parsing PDF file '{self.file_path}': {e}")
177176
stderr_console.print_exception()
177+
print_error(f"Error parsing PDF file '{self.file_path}': {e}")
178178

179179
return "\n\n".join(extracted_pages).strip()
180180

pdfalyzer/helpers/rich_text_helper.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,4 +106,5 @@ def attention_getting_panel(text: Text, title: str, style: str = 'white on red')
106106

107107

108108
def print_error(text: Union[str, Text]) -> Text:
109+
console.line()
109110
console.print(error_text(text))

pdfalyzer/output/pdfalyzer_presenter.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,27 @@
2020
from pdfalyzer.config import PdfalyzerConfig
2121
from pdfalyzer.decorators.pdf_tree_node import DECODE_FAILURE_LEN
2222
from pdfalyzer.detection.yaralyzer_helper import get_bytes_yaralyzer, get_file_yaralyzer
23+
from pdfalyzer.helpers.rich_text_helper import print_error
2324
from pdfalyzer.helpers.string_helper import pp
2425
from pdfalyzer.output.layout import (print_fatal_error_panel, print_section_header, print_section_subheader,
2526
print_section_sub_subheader)
2627
from pdfalyzer.output.tables.decoding_stats_table import build_decoding_stats_table
2728
from pdfalyzer.output.tables.pdf_node_rich_table import generate_rich_tree, get_symlink_representation
2829
from pdfalyzer.output.tables.stream_objects_table import stream_objects_table
2930
from pdfalyzer.pdfalyzer import Pdfalyzer
30-
# from pdfalyzer.util.adobe_strings import *
3131

3232
INTERNAL_YARA_ERROR_MSG = "Internal YARA error! YARA's error codes can be checked here: https://github.com/VirusTotal/yara/blob/master/libyara/include/yara/error.h" # noqa: E501
3333

3434

3535
class PdfalyzerPresenter:
36+
"""
37+
Handles formatting of console text output for Pdfalyzer class.
38+
39+
Attributes:
40+
pdfalyzer (Pdfalyzer): Pdfalyzer for a given PDF file
41+
yaralyzer (Yaralyzer): Yaralyzer for a given PDF file
42+
"""
43+
3644
def __init__(self, pdfalyzer: Pdfalyzer):
3745
self.pdfalyzer = pdfalyzer
3846
self.yaralyzer = get_file_yaralyzer(self.pdfalyzer.pdf_path)
@@ -83,6 +91,9 @@ def print_font_info(self, font_idnum=None) -> None:
8391
"""Print information about all fonts that appear in this PDF."""
8492
print_section_header(f'{len(self.pdfalyzer.font_infos)} fonts found in {self.pdfalyzer.pdf_basename}')
8593

94+
if self.pdfalyzer.font_info_extraction_error:
95+
print_error(f"Failed to extract font information (error: {self.pdfalyzer.font_info_extraction_error})")
96+
8697
for font_info in [fi for fi in self.pdfalyzer.font_infos if font_idnum is None or font_idnum == fi.idnum]:
8798
font_info.print_summary()
8899

pdfalyzer/pdfalyzer.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode
2020
from pdfalyzer.decorators.pdf_tree_verifier import PdfTreeVerifier
2121
from pdfalyzer.font_info import FontInfo
22+
from pdfalyzer.helpers.rich_text_helper import print_error
2223
from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
2324
from pdfalyzer.util.adobe_strings import *
2425
from pdfalyzer.util.exceptions import PdfWalkError
@@ -37,6 +38,7 @@ class Pdfalyzer:
3738
3839
Attributes:
3940
font_infos (List[FontInfo]): Font summary objects
41+
font_info_extraction_error (Optional[Exception]): Error encountered extracting FontInfo (if any)
4042
max_generation (int): Max revision number ("generation") encountered in this PDF.
4143
nodes_encountered (Dict[int, PdfTreeNode]): Nodes we've traversed already.
4244
pdf_basename (str): The base name of the PDF file (with extension).
@@ -70,6 +72,7 @@ def __init__(self, pdf_path: str):
7072

7173
# Initialize tracking variables
7274
self.font_infos: List[FontInfo] = [] # Font summary objects
75+
self.font_info_extraction_error: Optional[Exception] = None
7376
self.max_generation = 0 # PDF revisions are "generations"; this is the max generation encountered
7477
self.nodes_encountered: Dict[int, PdfTreeNode] = {} # Nodes we've seen already
7578
self._indeterminate_ids = set() # See INDETERMINATE_REF_KEYS comment
@@ -231,9 +234,11 @@ def _extract_font_infos(self) -> None:
231234
fi for fi in FontInfo.extract_font_infos(node.obj)
232235
if fi.idnum not in known_font_ids
233236
]
234-
except AttributeError as e:
235-
console.print_exception()
236-
log.error(f"Failed to extract font information from node: {node}")
237+
except Exception as e:
238+
self.font_info_extraction_error = e
239+
console.line()
240+
log.warning(f"Failed to extract font information from node: {node} (error: {e})")
241+
console.line()
237242

238243
def _build_or_find_node(self, relationship: IndirectObject, relationship_key: str) -> PdfTreeNode:
239244
"""If node in self.nodes_encountered already then return it, otherwise build a node and store it."""

0 commit comments

Comments
 (0)