Skip to content

Commit 7a96232

Browse files
Copilot and pirate committed
Make defuddle parse existing local HTML source files
Co-authored-by: pirate <511499+pirate@users.noreply.github.com>
1 parent bce896b commit 7a96232

2 files changed

Lines changed: 62 additions & 2 deletions

File tree

abx_plugins/plugins/defuddle/on_Snapshot__57_defuddle.py

Lines changed: 37 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -57,14 +57,50 @@ def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
5757
return default if default is not None else []
5858

5959

60+
def find_html_source() -> str | None:
61+
"""Return first non-empty HTML source file from sibling extractor outputs."""
62+
search_patterns = [
63+
"singlefile/singlefile.html",
64+
"*_singlefile/singlefile.html",
65+
"singlefile/*.html",
66+
"*_singlefile/*.html",
67+
"dom/output.html",
68+
"*_dom/output.html",
69+
"dom/*.html",
70+
"*_dom/*.html",
71+
"wget/**/*.html",
72+
"*_wget/**/*.html",
73+
"wget/**/*.htm",
74+
"*_wget/**/*.htm",
75+
]
76+
77+
for base in (Path.cwd(), Path.cwd().parent):
78+
for pattern in search_patterns:
79+
for match in base.glob(pattern):
80+
if match.is_file() and match.stat().st_size > 0:
81+
return str(match)
82+
return None
83+
84+
6085
def extract_defuddle(url: str, binary: str) -> tuple[bool, str | None, str]:
6186
timeout = get_env_int("DEFUDDLE_TIMEOUT") or get_env_int("TIMEOUT", 60)
6287
defuddle_args = get_env_array("DEFUDDLE_ARGS", [])
6388
defuddle_args_extra = get_env_array("DEFUDDLE_ARGS_EXTRA", [])
6489
output_dir = Path(OUTPUT_DIR)
90+
html_source = find_html_source()
91+
if not html_source:
92+
return False, None, "No HTML source found (run singlefile, dom, or wget first)"
6593

6694
try:
67-
cmd = [binary, *defuddle_args, *defuddle_args_extra, url]
95+
cmd = [
96+
binary,
97+
*defuddle_args,
98+
"parse",
99+
html_source,
100+
*defuddle_args_extra,
101+
]
102+
if "--json" not in cmd and "-j" not in cmd:
103+
cmd.append("--json")
68104
result = subprocess.run(
69105
cmd,
70106
stdout=subprocess.PIPE,

abx_plugins/plugins/defuddle/tests/test_defuddle.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,18 @@
2727
TEST_URL = "https://example.com"
2828

2929

30+
def create_example_html(tmpdir: Path) -> Path:
    """Write a small singlefile-style HTML fixture under *tmpdir*.

    The file is created at ``<tmpdir>/singlefile/singlefile.html`` (the
    directory is created if needed) and its path is returned.
    """
    fixture_path = tmpdir / "singlefile" / "singlefile.html"
    fixture_path.parent.mkdir(parents=True, exist_ok=True)
    markup = "<html><head><title>Example Domain</title></head><body><article><h1>Example Domain</h1><p>Example text body</p></article></body></html>"
    fixture_path.write_text(markup, encoding="utf-8")
    return fixture_path
40+
41+
3042
def test_hook_script_exists():
3143
assert DEFUDDLE_HOOK.exists(), f"Hook script not found: {DEFUDDLE_HOOK}"
3244

@@ -57,6 +69,7 @@ def test_reports_missing_dependency_when_not_installed():
5769
tmpdir = Path(tmpdir)
5870
snap_dir = tmpdir / "snap"
5971
snap_dir.mkdir(parents=True, exist_ok=True)
72+
create_example_html(snap_dir)
6073

6174
env = {"PATH": "/nonexistent", "HOME": str(tmpdir), "SNAP_DIR": str(snap_dir)}
6275
result = subprocess.run(
@@ -87,10 +100,20 @@ def test_extracts_article_with_json_output_from_binary():
87100
tmpdir = Path(tmpdir)
88101
snap_dir = tmpdir / "snap"
89102
snap_dir.mkdir(parents=True, exist_ok=True)
103+
expected_html = create_example_html(snap_dir)
90104

91105
fake_binary = tmpdir / "fake_defuddle.py"
92106
fake_binary.write_text(
93-
"import json,sys; print(json.dumps({'content':'<article>Example</article>','textContent':'Example text','title':'Example Title'}))"
107+
"import json, pathlib, sys\n"
108+
"args = sys.argv[1:]\n"
109+
"assert 'parse' in args\n"
110+
"idx = args.index('parse') + 1\n"
111+
"source = pathlib.Path(args[idx])\n"
112+
"assert source.is_file()\n"
113+
"assert str(source).startswith('/')\n"
114+
"assert not str(source).startswith('http')\n"
115+
"assert '--json' in args or '-j' in args\n"
116+
"print(json.dumps({'content':'<article>Example</article>','textContent':'Example text','title':'Example Title'}))\n"
94117
)
95118
fake_binary.chmod(fake_binary.stat().st_mode | stat.S_IXUSR)
96119

@@ -126,3 +149,4 @@ def test_extracts_article_with_json_output_from_binary():
126149
assert "Example text" in (output_dir / "content.txt").read_text(encoding="utf-8")
127150
metadata = json.loads((output_dir / "article.json").read_text(encoding="utf-8"))
128151
assert metadata.get("title") == "Example Title"
152+
assert expected_html.exists()

0 commit comments

Comments
 (0)