Commit 983115b

add check for forced harvest; add test to verify the skip is done
1 parent 36065c2

File tree

2 files changed: +131 -4 lines changed

harvester/harvest.py

Lines changed: 7 additions & 3 deletions

@@ -43,8 +43,8 @@
     download_file,
     find_indexes_for_duplicates,
     get_datetime,
-    munge_title_to_name,
     make_record_mapping,
+    munge_title_to_name,
     open_json,
     prepare_transform_msg,
     send_email_to_recipients,
@@ -454,7 +454,10 @@ def run_full_harvest(self) -> None:
         self.determine_internal_deletions()
         internal_records_to_delete = self.iter_internal_records_to_be_deleted()

-        if self.source_type in ["waf", "waf-collection"]:
+        if (
+            self.source_type in ["waf", "waf-collection"]
+            and self.job_type != "force_harvest"
+        ):
             self.filter_waf_files_by_datetime()

         self.filter_duplicate_identifiers()
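
Read in isolation, the new guard's effect is easy to sketch. The snippet below is illustrative only, assuming job_type is either "harvest" or "force_harvest"; SourceSketch stands in for the real HarvestSource:

# Minimal sketch of the new guard; the class scaffolding is illustrative only.
class SourceSketch:
    def __init__(self, source_type: str, job_type: str):
        self.source_type = source_type
        self.job_type = job_type
        self.filtered = False

    def filter_waf_files_by_datetime(self) -> None:
        self.filtered = True

    def run_full_harvest(self) -> None:
        # WAF-style sources normally skip files older than the last harvest;
        # a force_harvest job bypasses the filter so every file is reprocessed.
        if (
            self.source_type in ["waf", "waf-collection"]
            and self.job_type != "force_harvest"
        ):
            self.filter_waf_files_by_datetime()

regular = SourceSketch("waf", "harvest")
regular.run_full_harvest()
forced = SourceSketch("waf", "force_harvest")
forced.run_full_harvest()
assert regular.filtered and not forced.filtered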
@@ -1051,7 +1054,8 @@ def _insert_dataset_with_unique_slug(self, dataset_payload: dict) -> None:
             raise

         logger.info(
-            "Dataset slug '%s' already exists; generating a new slug", self.dataset_slug
+            "Dataset slug '%s' already exists; generating a new slug",
+            self.dataset_slug,
         )
         self.dataset_slug = add_uuid_to_package_name(self.dataset_slug)
         dataset_payload["slug"] = self.dataset_slug
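
The logging change above is a pure reflow, but the fallback it sits in is worth a sketch. Assuming add_uuid_to_package_name appends a short uuid suffix (an assumption; the real helper lives in the harvester utilities and may differ), the retry path looks roughly like:

# Hedged sketch of the duplicate-slug fallback; add_uuid_to_package_name's
# exact behavior is an assumption here.
import uuid

def add_uuid_to_package_name(name: str) -> str:
    return f"{name}-{uuid.uuid4().hex[:8]}"

existing_slugs = {"ocean-temps"}

def insert_with_unique_slug(payload: dict) -> dict:
    slug = payload["slug"]
    if slug in existing_slugs:
        # Same recovery path as the diff: log and regenerate rather than fail.
        slug = add_uuid_to_package_name(slug)
    existing_slugs.add(slug)
    payload["slug"] = slug
    return payload

print(insert_with_unique_slug({"slug": "ocean-temps"}))  # e.g. {'slug': 'ocean-temps-1a2b3c4d'}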

tests/integration/harvest_job_flows/test_harvest_job_force_update.py

Lines changed: 124 additions & 1 deletion

@@ -1,5 +1,9 @@
+from datetime import datetime
+import json
+from unittest.mock import Mock, patch
+
 from database.models import Dataset
-from harvester.harvest import harvest_job_starter
+from harvester.harvest import HarvestSource, harvest_job_starter


 class TestHarvestJobSync:
@@ -88,3 +92,122 @@ def test_harvest_job_force_update(
         assert set(initial_harvest_record_ids) == set(updated_harvest_record_ids)
         for slug, record_id in initial_harvest_record_ids.items():
             assert updated_harvest_record_ids[slug] != record_id
+
+    def test_waf_force_harvest_skips_datetime_filter(
+        self,
+        interface,
+        organization_data,
+        source_data_waf_iso19115_2,
+        iso19115_2_transform,
+    ):
+        """
+        Force harvest for WAF sources should skip filter_waf_files_by_datetime
+        to ensure all records are updated, not just those with newer modified dates.
+        """
+        interface.add_organization(organization_data)
+        interface.add_harvest_source(source_data_waf_iso19115_2)
+
+        # Mock external dependencies
+        mock_waf_files = [
+            {
+                "identifier": f"{source_data_waf_iso19115_2['url']}file1.xml",
+                "modified_date": datetime(2024, 1, 1),
+            },
+            {
+                "identifier": f"{source_data_waf_iso19115_2['url']}file2.xml",
+                "modified_date": datetime(2024, 1, 2),
+            },
+        ]
+
+        valid_xml = """<?xml version="1.0" encoding="UTF-8"?>
+<gmi:MI_Metadata xmlns:gmi="http://www.isotc211.org/2005/gmi" xmlns:gmd="http://www.isotc211.org/2005/gmd" xmlns:gco="http://www.isotc211.org/2005/gco">
+  <gmd:fileIdentifier>
+    <gco:CharacterString>test-file</gco:CharacterString>
+  </gmd:fileIdentifier>
+</gmi:MI_Metadata>"""
+
+        # Mock the MDTranslator response
+        mock_mdt_response = Mock()
+        mock_mdt_response.status_code = 200
+        mock_mdt_response.json.return_value = {
+            "writerOutput": json.dumps(iso19115_2_transform),
+            "readerStructureMessages": [],
+            "readerValidationMessages": [],
+        }
+
+        with patch("harvester.harvest.traverse_waf", return_value=mock_waf_files), \
+                patch("harvester.harvest.download_file", return_value=valid_xml), \
+                patch("harvester.harvest.requests.post", return_value=mock_mdt_response):
+
+            # Initial harvest
+            harvest_job = interface.add_harvest_job(
+                {
+                    "status": "new",
+                    "harvest_source_id": source_data_waf_iso19115_2["id"],
+                }
+            )
+
+            job_id = harvest_job.id
+            job_type = harvest_job.job_type
+            assert job_type == "harvest"
+            harvest_job_starter(job_id, job_type)
+
+            harvest_job = interface.get_harvest_job(job_id)
+            job_err = interface.get_harvest_job_errors_by_job(job_id)
+            record_err = interface.get_harvest_record_errors_by_job(job_id)
+
+            assert len(job_err) == 0
+            assert len(record_err) == 0
+            assert harvest_job.status == "complete"
+
+            initial_records_added = harvest_job.records_added
+            assert initial_records_added > 0
+
+            datasets_initial = interface.db.query(Dataset).all()
+            assert len(datasets_initial) == initial_records_added
+            initial_harvest_record_ids = {
+                dataset.slug: dataset.harvest_record_id for dataset in datasets_initial
+            }
+
+            # Force harvest - should skip filter_waf_files_by_datetime
+            with patch.object(
+                HarvestSource, "filter_waf_files_by_datetime"
+            ) as mock_filter:
+                harvest_job = interface.add_harvest_job(
+                    {
+                        "status": "new",
+                        "harvest_source_id": source_data_waf_iso19115_2["id"],
+                        "job_type": "force_harvest",
+                    }
+                )
+
+                job_id = harvest_job.id
+                job_type = harvest_job.job_type
+                assert job_type == "force_harvest"
+                harvest_job_starter(job_id, job_type)
+
+                # Verify the filter was NOT called
+                mock_filter.assert_not_called()
+
+            harvest_job = interface.get_harvest_job(job_id)
+
+            # Assert all records are resynced (re-fetch errors for the new job_id
+            # rather than reusing the first job's results)
+            assert len(interface.get_harvest_job_errors_by_job(job_id)) == 0
+            assert len(interface.get_harvest_record_errors_by_job(job_id)) == 0
+            assert harvest_job.status == "complete"
+            assert harvest_job.records_added == 0
+            assert harvest_job.records_deleted == 0
+            assert harvest_job.records_errored == 0
+            assert harvest_job.records_ignored == 0
+            assert harvest_job.records_total == initial_records_added
+            assert harvest_job.records_updated == initial_records_added
+            assert harvest_job.records_validated == initial_records_added
+
+            datasets_after = interface.db.query(Dataset).all()
+            assert len(datasets_after) == initial_records_added
+            updated_harvest_record_ids = {
+                dataset.slug: dataset.harvest_record_id for dataset in datasets_after
+            }
+            assert set(initial_harvest_record_ids) == set(updated_harvest_record_ids)
+            for slug, record_id in initial_harvest_record_ids.items():
+                assert updated_harvest_record_ids[slug] != record_id
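
To run only the new test (assuming the project's standard pytest setup and a configured test database; the node id is inferred from the class shown in the diff context):

pytest tests/integration/harvest_job_flows/test_harvest_job_force_update.py::TestHarvestJobSync::test_waf_force_harvest_skips_datetime_filter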
