1010
1111sys .path .insert (1 , "/" .join (os .path .realpath (__file__ ).split ("/" )[0 :- 2 ]))
1212
13- from database .models import Dataset , HarvestRecord , db
13+ from database .models import Dataset , HarvestRecord , HarvestSource , db
1414from harvester import HarvesterDBInterface
1515from harvester .utils .ckan_utils import add_uuid_to_package_name , munge_title_to_name
1616from harvester .utils .general_utils import get_datetime
@@ -37,11 +37,19 @@ def _insert_dataset_for_record(interface: HarvesterDBInterface, record: HarvestR
3737 f"Record { record .id } missing metadata to build dataset payload"
3838 )
3939
40+ harvest_source = getattr (record , "harvest_source" , None )
41+ if harvest_source is None :
42+ harvest_source = interface .db .get (HarvestSource , record .harvest_source_id )
43+ if harvest_source is None :
44+ raise click .ClickException (
45+ f"Record { record .id } is missing an associated harvest source"
46+ )
47+
4048 slug = munge_title_to_name (metadata .get ("title" ) or record .identifier )
4149 payload = {
4250 "slug" : slug ,
4351 "dcat" : metadata ,
44- "organization_id" : record . harvest_source .organization_id ,
52+ "organization_id" : harvest_source .organization_id ,
4553 "harvest_source_id" : record .harvest_source_id ,
4654 "harvest_record_id" : record .id ,
4755 "last_harvested_date" : record .date_finished or get_datetime (),
@@ -70,15 +78,6 @@ def _records_missing_datasets(session) -> List[HarvestRecord]:
7078 )
7179
7280
73- def _datasets_missing_records (session ) -> List [Dataset ]:
74- return (
75- session .query (Dataset )
76- .outerjoin (HarvestRecord , Dataset .harvest_record_id == HarvestRecord .id )
77- .filter (HarvestRecord .id .is_ (None ))
78- .all ()
79- )
80-
81-
8281def _datasets_with_unexpected_records (session ) -> List [Dataset ]:
8382 return (
8483 session .query (Dataset )
@@ -93,10 +92,9 @@ def _datasets_with_unexpected_records(session) -> List[Dataset]:
9392 )
9493
9594
96- def _report (records_missing , datasets_missing , datasets_bad ):
95+ def _report (records_missing , datasets_bad ):
9796 click .echo ("Dataset Sync Report\n ====================" )
9897 click .echo (f"Records needing datasets: { len (records_missing )} " )
99- click .echo (f"Datasets pointing to missing harvest records: { len (datasets_missing )} " )
10098 click .echo (
10199 f"Datasets tied to non-success/non-create records: { len (datasets_bad )} "
102100 )
@@ -107,10 +105,9 @@ def _sync_impl(apply_changes: bool):
107105
108106 try :
109107 records_missing = _records_missing_datasets (db .session )
110- datasets_missing = _datasets_missing_records (db .session )
111108 datasets_bad = _datasets_with_unexpected_records (db .session )
112109
113- _report (records_missing , datasets_missing , datasets_bad )
110+ _report (records_missing , datasets_bad )
114111
115112 if apply_changes :
116113 synced = 0
@@ -131,6 +128,28 @@ def _sync_impl(apply_changes: bool):
131128 except click .ClickException as exc :
132129 click .echo (f"Failed to sync record { record .id } : { exc } " )
133130 click .echo (f"Datasets created: { synced } " )
131+
132+ deleted = 0
133+ if datasets_bad :
134+ click .echo (
135+ f"Deleting { len (datasets_bad )} dataset(s) tied "
136+ "to invalid harvest records..."
137+ )
138+ for dataset in datasets_bad :
139+ try :
140+ interface .db .delete (dataset )
141+ interface .db .commit ()
142+ deleted += 1
143+ click .echo (
144+ f"Deleted dataset { dataset .slug } "
145+ f"(harvest_record_id={ dataset .harvest_record_id } )"
146+ )
147+ except Exception as exc : # pragma: no cover - defensive
148+ interface .db .rollback ()
149+ click .echo (
150+ f"Failed to delete dataset { dataset .slug } : { exc } "
151+ )
152+ click .echo (f"Datasets deleted: { deleted } " )
134153 finally :
135154 db .session .remove ()
136155
@@ -145,7 +164,10 @@ def dataset_group():
145164 "--apply" ,
146165 "apply_changes" ,
147166 is_flag = True ,
148- help = "Create datasets for missing harvest records" ,
167+ help = (
168+ "Create datasets for missing harvest records and delete datasets "
169+ "tied to invalid harvest records"
170+ ),
149171 )
150172 def dataset_check (apply_changes ):
151173 """Report (and optionally repair) dataset mismatches."""
0 commit comments