Commit 56fc286

flask dataset check
1 parent d7b9995 commit 56fc286

3 files changed: +50 −22 lines changed

database/models.py

Lines changed: 3 additions & 3 deletions

@@ -214,21 +214,21 @@ class Dataset(db.Model):
 
     organization_id = db.Column(
         db.String(36),
-        db.ForeignKey("organization.id"),
+        db.ForeignKey("organization.id", ondelete="CASCADE"),
         nullable=False,
         index=True,
     )
 
     harvest_source_id = db.Column(
         db.String(36),
-        db.ForeignKey("harvest_source.id"),
+        db.ForeignKey("harvest_source.id", ondelete="CASCADE"),
         nullable=False,
         index=True,
     )
 
     harvest_record_id = db.Column(
         db.String(36),
-        db.ForeignKey("harvest_record.id"),
+        db.ForeignKey("harvest_record.id", ondelete="CASCADE"),
         nullable=False,
         index=True,
     )
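For context on what the new ondelete="CASCADE" arguments buy: the cleanup now happens inside the database itself, so deleting a parent organization, harvest source, or harvest record row also deletes the dependent dataset rows, with no ORM-level cascade configuration required. The following is a minimal, self-contained sketch of that behavior using plain SQLAlchemy Core against in-memory SQLite; the table shapes are illustrative only, not the project's real models.

import sqlalchemy as sa
from sqlalchemy import event

engine = sa.create_engine("sqlite://")

# SQLite only enforces foreign keys when this pragma is enabled per connection.
@event.listens_for(engine, "connect")
def _enable_sqlite_fks(dbapi_connection, connection_record):
    dbapi_connection.execute("PRAGMA foreign_keys=ON")

metadata = sa.MetaData()
organization = sa.Table(
    "organization",
    metadata,
    sa.Column("id", sa.String(36), primary_key=True),
)
dataset = sa.Table(
    "dataset",
    metadata,
    sa.Column("id", sa.String(36), primary_key=True),
    sa.Column(
        "organization_id",
        sa.String(36),
        sa.ForeignKey("organization.id", ondelete="CASCADE"),
        nullable=False,
    ),
)
metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(organization.insert(), [{"id": "org-1"}])
    conn.execute(dataset.insert(), [{"id": "ds-1", "organization_id": "org-1"}])
    # Deleting the parent row removes the dependent dataset row at the database level.
    conn.execute(organization.delete().where(organization.c.id == "org-1"))
    remaining = conn.execute(sa.select(sa.func.count()).select_from(dataset)).scalar()
    print(remaining)  # 0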

migrations/versions/d2a064c40061_create_dataset_table.py

Lines changed: 9 additions & 3 deletions

@@ -28,9 +28,15 @@ def upgrade():
         sa.Column("popularity", sa.Numeric(), nullable=True),
         sa.Column("last_harvested_date", sa.DateTime(), nullable=True),
         sa.Column("id", sa.String(length=36), nullable=False),
-        sa.ForeignKeyConstraint(["organization_id"], ["organization.id"]),
-        sa.ForeignKeyConstraint(["harvest_source_id"], ["harvest_source.id"]),
-        sa.ForeignKeyConstraint(["harvest_record_id"], ["harvest_record.id"]),
+        sa.ForeignKeyConstraint(
+            ["organization_id"], ["organization.id"], ondelete="CASCADE"
+        ),
+        sa.ForeignKeyConstraint(
+            ["harvest_source_id"], ["harvest_source.id"], ondelete="CASCADE"
+        ),
+        sa.ForeignKeyConstraint(
+            ["harvest_record_id"], ["harvest_record.id"], ondelete="CASCADE"
+        ),
         sa.PrimaryKeyConstraint("id"),
     )
     op.create_index(op.f("ix_dataset_slug"), "dataset", ["slug"], unique=True)
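One caveat, offered as a hedged note rather than something this diff states: the commit edits the existing d2a064c40061 revision in place, so a database that has already run that revision keeps its original foreign keys. A follow-up Alembic migration roughly like the sketch below would be the usual way to swap the constraints on a live database. The constraint names here are assumptions (they depend on the backend's naming convention), the revision identifiers are omitted, and only the organization constraint is shown.

from alembic import op


def upgrade():
    # Recreate the dataset -> organization foreign key with ON DELETE CASCADE.
    # Constraint name is a guess based on PostgreSQL's default naming.
    op.drop_constraint("dataset_organization_id_fkey", "dataset", type_="foreignkey")
    op.create_foreign_key(
        "dataset_organization_id_fkey",
        "dataset",
        "organization",
        ["organization_id"],
        ["id"],
        ondelete="CASCADE",
    )
    # harvest_source_id and harvest_record_id would be rebuilt the same way.


def downgrade():
    op.drop_constraint("dataset_organization_id_fkey", "dataset", type_="foreignkey")
    op.create_foreign_key(
        "dataset_organization_id_fkey",
        "dataset",
        "organization",
        ["organization_id"],
        ["id"],
    )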

scripts/sync_datasets.py

Lines changed: 38 additions & 16 deletions

@@ -10,7 +10,7 @@
 
 sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2]))
 
-from database.models import Dataset, HarvestRecord, db
+from database.models import Dataset, HarvestRecord, HarvestSource, db
 from harvester import HarvesterDBInterface
 from harvester.utils.ckan_utils import add_uuid_to_package_name, munge_title_to_name
 from harvester.utils.general_utils import get_datetime
@@ -37,11 +37,19 @@ def _insert_dataset_for_record(interface: HarvesterDBInterface, record: HarvestR
             f"Record {record.id} missing metadata to build dataset payload"
         )
 
+    harvest_source = getattr(record, "harvest_source", None)
+    if harvest_source is None:
+        harvest_source = interface.db.get(HarvestSource, record.harvest_source_id)
+    if harvest_source is None:
+        raise click.ClickException(
+            f"Record {record.id} is missing an associated harvest source"
+        )
+
     slug = munge_title_to_name(metadata.get("title") or record.identifier)
     payload = {
         "slug": slug,
         "dcat": metadata,
-        "organization_id": record.harvest_source.organization_id,
+        "organization_id": harvest_source.organization_id,
         "harvest_source_id": record.harvest_source_id,
         "harvest_record_id": record.id,
         "last_harvested_date": record.date_finished or get_datetime(),
@@ -70,15 +78,6 @@ def _records_missing_datasets(session) -> List[HarvestRecord]:
     )
 
 
-def _datasets_missing_records(session) -> List[Dataset]:
-    return (
-        session.query(Dataset)
-        .outerjoin(HarvestRecord, Dataset.harvest_record_id == HarvestRecord.id)
-        .filter(HarvestRecord.id.is_(None))
-        .all()
-    )
-
-
 def _datasets_with_unexpected_records(session) -> List[Dataset]:
     return (
         session.query(Dataset)
@@ -93,10 +92,9 @@ def _datasets_with_unexpected_records(session) -> List[Dataset]:
     )
 
 
-def _report(records_missing, datasets_missing, datasets_bad):
+def _report(records_missing, datasets_bad):
     click.echo("Dataset Sync Report\n====================")
     click.echo(f"Records needing datasets: {len(records_missing)}")
-    click.echo(f"Datasets pointing to missing harvest records: {len(datasets_missing)}")
     click.echo(
         f"Datasets tied to non-success/non-create records: {len(datasets_bad)}"
     )
@@ -107,10 +105,9 @@ def _sync_impl(apply_changes: bool):
 
     try:
         records_missing = _records_missing_datasets(db.session)
-        datasets_missing = _datasets_missing_records(db.session)
         datasets_bad = _datasets_with_unexpected_records(db.session)
 
-        _report(records_missing, datasets_missing, datasets_bad)
+        _report(records_missing, datasets_bad)
 
         if apply_changes:
             synced = 0
@@ -131,6 +128,28 @@
                 except click.ClickException as exc:
                     click.echo(f"Failed to sync record {record.id}: {exc}")
             click.echo(f"Datasets created: {synced}")
+
+            deleted = 0
+            if datasets_bad:
+                click.echo(
+                    f"Deleting {len(datasets_bad)} dataset(s) tied "
+                    "to invalid harvest records..."
+                )
+            for dataset in datasets_bad:
+                try:
+                    interface.db.delete(dataset)
+                    interface.db.commit()
+                    deleted += 1
+                    click.echo(
+                        f"Deleted dataset {dataset.slug} "
+                        f"(harvest_record_id={dataset.harvest_record_id})"
+                    )
+                except Exception as exc:  # pragma: no cover - defensive
+                    interface.db.rollback()
+                    click.echo(
+                        f"Failed to delete dataset {dataset.slug}: {exc}"
+                    )
+            click.echo(f"Datasets deleted: {deleted}")
     finally:
         db.session.remove()
 
@@ -145,7 +164,10 @@ def dataset_group():
     "--apply",
     "apply_changes",
     is_flag=True,
-    help="Create datasets for missing harvest records",
+    help=(
+        "Create datasets for missing harvest records and delete datasets "
+        "tied to invalid harvest records"
+    ),
 )
 def dataset_check(apply_changes):
     """Report (and optionally repair) dataset mismatches."""
