diff --git a/CHANGELOG.md b/CHANGELOG.md index c52c2ed7..aa2b2181 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fix unawaited coroutine in `stac_fastapi.core.core`. [#551](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/551) - Parse `ES_TIMEOUT` environment variable as an integer. [#556](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/556) +- Implemented "Smart Unlink" logic in delete_catalog: when cascade=False (default), collections are unlinked from the catalog and become root-level orphans if they have no other parents, rather than being deleted. When cascade=True, collections are deleted entirely. This prevents accidental data loss and supports poly-hierarchy scenarios where collections belong to multiple catalogs. [#557](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/557) +- Fixed delete_catalog to use reverse lookup query on parent_ids field instead of fragile link parsing. This ensures all collections are found and updated correctly, preventing ghost relationships where collections remain tagged with deleted catalogs, especially in large catalogs or pagination scenarios. [#557](https://github.com/stac-utils/stac-fastapi-elasticsearch-opensearch/pull/557) ### Removed diff --git a/README.md b/README.md index 53998be3..7744b7f4 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ The following organizations have contributed time and/or funding to support the ## Latest News -- **12/09/2025:** **Feature Merge: Federated Catalogs.** The [`Catalogs Endpoint`](https://github.com/Healy-Hyperspatial/stac-api-extensions-catalogs-endpoint) extension is now in main! This enables a registry of catalogs and supports **poly-hierarchy** (collections belonging to multiple catalogs simultaneously). Enable it via `ENABLE_CATALOGS_EXTENSION`. _Coming next: Support for nested sub-catalogs._ +- **12/09/2025:** Feature Merge: **Federated Catalogs**. The [`Catalogs Endpoint`](https://github.com/Healy-Hyperspatial/stac-api-extensions-catalogs-endpoint) extension is now in main! This enables a registry of catalogs and supports **poly-hierarchy** (collections belonging to multiple catalogs simultaneously). Enable it via `ENABLE_CATALOGS_EXTENSION`. _Coming next: Support for nested sub-catalogs._ - **11/07/2025:** 🌍 The SFEOS STAC Viewer is now available at: https://healy-hyperspatial.github.io/sfeos-web. Use this site to examine your data and test your STAC API! - **10/24/2025:** Added `previous_token` pagination using Redis for efficient navigation. This feature allows users to navigate backwards through large result sets by storing pagination state in Redis. To use this feature, ensure Redis is configured (see [Redis for navigation](#redis-for-navigation)) and set `REDIS_ENABLE=true` in your environment. - **10/23/2025:** The `EXCLUDED_FROM_QUERYABLES` environment variable was added to exclude fields from the `queryables` endpoint. See [docs](#excluding-fields-from-queryables). diff --git a/stac_fastapi/core/stac_fastapi/core/extensions/catalogs.py b/stac_fastapi/core/stac_fastapi/core/extensions/catalogs.py index 0e0c13e6..96491701 100644 --- a/stac_fastapi/core/stac_fastapi/core/extensions/catalogs.py +++ b/stac_fastapi/core/stac_fastapi/core/extensions/catalogs.py @@ -314,71 +314,42 @@ async def delete_catalog( HTTPException: If the catalog is not found. """ try: - # Get the catalog to verify it exists and get its collections - db_catalog = await self.client.database.find_catalog(catalog_id) - catalog = self.client.catalog_serializer.db_to_stac(db_catalog, request) + # Get the catalog to verify it exists + await self.client.database.find_catalog(catalog_id) - # Extract collection IDs from catalog links - collection_ids = [] - if hasattr(catalog, "links") and catalog.links: - for link in catalog.links: - rel = ( - link.get("rel") - if hasattr(link, "get") - else getattr(link, "rel", None) - ) - if rel == "child": - href = ( - link.get("href", "") - if hasattr(link, "get") - else getattr(link, "href", "") - ) - if href and "/collections/" in href: - # Extract collection ID from href - collection_id = href.split("/collections/")[-1].split("/")[ - 0 - ] - if collection_id: - collection_ids.append(collection_id) - - if cascade: - # Delete each collection - for coll_id in collection_ids: - try: - await self.client.database.delete_collection(coll_id) + # Use reverse lookup query to find all collections with this catalog in parent_ids. + # This is more reliable than parsing links, as it captures all collections + # regardless of pagination or link truncation. + query_body = {"query": {"term": {"parent_ids": catalog_id}}} + search_result = await self.client.database.client.search( + index=COLLECTIONS_INDEX, body=query_body, size=10000 + ) + children = [hit["_source"] for hit in search_result["hits"]["hits"]] + + # Process each child collection + for child in children: + child_id = child.get("id") + try: + if cascade: + # DANGER ZONE: User explicitly requested cascade delete. + # Delete the collection entirely, regardless of other parents. + await self.client.database.delete_collection(child_id) logger.info( - f"Deleted collection {coll_id} as part of cascade delete for catalog {catalog_id}" - ) - except Exception as e: - error_msg = str(e) - if "not found" in error_msg.lower(): - logger.debug( - f"Collection {coll_id} not found, skipping (may have been deleted elsewhere)" - ) - else: - logger.warning( - f"Failed to delete collection {coll_id}: {e}" - ) - else: - # Remove catalog from each collection's parent_ids and links (orphan them) - for coll_id in collection_ids: - try: - # Get the collection from database to access parent_ids - collection_db = await self.client.database.find_collection( - coll_id + f"Deleted collection {child_id} as part of cascade delete for catalog {catalog_id}" ) - - # Remove catalog_id from parent_ids - parent_ids = collection_db.get("parent_ids", []) + else: + # SAFE ZONE: Smart Unlink - Remove only this catalog from parent_ids. + # The collection survives and becomes a root-level collection if it has no other parents. + parent_ids = child.get("parent_ids", []) if catalog_id in parent_ids: parent_ids.remove(catalog_id) - collection_db["parent_ids"] = parent_ids + child["parent_ids"] = parent_ids # Also remove the catalog link from the collection's links - if "links" in collection_db: - collection_db["links"] = [ + if "links" in child: + child["links"] = [ link - for link in collection_db.get("links", []) + for link in child.get("links", []) if not ( link.get("rel") == "catalog" and catalog_id in link.get("href", "") @@ -387,30 +358,37 @@ async def delete_catalog( # Update the collection in the database await self.client.database.update_collection( - collection_id=coll_id, - collection=collection_db, - refresh=True, - ) - logger.info( - f"Removed catalog {catalog_id} from collection {coll_id} parent_ids and links" + collection_id=child_id, + collection=child, + refresh=False, ) + + # Log the result + if len(parent_ids) == 0: + logger.info( + f"Collection {child_id} is now a root-level orphan (no parent catalogs)" + ) + else: + logger.info( + f"Removed catalog {catalog_id} from collection {child_id}; still belongs to {len(parent_ids)} other catalog(s)" + ) else: logger.debug( - f"Catalog {catalog_id} not in parent_ids for collection {coll_id}" - ) - except Exception as e: - error_msg = str(e) - if "not found" in error_msg.lower(): - logger.debug( - f"Collection {coll_id} not found, skipping (may have been deleted elsewhere)" - ) - else: - logger.warning( - f"Failed to remove catalog {catalog_id} from collection {coll_id}: {e}" + f"Catalog {catalog_id} not in parent_ids for collection {child_id}" ) + except Exception as e: + error_msg = str(e) + if "not found" in error_msg.lower(): + logger.debug( + f"Collection {child_id} not found, skipping (may have been deleted elsewhere)" + ) + else: + logger.warning( + f"Failed to process collection {child_id} during catalog deletion: {e}" + ) - # Delete the catalog - await self.client.database.delete_catalog(catalog_id) + # Delete the catalog itself + await self.client.database.delete_catalog(catalog_id, refresh=True) logger.info(f"Deleted catalog {catalog_id}") except Exception as e: diff --git a/stac_fastapi/tests/extensions/test_catalogs.py b/stac_fastapi/tests/extensions/test_catalogs.py index 62c46a71..9a85d03a 100644 --- a/stac_fastapi/tests/extensions/test_catalogs.py +++ b/stac_fastapi/tests/extensions/test_catalogs.py @@ -992,6 +992,116 @@ async def test_catalog_links_contain_all_collections( ), f"Collection {collection_id} missing from catalog links. Found links: {child_hrefs}" +@pytest.mark.asyncio +async def test_delete_catalog_no_cascade_orphans_collections( + catalogs_app_client, load_test_data +): + """Test that deleting a catalog without cascade makes collections root-level orphans.""" + # Create a catalog + test_catalog = load_test_data("test_catalog.json") + catalog_id = f"test-catalog-{uuid.uuid4()}" + test_catalog["id"] = catalog_id + test_catalog["links"] = [ + link for link in test_catalog.get("links", []) if link.get("rel") != "child" + ] + + catalog_resp = await catalogs_app_client.post("/catalogs", json=test_catalog) + assert catalog_resp.status_code == 201 + + # Create a collection in the catalog + test_collection = load_test_data("test_collection.json") + collection_id = f"test-collection-{uuid.uuid4()}" + test_collection["id"] = collection_id + + create_resp = await catalogs_app_client.post( + f"/catalogs/{catalog_id}/collections", json=test_collection + ) + assert create_resp.status_code == 201 + + # Delete the catalog without cascade (default behavior) + delete_resp = await catalogs_app_client.delete(f"/catalogs/{catalog_id}") + assert delete_resp.status_code == 204 + + # Verify the collection still exists + get_resp = await catalogs_app_client.get(f"/collections/{collection_id}") + assert get_resp.status_code == 200 + + # Verify the collection is now a root-level orphan (accessible from /collections) + collections_resp = await catalogs_app_client.get("/collections") + assert collections_resp.status_code == 200 + collections_data = collections_resp.json() + collection_ids = [col["id"] for col in collections_data.get("collections", [])] + assert ( + collection_id in collection_ids + ), "Orphaned collection should appear in root /collections endpoint" + + # Verify the collection no longer has a catalog link to the deleted catalog + collection_data = get_resp.json() + collection_links = collection_data.get("links", []) + catalog_link = None + for link in collection_links: + if link.get("rel") == "catalog" and catalog_id in link.get("href", ""): + catalog_link = link + break + + assert ( + catalog_link is None + ), "Orphaned collection should not have link to deleted catalog" + + +@pytest.mark.asyncio +async def test_delete_catalog_no_cascade_multi_parent_collection( + catalogs_app_client, load_test_data +): + """Test that deleting a catalog without cascade preserves collections with other parents.""" + # Create two catalogs + catalog_ids = [] + for i in range(2): + test_catalog = load_test_data("test_catalog.json") + catalog_id = f"test-catalog-{uuid.uuid4()}-{i}" + test_catalog["id"] = catalog_id + + catalog_resp = await catalogs_app_client.post("/catalogs", json=test_catalog) + assert catalog_resp.status_code == 201 + catalog_ids.append(catalog_id) + + # Create a collection in the first catalog + test_collection = load_test_data("test_collection.json") + collection_id = f"test-collection-{uuid.uuid4()}" + test_collection["id"] = collection_id + + create_resp = await catalogs_app_client.post( + f"/catalogs/{catalog_ids[0]}/collections", json=test_collection + ) + assert create_resp.status_code == 201 + + # Add the collection to the second catalog + add_resp = await catalogs_app_client.post( + f"/catalogs/{catalog_ids[1]}/collections", json=test_collection + ) + assert add_resp.status_code == 201 + + # Delete the first catalog without cascade + delete_resp = await catalogs_app_client.delete(f"/catalogs/{catalog_ids[0]}") + assert delete_resp.status_code == 204 + + # Verify the collection still exists + get_resp = await catalogs_app_client.get(f"/collections/{collection_id}") + assert get_resp.status_code == 200 + + # Verify the collection is still accessible from the second catalog + get_from_catalog_resp = await catalogs_app_client.get( + f"/catalogs/{catalog_ids[1]}/collections/{collection_id}" + ) + assert get_from_catalog_resp.status_code == 200 + + # Verify the collection is NOT accessible from the deleted catalog + get_from_deleted_resp = await catalogs_app_client.get( + f"/catalogs/{catalog_ids[0]}/collections/{collection_id}" + ) + assert get_from_deleted_resp.status_code == 404 + + @pytest.mark.asyncio async def test_parent_ids_not_exposed_to_client(catalogs_app_client, load_test_data): """Test that parent_ids field is not exposed in API responses."""