Skip to content
Merged
12 changes: 12 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,18 @@ Release notes
prevent the creation of duplicated "resolved" dependencies.
https://github.com/aboutcode-org/dejacode/issues/297

- Display the filename/download_url in the Inventory tab.
https://github.com/aboutcode-org/dejacode/issues/303

- Improve exception support in improve_packages_from_purldb task.
In case of an exception, the error is properly logged on the Import instance.
https://github.com/aboutcode-org/dejacode/issues/303

- Refine the ``update_from_purldb`` function to avoid any IntegrityError.
Also, when multiple entries are returned from the PurlDB, only the common values are
merged and kept for the data update.
https://github.com/aboutcode-org/dejacode/issues/303

### Version 5.2.1

- Fix the models documentation navigation.
Expand Down
59 changes: 54 additions & 5 deletions component_catalog/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
from dje.models import ReferenceNotesMixin
from dje.tasks import logger as tasks_logger
from dje.utils import is_purl_str
from dje.utils import merge_common_non_empty_values
from dje.utils import set_fields_from_object
from dje.validators import generic_uri_validator
from dje.validators import validate_url_segment
Expand Down Expand Up @@ -2454,6 +2455,7 @@ def get_purldb_entries(self, user, max_request_call=0, timeout=10):
if nothing was found.
"""
payloads = []
purldb_entries = []

package_url = self.package_url
if package_url:
Expand All @@ -2468,30 +2470,77 @@ def get_purldb_entries(self, user, max_request_call=0, timeout=10):
if max_request_call and index >= max_request_call:
return

if packages_data := purldb.find_packages(payload, timeout):
return packages_data
if purldb_entries := purldb.find_packages(payload, timeout):
break

# Cleanup the PurlDB entries:
# - Packages with different PURL are excluded.
if package_url:
purldb_entries = [entry for entry in purldb_entries if entry.get("purl") == package_url]

return purldb_entries

def update_from_purldb(self, user):
    """
    Update this Package instance with data from the PurlDB.

    - Retrieves matching entries from the PurlDB using the given ``user``.
    - If exactly one match is found, its data is used directly.
    - If multiple entries are found, only values that are non-empty and
      common across all entries are merged and used to update the Package.

    The data is applied through ``update_from_data`` with ``override=False``
    and ``override_unknown=True``.

    Return the value of ``update_from_data`` (the updated field names), or
    ``None`` when no PurlDB entries are available.
    """
    purldb_entries = self.get_purldb_entries(user)
    if not purldb_entries:
        return None

    if len(purldb_entries) == 1:
        # NOTE: the single entry is used as-is and adjusted in place below.
        package_data = purldb_entries[0]
    else:
        package_data = merge_common_non_empty_values(purldb_entries)

    # The format from PURLDB is "2019-11-18T00:00:00Z"
    if release_date := package_data.get("release_date"):
        package_data["release_date"] = release_date.split("T")[0]
    package_data["license_expression"] = package_data.get("declared_license_expression")

    # Avoid raising an IntegrityError when the values in `package_data` for the
    # identifier fields already exist on another Package instance.
    #
    # This situation can occur when a complete package (with both `purl` and
    # `download_url`) already exists in the Dataspace, and `update_from_purldb` is
    # called on a different package that has the same `purl` but no `download_url`.
    #
    # If we try to assign the same `download_url` to the second package, it would
    # violate the unique constraints defined in the Package model (since the
    # combination of fields must be unique).
    #
    # The identifier fields are materialized once: they are needed both for the
    # lookup and for the cleanup below.
    identifier_fields = list(self.get_identifier_fields())
    unique_filters_lookups = {
        field_name: package_data.get(field_name, "") for field_name in identifier_fields
    }
    unique_filters_qs = (
        Package.objects.scope(self.dataspace)
        .filter(**unique_filters_lookups)
        .exclude(pk=self.pk)
    )
    if unique_filters_qs.exists():
        # Remove the problematic "identifier_fields" values and the checksum values
        hash_field_names = [field.name for field in HashFieldsMixin._meta.fields]
        for field_name in [*hash_field_names, *identifier_fields]:
            package_data.pop(field_name, None)

    return self.update_from_data(
        user,
        package_data,
        override=False,
        override_unknown=True,
    )

def update_from_scan(self, user):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<div title="{{ package.download_url }}"{% if not package.filename %} class="text-truncate"{% endif %}>
{% if package.download_url %}
<a href="{{ package.download_url }}">
{% if display_icons %}
<i class="fa-solid fa-download me-1"></i>
{% endif %}
{% if package.filename %}
{{ package.filename }}
{% else %}
{{ package.download_url|truncatechars:40 }}
{% endif %}
</a>
{% elif package.filename %}
{% if display_icons %}
<i class="fa-solid fa-file me-1"></i>
{% endif %}
{{ package.filename }}
{% endif %}
</div>
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,8 @@
<td>
{{ object.primary_language }}
</td>
<td title="{{ object.download_url }}"{% if not object.filename %} class="text-truncate"{% endif %}>
{% if object.download_url %}
<a href="{{ object.download_url }}">
{% if object.filename %}{{ object.filename }}{% else %}{{ object.download_url }}{% endif %}
</a>
{% endif %}
<td>
{% include 'component_catalog/includes/package_filename_as_link.html' with package=object %}
</td>
<td>
{% with components=object.component_set.all %}
Expand Down
87 changes: 85 additions & 2 deletions component_catalog/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2556,6 +2556,27 @@ def test_package_model_inferred_url_property(self):
expected = "https://github.com/package-url/packageurl-python/tree/v0.10.4"
self.assertEqual(expected, package1.inferred_url)

@mock.patch("dejacode_toolkit.purldb.PurlDB.find_packages")
def test_package_model_get_purldb_entries(self, mock_find_packages):
    """PurlDB entries whose PURL differs from the Package PURL are excluded."""
    purl = "pkg:pypi/django@3.0"
    package1 = make_package(self.dataspace, package_url=purl)
    purldb_entry1 = {
        "purl": purl,
        "type": "pypi",
        "name": "django",
        "version": "3.0",
    }
    purldb_entry2 = {
        "purl": "pkg:pypi/django",
        "type": "pypi",
        "name": "django",
    }

    mock_find_packages.return_value = [purldb_entry1, purldb_entry2]
    purldb_entries = package1.get_purldb_entries(user=self.user)
    # The purldb_entry2 is excluded as the PURL differs
    self.assertEqual([purldb_entry1], purldb_entries)

@mock.patch("component_catalog.models.Package.get_purldb_entries")
def test_package_model_update_from_purldb(self, mock_get_purldb_entries):
purldb_entry = {
Expand All @@ -2577,9 +2598,9 @@ def test_package_model_update_from_purldb(self, mock_get_purldb_entries):
}

mock_get_purldb_entries.return_value = [purldb_entry]
package1 = Package.objects.create(
package1 = make_package(
self.dataspace,
filename="package",
dataspace=self.dataspace,
# "unknown" values are overrided
declared_license_expression="unknown",
)
Expand Down Expand Up @@ -2607,6 +2628,68 @@ def test_package_model_update_from_purldb(self, mock_get_purldb_entries):
for field_name in updated_fields:
self.assertEqual(purldb_entry[field_name], getattr(package1, field_name))

@mock.patch("component_catalog.models.Package.get_purldb_entries")
def test_package_model_update_from_purldb_multiple_entries(self, mock_get_purldb_entries):
    """With several PurlDB entries, the merged values are applied to the Package."""
    purldb_entry1 = {
        "uuid": "326aa7a8-4f28-406d-89f9-c1404916925b",
        "purl": "pkg:pypi/django@3.0",
        "type": "pypi",
        "name": "django",
        "version": "3.0",
        "keywords": ["Keyword1", "Keyword2"],
        "filename": "Django-3.0.tar.gz",
        "download_url": "https://files.pythonhosted.org/packages/38/Django-3.0.tar.gz",
    }
    purldb_entry2 = {
        "uuid": "e133e70b-8dd3-4cf1-9711-72b1f57523a0",
        "purl": "pkg:pypi/django@3.0",
        "type": "pypi",
        "name": "django",
        "version": "3.0",
        "primary_language": "Python",
        "keywords": ["Keyword1", "Keyword2"],
        "download_url": "https://another.url/Django-3.0.tar.gz",
    }

    mock_get_purldb_entries.return_value = [purldb_entry1, purldb_entry2]
    package1 = make_package(self.dataspace, package_url="pkg:pypi/django@3.0")
    updated_fields = package1.update_from_purldb(self.user)
    # The two entries disagree on `download_url`, which is not part of the
    # expected updated fields.
    expected = ["filename", "keywords", "primary_language"]
    self.assertEqual(expected, sorted(updated_fields))
    self.assertEqual("Django-3.0.tar.gz", package1.filename)
    self.assertEqual(["Keyword1", "Keyword2"], package1.keywords)
    self.assertEqual("Python", package1.primary_language)

@mock.patch("component_catalog.models.Package.get_purldb_entries")
def test_package_model_update_from_purldb_duplicate_exception(self, mock_get_purldb_entries):
    """Identifier and checksum values are skipped when they would create a duplicate."""
    package_url = "pkg:pypi/django@3.0"
    download_url = "https://files.pythonhosted.org/packages/38/Django-3.0.tar.gz"
    purldb_entry = {
        "purl": package_url,
        "type": "pypi",
        "name": "django",
        "version": "3.0",
        "download_url": download_url,
        "description": "This value will be updated",
        "md5": "This value is skipped",
        "sha1": "This value is skipped",
    }
    mock_get_purldb_entries.return_value = [purldb_entry]

    # 2 packages with the same "pkg:pypi/django@3.0" PURL:
    # - 1 with a `download_url` value
    # - 1 without a `download_url` value
    make_package(self.dataspace, package_url=package_url, download_url=download_url)
    package_no_download_url = make_package(self.dataspace, package_url=package_url)

    # Updating the package with the `download_url` from the purldb_entry data
    # would violate the unique constraint.
    # This is handled properly by update_from_purldb.
    updated_fields = package_no_download_url.update_from_purldb(self.user)
    self.assertEqual(["description"], updated_fields)
    package_no_download_url.refresh_from_db()
    self.assertEqual(purldb_entry["description"], package_no_download_url.description)

def test_package_model_vulnerability_queryset_mixin(self):
package1 = make_package(self.dataspace, is_vulnerable=True)
package2 = make_package(self.dataspace)
Expand Down
8 changes: 4 additions & 4 deletions component_catalog/tests/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,20 +1194,20 @@ def test_package_list_view_download_column(self):
response = self.client.get(reverse("component_catalog:package_list"))

expected = f"""
<td title="{self.package1.download_url}">
<div title="{self.package1.download_url}">
<a href="{self.package1.download_url}">
{self.package1.filename}
</a>
</td>
</div>
"""
self.assertContains(response, expected, html=True)

expected = f"""
<td title="{self.package2.download_url}" class="text-truncate">
<div title="{self.package2.download_url}" class="text-truncate">
<a href="{self.package2.download_url}">
{self.package2.download_url}
</a>
</td>
</div>
"""
self.assertContains(response, expected, html=True)

Expand Down
7 changes: 5 additions & 2 deletions component_catalog/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -2435,9 +2435,12 @@ def get_tab_fields(self):
}
tab_fields.append(("", alert_context, None, "includes/field_alert.html"))

if len(purldb_entries) > 1:
len_purldb_entries = len(purldb_entries)
if len_purldb_entries > 1:
alert_context = {
"message": "There are multiple entries in the PurlDB for this Package.",
"message": (
f"There are {len_purldb_entries} entries in the PurlDB for this Package."
),
"full_width": True,
"alert_class": "alert-warning",
}
Expand Down
2 changes: 1 addition & 1 deletion dejacode_toolkit/purldb.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def get_package(self, uuid):

def get_package_by_purl(self, package_url):
    """
    Get a Package details entry providing its `package_url`.

    Return the first entry returned by ``find_packages`` for this PURL, or
    ``None`` when no entry is found.
    """
    # The lookup payload is passed by keyword to match the `find_packages` signature.
    if results := self.find_packages(payload={"purl": package_url}):
        return results[0]
    return None

def find_packages(self, payload, timeout=None):
Expand Down
Loading