Fix duplicate dependency extraction from PyPI wheel METADATA

Wheel METADATA files can list the same dependency multiple times under
different extras (e.g., bokeh appears under [docs] and [bokeh-tests]).
This caused unique constraint violations when storing dependencies.

Fix by deduplicating extracted dependencies by package name before DB insertion, keeping the first version constraint seen for each package.
This commit is contained in:
Mondo Diaz
2026-02-03 17:43:38 -06:00
parent 7b0d423bee
commit a97d3e630f

View File

@@ -821,9 +821,15 @@ async def pypi_download_file(
) )
db.add(cached_url_record) db.add(cached_url_record)
# Store extracted dependencies # Store extracted dependencies (deduplicate first - METADATA can list same dep under multiple extras)
if extracted_deps: if extracted_deps:
# Deduplicate: keep first version constraint seen for each package name
seen_deps: dict[str, str] = {}
for dep_name, dep_version in extracted_deps: for dep_name, dep_version in extracted_deps:
if dep_name not in seen_deps:
seen_deps[dep_name] = dep_version if dep_version else "*"
for dep_name, dep_version in seen_deps.items():
# Check if this dependency already exists for this artifact # Check if this dependency already exists for this artifact
existing_dep = db.query(ArtifactDependency).filter( existing_dep = db.query(ArtifactDependency).filter(
ArtifactDependency.artifact_id == sha256, ArtifactDependency.artifact_id == sha256,
@@ -836,7 +842,7 @@ async def pypi_download_file(
artifact_id=sha256, artifact_id=sha256,
dependency_project="_pypi", dependency_project="_pypi",
dependency_package=dep_name, dependency_package=dep_name,
version_constraint=dep_version if dep_version else "*", version_constraint=dep_version,
) )
db.add(dep) db.add(dep)