perf: use batch dependency storage in pypi_proxy

This commit is contained in:
Mondo Diaz
2026-02-04 09:52:16 -06:00
parent 8fdb73901e
commit 7ad5a15ef4

View File

@@ -23,7 +23,7 @@ from fastapi.responses import StreamingResponse, HTMLResponse, RedirectResponse
 from sqlalchemy.orm import Session
 from .database import get_db
-from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, PackageVersion, ArtifactDependency
+from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, PackageVersion
 from .storage import S3Storage, get_storage
 from .config import get_env_upstream_sources, get_settings
 from .http_client import HttpClientManager
@@ -895,7 +895,7 @@ async def pypi_download_file(
 )
 db.add(cached_url_record)
-# Store extracted dependencies (deduplicate first - METADATA can list same dep under multiple extras)
+# Store extracted dependencies using batch operation
 if extracted_deps:
 # Deduplicate: keep first version constraint seen for each package name
 seen_deps: dict[str, str] = {}
@@ -903,22 +903,17 @@ async def pypi_download_file(
 if dep_name not in seen_deps:
 seen_deps[dep_name] = dep_version if dep_version else "*"
-for dep_name, dep_version in seen_deps.items():
-# Check if this dependency already exists for this artifact
-existing_dep = db.query(ArtifactDependency).filter(
-ArtifactDependency.artifact_id == sha256,
-ArtifactDependency.dependency_project == "_pypi",
-ArtifactDependency.dependency_package == dep_name,
-).first()
-if not existing_dep:
-dep = ArtifactDependency(
-artifact_id=sha256,
-dependency_project="_pypi",
-dependency_package=dep_name,
-version_constraint=dep_version,
-)
-db.add(dep)
+# Convert to list of tuples for batch insert
+deps_to_store = [
+("_pypi", dep_name, dep_version)
+for dep_name, dep_version in seen_deps.items()
+]
+# Batch upsert - handles duplicates with ON CONFLICT DO NOTHING
+repo = ArtifactRepository(db)
+inserted = repo.batch_upsert_dependencies(sha256, deps_to_store)
+if inserted > 0:
+logger.debug(f"Stored {inserted} dependencies for {sha256[:12]}...")
 db.commit()