Add backfill script for PyPI package dependencies

Script extracts Requires-Dist metadata from cached PyPI packages
and stores them in artifact_dependencies table.

Usage:
  docker exec <container> python -m backend.scripts.backfill_pypi_dependencies
  docker exec <container> python -m backend.scripts.backfill_pypi_dependencies --dry-run
This commit is contained in:
Mondo Diaz
2026-01-30 15:38:45 -06:00
parent 4b76ca2046
commit ead016208d
2 changed files with 263 additions and 0 deletions

View File

@@ -0,0 +1 @@
# Scripts package

View File

@@ -0,0 +1,262 @@
#!/usr/bin/env python3
"""
Backfill script to extract dependencies from cached PyPI packages.
This script scans all artifacts in the _pypi project and extracts
Requires-Dist metadata from wheel and sdist files that don't already
have dependencies recorded.
Usage:
# From within the container:
python -m scripts.backfill_pypi_dependencies
# Or with docker exec:
docker exec orchard_orchard-server_1 python -m scripts.backfill_pypi_dependencies
# Dry run (preview only):
docker exec orchard_orchard-server_1 python -m scripts.backfill_pypi_dependencies --dry-run
"""
import argparse
import logging
import re
import sys
import tarfile
import zipfile
from io import BytesIO
from typing import List, Optional, Tuple
# Add parent directory to path for imports
sys.path.insert(0, "/app")
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from backend.app.config import get_settings
from backend.app.models import (
Artifact,
ArtifactDependency,
Package,
Project,
Tag,
)
from backend.app.storage import get_storage
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
def parse_requires_dist(requires_dist: str) -> Tuple[Optional[str], Optional[str]]:
"""Parse a Requires-Dist line into (package_name, version_constraint)."""
# Remove any environment markers (after semicolon)
if ";" in requires_dist:
requires_dist = requires_dist.split(";")[0].strip()
# Match patterns like "package (>=1.0)" or "package>=1.0" or "package"
match = re.match(
r"^([a-zA-Z0-9][-a-zA-Z0-9._]*)\s*(?:\(([^)]+)\)|([<>=!~][^\s;]+))?",
requires_dist.strip(),
)
if not match:
return None, None
package_name = match.group(1)
version_constraint = match.group(2) or match.group(3)
# Normalize package name (PEP 503)
normalized_name = re.sub(r"[-_.]+", "-", package_name).lower()
if version_constraint:
version_constraint = version_constraint.strip()
return normalized_name, version_constraint
def extract_requires_from_metadata(metadata_content: str) -> List[Tuple[str, Optional[str]]]:
"""Extract all Requires-Dist entries from METADATA/PKG-INFO content."""
dependencies = []
for line in metadata_content.split("\n"):
if line.startswith("Requires-Dist:"):
value = line[len("Requires-Dist:"):].strip()
pkg_name, version = parse_requires_dist(value)
if pkg_name:
dependencies.append((pkg_name, version))
return dependencies
def extract_metadata_from_wheel(content: bytes) -> Optional[str]:
"""Extract METADATA file content from a wheel (zip) file."""
try:
with zipfile.ZipFile(BytesIO(content)) as zf:
for name in zf.namelist():
if name.endswith(".dist-info/METADATA"):
return zf.read(name).decode("utf-8", errors="replace")
except Exception as e:
logger.warning(f"Failed to extract metadata from wheel: {e}")
return None
def extract_metadata_from_sdist(content: bytes) -> Optional[str]:
"""Extract PKG-INFO file content from a source distribution (.tar.gz)."""
try:
with tarfile.open(fileobj=BytesIO(content), mode="r:gz") as tf:
for member in tf.getmembers():
if member.name.endswith("/PKG-INFO") and member.name.count("/") == 1:
f = tf.extractfile(member)
if f:
return f.read().decode("utf-8", errors="replace")
except Exception as e:
logger.warning(f"Failed to extract metadata from sdist: {e}")
return None
def extract_dependencies(content: bytes, filename: str) -> List[Tuple[str, Optional[str]]]:
"""Extract dependencies from a PyPI package file."""
metadata = None
if filename.endswith(".whl"):
metadata = extract_metadata_from_wheel(content)
elif filename.endswith(".tar.gz"):
metadata = extract_metadata_from_sdist(content)
if metadata:
return extract_requires_from_metadata(metadata)
return []
def backfill_dependencies(dry_run: bool = False):
"""Main backfill function."""
settings = get_settings()
# Create database connection
engine = create_engine(settings.database_url)
Session = sessionmaker(bind=engine)
db = Session()
# Create storage client
storage = get_storage()
try:
# Find the _pypi project
pypi_project = db.query(Project).filter(Project.name == "_pypi").first()
if not pypi_project:
logger.info("No _pypi project found. Nothing to backfill.")
return
# Get all packages in _pypi
packages = db.query(Package).filter(Package.project_id == pypi_project.id).all()
logger.info(f"Found {len(packages)} packages in _pypi project")
total_artifacts = 0
artifacts_with_deps = 0
artifacts_processed = 0
dependencies_added = 0
for package in packages:
# Get all tags (each tag points to an artifact)
tags = db.query(Tag).filter(Tag.package_id == package.id).all()
for tag in tags:
total_artifacts += 1
filename = tag.name
# Skip non-package files (like .metadata files)
if not (filename.endswith(".whl") or filename.endswith(".tar.gz")):
continue
# Check if this artifact already has dependencies
existing_deps = db.query(ArtifactDependency).filter(
ArtifactDependency.artifact_id == tag.artifact_id
).count()
if existing_deps > 0:
artifacts_with_deps += 1
continue
# Get the artifact
artifact = db.query(Artifact).filter(Artifact.id == tag.artifact_id).first()
if not artifact:
logger.warning(f"Artifact {tag.artifact_id} not found for tag {filename}")
continue
logger.info(f"Processing {package.name}/{filename}...")
if dry_run:
logger.info(f" [DRY RUN] Would extract dependencies from {filename}")
artifacts_processed += 1
continue
# Download the artifact from S3
try:
content = storage.get(artifact.s3_key)
except Exception as e:
logger.error(f" Failed to download {filename}: {e}")
continue
# Extract dependencies
deps = extract_dependencies(content, filename)
if deps:
logger.info(f" Found {len(deps)} dependencies")
for dep_name, dep_version in deps:
# Check if already exists (race condition protection)
existing = db.query(ArtifactDependency).filter(
ArtifactDependency.artifact_id == tag.artifact_id,
ArtifactDependency.dependency_project == "_pypi",
ArtifactDependency.dependency_package == dep_name,
).first()
if not existing:
dep = ArtifactDependency(
artifact_id=tag.artifact_id,
dependency_project="_pypi",
dependency_package=dep_name,
version_constraint=dep_version if dep_version else "*",
)
db.add(dep)
dependencies_added += 1
logger.info(f" + {dep_name} {dep_version or '*'}")
db.commit()
else:
logger.info(f" No dependencies found")
artifacts_processed += 1
logger.info("")
logger.info("=" * 50)
logger.info("Backfill complete!")
logger.info(f" Total artifacts: {total_artifacts}")
logger.info(f" Already had deps: {artifacts_with_deps}")
logger.info(f" Processed: {artifacts_processed}")
logger.info(f" Dependencies added: {dependencies_added}")
if dry_run:
logger.info(" (DRY RUN - no changes made)")
finally:
db.close()
def main():
parser = argparse.ArgumentParser(
description="Backfill dependencies for cached PyPI packages"
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Preview what would be done without making changes",
)
args = parser.parse_args()
backfill_dependencies(dry_run=args.dry_run)
if __name__ == "__main__":
main()