#!/usr/bin/env python3
"""
Backfill script to extract dependencies from cached PyPI packages.

This script scans all artifacts in the _pypi project and extracts
Requires-Dist metadata from wheel and sdist files that don't already
have dependencies recorded.

Usage:
    # From within the container:
    python -m scripts.backfill_pypi_dependencies

    # Or with docker exec:
    docker exec orchard_orchard-server_1 python -m scripts.backfill_pypi_dependencies

    # Dry run (preview only):
    docker exec orchard_orchard-server_1 python -m scripts.backfill_pypi_dependencies --dry-run
"""

import argparse
import logging
import re
import sys
import tarfile
import zipfile
from io import BytesIO
from typing import List, Optional, Tuple

# Add parent directory to path for imports
sys.path.insert(0, "/app")

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from backend.app.config import get_settings
from backend.app.models import (
    Artifact,
    ArtifactDependency,
    Package,
    Project,
    Tag,
)
from backend.app.storage import get_storage

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


def parse_requires_dist(requires_dist: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a Requires-Dist line into (package_name, version_constraint)."""
    # Remove any environment markers (after semicolon)
    if ";" in requires_dist:
        requires_dist = requires_dist.split(";")[0].strip()

    # Match patterns like "package (>=1.0)", "package>=1.0", or "package".
    # An optional [extras] group is consumed (but not captured) so that a
    # version constraint following it, e.g. "package[extra]>=1.0", is still
    # picked up.
    match = re.match(
        r"^([a-zA-Z0-9][-a-zA-Z0-9._]*)\s*(?:\[[^\]]*\])?\s*"
        r"(?:\(([^)]+)\)|([<>=!~][^\s;]+))?",
        requires_dist.strip(),
    )
    if not match:
        return None, None

    package_name = match.group(1)
    version_constraint = match.group(2) or match.group(3)

    # Normalize package name (PEP 503)
    normalized_name = re.sub(r"[-_.]+", "-", package_name).lower()

    if version_constraint:
        version_constraint = version_constraint.strip()

    return normalized_name, version_constraint


def extract_requires_from_metadata(metadata_content: str) -> List[Tuple[str, Optional[str]]]:
    """Extract all Requires-Dist entries from METADATA/PKG-INFO content."""
    dependencies = []
    for line in metadata_content.split("\n"):
        if line.startswith("Requires-Dist:"):
            value = line[len("Requires-Dist:"):].strip()
            pkg_name, version = parse_requires_dist(value)
            if pkg_name:
                dependencies.append((pkg_name, version))
    return dependencies


def extract_metadata_from_wheel(content: bytes) -> Optional[str]:
    """Extract METADATA file content from a wheel (zip) file."""
    try:
        with zipfile.ZipFile(BytesIO(content)) as zf:
            for name in zf.namelist():
                if name.endswith(".dist-info/METADATA"):
                    return zf.read(name).decode("utf-8", errors="replace")
    except Exception as e:
        logger.warning(f"Failed to extract metadata from wheel: {e}")
    return None


def extract_metadata_from_sdist(content: bytes) -> Optional[str]:
    """Extract PKG-INFO file content from a source distribution (.tar.gz)."""
    try:
        with tarfile.open(fileobj=BytesIO(content), mode="r:gz") as tf:
            for member in tf.getmembers():
                # Only the top-level <pkg>-<version>/PKG-INFO, not nested
                # copies inside *.egg-info directories.
                if member.name.endswith("/PKG-INFO") and member.name.count("/") == 1:
                    f = tf.extractfile(member)
                    if f:
                        return f.read().decode("utf-8", errors="replace")
    except Exception as e:
        logger.warning(f"Failed to extract metadata from sdist: {e}")
    return None
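
# Illustrative example of how the helpers above compose. The sample metadata
# below is hypothetical, not taken from any real package. Note that anything
# after ";" is dropped by parse_requires_dist, so environment-conditional
# requirements are recorded without their conditions:
#
#     sample = (
#         "Metadata-Version: 2.1\n"
#         "Name: example\n"
#         "Requires-Dist: requests[socks] (>=2.25)\n"
#         "Requires-Dist: tomli ; python_version < '3.11'\n"
#     )
#     extract_requires_from_metadata(sample)
#     # -> [("requests", ">=2.25"), ("tomli", None)]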
filename.endswith(".tar.gz"): metadata = extract_metadata_from_sdist(content) if metadata: return extract_requires_from_metadata(metadata) return [] def backfill_dependencies(dry_run: bool = False): """Main backfill function.""" settings = get_settings() # Create database connection engine = create_engine(settings.database_url) Session = sessionmaker(bind=engine) db = Session() # Create storage client storage = get_storage() try: # Find the _pypi project pypi_project = db.query(Project).filter(Project.name == "_pypi").first() if not pypi_project: logger.info("No _pypi project found. Nothing to backfill.") return # Get all packages in _pypi packages = db.query(Package).filter(Package.project_id == pypi_project.id).all() logger.info(f"Found {len(packages)} packages in _pypi project") total_artifacts = 0 artifacts_with_deps = 0 artifacts_processed = 0 dependencies_added = 0 for package in packages: # Get all tags (each tag points to an artifact) tags = db.query(Tag).filter(Tag.package_id == package.id).all() for tag in tags: total_artifacts += 1 filename = tag.name # Skip non-package files (like .metadata files) if not (filename.endswith(".whl") or filename.endswith(".tar.gz")): continue # Check if this artifact already has dependencies existing_deps = db.query(ArtifactDependency).filter( ArtifactDependency.artifact_id == tag.artifact_id ).count() if existing_deps > 0: artifacts_with_deps += 1 continue # Get the artifact artifact = db.query(Artifact).filter(Artifact.id == tag.artifact_id).first() if not artifact: logger.warning(f"Artifact {tag.artifact_id} not found for tag {filename}") continue logger.info(f"Processing {package.name}/{filename}...") if dry_run: logger.info(f" [DRY RUN] Would extract dependencies from {filename}") artifacts_processed += 1 continue # Download the artifact from S3 try: content = storage.get(artifact.s3_key) except Exception as e: logger.error(f" Failed to download {filename}: {e}") continue # Extract dependencies deps = extract_dependencies(content, filename) if deps: logger.info(f" Found {len(deps)} dependencies") for dep_name, dep_version in deps: # Check if already exists (race condition protection) existing = db.query(ArtifactDependency).filter( ArtifactDependency.artifact_id == tag.artifact_id, ArtifactDependency.dependency_project == "_pypi", ArtifactDependency.dependency_package == dep_name, ).first() if not existing: dep = ArtifactDependency( artifact_id=tag.artifact_id, dependency_project="_pypi", dependency_package=dep_name, version_constraint=dep_version if dep_version else "*", ) db.add(dep) dependencies_added += 1 logger.info(f" + {dep_name} {dep_version or '*'}") db.commit() else: logger.info(f" No dependencies found") artifacts_processed += 1 logger.info("") logger.info("=" * 50) logger.info("Backfill complete!") logger.info(f" Total artifacts: {total_artifacts}") logger.info(f" Already had deps: {artifacts_with_deps}") logger.info(f" Processed: {artifacts_processed}") logger.info(f" Dependencies added: {dependencies_added}") if dry_run: logger.info(" (DRY RUN - no changes made)") finally: db.close() def main(): parser = argparse.ArgumentParser( description="Backfill dependencies for cached PyPI packages" ) parser.add_argument( "--dry-run", action="store_true", help="Preview what would be done without making changes", ) args = parser.parse_args() backfill_dependencies(dry_run=args.dry_run) if __name__ == "__main__": main()