diff --git a/backend/scripts/__init__.py b/backend/scripts/__init__.py
new file mode 100644
index 0000000..7994ee4
--- /dev/null
+++ b/backend/scripts/__init__.py
@@ -0,0 +1 @@
+# Scripts package
diff --git a/backend/scripts/backfill_pypi_dependencies.py b/backend/scripts/backfill_pypi_dependencies.py
new file mode 100644
index 0000000..48194b7
--- /dev/null
+++ b/backend/scripts/backfill_pypi_dependencies.py
@@ -0,0 +1,262 @@
+#!/usr/bin/env python3
+"""
+Backfill script to extract dependencies from cached PyPI packages.
+
+This script scans all artifacts in the _pypi project and extracts
+Requires-Dist metadata from wheel and sdist files that don't already
+have dependencies recorded.
+
+Usage:
+    # From within the container:
+    python -m scripts.backfill_pypi_dependencies
+
+    # Or with docker exec:
+    docker exec orchard_orchard-server_1 python -m scripts.backfill_pypi_dependencies
+
+    # Dry run (preview only):
+    docker exec orchard_orchard-server_1 python -m scripts.backfill_pypi_dependencies --dry-run
+"""
+
+import argparse
+import logging
+import re
+import sys
+import tarfile
+import zipfile
+from io import BytesIO
+from typing import List, Optional, Tuple
+
+# Make the app root importable when run inside the container
+sys.path.insert(0, "/app")
+
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+from backend.app.config import get_settings
+from backend.app.models import (
+    Artifact,
+    ArtifactDependency,
+    Package,
+    Project,
+    Tag,
+)
+from backend.app.storage import get_storage
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+
+def parse_requires_dist(requires_dist: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a Requires-Dist line into (package_name, version_constraint)."""
+    # Drop any environment marker (everything after the first semicolon)
+    if ";" in requires_dist:
+        requires_dist = requires_dist.split(";")[0].strip()
+
+    # Match "package (>=1.0)", "package>=1.0", "package[extra] (>=1.0)",
+    # or a bare "package"; extras are ignored for dependency tracking.
+    match = re.match(
+        r"^([a-zA-Z0-9][-a-zA-Z0-9._]*)\s*(?:\[[^\]]*\])?\s*(?:\(([^)]+)\)|([<>=!~][^\s;]+))?",
+        requires_dist.strip(),
+    )
+
+    if not match:
+        return None, None
+
+    package_name = match.group(1)
+    version_constraint = match.group(2) or match.group(3)
+
+    # Normalize the package name (PEP 503)
+    normalized_name = re.sub(r"[-_.]+", "-", package_name).lower()
+
+    if version_constraint:
+        version_constraint = version_constraint.strip()
+
+    return normalized_name, version_constraint
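+
+
+# Illustrative examples of what parse_requires_dist returns. The inputs are
+# hypothetical Requires-Dist values, not data taken from this repository:
+#
+#   "requests (>=2.25)"                 -> ("requests", ">=2.25")
+#   "charset_normalizer<4,>=2"          -> ("charset-normalizer", "<4,>=2")
+#   "colorama; sys_platform == 'win32'" -> ("colorama", None)
+#   "requests[security] (>=2.0)"        -> ("requests", ">=2.0")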
+
+
+def extract_requires_from_metadata(metadata_content: str) -> List[Tuple[str, Optional[str]]]:
+    """Extract all Requires-Dist entries from METADATA/PKG-INFO content."""
+    dependencies = []
+
+    for line in metadata_content.splitlines():
+        if line.startswith("Requires-Dist:"):
+            value = line[len("Requires-Dist:"):].strip()
+            pkg_name, version = parse_requires_dist(value)
+            if pkg_name:
+                dependencies.append((pkg_name, version))
+
+    return dependencies
+
+
+def extract_metadata_from_wheel(content: bytes) -> Optional[str]:
+    """Extract the METADATA file content from a wheel (zip) file."""
+    try:
+        with zipfile.ZipFile(BytesIO(content)) as zf:
+            for name in zf.namelist():
+                if name.endswith(".dist-info/METADATA"):
+                    return zf.read(name).decode("utf-8", errors="replace")
+    except Exception as e:
+        logger.warning(f"Failed to extract metadata from wheel: {e}")
+    return None
+
+
+def extract_metadata_from_sdist(content: bytes) -> Optional[str]:
+    """Extract the PKG-INFO file content from a source distribution (.tar.gz)."""
+    try:
+        with tarfile.open(fileobj=BytesIO(content), mode="r:gz") as tf:
+            for member in tf.getmembers():
+                # Only the top-level <name>-<version>/PKG-INFO, not copies
+                # nested deeper in the archive
+                if member.name.endswith("/PKG-INFO") and member.name.count("/") == 1:
+                    f = tf.extractfile(member)
+                    if f:
+                        return f.read().decode("utf-8", errors="replace")
+    except Exception as e:
+        logger.warning(f"Failed to extract metadata from sdist: {e}")
+    return None
+
+
+def extract_dependencies(content: bytes, filename: str) -> List[Tuple[str, Optional[str]]]:
+    """Extract dependencies from a PyPI package file."""
+    metadata = None
+
+    if filename.endswith(".whl"):
+        metadata = extract_metadata_from_wheel(content)
+    elif filename.endswith(".tar.gz"):
+        metadata = extract_metadata_from_sdist(content)
+
+    if metadata:
+        return extract_requires_from_metadata(metadata)
+
+    return []
+
+
+def backfill_dependencies(dry_run: bool = False):
+    """Main backfill function."""
+    settings = get_settings()
+
+    # Create the database connection
+    engine = create_engine(settings.database_url)
+    Session = sessionmaker(bind=engine)
+    db = Session()
+
+    # Create the storage client
+    storage = get_storage()
+
+    try:
+        # Find the _pypi project
+        pypi_project = db.query(Project).filter(Project.name == "_pypi").first()
+        if not pypi_project:
+            logger.info("No _pypi project found. Nothing to backfill.")
+            return
+
+        # Get all packages in _pypi
+        packages = db.query(Package).filter(Package.project_id == pypi_project.id).all()
+        logger.info(f"Found {len(packages)} packages in _pypi project")
+
+        total_artifacts = 0
+        artifacts_with_deps = 0
+        artifacts_processed = 0
+        dependencies_added = 0
+
+        for package in packages:
+            # Get all tags (each tag points to an artifact)
+            tags = db.query(Tag).filter(Tag.package_id == package.id).all()
+
+            for tag in tags:
+                total_artifacts += 1
+                filename = tag.name
+
+                # Skip non-package files (such as .metadata files)
+                if not (filename.endswith(".whl") or filename.endswith(".tar.gz")):
+                    continue
+
+                # Skip artifacts that already have dependencies recorded
+                existing_deps = db.query(ArtifactDependency).filter(
+                    ArtifactDependency.artifact_id == tag.artifact_id
+                ).count()
+
+                if existing_deps > 0:
+                    artifacts_with_deps += 1
+                    continue
+
+                # Get the artifact
+                artifact = db.query(Artifact).filter(Artifact.id == tag.artifact_id).first()
+                if not artifact:
+                    logger.warning(f"Artifact {tag.artifact_id} not found for tag {filename}")
+                    continue
+
+                logger.info(f"Processing {package.name}/{filename}...")
+
+                if dry_run:
+                    logger.info(f"  [DRY RUN] Would extract dependencies from {filename}")
+                    artifacts_processed += 1
+                    continue
+
+                # Download the artifact from S3
+                try:
+                    content = storage.get(artifact.s3_key)
+                except Exception as e:
+                    logger.error(f"  Failed to download {filename}: {e}")
+                    continue
+
+                # Extract dependencies
+                deps = extract_dependencies(content, filename)
+
+                if deps:
+                    logger.info(f"  Found {len(deps)} dependencies")
+                    for dep_name, dep_version in deps:
+                        # Skip rows that already exist (race-condition protection)
+                        existing = db.query(ArtifactDependency).filter(
+                            ArtifactDependency.artifact_id == tag.artifact_id,
+                            ArtifactDependency.dependency_project == "_pypi",
+                            ArtifactDependency.dependency_package == dep_name,
+                        ).first()
+
+                        if not existing:
+                            dep = ArtifactDependency(
+                                artifact_id=tag.artifact_id,
+                                dependency_project="_pypi",
+                                dependency_package=dep_name,
+                                version_constraint=dep_version if dep_version else "*",
+                            )
+                            db.add(dep)
+                            dependencies_added += 1
+                            logger.info(f"    + {dep_name} {dep_version or '*'}")
+
+                    db.commit()
+                else:
+                    logger.info("  No dependencies found")
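+
+                # Note: the commit above runs once per artifact, so an
+                # interrupted backfill keeps the progress it has made; on a
+                # rerun, artifacts with recorded dependencies are skipped by
+                # the existing_deps check at the top of this loop.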
+
+                artifacts_processed += 1
+
+        logger.info("")
+        logger.info("=" * 50)
+        logger.info("Backfill complete!")
+        logger.info(f"  Total artifacts: {total_artifacts}")
+        logger.info(f"  Already had deps: {artifacts_with_deps}")
+        logger.info(f"  Processed: {artifacts_processed}")
+        logger.info(f"  Dependencies added: {dependencies_added}")
+        if dry_run:
+            logger.info("  (DRY RUN - no changes made)")
+
+    finally:
+        db.close()
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Backfill dependencies for cached PyPI packages"
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Preview what would be done without making changes",
+    )
+    args = parser.parse_args()
+
+    backfill_dependencies(dry_run=args.dry_run)
+
+
+if __name__ == "__main__":
+    main()