Add backfill script for PyPI package dependencies
Script extracts Requires-Dist metadata from cached PyPI packages and stores it in the artifact_dependencies table. Usage: `docker exec <container> python -m backend.scripts.backfill_pypi_dependencies`. To preview without making changes: `docker exec <container> python -m backend.scripts.backfill_pypi_dependencies --dry-run`.
This commit is contained in:
1
backend/scripts/__init__.py
Normal file
1
backend/scripts/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Scripts package
|
||||
262
backend/scripts/backfill_pypi_dependencies.py
Normal file
262
backend/scripts/backfill_pypi_dependencies.py
Normal file
@@ -0,0 +1,262 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Backfill script to extract dependencies from cached PyPI packages.

This script scans all artifacts in the _pypi project and extracts
Requires-Dist metadata from wheel and sdist files that don't already
have dependencies recorded.

Usage:
    # From within the container:
    python -m backend.scripts.backfill_pypi_dependencies

    # Or with docker exec:
    docker exec orchard_orchard-server_1 python -m backend.scripts.backfill_pypi_dependencies

    # Dry run (preview only):
    docker exec orchard_orchard-server_1 python -m backend.scripts.backfill_pypi_dependencies --dry-run
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import tarfile
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
# Put the hard-coded path "/app" first on sys.path so the ``backend.*`` imports
# below can be resolved (presumably the application root inside the
# container -- confirm against the image layout).
sys.path.insert(0, "/app")
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from backend.app.config import get_settings
|
||||
from backend.app.models import (
|
||||
Artifact,
|
||||
ArtifactDependency,
|
||||
Package,
|
||||
Project,
|
||||
Tag,
|
||||
)
|
||||
from backend.app.storage import get_storage
|
||||
|
||||
# Configure logging at import time: INFO level, each line prefixed with a
# timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def parse_requires_dist(requires_dist: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a Requires-Dist value into (package_name, version_constraint).

    The environment marker (everything after the first ";") and any extras
    list ("[security,socks]") are discarded; only the distribution name and
    its version specifier are kept.

    Args:
        requires_dist: The value of a single Requires-Dist header, e.g.
            "requests[security] (>=2.0,<3) ; python_version < '3.8'".

    Returns:
        A ``(name, constraint)`` tuple. ``name`` is normalized per PEP 503
        (lowercased, runs of "-", "_" and "." collapsed to "-").
        ``constraint`` is the specifier text with surrounding whitespace
        removed, or ``None`` when the requirement has no specifier.
        ``(None, None)`` is returned when no package name can be parsed.
    """
    # Remove any environment marker (after the first semicolon).
    requirement = requires_dist.split(";", 1)[0].strip()

    # Matches "package (>=1.0)", "package>=1.0", "package[extra]>=1.0" or
    # just "package". The bare-specifier branch runs to the end of the
    # string so that "pkg>=1.0, <2.0" and "pkg >= 1.0" are kept whole.
    match = re.match(
        r"^([a-zA-Z0-9][-a-zA-Z0-9._]*)"  # distribution name
        r"\s*(?:\[[^\]]*\])?"  # optional extras, ignored
        r"\s*(?:\(([^)]+)\)|([<>=!~][^;]*))?",  # "(spec)" or bare spec
        requirement,
    )

    if not match:
        return None, None

    package_name = match.group(1)
    # Treat an empty / whitespace-only specifier the same as no specifier.
    version_constraint = (match.group(2) or match.group(3) or "").strip() or None

    # Normalize package name (PEP 503)
    normalized_name = re.sub(r"[-_.]+", "-", package_name).lower()

    return normalized_name, version_constraint
|
||||
|
||||
|
||||
def extract_requires_from_metadata(metadata_content: str) -> List[Tuple[str, Optional[str]]]:
    """Collect (package_name, version_constraint) pairs from METADATA/PKG-INFO text.

    Every line that begins with "Requires-Dist:" is passed to
    ``parse_requires_dist``; entries for which no package name could be
    parsed are dropped. Order of appearance is preserved.
    """
    header = "Requires-Dist:"
    parsed_entries = (
        parse_requires_dist(row[len(header):].strip())
        for row in metadata_content.split("\n")
        if row.startswith(header)
    )
    return [(name, constraint) for name, constraint in parsed_entries if name]
|
||||
|
||||
|
||||
def extract_metadata_from_wheel(content: bytes) -> Optional[str]:
    """Extract the METADATA file content from a wheel (zip) file.

    Args:
        content: Raw bytes of the ``.whl`` archive.

    Returns:
        The decoded text of the wheel's own ``<name>.dist-info/METADATA``
        (undecodable bytes are replaced), or ``None`` when the archive has
        no such top-level entry or cannot be read. Read failures are logged
        as warnings rather than raised.
    """
    try:
        with zipfile.ZipFile(BytesIO(content)) as zf:
            for name in zf.namelist():
                # Require exactly one "/" so that only the top-level
                # "*.dist-info/METADATA" matches. Deeper matches belong to
                # distributions vendored inside the wheel and would yield the
                # wrong dependencies. This mirrors the depth check used for
                # PKG-INFO in sdists.
                if name.endswith(".dist-info/METADATA") and name.count("/") == 1:
                    return zf.read(name).decode("utf-8", errors="replace")
    except Exception as e:
        # Best effort: a corrupt archive must not abort the whole backfill.
        logger.warning(f"Failed to extract metadata from wheel: {e}")
    return None
|
||||
|
||||
|
||||
def extract_metadata_from_sdist(content: bytes) -> Optional[str]:
    """Return the text of the top-level PKG-INFO in a gzipped source tarball.

    Only an entry named ``<dir>/PKG-INFO`` (exactly one path separator) is
    accepted. The bytes are decoded as UTF-8 with undecodable bytes replaced.
    ``None`` is returned when no such entry can be read; any failure while
    reading the archive is logged as a warning instead of being raised.
    """
    try:
        with tarfile.open(fileobj=BytesIO(content), mode="r:gz") as archive:
            for entry in archive.getmembers():
                # Splitting on the first "/" and requiring the remainder to be
                # exactly "PKG-INFO" accepts precisely the single-slash names.
                _top_dir, slash, remainder = entry.name.partition("/")
                if not slash or remainder != "PKG-INFO":
                    continue
                handle = archive.extractfile(entry)
                if not handle:
                    continue
                raw = handle.read()
                return raw.decode("utf-8", errors="replace")
    except Exception as e:
        logger.warning(f"Failed to extract metadata from sdist: {e}")
    return None
|
||||
|
||||
|
||||
def extract_dependencies(content: bytes, filename: str) -> List[Tuple[str, Optional[str]]]:
    """Return the (name, constraint) dependency pairs declared by a package file.

    The file type is chosen from the filename suffix: ".whl" is read as a
    wheel and ".tar.gz" as an sdist. Any other suffix, or a file whose
    metadata could not be extracted, yields an empty list.
    """
    if filename.endswith(".whl"):
        raw_metadata = extract_metadata_from_wheel(content)
    elif filename.endswith(".tar.gz"):
        raw_metadata = extract_metadata_from_sdist(content)
    else:
        raw_metadata = None

    return extract_requires_from_metadata(raw_metadata) if raw_metadata else []
|
||||
|
||||
|
||||
def _store_dependencies(db, artifact_id, deps: List[Tuple[str, Optional[str]]]) -> int:
    """Stage ArtifactDependency rows for one artifact and return how many were added.

    Rows are only added to the session; committing (or rolling back) is left
    to the caller so that one artifact's dependencies are stored atomically.
    """
    added = 0
    for dep_name, dep_version in deps:
        # Skip dependencies that are already recorded for this artifact.
        existing = db.query(ArtifactDependency).filter(
            ArtifactDependency.artifact_id == artifact_id,
            ArtifactDependency.dependency_project == "_pypi",
            ArtifactDependency.dependency_package == dep_name,
        ).first()
        if existing:
            continue

        db.add(
            ArtifactDependency(
                artifact_id=artifact_id,
                dependency_project="_pypi",
                dependency_package=dep_name,
                # "*" is stored when the requirement carries no specifier.
                version_constraint=dep_version if dep_version else "*",
            )
        )
        added += 1
        logger.info(f" + {dep_name} {dep_version or '*'}")
    return added


def backfill_dependencies(dry_run: bool = False):
    """Extract and store dependencies for cached PyPI artifacts that have none.

    Walks every tag of every package in the "_pypi" project. The tag name is
    treated as the filename; only ".whl" and ".tar.gz" tags are considered.
    Artifacts that already have ArtifactDependency rows are skipped. For the
    rest, the file is fetched from storage, its Requires-Dist entries are
    parsed, and one row per dependency is committed per artifact.

    A failure to download an artifact or to store its dependencies is logged
    and counted, the session is rolled back where needed, and processing
    continues with the next artifact.

    Args:
        dry_run: When true, only log which files would be processed; nothing
            is downloaded and nothing is written to the database.
    """
    # Local import: the exception class is needed only by this function.
    from sqlalchemy.exc import SQLAlchemyError

    settings = get_settings()

    # Create database connection
    engine = create_engine(settings.database_url)
    Session = sessionmaker(bind=engine)
    db = Session()

    try:
        # Created inside the try so the session is still closed if this raises.
        storage = get_storage()

        # Find the _pypi project
        pypi_project = db.query(Project).filter(Project.name == "_pypi").first()
        if not pypi_project:
            logger.info("No _pypi project found. Nothing to backfill.")
            return

        # Get all packages in _pypi
        packages = db.query(Package).filter(Package.project_id == pypi_project.id).all()
        logger.info(f"Found {len(packages)} packages in _pypi project")

        total_artifacts = 0
        artifacts_with_deps = 0
        artifacts_processed = 0
        artifacts_failed = 0
        dependencies_added = 0

        for package in packages:
            # Get all tags (each tag points to an artifact)
            tags = db.query(Tag).filter(Tag.package_id == package.id).all()

            for tag in tags:
                total_artifacts += 1
                filename = tag.name
                artifact_id = tag.artifact_id

                # Skip non-package files (like .metadata files)
                if not filename.endswith((".whl", ".tar.gz")):
                    continue

                # Check if this artifact already has dependencies
                existing_deps = db.query(ArtifactDependency).filter(
                    ArtifactDependency.artifact_id == artifact_id
                ).count()
                if existing_deps > 0:
                    artifacts_with_deps += 1
                    continue

                # Get the artifact
                artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first()
                if not artifact:
                    logger.warning(f"Artifact {artifact_id} not found for tag {filename}")
                    continue

                logger.info(f"Processing {package.name}/{filename}...")

                if dry_run:
                    logger.info(f" [DRY RUN] Would extract dependencies from {filename}")
                    artifacts_processed += 1
                    continue

                # Download the artifact from S3
                try:
                    content = storage.get(artifact.s3_key)
                except Exception as e:
                    logger.error(f" Failed to download {filename}: {e}")
                    artifacts_failed += 1
                    continue

                # Extract dependencies
                deps = extract_dependencies(content, filename)

                if deps:
                    logger.info(f" Found {len(deps)} dependencies")
                    try:
                        added = _store_dependencies(db, artifact_id, deps)
                        db.commit()
                    except SQLAlchemyError as e:
                        # Roll back so the session is usable for the next
                        # artifact instead of aborting the whole backfill.
                        db.rollback()
                        logger.error(f" Failed to store dependencies for {filename}: {e}")
                        artifacts_failed += 1
                        continue
                    # Count rows only once they are actually committed.
                    dependencies_added += added
                else:
                    logger.info(" No dependencies found")

                artifacts_processed += 1

        logger.info("")
        logger.info("=" * 50)
        logger.info("Backfill complete!")
        logger.info(f" Total artifacts: {total_artifacts}")
        logger.info(f" Already had deps: {artifacts_with_deps}")
        logger.info(f" Processed: {artifacts_processed}")
        logger.info(f" Failed: {artifacts_failed}")
        logger.info(f" Dependencies added: {dependencies_added}")
        if dry_run:
            logger.info(" (DRY RUN - no changes made)")

    finally:
        db.close()
        # Release pooled connections held by the engine created above.
        engine.dispose()
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: read the flags, then run the backfill."""
    cli = argparse.ArgumentParser(description="Backfill dependencies for cached PyPI packages")
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview what would be done without making changes",
    )
    options = cli.parse_args()

    # argparse exposes "--dry-run" as the attribute "dry_run".
    backfill_dependencies(dry_run=options.dry_run)
|
||||
|
||||
|
||||
# Run the backfill only when this module is executed directly (as a script or
# via ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user