#!/usr/bin/env python3
"""
Backfill script to extract dependencies from cached PyPI packages.

This script scans all artifacts in the _pypi project and extracts
Requires-Dist metadata from wheel and sdist files that don't already
have dependencies recorded.

Usage:
    # From within the container:
    python -m scripts.backfill_pypi_dependencies

    # Or with docker exec:
    docker exec orchard_orchard-server_1 python -m scripts.backfill_pypi_dependencies

    # Dry run (preview only):
    docker exec orchard_orchard-server_1 python -m scripts.backfill_pypi_dependencies --dry-run
"""

import argparse
import logging
import re
import sys
import tarfile
import zipfile
from io import BytesIO
from typing import List, Optional, Tuple

# Add parent directory to path for imports
sys.path.insert(0, "/app")

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from backend.app.config import get_settings
from backend.app.models import (
    Artifact,
    ArtifactDependency,
    Package,
    Project,
    Tag,
)
from backend.app.storage import get_storage

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


def parse_requires_dist(requires_dist: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a Requires-Dist line into (package_name, version_constraint)."""
    # Remove any environment markers (after semicolon)
    if ";" in requires_dist:
        requires_dist = requires_dist.split(";")[0].strip()

    # Match patterns like "package (>=1.0)" or "package>=1.0" or "package"
    match = re.match(
        r"^([a-zA-Z0-9][-a-zA-Z0-9._]*)\s*(?:\(([^)]+)\)|([<>=!~][^\s;]+))?",
        requires_dist.strip(),
    )

    if not match:
        return None, None

    package_name = match.group(1)
    version_constraint = match.group(2) or match.group(3)

    # Normalize package name (PEP 503)
    normalized_name = re.sub(r"[-_.]+", "-", package_name).lower()

    if version_constraint:
        version_constraint = version_constraint.strip()

    return normalized_name, version_constraint


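# Illustrative behavior of parse_requires_dist (example inputs chosen here, not
# taken from any real cached package):
#
#   parse_requires_dist('requests (>=2.25.0) ; python_version >= "3.7"')
#       -> ("requests", ">=2.25.0")          # environment marker after ";" is dropped
#   parse_requires_dist("charset_normalizer<4,>=2")
#       -> ("charset-normalizer", "<4,>=2")  # name normalized per PEP 503
#   parse_requires_dist("flask")
#       -> ("flask", None)                   # no version constraint

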
def extract_requires_from_metadata(metadata_content: str) -> List[Tuple[str, Optional[str]]]:
    """Extract all Requires-Dist entries from METADATA/PKG-INFO content."""
    dependencies = []

    for line in metadata_content.split("\n"):
        if line.startswith("Requires-Dist:"):
            value = line[len("Requires-Dist:"):].strip()
            pkg_name, version = parse_requires_dist(value)
            if pkg_name:
                dependencies.append((pkg_name, version))

    return dependencies


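# For example, given METADATA content like (hypothetical package):
#
#   Name: demo
#   Requires-Dist: requests (>=2.25.0)
#   Requires-Dist: click ; extra == "cli"
#
# this yields [("requests", ">=2.25.0"), ("click", None)]. Note that extras
# dependencies are kept, but their markers are discarded by parse_requires_dist.

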
def extract_metadata_from_wheel(content: bytes) -> Optional[str]:
    """Extract METADATA file content from a wheel (zip) file."""
    try:
        with zipfile.ZipFile(BytesIO(content)) as zf:
            # Per PEP 427, wheel metadata lives at <name>-<version>.dist-info/METADATA
            for name in zf.namelist():
                if name.endswith(".dist-info/METADATA"):
                    return zf.read(name).decode("utf-8", errors="replace")
    except Exception as e:
        logger.warning(f"Failed to extract metadata from wheel: {e}")
    return None


def extract_metadata_from_sdist(content: bytes) -> Optional[str]:
    """Extract PKG-INFO file content from a source distribution (.tar.gz)."""
    try:
        with tarfile.open(fileobj=BytesIO(content), mode="r:gz") as tf:
            for member in tf.getmembers():
                # Only match the top-level PKG-INFO ("<name>-<version>/PKG-INFO"),
                # not copies nested deeper in the archive (e.g. under *.egg-info/).
                if member.name.endswith("/PKG-INFO") and member.name.count("/") == 1:
                    f = tf.extractfile(member)
                    if f:
                        return f.read().decode("utf-8", errors="replace")
    except Exception as e:
        logger.warning(f"Failed to extract metadata from sdist: {e}")
    return None


def extract_dependencies(content: bytes, filename: str) -> List[Tuple[str, Optional[str]]]:
    """Extract dependencies from a PyPI package file."""
    metadata = None

    if filename.endswith(".whl"):
        metadata = extract_metadata_from_wheel(content)
    elif filename.endswith(".tar.gz"):
        metadata = extract_metadata_from_sdist(content)

    if metadata:
        return extract_requires_from_metadata(metadata)

    return []


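# A quick way to sanity-check the extraction pipeline end to end (hypothetical
# "demo" package; kept as a comment so importing this module stays side-effect free):
#
#   buf = BytesIO()
#   with zipfile.ZipFile(buf, "w") as zf:
#       zf.writestr(
#           "demo-1.0.dist-info/METADATA",
#           "Metadata-Version: 2.1\nName: demo\nRequires-Dist: requests (>=2.25.0)\n",
#       )
#   extract_dependencies(buf.getvalue(), "demo-1.0-py3-none-any.whl")
#   # -> [("requests", ">=2.25.0")]

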
def backfill_dependencies(dry_run: bool = False):
    """Main backfill function."""
    settings = get_settings()

    # Create database connection
    engine = create_engine(settings.database_url)
    Session = sessionmaker(bind=engine)
    db = Session()

    # Create storage client
    storage = get_storage()

    try:
        # Find the _pypi project
        pypi_project = db.query(Project).filter(Project.name == "_pypi").first()
        if not pypi_project:
            logger.info("No _pypi project found. Nothing to backfill.")
            return

        # Get all packages in _pypi
        packages = db.query(Package).filter(Package.project_id == pypi_project.id).all()
        logger.info(f"Found {len(packages)} packages in _pypi project")

        total_artifacts = 0
        artifacts_with_deps = 0
        artifacts_processed = 0
        dependencies_added = 0

        for package in packages:
            # Get all tags (each tag points to an artifact)
            tags = db.query(Tag).filter(Tag.package_id == package.id).all()

            for tag in tags:
                total_artifacts += 1
                filename = tag.name

                # Skip non-package files (like .metadata files)
                if not (filename.endswith(".whl") or filename.endswith(".tar.gz")):
                    continue

                # Check if this artifact already has dependencies
                existing_deps = db.query(ArtifactDependency).filter(
                    ArtifactDependency.artifact_id == tag.artifact_id
                ).count()

                if existing_deps > 0:
                    artifacts_with_deps += 1
                    continue

                # Get the artifact
                artifact = db.query(Artifact).filter(Artifact.id == tag.artifact_id).first()
                if not artifact:
                    logger.warning(f"Artifact {tag.artifact_id} not found for tag {filename}")
                    continue

                logger.info(f"Processing {package.name}/{filename}...")

                if dry_run:
                    logger.info(f"  [DRY RUN] Would extract dependencies from {filename}")
                    artifacts_processed += 1
                    continue

                # Download the artifact from S3
                try:
                    content = storage.get(artifact.s3_key)
                except Exception as e:
                    logger.error(f"  Failed to download {filename}: {e}")
                    continue

                # Extract dependencies
                deps = extract_dependencies(content, filename)

                if deps:
                    logger.info(f"  Found {len(deps)} dependencies")
                    for dep_name, dep_version in deps:
                        # Check if already exists (race condition protection)
                        existing = db.query(ArtifactDependency).filter(
                            ArtifactDependency.artifact_id == tag.artifact_id,
                            ArtifactDependency.dependency_project == "_pypi",
                            ArtifactDependency.dependency_package == dep_name,
                        ).first()

                        if not existing:
                            dep = ArtifactDependency(
                                artifact_id=tag.artifact_id,
                                dependency_project="_pypi",
                                dependency_package=dep_name,
                                version_constraint=dep_version if dep_version else "*",
                            )
                            db.add(dep)
                            dependencies_added += 1
                            logger.info(f"    + {dep_name} {dep_version or '*'}")

                    # Commit per artifact so progress survives a mid-run failure
                    db.commit()
                else:
                    logger.info("  No dependencies found")

                artifacts_processed += 1

        logger.info("")
        logger.info("=" * 50)
        logger.info("Backfill complete!")
        logger.info(f"  Total artifacts: {total_artifacts}")
        logger.info(f"  Already had deps: {artifacts_with_deps}")
        logger.info(f"  Processed: {artifacts_processed}")
        logger.info(f"  Dependencies added: {dependencies_added}")
        if dry_run:
            logger.info("  (DRY RUN - no changes made)")

    finally:
        db.close()


def main():
    parser = argparse.ArgumentParser(
        description="Backfill dependencies for cached PyPI packages"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview what would be done without making changes",
    )
    args = parser.parse_args()

    backfill_dependencies(dry_run=args.dry_run)


if __name__ == "__main__":
    main()