From 7068f36cb5725035a7992fe117a8fef24df55ea6 Mon Sep 17 00:00:00 2001
From: Mondo Diaz
Date: Tue, 3 Feb 2026 17:18:54 -0600
Subject: [PATCH] Restore dependency extraction from PyPI packages

Re-adds the dependency extraction that was accidentally removed with the
proactive caching feature. Now when a PyPI package is cached:

1. Extract METADATA from a wheel or PKG-INFO from an sdist
2. Parse the Requires-Dist lines for dependencies
3. Store them in the artifact_dependencies table

This restores the dependency graph functionality for PyPI packages.
---
 backend/app/pypi_proxy.py | 159 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 157 insertions(+), 2 deletions(-)

diff --git a/backend/app/pypi_proxy.py b/backend/app/pypi_proxy.py
index 2dc56a5..e5c37e1 100644
--- a/backend/app/pypi_proxy.py
+++ b/backend/app/pypi_proxy.py
@@ -10,8 +10,11 @@ import json
 import logging
 import os
 import re
+import tarfile
 import tempfile
-from typing import Optional
+import zipfile
+from typing import Optional, List, Tuple
 from urllib.parse import urljoin, urlparse, quote, unquote
 
 import httpx
@@ -20,7 +23,7 @@ from fastapi.responses import StreamingResponse, HTMLResponse, RedirectResponse
 from sqlalchemy.orm import Session
 
 from .database import get_db
-from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, PackageVersion
+from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, PackageVersion, ArtifactDependency
 from .storage import S3Storage, get_storage
 from .config import get_env_upstream_sources, get_settings
@@ -34,6 +37,131 @@
 PROXY_CONNECT_TIMEOUT = 30.0
 PROXY_READ_TIMEOUT = 60.0
 
 
+def _parse_requires_dist(requires_dist: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a Requires-Dist line into (package_name, version_constraint).
+
+    Examples:
+        "requests (>=2.25.0)" -> ("requests", ">=2.25.0")
+        "typing-extensions; python_version < '3.8'" -> ("typing-extensions", None)
+        "numpy>=1.21.0" -> ("numpy", ">=1.21.0")
+        "certifi" -> ("certifi", None)
+
+    Returns:
+        Tuple of (normalized_package_name, version_constraint or None),
+        or (None, None) if the line cannot be parsed.
+    """
+    # Drop environment markers (everything after the semicolon). Marker-guarded
+    # dependencies (e.g. extras) are still recorded, so the stored graph is an
+    # over-approximation of what a given install actually pulls in.
+    if ';' in requires_dist:
+        requires_dist = requires_dist.split(';')[0].strip()
+
+    # Match "package (>=1.0)", "package>=1.0", "package[extra]>=1.0", or a
+    # bare "package"; a bracketed extras list after the name is skipped.
+    match = re.match(
+        r'^([a-zA-Z0-9][-a-zA-Z0-9._]*)\s*(?:\[[^\]]*\])?\s*(?:\(([^)]+)\)|([<>=!~][^\s;]+))?',
+        requires_dist.strip()
+    )
+
+    if not match:
+        return None, None
+
+    package_name = match.group(1)
+    # Version can be in parentheses (group 2) or directly after the name (group 3)
+    version_constraint = match.group(2) or match.group(3)
+
+    # Normalize the package name (PEP 503)
+    normalized_name = re.sub(r'[-_.]+', '-', package_name).lower()
+
+    # Clean up the version constraint
+    if version_constraint:
+        version_constraint = version_constraint.strip()
+
+    return normalized_name, version_constraint
+
+
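+# Worked examples for the parser above (inputs are illustrative, not an
+# exhaustive test; the extras form assumes metadata written as
+# "name[extra]constraint", which the regex handles by skipping the
+# bracketed part):
+#
+#   _parse_requires_dist("urllib3 (<3,>=1.21.1)")             -> ("urllib3", "<3,>=1.21.1")
+#   _parse_requires_dist("requests[socks]>=2.25.0")           -> ("requests", ">=2.25.0")
+#   _parse_requires_dist("pywin32; sys_platform == 'win32'")  -> ("pywin32", None)
+
+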
+def _extract_requires_from_metadata(metadata_content: str) -> List[Tuple[str, Optional[str]]]:
+    """Extract all Requires-Dist entries from METADATA/PKG-INFO content.
+
+    Args:
+        metadata_content: The content of a METADATA or PKG-INFO file
+
+    Returns:
+        List of (package_name, version_constraint) tuples
+    """
+    dependencies = []
+
+    for line in metadata_content.split('\n'):
+        if line.startswith('Requires-Dist:'):
+            value = line[len('Requires-Dist:'):].strip()
+            pkg_name, version = _parse_requires_dist(value)
+            if pkg_name:
+                dependencies.append((pkg_name, version))
+
+    return dependencies
+
+
+def _extract_metadata_from_wheel(file_path: str) -> Optional[str]:
+    """Extract the METADATA file content from a wheel (zip) file.
+
+    Args:
+        file_path: Path to the wheel file
+
+    Returns:
+        METADATA file content as a string, or None if not found
+    """
+    try:
+        with zipfile.ZipFile(file_path) as zf:
+            for name in zf.namelist():
+                if name.endswith('.dist-info/METADATA'):
+                    return zf.read(name).decode('utf-8', errors='replace')
+    except Exception as e:
+        logger.warning(f"Failed to extract metadata from wheel: {e}")
+    return None
+
+
+def _extract_metadata_from_sdist(file_path: str) -> Optional[str]:
+    """Extract the PKG-INFO file content from a source distribution (.tar.gz).
+
+    Args:
+        file_path: Path to the tarball file
+
+    Returns:
+        PKG-INFO file content as a string, or None if not found
+    """
+    try:
+        with tarfile.open(file_path, mode='r:gz') as tf:
+            for member in tf.getmembers():
+                # Only the top-level "<name>-<version>/PKG-INFO" counts
+                if member.name.endswith('/PKG-INFO') and member.name.count('/') == 1:
+                    f = tf.extractfile(member)
+                    if f:
+                        return f.read().decode('utf-8', errors='replace')
+    except Exception as e:
+        logger.warning(f"Failed to extract metadata from sdist: {e}")
+    return None
+
+
+def _extract_dependencies_from_file(file_path: str, filename: str) -> List[Tuple[str, Optional[str]]]:
+    """Extract dependencies from a PyPI package file.
+
+    Supports wheel (.whl) and source distribution (.tar.gz) formats; other
+    formats yield an empty list.
+
+    Args:
+        file_path: Path to the package file
+        filename: The original filename
+
+    Returns:
+        List of (package_name, version_constraint) tuples
+    """
+    metadata = None
+
+    if filename.endswith('.whl'):
+        metadata = _extract_metadata_from_wheel(file_path)
+    elif filename.endswith('.tar.gz'):
+        metadata = _extract_metadata_from_sdist(file_path)
+
+    if metadata:
+        return _extract_requires_from_metadata(metadata)
+
+    return []
+
+
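+# End-to-end sketch of the helpers above (hypothetical file; the output shown
+# is only an example, actual dependency lists depend on the package cached):
+#
+#   deps = _extract_dependencies_from_file(tmp_path, "requests-2.31.0-py3-none-any.whl")
+#   # -> e.g. [("charset-normalizer", "<4,>=2"), ("idna", "<4,>=2.5"),
+#   #          ("urllib3", "<3,>=1.21.1"), ("certifi", ">=2017.4.17")]
+
+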
 def _parse_upstream_error(response: httpx.Response) -> str:
     """Parse upstream error response to extract useful error details.
@@ -528,6 +656,9 @@ async def pypi_download_file(
 
     timeout = httpx.Timeout(300.0, connect=PROXY_CONNECT_TIMEOUT)  # 5 minutes for large files
 
+    # Dependencies extracted from the downloaded file (populated on cache miss)
+    extracted_deps = []
+
     # Fetch the file
     logger.info(f"PyPI proxy: fetching {filename} from {upstream_url}")
 
@@ -593,6 +724,11 @@
         size = result.size
         s3_key = result.s3_key
 
+        # Extract dependencies from the temp file before it is cleaned up
+        extracted_deps = _extract_dependencies_from_file(tmp_path, filename)
+        if extracted_deps:
+            logger.info(f"PyPI proxy: extracted {len(extracted_deps)} dependencies from {filename}")
+
         logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
     finally:
         # Clean up temp file
@@ -685,6 +821,25 @@
         )
         db.add(cached_url_record)
 
+    # Store the extracted dependencies, skipping any already recorded
+    if extracted_deps:
+        for dep_name, dep_version in extracted_deps:
+            # Check if this dependency already exists for this artifact
+            existing_dep = db.query(ArtifactDependency).filter(
+                ArtifactDependency.artifact_id == sha256,
+                ArtifactDependency.dependency_project == "_pypi",
+                ArtifactDependency.dependency_package == dep_name,
+            ).first()
+
+            if not existing_dep:
+                dep = ArtifactDependency(
+                    artifact_id=sha256,
+                    dependency_project="_pypi",
+                    dependency_package=dep_name,
+                    version_constraint=dep_version if dep_version else "*",
+                )
+                db.add(dep)
+
     db.commit()
 
     # Serve the file from S3
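
A quick ad-hoc check of the restored behavior after caching a package (a
sketch, not part of the patch; it assumes the app exposes a `SessionLocal`
session factory, which this diff does not show, so adjust the imports to the
project's actual layout):

    from app.database import SessionLocal  # assumed name
    from app.models import ArtifactDependency

    db = SessionLocal()
    rows = (
        db.query(ArtifactDependency)
        .filter(ArtifactDependency.dependency_project == "_pypi")
        .all()
    )
    for row in rows:
        print(row.dependency_package, row.version_constraint)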