From 47b3eb439d71a8d816a7dd9e4d5602c0f3ff60d2 Mon Sep 17 00:00:00 2001
From: Mondo Diaz
Date: Fri, 30 Jan 2026 15:14:52 -0600
Subject: [PATCH] Extract and store dependencies from PyPI packages

- Add functions to parse Requires-Dist metadata from wheel and sdist files
- Store extracted dependencies in the artifact_dependencies table
- Fix streaming response for cached artifacts (proper tuple unpacking)
- Fix version uniqueness check to use the version string instead of artifact_id
- Skip creating versions for .metadata files
---
 backend/app/pypi_proxy.py | 182 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 175 insertions(+), 7 deletions(-)

diff --git a/backend/app/pypi_proxy.py b/backend/app/pypi_proxy.py
index f336c0c..6aec04f 100644
--- a/backend/app/pypi_proxy.py
+++ b/backend/app/pypi_proxy.py
@@ -8,7 +8,10 @@ Artifacts are cached on first access through configured upstream sources.
 import hashlib
 import logging
 import re
-from typing import Optional
+import tarfile
+import zipfile
+from io import BytesIO
+from typing import Optional, List, Tuple
 from urllib.parse import urljoin, urlparse, quote, unquote
 
 import httpx
@@ -17,7 +20,7 @@ from fastapi.responses import StreamingResponse, HTMLResponse
 from sqlalchemy.orm import Session
 
 from .database import get_db
-from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, Tag, PackageVersion
+from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, Tag, PackageVersion, ArtifactDependency
 from .storage import S3Storage, get_storage
 from .config import get_env_upstream_sources
 
@@ -25,6 +28,140 @@ logger = logging.getLogger(__name__)
 
 router = APIRouter(prefix="/pypi", tags=["pypi-proxy"])
 
+
+def _parse_requires_dist(requires_dist: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a Requires-Dist line into (package_name, version_constraint).
+
+    Examples:
+        "requests (>=2.25.0)" -> ("requests", ">=2.25.0")
+        "typing-extensions; python_version < '3.8'" -> ("typing-extensions", None)
+        "numpy>=1.21.0" -> ("numpy", ">=1.21.0")
+        "certifi" -> ("certifi", None)
+
+    Returns:
+        Tuple of (normalized_package_name, version_constraint or None); (None, None) if unparseable
+    """
+    # Drop any environment marker (everything after the first semicolon)
+    if ';' in requires_dist:
+        requires_dist = requires_dist.split(';')[0].strip()
+
+    # Match "package (>=1.0)", "package>=1.0", or a bare "package"
+    # Pattern: name, optional whitespace, optional constraint in parentheses or attached directly
+    match = re.match(
+        r'^([a-zA-Z0-9][-a-zA-Z0-9._]*)\s*(?:\(([^)]+)\)|([<>=!~][^\s;]+))?',
+        requires_dist.strip()
+    )
+
+    if not match:
+        return None, None
+
+    package_name = match.group(1)
+    # Version can be in parentheses (group 2) or directly after the name (group 3)
+    version_constraint = match.group(2) or match.group(3)
+
+    # Normalize the package name (PEP 503)
+    normalized_name = re.sub(r'[-_.]+', '-', package_name).lower()
+
+    # Clean up the version constraint
+    if version_constraint:
+        version_constraint = version_constraint.strip()
+
+    return normalized_name, version_constraint
+
+
+def _extract_requires_from_metadata(metadata_content: str) -> List[Tuple[str, Optional[str]]]:
+    """Extract all Requires-Dist entries from METADATA/PKG-INFO content.
+
+    Args:
+        metadata_content: The content of a METADATA or PKG-INFO file
+
+    Returns:
+        List of (package_name, version_constraint) tuples
+    """
+    dependencies = []
+
+    for line in metadata_content.split('\n'):
+        if line.startswith('Requires-Dist:'):
+            # Extract the value after "Requires-Dist:"
+            value = line[len('Requires-Dist:'):].strip()
+            pkg_name, version = _parse_requires_dist(value)
+            if pkg_name:
+                dependencies.append((pkg_name, version))
+
+    return dependencies
+
+
+def _extract_metadata_from_wheel(content: bytes) -> Optional[str]:
+    """Extract METADATA file content from a wheel (zip) file.
+
+    Wheel files have the structure: {package}-{version}.dist-info/METADATA
+
+    Args:
+        content: The wheel file content as bytes
+
+    Returns:
+        METADATA file content as a string, or None if not found
+    """
+    try:
+        with zipfile.ZipFile(BytesIO(content)) as zf:
+            # Find the METADATA file inside the .dist-info directory
+            for name in zf.namelist():
+                if name.endswith('.dist-info/METADATA'):
+                    return zf.read(name).decode('utf-8', errors='replace')
+    except Exception as e:
+        logger.warning(f"Failed to extract metadata from wheel: {e}")
+    return None
+
+
+def _extract_metadata_from_sdist(content: bytes, filename: str) -> Optional[str]:
+    """Extract PKG-INFO file content from a source distribution (.tar.gz).
+
+    Source distributions have the structure: {package}-{version}/PKG-INFO
+
+    Args:
+        content: The tarball content as bytes
+        filename: The original filename (used only in log messages)
+
+    Returns:
+        PKG-INFO file content as a string, or None if not found
+    """
+    try:
+        with tarfile.open(fileobj=BytesIO(content), mode='r:gz') as tf:
+            # Find PKG-INFO in the single top-level directory of the archive
+            for member in tf.getmembers():
+                if member.name.endswith('/PKG-INFO') and member.name.count('/') == 1:
+                    f = tf.extractfile(member)
+                    if f:
+                        return f.read().decode('utf-8', errors='replace')
+    except Exception as e:
+        logger.warning(f"Failed to extract metadata from sdist {filename}: {e}")
+    return None
+
+
+def _extract_dependencies(content: bytes, filename: str) -> List[Tuple[str, Optional[str]]]:
+    """Extract dependencies from a PyPI package file.
+
+    Supports wheel (.whl) and source distribution (.tar.gz) formats.
+ + Args: + content: The package file content as bytes + filename: The original filename + + Returns: + List of (package_name, version_constraint) tuples + """ + metadata = None + + if filename.endswith('.whl'): + metadata = _extract_metadata_from_wheel(content) + elif filename.endswith('.tar.gz'): + metadata = _extract_metadata_from_sdist(content, filename) + + if metadata: + return _extract_requires_from_metadata(metadata) + + return [] + # Timeout configuration for proxy requests PROXY_CONNECT_TIMEOUT = 30.0 PROXY_READ_TIMEOUT = 60.0 @@ -403,14 +540,22 @@ async def pypi_download_file( # Stream from S3 try: - content_stream = storage.get_stream(artifact.s3_key) + stream, content_length, _ = storage.get_stream(artifact.s3_key) + + def stream_content(): + """Generator that yields chunks from the S3 stream.""" + try: + for chunk in stream.iter_chunks(): + yield chunk + finally: + stream.close() return StreamingResponse( - content_stream, + stream_content(), media_type=artifact.content_type or "application/octet-stream", headers={ "Content-Disposition": f'attachment; filename="{filename}"', - "Content-Length": str(artifact.size), + "Content-Length": str(content_length), "X-Checksum-SHA256": artifact.id, "X-Cache": "HIT", } @@ -561,11 +706,13 @@ async def pypi_download_file( db.add(tag) # Extract and create version + # Only create version for actual package files, not .metadata files version = _extract_pypi_version(filename) - if version: + if version and not filename.endswith('.metadata'): + # Check by version string (the unique constraint is on package_id + version) existing_version = db.query(PackageVersion).filter( PackageVersion.package_id == package.id, - PackageVersion.artifact_id == sha256, + PackageVersion.version == version, ).first() if not existing_version: pkg_version = PackageVersion( @@ -587,6 +734,27 @@ async def pypi_download_file( ) db.add(cached_url_record) + # Extract and store dependencies + dependencies = _extract_dependencies(content, filename) + if dependencies: + logger.info(f"PyPI proxy: extracted {len(dependencies)} dependencies from {filename}") + for dep_name, dep_version in dependencies: + # Check if this dependency already exists for this artifact + existing_dep = db.query(ArtifactDependency).filter( + ArtifactDependency.artifact_id == sha256, + ArtifactDependency.dependency_project == "_pypi", + ArtifactDependency.dependency_package == dep_name, + ).first() + + if not existing_dep: + dep = ArtifactDependency( + artifact_id=sha256, + dependency_project="_pypi", + dependency_package=dep_name, + version_constraint=dep_version if dep_version else "*", + ) + db.add(dep) + db.commit() # Return the file
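
A quick sanity check for the new parsing helpers. This is a minimal pytest-style sketch, not part of the patch; the backend.app.pypi_proxy import path and the test module name are assumptions about the repository layout:

# test_requires_dist_parsing.py -- illustrative sketch, not included in this patch
from backend.app.pypi_proxy import (
    _parse_requires_dist,
    _extract_requires_from_metadata,
)

def test_parse_requires_dist():
    # Parenthesized constraint (older metadata style)
    assert _parse_requires_dist("requests (>=2.25.0)") == ("requests", ">=2.25.0")
    # Environment marker is stripped, leaving no constraint
    assert _parse_requires_dist(
        "typing-extensions; python_version < '3.8'"
    ) == ("typing-extensions", None)
    # Constraint attached directly to the name
    assert _parse_requires_dist("numpy>=1.21.0") == ("numpy", ">=1.21.0")
    # Bare name is normalized per PEP 503
    assert _parse_requires_dist("Typing_Extensions") == ("typing-extensions", None)

def test_extract_requires_from_metadata():
    # Non-Requires-Dist headers are ignored; order of entries is preserved
    metadata = (
        "Metadata-Version: 2.1\n"
        "Name: example\n"
        "Requires-Dist: requests (>=2.25.0)\n"
        "Requires-Dist: certifi\n"
    )
    assert _extract_requires_from_metadata(metadata) == [
        ("requests", ">=2.25.0"),
        ("certifi", None),
    ]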
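And an end-to-end check of _extract_dependencies against a hand-built, in-memory wheel. Again a sketch under the same import-path assumption; a real wheel produced by a build backend would also carry WHEEL and RECORD files, which the extractor does not require:

# sketch: a demo-1.0.0 wheel containing only a dist-info METADATA file
import zipfile
from io import BytesIO

from backend.app.pypi_proxy import _extract_dependencies

METADATA = (
    "Metadata-Version: 2.1\n"
    "Name: demo\n"
    "Version: 1.0.0\n"
    "Requires-Dist: requests (>=2.25.0)\n"
    "Requires-Dist: certifi\n"
)

buf = BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
    # The .whl suffix routes the content through _extract_metadata_from_wheel
    zf.writestr("demo-1.0.0.dist-info/METADATA", METADATA)

deps = _extract_dependencies(buf.getvalue(), "demo-1.0.0-py3-none-any.whl")
assert deps == [("requests", ">=2.25.0"), ("certifi", None)]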