Extract and store dependencies from PyPI packages

- Add functions to parse Requires-Dist metadata from wheel and sdist files
- Store extracted dependencies in artifact_dependencies table
- Fix streaming response for cached artifacts (proper tuple unpacking)
- Fix version uniqueness check to use version string instead of artifact_id
- Skip creating versions for .metadata files
This commit is contained in:
Mondo Diaz
2026-01-30 15:14:52 -06:00
parent c5f75e4fd6
commit 47b3eb439d

View File

@@ -8,7 +8,10 @@ Artifacts are cached on first access through configured upstream sources.
import hashlib
import logging
import re
from typing import Optional
import tarfile
import zipfile
from io import BytesIO
from typing import Optional, List, Tuple
from urllib.parse import urljoin, urlparse, quote, unquote
import httpx
@@ -17,7 +20,7 @@ from fastapi.responses import StreamingResponse, HTMLResponse
from sqlalchemy.orm import Session
from .database import get_db
from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, Tag, PackageVersion
from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, Tag, PackageVersion, ArtifactDependency
from .storage import S3Storage, get_storage
from .config import get_env_upstream_sources
@@ -25,6 +28,140 @@ logger = logging.getLogger(__name__)
router = APIRouter(prefix="/pypi", tags=["pypi-proxy"])
def _parse_requires_dist(requires_dist: str) -> Tuple[str, Optional[str]]:
"""Parse a Requires-Dist line into (package_name, version_constraint).
Examples:
"requests (>=2.25.0)" -> ("requests", ">=2.25.0")
"typing-extensions; python_version < '3.8'" -> ("typing-extensions", None)
"numpy>=1.21.0" -> ("numpy", ">=1.21.0")
"certifi" -> ("certifi", None)
Returns:
Tuple of (normalized_package_name, version_constraint or None)
"""
# Remove any environment markers (after semicolon)
if ';' in requires_dist:
requires_dist = requires_dist.split(';')[0].strip()
# Match patterns like "package (>=1.0)" or "package>=1.0" or "package"
# Pattern breakdown: package name, optional whitespace, optional version in parens or directly
match = re.match(
r'^([a-zA-Z0-9][-a-zA-Z0-9._]*)\s*(?:\(([^)]+)\)|([<>=!~][^\s;]+))?',
requires_dist.strip()
)
if not match:
return None, None
package_name = match.group(1)
# Version can be in parentheses (group 2) or directly after name (group 3)
version_constraint = match.group(2) or match.group(3)
# Normalize package name (PEP 503)
normalized_name = re.sub(r'[-_.]+', '-', package_name).lower()
# Clean up version constraint
if version_constraint:
version_constraint = version_constraint.strip()
return normalized_name, version_constraint
def _extract_requires_from_metadata(metadata_content: str) -> List[Tuple[str, Optional[str]]]:
"""Extract all Requires-Dist entries from METADATA/PKG-INFO content.
Args:
metadata_content: The content of a METADATA or PKG-INFO file
Returns:
List of (package_name, version_constraint) tuples
"""
dependencies = []
for line in metadata_content.split('\n'):
if line.startswith('Requires-Dist:'):
# Extract the value after "Requires-Dist:"
value = line[len('Requires-Dist:'):].strip()
pkg_name, version = _parse_requires_dist(value)
if pkg_name:
dependencies.append((pkg_name, version))
return dependencies
def _extract_metadata_from_wheel(content: bytes) -> Optional[str]:
"""Extract METADATA file content from a wheel (zip) file.
Wheel files have structure: {package}-{version}.dist-info/METADATA
Args:
content: The wheel file content as bytes
Returns:
METADATA file content as string, or None if not found
"""
try:
with zipfile.ZipFile(BytesIO(content)) as zf:
# Find the .dist-info directory
for name in zf.namelist():
if name.endswith('.dist-info/METADATA'):
return zf.read(name).decode('utf-8', errors='replace')
except Exception as e:
logger.warning(f"Failed to extract metadata from wheel: {e}")
return None
def _extract_metadata_from_sdist(content: bytes, filename: str) -> Optional[str]:
"""Extract PKG-INFO file content from a source distribution (.tar.gz).
Source distributions have structure: {package}-{version}/PKG-INFO
Args:
content: The tarball content as bytes
filename: The original filename (used to determine package name)
Returns:
PKG-INFO file content as string, or None if not found
"""
try:
with tarfile.open(fileobj=BytesIO(content), mode='r:gz') as tf:
# Find PKG-INFO in the root directory of the archive
for member in tf.getmembers():
if member.name.endswith('/PKG-INFO') and member.name.count('/') == 1:
f = tf.extractfile(member)
if f:
return f.read().decode('utf-8', errors='replace')
except Exception as e:
logger.warning(f"Failed to extract metadata from sdist {filename}: {e}")
return None
def _extract_dependencies(content: bytes, filename: str) -> List[Tuple[str, Optional[str]]]:
"""Extract dependencies from a PyPI package file.
Supports wheel (.whl) and source distribution (.tar.gz) formats.
Args:
content: The package file content as bytes
filename: The original filename
Returns:
List of (package_name, version_constraint) tuples
"""
metadata = None
if filename.endswith('.whl'):
metadata = _extract_metadata_from_wheel(content)
elif filename.endswith('.tar.gz'):
metadata = _extract_metadata_from_sdist(content, filename)
if metadata:
return _extract_requires_from_metadata(metadata)
return []
# Timeout configuration for proxy requests
PROXY_CONNECT_TIMEOUT = 30.0
PROXY_READ_TIMEOUT = 60.0
@@ -403,14 +540,22 @@ async def pypi_download_file(
# Stream from S3
try:
content_stream = storage.get_stream(artifact.s3_key)
stream, content_length, _ = storage.get_stream(artifact.s3_key)
def stream_content():
"""Generator that yields chunks from the S3 stream."""
try:
for chunk in stream.iter_chunks():
yield chunk
finally:
stream.close()
return StreamingResponse(
content_stream,
stream_content(),
media_type=artifact.content_type or "application/octet-stream",
headers={
"Content-Disposition": f'attachment; filename="{filename}"',
"Content-Length": str(artifact.size),
"Content-Length": str(content_length),
"X-Checksum-SHA256": artifact.id,
"X-Cache": "HIT",
}
@@ -561,11 +706,13 @@ async def pypi_download_file(
db.add(tag)
# Extract and create version
# Only create version for actual package files, not .metadata files
version = _extract_pypi_version(filename)
if version:
if version and not filename.endswith('.metadata'):
# Check by version string (the unique constraint is on package_id + version)
existing_version = db.query(PackageVersion).filter(
PackageVersion.package_id == package.id,
PackageVersion.artifact_id == sha256,
PackageVersion.version == version,
).first()
if not existing_version:
pkg_version = PackageVersion(
@@ -587,6 +734,27 @@ async def pypi_download_file(
)
db.add(cached_url_record)
# Extract and store dependencies
dependencies = _extract_dependencies(content, filename)
if dependencies:
logger.info(f"PyPI proxy: extracted {len(dependencies)} dependencies from {filename}")
for dep_name, dep_version in dependencies:
# Check if this dependency already exists for this artifact
existing_dep = db.query(ArtifactDependency).filter(
ArtifactDependency.artifact_id == sha256,
ArtifactDependency.dependency_project == "_pypi",
ArtifactDependency.dependency_package == dep_name,
).first()
if not existing_dep:
dep = ArtifactDependency(
artifact_id=sha256,
dependency_project="_pypi",
dependency_package=dep_name,
version_constraint=dep_version if dep_version else "*",
)
db.add(dep)
db.commit()
# Return the file