Extract and store dependencies from PyPI packages
- Add functions to parse Requires-Dist metadata from wheel and sdist files
- Store extracted dependencies in artifact_dependencies table
- Fix streaming response for cached artifacts (proper tuple unpacking)
- Fix version uniqueness check to use version string instead of artifact_id
- Skip creating versions for .metadata files
@@ -8,7 +8,10 @@ Artifacts are cached on first access through configured upstream sources.
 import hashlib
 import logging
 import re
-from typing import Optional
+import tarfile
+import zipfile
+from io import BytesIO
+from typing import Optional, List, Tuple
 from urllib.parse import urljoin, urlparse, quote, unquote
 
 import httpx
@@ -17,7 +20,7 @@ from fastapi.responses import StreamingResponse, HTMLResponse
 from sqlalchemy.orm import Session
 
 from .database import get_db
-from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, Tag, PackageVersion
+from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, Tag, PackageVersion, ArtifactDependency
 from .storage import S3Storage, get_storage
 from .config import get_env_upstream_sources
 
@@ -25,6 +28,140 @@ logger = logging.getLogger(__name__)
 
 router = APIRouter(prefix="/pypi", tags=["pypi-proxy"])
 
+
+def _parse_requires_dist(requires_dist: str) -> Tuple[Optional[str], Optional[str]]:
+    """Parse a Requires-Dist line into (package_name, version_constraint).
+
+    Examples:
+        "requests (>=2.25.0)" -> ("requests", ">=2.25.0")
+        "typing-extensions; python_version < '3.8'" -> ("typing-extensions", None)
+        "numpy>=1.21.0" -> ("numpy", ">=1.21.0")
+        "certifi" -> ("certifi", None)
+
+    Returns:
+        Tuple of (normalized_package_name, version_constraint or None),
+        or (None, None) if the line cannot be parsed.
+    """
+    # Remove any environment markers (after semicolon)
+    if ';' in requires_dist:
+        requires_dist = requires_dist.split(';')[0].strip()
+
+    # Match patterns like "package (>=1.0)" or "package>=1.0" or "package"
+    # Pattern breakdown: package name, optional whitespace, optional version in parens or directly
+    match = re.match(
+        r'^([a-zA-Z0-9][-a-zA-Z0-9._]*)\s*(?:\(([^)]+)\)|([<>=!~][^\s;]+))?',
+        requires_dist.strip()
+    )
+
+    if not match:
+        return None, None
+
+    package_name = match.group(1)
+    # Version can be in parentheses (group 2) or directly after name (group 3)
+    version_constraint = match.group(2) or match.group(3)
+
+    # Normalize package name (PEP 503)
+    normalized_name = re.sub(r'[-_.]+', '-', package_name).lower()
+
+    # Clean up version constraint
+    if version_constraint:
+        version_constraint = version_constraint.strip()
+
+    return normalized_name, version_constraint
+
+
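Reviewer note: the regex-based parser is easy to sanity-check against its own docstring examples. A minimal check, assuming `_parse_requires_dist` is in scope:

```python
# Sanity check of _parse_requires_dist against the docstring examples.
assert _parse_requires_dist("requests (>=2.25.0)") == ("requests", ">=2.25.0")
assert _parse_requires_dist("typing-extensions; python_version < '3.8'") == ("typing-extensions", None)
assert _parse_requires_dist("numpy>=1.21.0") == ("numpy", ">=1.21.0")
assert _parse_requires_dist("certifi") == ("certifi", None)
# PEP 503 normalization collapses runs of ".", "-", "_" to "-" and lowercases:
assert _parse_requires_dist("Typing_Extensions") == ("typing-extensions", None)
```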
+def _extract_requires_from_metadata(metadata_content: str) -> List[Tuple[str, Optional[str]]]:
+    """Extract all Requires-Dist entries from METADATA/PKG-INFO content.
+
+    Args:
+        metadata_content: The content of a METADATA or PKG-INFO file
+
+    Returns:
+        List of (package_name, version_constraint) tuples
+    """
+    dependencies = []
+
+    for line in metadata_content.split('\n'):
+        if line.startswith('Requires-Dist:'):
+            # Extract the value after "Requires-Dist:"
+            value = line[len('Requires-Dist:'):].strip()
+            pkg_name, version = _parse_requires_dist(value)
+            if pkg_name:
+                dependencies.append((pkg_name, version))
+
+    return dependencies
+
+
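A short illustration of the extractor on a hand-written METADATA fragment (contents invented for the example):

```python
# Example input: a minimal METADATA fragment.
sample = (
    "Metadata-Version: 2.1\n"
    "Name: example\n"
    "Requires-Dist: requests (>=2.25.0)\n"
    "Requires-Dist: certifi\n"
    "Requires-Dist: typing-extensions; python_version < '3.8'\n"
)
print(_extract_requires_from_metadata(sample))
# -> [('requests', '>=2.25.0'), ('certifi', None), ('typing-extensions', None)]
```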
+def _extract_metadata_from_wheel(content: bytes) -> Optional[str]:
+    """Extract METADATA file content from a wheel (zip) file.
+
+    Wheel files have the structure: {package}-{version}.dist-info/METADATA
+
+    Args:
+        content: The wheel file content as bytes
+
+    Returns:
+        METADATA file content as string, or None if not found
+    """
+    try:
+        with zipfile.ZipFile(BytesIO(content)) as zf:
+            # Find the .dist-info directory
+            for name in zf.namelist():
+                if name.endswith('.dist-info/METADATA'):
+                    return zf.read(name).decode('utf-8', errors='replace')
+    except Exception as e:
+        logger.warning(f"Failed to extract metadata from wheel: {e}")
+    return None
+
+
+def _extract_metadata_from_sdist(content: bytes, filename: str) -> Optional[str]:
+    """Extract PKG-INFO file content from a source distribution (.tar.gz).
+
+    Source distributions have the structure: {package}-{version}/PKG-INFO
+
+    Args:
+        content: The tarball content as bytes
+        filename: The original filename (used only in log messages)
+
+    Returns:
+        PKG-INFO file content as string, or None if not found
+    """
+    try:
+        with tarfile.open(fileobj=BytesIO(content), mode='r:gz') as tf:
+            # Find PKG-INFO in the root directory of the archive
+            for member in tf.getmembers():
+                if member.name.endswith('/PKG-INFO') and member.name.count('/') == 1:
+                    f = tf.extractfile(member)
+                    if f:
+                        return f.read().decode('utf-8', errors='replace')
+    except Exception as e:
+        logger.warning(f"Failed to extract metadata from sdist {filename}: {e}")
+    return None
+
+
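To exercise the sdist path without downloading a real package, one can synthesize a tarball in memory; a sketch with invented names:

```python
# Build a tiny in-memory sdist and pull its PKG-INFO back out.
# The package "demo" and its metadata are invented for the example.
import io
import tarfile

pkg_info = b"Metadata-Version: 2.1\nName: demo\nRequires-Dist: requests (>=2.25.0)\n"
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode="w:gz") as tf:
    info = tarfile.TarInfo("demo-1.0/PKG-INFO")  # root-level PKG-INFO, as the extractor expects
    info.size = len(pkg_info)
    tf.addfile(info, io.BytesIO(pkg_info))

print(_extract_metadata_from_sdist(buf.getvalue(), "demo-1.0.tar.gz"))
```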
+def _extract_dependencies(content: bytes, filename: str) -> List[Tuple[str, Optional[str]]]:
+    """Extract dependencies from a PyPI package file.
+
+    Supports wheel (.whl) and source distribution (.tar.gz) formats.
+
+    Args:
+        content: The package file content as bytes
+        filename: The original filename
+
+    Returns:
+        List of (package_name, version_constraint) tuples
+    """
+    metadata = None
+
+    if filename.endswith('.whl'):
+        metadata = _extract_metadata_from_wheel(content)
+    elif filename.endswith('.tar.gz'):
+        metadata = _extract_metadata_from_sdist(content, filename)
+
+    if metadata:
+        return _extract_requires_from_metadata(metadata)
+
+    return []
+
 # Timeout configuration for proxy requests
 PROXY_CONNECT_TIMEOUT = 30.0
 PROXY_READ_TIMEOUT = 60.0
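And the wheel path end-to-end, from synthesized bytes to parsed dependencies (again with invented names):

```python
# Build a minimal in-memory wheel and run the full extraction pipeline on it.
import io
import zipfile

metadata = b"Metadata-Version: 2.1\nName: demo\nRequires-Dist: numpy>=1.21.0\n"
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
    zf.writestr("demo-1.0.dist-info/METADATA", metadata)

print(_extract_dependencies(buf.getvalue(), "demo-1.0-py3-none-any.whl"))
# -> [('numpy', '>=1.21.0')]
```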
@@ -403,14 +540,22 @@ async def pypi_download_file(
 
     # Stream from S3
     try:
-        content_stream = storage.get_stream(artifact.s3_key)
+        stream, content_length, _ = storage.get_stream(artifact.s3_key)
 
+        def stream_content():
+            """Generator that yields chunks from the S3 stream."""
+            try:
+                for chunk in stream.iter_chunks():
+                    yield chunk
+            finally:
+                stream.close()
+
         return StreamingResponse(
-            content_stream,
+            stream_content(),
             media_type=artifact.content_type or "application/octet-stream",
             headers={
                 "Content-Disposition": f'attachment; filename="{filename}"',
-                "Content-Length": str(artifact.size),
+                "Content-Length": str(content_length),
                 "X-Checksum-SHA256": artifact.id,
                 "X-Cache": "HIT",
             }
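The fix relies on `storage.get_stream` returning a three-tuple rather than a bare stream. The actual `S3Storage` implementation is not part of this diff; a sketch of the assumed contract, if it wraps boto3 (the handler ignores the third element, so its meaning here is a guess):

```python
# Sketch only: assumed shape of S3Storage.get_stream, not the real implementation.
# boto3's get_object returns a StreamingBody, which provides iter_chunks().
class S3Storage:
    def __init__(self, client, bucket: str):
        self.client = client  # a boto3 S3 client
        self.bucket = bucket

    def get_stream(self, key: str):
        """Return (stream, content_length, content_type) for the stored object."""
        obj = self.client.get_object(Bucket=self.bucket, Key=key)
        return obj["Body"], obj["ContentLength"], obj.get("ContentType")
```

Wrapping the body in a generator with try/finally ensures the S3 connection is released even if the client disconnects mid-download.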
@@ -561,11 +706,13 @@ async def pypi_download_file(
                 db.add(tag)
 
             # Extract and create version
+            # Only create version for actual package files, not .metadata files
             version = _extract_pypi_version(filename)
-            if version:
+            if version and not filename.endswith('.metadata'):
+                # Check by version string (the unique constraint is on package_id + version)
                 existing_version = db.query(PackageVersion).filter(
                     PackageVersion.package_id == package.id,
-                    PackageVersion.artifact_id == sha256,
+                    PackageVersion.version == version,
                 ).first()
                 if not existing_version:
                     pkg_version = PackageVersion(
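The inline comment refers to a uniqueness constraint on (package_id, version). The model itself is out of scope for this diff, but the assumed declaration looks roughly like this (column types are guesses); filtering on artifact_id as before could miss an existing row for the same version cached under a different artifact and then fail on insert:

```python
# Sketch only: the assumed PackageVersion constraint that motivates matching on
# the version string instead of artifact_id.
from sqlalchemy import Column, ForeignKey, String, UniqueConstraint
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class PackageVersion(Base):
    __tablename__ = "package_versions"
    id = Column(String, primary_key=True)
    package_id = Column(String, ForeignKey("packages.id"), nullable=False)
    version = Column(String, nullable=False)
    artifact_id = Column(String, ForeignKey("artifacts.id"), nullable=False)
    __table_args__ = (UniqueConstraint("package_id", "version"),)
```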
@@ -587,6 +734,27 @@ async def pypi_download_file(
                 )
                 db.add(cached_url_record)
 
+                # Extract and store dependencies
+                dependencies = _extract_dependencies(content, filename)
+                if dependencies:
+                    logger.info(f"PyPI proxy: extracted {len(dependencies)} dependencies from {filename}")
+                    for dep_name, dep_version in dependencies:
+                        # Check if this dependency already exists for this artifact
+                        existing_dep = db.query(ArtifactDependency).filter(
+                            ArtifactDependency.artifact_id == sha256,
+                            ArtifactDependency.dependency_project == "_pypi",
+                            ArtifactDependency.dependency_package == dep_name,
+                        ).first()
+
+                        if not existing_dep:
+                            dep = ArtifactDependency(
+                                artifact_id=sha256,
+                                dependency_project="_pypi",
+                                dependency_package=dep_name,
+                                version_constraint=dep_version if dep_version else "*",
+                            )
+                            db.add(dep)
+
                 db.commit()
 
                 # Return the file
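The fields populated above imply roughly this shape for the `ArtifactDependency` model (a sketch; the real definition lives in `.models` and may differ):

```python
# Sketch only: inferred from the fields the proxy writes; column types and the
# uniqueness constraint are assumptions.
from sqlalchemy import Column, ForeignKey, String, UniqueConstraint
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class ArtifactDependency(Base):
    __tablename__ = "artifact_dependencies"
    id = Column(String, primary_key=True)
    artifact_id = Column(String, ForeignKey("artifacts.id"), nullable=False)
    dependency_project = Column(String, nullable=False)    # "_pypi" for proxy-cached packages
    dependency_package = Column(String, nullable=False)    # PEP 503-normalized name
    version_constraint = Column(String, nullable=False)    # "*" when unconstrained
    __table_args__ = (
        UniqueConstraint("artifact_id", "dependency_project", "dependency_package"),
    )
```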