Restore dependency extraction from PyPI packages
Re-adds the dependency extraction that was accidentally removed with the proactive caching feature. Now when a PyPI package is cached:

1. Extract METADATA from wheel or PKG-INFO from sdist
2. Parse Requires-Dist lines for dependencies
3. Store in artifact_dependencies table

This restores the dependency graph functionality for PyPI packages.
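For illustration, here is a rough sketch of what the parsing helpers added below produce for a sample METADATA fragment (the package names and constraints are made-up sample data, not taken from any real package):

    # Hypothetical METADATA excerpt, run through the helpers added in this commit
    sample_metadata = (
        "Metadata-Version: 2.1\n"
        "Name: example-pkg\n"
        "Requires-Dist: requests (>=2.25.0)\n"
        "Requires-Dist: typing-extensions; python_version < '3.8'\n"
        "Requires-Dist: numpy>=1.21.0\n"
    )

    deps = _extract_requires_from_metadata(sample_metadata)
    # -> [("requests", ">=2.25.0"), ("typing-extensions", None), ("numpy", ">=1.21.0")]
    # Each (name, constraint) pair is later written to artifact_dependencies,
    # with "*" stored when no constraint is present.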
@@ -10,8 +10,11 @@ import json
 import logging
 import os
+import re
+import tarfile
+import tempfile
-from typing import Optional
 import zipfile
 from io import BytesIO
+from typing import Optional, List, Tuple
 from urllib.parse import urljoin, urlparse, quote, unquote
 
 import httpx
@@ -20,7 +23,7 @@ from fastapi.responses import StreamingResponse, HTMLResponse, RedirectResponse
 from sqlalchemy.orm import Session
 
 from .database import get_db
-from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, PackageVersion
+from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, PackageVersion, ArtifactDependency
 from .storage import S3Storage, get_storage
 from .config import get_env_upstream_sources, get_settings
 
@@ -34,6 +37,131 @@ PROXY_CONNECT_TIMEOUT = 30.0
 PROXY_READ_TIMEOUT = 60.0
 
 
+def _parse_requires_dist(requires_dist: str) -> Tuple[str, Optional[str]]:
+    """Parse a Requires-Dist line into (package_name, version_constraint).
+
+    Examples:
+        "requests (>=2.25.0)" -> ("requests", ">=2.25.0")
+        "typing-extensions; python_version < '3.8'" -> ("typing-extensions", None)
+        "numpy>=1.21.0" -> ("numpy", ">=1.21.0")
+        "certifi" -> ("certifi", None)
+
+    Returns:
+        Tuple of (normalized_package_name, version_constraint or None)
+    """
+    # Remove any environment markers (after semicolon)
+    if ';' in requires_dist:
+        requires_dist = requires_dist.split(';')[0].strip()
+
+    # Match patterns like "package (>=1.0)" or "package>=1.0" or "package"
+    match = re.match(
+        r'^([a-zA-Z0-9][-a-zA-Z0-9._]*)\s*(?:\(([^)]+)\)|([<>=!~][^\s;]+))?',
+        requires_dist.strip()
+    )
+
+    if not match:
+        return None, None
+
+    package_name = match.group(1)
+    # Version can be in parentheses (group 2) or directly after name (group 3)
+    version_constraint = match.group(2) or match.group(3)
+
+    # Normalize package name (PEP 503)
+    normalized_name = re.sub(r'[-_.]+', '-', package_name).lower()
+
+    # Clean up version constraint
+    if version_constraint:
+        version_constraint = version_constraint.strip()
+
+    return normalized_name, version_constraint
+
+
+def _extract_requires_from_metadata(metadata_content: str) -> List[Tuple[str, Optional[str]]]:
+    """Extract all Requires-Dist entries from METADATA/PKG-INFO content.
+
+    Args:
+        metadata_content: The content of a METADATA or PKG-INFO file
+
+    Returns:
+        List of (package_name, version_constraint) tuples
+    """
+    dependencies = []
+
+    for line in metadata_content.split('\n'):
+        if line.startswith('Requires-Dist:'):
+            value = line[len('Requires-Dist:'):].strip()
+            pkg_name, version = _parse_requires_dist(value)
+            if pkg_name:
+                dependencies.append((pkg_name, version))
+
+    return dependencies
+
+
+def _extract_metadata_from_wheel(file_path: str) -> Optional[str]:
+    """Extract METADATA file content from a wheel (zip) file.
+
+    Args:
+        file_path: Path to the wheel file
+
+    Returns:
+        METADATA file content as string, or None if not found
+    """
+    try:
+        with zipfile.ZipFile(file_path) as zf:
+            for name in zf.namelist():
+                if name.endswith('.dist-info/METADATA'):
+                    return zf.read(name).decode('utf-8', errors='replace')
+    except Exception as e:
+        logger.warning(f"Failed to extract metadata from wheel: {e}")
+    return None
+
+
+def _extract_metadata_from_sdist(file_path: str) -> Optional[str]:
+    """Extract PKG-INFO file content from a source distribution (.tar.gz).
+
+    Args:
+        file_path: Path to the tarball file
+
+    Returns:
+        PKG-INFO file content as string, or None if not found
+    """
+    try:
+        with tarfile.open(file_path, mode='r:gz') as tf:
+            for member in tf.getmembers():
+                if member.name.endswith('/PKG-INFO') and member.name.count('/') == 1:
+                    f = tf.extractfile(member)
+                    if f:
+                        return f.read().decode('utf-8', errors='replace')
+    except Exception as e:
+        logger.warning(f"Failed to extract metadata from sdist: {e}")
+    return None
+
+
+def _extract_dependencies_from_file(file_path: str, filename: str) -> List[Tuple[str, Optional[str]]]:
+    """Extract dependencies from a PyPI package file.
+
+    Supports wheel (.whl) and source distribution (.tar.gz) formats.
+
+    Args:
+        file_path: Path to the package file
+        filename: The original filename
+
+    Returns:
+        List of (package_name, version_constraint) tuples
+    """
+    metadata = None
+
+    if filename.endswith('.whl'):
+        metadata = _extract_metadata_from_wheel(file_path)
+    elif filename.endswith('.tar.gz'):
+        metadata = _extract_metadata_from_sdist(file_path)
+
+    if metadata:
+        return _extract_requires_from_metadata(metadata)
+
+    return []
+
+
 def _parse_upstream_error(response: httpx.Response) -> str:
     """Parse upstream error response to extract useful error details.
 
@@ -528,6 +656,9 @@ async def pypi_download_file(
 
     timeout = httpx.Timeout(300.0, connect=PROXY_CONNECT_TIMEOUT)  # 5 minutes for large files
 
+    # Initialize extracted dependencies list
+    extracted_deps = []
+
     # Fetch the file
     logger.info(f"PyPI proxy: fetching {filename} from {upstream_url}")
 
@@ -593,6 +724,11 @@ async def pypi_download_file(
         size = result.size
         s3_key = result.s3_key
 
+        # Extract dependencies from the temp file before cleaning up
+        extracted_deps = _extract_dependencies_from_file(tmp_path, filename)
+        if extracted_deps:
+            logger.info(f"PyPI proxy: extracted {len(extracted_deps)} dependencies from {filename}")
+
         logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
     finally:
         # Clean up temp file
@@ -685,6 +821,25 @@ async def pypi_download_file(
     )
    db.add(cached_url_record)
 
+    # Store extracted dependencies
+    if extracted_deps:
+        for dep_name, dep_version in extracted_deps:
+            # Check if this dependency already exists for this artifact
+            existing_dep = db.query(ArtifactDependency).filter(
+                ArtifactDependency.artifact_id == sha256,
+                ArtifactDependency.dependency_project == "_pypi",
+                ArtifactDependency.dependency_package == dep_name,
+            ).first()
+
+            if not existing_dep:
+                dep = ArtifactDependency(
+                    artifact_id=sha256,
+                    dependency_project="_pypi",
+                    dependency_package=dep_name,
+                    version_constraint=dep_version if dep_version else "*",
+                )
+                db.add(dep)
+
     db.commit()
 
     # Serve the file from S3
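As a rough usage sketch (not part of this change), the rows written above could be read back for a cached artifact with a query along these lines, using the ArtifactDependency fields shown in the diff; the db session and sha256 value are assumed to come from the surrounding request handler:

    # Sketch only: list the dependencies recorded for one cached PyPI file.
    deps = (
        db.query(ArtifactDependency)
        .filter(
            ArtifactDependency.artifact_id == sha256,
            ArtifactDependency.dependency_project == "_pypi",
        )
        .all()
    )
    for dep in deps:
        print(dep.dependency_package, dep.version_constraint)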