Restore dependency extraction from PyPI packages
Re-adds the dependency extraction that was accidentally removed with the proactive caching feature. Now, when a PyPI package is cached: (1) extract METADATA from the wheel or PKG-INFO from the sdist; (2) parse the Requires-Dist lines for dependencies; (3) store them in the artifact_dependencies table. This restores the dependency graph functionality for PyPI packages.
This commit is contained in:
@@ -10,8 +10,11 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import tarfile
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Optional
|
import zipfile
|
||||||
|
from io import BytesIO
|
||||||
|
from typing import Optional, List, Tuple
|
||||||
from urllib.parse import urljoin, urlparse, quote, unquote
|
from urllib.parse import urljoin, urlparse, quote, unquote
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
@@ -20,7 +23,7 @@ from fastapi.responses import StreamingResponse, HTMLResponse, RedirectResponse
|
|||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
from .database import get_db
|
from .database import get_db
|
||||||
from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, PackageVersion
|
from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, PackageVersion, ArtifactDependency
|
||||||
from .storage import S3Storage, get_storage
|
from .storage import S3Storage, get_storage
|
||||||
from .config import get_env_upstream_sources, get_settings
|
from .config import get_env_upstream_sources, get_settings
|
||||||
|
|
||||||
@@ -34,6 +37,131 @@ PROXY_CONNECT_TIMEOUT = 30.0
|
|||||||
PROXY_READ_TIMEOUT = 60.0
|
PROXY_READ_TIMEOUT = 60.0
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_requires_dist(requires_dist: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a Requires-Dist value into (package_name, version_constraint).

    Environment markers (everything after the first ';') are discarded and
    an optional extras list ("[extra1,extra2]") after the name is skipped.
    The version specifier may be parenthesised or bare; a bare specifier may
    contain whitespace and several comma-separated clauses.

    Examples:
        "requests (>=2.25.0)" -> ("requests", ">=2.25.0")
        "typing-extensions; python_version < '3.8'" -> ("typing-extensions", None)
        "numpy>=1.21.0" -> ("numpy", ">=1.21.0")
        "numpy >= 1.21.0" -> ("numpy", ">= 1.21.0")
        "requests[security]>=2.0" -> ("requests", ">=2.0")
        "certifi" -> ("certifi", None)

    Args:
        requires_dist: The text following "Requires-Dist:" in a metadata file.

    Returns:
        Tuple of (normalized_package_name, version_constraint or None), or
        (None, None) when no package name can be recognised.
    """
    # Environment markers never contribute to the name or version.
    requirement = requires_dist.split(';', 1)[0].strip()

    match = re.match(
        r'^([a-zA-Z0-9][-a-zA-Z0-9._]*)'      # distribution name
        r'\s*(?:\[[^\]]*\])?'                 # optional extras, ignored
        r'\s*(?:\(([^)]+)\)|([<>=!~].*))?',   # "(spec)" or bare "spec ..."
        requirement,
    )
    if not match:
        return None, None

    package_name = match.group(1)
    # Version can be in parentheses (group 2) or directly after the name (group 3).
    version_constraint = match.group(2) or match.group(3)

    # Normalize package name (PEP 503): runs of -, _ and . become a single "-".
    normalized_name = re.sub(r'[-_.]+', '-', package_name).lower()

    if version_constraint:
        # An all-whitespace capture collapses to None rather than "".
        version_constraint = version_constraint.strip() or None

    return normalized_name, version_constraint
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_requires_from_metadata(metadata_content: str) -> List[Tuple[str, Optional[str]]]:
    """Collect every Requires-Dist entry found in METADATA/PKG-INFO text.

    Args:
        metadata_content: The content of a METADATA or PKG-INFO file

    Returns:
        List of (package_name, version_constraint) tuples, in file order.
        Entries whose package name could not be parsed are omitted.
    """
    prefix = 'Requires-Dist:'

    # Lazily parse the value part of each matching line.
    parsed_entries = (
        _parse_requires_dist(row[len(prefix):].strip())
        for row in metadata_content.split('\n')
        if row.startswith(prefix)
    )

    # Keep only entries for which a package name was recognised.
    return [(name, constraint) for name, constraint in parsed_entries if name]
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_metadata_from_wheel(file_path: str) -> Optional[str]:
    """Extract METADATA file content from a wheel (zip) file.

    Only the wheel's own top-level "<name>-<version>.dist-info/METADATA"
    entry is considered. A nested entry such as
    "pkg/_vendor/other-1.0.dist-info/METADATA" belongs to a bundled
    distribution and is ignored, mirroring the top-level check applied to
    PKG-INFO in source distributions.

    Args:
        file_path: Path to the wheel file

    Returns:
        METADATA file content as string (decoded as UTF-8, with undecodable
        bytes replaced), or None if not found or the archive is unreadable
    """
    try:
        with zipfile.ZipFile(file_path) as zf:
            for name in zf.namelist():
                # Exactly one '/' means the .dist-info directory sits at the
                # archive root, which is where a wheel keeps its own metadata.
                if name.count('/') == 1 and name.endswith('.dist-info/METADATA'):
                    return zf.read(name).decode('utf-8', errors='replace')
    except Exception as e:
        # Best-effort: dependency extraction must never fail the download.
        logger.warning(f"Failed to extract metadata from wheel: {e}")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_metadata_from_sdist(file_path: str) -> Optional[str]:
    """Read the PKG-INFO file out of a gzip-compressed source tarball.

    Only a PKG-INFO located directly inside the archive's single top-level
    directory (a path with exactly one '/') is accepted.

    Args:
        file_path: Path to the tarball file

    Returns:
        PKG-INFO file content as string (decoded as UTF-8, with undecodable
        bytes replaced), or None if not found or the archive is unreadable
    """
    try:
        with tarfile.open(file_path, mode='r:gz') as archive:
            for entry in archive.getmembers():
                at_top_level = entry.name.count('/') == 1
                if not (entry.name.endswith('/PKG-INFO') and at_top_level):
                    continue

                stream = archive.extractfile(entry)
                if not stream:
                    # Not a regular file (extractfile returned None); keep looking.
                    continue

                raw = stream.read()
                return raw.decode('utf-8', errors='replace')
    except Exception as exc:
        # Best-effort: log the failure and fall through to None.
        logger.warning(f"Failed to extract metadata from sdist: {exc}")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_dependencies_from_file(file_path: str, filename: str) -> List[Tuple[str, Optional[str]]]:
    """Return the dependencies declared by a downloaded PyPI package file.

    The archive format is chosen from the filename suffix: ".whl" is read as
    a wheel and ".tar.gz" as a source distribution. Any other suffix yields
    no dependencies.

    Args:
        file_path: Path to the package file
        filename: The original filename

    Returns:
        List of (package_name, version_constraint) tuples; empty when the
        format is unsupported or no metadata could be read
    """
    # Pick the metadata reader that matches the archive type.
    if filename.endswith('.whl'):
        read_metadata = _extract_metadata_from_wheel
    elif filename.endswith('.tar.gz'):
        read_metadata = _extract_metadata_from_sdist
    else:
        return []

    metadata_text = read_metadata(file_path)
    if not metadata_text:
        return []

    return _extract_requires_from_metadata(metadata_text)
|
||||||
|
|
||||||
|
|
||||||
def _parse_upstream_error(response: httpx.Response) -> str:
|
def _parse_upstream_error(response: httpx.Response) -> str:
|
||||||
"""Parse upstream error response to extract useful error details.
|
"""Parse upstream error response to extract useful error details.
|
||||||
|
|
||||||
@@ -528,6 +656,9 @@ async def pypi_download_file(
|
|||||||
|
|
||||||
timeout = httpx.Timeout(300.0, connect=PROXY_CONNECT_TIMEOUT) # 5 minutes for large files
|
timeout = httpx.Timeout(300.0, connect=PROXY_CONNECT_TIMEOUT) # 5 minutes for large files
|
||||||
|
|
||||||
|
# Initialize extracted dependencies list
|
||||||
|
extracted_deps = []
|
||||||
|
|
||||||
# Fetch the file
|
# Fetch the file
|
||||||
logger.info(f"PyPI proxy: fetching {filename} from {upstream_url}")
|
logger.info(f"PyPI proxy: fetching {filename} from {upstream_url}")
|
||||||
|
|
||||||
@@ -593,6 +724,11 @@ async def pypi_download_file(
|
|||||||
size = result.size
|
size = result.size
|
||||||
s3_key = result.s3_key
|
s3_key = result.s3_key
|
||||||
|
|
||||||
|
# Extract dependencies from the temp file before cleaning up
|
||||||
|
extracted_deps = _extract_dependencies_from_file(tmp_path, filename)
|
||||||
|
if extracted_deps:
|
||||||
|
logger.info(f"PyPI proxy: extracted {len(extracted_deps)} dependencies from {filename}")
|
||||||
|
|
||||||
logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
|
logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
|
||||||
finally:
|
finally:
|
||||||
# Clean up temp file
|
# Clean up temp file
|
||||||
@@ -685,6 +821,25 @@ async def pypi_download_file(
|
|||||||
)
|
)
|
||||||
db.add(cached_url_record)
|
db.add(cached_url_record)
|
||||||
|
|
||||||
|
# Store extracted dependencies
|
||||||
|
if extracted_deps:
|
||||||
|
for dep_name, dep_version in extracted_deps:
|
||||||
|
# Check if this dependency already exists for this artifact
|
||||||
|
existing_dep = db.query(ArtifactDependency).filter(
|
||||||
|
ArtifactDependency.artifact_id == sha256,
|
||||||
|
ArtifactDependency.dependency_project == "_pypi",
|
||||||
|
ArtifactDependency.dependency_package == dep_name,
|
||||||
|
).first()
|
||||||
|
|
||||||
|
if not existing_dep:
|
||||||
|
dep = ArtifactDependency(
|
||||||
|
artifact_id=sha256,
|
||||||
|
dependency_project="_pypi",
|
||||||
|
dependency_package=dep_name,
|
||||||
|
version_constraint=dep_version if dep_version else "*",
|
||||||
|
)
|
||||||
|
db.add(dep)
|
||||||
|
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
# Serve the file from S3
|
# Serve the file from S3
|
||||||
|
|||||||
Reference in New Issue
Block a user