feat: add auto-fetch for missing dependencies from upstream registries
Add an auto_fetch parameter to the dependency resolution endpoint that fetches missing dependencies from upstream registries (PyPI) when resolving.

- Add RegistryClient abstraction with PyPIRegistryClient implementation
- Extract fetch_and_cache_pypi_package() for reuse
- Add resolve_dependencies_with_fetch() async function
- Extend MissingDependency schema with fetch_attempted/fetch_error
- Add fetched list to DependencyResolutionResponse
- Add auto_fetch_max_depth config setting (default: 3)
- Remove Usage section from Package page UI
- Add 6 integration tests for auto-fetch functionality
This commit is contained in:
416
backend/app/registry_client.py
Normal file
416
backend/app/registry_client.py
Normal file
@@ -0,0 +1,416 @@
|
||||
"""
|
||||
Registry client abstraction for upstream package registries.
|
||||
|
||||
Provides a pluggable interface for fetching packages from upstream registries
|
||||
(PyPI, npm, Maven, etc.) during dependency resolution with auto-fetch enabled.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, TYPE_CHECKING
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import httpx
|
||||
from packaging.specifiers import SpecifierSet, InvalidSpecifier
|
||||
from packaging.version import Version, InvalidVersion
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .storage import S3Storage
|
||||
from .http_client import HttpClientManager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class VersionInfo:
    """Information about a package version from an upstream registry.

    Describes the single distribution file chosen for one version of a
    package, together with whatever integrity metadata the registry
    reported for that file.
    """

    # Version string as published by the upstream registry.
    version: str
    # URL the selected distribution file is downloaded from.
    download_url: str
    # File name of the selected distribution file.
    filename: str
    # SHA256 digest reported by the registry for the file, if any.
    sha256: Optional[str] = None
    # File size as reported by the registry, if any.
    size: Optional[int] = None
    # Content type associated with the file, if known.
    content_type: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class FetchResult:
    """Result of fetching a package from upstream."""

    # Identifier of the cached artifact.
    artifact_id: str  # SHA256 hash
    # Size of the cached artifact.
    size: int
    # Version of the package that was fetched.
    version: str
    # File name of the distribution file that was fetched.
    filename: str
    # True when the artifact was reported as already cached.
    already_cached: bool = False
|
||||
|
||||
|
||||
class RegistryClient(ABC):
    """Abstract base class for upstream registry clients.

    A concrete client knows how to talk to one kind of upstream registry.
    Subclasses must implement every member below before they can be
    instantiated.
    """

    @property
    @abstractmethod
    def source_type(self) -> str:
        """Return the source type this client handles (e.g., 'pypi', 'npm')."""

    @abstractmethod
    async def get_available_versions(self, package_name: str) -> List[str]:
        """
        Get all available versions of a package from upstream.

        Args:
            package_name: The normalized package name

        Returns:
            List of version strings, sorted from oldest to newest
        """

    @abstractmethod
    async def resolve_constraint(
        self, package_name: str, constraint: str
    ) -> Optional[VersionInfo]:
        """
        Find the best version matching a constraint.

        Args:
            package_name: The normalized package name
            constraint: Version constraint (e.g., '>=1.9', '<2.0,>=1.5', '*')

        Returns:
            VersionInfo with download URL, or None if no matching version found
        """

    @abstractmethod
    async def fetch_package(
        self,
        package_name: str,
        version_info: VersionInfo,
        db: Session,
        storage: "S3Storage",
    ) -> Optional[FetchResult]:
        """
        Fetch and cache a package from upstream.

        Args:
            package_name: The normalized package name
            version_info: Version details including download URL
            db: Database session for creating records
            storage: S3 storage for caching the artifact

        Returns:
            FetchResult with artifact_id, or None if fetch failed
        """
|
||||
|
||||
|
||||
class PyPIRegistryClient(RegistryClient):
    """PyPI registry client using the JSON API.

    Project metadata is read from ``{api_url}/{name}/json``. Only the first
    entry of the configured upstream sources is consulted for credentials.
    """

    # Timeout configuration for PyPI requests
    CONNECT_TIMEOUT = 30.0
    READ_TIMEOUT = 60.0
    # Longer timeout for file downloads (not referenced inside this class).
    DOWNLOAD_TIMEOUT = 300.0

    def __init__(
        self,
        http_client: httpx.AsyncClient,
        upstream_sources: List,
        pypi_api_url: str = "https://pypi.org/pypi",
    ):
        """
        Initialize PyPI registry client.

        Args:
            http_client: Shared async HTTP client
            upstream_sources: List of configured upstream sources for auth
            pypi_api_url: Base URL for PyPI JSON API
        """
        self.client = http_client
        self.sources = upstream_sources
        # Strip trailing slashes so request URLs never contain "//".
        self.api_url = pypi_api_url.rstrip("/")

    @property
    def source_type(self) -> str:
        """Return the source type handled by this client: ``'pypi'``."""
        return "pypi"

    def _normalize_package_name(self, name: str) -> str:
        """Normalize a PyPI package name per PEP 503."""
        return re.sub(r"[-_.]+", "-", name).lower()

    @staticmethod
    def _source_password(source, default=None):
        """Return the secret stored on *source*.

        Uses ``source.get_password()`` when the source provides it, otherwise
        falls back to a plain ``password`` attribute (or *default*).
        """
        if hasattr(source, "get_password"):
            return source.get_password()
        return getattr(source, "password", default)

    def _get_auth_headers(self) -> dict:
        """Get authentication headers from configured sources.

        Returns:
            Headers containing the client User-Agent, plus a bearer token or
            the source's custom headers when the first source is configured
            with ``auth_type`` ``'bearer'`` or ``'api_key'``.
        """
        headers = {"User-Agent": "Orchard-Registry-Client/1.0"}
        if not self.sources:
            return headers

        source = self.sources[0]
        auth_type = getattr(source, "auth_type", None)
        if auth_type == "bearer":
            token = self._source_password(source)
            if token:
                headers["Authorization"] = f"Bearer {token}"
        elif auth_type == "api_key" and hasattr(source, "get_headers"):
            custom_headers = source.get_headers()
            if custom_headers:
                headers.update(custom_headers)
        return headers

    def _get_basic_auth(self) -> Optional[tuple]:
        """Get basic auth credentials if configured.

        Returns:
            ``(username, password)`` when the first source uses
            ``auth_type == 'basic'`` and has a username, otherwise None.
            A missing password becomes an empty string.
        """
        if not self.sources:
            return None

        source = self.sources[0]
        if getattr(source, "auth_type", None) != "basic":
            return None

        username = getattr(source, "username", None)
        if not username:
            return None
        return (username, self._source_password(source, "") or "")

    async def _fetch_project_json(self, normalized: str) -> Optional[dict]:
        """Download and decode the JSON API document for one project.

        Args:
            normalized: The already-normalized package name

        Returns:
            The decoded JSON document, or None when the project is missing,
            the registry answers with a non-200 status, or the request or
            decoding fails. Failures are logged, never raised.
        """
        url = f"{self.api_url}/{normalized}/json"
        headers = self._get_auth_headers()
        auth = self._get_basic_auth()
        timeout = httpx.Timeout(self.READ_TIMEOUT, connect=self.CONNECT_TIMEOUT)

        try:
            response = await self.client.get(
                url, headers=headers, auth=auth, timeout=timeout
            )
            if response.status_code == 404:
                logger.debug("Package %s not found on PyPI", normalized)
                return None
            if response.status_code != 200:
                logger.warning(
                    "PyPI API returned %s for %s", response.status_code, normalized
                )
                return None
            return response.json()
        except httpx.RequestError as e:
            logger.warning("Failed to query PyPI for %s: %s", normalized, e)
            return None
        except Exception as e:
            # Best-effort lookup: a malformed body must not abort resolution.
            logger.warning("Error parsing PyPI response for %s: %s", normalized, e)
            return None

    async def get_available_versions(self, package_name: str) -> List[str]:
        """Get all available versions from PyPI JSON API.

        Args:
            package_name: Package name; it is normalized before the lookup

        Returns:
            Release keys that parse as valid versions, sorted from oldest to
            newest. Empty when the project cannot be fetched or parsed.
        """
        normalized = self._normalize_package_name(package_name)
        data = await self._fetch_project_json(normalized)
        if data is None:
            return []

        try:
            releases = data.get("releases") or {}

            # Parse each key once; keep the original string for the result.
            parsed = []
            for raw in releases:
                try:
                    parsed.append((Version(raw), raw))
                except InvalidVersion:
                    continue

            parsed.sort(key=lambda pair: pair[0])
            return [raw for _, raw in parsed]
        except Exception as e:
            logger.warning("Error parsing PyPI response for %s: %s", normalized, e)
            return []

    async def resolve_constraint(
        self, package_name: str, constraint: str
    ) -> Optional[VersionInfo]:
        """Find best version matching constraint from PyPI.

        Args:
            package_name: Package name; it is normalized before the lookup
            constraint: Version specifier set, or ``'*'`` for the latest
                version. None is treated like ``'*'``; an unparsable
                constraint is logged and also treated like ``'*'``.

        Returns:
            VersionInfo for the highest matching version that has files, or
            None when nothing matches or the lookup fails.
        """
        normalized = self._normalize_package_name(package_name)
        data = await self._fetch_project_json(normalized)
        if data is None:
            return None

        try:
            releases = data.get("releases") or {}
            wanted = "*" if constraint is None else constraint.strip()

            # Handle wildcard - return latest version
            if wanted == "*":
                return self._latest_version_info(normalized, data, releases)

            try:
                specifier = SpecifierSet(wanted)
            except InvalidSpecifier:
                logger.warning(
                    "Invalid version constraint '%s' for %s, treating as wildcard",
                    constraint,
                    normalized,
                )
                return self._latest_version_info(normalized, data, releases)

            candidates = []
            for raw, files in releases.items():
                if not files:  # Skip versions with no files
                    continue
                try:
                    parsed = Version(raw)
                except InvalidVersion:
                    continue
                if parsed in specifier:
                    candidates.append((parsed, raw, files))

            if not candidates:
                logger.debug(
                    "No versions of %s match constraint '%s'", normalized, constraint
                )
                return None

            # PEP 592: a yanked release is only used when nothing else matches.
            active = [c for c in candidates if not self._is_yanked(c[2])]
            _, best_version, best_files = max(
                active or candidates, key=lambda c: c[0]
            )
            return self._get_version_info(normalized, best_version, best_files)
        except Exception as e:
            logger.warning("Error resolving %s@%s: %s", normalized, constraint, e)
            return None

    def _latest_version_info(
        self, normalized: str, data: dict, releases: dict
    ) -> Optional[VersionInfo]:
        """Return info for the version named in the document's ``info`` block.

        Returns None when no version is named there or that version has no
        entry in *releases*.
        """
        latest_version = (data.get("info") or {}).get("version")
        if latest_version and latest_version in releases:
            return self._get_version_info(
                normalized, latest_version, releases[latest_version]
            )
        return None

    @staticmethod
    def _is_yanked(files: List[dict]) -> bool:
        """Return True when every file of a release is flagged as yanked."""
        return all(f.get("yanked") for f in files)

    @staticmethod
    def _guess_content_type(filename: str) -> str:
        """Map a distribution file name to a content type by its extension."""
        lowered = filename.lower()
        if lowered.endswith((".whl", ".zip", ".egg")):
            return "application/zip"
        if lowered.endswith((".tar.gz", ".tgz")):
            return "application/gzip"
        return "application/octet-stream"

    def _get_version_info(
        self, package_name: str, version: str, files: List[dict]
    ) -> Optional[VersionInfo]:
        """Extract download info from PyPI release files.

        Selection order: a wheel whose name contains ``py3-none-any``, then
        any other wheel, then the first ``.tar.gz`` sdist, then the first
        file. Files not flagged as yanked are preferred when any exist.

        Args:
            package_name: The normalized package name (kept for interface
                compatibility; it does not affect the selection)
            version: Version string the files belong to
            files: File entries of that release from the JSON API

        Returns:
            VersionInfo for the selected file, or None when *files* is empty.
        """
        if not files:
            return None

        # PEP 592: avoid yanked files unless the release offers nothing else.
        usable = [f for f in files if not f.get("yanked")] or files

        wheel_file = None
        sdist_file = None
        for f in usable:
            filename = f.get("filename") or ""
            if filename.endswith(".whl"):
                # Prefer platform-agnostic wheels
                if "py3-none-any" in filename or wheel_file is None:
                    wheel_file = f
            elif filename.endswith(".tar.gz") and sdist_file is None:
                sdist_file = f

        # Fall back to first available file
        selected = wheel_file or sdist_file or usable[0]
        selected_name = selected.get("filename", "")

        return VersionInfo(
            version=version,
            download_url=selected.get("url", ""),
            filename=selected_name,
            # "digests" may be present but null; treat that like missing.
            sha256=(selected.get("digests") or {}).get("sha256"),
            size=selected.get("size"),
            content_type=self._guess_content_type(selected_name or ""),
        )

    async def fetch_package(
        self,
        package_name: str,
        version_info: VersionInfo,
        db: Session,
        storage: "S3Storage",
    ) -> Optional[FetchResult]:
        """Fetch and cache a PyPI package.

        Delegates the download and caching to
        ``pypi_proxy.fetch_and_cache_pypi_package``.

        Args:
            package_name: Package name; it is normalized before the fetch
            version_info: Version details including download URL
            db: Database session passed through to the caching helper
            storage: Storage backend passed through to the caching helper

        Returns:
            FetchResult built from the helper's result, or None when the
            helper returns None.
        """
        # Import here to avoid circular imports
        from .pypi_proxy import fetch_and_cache_pypi_package

        normalized = self._normalize_package_name(package_name)

        logger.info(
            "Fetching %s==%s from upstream PyPI", normalized, version_info.version
        )

        result = await fetch_and_cache_pypi_package(
            db=db,
            storage=storage,
            http_client=self.client,
            package_name=normalized,
            filename=version_info.filename,
            download_url=version_info.download_url,
            expected_sha256=version_info.sha256,
        )

        if result is None:
            return None

        return FetchResult(
            artifact_id=result["artifact_id"],
            size=result["size"],
            version=version_info.version,
            filename=version_info.filename,
            already_cached=result.get("already_cached", False),
        )
|
||||
|
||||
|
||||
def get_registry_client(
    source_type: str,
    http_client: httpx.AsyncClient,
    upstream_sources: List,
) -> Optional[RegistryClient]:
    """
    Factory function to get a registry client for a source type.

    Args:
        source_type: The registry type ('pypi', 'npm', etc.)
        http_client: Shared async HTTP client
        upstream_sources: List of configured upstream sources; None is
            treated as an empty list

    Returns:
        RegistryClient for the source type, or None if not supported
    """
    if source_type != "pypi":
        # Future: Add npm, maven, etc.
        logger.debug(f"No registry client available for source type: {source_type}")
        return None

    # Hand the client only the sources that are declared as PyPI sources.
    pypi_sources = [
        source
        for source in (upstream_sources or [])
        if getattr(source, "source_type", "") == "pypi"
    ]
    return PyPIRegistryClient(http_client, pypi_sources)
|
||||
Reference in New Issue
Block a user