When resolving dependencies such as certifi@2025.10.5, the bare version string "2025.10.5" was rejected as an invalid SpecifierSet, so resolution fell back to the wildcard constraint and fetched the latest version instead. Bare versions that start with a digit are now automatically prefixed with "==" to create an exact-match constraint.
427 lines
14 KiB
Python
427 lines
14 KiB
Python
"""
Registry client abstraction for upstream package registries.

Provides a pluggable interface for fetching packages from upstream registries
(PyPI, npm, Maven, etc.) during dependency resolution with auto-fetch enabled.
"""
|
|
|
|
import hashlib
|
|
import logging
|
|
import os
|
|
import re
|
|
import tempfile
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass
|
|
from typing import List, Optional, TYPE_CHECKING
|
|
from urllib.parse import urljoin, urlparse
|
|
|
|
import httpx
|
|
from packaging.specifiers import SpecifierSet, InvalidSpecifier
|
|
from packaging.version import Version, InvalidVersion
|
|
from sqlalchemy.orm import Session
|
|
|
|
if TYPE_CHECKING:
|
|
from .storage import S3Storage
|
|
from .http_client import HttpClientManager
|
|
|
|
# Module-level logger, named after this module so output can be filtered per module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class VersionInfo:
    """Describes one downloadable file of a package version on an upstream registry.

    Only ``version``, ``download_url`` and ``filename`` are required; the
    remaining fields default to ``None`` when the registry does not supply them.
    """

    # Version string exactly as published by the registry.
    version: str
    # URL the distribution file can be downloaded from.
    download_url: str
    # File name of the distribution (for example a wheel or an sdist).
    filename: str
    # SHA-256 digest reported by the registry, if any.
    sha256: Optional[str] = None
    # File size reported by the registry, if any.
    size: Optional[int] = None
    # MIME type associated with the file, if known.
    content_type: Optional[str] = None
|
|
|
|
|
|
@dataclass
class FetchResult:
    """Outcome of fetching a package file from an upstream registry."""

    # Identifier of the stored artifact (its SHA256 hash).
    artifact_id: str
    # Size of the stored artifact.
    size: int
    # Version string of the package that was fetched.
    version: str
    # File name of the fetched distribution.
    filename: str
    # True when the artifact was reported as already cached; defaults to False.
    already_cached: bool = False
|
|
|
|
|
|
class RegistryClient(ABC):
    """Interface every upstream registry client must implement.

    Concrete subclasses provide version discovery, constraint resolution and
    package fetching for one registry type.
    """

    @property
    @abstractmethod
    def source_type(self) -> str:
        """Identifier of the registry type served by this client (e.g. 'pypi', 'npm')."""

    @abstractmethod
    async def get_available_versions(self, package_name: str) -> List[str]:
        """
        List every version of a package that the upstream registry offers.

        Args:
            package_name: The normalized package name

        Returns:
            Version strings ordered from oldest to newest
        """

    @abstractmethod
    async def resolve_constraint(
        self, package_name: str, constraint: str
    ) -> Optional[VersionInfo]:
        """
        Pick the best upstream version that satisfies a constraint.

        Args:
            package_name: The normalized package name
            constraint: Version constraint (e.g., '>=1.9', '<2.0,>=1.5', '*')

        Returns:
            VersionInfo carrying the download URL, or None when nothing matches
        """

    @abstractmethod
    async def fetch_package(
        self,
        package_name: str,
        version_info: VersionInfo,
        db: Session,
        storage: "S3Storage",
    ) -> Optional[FetchResult]:
        """
        Download a package from upstream and cache it.

        Args:
            package_name: The normalized package name
            version_info: Version details including download URL
            db: Database session for creating records
            storage: S3 storage for caching the artifact

        Returns:
            FetchResult with artifact_id, or None if the fetch failed
        """
|
|
|
|
|
|
class PyPIRegistryClient(RegistryClient):
    """Registry client that resolves and fetches packages via the PyPI JSON API.

    Version metadata is read from ``{api_url}/{package}/json``. Downloading
    and caching of the selected file is delegated to
    ``fetch_and_cache_pypi_package``. Only the first configured upstream
    source is consulted for credentials.
    """

    # Timeout configuration for PyPI requests (passed to httpx.Timeout)
    CONNECT_TIMEOUT = 30.0
    READ_TIMEOUT = 60.0
    DOWNLOAD_TIMEOUT = 300.0  # Longer timeout for file downloads

    def __init__(
        self,
        http_client: httpx.AsyncClient,
        upstream_sources: List,
        pypi_api_url: str = "https://pypi.org/pypi",
    ):
        """
        Initialize PyPI registry client.

        Args:
            http_client: Shared async HTTP client
            upstream_sources: List of configured upstream sources for auth
            pypi_api_url: Base URL for PyPI JSON API
        """
        self.client = http_client
        self.sources = upstream_sources
        self.api_url = pypi_api_url

    @property
    def source_type(self) -> str:
        """Return the source type handled by this client: ``"pypi"``."""
        return "pypi"

    def _normalize_package_name(self, name: str) -> str:
        """Normalize a PyPI package name per PEP 503."""
        return re.sub(r"[-_.]+", "-", name).lower()

    def _get_auth_headers(self) -> dict:
        """Build request headers, adding credentials from the first source.

        Returns:
            A dict that always contains ``User-Agent``. For a ``bearer``
            source with a password an ``Authorization`` header is added; for
            an ``api_key`` source the headers returned by the source's
            ``get_headers()`` are merged in.
        """
        headers = {"User-Agent": "Orchard-Registry-Client/1.0"}
        if not self.sources:
            return headers

        source = self.sources[0]
        auth_type = getattr(source, "auth_type", None)

        if auth_type == "bearer":
            password = (
                source.get_password()
                if hasattr(source, "get_password")
                else getattr(source, "password", None)
            )
            if password:
                headers["Authorization"] = f"Bearer {password}"
        elif auth_type == "api_key":
            custom_headers = (
                source.get_headers() if hasattr(source, "get_headers") else {}
            )
            if custom_headers:
                headers.update(custom_headers)

        return headers

    def _get_basic_auth(self) -> Optional[tuple]:
        """Return ``(username, password)`` for a ``basic`` source, else None.

        A missing password is replaced by an empty string; a source without a
        username yields None.
        """
        if not self.sources:
            return None

        source = self.sources[0]
        if getattr(source, "auth_type", None) != "basic":
            return None

        username = getattr(source, "username", None)
        if not username:
            return None

        password = (
            source.get_password()
            if hasattr(source, "get_password")
            else getattr(source, "password", "")
        )
        return (username, password or "")

    def _json_request(self, normalized: str) -> tuple:
        """Return ``(url, kwargs)`` for the JSON metadata request of a package.

        A trailing slash on the configured base URL is dropped so the request
        path never contains a double slash.
        """
        url = f"{self.api_url.rstrip('/')}/{normalized}/json"
        request_kwargs = {
            "headers": self._get_auth_headers(),
            "auth": self._get_basic_auth(),
            "timeout": httpx.Timeout(
                self.READ_TIMEOUT, connect=self.CONNECT_TIMEOUT
            ),
        }
        return url, request_kwargs

    async def _get_ok_response(
        self, normalized: str, url: str, request_kwargs: dict
    ) -> Optional[httpx.Response]:
        """Issue the metadata request and return the response only on HTTP 200.

        A 404 is logged at debug level and any other non-200 status at warning
        level; both yield None. Transport errors propagate to the caller.
        """
        response = await self.client.get(url, **request_kwargs)

        if response.status_code == 404:
            logger.debug("Package %s not found on PyPI", normalized)
            return None

        if response.status_code != 200:
            logger.warning(
                "PyPI API returned %s for %s", response.status_code, normalized
            )
            return None

        return response

    def _latest_version_info(
        self, normalized: str, data: dict, releases: dict
    ) -> Optional[VersionInfo]:
        """Return info for the version named in ``data["info"]["version"]``.

        Yields None when that version is absent or has no entry in
        ``releases``.
        """
        latest_version = data.get("info", {}).get("version")
        if latest_version and latest_version in releases:
            return self._get_version_info(
                normalized, latest_version, releases[latest_version]
            )
        return None

    async def get_available_versions(self, package_name: str) -> List[str]:
        """Get all available versions from PyPI JSON API.

        Args:
            package_name: Package name; it is normalized before the lookup.

        Returns:
            Parseable version strings sorted from oldest to newest. An empty
            list is returned when the package is not found, the API responds
            with a non-200 status, or the request/parsing fails.
        """
        normalized = self._normalize_package_name(package_name)
        url, request_kwargs = self._json_request(normalized)

        try:
            response = await self._get_ok_response(normalized, url, request_kwargs)
            if response is None:
                return []

            data = response.json()
            releases = data.get("releases", {})

            # Parse each key once; keys that are not valid versions are dropped.
            parsed_versions = []
            for raw in releases:
                try:
                    parsed_versions.append((Version(raw), raw))
                except InvalidVersion:
                    continue

            parsed_versions.sort(key=lambda pair: pair[0])
            return [raw for _, raw in parsed_versions]

        except httpx.RequestError as e:
            logger.warning("Failed to query PyPI for %s: %s", normalized, e)
            return []
        except Exception as e:
            logger.warning(
                "Error parsing PyPI response for %s: %s", normalized, e
            )
            return []

    async def resolve_constraint(
        self, package_name: str, constraint: str
    ) -> Optional[VersionInfo]:
        """Find best version matching constraint from PyPI.

        Surrounding whitespace in ``constraint`` is ignored. ``"*"`` selects
        the version PyPI reports as latest. A constraint starting with a digit
        (e.g. ``"2025.10.5"``) is treated as an exact pin (``"==2025.10.5"``).
        A constraint that cannot be parsed is treated as a wildcard.

        Args:
            package_name: Package name; it is normalized before the lookup.
            constraint: Version constraint (e.g., '>=1.9', '<2.0,>=1.5', '*')

        Returns:
            VersionInfo for the highest matching version that has files, or
            None when nothing matches or the lookup fails.
        """
        normalized = self._normalize_package_name(package_name)
        url, request_kwargs = self._json_request(normalized)

        try:
            response = await self._get_ok_response(normalized, url, request_kwargs)
            if response is None:
                return None

            data = response.json()
            releases = data.get("releases", {})

            # Strip first so padded input such as " 2025.10.5" or " * " is
            # recognized instead of silently resolving to the latest release.
            requested = constraint.strip()

            # Handle wildcard - return latest version
            if requested == "*":
                return self._latest_version_info(normalized, data, releases)

            # If constraint looks like a bare version (no operator), treat as
            # exact match, e.g., "2025.10.5" -> "==2025.10.5"
            effective_constraint = requested
            if requested and requested[0].isdigit():
                effective_constraint = f"=={requested}"
                logger.debug(
                    "Bare version '%s' for %s, treating as exact match '%s'",
                    constraint,
                    normalized,
                    effective_constraint,
                )

            try:
                specifier = SpecifierSet(effective_constraint)
            except InvalidSpecifier:
                # Invalid constraint - treat as wildcard
                logger.warning(
                    "Invalid version constraint '%s' for %s, treating as wildcard",
                    constraint,
                    normalized,
                )
                return self._latest_version_info(normalized, data, releases)

            # Collect versions that have files and satisfy the specifier
            candidates = []
            for v_str, files in releases.items():
                if not files:  # Skip versions with no files
                    continue
                try:
                    parsed = Version(v_str)
                except InvalidVersion:
                    continue
                if parsed in specifier:
                    candidates.append((parsed, v_str, files))

            if not candidates:
                logger.debug(
                    "No versions of %s match constraint '%s'",
                    normalized,
                    constraint,
                )
                return None

            # Highest matching version wins
            _, best_version, best_files = max(candidates, key=lambda c: c[0])
            return self._get_version_info(normalized, best_version, best_files)

        except httpx.RequestError as e:
            logger.warning("Failed to query PyPI for %s: %s", normalized, e)
            return None
        except Exception as e:
            logger.warning("Error resolving %s@%s: %s", normalized, constraint, e)
            return None

    def _get_version_info(
        self, package_name: str, version: str, files: List[dict]
    ) -> Optional[VersionInfo]:
        """Extract download info from PyPI release files.

        Selection order: a wheel (a ``py3-none-any`` wheel replaces any wheel
        picked earlier), then the first ``.tar.gz`` file, then the first file
        in the list.

        Args:
            package_name: Package name (not used for the selection itself).
            version: Version string the files belong to.
            files: File entries of one release from the PyPI JSON response.

        Returns:
            VersionInfo for the selected file, or None when ``files`` is empty.
        """
        if not files:
            return None

        # Prefer wheel over sdist
        wheel_file = None
        sdist_file = None

        for f in files:
            filename = f.get("filename", "")
            if filename.endswith(".whl"):
                # Prefer platform-agnostic wheels
                if "py3-none-any" in filename or wheel_file is None:
                    wheel_file = f
            elif filename.endswith(".tar.gz") and sdist_file is None:
                sdist_file = f

        # Fall back to first available file when neither kind was found
        selected = wheel_file or sdist_file or files[0]
        selected_name = selected.get("filename", "")

        return VersionInfo(
            version=version,
            download_url=selected.get("url", ""),
            filename=selected_name,
            sha256=selected.get("digests", {}).get("sha256"),
            size=selected.get("size"),
            content_type=(
                "application/zip"
                if selected_name.endswith(".whl")
                else "application/gzip"
            ),
        )

    async def fetch_package(
        self,
        package_name: str,
        version_info: VersionInfo,
        db: Session,
        storage: "S3Storage",
    ) -> Optional[FetchResult]:
        """Fetch and cache a PyPI package.

        Args:
            package_name: Package name; it is normalized before fetching.
            version_info: Version details including download URL
            db: Database session passed through to the caching helper
            storage: S3 storage passed through to the caching helper

        Returns:
            FetchResult built from the helper's result, or None when the
            helper returns None.
        """
        # Import here to avoid circular imports
        from .pypi_proxy import fetch_and_cache_pypi_package

        normalized = self._normalize_package_name(package_name)

        logger.info(
            "Fetching %s==%s from upstream PyPI", normalized, version_info.version
        )

        result = await fetch_and_cache_pypi_package(
            db=db,
            storage=storage,
            http_client=self.client,
            package_name=normalized,
            filename=version_info.filename,
            download_url=version_info.download_url,
            expected_sha256=version_info.sha256,
        )

        if result is None:
            return None

        return FetchResult(
            artifact_id=result["artifact_id"],
            size=result["size"],
            version=version_info.version,
            filename=version_info.filename,
            already_cached=result.get("already_cached", False),
        )
|
|
|
|
|
|
def get_registry_client(
    source_type: str,
    http_client: httpx.AsyncClient,
    upstream_sources: List,
) -> Optional[RegistryClient]:
    """
    Build the registry client that handles the given source type.

    Args:
        source_type: The registry type ('pypi', 'npm', etc.)
        http_client: Shared async HTTP client
        upstream_sources: List of configured upstream sources

    Returns:
        A RegistryClient for the source type, or None when the type is
        not supported
    """
    if source_type != "pypi":
        # Future: Add npm, maven, etc.
        logger.debug(f"No registry client available for source type: {source_type}")
        return None

    # Hand the client only the sources that are themselves PyPI sources
    matching_sources = list(
        filter(
            lambda candidate: getattr(candidate, "source_type", "") == "pypi",
            upstream_sources,
        )
    )
    return PyPIRegistryClient(http_client, matching_sources)
|