""" Registry client abstraction for upstream package registries. Provides a pluggable interface for fetching packages from upstream registries (PyPI, npm, Maven, etc.) during dependency resolution with auto-fetch enabled. """ import hashlib import logging import os import re import tempfile from abc import ABC, abstractmethod from dataclasses import dataclass from typing import List, Optional, TYPE_CHECKING from urllib.parse import urljoin, urlparse import httpx from packaging.specifiers import SpecifierSet, InvalidSpecifier from packaging.version import Version, InvalidVersion from sqlalchemy.orm import Session if TYPE_CHECKING: from .storage import S3Storage from .http_client import HttpClientManager logger = logging.getLogger(__name__) @dataclass class VersionInfo: """Information about a package version from an upstream registry.""" version: str download_url: str filename: str sha256: Optional[str] = None size: Optional[int] = None content_type: Optional[str] = None @dataclass class FetchResult: """Result of fetching a package from upstream.""" artifact_id: str # SHA256 hash size: int version: str filename: str already_cached: bool = False class RegistryClient(ABC): """Abstract base class for upstream registry clients.""" @property @abstractmethod def source_type(self) -> str: """Return the source type this client handles (e.g., 'pypi', 'npm').""" pass @abstractmethod async def get_available_versions(self, package_name: str) -> List[str]: """ Get all available versions of a package from upstream. Args: package_name: The normalized package name Returns: List of version strings, sorted from oldest to newest """ pass @abstractmethod async def resolve_constraint( self, package_name: str, constraint: str ) -> Optional[VersionInfo]: """ Find the best version matching a constraint. Args: package_name: The normalized package name constraint: Version constraint (e.g., '>=1.9', '<2.0,>=1.5', '*') Returns: VersionInfo with download URL, or None if no matching version found """ pass @abstractmethod async def fetch_package( self, package_name: str, version_info: VersionInfo, db: Session, storage: "S3Storage", ) -> Optional[FetchResult]: """ Fetch and cache a package from upstream. Args: package_name: The normalized package name version_info: Version details including download URL db: Database session for creating records storage: S3 storage for caching the artifact Returns: FetchResult with artifact_id, or None if fetch failed """ pass class PyPIRegistryClient(RegistryClient): """PyPI registry client using the JSON API.""" # Timeout configuration for PyPI requests CONNECT_TIMEOUT = 30.0 READ_TIMEOUT = 60.0 DOWNLOAD_TIMEOUT = 300.0 # Longer timeout for file downloads def __init__( self, http_client: httpx.AsyncClient, upstream_sources: List, pypi_api_url: str = "https://pypi.org/pypi", ): """ Initialize PyPI registry client. Args: http_client: Shared async HTTP client upstream_sources: List of configured upstream sources for auth pypi_api_url: Base URL for PyPI JSON API """ self.client = http_client self.sources = upstream_sources self.api_url = pypi_api_url @property def source_type(self) -> str: return "pypi" def _normalize_package_name(self, name: str) -> str: """Normalize a PyPI package name per PEP 503.""" return re.sub(r"[-_.]+", "-", name).lower() def _get_auth_headers(self) -> dict: """Get authentication headers from configured sources.""" headers = {"User-Agent": "Orchard-Registry-Client/1.0"} if self.sources: source = self.sources[0] if hasattr(source, "auth_type"): if source.auth_type == "bearer": password = ( source.get_password() if hasattr(source, "get_password") else getattr(source, "password", None) ) if password: headers["Authorization"] = f"Bearer {password}" elif source.auth_type == "api_key": custom_headers = ( source.get_headers() if hasattr(source, "get_headers") else {} ) if custom_headers: headers.update(custom_headers) return headers def _get_basic_auth(self) -> Optional[tuple]: """Get basic auth credentials if configured.""" if self.sources: source = self.sources[0] if hasattr(source, "auth_type") and source.auth_type == "basic": username = getattr(source, "username", None) if username: password = ( source.get_password() if hasattr(source, "get_password") else getattr(source, "password", "") ) return (username, password or "") return None async def get_available_versions(self, package_name: str) -> List[str]: """Get all available versions from PyPI JSON API.""" normalized = self._normalize_package_name(package_name) url = f"{self.api_url}/{normalized}/json" headers = self._get_auth_headers() auth = self._get_basic_auth() timeout = httpx.Timeout(self.READ_TIMEOUT, connect=self.CONNECT_TIMEOUT) try: response = await self.client.get( url, headers=headers, auth=auth, timeout=timeout ) if response.status_code == 404: logger.debug(f"Package {normalized} not found on PyPI") return [] if response.status_code != 200: logger.warning( f"PyPI API returned {response.status_code} for {normalized}" ) return [] data = response.json() releases = data.get("releases", {}) # Filter to valid versions and sort versions = [] for v in releases.keys(): try: Version(v) versions.append(v) except InvalidVersion: continue versions.sort(key=lambda x: Version(x)) return versions except httpx.RequestError as e: logger.warning(f"Failed to query PyPI for {normalized}: {e}") return [] except Exception as e: logger.warning(f"Error parsing PyPI response for {normalized}: {e}") return [] async def resolve_constraint( self, package_name: str, constraint: str ) -> Optional[VersionInfo]: """Find best version matching constraint from PyPI.""" normalized = self._normalize_package_name(package_name) url = f"{self.api_url}/{normalized}/json" headers = self._get_auth_headers() auth = self._get_basic_auth() timeout = httpx.Timeout(self.READ_TIMEOUT, connect=self.CONNECT_TIMEOUT) try: response = await self.client.get( url, headers=headers, auth=auth, timeout=timeout ) if response.status_code == 404: logger.debug(f"Package {normalized} not found on PyPI") return None if response.status_code != 200: logger.warning( f"PyPI API returned {response.status_code} for {normalized}" ) return None data = response.json() releases = data.get("releases", {}) # Handle wildcard - return latest version if constraint == "*": latest_version = data.get("info", {}).get("version") if latest_version and latest_version in releases: return self._get_version_info( normalized, latest_version, releases[latest_version] ) return None # Parse constraint try: specifier = SpecifierSet(constraint) except InvalidSpecifier: # Invalid constraint - treat as wildcard logger.warning( f"Invalid version constraint '{constraint}' for {normalized}, " "treating as wildcard" ) latest_version = data.get("info", {}).get("version") if latest_version and latest_version in releases: return self._get_version_info( normalized, latest_version, releases[latest_version] ) return None # Find matching versions matching = [] for v_str, files in releases.items(): if not files: # Skip versions with no files continue try: v = Version(v_str) if v in specifier: matching.append((v_str, v, files)) except InvalidVersion: continue if not matching: logger.debug( f"No versions of {normalized} match constraint '{constraint}'" ) return None # Sort by version and return highest match matching.sort(key=lambda x: x[1], reverse=True) best_version, _, best_files = matching[0] return self._get_version_info(normalized, best_version, best_files) except httpx.RequestError as e: logger.warning(f"Failed to query PyPI for {normalized}: {e}") return None except Exception as e: logger.warning(f"Error resolving {normalized}@{constraint}: {e}") return None def _get_version_info( self, package_name: str, version: str, files: List[dict] ) -> Optional[VersionInfo]: """Extract download info from PyPI release files.""" if not files: return None # Prefer wheel over sdist wheel_file = None sdist_file = None for f in files: filename = f.get("filename", "") if filename.endswith(".whl"): # Prefer platform-agnostic wheels if "py3-none-any" in filename or wheel_file is None: wheel_file = f elif filename.endswith(".tar.gz") and sdist_file is None: sdist_file = f selected = wheel_file or sdist_file if not selected: # Fall back to first available file selected = files[0] return VersionInfo( version=version, download_url=selected.get("url", ""), filename=selected.get("filename", ""), sha256=selected.get("digests", {}).get("sha256"), size=selected.get("size"), content_type="application/zip" if selected.get("filename", "").endswith(".whl") else "application/gzip", ) async def fetch_package( self, package_name: str, version_info: VersionInfo, db: Session, storage: "S3Storage", ) -> Optional[FetchResult]: """Fetch and cache a PyPI package.""" # Import here to avoid circular imports from .pypi_proxy import fetch_and_cache_pypi_package normalized = self._normalize_package_name(package_name) logger.info( f"Fetching {normalized}=={version_info.version} from upstream PyPI" ) result = await fetch_and_cache_pypi_package( db=db, storage=storage, http_client=self.client, package_name=normalized, filename=version_info.filename, download_url=version_info.download_url, expected_sha256=version_info.sha256, ) if result is None: return None return FetchResult( artifact_id=result["artifact_id"], size=result["size"], version=version_info.version, filename=version_info.filename, already_cached=result.get("already_cached", False), ) def get_registry_client( source_type: str, http_client: httpx.AsyncClient, upstream_sources: List, ) -> Optional[RegistryClient]: """ Factory function to get a registry client for a source type. Args: source_type: The registry type ('pypi', 'npm', etc.) http_client: Shared async HTTP client upstream_sources: List of configured upstream sources Returns: RegistryClient for the source type, or None if not supported """ if source_type == "pypi": # Filter to PyPI sources pypi_sources = [s for s in upstream_sources if getattr(s, "source_type", "") == "pypi"] return PyPIRegistryClient(http_client, pypi_sources) # Future: Add npm, maven, etc. logger.debug(f"No registry client available for source type: {source_type}") return None