feat: add auto-fetch for missing dependencies from upstream registries

Add auto_fetch parameter to dependency resolution endpoint that fetches
missing dependencies from upstream registries (PyPI) when resolving.

- Add RegistryClient abstraction with PyPIRegistryClient implementation
- Extract fetch_and_cache_pypi_package() for reuse
- Add resolve_dependencies_with_fetch() async function
- Extend MissingDependency schema with fetch_attempted/fetch_error
- Add fetched list to DependencyResolutionResponse (see sketch below)
- Add auto_fetch_max_depth config setting (default: 3)
- Remove Usage section from Package page UI
- Add 6 integration tests for auto-fetch functionality
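
Rough shape of the extended schemas (sketch only; apart from the
fetch_attempted, fetch_error, and fetched fields named above, the field
and helper model names are illustrative assumptions, not code from this
commit):

    from typing import List, Optional
    from pydantic import BaseModel

    class MissingDependency(BaseModel):
        name: str                           # assumed field
        specifier: Optional[str] = None     # assumed field
        fetch_attempted: bool = False       # new in this commit
        fetch_error: Optional[str] = None   # new in this commit

    class FetchedDependency(BaseModel):     # hypothetical helper model
        name: str
        version: Optional[str] = None
        artifact_id: Optional[str] = None

    class DependencyResolutionResponse(BaseModel):
        resolved: List[dict] = []                # assumed field
        missing: List[MissingDependency] = []    # assumed field
        fetched: List[FetchedDependency] = []    # new: packages pulled via auto_fetch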

Author: Mondo Diaz
Date:   2026-02-04 12:01:49 -06:00
Parent: 9f233e0d4d
Commit: cbc2e5e11a
10 changed files with 1348 additions and 65 deletions

@@ -572,6 +572,258 @@ async def pypi_package_versions(
)
async def fetch_and_cache_pypi_package(
    db: Session,
    storage: S3Storage,
    http_client: httpx.AsyncClient,
    package_name: str,
    filename: str,
    download_url: str,
    expected_sha256: Optional[str] = None,
) -> Optional[dict]:
    """
    Fetch a PyPI package from upstream and cache it in Orchard.

    This is the core caching logic extracted from pypi_download_file() for reuse
    by the registry client during auto-fetch dependency resolution.

    Args:
        db: Database session
        storage: S3 storage instance
        http_client: Async HTTP client for making requests
        package_name: Normalized package name (e.g., 'requests')
        filename: Package filename (e.g., 'requests-2.31.0-py3-none-any.whl')
        download_url: Full URL to download from upstream
        expected_sha256: Optional SHA256 to verify download integrity

    Returns:
        Dict with artifact_id, size, version, already_cached if successful.
        None if the fetch failed.
    """
    # Normalize package name
    normalized_name = re.sub(r'[-_.]+', '-', package_name).lower()

    # Check if we already have this URL cached
    url_hash = hashlib.sha256(download_url.encode()).hexdigest()
    cached_url = db.query(CachedUrl).filter(CachedUrl.url_hash == url_hash).first()
    if cached_url:
        # Already cached - return existing artifact info
        artifact = db.query(Artifact).filter(Artifact.id == cached_url.artifact_id).first()
        if artifact:
            version = _extract_pypi_version(filename)
            logger.info(f"PyPI fetch: {filename} already cached (artifact {artifact.id[:12]})")
            return {
                "artifact_id": artifact.id,
                "size": artifact.size,
                "version": version,
                "already_cached": True,
            }

    # Get upstream sources for auth headers
    sources = _get_pypi_upstream_sources(db)
    matched_source = sources[0] if sources else None
    headers = {"User-Agent": "Orchard-PyPI-Proxy/1.0"}
    if matched_source:
        headers.update(_build_auth_headers(matched_source))
    auth = _get_basic_auth(matched_source) if matched_source else None
    download_timeout = httpx.Timeout(connect=30.0, read=300.0, write=300.0, pool=30.0)

    try:
        logger.info(f"PyPI fetch: downloading {filename} from {download_url}")
        response = await http_client.get(
            download_url,
            headers=headers,
            auth=auth,
            timeout=download_timeout,
        )

        # Handle redirects manually
        redirect_count = 0
        while response.status_code in (301, 302, 303, 307, 308) and redirect_count < 5:
            redirect_url = response.headers.get('location')
            if not redirect_url:
                break
            if not redirect_url.startswith('http'):
                redirect_url = urljoin(download_url, redirect_url)
            logger.debug(f"PyPI fetch: following redirect to {redirect_url}")
            # Don't send auth to different hosts
            redirect_headers = {"User-Agent": "Orchard-PyPI-Proxy/1.0"}
            redirect_auth = None
            if urlparse(redirect_url).netloc == urlparse(download_url).netloc:
                redirect_headers.update(headers)
                redirect_auth = auth
            response = await http_client.get(
                redirect_url,
                headers=redirect_headers,
                auth=redirect_auth,
                follow_redirects=False,
                timeout=download_timeout,
            )
            redirect_count += 1
        if response.status_code != 200:
            error_detail = _parse_upstream_error(response)
            logger.warning(f"PyPI fetch: upstream returned {response.status_code} for {filename}: {error_detail}")
            return None

        content_type = response.headers.get('content-type', 'application/octet-stream')

        # Stream to temp file to avoid loading large packages into memory
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
                tmp_path = tmp_file.name
                async for chunk in response.aiter_bytes(chunk_size=65536):
                    tmp_file.write(chunk)

            # Store in S3 from temp file (computes hash and deduplicates automatically)
            with open(tmp_path, 'rb') as f:
                result = storage.store(f)
            sha256 = result.sha256
            size = result.size

            # Verify hash if expected
            if expected_sha256 and sha256 != expected_sha256.lower():
                logger.error(
                    f"PyPI fetch: hash mismatch for {filename}: "
                    f"expected {expected_sha256[:12]}, got {sha256[:12]}"
                )
                return None

            # Extract dependencies from the temp file
            extracted_deps = _extract_dependencies_from_file(tmp_path, filename)
            if extracted_deps:
                logger.info(f"PyPI fetch: extracted {len(extracted_deps)} dependencies from {filename}")
            logger.info(f"PyPI fetch: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
        finally:
            # Clean up temp file
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)
        # Check if artifact already exists
        existing = db.query(Artifact).filter(Artifact.id == sha256).first()
        if existing:
            existing.ref_count += 1
            db.flush()
        else:
            new_artifact = Artifact(
                id=sha256,
                original_name=filename,
                content_type=content_type,
                size=size,
                ref_count=1,
                created_by="pypi-proxy",
                s3_key=result.s3_key,
                checksum_md5=result.md5,
                checksum_sha1=result.sha1,
                s3_etag=result.s3_etag,
            )
            db.add(new_artifact)
            db.flush()

        # Create/get system project and package
        system_project = db.query(Project).filter(Project.name == "_pypi").first()
        if not system_project:
            system_project = Project(
                name="_pypi",
                description="System project for cached PyPI packages",
                is_public=True,
                is_system=True,
                created_by="pypi-proxy",
            )
            db.add(system_project)
            db.flush()
        elif not system_project.is_system:
            system_project.is_system = True
            db.flush()

        package = db.query(Package).filter(
            Package.project_id == system_project.id,
            Package.name == normalized_name,
        ).first()
        if not package:
            package = Package(
                project_id=system_project.id,
                name=normalized_name,
                description=f"PyPI package: {normalized_name}",
                format="pypi",
            )
            db.add(package)
            db.flush()

        # Extract and create version
        version = _extract_pypi_version(filename)
        if version and not filename.endswith('.metadata'):
            existing_version = db.query(PackageVersion).filter(
                PackageVersion.package_id == package.id,
                PackageVersion.version == version,
            ).first()
            if not existing_version:
                pkg_version = PackageVersion(
                    package_id=package.id,
                    artifact_id=sha256,
                    version=version,
                    version_source="filename",
                    created_by="pypi-proxy",
                )
                db.add(pkg_version)

        # Cache the URL mapping
        existing_cached = db.query(CachedUrl).filter(CachedUrl.url_hash == url_hash).first()
        if not existing_cached:
            cached_url_record = CachedUrl(
                url_hash=url_hash,
                url=download_url,
                artifact_id=sha256,
            )
            db.add(cached_url_record)

        # Store extracted dependencies using batch operation
        if extracted_deps:
            seen_deps: dict[str, str] = {}
            for dep_name, dep_version in extracted_deps:
                if dep_name not in seen_deps:
                    seen_deps[dep_name] = dep_version if dep_version else "*"
            deps_to_store = [
                ("_pypi", dep_name, dep_version)
                for dep_name, dep_version in seen_deps.items()
            ]
            repo = ArtifactRepository(db)
            inserted = repo.batch_upsert_dependencies(sha256, deps_to_store)
            if inserted > 0:
                logger.debug(f"Stored {inserted} dependencies for {sha256[:12]}...")

        db.commit()

        return {
            "artifact_id": sha256,
            "size": size,
            "version": version,
            "already_cached": False,
        }
    except httpx.ConnectError as e:
        logger.warning(f"PyPI fetch: connection failed for {filename}: {e}")
        return None
    except httpx.TimeoutException as e:
        logger.warning(f"PyPI fetch: timeout for {filename}: {e}")
        return None
    except Exception:
        logger.exception(f"PyPI fetch: error downloading {filename}")
        return None
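
# Illustrative usage only (sketch, not part of this diff): during auto-fetch
# dependency resolution, a registry client could call the helper roughly like
# this. The example arguments are hypothetical, not taken from this commit.
#
#     result = await fetch_and_cache_pypi_package(
#         db=db,
#         storage=storage,
#         http_client=http_client,
#         package_name="requests",
#         filename="requests-2.31.0-py3-none-any.whl",
#         download_url="https://files.pythonhosted.org/packages/.../requests-2.31.0-py3-none-any.whl",
#     )
#     if result is None:
#         # Caller records the failure on the corresponding MissingDependency
#         # entry (fetch_attempted=True, fetch_error set accordingly).
#         ...
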
@router.get("/simple/{package_name}/{filename}")
async def pypi_download_file(
    request: Request,