From c1060feb5fa883031313d582b7f1842c1cc4a0c1 Mon Sep 17 00:00:00 2001
From: Mondo Diaz
Date: Fri, 30 Jan 2026 17:45:30 -0600
Subject: [PATCH] Add proactive dependency caching for PyPI packages

When a PyPI package is cached, its dependencies are now automatically
fetched in background threads. This ensures the entire dependency tree
is cached even when pip never requests some packages because they are
already installed locally.

Features:
- Background threads fetch each dependency without blocking the response
- Uses our own proxy endpoint to cache, which recursively caches
  transitive deps
- Max depth of 10 to prevent infinite loops
- Daemon threads so they don't block process shutdown
---
 backend/app/pypi_proxy.py | 88 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 2 deletions(-)

diff --git a/backend/app/pypi_proxy.py b/backend/app/pypi_proxy.py
index 699c600..5582fe0 100644
--- a/backend/app/pypi_proxy.py
+++ b/backend/app/pypi_proxy.py
@@ -9,13 +9,14 @@
 import hashlib
 import logging
 import re
 import tarfile
+import threading
 import zipfile
 from io import BytesIO
 from typing import Optional, List, Tuple
 from urllib.parse import urljoin, urlparse, quote, unquote
 
 import httpx
-from fastapi import APIRouter, Depends, HTTPException, Request, Response
+from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request, Response
 from fastapi.responses import StreamingResponse, HTMLResponse
 from sqlalchemy.orm import Session
@@ -502,6 +503,84 @@ async def pypi_package_versions(
     )
 
 
+def _cache_dependency_background(
+    base_url: str,
+    dep_name: str,
+    dep_version: Optional[str],
+    depth: int = 0,
+    max_depth: int = 10,
+):
+    """
+    Background task to proactively cache a dependency.
+
+    Fetches the dependency from upstream via our own proxy, which will
+    recursively cache its dependencies as well.
+    """
+    if depth >= max_depth:
+        logger.warning(f"PyPI proxy: max depth {max_depth} reached caching {dep_name}")
+        return
+
+    try:
+        # Normalize package name for URL (PEP 503)
+        normalized_name = re.sub(r'[-_.]+', '-', dep_name).lower()
+
+        # First, get the simple index page to find available versions
+        simple_url = f"{base_url}/pypi/simple/{normalized_name}/"
+        logger.info(f"PyPI proxy: proactively caching {dep_name} (depth={depth})")
+
+        with httpx.Client(timeout=30.0) as client:
+            response = client.get(simple_url)
+            if response.status_code != 200:
+                logger.warning(f"PyPI proxy: failed to get index for {dep_name}: {response.status_code}")
+                return
+
+            # Parse the HTML to find wheel files
+            html = response.text
+            # Look for wheel files (.whl) - prefer them over sdist
+            wheel_pattern = rf'href="([^"]*{normalized_name}[^"]*\.whl[^"]*)"'
+            matches = re.findall(wheel_pattern, html, re.IGNORECASE)
+
+            if not matches:
+                # Try sdist
+                sdist_pattern = rf'href="([^"]*{normalized_name}[^"]*\.tar\.gz[^"]*)"'
+                matches = re.findall(sdist_pattern, html, re.IGNORECASE)
+
+            if not matches:
+                logger.warning(f"PyPI proxy: no downloadable files found for {dep_name}")
+                return
+
+            # Get the last match (usually the latest version)
+            # The URL might be relative or absolute
+            download_url = matches[-1]
+            if download_url.startswith('/'):
+                download_url = f"{base_url}{download_url}"
+            elif not download_url.startswith('http'):
+                download_url = f"{base_url}/pypi/simple/{normalized_name}/{download_url}"
+
+            # Download the file through our proxy (this will cache it)
+            logger.info(f"PyPI proxy: downloading dependency {dep_name} from {download_url}")
+            response = client.get(download_url)
+            if response.status_code == 200:
+                logger.info(f"PyPI proxy: successfully cached {dep_name}")
+            else:
+                logger.warning(f"PyPI proxy: failed to cache {dep_name}: {response.status_code}")
+
+    except Exception as e:
+        logger.warning(f"PyPI proxy: error caching dependency {dep_name}: {e}")
+
+
+def _start_background_dependency_caching(base_url: str, dependencies: List[Tuple[str, Optional[str]]]):
+    """Start background threads to cache dependencies."""
+    for dep_name, dep_version in dependencies:
+        # Use a thread to avoid blocking
+        thread = threading.Thread(
+            target=_cache_dependency_background,
+            args=(base_url, dep_name, dep_version),
+            daemon=True,
+        )
+        thread.start()
+
+
 @router.get("/simple/{package_name}/{filename}")
 async def pypi_download_file(
     request: Request,
@@ -736,10 +815,10 @@ async def pypi_download_file(
 
     # Extract and store dependencies
     dependencies = _extract_dependencies(content, filename)
+    unique_deps = []
     if dependencies:
         # Deduplicate dependencies by package name (keep first occurrence)
         seen_packages = set()
-        unique_deps = []
         for dep_name, dep_version in dependencies:
             if dep_name not in seen_packages:
                 seen_packages.add(dep_name)
@@ -765,6 +844,11 @@
 
     db.commit()
 
+    # Proactively cache dependencies in the background
+    if unique_deps:
+        base_url = str(request.base_url).rstrip("/")
+        _start_background_dependency_caching(base_url, unique_deps)
+
     # Return the file
     return Response(
         content=content,