Add proactive dependency caching for PyPI packages

When a PyPI package is cached, its dependencies are now fetched
automatically in background threads. Because pip never re-requests
packages it already has installed locally, those dependencies would
otherwise never pass through the proxy; proactive fetching ensures the
entire dependency tree ends up in the cache regardless.
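
As a quick smoke test (not part of this commit), one request through the
proxy should be enough to start warming the cache for a package's whole
tree. This sketch assumes the service runs at http://localhost:8000 (a
placeholder) and mounts this router under the /pypi prefix, as the URLs
in the diff below suggest:

    import httpx

    base = "http://localhost:8000"  # placeholder host/port

    with httpx.Client(timeout=30.0, follow_redirects=True) as client:
        # The simple index page for a package; names are PEP 503-normalized.
        index = client.get(f"{base}/pypi/simple/requests/")
        index.raise_for_status()

        # Downloading any file listed on that page goes through
        # pypi_download_file, which now also spawns daemon threads that
        # pull the package's dependencies back through the proxy.
        print(index.text[:200])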

Features:
- Background threads fetch each dependency without blocking the response
- Fetches each dependency through our own proxy endpoint, which recursively caches transitive deps (package URLs use PEP 503-normalized names; see the sketch after this list)
- Max depth of 10 to prevent infinite loops
- Daemon threads so they don't block process shutdown
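
A minimal illustration (not from this commit) of the PEP 503 name
normalization the background task applies when building those proxy URLs:

    import re

    def normalize(name: str) -> str:
        # PEP 503: collapse runs of '-', '_', '.' to one '-', lowercase.
        return re.sub(r"[-_.]+", "-", name).lower()

    assert normalize("Typing_Extensions") == "typing-extensions"
    assert normalize("ruamel.yaml") == "ruamel-yaml"
    assert normalize("Flask--RESTful") == "flask-restful"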
commit f992fc540e
parent 044a6c1d27
Author: Mondo Diaz
Date: 2026-01-30 17:45:30 -06:00


@@ -9,13 +9,14 @@ import hashlib
 import logging
 import re
 import tarfile
+import threading
 import zipfile
 from io import BytesIO
 from typing import Optional, List, Tuple
 from urllib.parse import urljoin, urlparse, quote, unquote
 
 import httpx
-from fastapi import APIRouter, Depends, HTTPException, Request, Response
+from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request, Response
 from fastapi.responses import StreamingResponse, HTMLResponse
 from sqlalchemy.orm import Session
@@ -502,6 +503,84 @@ async def pypi_package_versions(
     )
 
 
+def _cache_dependency_background(
+    base_url: str,
+    dep_name: str,
+    dep_version: Optional[str],
+    depth: int = 0,
+    max_depth: int = 10,
+):
+    """
+    Background task to proactively cache a dependency.
+
+    Fetches the dependency from upstream via our own proxy, which will
+    recursively cache its dependencies as well.
+    """
+    if depth >= max_depth:
+        logger.warning(f"PyPI proxy: max depth {max_depth} reached caching {dep_name}")
+        return
+
+    try:
+        # Normalize package name for URL (PEP 503)
+        normalized_name = re.sub(r'[-_.]+', '-', dep_name).lower()
+
+        # First, get the simple index page to find available versions
+        simple_url = f"{base_url}/pypi/simple/{normalized_name}/"
+        logger.info(f"PyPI proxy: proactively caching {dep_name} (depth={depth})")
+
+        with httpx.Client(timeout=30.0) as client:
+            response = client.get(simple_url)
+            if response.status_code != 200:
+                logger.warning(f"PyPI proxy: failed to get index for {dep_name}: {response.status_code}")
+                return
+
+            # Parse the HTML to find wheel files
+            html = response.text
+
+            # Look for wheel files (.whl) - prefer them over sdist
+            wheel_pattern = rf'href="([^"]*{normalized_name}[^"]*\.whl[^"]*)"'
+            matches = re.findall(wheel_pattern, html, re.IGNORECASE)
+
+            if not matches:
+                # Try sdist
+                sdist_pattern = rf'href="([^"]*{normalized_name}[^"]*\.tar\.gz[^"]*)"'
+                matches = re.findall(sdist_pattern, html, re.IGNORECASE)
+
+            if not matches:
+                logger.warning(f"PyPI proxy: no downloadable files found for {dep_name}")
+                return
+
+            # Get the last match (usually the latest version)
+            # The URL might be relative or absolute
+            download_url = matches[-1]
+            if download_url.startswith('/'):
+                download_url = f"{base_url}{download_url}"
+            elif not download_url.startswith('http'):
+                download_url = f"{base_url}/pypi/simple/{normalized_name}/{download_url}"
+
+            # Download the file through our proxy (this will cache it)
+            logger.info(f"PyPI proxy: downloading dependency {dep_name} from {download_url}")
+            response = client.get(download_url)
+            if response.status_code == 200:
+                logger.info(f"PyPI proxy: successfully cached {dep_name}")
+            else:
+                logger.warning(f"PyPI proxy: failed to cache {dep_name}: {response.status_code}")
+
+    except Exception as e:
+        logger.warning(f"PyPI proxy: error caching dependency {dep_name}: {e}")
+
+
+def _start_background_dependency_caching(base_url: str, dependencies: List[Tuple[str, Optional[str]]]):
+    """Start background threads to cache dependencies."""
+    for dep_name, dep_version in dependencies:
+        # Use a thread to avoid blocking
+        thread = threading.Thread(
+            target=_cache_dependency_background,
+            args=(base_url, dep_name, dep_version),
+            daemon=True,
+        )
+        thread.start()
+
+
 @router.get("/simple/{package_name}/{filename}")
 async def pypi_download_file(
     request: Request,
@@ -736,10 +815,10 @@ async def pypi_download_file(
     # Extract and store dependencies
     dependencies = _extract_dependencies(content, filename)
+    unique_deps = []
     if dependencies:
         # Deduplicate dependencies by package name (keep first occurrence)
         seen_packages = set()
-        unique_deps = []
         for dep_name, dep_version in dependencies:
             if dep_name not in seen_packages:
                 seen_packages.add(dep_name)
@@ -765,6 +844,11 @@ async def pypi_download_file(
         db.commit()
 
+    # Proactively cache dependencies in the background
+    if unique_deps:
+        base_url = str(request.base_url).rstrip("/")
+        _start_background_dependency_caching(base_url, unique_deps)
+
     # Return the file
     return Response(
         content=content,