diff --git a/backend/app/pypi_proxy.py b/backend/app/pypi_proxy.py
index 699c600..5582fe0 100644
--- a/backend/app/pypi_proxy.py
+++ b/backend/app/pypi_proxy.py
@@ -9,13 +9,14 @@
 import hashlib
 import logging
 import re
 import tarfile
+import threading
 import zipfile
 from io import BytesIO
 from typing import Optional, List, Tuple
 from urllib.parse import urljoin, urlparse, quote, unquote
 
 import httpx
-from fastapi import APIRouter, Depends, HTTPException, Request, Response
+from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request, Response
 from fastapi.responses import StreamingResponse, HTMLResponse
 from sqlalchemy.orm import Session
@@ -502,6 +503,84 @@ async def pypi_package_versions(
     )
 
 
+def _cache_dependency_background(
+    base_url: str,
+    dep_name: str,
+    dep_version: Optional[str],
+    depth: int = 0,
+    max_depth: int = 10,
+):
+    """
+    Background task to proactively cache a dependency.
+
+    Fetches the dependency from upstream via our own proxy, which will
+    recursively cache its dependencies as well.
+    """
+    if depth >= max_depth:
+        logger.warning(f"PyPI proxy: max depth {max_depth} reached caching {dep_name}")
+        return
+
+    try:
+        # Normalize package name for URL (PEP 503)
+        normalized_name = re.sub(r'[-_.]+', '-', dep_name).lower()
+
+        # First, get the simple index page to find available versions
+        simple_url = f"{base_url}/pypi/simple/{normalized_name}/"
+        logger.info(f"PyPI proxy: proactively caching {dep_name} (depth={depth})")
+
+        with httpx.Client(timeout=30.0) as client:
+            response = client.get(simple_url)
+            if response.status_code != 200:
+                logger.warning(f"PyPI proxy: failed to get index for {dep_name}: {response.status_code}")
+                return
+
+            # Parse the HTML to find wheel files
+            html = response.text
+            # Look for wheel files (.whl) - prefer them over sdist
+            wheel_pattern = rf'href="([^"]*{normalized_name}[^"]*\.whl[^"]*)"'
+            matches = re.findall(wheel_pattern, html, re.IGNORECASE)
+
+            if not matches:
+                # Try sdist
+                sdist_pattern = rf'href="([^"]*{normalized_name}[^"]*\.tar\.gz[^"]*)"'
+                matches = re.findall(sdist_pattern, html, re.IGNORECASE)
+
+            if not matches:
+                logger.warning(f"PyPI proxy: no downloadable files found for {dep_name}")
+                return
+
+            # Get the last match (usually the latest version)
+            # The URL might be relative or absolute
+            download_url = matches[-1]
+            if download_url.startswith('/'):
+                download_url = f"{base_url}{download_url}"
+            elif not download_url.startswith('http'):
+                download_url = f"{base_url}/pypi/simple/{normalized_name}/{download_url}"
+
+            # Download the file through our proxy (this will cache it)
+            logger.info(f"PyPI proxy: downloading dependency {dep_name} from {download_url}")
+            response = client.get(download_url)
+            if response.status_code == 200:
+                logger.info(f"PyPI proxy: successfully cached {dep_name}")
+            else:
+                logger.warning(f"PyPI proxy: failed to cache {dep_name}: {response.status_code}")
+
+    except Exception as e:
+        logger.warning(f"PyPI proxy: error caching dependency {dep_name}: {e}")
+
+
+def _start_background_dependency_caching(base_url: str, dependencies: List[Tuple[str, Optional[str]]]):
+    """Start background threads to cache dependencies."""
+    for dep_name, dep_version in dependencies:
+        # Use a thread to avoid blocking
+        thread = threading.Thread(
+            target=_cache_dependency_background,
+            args=(base_url, dep_name, dep_version),
+            daemon=True,
+        )
+        thread.start()
+
+
 @router.get("/simple/{package_name}/{filename}")
 async def pypi_download_file(
     request: Request,
@@ -736,10 +815,10 @@ async def pypi_download_file(
 
         # Extract and store dependencies
        dependencies = _extract_dependencies(content, filename)
+        unique_deps = []
         if dependencies:
             # Deduplicate dependencies by package name (keep first occurrence)
             seen_packages = set()
-            unique_deps = []
             for dep_name, dep_version in dependencies:
                 if dep_name not in seen_packages:
                     seen_packages.add(dep_name)
@@ -765,6 +844,11 @@ async def pypi_download_file(
 
         db.commit()
 
+        # Proactively cache dependencies in the background
+        if unique_deps:
+            base_url = str(request.base_url).rstrip("/")
+            _start_background_dependency_caching(base_url, unique_deps)
+
         # Return the file
         return Response(
             content=content,
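
For reviewers, a minimal standalone sketch of the fire-and-forget pattern that `_start_background_dependency_caching` introduces: one daemon thread per dependency, each warming the cache by requesting the proxy's simple index page so the normal download path stores the package. `BASE_URL`, `warm_cache`, and the package names below are illustrative assumptions, not part of this diff.

```python
# Standalone sketch (not part of the diff): one daemon thread per dependency,
# each warming the proxy cache by requesting the package's simple index page.
import re
import threading

import httpx

BASE_URL = "http://localhost:8000"  # assumption: wherever the proxy is served


def warm_cache(dep_name: str) -> None:
    """Request the proxy's simple index so the package gets fetched and cached."""
    normalized = re.sub(r"[-_.]+", "-", dep_name).lower()  # PEP 503 name normalization
    with httpx.Client(timeout=30.0) as client:
        client.get(f"{BASE_URL}/pypi/simple/{normalized}/")


threads = [
    threading.Thread(target=warm_cache, args=(name,), daemon=True)
    for name in ("requests", "charset-normalizer")  # illustrative dependency names
]
for t in threads:
    t.start()
# Inside the server the daemon threads are simply left to finish on their own;
# a short-lived script joins them so they are not killed when the process exits.
for t in threads:
    t.join()
```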