Add robust PyPI dependency caching with task queue

Replace unbounded thread spawning with a managed worker pool:
- New pypi_cache_tasks table tracks caching jobs (sketched below)
- Thread pool with 5 workers (configurable via ORCHARD_PYPI_CACHE_WORKERS)
- Automatic retries with exponential backoff (30s, 60s, then fail)
- Deduplication to prevent duplicate caching attempts
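
The queue model and worker settings live in code not shown in this diff (pypi_cache_worker.py and its migration), so the following is only a minimal sketch of what the table and knobs above could look like; the model name and every column name here are assumptions, not the actual schema.

    # Hypothetical sketch of the pypi_cache_tasks queue model and worker settings.
    # Only the table name, env var, worker count, and backoff schedule come from
    # this commit; every identifier below is an assumed name, not the real code.
    import os
    from datetime import datetime

    from sqlalchemy import Column, DateTime, Integer, String, Text
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()

    # Bounded pool size, overridable via ORCHARD_PYPI_CACHE_WORKERS (default 5).
    PYPI_CACHE_WORKERS = int(os.environ.get("ORCHARD_PYPI_CACHE_WORKERS", "5"))

    # Retry schedule: wait 30s, then 60s, then mark the task failed.
    RETRY_BACKOFF_SECONDS = [30, 60]


    class PypiCacheTask(Base):  # assumed model name
        """One row per dependency-caching job; status drives dedup and retries."""
        __tablename__ = "pypi_cache_tasks"

        id = Column(Integer, primary_key=True)
        package_name = Column(String, nullable=False, index=True)
        version_constraint = Column(String, nullable=True)
        status = Column(String, default="pending")  # pending / in_progress / completed / failed
        attempts = Column(Integer, default=0)
        last_error = Column(Text, nullable=True)
        next_retry_at = Column(DateTime, nullable=True)
        created_at = Column(DateTime, default=datetime.utcnow)

Under that assumed schema, deduplication amounts to checking for an existing pending or in_progress row for the same package before inserting a new one.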

New API endpoints for visibility and control (usage sketch after this list):
- GET /pypi/cache/status - queue health summary
- GET /pypi/cache/failed - list failed tasks with errors
- POST /pypi/cache/retry/{package} - retry single package
- POST /pypi/cache/retry-all - retry all failed packages
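
A quick client-side sketch of these endpoints; the base URL and the example package name ("requests") are assumptions, not part of this commit.

    # Usage sketch for the cache management endpoints.
    import httpx

    BASE = "http://localhost:8000/pypi"  # assumed host/port for a local Orchard instance

    with httpx.Client(timeout=10.0) as client:
        # Queue health: task counts by pending / in_progress / completed / failed.
        print(client.get(f"{BASE}/cache/status").json())

        # Most recent failures with their error messages (capped at 10 here).
        print(client.get(f"{BASE}/cache/failed", params={"limit": 10}).json())

        # Re-queue one failed package (404 if it has no failed task), then all of them.
        print(client.post(f"{BASE}/cache/retry/requests").json())
        print(client.post(f"{BASE}/cache/retry-all").json())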

This fixes silent failures in background dependency caching, where packages
would fail to cache without any tracking or retry mechanism.
Mondo Diaz
2026-02-02 11:16:02 -06:00
parent db7d0bb7c4
commit e0562195df
7 changed files with 1071 additions and 94 deletions


@@ -9,7 +9,6 @@ import hashlib
import logging
import re
import tarfile
import threading
import zipfile
from io import BytesIO
from typing import Optional, List, Tuple
@@ -24,6 +23,13 @@ from .database import get_db
from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, Tag, PackageVersion, ArtifactDependency
from .storage import S3Storage, get_storage
from .config import get_env_upstream_sources
from .pypi_cache_worker import (
    enqueue_cache_task,
    get_cache_status,
    get_failed_tasks,
    retry_failed_task,
    retry_all_failed_tasks,
)

logger = logging.getLogger(__name__)
@@ -503,93 +509,6 @@ async def pypi_package_versions(
)
def _cache_dependency_background(
    base_url: str,
    dep_name: str,
    dep_version: Optional[str],
    depth: int = 0,
    max_depth: int = 10,
):
    """
    Background task to proactively cache a dependency.

    Fetches the dependency from upstream via our own proxy, which will
    recursively cache its dependencies as well.
    """
    if depth >= max_depth:
        logger.warning(f"PyPI proxy: max depth {max_depth} reached caching {dep_name}")
        return

    try:
        # Normalize package name for URL (PEP 503)
        normalized_name = re.sub(r'[-_.]+', '-', dep_name).lower()

        # First, get the simple index page to find available versions
        # Use HTTPS explicitly to avoid redirect issues that can drop trailing slashes
        if base_url.startswith('http://'):
            base_url = 'https://' + base_url[7:]
        simple_url = f"{base_url}/pypi/simple/{normalized_name}/"

        logger.info(f"PyPI proxy: proactively caching {dep_name} (depth={depth})")

        with httpx.Client(timeout=30.0, follow_redirects=True) as client:
            response = client.get(simple_url)
            if response.status_code != 200:
                logger.warning(f"PyPI proxy: failed to get index for {dep_name}: {response.status_code}")
                return

            # Parse the HTML to find wheel files
            html = response.text

            # Create pattern that matches both normalized (hyphens) and original (underscores) forms
            # PEP 503 normalizes to hyphens, but wheel filenames may use underscores
            name_pattern = re.sub(r'[-_]+', '[-_]+', normalized_name)

            # Look for wheel files (.whl) - prefer them over sdist
            wheel_pattern = rf'href="([^"]*{name_pattern}[^"]*\.whl[^"]*)"'
            matches = re.findall(wheel_pattern, html, re.IGNORECASE)

            if not matches:
                # Try sdist
                sdist_pattern = rf'href="([^"]*{name_pattern}[^"]*\.tar\.gz[^"]*)"'
                matches = re.findall(sdist_pattern, html, re.IGNORECASE)

            if not matches:
                # Debug: log first 500 chars of HTML and the pattern we're looking for
                logger.warning(f"PyPI proxy: no downloadable files found for {dep_name}. Pattern: {wheel_pattern}, HTML preview: {html[:500]}")
                return

            # Get the last match (usually the latest version)
            # The URL might be relative or absolute
            download_url = matches[-1]
            if download_url.startswith('/'):
                download_url = f"{base_url}{download_url}"
            elif not download_url.startswith('http'):
                download_url = f"{base_url}/pypi/simple/{normalized_name}/{download_url}"

            # Download the file through our proxy (this will cache it)
            logger.info(f"PyPI proxy: downloading dependency {dep_name} from {download_url}")
            response = client.get(download_url)

            if response.status_code == 200:
                logger.info(f"PyPI proxy: successfully cached {dep_name}")
            else:
                logger.warning(f"PyPI proxy: failed to cache {dep_name}: {response.status_code}")

    except Exception as e:
        logger.warning(f"PyPI proxy: error caching dependency {dep_name}: {e}")


def _start_background_dependency_caching(base_url: str, dependencies: List[Tuple[str, Optional[str]]]):
    """Start background threads to cache dependencies."""
    for dep_name, dep_version in dependencies:
        # Use a thread to avoid blocking
        thread = threading.Thread(
            target=_cache_dependency_background,
            args=(base_url, dep_name, dep_version),
            daemon=True,
        )
        thread.start()


@router.get("/simple/{package_name}/{filename}")
async def pypi_download_file(
    request: Request,
@@ -851,12 +770,20 @@ async def pypi_download_file(
            )
            db.add(dep)
        db.commit()

        # Proactively cache dependencies in the background
        # Proactively cache dependencies via task queue
        if unique_deps:
            base_url = str(request.base_url).rstrip("/")
            _start_background_dependency_caching(base_url, unique_deps)
            for dep_name, dep_version in unique_deps:
                enqueue_cache_task(
                    db,
                    package_name=dep_name,
                    version_constraint=dep_version,
                    parent_task_id=None,  # Top-level, triggered by user download
                    depth=0,
                    triggered_by_artifact=sha256,
                )
            logger.info(f"PyPI proxy: queued {len(unique_deps)} dependencies for caching")
            db.commit()

        # Return the file
        return Response(
@@ -879,3 +806,63 @@ async def pypi_download_file(
    except Exception as e:
        logger.exception(f"PyPI proxy: error downloading {filename}")
        raise HTTPException(status_code=500, detail=str(e))


# =============================================================================
# Cache Status and Management Endpoints
# =============================================================================

@router.get("/cache/status")
async def pypi_cache_status(db: Session = Depends(get_db)):
    """
    Get summary of the PyPI cache task queue.

    Returns counts of tasks by status (pending, in_progress, completed, failed).
    """
    return get_cache_status(db)


@router.get("/cache/failed")
async def pypi_cache_failed(
    limit: int = 50,
    db: Session = Depends(get_db),
):
    """
    Get list of failed cache tasks for debugging.

    Args:
        limit: Maximum number of tasks to return (default 50).
    """
    return get_failed_tasks(db, limit=limit)


@router.post("/cache/retry/{package_name}")
async def pypi_cache_retry(
    package_name: str,
    db: Session = Depends(get_db),
):
    """
    Reset a failed cache task to retry.

    Args:
        package_name: The package name to retry.
    """
    task = retry_failed_task(db, package_name)
    if not task:
        raise HTTPException(
            status_code=404,
            detail=f"No failed cache task found for package '{package_name}'"
        )
    return {"message": f"Retry queued for {task.package_name}", "task_id": str(task.id)}


@router.post("/cache/retry-all")
async def pypi_cache_retry_all(db: Session = Depends(get_db)):
    """
    Reset all failed cache tasks to retry.

    Returns the count of tasks that were reset.
    """
    count = retry_all_failed_tasks(db)
    return {"message": f"Queued {count} tasks for retry", "count": count}