Add robust PyPI dependency caching with task queue
Replace unbounded thread spawning with a managed worker pool:
- New pypi_cache_tasks table tracks caching jobs
- Thread pool with 5 workers (configurable via ORCHARD_PYPI_CACHE_WORKERS)
- Automatic retries with exponential backoff (retry after 30s, then 60s, then mark failed; sketched below)
- Deduplication to prevent duplicate caching attempts
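
The worker pool and retry behaviour described above live in the new pypi_cache_worker module, which is not part of this file's diff. Below is a minimal sketch of the intended scheduling; the task fields and helper names are illustrative assumptions, not the actual module API:

import os
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Optional

# Backoff schedule from the commit message: retry after 30s, then 60s, then give up.
RETRY_DELAYS = [timedelta(seconds=30), timedelta(seconds=60)]


@dataclass
class CacheTask:
    """Illustrative stand-in for a row in pypi_cache_tasks (field names are assumptions)."""
    package_name: str
    version_constraint: Optional[str] = None
    attempts: int = 0
    status: str = "pending"  # pending | in_progress | completed | failed
    next_attempt_at: datetime = field(default_factory=datetime.utcnow)


def record_failure(task: CacheTask) -> CacheTask:
    """Apply the 30s/60s backoff; mark the task failed once both retries are used up."""
    if task.attempts < len(RETRY_DELAYS):
        task.next_attempt_at = datetime.utcnow() + RETRY_DELAYS[task.attempts]
        task.status = "pending"  # eligible to run again once the delay has elapsed
    else:
        task.status = "failed"
    task.attempts += 1
    return task


# Worker pool sized via ORCHARD_PYPI_CACHE_WORKERS (default 5), replacing ad-hoc threads.
pool = ThreadPoolExecutor(max_workers=int(os.environ.get("ORCHARD_PYPI_CACHE_WORKERS", "5")))
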
New API endpoints for visibility and control (usage example below):
- GET /pypi/cache/status - queue health summary
- GET /pypi/cache/failed - list failed tasks with errors
- POST /pypi/cache/retry/{package} - retry single package
- POST /pypi/cache/retry-all - retry all failed packages
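
For reference, the endpoints can be exercised with httpx (already used by this module). The host below is a hypothetical local deployment, and "requests" is just an example package name:

import httpx

BASE = "http://localhost:8080/pypi"  # hypothetical host; adjust to your deployment

with httpx.Client() as client:
    # Queue health summary: counts by status (pending, in_progress, completed, failed).
    print(client.get(f"{BASE}/cache/status").json())

    # Inspect up to 10 failed tasks, including their errors.
    print(client.get(f"{BASE}/cache/failed", params={"limit": 10}).json())

    # Retry a single failed package, then retry everything that is still failed.
    print(client.post(f"{BASE}/cache/retry/requests").json())
    print(client.post(f"{BASE}/cache/retry-all").json())
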
This fixes silent failures in background dependency caching, where packages
could fail to cache without any tracking or retry mechanism.
@@ -9,7 +9,6 @@ import hashlib
 import logging
 import re
 import tarfile
-import threading
 import zipfile
 from io import BytesIO
 from typing import Optional, List, Tuple
@@ -24,6 +23,13 @@ from .database import get_db
 from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, Tag, PackageVersion, ArtifactDependency
 from .storage import S3Storage, get_storage
 from .config import get_env_upstream_sources
+from .pypi_cache_worker import (
+    enqueue_cache_task,
+    get_cache_status,
+    get_failed_tasks,
+    retry_failed_task,
+    retry_all_failed_tasks,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -503,93 +509,6 @@ async def pypi_package_versions(
     )
 
 
-def _cache_dependency_background(
-    base_url: str,
-    dep_name: str,
-    dep_version: Optional[str],
-    depth: int = 0,
-    max_depth: int = 10,
-):
-    """
-    Background task to proactively cache a dependency.
-
-    Fetches the dependency from upstream via our own proxy, which will
-    recursively cache its dependencies as well.
-    """
-    if depth >= max_depth:
-        logger.warning(f"PyPI proxy: max depth {max_depth} reached caching {dep_name}")
-        return
-
-    try:
-        # Normalize package name for URL (PEP 503)
-        normalized_name = re.sub(r'[-_.]+', '-', dep_name).lower()
-
-        # First, get the simple index page to find available versions
-        # Use HTTPS explicitly to avoid redirect issues that can drop trailing slashes
-        if base_url.startswith('http://'):
-            base_url = 'https://' + base_url[7:]
-        simple_url = f"{base_url}/pypi/simple/{normalized_name}/"
-        logger.info(f"PyPI proxy: proactively caching {dep_name} (depth={depth})")
-
-        with httpx.Client(timeout=30.0, follow_redirects=True) as client:
-            response = client.get(simple_url)
-            if response.status_code != 200:
-                logger.warning(f"PyPI proxy: failed to get index for {dep_name}: {response.status_code}")
-                return
-
-            # Parse the HTML to find wheel files
-            html = response.text
-
-            # Create pattern that matches both normalized (hyphens) and original (underscores) forms
-            # PEP 503 normalizes to hyphens, but wheel filenames may use underscores
-            name_pattern = re.sub(r'[-_]+', '[-_]+', normalized_name)
-
-            # Look for wheel files (.whl) - prefer them over sdist
-            wheel_pattern = rf'href="([^"]*{name_pattern}[^"]*\.whl[^"]*)"'
-            matches = re.findall(wheel_pattern, html, re.IGNORECASE)
-
-            if not matches:
-                # Try sdist
-                sdist_pattern = rf'href="([^"]*{name_pattern}[^"]*\.tar\.gz[^"]*)"'
-                matches = re.findall(sdist_pattern, html, re.IGNORECASE)
-
-            if not matches:
-                # Debug: log first 500 chars of HTML and the pattern we're looking for
-                logger.warning(f"PyPI proxy: no downloadable files found for {dep_name}. Pattern: {wheel_pattern}, HTML preview: {html[:500]}")
-                return
-
-            # Get the last match (usually the latest version)
-            # The URL might be relative or absolute
-            download_url = matches[-1]
-            if download_url.startswith('/'):
-                download_url = f"{base_url}{download_url}"
-            elif not download_url.startswith('http'):
-                download_url = f"{base_url}/pypi/simple/{normalized_name}/{download_url}"
-
-            # Download the file through our proxy (this will cache it)
-            logger.info(f"PyPI proxy: downloading dependency {dep_name} from {download_url}")
-            response = client.get(download_url)
-            if response.status_code == 200:
-                logger.info(f"PyPI proxy: successfully cached {dep_name}")
-            else:
-                logger.warning(f"PyPI proxy: failed to cache {dep_name}: {response.status_code}")
-
-    except Exception as e:
-        logger.warning(f"PyPI proxy: error caching dependency {dep_name}: {e}")
-
-
-def _start_background_dependency_caching(base_url: str, dependencies: List[Tuple[str, Optional[str]]]):
-    """Start background threads to cache dependencies."""
-    for dep_name, dep_version in dependencies:
-        # Use a thread to avoid blocking
-        thread = threading.Thread(
-            target=_cache_dependency_background,
-            args=(base_url, dep_name, dep_version),
-            daemon=True,
-        )
-        thread.start()
-
-
 @router.get("/simple/{package_name}/{filename}")
 async def pypi_download_file(
     request: Request,
@@ -851,12 +770,20 @@ async def pypi_download_file(
             )
             db.add(dep)
-
-        db.commit()
-
-        # Proactively cache dependencies in the background
+        # Proactively cache dependencies via task queue
         if unique_deps:
             base_url = str(request.base_url).rstrip("/")
-            _start_background_dependency_caching(base_url, unique_deps)
+            for dep_name, dep_version in unique_deps:
+                enqueue_cache_task(
+                    db,
+                    package_name=dep_name,
+                    version_constraint=dep_version,
+                    parent_task_id=None,  # Top-level, triggered by user download
+                    depth=0,
+                    triggered_by_artifact=sha256,
+                )
+            logger.info(f"PyPI proxy: queued {len(unique_deps)} dependencies for caching")
+
+        db.commit()
 
         # Return the file
         return Response(
@@ -879,3 +806,63 @@ async def pypi_download_file(
     except Exception as e:
         logger.exception(f"PyPI proxy: error downloading {filename}")
         raise HTTPException(status_code=500, detail=str(e))
+
+
+# =============================================================================
+# Cache Status and Management Endpoints
+# =============================================================================
+
+
+@router.get("/cache/status")
+async def pypi_cache_status(db: Session = Depends(get_db)):
+    """
+    Get summary of the PyPI cache task queue.
+
+    Returns counts of tasks by status (pending, in_progress, completed, failed).
+    """
+    return get_cache_status(db)
+
+
+@router.get("/cache/failed")
+async def pypi_cache_failed(
+    limit: int = 50,
+    db: Session = Depends(get_db),
+):
+    """
+    Get list of failed cache tasks for debugging.
+
+    Args:
+        limit: Maximum number of tasks to return (default 50).
+    """
+    return get_failed_tasks(db, limit=limit)
+
+
+@router.post("/cache/retry/{package_name}")
+async def pypi_cache_retry(
+    package_name: str,
+    db: Session = Depends(get_db),
+):
+    """
+    Reset a failed cache task to retry.
+
+    Args:
+        package_name: The package name to retry.
+    """
+    task = retry_failed_task(db, package_name)
+    if not task:
+        raise HTTPException(
+            status_code=404,
+            detail=f"No failed cache task found for package '{package_name}'"
+        )
+    return {"message": f"Retry queued for {task.package_name}", "task_id": str(task.id)}
+
+
+@router.post("/cache/retry-all")
+async def pypi_cache_retry_all(db: Session = Depends(get_db)):
+    """
+    Reset all failed cache tasks to retry.
+
+    Returns the count of tasks that were reset.
+    """
+    count = retry_all_failed_tasks(db)
+    return {"message": f"Queued {count} tasks for retry", "count": count}
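
The diff above only touches the proxy routes; the pypi_cache_tasks table and enqueue_cache_task itself live in the new pypi_cache_worker module. A minimal sketch of how deduplicated enqueueing could look, with assumed (not actual) table and column names, and the helper renamed to make clear it is illustrative:

from datetime import datetime

from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class PyPICacheTask(Base):
    """Illustrative stand-in for the pypi_cache_tasks table (schema is an assumption)."""
    __tablename__ = "pypi_cache_tasks"

    id = Column(Integer, primary_key=True)
    package_name = Column(String, nullable=False)
    version_constraint = Column(String, nullable=True)
    parent_task_id = Column(Integer, nullable=True)
    depth = Column(Integer, default=0, nullable=False)
    triggered_by_artifact = Column(String, nullable=True)
    status = Column(String, default="pending", nullable=False)
    attempts = Column(Integer, default=0, nullable=False)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)


def enqueue_cache_task_sketch(db: Session, package_name: str, version_constraint=None, **kwargs):
    """Deduplicate: skip enqueueing if an equivalent task is already pending or running."""
    existing = (
        db.query(PyPICacheTask)
        .filter(
            PyPICacheTask.package_name == package_name,
            PyPICacheTask.version_constraint == version_constraint,
            PyPICacheTask.status.in_(["pending", "in_progress"]),
        )
        .first()
    )
    if existing:
        return existing

    task = PyPICacheTask(package_name=package_name, version_constraint=version_constraint, **kwargs)
    db.add(task)  # the caller commits, as the route code above does after its enqueue loop
    return task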