2 Commits

Author SHA1 Message Date
Mondo Diaz
f992fc540e Add proactive dependency caching for PyPI packages
When a PyPI package is cached, its dependencies are now automatically
fetched in background threads. This ensures the entire dependency tree
is cached even when pip never requests some packages upstream because
it already has them installed locally.

Features:
- Background threads fetch each dependency without blocking the response
- Fetches each dependency through our own proxy endpoint, which recursively caches transitive deps
- Max depth of 10 to prevent infinite loops
- Daemon threads so they don't block process shutdown
2026-01-30 17:45:30 -06:00
Mondo Diaz
044a6c1d27 Fix duplicate dependency constraint causing 500 errors
- Deduplicate dependencies by package name before inserting
- Some packages (like anyio) list the same dep (trio) multiple times with
  different version constraints for different extras
- The unique constraint on (artifact_id, project, package) rejected these
- Also removed debug logging from dependencies.py
2026-01-30 17:43:49 -06:00
2 changed files with 94 additions and 19 deletions
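
Why the second commit was needed, in miniature: PyPI metadata may declare the same requirement several times under different extras, and inserting one row per declaration trips the unique constraint. A minimal sketch of the failure mode and the first-occurrence dedup (the trio constraints below are illustrative, not anyio's actual pins):

# anyio-style metadata: the same package listed twice for different extras.
deps = [("trio", ">=0.22"), ("trio", ">=0.25")]

# Inserting one row per entry would violate UNIQUE(artifact_id, project, package)
# and surface as a 500; keep the first occurrence per package instead.
seen_packages: set[str] = set()
unique_deps: list[tuple[str, str]] = []
for dep_name, dep_version in deps:
    if dep_name not in seen_packages:
        seen_packages.add(dep_name)
        unique_deps.append((dep_name, dep_version))

assert unique_deps == [("trio", ">=0.22")]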

dependencies.py View File

@@ -10,11 +10,8 @@ Handles:
 - Conflict detection
 """
-import logging
 import re
 import yaml
 
-logger = logging.getLogger(__name__)
-
 from typing import List, Dict, Any, Optional, Set, Tuple
 from sqlalchemy.orm import Session
 from sqlalchemy import and_
@@ -435,8 +432,6 @@ def _resolve_dependency_to_artifact(
     if not package:
         return None
 
-    logger.info(f"_resolve_dependency_to_artifact: package_id={package.id}, version={version!r}, tag={tag!r}")
-
     if version:
         # Check if this is a version constraint (>=, <, etc.) or exact version
         if _is_version_constraint(version):
@@ -454,7 +449,6 @@ def _resolve_dependency_to_artifact(
                 Artifact.id == pkg_version.artifact_id
             ).first()
             if artifact:
-                logger.info(f"_resolve_dependency_to_artifact: found by version {version}")
                 return (artifact.id, version, artifact.size)
 
         # Also check if there's a tag with this exact name
@@ -462,7 +456,6 @@ def _resolve_dependency_to_artifact(
             Tag.package_id == package.id,
             Tag.name == version,
         ).first()
-        logger.info(f"_resolve_dependency_to_artifact: tag lookup by version name, found={tag_record is not None}")
         if tag_record:
             artifact = db.query(Artifact).filter(
                 Artifact.id == tag_record.artifact_id
@@ -476,7 +469,6 @@ def _resolve_dependency_to_artifact(
             Tag.package_id == package.id,
             Tag.name == tag,
         ).first()
-        logger.info(f"_resolve_dependency_to_artifact: tag lookup by tag param, found={tag_record is not None}")
         if tag_record:
             artifact = db.query(Artifact).filter(
                 Artifact.id == tag_record.artifact_id
@@ -484,9 +476,6 @@ def _resolve_dependency_to_artifact(
         if artifact:
             return (artifact.id, tag, artifact.size)
 
-    # Debug: list actual tags
-    sample_tags = db.query(Tag).filter(Tag.package_id == package.id).limit(3).all()
-    logger.info(f"_resolve_dependency_to_artifact: NOT FOUND. Sample tags: {[t.name for t in sample_tags]}")
-
     return None
@@ -670,12 +659,9 @@ def resolve_dependencies(
         CircularDependencyError: If circular dependencies are detected
         DependencyConflictError: If conflicting versions are required
     """
-    logger.info(f"resolve_dependencies: project={project_name}, package={package_name}, ref={ref!r}")
-
    # Resolve the initial artifact
     project = db.query(Project).filter(Project.name == project_name).first()
     if not project:
-        logger.warning(f"resolve_dependencies: project '{project_name}' not found")
         raise DependencyNotFoundError(project_name, package_name, ref)
 
     package = db.query(Package).filter(
@@ -683,11 +669,8 @@ def resolve_dependencies(
         Package.name == package_name,
     ).first()
     if not package:
-        logger.warning(f"resolve_dependencies: package '{package_name}' not found in project '{project_name}'")
         raise DependencyNotFoundError(project_name, package_name, ref)
 
-    logger.info(f"resolve_dependencies: found package id={package.id}")
-
     # Try to find artifact by tag or version
     resolved = _resolve_dependency_to_artifact(
         db, project_name, package_name, ref, ref
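
For orientation, the lookup order that _resolve_dependency_to_artifact walks through in the context lines above, condensed into a runnable toy (the dict lookups stand in for the real Package/Tag/Artifact queries; constraint handling is elided):

from typing import Optional, Tuple

# Toy stand-ins for the Package/Tag/Artifact tables (values are hypothetical).
VERSIONS = {("pkg", "1.2.0"): ("sha-abc", 1024)}  # (package, version) -> (artifact_id, size)
TAGS = {("pkg", "stable"): ("sha-abc", 1024)}     # (package, tag) -> (artifact_id, size)

def _is_version_constraint(version: str) -> bool:
    return any(op in version for op in (">=", "<=", "==", "~=", "!=", ">", "<"))

def resolve(package: str, version: Optional[str], tag: Optional[str]) -> Optional[Tuple[str, str, int]]:
    if version and not _is_version_constraint(version):
        hit = VERSIONS.get((package, version))  # 1. exact version match
        if hit:
            return (hit[0], version, hit[1])
        hit = TAGS.get((package, version))      # 2. a tag named like the version
        if hit:
            return (hit[0], version, hit[1])
    if tag:
        hit = TAGS.get((package, tag))          # 3. the explicit tag parameter
        if hit:
            return (hit[0], tag, hit[1])
    return None                                 # 4. unresolved

assert resolve("pkg", "1.2.0", None) == ("sha-abc", "1.2.0", 1024)
assert resolve("pkg", None, "stable") == ("sha-abc", "stable", 1024)
assert resolve("pkg", "9.9.9", None) is None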

View File

@@ -9,13 +9,14 @@ import hashlib
 import logging
 import re
 import tarfile
+import threading
 import zipfile
 from io import BytesIO
 from typing import Optional, List, Tuple
 from urllib.parse import urljoin, urlparse, quote, unquote
 
 import httpx
-from fastapi import APIRouter, Depends, HTTPException, Request, Response
+from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request, Response
 from fastapi.responses import StreamingResponse, HTMLResponse
 from sqlalchemy.orm import Session
@@ -502,6 +503,84 @@
     )
 
 
+def _cache_dependency_background(
+    base_url: str,
+    dep_name: str,
+    dep_version: Optional[str],
+    depth: int = 0,
+    max_depth: int = 10,
+):
+    """
+    Background task to proactively cache a dependency.
+
+    Fetches the dependency from upstream via our own proxy, which will
+    recursively cache its dependencies as well.
+    """
+    if depth >= max_depth:
+        logger.warning(f"PyPI proxy: max depth {max_depth} reached caching {dep_name}")
+        return
+
+    try:
+        # Normalize package name for URL (PEP 503)
+        normalized_name = re.sub(r'[-_.]+', '-', dep_name).lower()
+
+        # First, get the simple index page to find available versions
+        simple_url = f"{base_url}/pypi/simple/{normalized_name}/"
+        logger.info(f"PyPI proxy: proactively caching {dep_name} (depth={depth})")
+
+        with httpx.Client(timeout=30.0) as client:
+            response = client.get(simple_url)
+            if response.status_code != 200:
+                logger.warning(f"PyPI proxy: failed to get index for {dep_name}: {response.status_code}")
+                return
+
+            # Parse the HTML to find wheel files
+            html = response.text
+
+            # Look for wheel files (.whl) - prefer them over sdist
+            wheel_pattern = rf'href="([^"]*{normalized_name}[^"]*\.whl[^"]*)"'
+            matches = re.findall(wheel_pattern, html, re.IGNORECASE)
+
+            if not matches:
+                # Try sdist
+                sdist_pattern = rf'href="([^"]*{normalized_name}[^"]*\.tar\.gz[^"]*)"'
+                matches = re.findall(sdist_pattern, html, re.IGNORECASE)
+
+            if not matches:
+                logger.warning(f"PyPI proxy: no downloadable files found for {dep_name}")
+                return
+
+            # Get the last match (usually the latest version)
+            # The URL might be relative or absolute
+            download_url = matches[-1]
+            if download_url.startswith('/'):
+                download_url = f"{base_url}{download_url}"
+            elif not download_url.startswith('http'):
+                download_url = f"{base_url}/pypi/simple/{normalized_name}/{download_url}"
+
+            # Download the file through our proxy (this will cache it)
+            logger.info(f"PyPI proxy: downloading dependency {dep_name} from {download_url}")
+            response = client.get(download_url)
+
+            if response.status_code == 200:
+                logger.info(f"PyPI proxy: successfully cached {dep_name}")
+            else:
+                logger.warning(f"PyPI proxy: failed to cache {dep_name}: {response.status_code}")
+    except Exception as e:
+        logger.warning(f"PyPI proxy: error caching dependency {dep_name}: {e}")
+
+
+def _start_background_dependency_caching(base_url: str, dependencies: List[Tuple[str, Optional[str]]]):
+    """Start background threads to cache dependencies."""
+    for dep_name, dep_version in dependencies:
+        # Use a thread to avoid blocking
+        thread = threading.Thread(
+            target=_cache_dependency_background,
+            args=(base_url, dep_name, dep_version),
+            daemon=True,
+        )
+        thread.start()
+
+
 @router.get("/simple/{package_name}/{filename}")
 async def pypi_download_file(
     request: Request,
@@ -736,9 +815,17 @@
         # Extract and store dependencies
         dependencies = _extract_dependencies(content, filename)
+        unique_deps = []
         if dependencies:
-            logger.info(f"PyPI proxy: extracted {len(dependencies)} dependencies from {filename}")
+            # Deduplicate dependencies by package name (keep first occurrence)
+            seen_packages = set()
             for dep_name, dep_version in dependencies:
+                if dep_name not in seen_packages:
+                    seen_packages.add(dep_name)
+                    unique_deps.append((dep_name, dep_version))
+            logger.info(f"PyPI proxy: extracted {len(unique_deps)} dependencies from {filename} (deduped from {len(dependencies)})")
+            for dep_name, dep_version in unique_deps:
                 # Check if this dependency already exists for this artifact
                 existing_dep = db.query(ArtifactDependency).filter(
                     ArtifactDependency.artifact_id == sha256,
@@ -757,6 +844,11 @@
             db.commit()
 
+        # Proactively cache dependencies in the background
+        if unique_deps:
+            base_url = str(request.base_url).rstrip("/")
+            _start_background_dependency_caching(base_url, unique_deps)
+
         # Return the file
         return Response(
             content=content,
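
Taken together, the two commits mean one download through the proxy should warm the cache for the whole deduplicated dependency tree. A hypothetical smoke test (the base URL, the /pypi mount path, and the 5-second settle time are assumptions; anyio's dependencies include idna, used as the spot-check):

import re
import time

import httpx

BASE = "http://localhost:8000"  # assumed local deployment

with httpx.Client(timeout=60.0, follow_redirects=True) as client:
    # 1. Pull one wheel through the proxy: this caches the file, records its
    #    deduplicated dependencies, and fans out the daemon threads.
    index = client.get(f"{BASE}/pypi/simple/anyio/")
    index.raise_for_status()
    href = re.findall(r'href="([^"]+\.whl[^"]*)"', index.text)[-1]
    # Assumes the index emits absolute or root-relative hrefs.
    client.get(href if href.startswith("http") else f"{BASE}{href}").raise_for_status()

    # 2. Give the background threads a moment, then spot-check a dependency;
    #    a quick 200 from the proxy suggests it was pre-cached.
    time.sleep(5)
    print(client.get(f"{BASE}/pypi/simple/idna/").status_code)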