feat: add auto-fetch for missing dependencies from upstream registries
Add auto_fetch parameter to dependency resolution endpoint that fetches missing
dependencies from upstream registries (PyPI) when resolving.

- Add RegistryClient abstraction with PyPIRegistryClient implementation
- Extract fetch_and_cache_pypi_package() for reuse
- Add resolve_dependencies_with_fetch() async function
- Extend MissingDependency schema with fetch_attempted/fetch_error
- Add fetched list to DependencyResolutionResponse
- Add auto_fetch_max_depth config setting (default: 3)
- Remove Usage section from Package page UI
- Add 6 integration tests for auto-fetch functionality
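For orientation, here is a minimal sketch of what the new behavior looks like from the client side. The endpoint path, HTTP method, and base URL are hypothetical; only the auto_fetch flag, the fetched list, and the fetch_attempted/fetch_error fields on missing dependencies come from the summary above.

# Illustrative only: path, method, and host are assumptions inferred from the
# commit message, not taken from the code in this commit.
import asyncio

import httpx


async def resolve_with_auto_fetch(artifact_id: str) -> None:
    async with httpx.AsyncClient(base_url="https://orchard.example.com") as client:
        # auto_fetch asks the server to pull missing dependencies from PyPI,
        # bounded by the new auto_fetch_max_depth setting (default 3).
        resp = await client.get(
            f"/api/artifacts/{artifact_id}/dependencies",  # hypothetical path
            params={"auto_fetch": "true"},
        )
        resp.raise_for_status()
        data = resp.json()
        for item in data.get("fetched", []):
            print("fetched from upstream:", item)
        for dep in data.get("missing", []):
            # fetch_attempted / fetch_error are the new MissingDependency fields
            print("still missing:", dep.get("name"), dep.get("fetch_error"))


asyncio.run(resolve_with_auto_fetch("example-artifact-id"))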
@@ -572,6 +572,258 @@ async def pypi_package_versions(
async def fetch_and_cache_pypi_package(
    db: Session,
    storage: S3Storage,
    http_client: httpx.AsyncClient,
    package_name: str,
    filename: str,
    download_url: str,
    expected_sha256: Optional[str] = None,
) -> Optional[dict]:
    """
    Fetch a PyPI package from upstream and cache it in Orchard.

    This is the core caching logic extracted from pypi_download_file() for reuse
    by the registry client during auto-fetch dependency resolution.

    Args:
        db: Database session
        storage: S3 storage instance
        http_client: Async HTTP client for making requests
        package_name: Normalized package name (e.g., 'requests')
        filename: Package filename (e.g., 'requests-2.31.0-py3-none-any.whl')
        download_url: Full URL to download from upstream
        expected_sha256: Optional SHA256 to verify download integrity

    Returns:
        Dict with artifact_id, size, version, already_cached if successful.
        None if the fetch failed.
    """
    # Normalize package name
    normalized_name = re.sub(r'[-_.]+', '-', package_name).lower()

    # Check if we already have this URL cached
    url_hash = hashlib.sha256(download_url.encode()).hexdigest()
    cached_url = db.query(CachedUrl).filter(CachedUrl.url_hash == url_hash).first()

    if cached_url:
        # Already cached - return existing artifact info
        artifact = db.query(Artifact).filter(Artifact.id == cached_url.artifact_id).first()
        if artifact:
            version = _extract_pypi_version(filename)
            logger.info(f"PyPI fetch: {filename} already cached (artifact {artifact.id[:12]})")
            return {
                "artifact_id": artifact.id,
                "size": artifact.size,
                "version": version,
                "already_cached": True,
            }

    # Get upstream sources for auth headers
    sources = _get_pypi_upstream_sources(db)
    matched_source = sources[0] if sources else None

    headers = {"User-Agent": "Orchard-PyPI-Proxy/1.0"}
    if matched_source:
        headers.update(_build_auth_headers(matched_source))
    auth = _get_basic_auth(matched_source) if matched_source else None

    download_timeout = httpx.Timeout(connect=30.0, read=300.0, write=300.0, pool=30.0)

    try:
        logger.info(f"PyPI fetch: downloading {filename} from {download_url}")

        response = await http_client.get(
            download_url,
            headers=headers,
            auth=auth,
            timeout=download_timeout,
        )

        # Handle redirects manually
        redirect_count = 0
        while response.status_code in (301, 302, 303, 307, 308) and redirect_count < 5:
            redirect_url = response.headers.get('location')
            if not redirect_url:
                break

            if not redirect_url.startswith('http'):
                redirect_url = urljoin(download_url, redirect_url)

            logger.debug(f"PyPI fetch: following redirect to {redirect_url}")

            # Don't send auth to different hosts
            redirect_headers = {"User-Agent": "Orchard-PyPI-Proxy/1.0"}
            redirect_auth = None
            if urlparse(redirect_url).netloc == urlparse(download_url).netloc:
                redirect_headers.update(headers)
                redirect_auth = auth

            response = await http_client.get(
                redirect_url,
                headers=redirect_headers,
                auth=redirect_auth,
                follow_redirects=False,
                timeout=download_timeout,
            )
            redirect_count += 1

        if response.status_code != 200:
            error_detail = _parse_upstream_error(response)
            logger.warning(f"PyPI fetch: upstream returned {response.status_code} for {filename}: {error_detail}")
            return None

        content_type = response.headers.get('content-type', 'application/octet-stream')

        # Stream to temp file to avoid loading large packages into memory
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
                tmp_path = tmp_file.name
                async for chunk in response.aiter_bytes(chunk_size=65536):
                    tmp_file.write(chunk)

            # Store in S3 from temp file (computes hash and deduplicates automatically)
            with open(tmp_path, 'rb') as f:
                result = storage.store(f)
            sha256 = result.sha256
            size = result.size

            # Verify hash if expected
            if expected_sha256 and sha256 != expected_sha256.lower():
                logger.error(
                    f"PyPI fetch: hash mismatch for {filename}: "
                    f"expected {expected_sha256[:12]}, got {sha256[:12]}"
                )
                return None

            # Extract dependencies from the temp file
            extracted_deps = _extract_dependencies_from_file(tmp_path, filename)
            if extracted_deps:
                logger.info(f"PyPI fetch: extracted {len(extracted_deps)} dependencies from {filename}")

            logger.info(f"PyPI fetch: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
        finally:
            # Clean up temp file
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)

        # Check if artifact already exists
        existing = db.query(Artifact).filter(Artifact.id == sha256).first()
        if existing:
            existing.ref_count += 1
            db.flush()
        else:
            new_artifact = Artifact(
                id=sha256,
                original_name=filename,
                content_type=content_type,
                size=size,
                ref_count=1,
                created_by="pypi-proxy",
                s3_key=result.s3_key,
                checksum_md5=result.md5,
                checksum_sha1=result.sha1,
                s3_etag=result.s3_etag,
            )
            db.add(new_artifact)
            db.flush()

        # Create/get system project and package
        system_project = db.query(Project).filter(Project.name == "_pypi").first()
        if not system_project:
            system_project = Project(
                name="_pypi",
                description="System project for cached PyPI packages",
                is_public=True,
                is_system=True,
                created_by="pypi-proxy",
            )
            db.add(system_project)
            db.flush()
        elif not system_project.is_system:
            system_project.is_system = True
            db.flush()

        package = db.query(Package).filter(
            Package.project_id == system_project.id,
            Package.name == normalized_name,
        ).first()
        if not package:
            package = Package(
                project_id=system_project.id,
                name=normalized_name,
                description=f"PyPI package: {normalized_name}",
                format="pypi",
            )
            db.add(package)
            db.flush()

        # Extract and create version
        version = _extract_pypi_version(filename)
        if version and not filename.endswith('.metadata'):
            existing_version = db.query(PackageVersion).filter(
                PackageVersion.package_id == package.id,
                PackageVersion.version == version,
            ).first()
            if not existing_version:
                pkg_version = PackageVersion(
                    package_id=package.id,
                    artifact_id=sha256,
                    version=version,
                    version_source="filename",
                    created_by="pypi-proxy",
                )
                db.add(pkg_version)

        # Cache the URL mapping
        existing_cached = db.query(CachedUrl).filter(CachedUrl.url_hash == url_hash).first()
        if not existing_cached:
            cached_url_record = CachedUrl(
                url_hash=url_hash,
                url=download_url,
                artifact_id=sha256,
            )
            db.add(cached_url_record)

        # Store extracted dependencies using batch operation
        if extracted_deps:
            seen_deps: dict[str, str] = {}
            for dep_name, dep_version in extracted_deps:
                if dep_name not in seen_deps:
                    seen_deps[dep_name] = dep_version if dep_version else "*"

            deps_to_store = [
                ("_pypi", dep_name, dep_version)
                for dep_name, dep_version in seen_deps.items()
            ]

            repo = ArtifactRepository(db)
            inserted = repo.batch_upsert_dependencies(sha256, deps_to_store)
            if inserted > 0:
                logger.debug(f"Stored {inserted} dependencies for {sha256[:12]}...")

        db.commit()

        return {
            "artifact_id": sha256,
            "size": size,
            "version": version,
            "already_cached": False,
        }

    except httpx.ConnectError as e:
        logger.warning(f"PyPI fetch: connection failed for {filename}: {e}")
        return None
    except httpx.TimeoutException as e:
        logger.warning(f"PyPI fetch: timeout for {filename}: {e}")
        return None
    except Exception:
        logger.exception(f"PyPI fetch: error downloading {filename}")
        return None


@router.get("/simple/{package_name}/{filename}")
async def pypi_download_file(
    request: Request,
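As a usage note, a registry client performing auto-fetch might call the extracted helper roughly as sketched below. Only the signature and the dict-or-None return contract are taken from fetch_and_cache_pypi_package() above; the ensure_cached wrapper and its session/storage wiring are assumptions for illustration.

# Sketch only: ensure_cached is hypothetical; the call and the returned dict
# shape mirror fetch_and_cache_pypi_package() shown in the diff above.
import httpx


async def ensure_cached(db, storage, name, filename, url, expected_sha256=None):
    # One AsyncClient for the upstream download; redirect handling happens
    # inside fetch_and_cache_pypi_package itself.
    async with httpx.AsyncClient() as client:
        result = await fetch_and_cache_pypi_package(
            db=db,
            storage=storage,
            http_client=client,
            package_name=name,
            filename=filename,
            download_url=url,
            expected_sha256=expected_sha256,
        )
    if result is None:
        # Download failed (connection error, non-200, or hash mismatch);
        # a caller would record fetch_attempted/fetch_error on the missing entry.
        return None
    return result["artifact_id"]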