diff --git a/backend/app/pypi_proxy.py b/backend/app/pypi_proxy.py index d62338c..a673436 100644 --- a/backend/app/pypi_proxy.py +++ b/backend/app/pypi_proxy.py @@ -7,8 +7,10 @@ Artifacts are cached on first access through configured upstream sources. import hashlib import logging +import os import re import tarfile +import tempfile import zipfile from io import BytesIO from typing import Optional, List, Tuple @@ -635,16 +637,32 @@ async def pypi_download_file( detail=f"Upstream returned {response.status_code}" ) - content = response.content content_type = response.headers.get('content-type', 'application/octet-stream') - # Store in S3 (computes hash and deduplicates automatically) - from io import BytesIO - result = storage.store(BytesIO(content)) - sha256 = result.sha256 - size = result.size + # Stream to temp file to avoid loading large packages into memory + # This keeps memory usage constant regardless of package size + tmp_path = None + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file: + tmp_path = tmp_file.name + for chunk in response.iter_bytes(chunk_size=65536): # 64KB chunks + tmp_file.write(chunk) - logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}") + # Store in S3 from temp file (computes hash and deduplicates automatically) + with open(tmp_path, 'rb') as f: + result = storage.store(f) + sha256 = result.sha256 + size = result.size + + # Read content for metadata extraction and response + with open(tmp_path, 'rb') as f: + content = f.read() + + logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}") + finally: + # Clean up temp file + if tmp_path and os.path.exists(tmp_path): + os.unlink(tmp_path) # Check if artifact already exists existing = db.query(Artifact).filter(Artifact.id == sha256).first()