Stream downloads to temp file to reduce memory usage

- Download packages in 64KB chunks to a temp file instead of loading them into memory
- Upload to S3 by streaming from the temp file
- Clean up the temp file after processing
- Reduces memory footprint from 2x file size to 1x file size
Mondo Diaz
2026-02-02 15:10:25 -06:00
parent 7008d913bf
commit 36cf288526
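
For reference, the pattern the commit message describes looks roughly like the sketch below. This is a minimal illustration rather than the project's code: cache_package and store_artifact are hypothetical names, and it assumes httpx's streaming request API (httpx.stream) so the download itself is chunked; the handler in the diff instead iterates over a response object it already holds.

import os
import tempfile

import httpx


def cache_package(url: str, store_artifact):
    """Illustrative only: download url in 64KB chunks to a temp file,
    hand the open file to store_artifact (e.g. an S3-backed store),
    and always remove the temp file afterwards."""
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
            tmp_path = tmp_file.name
            # httpx.stream() keeps the body on the wire; iter_bytes() yields
            # chunks without buffering the whole download in memory.
            with httpx.stream("GET", url) as response:
                response.raise_for_status()
                for chunk in response.iter_bytes(chunk_size=65536):
                    tmp_file.write(chunk)
        # Upload from disk: the store reads the file object incrementally.
        with open(tmp_path, "rb") as f:
            return store_artifact(f)
    finally:
        # Temp file is removed whether the download or the upload fails.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)

The delete=False plus finally pattern in the diff serves the same purpose: the file must outlive the NamedTemporaryFile context so it can be reopened for the S3 upload and the metadata read, and the finally block guarantees cleanup on any exit path.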

@@ -7,8 +7,10 @@ Artifacts are cached on first access through configured upstream sources.
 import hashlib
 import logging
 import os
 import re
 import tarfile
+import tempfile
 import zipfile
 from io import BytesIO
 from typing import Optional, List, Tuple
@@ -635,16 +637,32 @@ async def pypi_download_file(
             detail=f"Upstream returned {response.status_code}"
         )
-    content = response.content
     content_type = response.headers.get('content-type', 'application/octet-stream')
-    # Store in S3 (computes hash and deduplicates automatically)
-    from io import BytesIO
-    result = storage.store(BytesIO(content))
+    # Stream to temp file to avoid loading large packages into memory
+    # This keeps memory usage constant regardless of package size
+    tmp_path = None
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
+            tmp_path = tmp_file.name
+            for chunk in response.iter_bytes(chunk_size=65536):  # 64KB chunks
+                tmp_file.write(chunk)
+        # Store in S3 from temp file (computes hash and deduplicates automatically)
+        with open(tmp_path, 'rb') as f:
+            result = storage.store(f)
+        sha256 = result.sha256
+        size = result.size
+        # Read content for metadata extraction and response
+        with open(tmp_path, 'rb') as f:
+            content = f.read()
+        logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
+    finally:
+        # Clean up temp file
+        if tmp_path and os.path.exists(tmp_path):
+            os.unlink(tmp_path)
     # Check if artifact already exists
     existing = db.query(Artifact).filter(Artifact.id == sha256).first()