Stream downloads to temp file to reduce memory usage

- Download packages in 64KB chunks to temp file instead of loading into memory
- Upload to S3 from temp file (streaming)
- Clean up temp file after processing
- Reduces memory footprint from 2x file size to 1x file size
Mondo Diaz
2026-02-02 15:10:25 -06:00
parent 1667c5a416
commit 415ad9a29a
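
Below is a minimal sketch of the chunked-download pattern the commit message describes, assuming httpx for the transport; download_to_temp is an illustrative helper, not a function in this repo, and the handler in the diff below applies the same idea to an already-open response.

import os
import tempfile

import httpx


def download_to_temp(url: str, chunk_size: int = 65536) -> str:
    """Stream a response body to a temporary file in fixed-size chunks; return its path."""
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        tmp_path = tmp_file.name
        with httpx.stream("GET", url) as response:
            response.raise_for_status()
            # Only chunk_size bytes are held in memory at a time (64KB by default).
            for chunk in response.iter_bytes(chunk_size=chunk_size):
                tmp_file.write(chunk)
    return tmp_path

# Typical use, mirroring the handler below: upload from disk, then always delete.
#   tmp_path = download_to_temp(url)
#   try:
#       with open(tmp_path, "rb") as f:
#           result = storage.store(f)   # same store() call as in the diff
#   finally:
#       os.unlink(tmp_path)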

@@ -7,8 +7,10 @@ Artifacts are cached on first access through configured upstream sources.
 import hashlib
 import logging
+import os
 import re
 import tarfile
+import tempfile
 import zipfile
 from io import BytesIO
 from typing import Optional, List, Tuple
@@ -635,16 +637,32 @@ async def pypi_download_file(
detail=f"Upstream returned {response.status_code}" detail=f"Upstream returned {response.status_code}"
) )
content = response.content
content_type = response.headers.get('content-type', 'application/octet-stream') content_type = response.headers.get('content-type', 'application/octet-stream')
# Store in S3 (computes hash and deduplicates automatically) # Stream to temp file to avoid loading large packages into memory
from io import BytesIO # This keeps memory usage constant regardless of package size
result = storage.store(BytesIO(content)) tmp_path = None
try:
with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
tmp_path = tmp_file.name
for chunk in response.iter_bytes(chunk_size=65536): # 64KB chunks
tmp_file.write(chunk)
# Store in S3 from temp file (computes hash and deduplicates automatically)
with open(tmp_path, 'rb') as f:
result = storage.store(f)
sha256 = result.sha256 sha256 = result.sha256
size = result.size size = result.size
# Read content for metadata extraction and response
with open(tmp_path, 'rb') as f:
content = f.read()
logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}") logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
finally:
# Clean up temp file
if tmp_path and os.path.exists(tmp_path):
os.unlink(tmp_path)
# Check if artifact already exists # Check if artifact already exists
existing = db.query(Artifact).filter(Artifact.id == sha256).first() existing = db.query(Artifact).filter(Artifact.id == sha256).first()