Stream downloads to temp file to reduce memory usage
- Download packages in 64KB chunks to temp file instead of loading into memory - Upload to S3 from temp file (streaming) - Clean up temp file after processing - Reduces memory footprint from 2x file size to 1x file size
This commit is contained in:
@@ -7,8 +7,10 @@ Artifacts are cached on first access through configured upstream sources.
|
|||||||
|
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import tarfile
|
import tarfile
|
||||||
|
import tempfile
|
||||||
import zipfile
|
import zipfile
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import Optional, List, Tuple
|
from typing import Optional, List, Tuple
|
||||||
@@ -635,16 +637,32 @@ async def pypi_download_file(
|
|||||||
detail=f"Upstream returned {response.status_code}"
|
detail=f"Upstream returned {response.status_code}"
|
||||||
)
|
)
|
||||||
|
|
||||||
content = response.content
|
|
||||||
content_type = response.headers.get('content-type', 'application/octet-stream')
|
content_type = response.headers.get('content-type', 'application/octet-stream')
|
||||||
|
|
||||||
# Store in S3 (computes hash and deduplicates automatically)
|
# Stream to temp file to avoid loading large packages into memory
|
||||||
from io import BytesIO
|
# This keeps memory usage constant regardless of package size
|
||||||
result = storage.store(BytesIO(content))
|
tmp_path = None
|
||||||
sha256 = result.sha256
|
try:
|
||||||
size = result.size
|
with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
|
||||||
|
tmp_path = tmp_file.name
|
||||||
|
for chunk in response.iter_bytes(chunk_size=65536): # 64KB chunks
|
||||||
|
tmp_file.write(chunk)
|
||||||
|
|
||||||
logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
|
# Store in S3 from temp file (computes hash and deduplicates automatically)
|
||||||
|
with open(tmp_path, 'rb') as f:
|
||||||
|
result = storage.store(f)
|
||||||
|
sha256 = result.sha256
|
||||||
|
size = result.size
|
||||||
|
|
||||||
|
# Read content for metadata extraction and response
|
||||||
|
with open(tmp_path, 'rb') as f:
|
||||||
|
content = f.read()
|
||||||
|
|
||||||
|
logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
|
||||||
|
finally:
|
||||||
|
# Clean up temp file
|
||||||
|
if tmp_path and os.path.exists(tmp_path):
|
||||||
|
os.unlink(tmp_path)
|
||||||
|
|
||||||
# Check if artifact already exists
|
# Check if artifact already exists
|
||||||
existing = db.query(Artifact).filter(Artifact.id == sha256).first()
|
existing = db.query(Artifact).filter(Artifact.id == sha256).first()
|
||||||
|
|||||||
Reference in New Issue
Block a user