Stream downloads to temp file to reduce memory usage

- Download packages in 64KB chunks to temp file instead of loading into memory - Upload to S3 from temp file (streaming) - Clean up temp file after processing - Reduces memory footprint from 2x file size to 1x file size
2026-02-02 15:10:25 -06:00
parent 1667c5a416
commit 415ad9a29a
1 changed files with 25 additions and 7 deletions
--- a/backend/app/pypi_proxy.py
+++ b/backend/app/pypi_proxy.py
@@ -7,8 +7,10 @@ Artifacts are cached on first access through configured upstream sources.
 import hashlib
 import logging
 import os
 import re
 import tarfile
 import tempfile
 import zipfile
 from io import BytesIO
 from typing import Optional, List, Tuple
@@ -635,16 +637,32 @@ async def pypi_download_file(
                    detail=f"Upstream returned {response.status_code}"
                )
            content = response.content
            content_type = response.headers.get('content-type', 'application/octet-stream')
-        # Store in S3 (computes hash and deduplicates automatically)
+            # Stream to temp file to avoid loading large packages into memory
-        from io import BytesIO
+            # This keeps memory usage constant regardless of package size
-        result = storage.store(BytesIO(content))
+            tmp_path = None
-        sha256 = result.sha256
+            try:
-        size = result.size
+                with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
                    tmp_path = tmp_file.name
                    for chunk in response.iter_bytes(chunk_size=65536):  # 64KB chunks
                        tmp_file.write(chunk)
-        logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
+                # Store in S3 from temp file (computes hash and deduplicates automatically)
                with open(tmp_path, 'rb') as f:
                    result = storage.store(f)
                sha256 = result.sha256
                size = result.size
                # Read content for metadata extraction and response
                with open(tmp_path, 'rb') as f:
                    content = f.read()
                logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
            finally:
                # Clean up temp file
                if tmp_path and os.path.exists(tmp_path):
                    os.unlink(tmp_path)
        # Check if artifact already exists
        existing = db.query(Artifact).filter(Artifact.id == sha256).first()