Stream downloads to temp file to reduce memory usage

- Download packages in 64KB chunks to a temp file instead of loading them into memory
- Upload to S3 from the temp file (streaming)
- Clean up the temp file after processing
- Reduces memory footprint from 2x file size to 1x file size
2026-02-02 15:10:25 -06:00
parent 1667c5a416
commit 415ad9a29a
1 changed files with 25 additions and 7 deletions
--- a/backend/app/pypi_proxy.py
+++ b/backend/app/pypi_proxy.py
@@ -7,8 +7,10 @@ Artifacts are cached on first access through configured upstream sources.

 import hashlib
 import logging
+import os
 import re
 import tarfile
+import tempfile
 import zipfile
 from io import BytesIO
 from typing import Optional, List, Tuple
@@ -635,16 +637,32 @@ async def pypi_download_file(
                    detail=f"Upstream returned {response.status_code}"
                )

-            content = response.content
            content_type = response.headers.get('content-type', 'application/octet-stream')

-        # Store in S3 (computes hash and deduplicates automatically)
-        from io import BytesIO
-        result = storage.store(BytesIO(content))
-        sha256 = result.sha256
-        size = result.size
+            # Stream to temp file to avoid loading large packages into memory
+            # This keeps memory usage constant regardless of package size
+            tmp_path = None
+            try:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
+                    tmp_path = tmp_file.name
+                    for chunk in response.iter_bytes(chunk_size=65536):  # 64KB chunks
+                        tmp_file.write(chunk)

-        logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
+                # Store in S3 from temp file (computes hash and deduplicates automatically)
+                with open(tmp_path, 'rb') as f:
+                    result = storage.store(f)
+                sha256 = result.sha256
+                size = result.size
+
+                # Read content for metadata extraction and response
+                with open(tmp_path, 'rb') as f:
+                    content = f.read()
+
+                logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
+            finally:
+                # Clean up temp file
+                if tmp_path and os.path.exists(tmp_path):
+                    os.unlink(tmp_path)

        # Check if artifact already exists
        existing = db.query(Artifact).filter(Artifact.id == sha256).first()