Fix PyPI proxy timeout by streaming from S3 instead of loading into memory

Large packages like TensorFlow (~600MB) caused read timeouts because the
entire file was loaded into memory before responding to the client. Now
the file is stored to S3 first, then streamed back using StreamingResponse.
Mondo Diaz
2026-02-03 16:42:30 -06:00
parent 9a2b323fd8
commit da3fd7a601
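
The new code path depends on a storage.get_stream(s3_key) helper that is not part of this diff. A minimal sketch of what such a helper could look like on top of boto3 follows; the bucket name, client setup, and exact return shape are assumptions rather than code from this repository. A botocore StreamingBody already provides the iter_chunks() and close() methods the generator in the diff relies on:

import boto3

s3 = boto3.client("s3")
BUCKET = "package-cache"  # assumed bucket name, illustrative only

def get_stream(s3_key: str):
    """Fetch an object and return (body, content_length, content_type).

    body is a botocore StreamingBody: it supports iter_chunks() and
    close(), which is how the streaming response consumes it.
    """
    obj = s3.get_object(Bucket=BUCKET, Key=s3_key)
    return obj["Body"], obj["ContentLength"], obj.get("ContentType")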

@@ -578,10 +578,7 @@ async def pypi_download_file(
                 result = storage.store(f)
                 sha256 = result.sha256
                 size = result.size
+                s3_key = result.s3_key
-            # Read content for response
-            with open(tmp_path, 'rb') as f:
-                content = f.read()
             logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
         finally:
@@ -677,17 +674,31 @@ async def pypi_download_file(
         db.commit()
-        # Return the file
-        return Response(
-            content=content,
-            media_type=content_type,
-            headers={
-                "Content-Disposition": f'attachment; filename="{filename}"',
-                "Content-Length": str(size),
-                "X-Checksum-SHA256": sha256,
-                "X-Cache": "MISS",
-            }
-        )
+        # Stream the file from S3 (don't load into memory)
+        try:
+            stream, content_length, _ = storage.get_stream(s3_key)
+
+            def stream_content():
+                """Generator that yields chunks from the S3 stream."""
+                try:
+                    for chunk in stream.iter_chunks():
+                        yield chunk
+                finally:
+                    stream.close()
+
+            return StreamingResponse(
+                stream_content(),
+                media_type=content_type,
+                headers={
+                    "Content-Disposition": f'attachment; filename="{filename}"',
+                    "Content-Length": str(size),
+                    "X-Checksum-SHA256": sha256,
+                    "X-Cache": "MISS",
+                }
+            )
+        except Exception as e:
+            logger.error(f"PyPI proxy: error streaming from S3: {e}")
+            raise HTTPException(status_code=500, detail=f"Error streaming file: {e}")
     except httpx.ConnectError as e:
         raise HTTPException(status_code=502, detail=f"Connection failed: {e}")
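
One way to sanity-check the change is to pull a large wheel through the proxy with a streaming HTTP client and compare the computed digest against the X-Checksum-SHA256 header set above. The URL and filename below are hypothetical placeholders for a real deployment:

import hashlib

import httpx

# Hypothetical proxy URL and package path; substitute the real deployment.
url = "http://localhost:8000/pypi/packages/tensorflow/tensorflow-2.15.0-cp311-cp311-manylinux_2_17_x86_64.whl"

sha = hashlib.sha256()
with httpx.stream("GET", url, timeout=httpx.Timeout(10.0, read=120.0)) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_bytes():  # chunks arrive incrementally; nothing is buffered whole
        sha.update(chunk)
    expected = resp.headers.get("X-Checksum-SHA256")

print("computed:", sha.hexdigest())
print("expected:", expected)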