Fix PyPI proxy timeout by streaming from S3 instead of loading into memory
Large packages like TensorFlow (~600MB) caused read timeouts because the entire file was loaded into memory before responding to the client. Now the file is stored to S3 first, then streamed back using StreamingResponse.
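For context, the pattern below is a minimal standalone sketch of the same idea: stream an S3 object through FastAPI's StreamingResponse instead of reading it into memory first. It uses boto3 directly rather than this project's storage.get_stream() helper, and the bucket name, key layout, and route are illustrative placeholders, not taken from this codebase.

# Minimal sketch of the streaming pattern, assuming boto3 + FastAPI.
# Bucket, key prefix, and route path are placeholders (not from this repo).
import boto3
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()
s3 = boto3.client("s3")

@app.get("/pypi/files/{filename}")
def download_file(filename: str):
    # get_object returns a StreamingBody; the payload is not read into memory here.
    obj = s3.get_object(Bucket="package-cache", Key=f"pypi/{filename}")
    body = obj["Body"]

    def iter_file():
        # Yield fixed-size chunks so a ~600 MB wheel never has to fit in RAM at once.
        try:
            for chunk in body.iter_chunks(chunk_size=1024 * 1024):
                yield chunk
        finally:
            body.close()

    return StreamingResponse(
        iter_file(),
        media_type="application/octet-stream",
        headers={
            "Content-Length": str(obj["ContentLength"]),
            "Content-Disposition": f'attachment; filename="{filename}"',
        },
    )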
@@ -578,10 +578,7 @@ async def pypi_download_file(
             result = storage.store(f)
             sha256 = result.sha256
             size = result.size
-
-            # Read content for response
-            with open(tmp_path, 'rb') as f:
-                content = f.read()
+            s3_key = result.s3_key
 
             logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
         finally:
@@ -677,9 +674,20 @@ async def pypi_download_file(
 
         db.commit()
 
-        # Return the file
-        return Response(
-            content=content,
+        # Stream the file from S3 (don't load into memory)
+        try:
+            stream, content_length, _ = storage.get_stream(s3_key)
+
+            def stream_content():
+                """Generator that yields chunks from the S3 stream."""
+                try:
+                    for chunk in stream.iter_chunks():
+                        yield chunk
+                finally:
+                    stream.close()
+
+            return StreamingResponse(
+                stream_content(),
             media_type=content_type,
             headers={
                 "Content-Disposition": f'attachment; filename="{filename}"',
@@ -688,6 +696,9 @@ async def pypi_download_file(
                 "X-Cache": "MISS",
             }
         )
+        except Exception as e:
+            logger.error(f"PyPI proxy: error streaming from S3: {e}")
+            raise HTTPException(status_code=500, detail=f"Error streaming file: {e}")
 
     except httpx.ConnectError as e:
         raise HTTPException(status_code=502, detail=f"Connection failed: {e}")