Add large file upload enhancements and tests (#43)

- Add upload duration/throughput metrics (duration_ms, throughput_mbps) to response
- Add upload progress logging for large files (hash computation and multipart upload)
- Add client disconnect handling during uploads with proper cleanup
- Add upload progress tracking endpoint GET /upload/{upload_id}/progress
- Add large file upload tests (10MB, 100MB, 1GB)
- Add upload cancellation and timeout handling tests
- Add API documentation for upload endpoints with curl, Python, JavaScript examples
This commit is contained in:
Mondo Diaz
2026-01-16 19:33:31 +00:00
parent a166b6d37f
commit 722e7d2d81
5 changed files with 894 additions and 11 deletions

View File

@@ -82,6 +82,7 @@ from .schemas import (
ResumableUploadCompleteRequest,
ResumableUploadCompleteResponse,
ResumableUploadStatusResponse,
UploadProgressResponse,
GlobalSearchResponse,
SearchResultProject,
SearchResultPackage,
@@ -2283,10 +2284,56 @@ def upload_artifact(
"""
Upload an artifact to a package.
Headers:
- X-Checksum-SHA256: Optional client-provided SHA256 for verification
- User-Agent: Captured for audit purposes
- Authorization: Bearer <api-key> for authentication
**Size Limits:**
- Minimum: 1 byte (empty files rejected)
- Maximum: 10GB (configurable via ORCHARD_MAX_FILE_SIZE)
- Files > 100MB automatically use S3 multipart upload
**Headers:**
- `X-Checksum-SHA256`: Optional SHA256 hash for server-side verification
- `Content-Length`: File size (required for early rejection of oversized files)
- `Authorization`: Bearer <api-key> for authentication
**Deduplication:**
Content-addressable storage automatically deduplicates identical files.
If the same content is uploaded multiple times, only one copy is stored.
**Response Metrics:**
- `duration_ms`: Upload duration in milliseconds
- `throughput_mbps`: Upload throughput in MB/s
- `deduplicated`: True if content already existed
**Example (curl):**
```bash
curl -X POST "http://localhost:8080/api/v1/project/myproject/mypackage/upload" \\
-H "Authorization: Bearer <api-key>" \\
-F "file=@myfile.tar.gz" \\
-F "tag=v1.0.0"
```
**Example (Python requests):**
```python
import requests
with open('myfile.tar.gz', 'rb') as f:
response = requests.post(
'http://localhost:8080/api/v1/project/myproject/mypackage/upload',
files={'file': f},
data={'tag': 'v1.0.0'},
headers={'Authorization': 'Bearer <api-key>'}
)
```
**Example (JavaScript fetch):**
```javascript
const formData = new FormData();
formData.append('file', fileInput.files[0]);
formData.append('tag', 'v1.0.0');
const response = await fetch('/api/v1/project/myproject/mypackage/upload', {
method: 'POST',
headers: { 'Authorization': 'Bearer <api-key>' },
body: formData
});
```
"""
start_time = time.time()
settings = get_settings()
@@ -2388,6 +2435,30 @@ def upload_artifact(
except StorageError as e:
logger.error(f"Storage error during upload: {e}")
raise HTTPException(status_code=500, detail="Internal storage error")
except (ConnectionResetError, BrokenPipeError) as e:
# Client disconnected during upload
logger.warning(
f"Client disconnected during upload: project={project_name} "
f"package={package_name} filename={file.filename} error={e}"
)
raise HTTPException(
status_code=499, # Client Closed Request (nginx convention)
detail="Client disconnected during upload",
)
except Exception as e:
# Catch-all for unexpected errors including client disconnects
error_str = str(e).lower()
if "connection" in error_str or "broken pipe" in error_str or "reset" in error_str:
logger.warning(
f"Client connection error during upload: project={project_name} "
f"package={package_name} filename={file.filename} error={e}"
)
raise HTTPException(
status_code=499,
detail="Client connection error during upload",
)
logger.error(f"Unexpected error during upload: {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Internal server error during upload")
# Verify client-provided checksum if present
checksum_verified = True
@@ -2580,6 +2651,12 @@ def upload_artifact(
detail="Failed to save upload record. Please retry.",
)
# Calculate throughput
throughput_mbps = None
if duration_ms > 0:
duration_seconds = duration_ms / 1000.0
throughput_mbps = round((storage_result.size / (1024 * 1024)) / duration_seconds, 2)
return UploadResponse(
artifact_id=storage_result.sha256,
sha256=storage_result.sha256,
@@ -2599,6 +2676,8 @@ def upload_artifact(
content_type=artifact.content_type,
original_name=artifact.original_name,
created_at=artifact.created_at,
duration_ms=duration_ms,
throughput_mbps=throughput_mbps,
)
@@ -2616,8 +2695,46 @@ def init_resumable_upload(
storage: S3Storage = Depends(get_storage),
):
"""
Initialize a resumable upload session.
Client must provide the SHA256 hash of the file in advance.
Initialize a resumable upload session for large files.
Resumable uploads allow uploading large files in chunks, with the ability
to resume after interruption. The client must compute the SHA256 hash
of the entire file before starting.
**Workflow:**
1. POST /upload/init - Initialize upload session (this endpoint)
2. PUT /upload/{upload_id}/part/{part_number} - Upload each part
3. GET /upload/{upload_id}/progress - Check upload progress (optional)
4. POST /upload/{upload_id}/complete - Finalize upload
5. DELETE /upload/{upload_id} - Abort upload (if needed)
**Chunk Size:**
Use the `chunk_size` returned in the response (10MB default).
Each part except the last must be exactly this size.
**Deduplication:**
If the expected_hash already exists in storage, the response will include
`already_exists: true` and no upload session is created.
**Example (curl):**
```bash
# Step 1: Initialize
curl -X POST "http://localhost:8080/api/v1/project/myproject/mypackage/upload/init" \\
-H "Authorization: Bearer <api-key>" \\
-H "Content-Type: application/json" \\
-d '{"expected_hash": "<sha256>", "filename": "large.tar.gz", "size": 104857600}'
# Step 2: Upload parts
curl -X PUT "http://localhost:8080/api/v1/project/myproject/mypackage/upload/<upload_id>/part/1" \\
-H "Authorization: Bearer <api-key>" \\
--data-binary @part1.bin
# Step 3: Complete
curl -X POST "http://localhost:8080/api/v1/project/myproject/mypackage/upload/<upload_id>/complete" \\
-H "Authorization: Bearer <api-key>" \\
-H "Content-Type: application/json" \\
-d '{"tag": "v1.0.0"}'
```
"""
user_id = get_user_id(request)
@@ -2711,6 +2828,10 @@ def init_resumable_upload(
# Initialize resumable upload
session = storage.initiate_resumable_upload(init_request.expected_hash)
# Set expected size for progress tracking
if session["upload_id"] and init_request.size:
storage.set_upload_expected_size(session["upload_id"], init_request.size)
return ResumableUploadInitResponse(
upload_id=session["upload_id"],
already_exists=False,
@@ -2777,6 +2898,64 @@ def upload_part(
raise HTTPException(status_code=404, detail=str(e))
@router.get(
"/api/v1/project/{project_name}/{package_name}/upload/{upload_id}/progress",
response_model=UploadProgressResponse,
)
def get_upload_progress(
project_name: str,
package_name: str,
upload_id: str,
db: Session = Depends(get_db),
storage: S3Storage = Depends(get_storage),
):
"""
Get progress information for an in-flight resumable upload.
Returns progress metrics including bytes uploaded, percent complete,
elapsed time, and throughput.
"""
# Validate project and package exist
project = db.query(Project).filter(Project.name == project_name).first()
if not project:
raise HTTPException(status_code=404, detail="Project not found")
package = (
db.query(Package)
.filter(Package.project_id == project.id, Package.name == package_name)
.first()
)
if not package:
raise HTTPException(status_code=404, detail="Package not found")
progress = storage.get_upload_progress(upload_id)
if not progress:
# Return not_found status instead of 404 to allow polling
return UploadProgressResponse(
upload_id=upload_id,
status="not_found",
bytes_uploaded=0,
)
from datetime import datetime, timezone
started_at_dt = None
if progress.get("started_at"):
started_at_dt = datetime.fromtimestamp(progress["started_at"], tz=timezone.utc)
return UploadProgressResponse(
upload_id=upload_id,
status=progress.get("status", "in_progress"),
bytes_uploaded=progress.get("bytes_uploaded", 0),
bytes_total=progress.get("bytes_total"),
percent_complete=progress.get("percent_complete"),
parts_uploaded=progress.get("parts_uploaded", 0),
parts_total=progress.get("parts_total"),
started_at=started_at_dt,
elapsed_seconds=progress.get("elapsed_seconds"),
throughput_mbps=progress.get("throughput_mbps"),
)
@router.post(
"/api/v1/project/{project_name}/{package_name}/upload/{upload_id}/complete"
)