Add comprehensive upload/download tests and streaming enhancements (#38, #40, #42, #43)

This commit is contained in:
Mondo Diaz
2026-01-21 09:35:12 -06:00
parent f7ffc1c877
commit 584acd1e90
23 changed files with 5385 additions and 405 deletions


@@ -82,6 +82,7 @@ from .schemas import (
ResumableUploadCompleteRequest,
ResumableUploadCompleteResponse,
ResumableUploadStatusResponse,
UploadProgressResponse,
GlobalSearchResponse,
SearchResultProject,
SearchResultPackage,
@@ -143,6 +144,31 @@ def sanitize_filename(filename: str) -> str:
return re.sub(r'[\r\n"]', "", filename)
def build_content_disposition(filename: str) -> str:
"""Build a Content-Disposition header value with proper encoding.
For ASCII filenames, uses the simple form: attachment; filename="name"
For non-ASCII filenames, uses RFC 5987 encoding with UTF-8.
"""
from urllib.parse import quote
sanitized = sanitize_filename(filename)
# Check if filename is pure ASCII
try:
sanitized.encode('ascii')
# Pure ASCII - simple format
return f'attachment; filename="{sanitized}"'
except UnicodeEncodeError:
# Non-ASCII - use RFC 5987 encoding
# Provide both filename (ASCII fallback) and filename* (UTF-8 encoded)
ascii_fallback = sanitized.encode('ascii', errors='replace').decode('ascii')
# RFC 5987: filename*=charset'language'encoded_value
# We use UTF-8 encoding and percent-encode non-ASCII chars
encoded = quote(sanitized, safe='')
return f'attachment; filename="{ascii_fallback}"; filename*=UTF-8\'\'{encoded}'
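
A minimal usage sketch of the helper above (assuming `build_content_disposition` is importable as defined); the expected values in the comments follow from the ASCII check and the `errors='replace'` / `quote` fallback:

```python
# Hypothetical usage of build_content_disposition as defined above.
print(build_content_disposition("report-v1.tar.gz"))
# attachment; filename="report-v1.tar.gz"

print(build_content_disposition("café.tar.gz"))
# attachment; filename="caf?.tar.gz"; filename*=UTF-8''caf%C3%A9.tar.gz
```
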
def get_user_id_from_request(
request: Request,
db: Session,
@@ -2258,10 +2284,56 @@ def upload_artifact(
"""
Upload an artifact to a package.
Headers:
- X-Checksum-SHA256: Optional client-provided SHA256 for verification
- User-Agent: Captured for audit purposes
- Authorization: Bearer <api-key> for authentication
**Size Limits:**
- Minimum: 1 byte (empty files rejected)
- Maximum: 10GB (configurable via ORCHARD_MAX_FILE_SIZE)
- Files > 100MB automatically use S3 multipart upload
**Headers:**
- `X-Checksum-SHA256`: Optional SHA256 hash for server-side verification
- `Content-Length`: File size (required for early rejection of oversized files)
- `Authorization`: Bearer <api-key> for authentication
**Deduplication:**
Content-addressable storage automatically deduplicates identical files.
If the same content is uploaded multiple times, only one copy is stored.
**Response Metrics:**
- `duration_ms`: Upload duration in milliseconds
- `throughput_mbps`: Upload throughput in MB/s
- `deduplicated`: True if content already existed
**Example (curl):**
```bash
curl -X POST "http://localhost:8080/api/v1/project/myproject/mypackage/upload" \\
-H "Authorization: Bearer <api-key>" \\
-F "file=@myfile.tar.gz" \\
-F "tag=v1.0.0"
```
**Example (Python requests):**
```python
import requests
with open('myfile.tar.gz', 'rb') as f:
response = requests.post(
'http://localhost:8080/api/v1/project/myproject/mypackage/upload',
files={'file': f},
data={'tag': 'v1.0.0'},
headers={'Authorization': 'Bearer <api-key>'}
)
```
**Example (JavaScript fetch):**
```javascript
const formData = new FormData();
formData.append('file', fileInput.files[0]);
formData.append('tag', 'v1.0.0');
const response = await fetch('/api/v1/project/myproject/mypackage/upload', {
method: 'POST',
headers: { 'Authorization': 'Bearer <api-key>' },
body: formData
});
```
"""
start_time = time.time()
settings = get_settings()
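
The `X-Checksum-SHA256` header described in the docstring can be exercised from the client side as well; a rough sketch with `requests`, assuming the same endpoint URL and the documented response fields (`sha256`, `duration_ms`, `throughput_mbps`, `deduplicated`):

```python
import hashlib
import requests

# Compute the SHA256 locally so the server can verify it via X-Checksum-SHA256.
path = "myfile.tar.gz"
sha256 = hashlib.sha256(open(path, "rb").read()).hexdigest()

with open(path, "rb") as f:
    resp = requests.post(
        "http://localhost:8080/api/v1/project/myproject/mypackage/upload",
        files={"file": f},
        data={"tag": "v1.0.0"},
        headers={
            "Authorization": "Bearer <api-key>",
            "X-Checksum-SHA256": sha256,
        },
    )
resp.raise_for_status()
body = resp.json()
# The documented upload metrics come back alongside the artifact identity.
print(body["sha256"], body.get("duration_ms"), body.get("throughput_mbps"), body.get("deduplicated"))
```
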
@@ -2363,6 +2435,30 @@ def upload_artifact(
except StorageError as e:
logger.error(f"Storage error during upload: {e}")
raise HTTPException(status_code=500, detail="Internal storage error")
except (ConnectionResetError, BrokenPipeError) as e:
# Client disconnected during upload
logger.warning(
f"Client disconnected during upload: project={project_name} "
f"package={package_name} filename={file.filename} error={e}"
)
raise HTTPException(
status_code=499, # Client Closed Request (nginx convention)
detail="Client disconnected during upload",
)
except Exception as e:
# Catch-all for unexpected errors including client disconnects
error_str = str(e).lower()
if "connection" in error_str or "broken pipe" in error_str or "reset" in error_str:
logger.warning(
f"Client connection error during upload: project={project_name} "
f"package={package_name} filename={file.filename} error={e}"
)
raise HTTPException(
status_code=499,
detail="Client connection error during upload",
)
logger.error(f"Unexpected error during upload: {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Internal server error during upload")
# Verify client-provided checksum if present
checksum_verified = True
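
From the client's perspective, the mirror image of this handler is a dropped connection that surfaces as a transport error rather than a 499 response; a hedged sketch of a retry loop (the helper name, backoff, and timeouts are illustrative, not part of this commit's API):

```python
import time
import requests

# Hypothetical client-side retry for dropped connections; the server-side handler
# above only logs the disconnect and maps it to 499, it does not retry for you.
def upload_with_retry(url, path, api_key, attempts=3):
    for attempt in range(1, attempts + 1):
        try:
            with open(path, "rb") as f:
                resp = requests.post(
                    url,
                    files={"file": f},
                    headers={"Authorization": f"Bearer {api_key}"},
                    timeout=(10, 300),  # connect / read timeouts
                )
            resp.raise_for_status()
            return resp.json()
        except (requests.ConnectionError, requests.Timeout):
            if attempt == attempts:
                raise
            time.sleep(2 ** attempt)  # simple exponential backoff before retrying
```
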
@@ -2555,6 +2651,12 @@ def upload_artifact(
detail="Failed to save upload record. Please retry.",
)
# Calculate throughput
throughput_mbps = None
if duration_ms > 0:
duration_seconds = duration_ms / 1000.0
throughput_mbps = round((storage_result.size / (1024 * 1024)) / duration_seconds, 2)
return UploadResponse(
artifact_id=storage_result.sha256,
sha256=storage_result.sha256,
@@ -2574,6 +2676,8 @@ def upload_artifact(
content_type=artifact.content_type,
original_name=artifact.original_name,
created_at=artifact.created_at,
duration_ms=duration_ms,
throughput_mbps=throughput_mbps,
)
@@ -2591,8 +2695,46 @@ def init_resumable_upload(
storage: S3Storage = Depends(get_storage),
):
"""
Initialize a resumable upload session.
Client must provide the SHA256 hash of the file in advance.
Initialize a resumable upload session for large files.
Resumable uploads allow uploading large files in chunks, with the ability
to resume after interruption. The client must compute the SHA256 hash
of the entire file before starting.
**Workflow:**
1. POST /upload/init - Initialize upload session (this endpoint)
2. PUT /upload/{upload_id}/part/{part_number} - Upload each part
3. GET /upload/{upload_id}/progress - Check upload progress (optional)
4. POST /upload/{upload_id}/complete - Finalize upload
5. DELETE /upload/{upload_id} - Abort upload (if needed)
**Chunk Size:**
Use the `chunk_size` returned in the response (10MB default).
Each part except the last must be exactly this size.
**Deduplication:**
If the expected_hash already exists in storage, the response will include
`already_exists: true` and no upload session is created.
**Example (curl):**
```bash
# Step 1: Initialize
curl -X POST "http://localhost:8080/api/v1/project/myproject/mypackage/upload/init" \\
-H "Authorization: Bearer <api-key>" \\
-H "Content-Type: application/json" \\
-d '{"expected_hash": "<sha256>", "filename": "large.tar.gz", "size": 104857600}'
# Step 2: Upload parts
curl -X PUT "http://localhost:8080/api/v1/project/myproject/mypackage/upload/<upload_id>/part/1" \\
-H "Authorization: Bearer <api-key>" \\
--data-binary @part1.bin
# Step 3: Complete
curl -X POST "http://localhost:8080/api/v1/project/myproject/mypackage/upload/<upload_id>/complete" \\
-H "Authorization: Bearer <api-key>" \\
-H "Content-Type: application/json" \\
-d '{"tag": "v1.0.0"}'
```
"""
user_id = get_user_id(request)
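
The curl walk-through in the docstring maps directly onto a scripted client; a rough Python sketch of init → parts → complete, assuming the documented endpoint paths, the `already_exists`/`upload_id`/`chunk_size` fields from the init response, and the 10MB default chunk size:

```python
import hashlib
import os
import requests

BASE = "http://localhost:8080/api/v1/project/myproject/mypackage"
AUTH = {"Authorization": "Bearer <api-key>"}
path = "large.tar.gz"

# Step 0: hash the whole file up front, as required by /upload/init.
h = hashlib.sha256()
with open(path, "rb") as f:
    for block in iter(lambda: f.read(1024 * 1024), b""):
        h.update(block)

init = requests.post(
    f"{BASE}/upload/init",
    headers=AUTH,
    json={"expected_hash": h.hexdigest(), "filename": path, "size": os.path.getsize(path)},
)
init.raise_for_status()
session = init.json()

if not session.get("already_exists"):
    upload_id = session["upload_id"]
    chunk_size = session.get("chunk_size", 10 * 1024 * 1024)  # 10MB default per the docstring

    # Step 2: upload parts, 1-indexed; each part except the last is exactly chunk_size.
    with open(path, "rb") as f:
        part_number = 1
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            requests.put(
                f"{BASE}/upload/{upload_id}/part/{part_number}",
                headers=AUTH,
                data=chunk,
            ).raise_for_status()
            part_number += 1

    # Step 3: finalize the upload.
    requests.post(
        f"{BASE}/upload/{upload_id}/complete", headers=AUTH, json={"tag": "v1.0.0"}
    ).raise_for_status()
```
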
@@ -2686,6 +2828,10 @@ def init_resumable_upload(
# Initialize resumable upload
session = storage.initiate_resumable_upload(init_request.expected_hash)
# Set expected size for progress tracking
if session["upload_id"] and init_request.size:
storage.set_upload_expected_size(session["upload_id"], init_request.size)
return ResumableUploadInitResponse(
upload_id=session["upload_id"],
already_exists=False,
@@ -2752,6 +2898,64 @@ def upload_part(
raise HTTPException(status_code=404, detail=str(e))
@router.get(
"/api/v1/project/{project_name}/{package_name}/upload/{upload_id}/progress",
response_model=UploadProgressResponse,
)
def get_upload_progress(
project_name: str,
package_name: str,
upload_id: str,
db: Session = Depends(get_db),
storage: S3Storage = Depends(get_storage),
):
"""
Get progress information for an in-flight resumable upload.
Returns progress metrics including bytes uploaded, percent complete,
elapsed time, and throughput.
"""
# Validate project and package exist
project = db.query(Project).filter(Project.name == project_name).first()
if not project:
raise HTTPException(status_code=404, detail="Project not found")
package = (
db.query(Package)
.filter(Package.project_id == project.id, Package.name == package_name)
.first()
)
if not package:
raise HTTPException(status_code=404, detail="Package not found")
progress = storage.get_upload_progress(upload_id)
if not progress:
# Return not_found status instead of 404 to allow polling
return UploadProgressResponse(
upload_id=upload_id,
status="not_found",
bytes_uploaded=0,
)
from datetime import datetime, timezone
started_at_dt = None
if progress.get("started_at"):
started_at_dt = datetime.fromtimestamp(progress["started_at"], tz=timezone.utc)
return UploadProgressResponse(
upload_id=upload_id,
status=progress.get("status", "in_progress"),
bytes_uploaded=progress.get("bytes_uploaded", 0),
bytes_total=progress.get("bytes_total"),
percent_complete=progress.get("percent_complete"),
parts_uploaded=progress.get("parts_uploaded", 0),
parts_total=progress.get("parts_total"),
started_at=started_at_dt,
elapsed_seconds=progress.get("elapsed_seconds"),
throughput_mbps=progress.get("throughput_mbps"),
)
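
Because an unknown `upload_id` comes back as `status: "not_found"` rather than a 404, a poller can loop on this one endpoint; a small sketch under that assumption (the helper name is illustrative, and any status other than `in_progress`/`not_found` is assumed terminal):

```python
import time
import requests

# Hypothetical poller for the progress endpoint above. "not_found" is treated as
# "session not visible yet, keep waiting", mirroring the handler's polling-friendly
# behavior; statuses other than "in_progress"/"not_found" are assumed terminal.
def wait_for_upload(base_url, upload_id, headers, interval=2.0, timeout=600):
    deadline = time.time() + timeout
    while time.time() < deadline:
        resp = requests.get(f"{base_url}/upload/{upload_id}/progress", headers=headers)
        resp.raise_for_status()
        progress = resp.json()
        if progress.get("status") not in ("in_progress", "not_found"):
            return progress
        print(
            f"{progress.get('bytes_uploaded', 0)} bytes uploaded, "
            f"{progress.get('percent_complete') or 0}% at "
            f"{progress.get('throughput_mbps') or 0} MB/s"
        )
        time.sleep(interval)
    raise TimeoutError(f"upload {upload_id} did not finish within {timeout}s")
```
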
@router.post(
"/api/v1/project/{project_name}/{package_name}/upload/{upload_id}/complete"
)
@@ -2947,6 +3151,8 @@ def download_artifact(
storage: S3Storage = Depends(get_storage),
current_user: Optional[User] = Depends(get_current_user_optional),
range: Optional[str] = Header(None),
if_none_match: Optional[str] = Header(None, alias="If-None-Match"),
if_modified_since: Optional[str] = Header(None, alias="If-Modified-Since"),
mode: Optional[Literal["proxy", "redirect", "presigned"]] = Query(
default=None,
description="Download mode: proxy (stream through backend), redirect (302 to presigned URL), presigned (return JSON with URL)",
@@ -2963,6 +3169,15 @@ def download_artifact(
"""
Download an artifact by reference (tag name, artifact:hash, tag:name).
Supports conditional requests:
- If-None-Match: Returns 304 Not Modified if ETag matches
- If-Modified-Since: Returns 304 Not Modified if the artifact has not been modified since the given date
Supports range requests for partial downloads and resume:
- Range: bytes=0-1023 (first 1KB)
- Range: bytes=-1024 (last 1KB)
- Returns 206 Partial Content with Content-Range header
Verification modes:
- verify=false (default): No verification, maximum performance
- verify=true&verify_mode=stream: Compute hash while streaming, verify after completion.
@@ -2975,6 +3190,9 @@ def download_artifact(
- X-Content-Length: File size in bytes
- ETag: Artifact ID (SHA256)
- Digest: RFC 3230 format sha-256 hash
- Last-Modified: Artifact creation timestamp
- Cache-Control: Immutable caching for content-addressable storage
- Accept-Ranges: bytes (advertises range request support)
When verify=true:
- X-Verified: 'true' if verified, 'false' if verification failed
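
These conditional and range semantics are what make client-side caching and resume cheap; a hedged sketch of a downloader that revalidates with `If-None-Match` and resumes with `Range`, using only the headers documented above (the helper name and layout are illustrative):

```python
import os
import requests

# Hypothetical resuming downloader built on the documented headers:
# ETag / If-None-Match for cache validation, Range / 206 for resume.
def download(url, dest, headers=None, etag=None):
    headers = dict(headers or {})
    if etag:
        headers["If-None-Match"] = etag  # 304 means the cached copy is still valid
    if os.path.exists(dest):
        headers["Range"] = f"bytes={os.path.getsize(dest)}-"  # resume where we left off

    with requests.get(url, headers=headers, stream=True) as resp:
        if resp.status_code == 304:
            return dest  # not modified, keep the local copy
        if resp.status_code == 416:
            return dest  # requested start is past EOF, local file is already complete
        resp.raise_for_status()
        mode = "ab" if resp.status_code == 206 else "wb"  # append on partial content
        with open(dest, mode) as f:
            for chunk in resp.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    return dest
```
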
@@ -2999,6 +3217,52 @@ def download_artifact(
filename = sanitize_filename(artifact.original_name or f"{artifact.id}")
# Format Last-Modified header (RFC 7231 format)
last_modified = None
last_modified_str = None
if artifact.created_at:
last_modified = artifact.created_at
if last_modified.tzinfo is None:
last_modified = last_modified.replace(tzinfo=timezone.utc)
last_modified_str = last_modified.strftime("%a, %d %b %Y %H:%M:%S GMT")
# Handle conditional requests (If-None-Match, If-Modified-Since)
# Return 304 Not Modified if content hasn't changed
artifact_etag = f'"{artifact.id}"'
if if_none_match:
# Strip quotes and compare with artifact ETag
client_etag = if_none_match.strip().strip('"')
if client_etag == artifact.id or if_none_match == artifact_etag:
return Response(
status_code=304,
headers={
"ETag": artifact_etag,
"Cache-Control": "public, max-age=31536000, immutable",
**({"Last-Modified": last_modified_str} if last_modified_str else {}),
},
)
if if_modified_since and last_modified:
try:
# Parse If-Modified-Since header
from email.utils import parsedate_to_datetime
client_date = parsedate_to_datetime(if_modified_since)
if client_date.tzinfo is None:
client_date = client_date.replace(tzinfo=timezone.utc)
# If artifact hasn't been modified since client's date, return 304
if last_modified <= client_date:
return Response(
status_code=304,
headers={
"ETag": artifact_etag,
"Cache-Control": "public, max-age=31536000, immutable",
**({"Last-Modified": last_modified_str} if last_modified_str else {}),
},
)
except (ValueError, TypeError):
pass # Invalid date format, ignore and continue with download
# Audit log download
user_id = get_user_id(request)
_log_audit(
@@ -3016,22 +3280,28 @@ def download_artifact(
)
db.commit()
# Build common checksum headers (always included)
checksum_headers = {
# Build common headers (always included)
common_headers = {
"X-Checksum-SHA256": artifact.id,
"X-Content-Length": str(artifact.size),
"ETag": f'"{artifact.id}"',
"ETag": artifact_etag,
# Cache-Control: content-addressable storage is immutable
"Cache-Control": "public, max-age=31536000, immutable",
}
# Add Last-Modified header
if last_modified_str:
common_headers["Last-Modified"] = last_modified_str
# Add RFC 3230 Digest header
try:
digest_base64 = sha256_to_base64(artifact.id)
checksum_headers["Digest"] = f"sha-256={digest_base64}"
common_headers["Digest"] = f"sha-256={digest_base64}"
except Exception:
pass # Skip if conversion fails
# Add MD5 checksum if available
if artifact.checksum_md5:
checksum_headers["X-Checksum-MD5"] = artifact.checksum_md5
common_headers["X-Checksum-MD5"] = artifact.checksum_md5
# Determine download mode (query param overrides server default)
download_mode = mode or settings.download_mode
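
The RFC 3230 `Digest` value is the raw SHA-256 bytes base64-encoded, while `X-Checksum-SHA256` carries the hex form; a small sketch of checking both on the client, assuming the header names set above (the helper is hypothetical):

```python
import base64
import hashlib

# Hypothetical client-side check of the two checksum headers set above.
def verify_download(payload: bytes, response_headers: dict) -> bool:
    digest = hashlib.sha256(payload)
    hex_ok = response_headers.get("X-Checksum-SHA256") == digest.hexdigest()
    # RFC 3230 form: Digest: sha-256=<base64 of the raw digest bytes>
    expected = f"sha-256={base64.b64encode(digest.digest()).decode('ascii')}"
    digest_ok = response_headers.get("Digest") == expected
    return hex_ok and digest_ok
```
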
@@ -3071,15 +3341,29 @@ def download_artifact(
# Proxy mode (default fallback) - stream through backend
# Handle range requests (verification not supported for partial downloads)
if range:
stream, content_length, content_range = storage.get_stream(
artifact.s3_key, range
)
try:
stream, content_length, content_range = storage.get_stream(
artifact.s3_key, range
)
except Exception as e:
# S3 returns InvalidRange error for unsatisfiable ranges
error_str = str(e).lower()
if "invalidrange" in error_str or "range" in error_str:
raise HTTPException(
status_code=416,
detail="Range Not Satisfiable",
headers={
"Content-Range": f"bytes */{artifact.size}",
"Accept-Ranges": "bytes",
},
)
raise
headers = {
"Content-Disposition": f'attachment; filename="{filename}"',
"Content-Disposition": build_content_disposition(filename),
"Accept-Ranges": "bytes",
"Content-Length": str(content_length),
**checksum_headers,
**common_headers,
}
if content_range:
headers["Content-Range"] = content_range
@@ -3094,9 +3378,9 @@ def download_artifact(
# Full download with optional verification
base_headers = {
"Content-Disposition": f'attachment; filename="{filename}"',
"Content-Disposition": build_content_disposition(filename),
"Accept-Ranges": "bytes",
**checksum_headers,
**common_headers,
}
# Pre-verification mode: verify before streaming
@@ -3164,11 +3448,42 @@ def download_artifact(
},
)
# No verification - direct streaming
# No verification - direct streaming with completion logging
stream, content_length, _ = storage.get_stream(artifact.s3_key)
def logged_stream():
"""Generator that yields chunks and logs completion/disconnection."""
import time
start_time = time.time()
bytes_sent = 0
try:
for chunk in stream:
bytes_sent += len(chunk)
yield chunk
# Download completed successfully
duration = time.time() - start_time
throughput_mbps = (bytes_sent / (1024 * 1024)) / duration if duration > 0 else 0
logger.info(
f"Download completed: artifact={artifact.id[:16]}... "
f"bytes={bytes_sent} duration={duration:.2f}s throughput={throughput_mbps:.2f}MB/s"
)
except GeneratorExit:
# Client disconnected before download completed
duration = time.time() - start_time
logger.warning(
f"Download interrupted: artifact={artifact.id[:16]}... "
f"bytes_sent={bytes_sent}/{content_length} duration={duration:.2f}s"
)
except Exception as e:
duration = time.time() - start_time
logger.error(
f"Download error: artifact={artifact.id[:16]}... "
f"bytes_sent={bytes_sent} duration={duration:.2f}s error={e}"
)
raise
return StreamingResponse(
stream,
logged_stream(),
media_type=artifact.content_type or "application/octet-stream",
headers={
**base_headers,
@@ -3276,7 +3591,7 @@ def head_artifact(
# Build headers with checksum information
headers = {
"Content-Disposition": f'attachment; filename="{filename}"',
"Content-Disposition": build_content_disposition(filename),
"Accept-Ranges": "bytes",
"Content-Length": str(artifact.size),
"X-Artifact-Id": artifact.id,