Files
orchard/backend/tests/integration/test_large_uploads.py
Mondo Diaz 722e7d2d81 Add large file upload enhancements and tests (#43)
- Add upload duration/throughput metrics (duration_ms, throughput_mbps) to response
- Add upload progress logging for large files (hash computation and multipart upload)
- Add client disconnect handling during uploads with proper cleanup
- Add upload progress tracking endpoint GET /upload/{upload_id}/progress
- Add large file upload tests (10MB, 100MB, 1GB)
- Add upload cancellation and timeout handling tests
- Add API documentation for upload endpoints with curl, Python, JavaScript examples
2026-01-16 19:33:31 +00:00

551 lines
19 KiB
Python

"""
Integration tests for large file upload functionality.
Tests cover:
- Large file uploads (10MB, 100MB, 1GB)
- Multipart upload behavior
- Upload metrics (duration, throughput)
- Memory efficiency during uploads
- Upload progress tracking
Note: Large tests are marked with @pytest.mark.slow and will be skipped
by default. Run with `pytest --run-slow` to include them.
"""
import pytest
import io
import time
from tests.factories import (
compute_sha256,
upload_test_file,
s3_object_exists,
)
from tests.conftest import (
SIZE_1KB,
SIZE_100KB,
SIZE_1MB,
SIZE_10MB,
SIZE_100MB,
SIZE_1GB,
)
class TestUploadMetrics:
    """Tests for the duration and throughput metrics returned by uploads."""

    @pytest.mark.integration
    def test_upload_response_includes_duration_ms(self, integration_client, test_package):
        """Test upload response includes a non-negative duration_ms field."""
        project, package = test_package
        content = b"duration test content"

        result = upload_test_file(
            integration_client, project, package, content, tag="duration-test"
        )

        assert "duration_ms" in result
        assert result["duration_ms"] is not None
        assert result["duration_ms"] >= 0

    @pytest.mark.integration
    def test_upload_response_includes_throughput(self, integration_client, test_package):
        """Test upload response includes throughput_mbps field."""
        project, package = test_package
        content = b"throughput test content"

        result = upload_test_file(
            integration_client, project, package, content, tag="throughput-test"
        )

        # For small files throughput may be very high or None, so only the
        # presence of the key is checked here, not its value.
        assert "throughput_mbps" in result

    @pytest.mark.integration
    def test_upload_duration_reasonable(
        self, integration_client, test_package, sized_content
    ):
        """Test reported duration does not exceed the client-observed time.

        The server-reported ``duration_ms`` is compared against the elapsed
        time measured around the upload call, with a 1 second allowance.
        """
        project, package = test_package
        content, _ = sized_content(SIZE_1MB, seed=100)

        # perf_counter() is monotonic; time.time() can jump when the system
        # clock is adjusted, which would make this comparison flaky.
        start = time.perf_counter()
        result = upload_test_file(
            integration_client, project, package, content, tag="duration-check"
        )
        actual_duration_ms = (time.perf_counter() - start) * 1000

        assert result["duration_ms"] is not None
        # Only an upper bound is enforced: the reported value may be smaller
        # than what the client saw (network overhead), but not much larger.
        assert result["duration_ms"] <= actual_duration_ms + 1000  # Within 1s
class TestLargeFileUploads:
    """Tests for large file uploads using multipart."""

    @pytest.mark.integration
    def test_upload_10mb_file(self, integration_client, test_package, sized_content):
        """Test uploading a 10MB file and that metrics are populated."""
        project, package = test_package
        content, expected_hash = sized_content(SIZE_10MB, seed=200)

        result = upload_test_file(
            integration_client, project, package, content, tag="large-10mb"
        )

        assert result["artifact_id"] == expected_hash
        assert result["size"] == SIZE_10MB
        assert result["duration_ms"] is not None
        assert result["throughput_mbps"] is not None

    @pytest.mark.integration
    @pytest.mark.slow
    def test_upload_100mb_file(self, integration_client, test_package, sized_content):
        """Test uploading a 100MB file (triggers multipart upload)."""
        project, package = test_package
        content, expected_hash = sized_content(SIZE_100MB, seed=300)

        result = upload_test_file(
            integration_client, project, package, content, tag="large-100mb"
        )

        assert result["artifact_id"] == expected_hash
        assert result["size"] == SIZE_100MB
        # Verify S3 object exists
        assert s3_object_exists(expected_hash)

    @pytest.mark.integration
    @pytest.mark.slow
    @pytest.mark.large
    def test_upload_1gb_file(self, integration_client, test_package, sized_content):
        """Test uploading a 1GB file and that throughput is measurable."""
        project, package = test_package
        content, expected_hash = sized_content(SIZE_1GB, seed=400)

        result = upload_test_file(
            integration_client, project, package, content, tag="large-1gb"
        )

        assert result["artifact_id"] == expected_hash
        assert result["size"] == SIZE_1GB
        # Should have measurable throughput
        assert result["throughput_mbps"] is not None
        assert result["throughput_mbps"] > 0

    @pytest.mark.integration
    def test_large_file_deduplication(
        self, integration_client, test_package, sized_content, unique_test_id
    ):
        """Test deduplication works for large files.

        The same 10MB payload is uploaded twice under different tags; the
        second upload must be reported as deduplicated.
        """
        project, package = test_package

        # Use unique_test_id to ensure unique content per test run.
        # NOTE(review): if unique_test_id is a str, hash() is salted per
        # process, so this seed is not reproducible across runs.
        seed = hash(unique_test_id) % 10000
        content, expected_hash = sized_content(SIZE_10MB, seed=seed)

        # First upload. Its "deduplicated" flag is deliberately not asserted:
        # it may be True if a previous test uploaded the same content.
        result1 = upload_test_file(
            integration_client, project, package, content, tag=f"dedup-{unique_test_id}-1"
        )
        assert result1["artifact_id"] == expected_hash

        # Second upload of same content
        result2 = upload_test_file(
            integration_client, project, package, content, tag=f"dedup-{unique_test_id}-2"
        )
        assert result2["artifact_id"] == expected_hash
        # Second upload MUST be deduplicated
        assert result2["deduplicated"] is True
class TestUploadProgress:
    """Tests for the GET .../upload/{upload_id}/progress endpoint."""

    @pytest.mark.integration
    def test_progress_endpoint_returns_not_found_for_invalid_id(
        self, integration_client, test_package
    ):
        """Test an unknown upload ID yields HTTP 200 with a not_found status."""
        project, package = test_package
        progress_url = (
            f"/api/v1/project/{project}/{package}/upload/invalid-upload-id/progress"
        )

        response = integration_client.get(progress_url)

        assert response.status_code == 200
        payload = response.json()
        assert payload["status"] == "not_found"
        assert payload["upload_id"] == "invalid-upload-id"

    @pytest.mark.integration
    def test_progress_endpoint_requires_valid_project(
        self, integration_client, unique_test_id
    ):
        """Test a progress request for a nonexistent project returns 404."""
        progress_url = (
            f"/api/v1/project/nonexistent-{unique_test_id}/pkg/upload/upload-id/progress"
        )

        response = integration_client.get(progress_url)

        assert response.status_code == 404

    @pytest.mark.integration
    def test_progress_endpoint_requires_valid_package(
        self, integration_client, test_project, unique_test_id
    ):
        """Test a progress request for a nonexistent package returns 404."""
        progress_url = (
            f"/api/v1/project/{test_project}/nonexistent-{unique_test_id}"
            "/upload/upload-id/progress"
        )

        response = integration_client.get(progress_url)

        assert response.status_code == 404
class TestResumableUploadProgress:
    """Tests for progress tracking during resumable uploads."""

    @pytest.mark.integration
    def test_resumable_upload_init_and_progress(
        self, integration_client, test_package, sized_content
    ):
        """Test initializing resumable upload and checking progress.

        Creates an API key, initializes a resumable upload, and checks that
        the progress endpoint reports zero bytes uploaded. The upload is
        always aborted afterwards, even when an assertion fails.
        """
        project, package = test_package
        # Only the hash is needed; no part data is sent in this test.
        _, expected_hash = sized_content(SIZE_100KB, seed=600)

        # Get API key for auth
        api_key_response = integration_client.post(
            "/api/v1/auth/keys",
            json={"name": "progress-test-key"},
        )
        assert api_key_response.status_code == 200
        api_key = api_key_response.json()["key"]
        auth_headers = {"Authorization": f"Bearer {api_key}"}

        # Initialize resumable upload
        init_response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload/init",
            json={
                "expected_hash": expected_hash,
                "filename": "progress-test.bin",
                "size": SIZE_100KB,
            },
            headers=auth_headers,
        )
        assert init_response.status_code == 200

        upload_id = init_response.json().get("upload_id")
        if not upload_id:
            # Report the untested path explicitly instead of passing vacuously.
            pytest.skip("upload init returned no upload_id; no progress to check")

        try:
            # Check initial progress
            progress_response = integration_client.get(
                f"/api/v1/project/{project}/{package}/upload/{upload_id}/progress",
                headers=auth_headers,
            )
            assert progress_response.status_code == 200
            progress = progress_response.json()
            assert progress["status"] == "in_progress"
            assert progress["bytes_uploaded"] == 0
            assert progress["bytes_total"] == SIZE_100KB
        finally:
            # Abort to clean up; runs even if an assertion above failed so the
            # upload session is not left behind.
            integration_client.delete(
                f"/api/v1/project/{project}/{package}/upload/{upload_id}",
                headers=auth_headers,
            )
class TestUploadSizeLimits:
    """Tests for how the upload endpoint treats boundary file sizes."""

    @pytest.mark.integration
    def test_empty_file_rejected(self, integration_client, test_package):
        """Test a zero-byte upload is refused with 400 or 422."""
        project, package = test_package
        empty_stream = io.BytesIO(b"")
        upload_url = f"/api/v1/project/{project}/{package}/upload"

        response = integration_client.post(
            upload_url,
            files={"file": ("empty.txt", empty_stream, "application/octet-stream")},
        )

        assert response.status_code in (400, 422)

    @pytest.mark.integration
    def test_minimum_size_accepted(self, integration_client, test_package):
        """Test a single-byte upload succeeds and reports size 1."""
        project, package = test_package

        result = upload_test_file(
            integration_client, project, package, b"X", tag="min-size"
        )

        assert result["size"] == 1

    @pytest.mark.integration
    def test_content_length_header_used_in_response(self, integration_client, test_package):
        """Test the size in the upload response equals the payload length."""
        project, package = test_package
        payload = b"content length verification test"
        expected_size = len(payload)

        result = upload_test_file(
            integration_client, project, package, payload, tag="content-length-test"
        )

        # The reported size must be the real number of bytes sent.
        assert result["size"] == expected_size
class TestUploadErrorHandling:
    """Tests for the error responses of the upload endpoint."""

    @pytest.mark.integration
    def test_upload_to_nonexistent_project_returns_404(
        self, integration_client, unique_test_id
    ):
        """Test uploading into a project that does not exist returns 404."""
        payload = b"test content"
        upload_url = f"/api/v1/project/nonexistent-{unique_test_id}/pkg/upload"

        response = integration_client.post(
            upload_url,
            files={"file": ("test.bin", io.BytesIO(payload), "application/octet-stream")},
        )

        assert response.status_code == 404

    @pytest.mark.integration
    def test_upload_to_nonexistent_package_returns_404(
        self, integration_client, test_project, unique_test_id
    ):
        """Test uploading into a package that does not exist returns 404."""
        payload = b"test content"
        upload_url = f"/api/v1/project/{test_project}/nonexistent-{unique_test_id}/upload"

        response = integration_client.post(
            upload_url,
            files={"file": ("test.bin", io.BytesIO(payload), "application/octet-stream")},
        )

        assert response.status_code == 404

    @pytest.mark.integration
    def test_upload_without_file_returns_422(self, integration_client, test_package):
        """Test a request carrying only form data (no file part) returns 422."""
        project, package = test_package
        form_fields = {"tag": "no-file"}

        response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload",
            data=form_fields,
        )

        assert response.status_code == 422

    @pytest.mark.integration
    def test_upload_with_invalid_checksum_rejected(
        self, integration_client, test_package
    ):
        """Test a malformed X-Checksum-SHA256 header value returns 400."""
        project, package = test_package
        payload = b"checksum test"
        checksum_headers = {"X-Checksum-SHA256": "invalid-checksum"}

        response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload",
            files={"file": ("test.bin", io.BytesIO(payload), "application/octet-stream")},
            headers=checksum_headers,
        )

        assert response.status_code == 400

    @pytest.mark.integration
    def test_upload_with_mismatched_checksum_rejected(
        self, integration_client, test_package
    ):
        """Test a well-formed checksum that does not match the content returns 422."""
        project, package = test_package
        payload = b"mismatch test"
        # 64 characters long like a SHA-256 hex digest, but not this payload's hash.
        bogus_digest = "0" * 64

        response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload",
            files={"file": ("test.bin", io.BytesIO(payload), "application/octet-stream")},
            headers={"X-Checksum-SHA256": bogus_digest},
        )

        assert response.status_code == 422
        detail = response.json().get("detail", "")
        assert "verification failed" in detail.lower()
class TestResumableUploadCancellation:
    """Tests for resumable upload cancellation."""

    @pytest.mark.integration
    def test_abort_resumable_upload(self, integration_client, test_package, sized_content):
        """Test aborting a resumable upload cleans up properly.

        Initializes a resumable upload, aborts it before any part is sent,
        and checks that the progress endpoint then reports ``not_found``.
        """
        project, package = test_package
        # Only the hash is needed; no part data is sent in this test.
        _, expected_hash = sized_content(SIZE_100KB, seed=700)

        # Get API key for auth
        api_key_response = integration_client.post(
            "/api/v1/auth/keys",
            json={"name": "abort-test-key"},
        )
        assert api_key_response.status_code == 200
        api_key = api_key_response.json()["key"]
        auth_headers = {"Authorization": f"Bearer {api_key}"}

        # Initialize resumable upload
        init_response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload/init",
            json={
                "expected_hash": expected_hash,
                "filename": "abort-test.bin",
                "size": SIZE_100KB,
            },
            headers=auth_headers,
        )
        assert init_response.status_code == 200

        upload_id = init_response.json().get("upload_id")
        if not upload_id:
            # Report the untested path explicitly instead of passing vacuously.
            pytest.skip("upload init returned no upload_id; nothing to abort")

        # Abort the upload (without uploading any parts)
        abort_response = integration_client.delete(
            f"/api/v1/project/{project}/{package}/upload/{upload_id}",
            headers=auth_headers,
        )
        assert abort_response.status_code in [200, 204]

        # Verify progress shows not_found after abort
        progress_response = integration_client.get(
            f"/api/v1/project/{project}/{package}/upload/{upload_id}/progress",
            headers=auth_headers,
        )
        assert progress_response.status_code == 200
        assert progress_response.json()["status"] == "not_found"

    @pytest.mark.integration
    def test_abort_nonexistent_upload(self, integration_client, test_package):
        """Test aborting an unknown upload ID returns an acceptable status."""
        project, package = test_package

        # Get API key for auth
        api_key_response = integration_client.post(
            "/api/v1/auth/keys",
            json={"name": "abort-nonexistent-key"},
        )
        assert api_key_response.status_code == 200
        api_key = api_key_response.json()["key"]

        response = integration_client.delete(
            f"/api/v1/project/{project}/{package}/upload/nonexistent-upload-id",
            headers={"Authorization": f"Bearer {api_key}"},
        )

        # Either 404 (unknown upload) or a success code (200/204) when the
        # server treats delete as idempotent.
        assert response.status_code in [200, 204, 404]
class TestUploadTimeout:
    """Tests for upload timeout handling."""

    @pytest.mark.integration
    def test_upload_with_short_timeout_succeeds_for_small_file(
        self, integration_client, test_package
    ):
        """Test small file upload succeeds with reasonable timeout.

        No timeout is set here; the test relies on whatever timeout the
        ``integration_client`` fixture is configured with.
        """
        project, package = test_package
        content = b"small timeout test"

        # httpx client should handle this quickly
        result = upload_test_file(
            integration_client, project, package, content, tag="timeout-small"
        )

        assert result["artifact_id"] is not None

    @pytest.mark.integration
    def test_upload_response_duration_under_timeout(
        self, integration_client, test_package, sized_content
    ):
        """Test a 1MB upload completes in under 60 seconds."""
        project, package = test_package
        content, _ = sized_content(SIZE_1MB, seed=800)

        # perf_counter() is monotonic; time.time() can jump when the system
        # clock is adjusted and would give a wrong elapsed value.
        start = time.perf_counter()
        result = upload_test_file(
            integration_client, project, package, content, tag="timeout-check"
        )
        duration = time.perf_counter() - start

        # 1MB should upload in well under 60 seconds on local
        assert duration < 60
        assert result["artifact_id"] is not None
class TestConcurrentUploads:
    """Tests for concurrent upload handling."""

    @pytest.mark.integration
    def test_concurrent_different_files(
        self, integration_client, test_package, sized_content
    ):
        """Test concurrent uploads of different files succeed.

        Three worker threads each upload a distinct 100KB payload through
        their own HTTP client. Every upload must succeed and return the
        artifact ID matching its payload's expected hash.
        """
        from concurrent.futures import ThreadPoolExecutor, as_completed

        project, package = test_package

        # Get API key for auth
        api_key_response = integration_client.post(
            "/api/v1/auth/keys",
            json={"name": "concurrent-diff-key"},
        )
        assert api_key_response.status_code == 200
        api_key = api_key_response.json()["key"]

        num_uploads = 3
        # Appended to from the worker threads, inspected after they finish.
        results = []
        errors = []

        def upload_unique_file(idx):
            """Upload payload number *idx* and record its outcome."""
            try:
                from httpx import Client

                content, expected_hash = sized_content(SIZE_100KB, seed=900 + idx)
                # NOTE(review): hard-coded server address; confirm it matches
                # the target that integration_client talks to.
                base_url = "http://localhost:8080"
                # Each worker opens its own client instead of sharing the fixture.
                with Client(base_url=base_url, timeout=30.0) as client:
                    files = {
                        "file": (
                            f"concurrent-{idx}.bin",
                            io.BytesIO(content),
                            "application/octet-stream",
                        )
                    }
                    response = client.post(
                        f"/api/v1/project/{project}/{package}/upload",
                        files=files,
                        data={"tag": f"concurrent-diff-{idx}"},
                        headers={"Authorization": f"Bearer {api_key}"},
                    )
                    if response.status_code == 200:
                        results.append((idx, response.json(), expected_hash))
                    else:
                        errors.append(f"Upload {idx}: {response.status_code} - {response.text}")
            except Exception as e:
                # Deliberately broad: any failure is recorded and surfaced by
                # the assertion on `errors` below rather than lost in a thread.
                errors.append(f"Upload {idx}: {str(e)}")

        with ThreadPoolExecutor(max_workers=num_uploads) as executor:
            futures = [executor.submit(upload_unique_file, i) for i in range(num_uploads)]
            for future in as_completed(futures):
                # result() re-raises anything the worker did not catch itself,
                # so no failure is silently discarded.
                future.result()

        assert len(errors) == 0, f"Concurrent upload errors: {errors}"
        assert len(results) == num_uploads

        # Each upload should have unique artifact ID
        artifact_ids = {payload["artifact_id"] for _, payload, _ in results}
        assert len(artifact_ids) == num_uploads

        # Each should match expected hash
        for idx, result, expected_hash in results:
            assert result["artifact_id"] == expected_hash