Add deduplication design doc, file size limits, and validation tests
- Add max_file_size (10GB) and min_file_size (1 byte) config options
- Add file size validation to regular and resumable upload endpoints
- Create comprehensive deduplication design document covering:
  - SHA256 algorithm selection rationale and migration path
  - Content-addressable storage model
  - S3 key derivation and prefix sharding
  - Duplicate detection workflow
  - Reference counting lifecycle
  - Edge cases and error handling
  - Collision detection strategy
  - Performance considerations
  - Operations runbook
- Add tests for empty file rejection and file size validation
This commit is contained in:
@@ -38,6 +38,10 @@ class Settings(BaseSettings):
|
||||
s3_read_timeout: int = 60 # Read timeout in seconds
|
||||
s3_max_retries: int = 3 # Max retry attempts for transient failures
|
||||
|
||||
# Upload settings
|
||||
max_file_size: int = 10 * 1024 * 1024 * 1024 # 10GB default max file size
|
||||
min_file_size: int = 1 # Minimum 1 byte (empty files rejected)
|
||||
|
||||
# Download settings
|
||||
download_mode: str = "presigned" # "presigned", "redirect", or "proxy"
|
||||
presigned_url_expiry: int = (
|
||||
|
||||
@@ -973,6 +973,20 @@ def upload_artifact(
|
||||
if not package:
|
||||
raise HTTPException(status_code=404, detail="Package not found")
|
||||
|
||||
# Validate file size
|
||||
settings = get_settings()
|
||||
if content_length is not None:
|
||||
if content_length > settings.max_file_size:
|
||||
raise HTTPException(
|
||||
status_code=413,
|
||||
detail=f"File too large. Maximum size is {settings.max_file_size // (1024 * 1024 * 1024)}GB",
|
||||
)
|
||||
if content_length < settings.min_file_size:
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail="Empty files are not allowed",
|
||||
)
|
||||
|
||||
# Extract format-specific metadata before storing
|
||||
file_metadata = {}
|
||||
if file.filename:
|
||||
@@ -1162,6 +1176,19 @@ def init_resumable_upload(
|
||||
if not package:
|
||||
raise HTTPException(status_code=404, detail="Package not found")
|
||||
|
||||
# Validate file size
|
||||
settings = get_settings()
|
||||
if init_request.size > settings.max_file_size:
|
||||
raise HTTPException(
|
||||
status_code=413,
|
||||
detail=f"File too large. Maximum size is {settings.max_file_size // (1024 * 1024 * 1024)}GB",
|
||||
)
|
||||
if init_request.size < settings.min_file_size:
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail="Empty files are not allowed",
|
||||
)
|
||||
|
||||
# Check if artifact already exists (deduplication)
|
||||
existing_artifact = (
|
||||
db.query(Artifact).filter(Artifact.id == init_request.expected_hash).first()
|
||||
|
||||
@@ -549,3 +549,56 @@ class TestUploadFailureCleanup:
|
||||
)
|
||||
assert tag_response.status_code == 200
|
||||
assert tag_response.json()["artifact_id"] == hash2
|
||||
|
||||
|
||||
class TestFileSizeValidation:
    """Tests for file size limits and empty file rejection.

    All tests are integration tests: they drive the HTTP API through the
    ``integration_client`` fixture and upload into the project/package pair
    provided by the ``test_package`` fixture.
    """

    @pytest.mark.integration
    def test_empty_file_rejected(self, integration_client, test_package):
        """Test that empty files are rejected with appropriate error."""
        project, package = test_package

        # Try to upload empty content (a zero-byte multipart file part).
        files = {"file": ("empty.txt", io.BytesIO(b""), "application/octet-stream")}
        response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload",
            files=files,
        )

        # Should be rejected (422 from storage layer or validation); 400 is
        # also accepted so the test does not depend on which layer rejects
        # the upload first.
        assert response.status_code in [422, 400]

    @pytest.mark.integration
    def test_small_valid_file_accepted(self, integration_client, test_package):
        """Test that small (1 byte) files are accepted."""
        project, package = test_package
        content = b"X"  # Single byte

        result = upload_test_file(
            integration_client, project, package, content, tag="tiny"
        )

        assert result["artifact_id"] is not None
        assert result["size"] == 1

    @pytest.mark.integration
    def test_file_size_reported_correctly(
        self, integration_client, test_package, unique_test_id
    ):
        """Test that file size is correctly reported in response.

        The size is checked twice: in the upload response and again through
        the artifact lookup endpoint.
        """
        project, package = test_package
        # Embed the per-test id so the payload differs between test runs.
        content = f"Test content for size check {unique_test_id}".encode()
        expected_size = len(content)

        result = upload_test_file(
            integration_client, project, package, content, tag="size-test"
        )

        assert result["size"] == expected_size

        # Also verify via artifact endpoint
        artifact_response = integration_client.get(
            f"/api/v1/artifact/{result['artifact_id']}"
        )
        # Check the status before parsing the body so a failed lookup is
        # reported as a status mismatch rather than a KeyError on "size".
        assert artifact_response.status_code == 200
        assert artifact_response.json()["size"] == expected_size
|
||||
|
||||
Reference in New Issue
Block a user