Add deduplication design doc, file size limits, and validation tests

- Add max_file_size (10GB) and min_file_size (1 byte) config options
- Add file size validation to regular and resumable upload endpoints
- Create comprehensive deduplication design document covering:
  - SHA256 algorithm selection rationale and migration path
  - Content-addressable storage model
  - S3 key derivation and prefix sharding (see the sketch after this list)
  - Duplicate detection workflow
  - Reference counting lifecycle
  - Edge cases and error handling
  - Collision detection strategy
  - Performance considerations
  - Operations runbook
- Add tests for empty file rejection and file size validation
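
A minimal sketch of the content-addressable key derivation and prefix sharding the design doc describes, in Python. The "artifacts/" root, two-level shard depth, and function name are illustrative assumptions, not details taken from the doc.

import hashlib

def derive_s3_key(content: bytes) -> str:
    """Derive a content-addressable S3 key from the SHA256 digest.

    The first two byte-pairs of the hex digest become prefix shards so
    objects spread evenly across S3 key prefixes. Shard depth and the
    "artifacts/" root are assumptions for illustration only.
    """
    digest = hashlib.sha256(content).hexdigest()
    return f"artifacts/{digest[0:2]}/{digest[2:4]}/{digest}"

# derive_s3_key(b"hello")
# -> "artifacts/2c/f2/2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824"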
Mondo Diaz
2026-01-05 15:35:21 -06:00
parent 32115fc1c5
commit 55a38ad850
5 changed files with 664 additions and 0 deletions


@@ -38,6 +38,10 @@ class Settings(BaseSettings):
    s3_read_timeout: int = 60  # Read timeout in seconds
    s3_max_retries: int = 3  # Max retry attempts for transient failures

    # Upload settings
    max_file_size: int = 10 * 1024 * 1024 * 1024  # 10GB default max file size
    min_file_size: int = 1  # Minimum 1 byte (empty files rejected)

    # Download settings
    download_mode: str = "presigned"  # "presigned", "redirect", or "proxy"
    presigned_url_expiry: int = (
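
For reference, the new defaults can be checked against a Settings instance. The import path below is a guess, since the diff does not show the module name; because Settings extends pydantic's BaseSettings, both values should also be overridable via environment variables (subject to whatever env_prefix the app configures).

# Illustrative only: the config module path is assumed, not shown in the diff.
from app.config import Settings

settings = Settings()
assert settings.max_file_size == 10 * 1024 * 1024 * 1024  # 10GB default
assert settings.min_file_size == 1                        # empty files rejected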


@@ -973,6 +973,20 @@ def upload_artifact(
    if not package:
        raise HTTPException(status_code=404, detail="Package not found")

    # Validate file size
    settings = get_settings()
    if content_length is not None:
        if content_length > settings.max_file_size:
            raise HTTPException(
                status_code=413,
                detail=f"File too large. Maximum size is {settings.max_file_size // (1024 * 1024 * 1024)}GB",
            )
        if content_length < settings.min_file_size:
            raise HTTPException(
                status_code=422,
                detail="Empty files are not allowed",
            )

    # Extract format-specific metadata before storing
    file_metadata = {}
    if file.filename:
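
The same min/max check is repeated in the resumable-upload init endpoint in the next hunk. A shared helper along these lines (a refactoring sketch, not code from this commit) would keep the two paths identical:

from fastapi import HTTPException

def validate_upload_size(size: int, settings) -> None:
    """Mirror of the size checks added in this commit: reject anything above
    max_file_size or below min_file_size. A suggestion, not part of the diff."""
    if size > settings.max_file_size:
        raise HTTPException(
            status_code=413,
            detail=(
                "File too large. Maximum size is "
                f"{settings.max_file_size // (1024 * 1024 * 1024)}GB"
            ),
        )
    if size < settings.min_file_size:
        raise HTTPException(status_code=422, detail="Empty files are not allowed")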
@@ -1162,6 +1176,19 @@ def init_resumable_upload(
    if not package:
        raise HTTPException(status_code=404, detail="Package not found")

    # Validate file size
    settings = get_settings()
    if init_request.size > settings.max_file_size:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Maximum size is {settings.max_file_size // (1024 * 1024 * 1024)}GB",
        )
    if init_request.size < settings.min_file_size:
        raise HTTPException(
            status_code=422,
            detail="Empty files are not allowed",
        )

    # Check if artifact already exists (deduplication)
    existing_artifact = (
        db.query(Artifact).filter(Artifact.id == init_request.expected_hash).first()
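
Deduplication here keys the Artifact table on the client-supplied expected_hash. A rough sketch of how a client would compute that value and the size before calling the init endpoint; the field names expected_hash and size come from the diff, while the streaming approach is an implementation choice of this sketch.

import hashlib

def hash_and_size(path: str, chunk_size: int = 8 * 1024 * 1024):
    """Compute the SHA256 digest and byte count a client would send as
    expected_hash and size when initiating a resumable upload. Chunked
    streaming is an assumption, not mandated by the API."""
    digest = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest(), size

# expected_hash, size = hash_and_size("artifact.bin")
# The init request would then carry both values for the server-side checks above.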


@@ -549,3 +549,56 @@ class TestUploadFailureCleanup:
        )
        assert tag_response.status_code == 200
        assert tag_response.json()["artifact_id"] == hash2


class TestFileSizeValidation:
    """Tests for file size limits and empty file rejection."""

    @pytest.mark.integration
    def test_empty_file_rejected(self, integration_client, test_package):
        """Test that empty files are rejected with appropriate error."""
        project, package = test_package

        # Try to upload empty content
        files = {"file": ("empty.txt", io.BytesIO(b""), "application/octet-stream")}
        response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload",
            files=files,
        )

        # Should be rejected (422 from storage layer or validation)
        assert response.status_code in [422, 400]

    @pytest.mark.integration
    def test_small_valid_file_accepted(self, integration_client, test_package):
        """Test that small (1 byte) files are accepted."""
        project, package = test_package
        content = b"X"  # Single byte

        result = upload_test_file(
            integration_client, project, package, content, tag="tiny"
        )
        assert result["artifact_id"] is not None
        assert result["size"] == 1

    @pytest.mark.integration
    def test_file_size_reported_correctly(
        self, integration_client, test_package, unique_test_id
    ):
        """Test that file size is correctly reported in response."""
        project, package = test_package
        content = f"Test content for size check {unique_test_id}".encode()
        expected_size = len(content)

        result = upload_test_file(
            integration_client, project, package, content, tag="size-test"
        )
        assert result["size"] == expected_size

        # Also verify via artifact endpoint
        artifact_response = integration_client.get(
            f"/api/v1/artifact/{result['artifact_id']}"
        )
        assert artifact_response.json()["size"] == expected_size
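
The last two tests rely on an upload_test_file helper defined elsewhere in the suite. A rough sketch of what it presumably does, with the return shape inferred from the assertions and everything else (filename, form fields, tag handling) assumed:

import io

def upload_test_file(client, project, package, content: bytes, tag=None):
    """Upload raw bytes through the regular endpoint and return the parsed
    JSON body, expected to include at least artifact_id and size. This is a
    guess at the real helper, not code from the commit."""
    files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")}
    data = {"tag": tag} if tag else {}
    response = client.post(
        f"/api/v1/project/{project}/{package}/upload", files=files, data=data
    )
    assert response.status_code == 200
    return response.json()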