Add deduplication design doc, file size limits, and validation tests

- Add max_file_size (10GB) and min_file_size (1 byte) config options
- Add file size validation to regular and resumable upload endpoints
- Create comprehensive deduplication design document covering:
  - SHA256 algorithm selection rationale and migration path
  - Content-addressable storage model
  - S3 key derivation and prefix sharding
  - Duplicate detection workflow
  - Reference counting lifecycle
  - Edge cases and error handling
  - Collision detection strategy
  - Performance considerations
  - Operations runbook
- Add tests for empty file rejection and file size validation
This commit is contained in:
Mondo Diaz
2026-01-05 15:35:21 -06:00
parent 32115fc1c5
commit 55a38ad850
5 changed files with 664 additions and 0 deletions

View File

@@ -38,6 +38,10 @@ class Settings(BaseSettings):
s3_read_timeout: int = 60 # Read timeout in seconds
s3_max_retries: int = 3 # Max retry attempts for transient failures
# Upload settings
max_file_size: int = 10 * 1024 * 1024 * 1024 # 10GB default max file size
min_file_size: int = 1 # Minimum 1 byte (empty files rejected)
# Download settings
download_mode: str = "presigned" # "presigned", "redirect", or "proxy"
presigned_url_expiry: int = (

View File

@@ -973,6 +973,20 @@ def upload_artifact(
if not package:
raise HTTPException(status_code=404, detail="Package not found")
# Validate file size against the configured limits (skipped when content_length is None)
settings = get_settings()
if content_length is not None:
if content_length > settings.max_file_size:
raise HTTPException(
status_code=413,
detail=f"File too large. Maximum size is {settings.max_file_size // (1024 * 1024 * 1024)}GB",
)
if content_length < settings.min_file_size:
raise HTTPException(
status_code=422,
detail="Empty files are not allowed",
)
# Extract format-specific metadata before storing
file_metadata = {}
if file.filename:
@@ -1162,6 +1176,19 @@ def init_resumable_upload(
if not package:
raise HTTPException(status_code=404, detail="Package not found")
# Validate file size
settings = get_settings()
if init_request.size > settings.max_file_size:
raise HTTPException(
status_code=413,
detail=f"File too large. Maximum size is {settings.max_file_size // (1024 * 1024 * 1024)}GB",
)
if init_request.size < settings.min_file_size:
raise HTTPException(
status_code=422,
detail="Empty files are not allowed",
)
# Check if artifact already exists (deduplication)
existing_artifact = (
db.query(Artifact).filter(Artifact.id == init_request.expected_hash).first()