diff --git a/CHANGELOG.md b/CHANGELOG.md index c09996b..db52574 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,7 +35,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added integration tests for cascade deletion ref_count behavior (package/project delete) (#35) - Added integration tests for tag update ref_count adjustments (#35) - Added integration tests for garbage collection endpoints (#35) +- Added integration tests for file size validation (#35) - Added test dependencies to requirements.txt (pytest, pytest-asyncio, pytest-cov, httpx, moto) (#35) +- Added `ORCHARD_MAX_FILE_SIZE` config option (default: 10GB) for upload size limits (#37) +- Added `ORCHARD_MIN_FILE_SIZE` config option (default: 1 byte, rejects empty files) (#37) +- Added file size validation to upload and resumable upload endpoints (#37) +- Added comprehensive deduplication design document (`docs/design/deduplication-design.md`) (#37) ### Fixed - Fixed Helm chart `minio.ingress` conflicting with Bitnami MinIO subchart by renaming to `minioIngress` (#48) - Fixed JSON report serialization error for Decimal types in `GET /api/v1/stats/report` (#34) diff --git a/backend/app/config.py b/backend/app/config.py index a5b33d5..2aa4469 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -38,6 +38,10 @@ class Settings(BaseSettings): s3_read_timeout: int = 60 # Read timeout in seconds s3_max_retries: int = 3 # Max retry attempts for transient failures + # Upload settings + max_file_size: int = 10 * 1024 * 1024 * 1024 # 10GB default max file size + min_file_size: int = 1 # Minimum 1 byte (empty files rejected) + # Download settings download_mode: str = "presigned" # "presigned", "redirect", or "proxy" presigned_url_expiry: int = ( diff --git a/backend/app/routes.py b/backend/app/routes.py index dfb6b3d..311d6ab 100644 --- a/backend/app/routes.py +++ b/backend/app/routes.py @@ -973,6 +973,20 @@ def upload_artifact( if not package: raise HTTPException(status_code=404, detail="Package not found") + # Validate file size + settings = get_settings() + if content_length is not None: + if content_length > settings.max_file_size: + raise HTTPException( + status_code=413, + detail=f"File too large. Maximum size is {settings.max_file_size // (1024 * 1024 * 1024)}GB", + ) + if content_length < settings.min_file_size: + raise HTTPException( + status_code=422, + detail="Empty files are not allowed", + ) + # Extract format-specific metadata before storing file_metadata = {} if file.filename: @@ -1162,6 +1176,19 @@ def init_resumable_upload( if not package: raise HTTPException(status_code=404, detail="Package not found") + # Validate file size + settings = get_settings() + if init_request.size > settings.max_file_size: + raise HTTPException( + status_code=413, + detail=f"File too large. 
Maximum size is {settings.max_file_size // (1024 * 1024 * 1024)}GB", + ) + if init_request.size < settings.min_file_size: + raise HTTPException( + status_code=422, + detail="Empty files are not allowed", + ) + # Check if artifact already exists (deduplication) existing_artifact = ( db.query(Artifact).filter(Artifact.id == init_request.expected_hash).first() diff --git a/backend/tests/test_integration_uploads.py b/backend/tests/test_integration_uploads.py index fca0c67..d354390 100644 --- a/backend/tests/test_integration_uploads.py +++ b/backend/tests/test_integration_uploads.py @@ -549,3 +549,56 @@ class TestUploadFailureCleanup: ) assert tag_response.status_code == 200 assert tag_response.json()["artifact_id"] == hash2 + + +class TestFileSizeValidation: + """Tests for file size limits and empty file rejection.""" + + @pytest.mark.integration + def test_empty_file_rejected(self, integration_client, test_package): + """Test that empty files are rejected with appropriate error.""" + project, package = test_package + + # Try to upload empty content + files = {"file": ("empty.txt", io.BytesIO(b""), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + ) + + # Should be rejected (422 from storage layer or validation) + assert response.status_code in [422, 400] + + @pytest.mark.integration + def test_small_valid_file_accepted(self, integration_client, test_package): + """Test that small (1 byte) files are accepted.""" + project, package = test_package + content = b"X" # Single byte + + result = upload_test_file( + integration_client, project, package, content, tag="tiny" + ) + + assert result["artifact_id"] is not None + assert result["size"] == 1 + + @pytest.mark.integration + def test_file_size_reported_correctly( + self, integration_client, test_package, unique_test_id + ): + """Test that file size is correctly reported in response.""" + project, package = test_package + content = f"Test content for size check {unique_test_id}".encode() + expected_size = len(content) + + result = upload_test_file( + integration_client, project, package, content, tag="size-test" + ) + + assert result["size"] == expected_size + + # Also verify via artifact endpoint + artifact_response = integration_client.get( + f"/api/v1/artifact/{result['artifact_id']}" + ) + assert artifact_response.json()["size"] == expected_size diff --git a/docs/design/deduplication-design.md b/docs/design/deduplication-design.md new file mode 100644 index 0000000..a7e1f88 --- /dev/null +++ b/docs/design/deduplication-design.md @@ -0,0 +1,575 @@ +# Deduplication Design Document + +This document defines Orchard's content-addressable storage and deduplication approach using SHA256 hashes. + +## Table of Contents + +1. [Overview](#overview) +2. [Hash Algorithm Selection](#hash-algorithm-selection) +3. [Content-Addressable Storage Model](#content-addressable-storage-model) +4. [S3 Key Derivation](#s3-key-derivation) +5. [Duplicate Detection Strategy](#duplicate-detection-strategy) +6. [Reference Counting Lifecycle](#reference-counting-lifecycle) +7. [Edge Cases and Error Handling](#edge-cases-and-error-handling) +8. [Collision Handling](#collision-handling) +9. [Performance Considerations](#performance-considerations) +10. [Operations Runbook](#operations-runbook) + +--- + +## Overview + +Orchard uses **whole-file deduplication** based on content hashing. When a file is uploaded: + +1. The SHA256 hash of the entire file content is computed +2. 
The hash becomes the artifact's primary identifier +3. If a file with the same hash already exists, no duplicate is stored +4. Multiple tags/references can point to the same artifact + +**Scope:** Orchard implements whole-file deduplication only. Chunk-level or block-level deduplication is out of scope for MVP. + +--- + +## Hash Algorithm Selection + +### Decision: SHA256 + +| Criteria | SHA256 | SHA1 | MD5 | Blake3 | +|----------|--------|------|-----|--------| +| Security | Strong (256-bit) | Weak (broken) | Weak (broken) | Strong | +| Speed | ~400 MB/s | ~600 MB/s | ~800 MB/s | ~1500 MB/s | +| Collision Resistance | 2^128 | Broken | Broken | 2^128 | +| Industry Adoption | Universal | Legacy | Legacy | Emerging | +| Tool Ecosystem | Excellent | Good | Good | Growing | + +### Rationale + +1. **Security**: SHA256 has no known practical collision attacks. SHA1 and MD5 are cryptographically broken. + +2. **Collision Resistance**: With 256-bit output, the probability of accidental collision is approximately 2^-128 (~10^-38). To have a 50% chance of collision, you would need approximately 2^128 unique files. + +3. **Industry Standard**: SHA256 is the de facto standard for content-addressable storage (Git, Docker, npm, etc.). + +4. **Performance**: While Blake3 is faster, SHA256 throughput (~400 MB/s) exceeds typical network bandwidth for uploads. The bottleneck is I/O, not hashing. + +5. **Tooling**: Universal support in all languages, operating systems, and verification tools. + +### Migration Path + +If a future algorithm change is needed (e.g., SHA3 or Blake3): + +1. **Database**: Add `hash_algorithm` column to artifacts table (default: 'sha256') +2. **S3 Keys**: New algorithm uses different prefix (e.g., `fruits-sha3/` vs `fruits/`) +3. **API**: Accept algorithm hint in upload, return algorithm in responses +4. **Migration**: Background job to re-hash existing artifacts if needed + +**Current Implementation**: Single algorithm (SHA256), no algorithm versioning required for MVP. + +--- + +## Content-Addressable Storage Model + +### Core Principles + +1. **Identity = Content**: The artifact ID IS the SHA256 hash of its content +2. **Immutability**: Content cannot change after storage (same hash = same content) +3. **Deduplication**: Same content uploaded twice results in single storage +4. **Metadata Independence**: Files with identical content but different names/types are deduplicated + +### Data Model + +``` +Artifact { + id: VARCHAR(64) PRIMARY KEY -- SHA256 hash (lowercase hex) + size: BIGINT -- File size in bytes + ref_count: INTEGER -- Number of references + s3_key: VARCHAR(1024) -- S3 storage path + checksum_md5: VARCHAR(32) -- Secondary checksum + checksum_sha1: VARCHAR(40) -- Secondary checksum + ... +} + +Tag { + id: UUID PRIMARY KEY + name: VARCHAR(255) + package_id: UUID FK + artifact_id: VARCHAR(64) FK -- Points to Artifact.id (SHA256) +} +``` + +### Hash Format + +- Algorithm: SHA256 +- Output: 64 lowercase hexadecimal characters +- Example: `dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f` + +--- + +## S3 Key Derivation + +### Key Structure + +``` +fruits/{hash[0:2]}/{hash[2:4]}/{full_hash} +``` + +Example for hash `dffd6021bb2bd5b0...`: +``` +fruits/df/fd/dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f +``` + +### Rationale for Prefix Sharding + +1. **S3 Performance**: S3 partitions by key prefix. Distributing across prefixes improves throughput. + +2. 
**Filesystem Compatibility**: When using filesystem-backed storage, avoids single directory with millions of files. + +3. **Distribution**: With 2-character prefixes (256 combinations each level), provides 65,536 (256 x 256) top-level buckets. + +### Bucket Distribution Analysis + +Assuming uniformly distributed SHA256 hashes: + +| Artifacts | Files per Prefix (avg) | Max per Prefix (99.9%) | +|-----------|------------------------|------------------------| +| 100,000 | 1.5 | 10 | +| 1,000,000 | 15 | 50 | +| 10,000,000 | 152 | 250 | +| 100,000,000 | 1,525 | 2,000 | + +The two-level prefix provides excellent distribution up to hundreds of millions of artifacts. + +--- + +## Duplicate Detection Strategy + +### Upload Flow + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ UPLOAD REQUEST │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ 1. VALIDATE: Check file size limits (min/max) │ +│ - Empty files (0 bytes) → Reject with 422 │ +│ - Exceeds max_file_size → Reject with 413 │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ 2. COMPUTE HASH: Stream file through SHA256/MD5/SHA1 │ +│ - Use 8MB chunks for memory efficiency │ +│ - Single pass for all three hashes │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ 3. DERIVE S3 KEY: fruits/{hash[0:2]}/{hash[2:4]}/{hash} │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ 4. CHECK EXISTENCE: HEAD request to S3 for derived key │ +│ - Retry up to 3 times on transient failures │ +└─────────────────────────────────────────────────────────────────┘ + │ + ┌───────────────┴───────────────┐ + ▼ ▼ +┌─────────────────────────┐ ┌─────────────────────────────────┐ +│ EXISTS: Deduplicated │ │ NOT EXISTS: Upload to S3 │ +│ - Verify size matches │ │ - PUT object (or multipart) │ +│ - Skip S3 upload │ │ - Abort on failure │ +│ - Log saved bytes │ └─────────────────────────────────┘ +└─────────────────────────┘ │ + │ │ + └───────────────┬───────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ 5. DATABASE: Create/update artifact record │ +│ - Use row locking to prevent race conditions │ +│ - ref_count managed by SQL triggers │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ 6. 
CREATE TAG: If tag provided, create/update tag │ +│ - SQL trigger increments ref_count │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Hash Computation + +**Memory Requirements:** +- Chunk size: 8MB (`HASH_CHUNK_SIZE`) +- Working memory: ~25MB (8MB chunk + hash states) +- Independent of file size (streaming) + +**Throughput:** +- SHA256 alone: ~400 MB/s on modern CPU +- With MD5 + SHA1: ~300 MB/s (parallel computation) +- Typical bottleneck: Network I/O, not CPU + +### Multipart Upload Threshold + +Files larger than 100MB use S3 multipart upload: +- First pass: Stream to compute hashes +- If not duplicate: Seek to start, upload in 10MB parts +- On failure: Abort multipart upload (no orphaned parts) + +--- + +## Reference Counting Lifecycle + +### What Constitutes a "Reference" + +A reference is a **Tag** pointing to an artifact. Each tag increments the ref_count by 1. + +**Uploads do NOT directly increment ref_count** - only tag creation does. + +### Lifecycle + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ CREATE: New artifact uploaded │ +│ - ref_count = 0 (no tags yet) │ +│ - Artifact exists but is "orphaned" │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ TAG CREATED: Tag points to artifact │ +│ - SQL trigger: ref_count += 1 │ +│ - Artifact is now referenced │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ TAG UPDATED: Tag moved to different artifact │ +│ - SQL trigger on old artifact: ref_count -= 1 │ +│ - SQL trigger on new artifact: ref_count += 1 │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ TAG DELETED: Tag removed │ +│ - SQL trigger: ref_count -= 1 │ +│ - If ref_count = 0, artifact is orphaned │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ GARBAGE COLLECTION: Clean up orphaned artifacts │ +│ - Triggered manually via admin endpoint │ +│ - Finds artifacts where ref_count = 0 │ +│ - Deletes from S3 and database │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### SQL Triggers + +Three triggers manage ref_count automatically: + +1. **`tags_ref_count_insert_trigger`**: On tag INSERT, increment target artifact's ref_count +2. **`tags_ref_count_delete_trigger`**: On tag DELETE, decrement target artifact's ref_count +3. **`tags_ref_count_update_trigger`**: On tag UPDATE (artifact_id changed), decrement old, increment new + +### Garbage Collection + +**Trigger**: Manual admin endpoint (`POST /api/v1/admin/garbage-collect`) + +**Process**: +1. Query artifacts where `ref_count = 0` +2. 
For each orphan: + - Delete from S3 (`DELETE fruits/xx/yy/hash`) + - Delete from database + - Log deletion + +**Safety**: +- Dry-run mode by default (`?dry_run=true`) +- Limit per run (`?limit=100`) +- Check constraint prevents ref_count < 0 + +--- + +## Edge Cases and Error Handling + +### Empty Files + +- **Behavior**: Rejected with HTTP 422 +- **Reason**: Empty content has deterministic hash but provides no value +- **Error**: "Empty files are not allowed" + +### Maximum File Size + +- **Default Limit**: 10GB (`ORCHARD_MAX_FILE_SIZE`) +- **Configurable**: Via environment variable +- **Behavior**: Rejected with HTTP 413 before upload begins +- **Error**: "File too large. Maximum size is 10GB" + +### Concurrent Upload of Same Content + +**Race Condition Scenario**: Two clients upload identical content simultaneously. + +**Handling**: +1. **S3 Level**: Both compute same hash, both check existence, both may upload +2. **Database Level**: Row-level locking with `SELECT ... FOR UPDATE` +3. **Outcome**: One creates artifact, other sees it exists, both succeed +4. **Trigger Safety**: SQL triggers are atomic per row + +**No Data Corruption**: S3 is eventually consistent; identical content = identical result. + +### Upload Interrupted + +**Scenario**: Upload fails after hash computed but before S3 write completes. + +**Simple Upload**: +- S3 put_object is atomic - either completes or fails entirely +- No cleanup needed + +**Multipart Upload**: +- On any failure, `abort_multipart_upload` is called +- S3 cleans up partial parts +- No orphaned data + +### DB Exists but S3 Missing + +**Detection**: Download request finds artifact in DB but S3 returns 404. + +**Current Behavior**: Return 500 error to client. + +**Recovery Options** (not yet implemented): +1. Mark artifact for re-upload (set flag, notify admins) +2. Decrement ref_count to trigger garbage collection +3. Return specific error code for client retry + +**Recommended**: Log critical alert, return 503 with retry hint. + +### S3 Exists but DB Missing + +**Detection**: Orphan - file in S3 with no corresponding DB record. + +**Cause**: +- Failed transaction after S3 upload +- Manual S3 manipulation +- Database restore from backup + +**Recovery**: +- Garbage collection won't delete (no DB record to query) +- Requires S3 bucket scan + DB reconciliation +- Manual admin task (out of scope for MVP) + +### Network Timeout During Existence Check + +**Behavior**: Retry up to 3 times with adaptive backoff. + +**After Retries Exhausted**: Raise `S3ExistenceCheckError`, return 503 to client. + +**Rationale**: Don't upload without knowing if duplicate exists (prevents orphans). + +--- + +## Collision Handling + +### SHA256 Collision Probability + +For random inputs, the probability of collision is: + +``` +P(collision) ≈ n² / 2^257 + +Where n = number of unique files +``` + +| Files | Collision Probability | +|-------|----------------------| +| 10^9 (1 billion) | 10^-59 | +| 10^12 (1 trillion) | 10^-53 | +| 10^18 | 10^-41 | + +**Practical Assessment**: You would need to store more files than atoms in the observable universe to have meaningful collision risk. + +### Detection Mechanism + +Despite near-zero probability, we detect potential collisions by: + +1. **Size Comparison**: If hash matches but sizes differ, CRITICAL alert +2. **ETag Verification**: S3 ETag provides secondary check + +### Handling Procedure + +If collision detected (size mismatch): + +1. **Log CRITICAL alert** with full details +2. **Reject upload** with 500 error +3. 
**Do NOT overwrite** existing content +4. **Notify operations** for manual investigation + +```python +raise HashCollisionError( + f"Hash collision detected for {sha256_hash}: size mismatch" +) +``` + +### MVP Position + +For MVP, we: +- Detect collisions via size mismatch +- Log and alert on detection +- Reject conflicting upload +- Accept that true collisions are practically impossible + +No active mitigation (e.g., storing hash + size as composite key) is needed. + +--- + +## Performance Considerations + +### Hash Computation Overhead + +| File Size | Hash Time | Upload Time (100 Mbps) | Overhead | +|-----------|-----------|------------------------|----------| +| 10 MB | 25ms | 800ms | 3% | +| 100 MB | 250ms | 8s | 3% | +| 1 GB | 2.5s | 80s | 3% | +| 10 GB | 25s | 800s | 3% | + +**Conclusion**: Hash computation adds ~3% overhead regardless of file size. Network I/O dominates. + +### Existence Check Overhead + +- S3 HEAD request: ~50-100ms per call +- Cached in future: Could use Redis/memory cache for hot paths +- Current MVP: No caching (acceptable for expected load) + +### Deduplication Savings + +Example with 50% duplication rate: + +| Metric | Without Dedup | With Dedup | Savings | +|--------|---------------|------------|---------| +| Storage (100K files, 10MB avg) | 1 TB | 500 GB | 50% | +| Upload bandwidth | 1 TB | 500 GB | 50% | +| S3 costs | $23/mo | $11.50/mo | 50% | + +--- + +## Operations Runbook + +### Monitoring Deduplication + +```bash +# View deduplication stats +curl http://orchard:8080/api/v1/stats/deduplication + +# Response includes: +# - deduplication_ratio +# - total_uploads, deduplicated_uploads +# - bytes_saved +``` + +### Checking for Orphaned Artifacts + +```bash +# List orphaned artifacts (ref_count = 0) +curl http://orchard:8080/api/v1/admin/orphaned-artifacts + +# Dry-run garbage collection +curl -X POST "http://orchard:8080/api/v1/admin/garbage-collect?dry_run=true" + +# Execute garbage collection +curl -X POST "http://orchard:8080/api/v1/admin/garbage-collect?dry_run=false" +``` + +### Verifying Artifact Integrity + +```bash +# Download and verify hash matches artifact ID +ARTIFACT_ID="dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f" +curl -O http://orchard:8080/api/v1/artifact/$ARTIFACT_ID/download +COMPUTED=$(sha256sum downloaded_file | cut -d' ' -f1) +[ "$ARTIFACT_ID" = "$COMPUTED" ] && echo "OK" || echo "INTEGRITY FAILURE" +``` + +### Troubleshooting + +| Symptom | Likely Cause | Resolution | +|---------|--------------|------------| +| "Hash computation error" | Empty file or read error | Check file content, retry | +| "Storage unavailable" | S3/MinIO down | Check S3 health, retry | +| "File too large" | Exceeds max_file_size | Adjust config or use chunked upload | +| "Hash collision detected" | Extremely rare | Investigate, do not ignore | +| Orphaned artifacts accumulating | Tags deleted, no GC run | Run garbage collection | +| Download returns 404 | S3 object missing | Check S3 bucket, restore from backup | + +### Configuration Reference + +| Variable | Default | Description | +|----------|---------|-------------| +| `ORCHARD_MAX_FILE_SIZE` | 10GB | Maximum upload size | +| `ORCHARD_MIN_FILE_SIZE` | 1 | Minimum upload size (rejects empty) | +| `ORCHARD_S3_MAX_RETRIES` | 3 | Retry attempts for S3 operations | +| `ORCHARD_S3_CONNECT_TIMEOUT` | 10s | S3 connection timeout | +| `ORCHARD_S3_READ_TIMEOUT` | 60s | S3 read timeout | + +--- + +## Appendix: Decision Records + +### ADR-001: SHA256 for Content Hashing + +**Status**: Accepted + 
+**Context**: Need deterministic content identifier for deduplication. + +**Decision**: Use SHA256. + +**Rationale**: +- Cryptographically strong (no known attacks) +- Universal adoption (Git, Docker, npm) +- Sufficient speed for I/O-bound workloads +- Excellent tooling + +**Consequences**: +- 64-character artifact IDs (longer than UUIDs) +- CPU overhead ~3% of upload time +- Future algorithm migration requires versioning + +### ADR-002: Whole-File Deduplication Only + +**Status**: Accepted + +**Context**: Could implement chunk-level deduplication for better savings. + +**Decision**: Whole-file only for MVP. + +**Rationale**: +- Simpler implementation +- No chunking algorithm complexity +- Sufficient for build artifact use case +- Can add chunk-level later if needed + +**Consequences**: +- Files with partial overlap stored entirely +- Large files with small changes not deduplicated +- Acceptable for binary artifact workloads + +### ADR-003: SQL Triggers for ref_count + +**Status**: Accepted + +**Context**: ref_count must be accurate for garbage collection. + +**Decision**: Use PostgreSQL triggers, not application code. + +**Rationale**: +- Atomic with tag operations +- Cannot be bypassed +- Works regardless of client (API, direct SQL, migrations) +- Simpler application code + +**Consequences**: +- Trigger logic in SQL (less visible) +- Must maintain triggers across schema changes +- Debugging requires database access
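
---

To make the scheme above concrete, here is a minimal Python sketch of the hashing and key-derivation steps described in this document (8MB streaming chunks, a single pass for SHA256/MD5/SHA1, and the `fruits/{hash[0:2]}/{hash[2:4]}/{hash}` key layout). The function names are illustrative only and are not assumed to match the backend's actual helpers; only the `HASH_CHUNK_SIZE` constant name is taken from the "Hash Computation" section above.

```python
import hashlib
from typing import BinaryIO

HASH_CHUNK_SIZE = 8 * 1024 * 1024  # 8MB streaming chunk size, per "Hash Computation"


def compute_digests(stream: BinaryIO) -> dict[str, str]:
    """Stream the upload once and compute SHA256/MD5/SHA1 as lowercase hex."""
    sha256, md5, sha1 = hashlib.sha256(), hashlib.md5(), hashlib.sha1()
    while chunk := stream.read(HASH_CHUNK_SIZE):
        sha256.update(chunk)
        md5.update(chunk)
        sha1.update(chunk)
    return {
        "sha256": sha256.hexdigest(),  # becomes the artifact ID
        "md5": md5.hexdigest(),        # secondary checksum
        "sha1": sha1.hexdigest(),      # secondary checksum
    }


def derive_s3_key(sha256_hash: str) -> str:
    """Derive the sharded S3 key: fruits/{hash[0:2]}/{hash[2:4]}/{full_hash}."""
    return f"fruits/{sha256_hash[0:2]}/{sha256_hash[2:4]}/{sha256_hash}"
```

For the example hash used earlier, `derive_s3_key("dffd6021bb2bd5b0...")` yields `fruits/df/fd/dffd6021bb2bd5b0...`, matching the S3 Key Derivation section.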