Add storage abstraction, stats endpoints, garbage collection, and test infrastructure

- Add StorageBackend protocol for backend-agnostic storage interface
- Add health check with storage and database connectivity verification
- Add garbage collection endpoints for orphaned artifacts (ref_count=0)
- Add deduplication statistics endpoints (/api/v1/stats, /stats/storage, /stats/deduplication)
- Add per-project statistics endpoint
- Add verify_integrity method for post-upload hash validation
- Set up pytest infrastructure with mock S3 client
- Add unit tests for hash calculation and duplicate detection
This commit is contained in:
Mondo Diaz
2026-01-05 11:16:46 -06:00
parent dbe78ded2f
commit 109677e43a
9 changed files with 1311 additions and 5 deletions

View File

@@ -387,3 +387,72 @@ class PresignedUrlResponse(BaseModel):
class HealthResponse(BaseModel):
status: str
version: str = "1.0.0"
storage_healthy: Optional[bool] = None
database_healthy: Optional[bool] = None
# Garbage collection schemas
class GarbageCollectionResponse(BaseModel):
"""Response from garbage collection operation"""
artifacts_deleted: int
bytes_freed: int
artifact_ids: List[str]
dry_run: bool
class OrphanedArtifactResponse(BaseModel):
"""Information about an orphaned artifact"""
id: str
size: int
created_at: datetime
created_by: str
original_name: Optional[str]
# Storage statistics schemas
class StorageStatsResponse(BaseModel):
"""Global storage statistics"""
total_artifacts: int
total_size_bytes: int
unique_artifacts: int # Artifacts with ref_count > 0
orphaned_artifacts: int # Artifacts with ref_count = 0
orphaned_size_bytes: int
total_uploads: int
deduplicated_uploads: int
deduplication_ratio: (
float # total_uploads / unique_artifacts (if > 1, deduplication is working)
)
storage_saved_bytes: int # Bytes saved through deduplication
class DeduplicationStatsResponse(BaseModel):
"""Deduplication effectiveness statistics"""
total_logical_bytes: (
int # Sum of all upload sizes (what would be stored without dedup)
)
total_physical_bytes: int # Actual storage used
bytes_saved: int
savings_percentage: float
total_uploads: int
unique_artifacts: int
duplicate_uploads: int
average_ref_count: float
max_ref_count: int
most_referenced_artifacts: List[Dict[str, Any]] # Top N most referenced
class ProjectStatsResponse(BaseModel):
"""Per-project statistics"""
project_id: str
project_name: str
package_count: int
tag_count: int
artifact_count: int
total_size_bytes: int
upload_count: int
deduplicated_uploads: int