Add ref_count management for deletions with atomic operations and error handling

Author: Mondo Diaz
Date: 2026-01-06 13:44:23 -06:00
Parent: 66622caf5d
Commit: 7e68baed08
24 changed files with 6888 additions and 329 deletions


@@ -1,6 +1,6 @@
 from datetime import datetime
 from typing import Optional, List, Dict, Any, Generic, TypeVar
-from pydantic import BaseModel
+from pydantic import BaseModel, field_validator
 from uuid import UUID

 T = TypeVar("T")
@@ -40,8 +40,28 @@ class ProjectResponse(BaseModel):
 # Package format and platform enums
-PACKAGE_FORMATS = ["generic", "npm", "pypi", "docker", "deb", "rpm", "maven", "nuget", "helm"]
-PACKAGE_PLATFORMS = ["any", "linux", "darwin", "windows", "linux-amd64", "linux-arm64", "darwin-amd64", "darwin-arm64", "windows-amd64"]
+PACKAGE_FORMATS = [
+    "generic",
+    "npm",
+    "pypi",
+    "docker",
+    "deb",
+    "rpm",
+    "maven",
+    "nuget",
+    "helm",
+]
+PACKAGE_PLATFORMS = [
+    "any",
+    "linux",
+    "darwin",
+    "windows",
+    "linux-amd64",
+    "linux-arm64",
+    "darwin-amd64",
+    "darwin-arm64",
+    "windows-amd64",
+]

 # Package schemas
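
Note: the two lists are only reflowed to one entry per line; their contents are unchanged. As a hypothetical sketch of how such lists are typically enforced at the API boundary (the enforcing code is not part of this hunk):

def validate_format(fmt: str) -> str:
    # Hypothetical helper, not from this diff: plain membership check
    # against the module-level list above.
    if fmt not in PACKAGE_FORMATS:
        raise ValueError(f"unsupported package format: {fmt!r}")
    return fmt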
@@ -68,6 +88,7 @@ class PackageResponse(BaseModel):
 class TagSummary(BaseModel):
     """Lightweight tag info for embedding in package responses"""
+
     name: str
     artifact_id: str
     created_at: datetime
@@ -75,6 +96,7 @@ class TagSummary(BaseModel):
 class PackageDetailResponse(BaseModel):
     """Package with aggregated metadata"""
+
     id: UUID
     project_id: UUID
     name: str
@@ -135,6 +157,7 @@ class TagResponse(BaseModel):
 class TagDetailResponse(BaseModel):
     """Tag with embedded artifact metadata"""
+
     id: UUID
     package_id: UUID
     name: str
@@ -154,6 +177,7 @@ class TagDetailResponse(BaseModel):
 class TagHistoryResponse(BaseModel):
     """History entry for tag changes"""
+
     id: UUID
     tag_id: UUID
     old_artifact_id: Optional[str]
@@ -167,6 +191,7 @@ class TagHistoryResponse(BaseModel):
 class ArtifactTagInfo(BaseModel):
     """Tag info for embedding in artifact responses"""
+
     id: UUID
     name: str
     package_id: UUID
@@ -176,6 +201,7 @@ class ArtifactTagInfo(BaseModel):
 class ArtifactDetailResponse(BaseModel):
     """Artifact with list of tags/packages referencing it"""
+
     id: str
     sha256: str  # Explicit SHA256 field (same as id)
     size: int
@@ -196,6 +222,7 @@ class ArtifactDetailResponse(BaseModel):
 class PackageArtifactResponse(BaseModel):
     """Artifact with tags for package artifact listing"""
+
     id: str
     sha256: str  # Explicit SHA256 field (same as id)
     size: int
@@ -226,20 +253,35 @@ class UploadResponse(BaseModel):
     s3_etag: Optional[str] = None
     format_metadata: Optional[Dict[str, Any]] = None
     deduplicated: bool = False
+    ref_count: int = 1  # Current reference count after this upload


 # Resumable upload schemas
 class ResumableUploadInitRequest(BaseModel):
     """Request to initiate a resumable upload"""
+
     expected_hash: str  # SHA256 hash of the file (client must compute)
     filename: str
     content_type: Optional[str] = None
     size: int
     tag: Optional[str] = None

+    @field_validator("expected_hash")
+    @classmethod
+    def validate_sha256_hash(cls, v: str) -> str:
+        """Validate that expected_hash is a valid 64-character lowercase hex SHA256 hash."""
+        import re
+
+        if not re.match(r"^[a-f0-9]{64}$", v.lower()):
+            raise ValueError(
+                "expected_hash must be a valid 64-character lowercase hexadecimal SHA256 hash"
+            )
+        return v.lower()  # Normalize to lowercase


 class ResumableUploadInitResponse(BaseModel):
     """Response from initiating a resumable upload"""
+
     upload_id: Optional[str]  # None if file already exists
     already_exists: bool
     artifact_id: Optional[str] = None  # Set if already_exists is True
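
A quick illustration of the new validator's behavior (illustrative snippet, not part of the diff; it assumes the schema module is importable):

import hashlib
from pydantic import ValidationError

data = b"hello world"
req = ResumableUploadInitRequest(
    expected_hash=hashlib.sha256(data).hexdigest().upper(),  # uppercase input is accepted...
    filename="hello.txt",
    size=len(data),
)
assert req.expected_hash == hashlib.sha256(data).hexdigest()  # ...and normalized to lowercase

try:
    ResumableUploadInitRequest(expected_hash="not-a-hash", filename="x", size=1)
except ValidationError as exc:
    print(exc.errors()[0]["msg"])  # reports the 64-character hex requirement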
@@ -248,17 +290,20 @@ class ResumableUploadInitResponse(BaseModel):
 class ResumableUploadPartResponse(BaseModel):
     """Response from uploading a part"""
+
     part_number: int
     etag: str


 class ResumableUploadCompleteRequest(BaseModel):
     """Request to complete a resumable upload"""
+
     tag: Optional[str] = None


 class ResumableUploadCompleteResponse(BaseModel):
     """Response from completing a resumable upload"""
+
     artifact_id: str
     size: int
     project: str
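
Taken together with the init schemas above, these map onto a conventional multipart flow. A hypothetical client-side sequence (the client object and endpoint paths are illustrative, not from this diff):

import hashlib

def resumable_upload(client, data: bytes, filename: str) -> str:
    # 1. Init: server answers with ResumableUploadInitResponse.
    init = client.post("/uploads", json={
        "expected_hash": hashlib.sha256(data).hexdigest(),
        "filename": filename,
        "size": len(data),
    }).json()
    if init["already_exists"]:
        return init["artifact_id"]  # deduplicated: no bytes transferred
    # 2. Parts: each PUT answers with ResumableUploadPartResponse (part_number, etag).
    client.put(f"/uploads/{init['upload_id']}/parts/1", content=data)
    # 3. Complete: server answers with ResumableUploadCompleteResponse.
    done = client.post(f"/uploads/{init['upload_id']}/complete", json={}).json()
    return done["artifact_id"]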
@@ -268,6 +313,7 @@ class ResumableUploadCompleteResponse(BaseModel):
 class ResumableUploadStatusResponse(BaseModel):
     """Status of a resumable upload"""
+
     upload_id: str
     uploaded_parts: List[int]
     total_uploaded_bytes: int
@@ -288,6 +334,7 @@ class ConsumerResponse(BaseModel):
 # Global search schemas
 class SearchResultProject(BaseModel):
     """Project result for global search"""
+
     id: UUID
     name: str
     description: Optional[str]
@@ -299,6 +346,7 @@ class SearchResultProject(BaseModel):
 class SearchResultPackage(BaseModel):
     """Package result for global search"""
+
     id: UUID
     project_id: UUID
     project_name: str
@@ -312,6 +360,7 @@ class SearchResultPackage(BaseModel):
 class SearchResultArtifact(BaseModel):
     """Artifact/tag result for global search"""
+
     tag_id: UUID
     tag_name: str
     artifact_id: str
@@ -323,6 +372,7 @@ class SearchResultArtifact(BaseModel):
 class GlobalSearchResponse(BaseModel):
     """Combined search results across all entity types"""
+
     query: str
     projects: List[SearchResultProject]
     packages: List[SearchResultPackage]
@@ -333,6 +383,7 @@ class GlobalSearchResponse(BaseModel):
 # Presigned URL response
 class PresignedUrlResponse(BaseModel):
     """Response containing a presigned URL for direct S3 download"""
+
     url: str
     expires_at: datetime
     method: str = "GET"
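
For context, a response like this is typically populated from the storage client. A minimal sketch with boto3 (the bucket, key, and s3_client wiring are assumptions, not from this diff):

from datetime import datetime, timedelta, timezone

expires_in = 3600
url = s3_client.generate_presigned_url(  # boto3 S3 client, assumed already configured
    "get_object",
    Params={"Bucket": "artifacts", "Key": "example-object-key"},
    ExpiresIn=expires_in,
)
resp = PresignedUrlResponse(
    url=url,
    expires_at=datetime.now(timezone.utc) + timedelta(seconds=expires_in),
)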
@@ -348,3 +399,131 @@ class PresignedUrlResponse(BaseModel):
 class HealthResponse(BaseModel):
     status: str
     version: str = "1.0.0"
+    storage_healthy: Optional[bool] = None
+    database_healthy: Optional[bool] = None
+
+
+# Garbage collection schemas
+class GarbageCollectionResponse(BaseModel):
+    """Response from garbage collection operation"""
+
+    artifacts_deleted: int
+    bytes_freed: int
+    artifact_ids: List[str]
+    dry_run: bool
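
How a dry-run result for this schema might be assembled, given the commit's ref_count bookkeeping. A sketch under the assumption that orphaned means ref_count == 0, as the stats schemas below state; the real query lives in the service layer, not in this diff:

orphans = [a for a in artifacts if a.ref_count == 0]  # `artifacts` assumed: iterable of ORM rows
report = GarbageCollectionResponse(
    artifacts_deleted=len(orphans),
    bytes_freed=sum(a.size for a in orphans),
    artifact_ids=[a.id for a in orphans],
    dry_run=True,  # report only; nothing is actually deleted
)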
+class OrphanedArtifactResponse(BaseModel):
+    """Information about an orphaned artifact"""
+
+    id: str
+    size: int
+    created_at: datetime
+    created_by: str
+    original_name: Optional[str]
+# Storage statistics schemas
+class StorageStatsResponse(BaseModel):
+    """Global storage statistics"""
+
+    total_artifacts: int
+    total_size_bytes: int
+    unique_artifacts: int  # Artifacts with ref_count > 0
+    orphaned_artifacts: int  # Artifacts with ref_count = 0
+    orphaned_size_bytes: int
+    total_uploads: int
+    deduplicated_uploads: int
+    deduplication_ratio: (
+        float  # total_uploads / unique_artifacts (if > 1, deduplication is working)
+    )
+    storage_saved_bytes: int  # Bytes saved through deduplication
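
A worked example of the documented ratio (numbers illustrative):

total_uploads, unique_artifacts = 150, 100
deduplication_ratio = total_uploads / unique_artifacts  # 1.5: on average 1.5 uploads share each stored artifact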
+class DeduplicationStatsResponse(BaseModel):
+    """Deduplication effectiveness statistics"""
+
+    total_logical_bytes: (
+        int  # Sum of all upload sizes (what would be stored without dedup)
+    )
+    total_physical_bytes: int  # Actual storage used
+    bytes_saved: int
+    savings_percentage: float
+    total_uploads: int
+    unique_artifacts: int
+    duplicate_uploads: int
+    average_ref_count: float
+    max_ref_count: int
+    most_referenced_artifacts: List[Dict[str, Any]]  # Top N most referenced
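
The first two fields define the accounting and the savings fields follow from them. For instance (numbers illustrative):

total_logical_bytes = 500 * 2**20   # sum of all upload sizes, as if nothing were deduplicated
total_physical_bytes = 350 * 2**20  # what is actually stored once
bytes_saved = total_logical_bytes - total_physical_bytes        # 150 MiB
savings_percentage = 100.0 * bytes_saved / total_logical_bytes  # 30.0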
+class ProjectStatsResponse(BaseModel):
+    """Per-project statistics"""
+
+    project_id: str
+    project_name: str
+    package_count: int
+    tag_count: int
+    artifact_count: int
+    total_size_bytes: int
+    upload_count: int
+    deduplicated_uploads: int
+    storage_saved_bytes: int = 0  # Bytes saved through deduplication
+    deduplication_ratio: float = 1.0  # upload_count / artifact_count
+class PackageStatsResponse(BaseModel):
+    """Per-package statistics"""
+
+    package_id: str
+    package_name: str
+    project_name: str
+    tag_count: int
+    artifact_count: int
+    total_size_bytes: int
+    upload_count: int
+    deduplicated_uploads: int
+    storage_saved_bytes: int = 0
+    deduplication_ratio: float = 1.0
+class ArtifactStatsResponse(BaseModel):
+    """Per-artifact reference statistics"""
+
+    artifact_id: str
+    sha256: str
+    size: int
+    ref_count: int
+    storage_savings: int  # (ref_count - 1) * size
+    tags: List[Dict[str, Any]]  # Tags referencing this artifact
+    projects: List[str]  # Projects using this artifact
+    packages: List[str]  # Packages using this artifact
+    first_uploaded: Optional[datetime] = None
+    last_referenced: Optional[datetime] = None
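
The storage_savings comment is the whole formula: with ref_count references, one copy is stored and the other ref_count - 1 cost nothing. For example:

ref_count, size = 4, 25 * 2**20
storage_savings = (ref_count - 1) * size  # 75 MiB saved: 4 references, 1 stored copy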
+class CrossProjectDeduplicationResponse(BaseModel):
+    """Cross-project deduplication statistics"""
+
+    shared_artifacts_count: int  # Artifacts used in multiple projects
+    total_cross_project_savings: int  # Bytes saved by cross-project sharing
+    shared_artifacts: List[Dict[str, Any]]  # Details of shared artifacts
+class TimeBasedStatsResponse(BaseModel):
+    """Time-based deduplication statistics"""
+
+    period: str  # "daily", "weekly", "monthly"
+    start_date: datetime
+    end_date: datetime
+    data_points: List[
+        Dict[str, Any]
+    ]  # List of {date, uploads, unique, duplicated, bytes_saved}
+class StatsReportResponse(BaseModel):
+    """Summary report in various formats"""
+
+    format: str  # "json", "csv", "markdown"
+    generated_at: datetime
+    content: str  # The report content
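
A hypothetical instance showing the intended shape of a markdown report (field values are illustrative, not from this diff):

from datetime import datetime, timezone

report = StatsReportResponse(
    format="markdown",
    generated_at=datetime.now(timezone.utc),
    content="| metric | value |\n| --- | --- |\n| total_artifacts | 100 |",
)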