Metadata database tracks all uploads with project, package, tag, and timestamp, queryable via the API

Mondo Diaz
2026-01-07 12:31:44 -06:00
parent 81458b3bcb
commit 2f1891cf01
24 changed files with 5044 additions and 2123 deletions
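The commit message above promises project/package/tag/timestamp lookups; below is a minimal sketch of that query pattern against the models added in this commit. The import paths and SessionLocal factory are assumptions; only the models themselves appear in this diff.

from datetime import datetime, timedelta

from app.db import SessionLocal  # hypothetical session factory
from app.models import Package, Project, Upload  # import paths assumed


def recent_uploads(project: str, package: str, days: int = 7):
    # Uploads for one package in the last N days, newest first.
    since = datetime.utcnow() - timedelta(days=days)
    with SessionLocal() as db:
        return (
            db.query(Upload)
            .join(Package, Upload.package_id == Package.id)
            .join(Project, Package.project_id == Project.id)
            .filter(
                Project.name == project,
                Package.name == package,
                Upload.uploaded_at >= since,
            )
            .order_by(Upload.uploaded_at.desc())
            .all()
        )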


@@ -1,8 +1,16 @@
from datetime import datetime
from typing import Optional
from sqlalchemy import (
Column, String, Text, Boolean, Integer, BigInteger,
DateTime, ForeignKey, CheckConstraint, Index, JSON
Column,
String,
Text,
Boolean,
Integer,
BigInteger,
DateTime,
ForeignKey,
CheckConstraint,
Index,
JSON,
)
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import relationship, declarative_base
@@ -19,11 +27,17 @@ class Project(Base):
description = Column(Text)
is_public = Column(Boolean, default=True)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow)
updated_at = Column(
DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow
)
created_by = Column(String(255), nullable=False)
packages = relationship("Package", back_populates="project", cascade="all, delete-orphan")
permissions = relationship("AccessPermission", back_populates="project", cascade="all, delete-orphan")
packages = relationship(
"Package", back_populates="project", cascade="all, delete-orphan"
)
permissions = relationship(
"AccessPermission", back_populates="project", cascade="all, delete-orphan"
)
__table_args__ = (
Index("idx_projects_name", "name"),
@@ -35,32 +49,44 @@ class Package(Base):
__tablename__ = "packages"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
project_id = Column(UUID(as_uuid=True), ForeignKey("projects.id", ondelete="CASCADE"), nullable=False)
project_id = Column(
UUID(as_uuid=True),
ForeignKey("projects.id", ondelete="CASCADE"),
nullable=False,
)
name = Column(String(255), nullable=False)
description = Column(Text)
format = Column(String(50), default="generic", nullable=False)
platform = Column(String(50), default="any", nullable=False)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow)
updated_at = Column(
DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow
)
project = relationship("Project", back_populates="packages")
tags = relationship("Tag", back_populates="package", cascade="all, delete-orphan")
uploads = relationship("Upload", back_populates="package", cascade="all, delete-orphan")
consumers = relationship("Consumer", back_populates="package", cascade="all, delete-orphan")
uploads = relationship(
"Upload", back_populates="package", cascade="all, delete-orphan"
)
consumers = relationship(
"Consumer", back_populates="package", cascade="all, delete-orphan"
)
__table_args__ = (
Index("idx_packages_project_id", "project_id"),
Index("idx_packages_name", "name"),
Index("idx_packages_format", "format"),
Index("idx_packages_platform", "platform"),
Index("idx_packages_project_name", "project_id", "name", unique=True), # Composite unique index
Index(
"idx_packages_project_name", "project_id", "name", unique=True
), # Composite unique index
CheckConstraint(
"format IN ('generic', 'npm', 'pypi', 'docker', 'deb', 'rpm', 'maven', 'nuget', 'helm')",
name="check_package_format"
name="check_package_format",
),
CheckConstraint(
"platform IN ('any', 'linux', 'darwin', 'windows', 'linux-amd64', 'linux-arm64', 'darwin-amd64', 'darwin-arm64', 'windows-amd64')",
name="check_package_platform"
name="check_package_platform",
),
{"extend_existing": True},
)
@@ -76,7 +102,9 @@ class Artifact(Base):
checksum_md5 = Column(String(32)) # MD5 hash for additional verification
checksum_sha1 = Column(String(40)) # SHA1 hash for compatibility
s3_etag = Column(String(64)) # S3 ETag for verification
artifact_metadata = Column("metadata", JSON, default=dict) # Format-specific metadata (column name is 'metadata')
artifact_metadata = Column(
"metadata", JSON, default=dict
) # Format-specific metadata (column name is 'metadata')
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
created_by = Column(String(255), nullable=False)
ref_count = Column(Integer, default=1)
@@ -113,22 +141,34 @@ class Tag(Base):
__tablename__ = "tags"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
package_id = Column(UUID(as_uuid=True), ForeignKey("packages.id", ondelete="CASCADE"), nullable=False)
package_id = Column(
UUID(as_uuid=True),
ForeignKey("packages.id", ondelete="CASCADE"),
nullable=False,
)
name = Column(String(255), nullable=False)
artifact_id = Column(String(64), ForeignKey("artifacts.id"), nullable=False)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow)
updated_at = Column(
DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow
)
created_by = Column(String(255), nullable=False)
package = relationship("Package", back_populates="tags")
artifact = relationship("Artifact", back_populates="tags")
history = relationship("TagHistory", back_populates="tag", cascade="all, delete-orphan")
history = relationship(
"TagHistory", back_populates="tag", cascade="all, delete-orphan"
)
__table_args__ = (
Index("idx_tags_package_id", "package_id"),
Index("idx_tags_artifact_id", "artifact_id"),
Index("idx_tags_package_name", "package_id", "name", unique=True), # Composite unique index
Index("idx_tags_package_created_at", "package_id", "created_at"), # For recent tags queries
Index(
"idx_tags_package_name", "package_id", "name", unique=True
), # Composite unique index
Index(
"idx_tags_package_created_at", "package_id", "created_at"
), # For recent tags queries
)
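The (package_id, created_at) composite index above exists for recent-tags listings; a minimal sketch of the query shape it covers, assuming an open Session `db` and the Tag model defined above.

def recent_tags(db, package_id, limit: int = 20):
    # Newest tags for a package; served by idx_tags_package_created_at.
    return (
        db.query(Tag)
        .filter(Tag.package_id == package_id)
        .order_by(Tag.created_at.desc())
        .limit(limit)
        .all()
    )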
@@ -136,7 +176,9 @@ class TagHistory(Base):
__tablename__ = "tag_history"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
tag_id = Column(UUID(as_uuid=True), ForeignKey("tags.id", ondelete="CASCADE"), nullable=False)
tag_id = Column(
UUID(as_uuid=True), ForeignKey("tags.id", ondelete="CASCADE"), nullable=False
)
old_artifact_id = Column(String(64), ForeignKey("artifacts.id"))
new_artifact_id = Column(String(64), ForeignKey("artifacts.id"), nullable=False)
change_type = Column(String(20), nullable=False, default="update")
@@ -148,7 +190,9 @@ class TagHistory(Base):
__table_args__ = (
Index("idx_tag_history_tag_id", "tag_id"),
Index("idx_tag_history_changed_at", "changed_at"),
CheckConstraint("change_type IN ('create', 'update', 'delete')", name="check_change_type"),
CheckConstraint(
"change_type IN ('create', 'update', 'delete')", name="check_change_type"
),
)
@@ -164,6 +208,11 @@ class Upload(Base):
duration_ms = Column(Integer) # Upload timing in milliseconds
deduplicated = Column(Boolean, default=False) # Whether artifact was deduplicated
checksum_verified = Column(Boolean, default=True) # Whether checksum was verified
status = Column(
String(20), default="completed", nullable=False
) # pending, completed, failed
error_message = Column(Text) # Error details for failed uploads
client_checksum = Column(String(64)) # Client-provided SHA256 for verification
uploaded_at = Column(DateTime(timezone=True), default=datetime.utcnow)
uploaded_by = Column(String(255), nullable=False)
source_ip = Column(String(45))
@@ -177,6 +226,35 @@ class Upload(Base):
Index("idx_uploads_uploaded_at", "uploaded_at"),
Index("idx_uploads_package_uploaded_at", "package_id", "uploaded_at"),
Index("idx_uploads_uploaded_by_at", "uploaded_by", "uploaded_at"),
Index("idx_uploads_status", "status"),
Index("idx_uploads_status_uploaded_at", "status", "uploaded_at"),
CheckConstraint(
"status IN ('pending', 'completed', 'failed')", name="check_upload_status"
),
)
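The new status column and check constraint give each upload a small lifecycle (pending, completed, failed). A hedged sketch of the transition a handler might perform; the handler itself is not part of this diff.

from typing import Optional


def finish_upload(db, upload, error: Optional[str] = None) -> None:
    # Move an Upload created with status="pending" to a terminal state.
    if error is None:
        upload.status = "completed"
    else:
        upload.status = "failed"
        upload.error_message = error  # kept for debugging failed uploads
    db.commit()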
class UploadLock(Base):
"""Track in-progress uploads for conflict detection (409 responses)."""
__tablename__ = "upload_locks"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
sha256_hash = Column(String(64), nullable=False)
package_id = Column(
UUID(as_uuid=True),
ForeignKey("packages.id", ondelete="CASCADE"),
nullable=False,
)
locked_at = Column(DateTime(timezone=True), default=datetime.utcnow)
locked_by = Column(String(255), nullable=False)
expires_at = Column(DateTime(timezone=True), nullable=False)
__table_args__ = (
Index("idx_upload_locks_expires_at", "expires_at"),
Index(
"idx_upload_locks_hash_package", "sha256_hash", "package_id", unique=True
),
)
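The unique index on (sha256_hash, package_id) is what makes the 409 conflict check atomic: a second concurrent insert violates the index instead of racing. A minimal sketch of acquiring a lock; the HTTPException-based API layer is an assumption, since this diff shows only the model.

from datetime import datetime, timedelta

from fastapi import HTTPException  # web framework assumed, not shown in this diff
from sqlalchemy.exc import IntegrityError


def acquire_upload_lock(db, sha256_hash: str, package_id, user: str) -> UploadLock:
    lock = UploadLock(
        sha256_hash=sha256_hash,
        package_id=package_id,
        locked_by=user,
        expires_at=datetime.utcnow() + timedelta(minutes=15),  # TTL is an assumption
    )
    db.add(lock)
    try:
        db.commit()
    except IntegrityError:
        # Unique (sha256_hash, package_id) row already exists: upload in progress.
        db.rollback()
        raise HTTPException(status_code=409, detail="upload already in progress")
    return lock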
@@ -184,7 +262,11 @@ class Consumer(Base):
__tablename__ = "consumers"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
package_id = Column(UUID(as_uuid=True), ForeignKey("packages.id", ondelete="CASCADE"), nullable=False)
package_id = Column(
UUID(as_uuid=True),
ForeignKey("packages.id", ondelete="CASCADE"),
nullable=False,
)
project_url = Column(String(2048), nullable=False)
last_access = Column(DateTime(timezone=True), default=datetime.utcnow)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
@@ -201,7 +283,11 @@ class AccessPermission(Base):
__tablename__ = "access_permissions"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
project_id = Column(UUID(as_uuid=True), ForeignKey("projects.id", ondelete="CASCADE"), nullable=False)
project_id = Column(
UUID(as_uuid=True),
ForeignKey("projects.id", ondelete="CASCADE"),
nullable=False,
)
user_id = Column(String(255), nullable=False)
level = Column(String(20), nullable=False)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
@@ -252,3 +338,51 @@ class AuditLog(Base):
Index("idx_audit_logs_resource_timestamp", "resource", "timestamp"),
Index("idx_audit_logs_user_timestamp", "user_id", "timestamp"),
)
class ProjectHistory(Base):
"""Track changes to project metadata over time."""
__tablename__ = "project_history"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
project_id = Column(
UUID(as_uuid=True),
ForeignKey("projects.id", ondelete="CASCADE"),
nullable=False,
)
field_name = Column(String(100), nullable=False)
old_value = Column(Text)
new_value = Column(Text)
changed_at = Column(DateTime(timezone=True), default=datetime.utcnow)
changed_by = Column(String(255), nullable=False)
__table_args__ = (
Index("idx_project_history_project_id", "project_id"),
Index("idx_project_history_changed_at", "changed_at"),
Index("idx_project_history_project_changed_at", "project_id", "changed_at"),
)
class PackageHistory(Base):
"""Track changes to package metadata over time."""
__tablename__ = "package_history"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
package_id = Column(
UUID(as_uuid=True),
ForeignKey("packages.id", ondelete="CASCADE"),
nullable=False,
)
field_name = Column(String(100), nullable=False)
old_value = Column(Text)
new_value = Column(Text)
changed_at = Column(DateTime(timezone=True), default=datetime.utcnow)
changed_by = Column(String(255), nullable=False)
__table_args__ = (
Index("idx_package_history_package_id", "package_id"),
Index("idx_package_history_changed_at", "changed_at"),
Index("idx_package_history_package_changed_at", "package_id", "changed_at"),
)
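ProjectHistory and PackageHistory share one field-level pattern, so a single sketch covers both: compare, record, apply. The helper name and the shape of `changes` are illustrative, not from this diff.

def record_package_changes(db, package, changes: dict, user: str) -> None:
    # One PackageHistory row per field that actually changed.
    for field, new_value in changes.items():
        old_value = getattr(package, field)
        if old_value == new_value:
            continue
        db.add(
            PackageHistory(
                package_id=package.id,
                field_name=field,
                old_value=None if old_value is None else str(old_value),
                new_value=None if new_value is None else str(new_value),
                changed_by=user,
            )
        )
        setattr(package, field, new_value)
    db.commit()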

File diff suppressed because it is too large


@@ -12,6 +12,7 @@ class PaginationMeta(BaseModel):
limit: int
total: int
total_pages: int
has_more: bool = False # True if there are more pages after current page
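A short sketch of how has_more can be derived next to the existing fields; the derivation is an assumption, this diff only adds the flag.

import math


def pagination_fields(page: int, limit: int, total: int) -> dict:
    # has_more is redundant with page/total_pages but saves clients the math.
    total_pages = math.ceil(total / limit) if limit > 0 else 0
    return {
        "page": page,
        "limit": limit,
        "total": total,
        "total_pages": total_pages,
        "has_more": page < total_pages,
    }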
class PaginatedResponse(BaseModel, Generic[T]):
@@ -39,6 +40,13 @@ class ProjectResponse(BaseModel):
from_attributes = True
class ProjectUpdate(BaseModel):
"""Schema for updating a project"""
description: Optional[str] = None
is_public: Optional[bool] = None
# Package format and platform enums
PACKAGE_FORMATS = [
"generic",
@@ -86,6 +94,14 @@ class PackageResponse(BaseModel):
from_attributes = True
class PackageUpdate(BaseModel):
"""Schema for updating a package"""
description: Optional[str] = None
format: Optional[str] = None
platform: Optional[str] = None
class TagSummary(BaseModel):
"""Lightweight tag info for embedding in package responses"""
@@ -189,6 +205,93 @@ class TagHistoryResponse(BaseModel):
from_attributes = True
class TagHistoryDetailResponse(BaseModel):
"""Tag history with artifact metadata for each version"""
id: UUID
tag_id: UUID
tag_name: str
old_artifact_id: Optional[str]
new_artifact_id: str
changed_at: datetime
changed_by: str
# Artifact metadata for new artifact
artifact_size: int
artifact_original_name: Optional[str]
artifact_content_type: Optional[str]
class Config:
from_attributes = True
# Audit log schemas
class AuditLogResponse(BaseModel):
"""Audit log entry response"""
id: UUID
action: str
resource: str
user_id: str
details: Optional[Dict[str, Any]]
timestamp: datetime
source_ip: Optional[str]
class Config:
from_attributes = True
# Upload history schemas
class UploadHistoryResponse(BaseModel):
"""Upload event with artifact details"""
id: UUID
artifact_id: str
package_id: UUID
package_name: str
project_name: str
original_name: Optional[str]
tag_name: Optional[str]
uploaded_at: datetime
uploaded_by: str
source_ip: Optional[str]
deduplicated: bool
# Artifact metadata
artifact_size: int
artifact_content_type: Optional[str]
class Config:
from_attributes = True
# Artifact provenance schemas
class ArtifactProvenanceResponse(BaseModel):
"""Full provenance/history of an artifact"""
artifact_id: str
sha256: str
size: int
content_type: Optional[str]
original_name: Optional[str]
created_at: datetime
created_by: str
ref_count: int
# First upload info
first_uploaded_at: datetime
first_uploaded_by: str
# Usage statistics
upload_count: int
# References
packages: List[Dict[str, Any]] # List of {project_name, package_name, tag_names}
tags: List[
Dict[str, Any]
] # List of {project_name, package_name, tag_name, created_at}
# Upload history
uploads: List[Dict[str, Any]] # List of upload events
class Config:
from_attributes = True
class ArtifactTagInfo(BaseModel):
"""Tag info for embedding in artifact responses"""
@@ -240,6 +343,44 @@ class PackageArtifactResponse(BaseModel):
from_attributes = True
class GlobalArtifactResponse(BaseModel):
"""Artifact with project/package context for global listing"""
id: str
sha256: str
size: int
content_type: Optional[str]
original_name: Optional[str]
created_at: datetime
created_by: str
format_metadata: Optional[Dict[str, Any]] = None
ref_count: int = 0
# Context from tags/packages
projects: List[str] = [] # List of project names containing this artifact
packages: List[str] = [] # List of "project/package" paths
tags: List[str] = [] # List of "project/package:tag" references
class Config:
from_attributes = True
class GlobalTagResponse(BaseModel):
"""Tag with project/package context for global listing"""
id: UUID
name: str
artifact_id: str
created_at: datetime
created_by: str
project_name: str
package_name: str
artifact_size: Optional[int] = None
artifact_content_type: Optional[str] = None
class Config:
from_attributes = True
# Upload response
class UploadResponse(BaseModel):
artifact_id: str
@@ -254,6 +395,11 @@ class UploadResponse(BaseModel):
format_metadata: Optional[Dict[str, Any]] = None
deduplicated: bool = False
ref_count: int = 1 # Current reference count after this upload
# Enhanced metadata (Issue #19)
upload_id: Optional[UUID] = None # UUID of the upload record
content_type: Optional[str] = None
original_name: Optional[str] = None
created_at: Optional[datetime] = None
# Resumable upload schemas
@@ -440,6 +586,19 @@ class StorageStatsResponse(BaseModel):
storage_saved_bytes: int # Bytes saved through deduplication
class ConsistencyCheckResponse(BaseModel):
"""Result of S3/Database consistency check"""
total_artifacts_checked: int
orphaned_s3_objects: int # Objects in S3 but not in DB
missing_s3_objects: int # Records in DB but not in S3
size_mismatches: int # Records where DB size != S3 size
healthy: bool
orphaned_s3_keys: List[str] = [] # Limited list of orphaned S3 keys
missing_s3_keys: List[str] = [] # Limited list of missing S3 keys
size_mismatch_artifacts: List[Dict[str, Any]] = [] # Limited list of mismatches
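The service behind this response model is not in the visible diff; a sketch of the DB-to-S3 half of the check, assuming boto3, artifacts keyed in S3 by their sha256 id (consistent with get_by_sha256 elsewhere in this commit), and an Artifact.size column.

import botocore.exceptions


def check_db_side(db, s3_client, bucket: str, limit: int = 1000):
    # Returns (missing_s3_keys, size_mismatch_artifacts) for the response model.
    missing, mismatched = [], []
    for artifact in db.query(Artifact).limit(limit):
        key = artifact.id  # artifact ids are sha256 digests (assumed key scheme)
        try:
            head = s3_client.head_object(Bucket=bucket, Key=key)
        except botocore.exceptions.ClientError:
            missing.append(key)
            continue
        if head["ContentLength"] != artifact.size:
            mismatched.append(
                {"id": key, "db_size": artifact.size, "s3_size": head["ContentLength"]}
            )
    return missing, mismatched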
class DeduplicationStatsResponse(BaseModel):
"""Deduplication effectiveness statistics"""


@@ -6,7 +6,7 @@ from typing import List, Optional, Tuple
from sqlalchemy.orm import Session
import logging
from ..models import Artifact, Tag, Upload, Package
from ..models import Artifact, Tag
from ..repositories.artifact import ArtifactRepository
from ..repositories.tag import TagRepository
from ..storage import S3Storage
@@ -40,10 +40,14 @@ class ArtifactCleanupService:
artifact = self.artifact_repo.get_by_sha256(artifact_id)
if artifact:
artifact = self.artifact_repo.decrement_ref_count(artifact)
logger.info(f"Decremented ref_count for artifact {artifact_id}: now {artifact.ref_count}")
logger.info(
f"Decremented ref_count for artifact {artifact_id}: now {artifact.ref_count}"
)
return artifact
def on_tag_updated(self, old_artifact_id: str, new_artifact_id: str) -> Tuple[Optional[Artifact], Optional[Artifact]]:
def on_tag_updated(
self, old_artifact_id: str, new_artifact_id: str
) -> Tuple[Optional[Artifact], Optional[Artifact]]:
"""
Called when a tag is updated to point to a different artifact.
Decrements ref_count for old artifact, increments for new (if different).
@@ -58,13 +62,17 @@ class ArtifactCleanupService:
old_artifact = self.artifact_repo.get_by_sha256(old_artifact_id)
if old_artifact:
old_artifact = self.artifact_repo.decrement_ref_count(old_artifact)
logger.info(f"Decremented ref_count for old artifact {old_artifact_id}: now {old_artifact.ref_count}")
logger.info(
f"Decremented ref_count for old artifact {old_artifact_id}: now {old_artifact.ref_count}"
)
# Increment new artifact ref_count
new_artifact = self.artifact_repo.get_by_sha256(new_artifact_id)
if new_artifact:
new_artifact = self.artifact_repo.increment_ref_count(new_artifact)
logger.info(f"Incremented ref_count for new artifact {new_artifact_id}: now {new_artifact.ref_count}")
logger.info(
f"Incremented ref_count for new artifact {new_artifact_id}: now {new_artifact.ref_count}"
)
return old_artifact, new_artifact
@@ -84,11 +92,15 @@ class ArtifactCleanupService:
if artifact:
self.artifact_repo.decrement_ref_count(artifact)
affected_artifacts.append(tag.artifact_id)
logger.info(f"Decremented ref_count for artifact {tag.artifact_id} (package delete)")
logger.info(
f"Decremented ref_count for artifact {tag.artifact_id} (package delete)"
)
return affected_artifacts
def cleanup_orphaned_artifacts(self, batch_size: int = 100, dry_run: bool = False) -> List[str]:
def cleanup_orphaned_artifacts(
self, batch_size: int = 100, dry_run: bool = False
) -> List[str]:
"""
Find and delete artifacts with ref_count = 0.
@@ -116,7 +128,9 @@ class ArtifactCleanupService:
# Then delete from database
self.artifact_repo.delete(artifact)
deleted_ids.append(artifact.id)
logger.info(f"Deleted orphaned artifact from database: {artifact.id}")
logger.info(
f"Deleted orphaned artifact from database: {artifact.id}"
)
except Exception as e:
logger.error(f"Failed to delete artifact {artifact.id}: {e}")
@@ -128,10 +142,12 @@ class ArtifactCleanupService:
def get_orphaned_count(self) -> int:
"""Get count of artifacts with ref_count = 0."""
from sqlalchemy import func
return (
self.db.query(func.count(Artifact.id))
.filter(Artifact.ref_count == 0)
.scalar() or 0
.scalar()
or 0
)
def verify_ref_counts(self, fix: bool = False) -> List[dict]:
@@ -173,7 +189,9 @@ class ArtifactCleanupService:
if fix:
artifact.ref_count = max(actual_count, 1)
logger.warning(f"Fixed ref_count for artifact {artifact.id}: {mismatch['stored_ref_count']} -> {artifact.ref_count}")
logger.warning(
f"Fixed ref_count for artifact {artifact.id}: {mismatch['stored_ref_count']} -> {artifact.ref_count}"
)
if fix and mismatches:
self.db.commit()
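A hedged usage sketch for the service above: count, dry-run, repair counts, then delete. The constructor arguments are assumed; only methods shown in this diff are used.

service = ArtifactCleanupService(db, storage)  # constructor signature assumed

print("orphaned artifacts:", service.get_orphaned_count())
candidates = service.cleanup_orphaned_artifacts(batch_size=100, dry_run=True)
print("dry run would delete:", candidates)

service.verify_ref_counts(fix=True)  # repair drifted ref_counts before deleting
deleted = service.cleanup_orphaned_artifacts(batch_size=100)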


@@ -202,6 +202,9 @@ class StorageResult(NamedTuple):
md5: Optional[str] = None
sha1: Optional[str] = None
s3_etag: Optional[str] = None
already_existed: bool = (
False # True if artifact was deduplicated (S3 object already existed)
)
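A sketch of how a caller can propagate the new flag; the storage method name is assumed, since this diff shows only the StorageResult fields and the return sites below.

result = s3_storage.store(file, content_length)  # method name assumed
upload.deduplicated = result.already_existed  # feeds Upload.deduplicated and dedup stats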
class S3StorageUnavailableError(StorageError):
@@ -354,6 +357,7 @@ class S3Storage:
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
already_existed=exists,
)
def _store_multipart(self, file: BinaryIO, content_length: int) -> StorageResult:
@@ -433,6 +437,7 @@ class S3Storage:
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
already_existed=True,
)
# Seek back to start for upload
@@ -486,6 +491,7 @@ class S3Storage:
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
already_existed=False,
)
except Exception as e:
@@ -535,6 +541,7 @@ class S3Storage:
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
already_existed=True,
)
# Upload based on size
@@ -615,6 +622,7 @@ class S3Storage:
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
already_existed=False,
)
def initiate_resumable_upload(self, expected_hash: str) -> Dict[str, Any]: