From 2f1891cf0126ec0e7d4c789d872a2cb2dd3a1745 Mon Sep 17 00:00:00 2001 From: Mondo Diaz Date: Wed, 7 Jan 2026 12:31:44 -0600 Subject: [PATCH] Metadata database tracks all uploads with project, package, tag, and timestamp queryable via API --- CHANGELOG.md | 45 + backend/app/models.py | 180 +- backend/app/routes.py | 1503 ++++++++++++++++- backend/app/schemas.py | 159 ++ backend/app/services/artifact_cleanup.py | 38 +- backend/app/storage.py | 8 + backend/tests/conftest.py | 199 +-- backend/tests/factories.py | 288 ++++ backend/tests/integration/__init__.py | 0 .../tests/integration/test_artifacts_api.py | 638 +++++++ .../tests/integration/test_packages_api.py | 345 ++++ .../tests/integration/test_projects_api.py | 322 ++++ backend/tests/integration/test_tags_api.py | 403 +++++ .../test_upload_download_api.py} | 612 +++---- backend/tests/test_duplicate_detection.py | 207 --- backend/tests/test_garbage_collection.py | 168 -- backend/tests/test_hash_calculation.py | 215 --- backend/tests/test_ref_count.py | 458 ----- backend/tests/test_stats_endpoints.py | 488 ------ backend/tests/unit/__init__.py | 0 backend/tests/unit/test_models.py | 271 +++ backend/tests/unit/test_storage.py | 439 +++++ migrations/004_history_tables.sql | 98 ++ migrations/005_upload_enhancements.sql | 83 + 24 files changed, 5044 insertions(+), 2123 deletions(-) create mode 100644 backend/tests/factories.py create mode 100644 backend/tests/integration/__init__.py create mode 100644 backend/tests/integration/test_artifacts_api.py create mode 100644 backend/tests/integration/test_packages_api.py create mode 100644 backend/tests/integration/test_projects_api.py create mode 100644 backend/tests/integration/test_tags_api.py rename backend/tests/{test_integration_uploads.py => integration/test_upload_download_api.py} (56%) delete mode 100644 backend/tests/test_duplicate_detection.py delete mode 100644 backend/tests/test_garbage_collection.py delete mode 100644 backend/tests/test_hash_calculation.py 
delete mode 100644 backend/tests/test_ref_count.py delete mode 100644 backend/tests/test_stats_endpoints.py create mode 100644 backend/tests/unit/__init__.py create mode 100644 backend/tests/unit/test_models.py create mode 100644 backend/tests/unit/test_storage.py create mode 100644 migrations/004_history_tables.sql create mode 100644 migrations/005_upload_enhancements.sql diff --git a/CHANGELOG.md b/CHANGELOG.md index db52574..41a8dec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,51 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Added global artifacts endpoint `GET /api/v1/artifacts` with project/package/tag/size/date filters (#18) +- Added global tags endpoint `GET /api/v1/tags` with project/package/search/date filters (#18) +- Added wildcard pattern matching (`*`) for tag filters across all endpoints (#18) +- Added comma-separated multi-value support for tag filters (#18) +- Added `search` parameter to `/api/v1/uploads` for filename search (#18) +- Added `tag` filter to `/api/v1/uploads` endpoint (#18) +- Added `sort` and `order` parameters to `/api/v1/uploads` endpoint (#18) +- Added `min_size` and `max_size` filters to package artifacts endpoint (#18) +- Added `sort` and `order` parameters to package artifacts endpoint (#18) +- Added `from` and `to` date filters to package tags endpoint (#18) +- Added `GlobalArtifactResponse` and `GlobalTagResponse` schemas (#18) +- Added S3 object verification before database commit during upload (#19) +- Added S3 object cleanup on database commit failure (#19) +- Added upload duration tracking (`duration_ms` field) (#19) +- Added `User-Agent` header capture during uploads (#19) +- Added `X-Checksum-SHA256` header support for client-side checksum verification (#19) +- Added `status`, `error_message`, `client_checksum` columns to uploads table (#19) +- Added `upload_locks` table for future concurrent upload conflict detection (#19) +- Added consistency 
check endpoint `GET /api/v1/admin/consistency-check` (#19) +- Added `PUT /api/v1/projects/{project}` endpoint for project updates with audit logging (#20) +- Added `PUT /api/v1/project/{project}/packages/{package}` endpoint for package updates with audit logging (#20) +- Added `artifact.download` audit logging to download endpoint (#20) +- Added `ProjectHistory` and `PackageHistory` models with database triggers (#20) +- Added migration `004_history_tables.sql` for project/package history (#20) +- Added migration `005_upload_enhancements.sql` for upload status tracking (#19) +- Added 9 integration tests for global artifacts/tags endpoints (#18) +- Added global uploads query endpoint `GET /api/v1/uploads` with project/package/user/date filters (#18) +- Added project-level uploads endpoint `GET /api/v1/project/{project}/uploads` (#18) +- Added `has_more` field to pagination metadata for easier pagination UI (#18) +- Added `upload_id`, `content_type`, `original_name`, `created_at` fields to upload response (#19) +- Added audit log API endpoints with filtering and pagination (#20) + - `GET /api/v1/audit-logs` - list all audit logs with action/resource/user/date filters + - `GET /api/v1/projects/{project}/audit-logs` - project-scoped audit logs + - `GET /api/v1/project/{project}/{package}/audit-logs` - package-scoped audit logs +- Added upload history API endpoints (#20) + - `GET /api/v1/project/{project}/{package}/uploads` - list upload events for a package + - `GET /api/v1/artifact/{id}/uploads` - list all uploads of a specific artifact +- Added artifact provenance endpoint `GET /api/v1/artifact/{id}/history` (#20) + - Returns full artifact history including packages, tags, and upload events +- Added audit logging for project.create, package.create, tag.create, tag.update, artifact.upload actions (#20) +- Added `AuditLogResponse`, `UploadHistoryResponse`, `ArtifactProvenanceResponse` schemas (#20) +- Added `TagHistoryDetailResponse` schema with artifact metadata (#20) 
+- Added 31 integration tests for audit log, history, and upload query endpoints (#22) +### Changed +- Standardized audit action naming to `{entity}.{action}` pattern (project.delete, package.delete, tag.delete) (#20) - Added `StorageBackend` protocol/interface for backend-agnostic storage (#33) - Added `health_check()` method to storage backend with `/health` endpoint integration (#33) - Added `verify_integrity()` method for post-upload hash validation (#33) diff --git a/backend/app/models.py b/backend/app/models.py index 6fba3d5..37f23ef 100644 --- a/backend/app/models.py +++ b/backend/app/models.py @@ -1,8 +1,16 @@ from datetime import datetime -from typing import Optional from sqlalchemy import ( - Column, String, Text, Boolean, Integer, BigInteger, - DateTime, ForeignKey, CheckConstraint, Index, JSON + Column, + String, + Text, + Boolean, + Integer, + BigInteger, + DateTime, + ForeignKey, + CheckConstraint, + Index, + JSON, ) from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.orm import relationship, declarative_base @@ -19,11 +27,17 @@ class Project(Base): description = Column(Text) is_public = Column(Boolean, default=True) created_at = Column(DateTime(timezone=True), default=datetime.utcnow) - updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow) + updated_at = Column( + DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow + ) created_by = Column(String(255), nullable=False) - packages = relationship("Package", back_populates="project", cascade="all, delete-orphan") - permissions = relationship("AccessPermission", back_populates="project", cascade="all, delete-orphan") + packages = relationship( + "Package", back_populates="project", cascade="all, delete-orphan" + ) + permissions = relationship( + "AccessPermission", back_populates="project", cascade="all, delete-orphan" + ) __table_args__ = ( Index("idx_projects_name", "name"), @@ -35,32 +49,44 @@ class Package(Base): __tablename__ 
= "packages" id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) - project_id = Column(UUID(as_uuid=True), ForeignKey("projects.id", ondelete="CASCADE"), nullable=False) + project_id = Column( + UUID(as_uuid=True), + ForeignKey("projects.id", ondelete="CASCADE"), + nullable=False, + ) name = Column(String(255), nullable=False) description = Column(Text) format = Column(String(50), default="generic", nullable=False) platform = Column(String(50), default="any", nullable=False) created_at = Column(DateTime(timezone=True), default=datetime.utcnow) - updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow) + updated_at = Column( + DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow + ) project = relationship("Project", back_populates="packages") tags = relationship("Tag", back_populates="package", cascade="all, delete-orphan") - uploads = relationship("Upload", back_populates="package", cascade="all, delete-orphan") - consumers = relationship("Consumer", back_populates="package", cascade="all, delete-orphan") + uploads = relationship( + "Upload", back_populates="package", cascade="all, delete-orphan" + ) + consumers = relationship( + "Consumer", back_populates="package", cascade="all, delete-orphan" + ) __table_args__ = ( Index("idx_packages_project_id", "project_id"), Index("idx_packages_name", "name"), Index("idx_packages_format", "format"), Index("idx_packages_platform", "platform"), - Index("idx_packages_project_name", "project_id", "name", unique=True), # Composite unique index + Index( + "idx_packages_project_name", "project_id", "name", unique=True + ), # Composite unique index CheckConstraint( "format IN ('generic', 'npm', 'pypi', 'docker', 'deb', 'rpm', 'maven', 'nuget', 'helm')", - name="check_package_format" + name="check_package_format", ), CheckConstraint( "platform IN ('any', 'linux', 'darwin', 'windows', 'linux-amd64', 'linux-arm64', 'darwin-amd64', 'darwin-arm64', 
'windows-amd64')", - name="check_package_platform" + name="check_package_platform", ), {"extend_existing": True}, ) @@ -76,7 +102,9 @@ class Artifact(Base): checksum_md5 = Column(String(32)) # MD5 hash for additional verification checksum_sha1 = Column(String(40)) # SHA1 hash for compatibility s3_etag = Column(String(64)) # S3 ETag for verification - artifact_metadata = Column("metadata", JSON, default=dict) # Format-specific metadata (column name is 'metadata') + artifact_metadata = Column( + "metadata", JSON, default=dict + ) # Format-specific metadata (column name is 'metadata') created_at = Column(DateTime(timezone=True), default=datetime.utcnow) created_by = Column(String(255), nullable=False) ref_count = Column(Integer, default=1) @@ -113,22 +141,34 @@ class Tag(Base): __tablename__ = "tags" id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) - package_id = Column(UUID(as_uuid=True), ForeignKey("packages.id", ondelete="CASCADE"), nullable=False) + package_id = Column( + UUID(as_uuid=True), + ForeignKey("packages.id", ondelete="CASCADE"), + nullable=False, + ) name = Column(String(255), nullable=False) artifact_id = Column(String(64), ForeignKey("artifacts.id"), nullable=False) created_at = Column(DateTime(timezone=True), default=datetime.utcnow) - updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow) + updated_at = Column( + DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow + ) created_by = Column(String(255), nullable=False) package = relationship("Package", back_populates="tags") artifact = relationship("Artifact", back_populates="tags") - history = relationship("TagHistory", back_populates="tag", cascade="all, delete-orphan") + history = relationship( + "TagHistory", back_populates="tag", cascade="all, delete-orphan" + ) __table_args__ = ( Index("idx_tags_package_id", "package_id"), Index("idx_tags_artifact_id", "artifact_id"), - Index("idx_tags_package_name", 
"package_id", "name", unique=True), # Composite unique index - Index("idx_tags_package_created_at", "package_id", "created_at"), # For recent tags queries + Index( + "idx_tags_package_name", "package_id", "name", unique=True + ), # Composite unique index + Index( + "idx_tags_package_created_at", "package_id", "created_at" + ), # For recent tags queries ) @@ -136,7 +176,9 @@ class TagHistory(Base): __tablename__ = "tag_history" id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) - tag_id = Column(UUID(as_uuid=True), ForeignKey("tags.id", ondelete="CASCADE"), nullable=False) + tag_id = Column( + UUID(as_uuid=True), ForeignKey("tags.id", ondelete="CASCADE"), nullable=False + ) old_artifact_id = Column(String(64), ForeignKey("artifacts.id")) new_artifact_id = Column(String(64), ForeignKey("artifacts.id"), nullable=False) change_type = Column(String(20), nullable=False, default="update") @@ -148,7 +190,9 @@ class TagHistory(Base): __table_args__ = ( Index("idx_tag_history_tag_id", "tag_id"), Index("idx_tag_history_changed_at", "changed_at"), - CheckConstraint("change_type IN ('create', 'update', 'delete')", name="check_change_type"), + CheckConstraint( + "change_type IN ('create', 'update', 'delete')", name="check_change_type" + ), ) @@ -164,6 +208,11 @@ class Upload(Base): duration_ms = Column(Integer) # Upload timing in milliseconds deduplicated = Column(Boolean, default=False) # Whether artifact was deduplicated checksum_verified = Column(Boolean, default=True) # Whether checksum was verified + status = Column( + String(20), default="completed", nullable=False + ) # pending, completed, failed + error_message = Column(Text) # Error details for failed uploads + client_checksum = Column(String(64)) # Client-provided SHA256 for verification uploaded_at = Column(DateTime(timezone=True), default=datetime.utcnow) uploaded_by = Column(String(255), nullable=False) source_ip = Column(String(45)) @@ -177,6 +226,35 @@ class Upload(Base): 
Index("idx_uploads_uploaded_at", "uploaded_at"), Index("idx_uploads_package_uploaded_at", "package_id", "uploaded_at"), Index("idx_uploads_uploaded_by_at", "uploaded_by", "uploaded_at"), + Index("idx_uploads_status", "status"), + Index("idx_uploads_status_uploaded_at", "status", "uploaded_at"), + CheckConstraint( + "status IN ('pending', 'completed', 'failed')", name="check_upload_status" + ), + ) + + +class UploadLock(Base): + """Track in-progress uploads for conflict detection (409 responses).""" + + __tablename__ = "upload_locks" + + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + sha256_hash = Column(String(64), nullable=False) + package_id = Column( + UUID(as_uuid=True), + ForeignKey("packages.id", ondelete="CASCADE"), + nullable=False, + ) + locked_at = Column(DateTime(timezone=True), default=datetime.utcnow) + locked_by = Column(String(255), nullable=False) + expires_at = Column(DateTime(timezone=True), nullable=False) + + __table_args__ = ( + Index("idx_upload_locks_expires_at", "expires_at"), + Index( + "idx_upload_locks_hash_package", "sha256_hash", "package_id", unique=True + ), ) @@ -184,7 +262,11 @@ class Consumer(Base): __tablename__ = "consumers" id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) - package_id = Column(UUID(as_uuid=True), ForeignKey("packages.id", ondelete="CASCADE"), nullable=False) + package_id = Column( + UUID(as_uuid=True), + ForeignKey("packages.id", ondelete="CASCADE"), + nullable=False, + ) project_url = Column(String(2048), nullable=False) last_access = Column(DateTime(timezone=True), default=datetime.utcnow) created_at = Column(DateTime(timezone=True), default=datetime.utcnow) @@ -201,7 +283,11 @@ class AccessPermission(Base): __tablename__ = "access_permissions" id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) - project_id = Column(UUID(as_uuid=True), ForeignKey("projects.id", ondelete="CASCADE"), nullable=False) + project_id = Column( + UUID(as_uuid=True), + 
ForeignKey("projects.id", ondelete="CASCADE"), + nullable=False, + ) user_id = Column(String(255), nullable=False) level = Column(String(20), nullable=False) created_at = Column(DateTime(timezone=True), default=datetime.utcnow) @@ -252,3 +338,51 @@ class AuditLog(Base): Index("idx_audit_logs_resource_timestamp", "resource", "timestamp"), Index("idx_audit_logs_user_timestamp", "user_id", "timestamp"), ) + + +class ProjectHistory(Base): + """Track changes to project metadata over time.""" + + __tablename__ = "project_history" + + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + project_id = Column( + UUID(as_uuid=True), + ForeignKey("projects.id", ondelete="CASCADE"), + nullable=False, + ) + field_name = Column(String(100), nullable=False) + old_value = Column(Text) + new_value = Column(Text) + changed_at = Column(DateTime(timezone=True), default=datetime.utcnow) + changed_by = Column(String(255), nullable=False) + + __table_args__ = ( + Index("idx_project_history_project_id", "project_id"), + Index("idx_project_history_changed_at", "changed_at"), + Index("idx_project_history_project_changed_at", "project_id", "changed_at"), + ) + + +class PackageHistory(Base): + """Track changes to package metadata over time.""" + + __tablename__ = "package_history" + + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + package_id = Column( + UUID(as_uuid=True), + ForeignKey("packages.id", ondelete="CASCADE"), + nullable=False, + ) + field_name = Column(String(100), nullable=False) + old_value = Column(Text) + new_value = Column(Text) + changed_at = Column(DateTime(timezone=True), default=datetime.utcnow) + changed_by = Column(String(255), nullable=False) + + __table_args__ = ( + Index("idx_package_history_package_id", "package_id"), + Index("idx_package_history_changed_at", "changed_at"), + Index("idx_package_history_package_changed_at", "package_id", "changed_at"), + ) diff --git a/backend/app/routes.py b/backend/app/routes.py index 
c06c394..7975746 100644 --- a/backend/app/routes.py +++ b/backend/app/routes.py @@ -39,13 +39,16 @@ from .models import ( Tag, TagHistory, Upload, + UploadLock, Consumer, AuditLog, ) from .schemas import ( ProjectCreate, + ProjectUpdate, ProjectResponse, PackageCreate, + PackageUpdate, PackageResponse, PackageDetailResponse, TagSummary, @@ -58,6 +61,10 @@ from .schemas import ( TagResponse, TagDetailResponse, TagHistoryResponse, + TagHistoryDetailResponse, + AuditLogResponse, + UploadHistoryResponse, + ArtifactProvenanceResponse, UploadResponse, ConsumerResponse, HealthResponse, @@ -76,6 +83,7 @@ from .schemas import ( PresignedUrlResponse, GarbageCollectionResponse, OrphanedArtifactResponse, + ConsistencyCheckResponse, StorageStatsResponse, DeduplicationStatsResponse, ProjectStatsResponse, @@ -84,6 +92,8 @@ from .schemas import ( CrossProjectDeduplicationResponse, TimeBasedStatsResponse, StatsReportResponse, + GlobalArtifactResponse, + GlobalTagResponse, ) from .metadata import extract_metadata from .config import get_settings @@ -91,6 +101,18 @@ from .config import get_settings router = APIRouter() +def sanitize_filename(filename: str) -> str: + """Sanitize filename for use in Content-Disposition header. 
+ + Removes characters that could enable header injection attacks: + - Double quotes (") - could break out of quoted filename + - Carriage return (\\r) and newline (\\n) - could inject headers + """ + import re + + return re.sub(r'[\r\n"]', "", filename) + + def get_user_id(request: Request) -> str: """Extract user ID from request (simplified for now)""" api_key = request.headers.get("X-Orchard-API-Key") @@ -103,6 +125,7 @@ def get_user_id(request: Request) -> str: import logging +import time logger = logging.getLogger(__name__) @@ -478,6 +501,7 @@ def list_projects( limit=limit, total=total, total_pages=total_pages, + has_more=page < total_pages, ), ) @@ -499,6 +523,17 @@ def create_project( created_by=user_id, ) db.add(db_project) + + # Audit log + _log_audit( + db=db, + action="project.create", + resource=f"project/{project.name}", + user_id=user_id, + source_ip=request.client.host if request.client else None, + details={"is_public": project.is_public}, + ) + db.commit() db.refresh(db_project) return db_project @@ -512,6 +547,60 @@ def get_project(project_name: str, db: Session = Depends(get_db)): return project +@router.put("/api/v1/projects/{project_name}", response_model=ProjectResponse) +def update_project( + project_name: str, + project_update: ProjectUpdate, + request: Request, + db: Session = Depends(get_db), +): + """Update a project's metadata.""" + user_id = get_user_id(request) + + project = db.query(Project).filter(Project.name == project_name).first() + if not project: + raise HTTPException(status_code=404, detail="Project not found") + + # Track changes for audit log + changes = {} + if ( + project_update.description is not None + and project_update.description != project.description + ): + changes["description"] = { + "old": project.description, + "new": project_update.description, + } + project.description = project_update.description + if ( + project_update.is_public is not None + and project_update.is_public != project.is_public + ): + 
changes["is_public"] = { + "old": project.is_public, + "new": project_update.is_public, + } + project.is_public = project_update.is_public + + if not changes: + # No changes, return current project + return project + + # Audit log + _log_audit( + db=db, + action="project.update", + resource=f"project/{project_name}", + user_id=user_id, + source_ip=request.client.host if request.client else None, + details={"changes": changes}, + ) + + db.commit() + db.refresh(project) + return project + + @router.delete("/api/v1/projects/{project_name}", status_code=204) def delete_project( project_name: str, @@ -555,7 +644,7 @@ def delete_project( # Audit log (after commit) _log_audit( db, - action="delete_project", + action="project.delete", resource=f"project/{project_name}", user_id=user_id, source_ip=request.client.host if request.client else None, @@ -740,6 +829,7 @@ def list_packages( limit=limit, total=total, total_pages=total_pages, + has_more=page < total_pages, ), ) @@ -835,7 +925,10 @@ def get_package( @router.post("/api/v1/project/{project_name}/packages", response_model=PackageResponse) def create_package( - project_name: str, package: PackageCreate, db: Session = Depends(get_db) + project_name: str, + package: PackageCreate, + request: Request, + db: Session = Depends(get_db), ): project = db.query(Project).filter(Project.name == project_name).first() if not project: @@ -873,11 +966,106 @@ def create_package( platform=package.platform, ) db.add(db_package) + + # Audit log + _log_audit( + db=db, + action="package.create", + resource=f"project/{project_name}/{package.name}", + user_id=get_user_id(request), + source_ip=request.client.host if request.client else None, + details={"format": package.format, "platform": package.platform}, + ) + db.commit() db.refresh(db_package) return db_package +@router.put( + "/api/v1/project/{project_name}/packages/{package_name}", + response_model=PackageResponse, +) +def update_package( + project_name: str, + package_name: str, + 
package_update: PackageUpdate, + request: Request, + db: Session = Depends(get_db), +): + """Update a package's metadata.""" + user_id = get_user_id(request) + + project = db.query(Project).filter(Project.name == project_name).first() + if not project: + raise HTTPException(status_code=404, detail="Project not found") + + package = ( + db.query(Package) + .filter(Package.project_id == project.id, Package.name == package_name) + .first() + ) + if not package: + raise HTTPException(status_code=404, detail="Package not found") + + # Validate format and platform if provided + if ( + package_update.format is not None + and package_update.format not in PACKAGE_FORMATS + ): + raise HTTPException( + status_code=400, + detail=f"Invalid format. Must be one of: {', '.join(PACKAGE_FORMATS)}", + ) + if ( + package_update.platform is not None + and package_update.platform not in PACKAGE_PLATFORMS + ): + raise HTTPException( + status_code=400, + detail=f"Invalid platform. Must be one of: {', '.join(PACKAGE_PLATFORMS)}", + ) + + # Track changes for audit log + changes = {} + if ( + package_update.description is not None + and package_update.description != package.description + ): + changes["description"] = { + "old": package.description, + "new": package_update.description, + } + package.description = package_update.description + if package_update.format is not None and package_update.format != package.format: + changes["format"] = {"old": package.format, "new": package_update.format} + package.format = package_update.format + if ( + package_update.platform is not None + and package_update.platform != package.platform + ): + changes["platform"] = {"old": package.platform, "new": package_update.platform} + package.platform = package_update.platform + + if not changes: + # No changes, return current package + return package + + # Audit log + _log_audit( + db=db, + action="package.update", + resource=f"project/{project_name}/{package_name}", + user_id=user_id, + 
source_ip=request.client.host if request.client else None, + details={"changes": changes}, + ) + + db.commit() + db.refresh(package) + return package + + @router.delete( "/api/v1/project/{project_name}/packages/{package_name}", status_code=204, @@ -927,7 +1115,7 @@ def delete_package( # Audit log (after commit) _log_audit( db, - action="delete_package", + action="package.delete", resource=f"project/{project_name}/{package_name}", user_id=user_id, source_ip=request.client.host if request.client else None, @@ -955,8 +1143,20 @@ def upload_artifact( db: Session = Depends(get_db), storage: S3Storage = Depends(get_storage), content_length: Optional[int] = Header(None, alias="Content-Length"), + user_agent: Optional[str] = Header(None, alias="User-Agent"), + client_checksum: Optional[str] = Header(None, alias="X-Checksum-SHA256"), ): + """ + Upload an artifact to a package. + + Headers: + - X-Checksum-SHA256: Optional client-provided SHA256 for verification + - User-Agent: Captured for audit purposes + """ + start_time = time.time() user_id = get_user_id(request) + settings = get_settings() + storage_result = None # Get project and package project = db.query(Project).filter(Project.name == project_name).first() @@ -972,7 +1172,6 @@ def upload_artifact( raise HTTPException(status_code=404, detail="Package not found") # Validate file size - settings = get_settings() if content_length is not None: if content_length > settings.max_file_size: raise HTTPException( @@ -985,6 +1184,17 @@ def upload_artifact( detail="Empty files are not allowed", ) + # Validate client checksum format if provided + if client_checksum: + client_checksum = client_checksum.lower().strip() + if len(client_checksum) != 64 or not all( + c in "0123456789abcdef" for c in client_checksum + ): + raise HTTPException( + status_code=400, + detail="Invalid X-Checksum-SHA256 header. 
Must be 64 hex characters.", + ) + # Extract format-specific metadata before storing file_metadata = {} if file.filename: @@ -1041,6 +1251,55 @@ def upload_artifact( logger.error(f"Storage error during upload: {e}") raise HTTPException(status_code=500, detail="Internal storage error") + # Verify client-provided checksum if present + checksum_verified = True + if client_checksum and client_checksum != storage_result.sha256: + # Checksum mismatch - clean up S3 object if it was newly uploaded + logger.warning( + f"Client checksum mismatch: expected {client_checksum}, got {storage_result.sha256}" + ) + # Attempt cleanup of the uploaded object + try: + if not storage_result.already_existed: + storage.delete(storage_result.s3_key) + logger.info( + f"Cleaned up S3 object after checksum mismatch: {storage_result.s3_key}" + ) + except Exception as cleanup_error: + logger.error( + f"Failed to clean up S3 object after checksum mismatch: {cleanup_error}" + ) + raise HTTPException( + status_code=422, + detail=f"Checksum verification failed. 
Expected {client_checksum}, got {storage_result.sha256}", + ) + + # Verify S3 object exists and size matches before proceeding + try: + s3_info = storage.get_object_info(storage_result.s3_key) + if s3_info is None: + raise HTTPException( + status_code=500, + detail="Failed to verify uploaded object in storage", + ) + if s3_info.get("size") != storage_result.size: + logger.error( + f"Size mismatch after upload: expected {storage_result.size}, " + f"got {s3_info.get('size')}" + ) + raise HTTPException( + status_code=500, + detail="Upload verification failed: size mismatch", + ) + except HTTPException: + raise + except Exception as e: + logger.error(f"Failed to verify S3 object: {e}") + raise HTTPException( + status_code=500, + detail="Failed to verify uploaded object", + ) + # Check if this is a deduplicated upload deduplicated = False saved_bytes = 0 @@ -1092,16 +1351,26 @@ def upload_artifact( ) db.add(artifact) - # Record upload + # Calculate upload duration + duration_ms = int((time.time() - start_time) * 1000) + + # Record upload with enhanced metadata upload = Upload( artifact_id=storage_result.sha256, package_id=package.id, original_name=file.filename, + tag_name=tag, + user_agent=user_agent[:512] if user_agent else None, # Truncate if too long + duration_ms=duration_ms, + deduplicated=deduplicated, + checksum_verified=checksum_verified, + client_checksum=client_checksum, + status="completed", uploaded_by=user_id, source_ip=request.client.host if request.client else None, - deduplicated=deduplicated, ) db.add(upload) + db.flush() # Flush to get upload ID # Create or update tag if provided (with ref_count management and history) if tag: @@ -1117,7 +1386,7 @@ def upload_artifact( # Audit log _log_audit( db, - action="upload", + action="artifact.upload", resource=f"project/{project_name}/{package_name}/artifact/{storage_result.sha256[:12]}", user_id=user_id, source_ip=request.client.host if request.client else None, @@ -1127,10 +1396,32 @@ def upload_artifact( 
"deduplicated": deduplicated, "saved_bytes": saved_bytes, "tag": tag, + "duration_ms": duration_ms, + "client_checksum_provided": client_checksum is not None, }, ) - db.commit() + # Commit with cleanup on failure + try: + db.commit() + except Exception as commit_error: + logger.error(f"Database commit failed after upload: {commit_error}") + db.rollback() + # Attempt to clean up newly uploaded S3 object + if storage_result and not storage_result.already_existed: + try: + storage.delete(storage_result.s3_key) + logger.info( + f"Cleaned up S3 object after commit failure: {storage_result.s3_key}" + ) + except Exception as cleanup_error: + logger.error( + f"Failed to clean up S3 object after commit failure: {cleanup_error}" + ) + raise HTTPException( + status_code=500, + detail="Failed to save upload record. Please retry.", + ) return UploadResponse( artifact_id=storage_result.sha256, @@ -1145,6 +1436,10 @@ def upload_artifact( format_metadata=artifact.artifact_metadata, deduplicated=deduplicated, ref_count=artifact.ref_count, + upload_id=upload.id, + content_type=artifact.content_type, + original_name=artifact.original_name, + created_at=artifact.created_at, ) @@ -1231,7 +1526,7 @@ def init_resumable_upload( # Audit log _log_audit( db, - action="upload", + action="artifact.upload", resource=f"project/{project_name}/{package_name}/artifact/{init_request.expected_hash[:12]}", user_id=user_id, source_ip=request.client.host if request.client else None, @@ -1517,7 +1812,24 @@ def download_artifact( if not artifact: raise HTTPException(status_code=404, detail="Artifact not found") - filename = artifact.original_name or f"{artifact.id}" + filename = sanitize_filename(artifact.original_name or f"{artifact.id}") + + # Audit log download + user_id = get_user_id(request) + _log_audit( + db=db, + action="artifact.download", + resource=f"project/{project_name}/{package_name}/artifact/{artifact.id[:12]}", + user_id=user_id, + source_ip=request.client.host if request.client else 
None, + details={ + "artifact_id": artifact.id, + "ref": ref, + "size": artifact.size, + "original_name": artifact.original_name, + }, + ) + db.commit() # Determine download mode (query param overrides server default) download_mode = mode or settings.download_mode @@ -1630,7 +1942,7 @@ def get_artifact_url( if not artifact: raise HTTPException(status_code=404, detail="Artifact not found") - filename = artifact.original_name or f"{artifact.id}" + filename = sanitize_filename(artifact.original_name or f"{artifact.id}") url_expiry = expiry or settings.presigned_url_expiry presigned_url = storage.generate_presigned_url( @@ -1681,7 +1993,7 @@ def head_artifact( if not artifact: raise HTTPException(status_code=404, detail="Artifact not found") - filename = artifact.original_name or f"{artifact.id}" + filename = sanitize_filename(artifact.original_name or f"{artifact.id}") return Response( content=b"", @@ -1724,6 +2036,12 @@ def list_tags( search: Optional[str] = Query(default=None, description="Search by tag name"), sort: str = Query(default="name", description="Sort field (name, created_at)"), order: str = Query(default="asc", description="Sort order (asc, desc)"), + from_date: Optional[datetime] = Query( + default=None, alias="from", description="Filter tags created after this date" + ), + to_date: Optional[datetime] = Query( + default=None, alias="to", description="Filter tags created before this date" + ), db: Session = Depends(get_db), ): project = db.query(Project).filter(Project.name == project_name).first() @@ -1769,6 +2087,12 @@ def list_tags( ) ) + # Apply date range filters + if from_date: + query = query.filter(Tag.created_at >= from_date) + if to_date: + query = query.filter(Tag.created_at <= to_date) + # Get total count before pagination total = query.count() @@ -1812,6 +2136,7 @@ def list_tags( limit=limit, total=total, total_pages=total_pages, + has_more=page < total_pages, ), ) @@ -1850,8 +2175,23 @@ def create_tag( db.query(Tag).filter(Tag.package_id == 
package.id, Tag.name == tag.name).first() ) if existing: + old_artifact_id = existing.artifact_id existing.artifact_id = tag.artifact_id existing.created_by = user_id + + # Audit log for tag update + _log_audit( + db=db, + action="tag.update", + resource=f"project/{project_name}/{package_name}/tag/{tag.name}", + user_id=user_id, + source_ip=request.client.host if request.client else None, + details={ + "old_artifact_id": old_artifact_id, + "new_artifact_id": tag.artifact_id, + }, + ) + db.commit() db.refresh(existing) return existing @@ -1863,6 +2203,17 @@ def create_tag( created_by=user_id, ) db.add(db_tag) + + # Audit log for tag create + _log_audit( + db=db, + action="tag.create", + resource=f"project/{project_name}/{package_name}/tag/{tag.name}", + user_id=user_id, + source_ip=request.client.host if request.client else None, + details={"artifact_id": tag.artifact_id}, + ) + db.commit() db.refresh(db_tag) return db_tag @@ -1919,15 +2270,17 @@ def get_tag( @router.get( "/api/v1/project/{project_name}/{package_name}/tags/{tag_name}/history", - response_model=List[TagHistoryResponse], + response_model=PaginatedResponse[TagHistoryDetailResponse], ) def get_tag_history( project_name: str, package_name: str, tag_name: str, + page: int = Query(default=1, ge=1), + limit: int = Query(default=20, ge=1, le=100), db: Session = Depends(get_db), ): - """Get the history of artifact assignments for a tag""" + """Get the history of artifact assignments for a tag with artifact metadata""" project = db.query(Project).filter(Project.name == project_name).first() if not project: raise HTTPException(status_code=404, detail="Project not found") @@ -1946,13 +2299,53 @@ def get_tag_history( if not tag: raise HTTPException(status_code=404, detail="Tag not found") - history = ( - db.query(TagHistory) + # Get total count + total = ( + db.query(func.count(TagHistory.id)).filter(TagHistory.tag_id == tag.id).scalar() + or 0 + ) + + # Get paginated history with artifact metadata + offset = 
(page - 1) * limit + history_items = ( + db.query(TagHistory, Artifact) + .outerjoin(Artifact, TagHistory.new_artifact_id == Artifact.id) .filter(TagHistory.tag_id == tag.id) .order_by(TagHistory.changed_at.desc()) + .offset(offset) + .limit(limit) .all() ) - return history + + # Build response with artifact metadata + items = [] + for history, artifact in history_items: + items.append( + TagHistoryDetailResponse( + id=history.id, + tag_id=history.tag_id, + tag_name=tag.name, + old_artifact_id=history.old_artifact_id, + new_artifact_id=history.new_artifact_id, + changed_at=history.changed_at, + changed_by=history.changed_by, + artifact_size=artifact.size if artifact else 0, + artifact_original_name=artifact.original_name if artifact else None, + artifact_content_type=artifact.content_type if artifact else None, + ) + ) + + total_pages = math.ceil(total / limit) if limit > 0 else 0 + return PaginatedResponse( + items=items, + pagination=PaginationMeta( + page=page, + limit=limit, + total=total, + total_pages=total_pages, + has_more=page < total_pages, + ), + ) @router.delete( @@ -2016,7 +2409,7 @@ def delete_tag( artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first() _log_audit( db, - action="delete_tag", + action="tag.delete", resource=f"project/{project_name}/{package_name}/tag/{tag_name}", user_id=user_id, source_ip=request.client.host if request.client else None, @@ -2076,9 +2469,19 @@ def list_package_artifacts( created_before: Optional[datetime] = Query( default=None, description="Filter artifacts created before this date" ), + min_size: Optional[int] = Query( + default=None, ge=0, description="Minimum artifact size in bytes" + ), + max_size: Optional[int] = Query( + default=None, ge=0, description="Maximum artifact size in bytes" + ), + sort: Optional[str] = Query( + default=None, description="Sort field: created_at, size, original_name" + ), + order: Optional[str] = Query(default="desc", description="Sort order: asc or desc"), db: Session = 
Depends(get_db), ): - """List all unique artifacts uploaded to a package""" + """List all unique artifacts uploaded to a package with filtering and sorting.""" project = db.query(Project).filter(Project.name == project_name).first() if not project: raise HTTPException(status_code=404, detail="Project not found") @@ -2110,14 +2513,38 @@ def list_package_artifacts( if created_before: query = query.filter(Artifact.created_at <= created_before) + # Apply size range filters + if min_size is not None: + query = query.filter(Artifact.size >= min_size) + if max_size is not None: + query = query.filter(Artifact.size <= max_size) + + # Validate and apply sorting + valid_sort_fields = { + "created_at": Artifact.created_at, + "size": Artifact.size, + "original_name": Artifact.original_name, + } + if sort and sort not in valid_sort_fields: + raise HTTPException( + status_code=400, + detail=f"Invalid sort field. Valid options: {', '.join(valid_sort_fields.keys())}", + ) + sort_column = valid_sort_fields.get(sort, Artifact.created_at) + if order and order.lower() not in ("asc", "desc"): + raise HTTPException( + status_code=400, detail="Invalid order. 
Valid options: asc, desc" + ) + sort_order = ( + sort_column.asc() if order and order.lower() == "asc" else sort_column.desc() + ) + # Get total count before pagination total = query.count() # Apply pagination offset = (page - 1) * limit - artifacts = ( - query.order_by(Artifact.created_at.desc()).offset(offset).limit(limit).all() - ) + artifacts = query.order_by(sort_order).offset(offset).limit(limit).all() # Calculate total pages total_pages = math.ceil(total / limit) if total > 0 else 1 @@ -2153,6 +2580,267 @@ def list_package_artifacts( limit=limit, total=total, total_pages=total_pages, + has_more=page < total_pages, + ), + ) + + +# Global artifacts listing +@router.get( + "/api/v1/artifacts", + response_model=PaginatedResponse[GlobalArtifactResponse], +) +def list_all_artifacts( + project: Optional[str] = Query(None, description="Filter by project name"), + package: Optional[str] = Query(None, description="Filter by package name"), + tag: Optional[str] = Query( + None, + description="Filter by tag name. Supports wildcards (*) and comma-separated values", + ), + content_type: Optional[str] = Query(None, description="Filter by content type"), + min_size: Optional[int] = Query(None, ge=0, description="Minimum size in bytes"), + max_size: Optional[int] = Query(None, ge=0, description="Maximum size in bytes"), + from_date: Optional[datetime] = Query( + None, alias="from", description="Created after" + ), + to_date: Optional[datetime] = Query(None, alias="to", description="Created before"), + sort: Optional[str] = Query(None, description="Sort field: created_at, size"), + order: Optional[str] = Query("desc", description="Sort order: asc or desc"), + page: int = Query(1, ge=1), + limit: int = Query(20, ge=1, le=100), + db: Session = Depends(get_db), +): + """ + List all artifacts globally with filtering by project, package, tag, etc. + + Returns artifacts with context about which projects/packages/tags reference them. 
+ """ + # Start with base query + query = db.query(Artifact) + + # If filtering by project/package/tag, need to join through tags + if project or package or tag: + # Subquery to get artifact IDs that match the filters + tag_query = ( + db.query(Tag.artifact_id) + .join(Package, Tag.package_id == Package.id) + .join(Project, Package.project_id == Project.id) + ) + if project: + tag_query = tag_query.filter(Project.name == project) + if package: + tag_query = tag_query.filter(Package.name == package) + if tag: + # Support multiple values (comma-separated) and wildcards (*) + tag_values = [t.strip() for t in tag.split(",") if t.strip()] + if len(tag_values) == 1: + tag_val = tag_values[0] + if "*" in tag_val: + # Wildcard: convert * to SQL LIKE % + tag_query = tag_query.filter( + Tag.name.ilike(tag_val.replace("*", "%")) + ) + else: + tag_query = tag_query.filter(Tag.name == tag_val) + else: + # Multiple values: check if any match (with wildcard support) + tag_conditions = [] + for tag_val in tag_values: + if "*" in tag_val: + tag_conditions.append(Tag.name.ilike(tag_val.replace("*", "%"))) + else: + tag_conditions.append(Tag.name == tag_val) + tag_query = tag_query.filter(or_(*tag_conditions)) + artifact_ids = tag_query.distinct().subquery() + query = query.filter(Artifact.id.in_(artifact_ids)) + + # Apply content type filter + if content_type: + query = query.filter(Artifact.content_type == content_type) + + # Apply size filters + if min_size is not None: + query = query.filter(Artifact.size >= min_size) + if max_size is not None: + query = query.filter(Artifact.size <= max_size) + + # Apply date filters + if from_date: + query = query.filter(Artifact.created_at >= from_date) + if to_date: + query = query.filter(Artifact.created_at <= to_date) + + # Validate and apply sorting + valid_sort_fields = {"created_at": Artifact.created_at, "size": Artifact.size} + if sort and sort not in valid_sort_fields: + raise HTTPException( + status_code=400, + detail=f"Invalid sort 
field. Valid options: {', '.join(valid_sort_fields.keys())}", + ) + sort_column = valid_sort_fields.get(sort, Artifact.created_at) + if order and order.lower() not in ("asc", "desc"): + raise HTTPException( + status_code=400, detail="Invalid order. Valid options: asc, desc" + ) + sort_order = ( + sort_column.asc() if order and order.lower() == "asc" else sort_column.desc() + ) + + total = query.count() + total_pages = math.ceil(total / limit) if total > 0 else 1 + + artifacts = query.order_by(sort_order).offset((page - 1) * limit).limit(limit).all() + + # Build responses with context + items = [] + for artifact in artifacts: + # Get all tags referencing this artifact with project/package info + tags_info = ( + db.query(Tag, Package, Project) + .join(Package, Tag.package_id == Package.id) + .join(Project, Package.project_id == Project.id) + .filter(Tag.artifact_id == artifact.id) + .all() + ) + + projects = list(set(proj.name for _, _, proj in tags_info)) + packages = list(set(f"{proj.name}/{pkg.name}" for _, pkg, proj in tags_info)) + tags = [f"{proj.name}/{pkg.name}:{t.name}" for t, pkg, proj in tags_info] + + items.append( + GlobalArtifactResponse( + id=artifact.id, + sha256=artifact.id, + size=artifact.size, + content_type=artifact.content_type, + original_name=artifact.original_name, + created_at=artifact.created_at, + created_by=artifact.created_by, + format_metadata=artifact.artifact_metadata, + ref_count=artifact.ref_count, + projects=projects, + packages=packages, + tags=tags, + ) + ) + + return PaginatedResponse( + items=items, + pagination=PaginationMeta( + page=page, + limit=limit, + total=total, + total_pages=total_pages, + has_more=page < total_pages, + ), + ) + + +# Global tags listing +@router.get( + "/api/v1/tags", + response_model=PaginatedResponse[GlobalTagResponse], +) +def list_all_tags( + project: Optional[str] = Query(None, description="Filter by project name"), + package: Optional[str] = Query(None, description="Filter by package name"), + 
search: Optional[str] = Query( + None, + description="Search by tag name. Supports wildcards (*) and comma-separated values", + ), + from_date: Optional[datetime] = Query( + None, alias="from", description="Created after" + ), + to_date: Optional[datetime] = Query(None, alias="to", description="Created before"), + sort: Optional[str] = Query(None, description="Sort field: name, created_at"), + order: Optional[str] = Query("desc", description="Sort order: asc or desc"), + page: int = Query(1, ge=1), + limit: int = Query(20, ge=1, le=100), + db: Session = Depends(get_db), +): + """ + List all tags globally with filtering by project, package, name, etc. + """ + query = ( + db.query(Tag, Package, Project, Artifact) + .join(Package, Tag.package_id == Package.id) + .join(Project, Package.project_id == Project.id) + .join(Artifact, Tag.artifact_id == Artifact.id) + ) + + # Apply filters + if project: + query = query.filter(Project.name == project) + if package: + query = query.filter(Package.name == package) + if search: + # Support multiple values (comma-separated) and wildcards (*) + search_values = [s.strip() for s in search.split(",") if s.strip()] + if len(search_values) == 1: + search_val = search_values[0] + if "*" in search_val: + query = query.filter(Tag.name.ilike(search_val.replace("*", "%"))) + else: + query = query.filter(Tag.name.ilike(f"%{search_val}%")) + else: + search_conditions = [] + for search_val in search_values: + if "*" in search_val: + search_conditions.append( + Tag.name.ilike(search_val.replace("*", "%")) + ) + else: + search_conditions.append(Tag.name.ilike(f"%{search_val}%")) + query = query.filter(or_(*search_conditions)) + if from_date: + query = query.filter(Tag.created_at >= from_date) + if to_date: + query = query.filter(Tag.created_at <= to_date) + + # Validate and apply sorting + valid_sort_fields = {"name": Tag.name, "created_at": Tag.created_at} + if sort and sort not in valid_sort_fields: + raise HTTPException( + status_code=400, + 
detail=f"Invalid sort field. Valid options: {', '.join(valid_sort_fields.keys())}", + ) + sort_column = valid_sort_fields.get(sort, Tag.created_at) + if order and order.lower() not in ("asc", "desc"): + raise HTTPException( + status_code=400, detail="Invalid order. Valid options: asc, desc" + ) + sort_order = ( + sort_column.asc() if order and order.lower() == "asc" else sort_column.desc() + ) + + total = query.count() + total_pages = math.ceil(total / limit) if total > 0 else 1 + + results = query.order_by(sort_order).offset((page - 1) * limit).limit(limit).all() + + items = [ + GlobalTagResponse( + id=tag.id, + name=tag.name, + artifact_id=tag.artifact_id, + created_at=tag.created_at, + created_by=tag.created_by, + project_name=proj.name, + package_name=pkg.name, + artifact_size=artifact.size, + artifact_content_type=artifact.content_type, + ) + for tag, pkg, proj, artifact in results + ] + + return PaginatedResponse( + items=items, + pagination=PaginationMeta( + page=page, + limit=limit, + total=total, + total_pages=total_pages, + has_more=page < total_pages, ), ) @@ -2322,6 +3010,109 @@ def garbage_collect( ) +@router.get( + "/api/v1/admin/consistency-check", + response_model=ConsistencyCheckResponse, +) +def check_consistency( + limit: int = Query( + default=100, ge=1, le=1000, description="Max items to report per category" + ), + db: Session = Depends(get_db), + storage: S3Storage = Depends(get_storage), +): + """ + Check consistency between database records and S3 storage. + + Reports: + - Orphaned S3 objects (in S3 but not in database) + - Missing S3 objects (in database but not in S3) + - Size mismatches (database size != S3 size) + + This is a read-only operation. Use garbage-collect to clean up issues. 
+ """ + orphaned_s3_keys = [] + missing_s3_keys = [] + size_mismatches = [] + + # Get all artifacts from database + artifacts = db.query(Artifact).all() + total_checked = len(artifacts) + + # Check each artifact exists in S3 and sizes match + for artifact in artifacts: + try: + s3_info = storage.get_object_info(artifact.s3_key) + if s3_info is None: + if len(missing_s3_keys) < limit: + missing_s3_keys.append(artifact.s3_key) + else: + s3_size = s3_info.get("size", 0) + if s3_size != artifact.size: + if len(size_mismatches) < limit: + size_mismatches.append( + { + "artifact_id": artifact.id, + "s3_key": artifact.s3_key, + "db_size": artifact.size, + "s3_size": s3_size, + } + ) + except Exception as e: + logger.error(f"Error checking S3 object {artifact.s3_key}: {e}") + if len(missing_s3_keys) < limit: + missing_s3_keys.append(artifact.s3_key) + + # Check for orphaned S3 objects (objects in S3 bucket but not in database) + # Note: This is expensive for large buckets, so we limit the scan + try: + # List objects in the fruits/ prefix (where artifacts are stored) + paginator = storage.client.get_paginator("list_objects_v2") + artifact_ids_in_db = {a.id for a in artifacts} + + objects_checked = 0 + for page in paginator.paginate( + Bucket=storage.bucket, Prefix="fruits/", MaxKeys=1000 + ): + if "Contents" not in page: + break + for obj in page["Contents"]: + objects_checked += 1 + # Extract hash from key: fruits/ab/cd/abcdef... 
+ key = obj["Key"] + parts = key.split("/") + if len(parts) == 4 and parts[0] == "fruits": + sha256_hash = parts[3] + if sha256_hash not in artifact_ids_in_db: + if len(orphaned_s3_keys) < limit: + orphaned_s3_keys.append(key) + + # Limit total objects checked + if objects_checked >= 10000: + break + if objects_checked >= 10000: + break + except Exception as e: + logger.error(f"Error listing S3 objects for consistency check: {e}") + + healthy = ( + len(orphaned_s3_keys) == 0 + and len(missing_s3_keys) == 0 + and len(size_mismatches) == 0 + ) + + return ConsistencyCheckResponse( + total_artifacts_checked=total_checked, + orphaned_s3_objects=len(orphaned_s3_keys), + missing_s3_objects=len(missing_s3_keys), + size_mismatches=len(size_mismatches), + healthy=healthy, + orphaned_s3_keys=orphaned_s3_keys, + missing_s3_keys=missing_s3_keys, + size_mismatch_artifacts=size_mismatches, + ) + + # ============================================================================= # Statistics Endpoints (ISSUE 34) # ============================================================================= @@ -3069,3 +3860,671 @@ Generated: {generated_at.strftime("%Y-%m-%d %H:%M:%S UTC")} indent=2, ), ) + + +# ============================================================================= +# Audit Log Endpoints +# ============================================================================= + + +@router.get("/api/v1/audit-logs", response_model=PaginatedResponse[AuditLogResponse]) +def list_audit_logs( + action: Optional[str] = Query(None, description="Filter by action type"), + resource: Optional[str] = Query(None, description="Filter by resource pattern"), + user_id: Optional[str] = Query(None, description="Filter by user"), + from_date: Optional[datetime] = Query(None, alias="from", description="Start date"), + to_date: Optional[datetime] = Query(None, alias="to", description="End date"), + page: int = Query(1, ge=1), + limit: int = Query(20, ge=1, le=100), + db: Session = Depends(get_db), +): + 
"""
+    List audit logs with filtering and pagination.
+
+    Filters:
+    - action: Filter by action type (e.g., 'project.create', 'artifact.upload')
+    - resource: Filter by resource pattern (partial match)
+    - user_id: Filter by user ID
+    - from/to: Filter by timestamp range
+    """
+    query = db.query(AuditLog)
+
+    if action:
+        query = query.filter(AuditLog.action == action)
+    if resource:
+        query = query.filter(AuditLog.resource.ilike(f"%{resource}%"))
+    if user_id:
+        query = query.filter(AuditLog.user_id == user_id)
+    if from_date:
+        query = query.filter(AuditLog.timestamp >= from_date)
+    if to_date:
+        query = query.filter(AuditLog.timestamp <= to_date)
+
+    total = query.count()
+    total_pages = math.ceil(total / limit) if total > 0 else 1
+
+    logs = (
+        query.order_by(AuditLog.timestamp.desc())
+        .offset((page - 1) * limit)
+        .limit(limit)
+        .all()
+    )
+
+    return PaginatedResponse(
+        items=logs,
+        pagination=PaginationMeta(
+            page=page,
+            limit=limit,
+            total=total,
+            total_pages=total_pages,
+            has_more=page < total_pages,
+        ),
+    )
+
+
+@router.get(
+    "/api/v1/projects/{project_name}/audit-logs",
+    response_model=PaginatedResponse[AuditLogResponse],
+)
+def list_project_audit_logs(
+    project_name: str,
+    action: Optional[str] = Query(None, description="Filter by action type"),
+    from_date: Optional[datetime] = Query(None, alias="from", description="Start date"),
+    to_date: Optional[datetime] = Query(None, alias="to", description="End date"),
+    page: int = Query(1, ge=1),
+    limit: int = Query(20, ge=1, le=100),
+    db: Session = Depends(get_db),
+):
+    """List audit logs for a specific project."""
+    project = db.query(Project).filter(Project.name == project_name).first()
+    if not project:
+        raise HTTPException(status_code=404, detail="Project not found")
+
+    # Match resources under this project (audit resources use the "project/" prefix)
+    resource_pattern = f"project/{project_name}%"
+    query = db.query(AuditLog).filter(AuditLog.resource.like(resource_pattern))
+
+    if action:
+        query = query.filter(AuditLog.action
== action)
+    if from_date:
+        query = query.filter(AuditLog.timestamp >= from_date)
+    if to_date:
+        query = query.filter(AuditLog.timestamp <= to_date)
+
+    total = query.count()
+    total_pages = math.ceil(total / limit) if total > 0 else 1
+
+    logs = (
+        query.order_by(AuditLog.timestamp.desc())
+        .offset((page - 1) * limit)
+        .limit(limit)
+        .all()
+    )
+
+    return PaginatedResponse(
+        items=logs,
+        pagination=PaginationMeta(
+            page=page,
+            limit=limit,
+            total=total,
+            total_pages=total_pages,
+            has_more=page < total_pages,
+        ),
+    )
+
+
+@router.get(
+    "/api/v1/project/{project_name}/{package_name}/audit-logs",
+    response_model=PaginatedResponse[AuditLogResponse],
+)
+def list_package_audit_logs(
+    project_name: str,
+    package_name: str,
+    action: Optional[str] = Query(None, description="Filter by action type"),
+    from_date: Optional[datetime] = Query(None, alias="from", description="Start date"),
+    to_date: Optional[datetime] = Query(None, alias="to", description="End date"),
+    page: int = Query(1, ge=1),
+    limit: int = Query(20, ge=1, le=100),
+    db: Session = Depends(get_db),
+):
+    """List audit logs for a specific package."""
+    project = db.query(Project).filter(Project.name == project_name).first()
+    if not project:
+        raise HTTPException(status_code=404, detail="Project not found")
+
+    package = (
+        db.query(Package)
+        .filter(Package.project_id == project.id, Package.name == package_name)
+        .first()
+    )
+    if not package:
+        raise HTTPException(status_code=404, detail="Package not found")
+
+    # Match resources under this package (audit resources use the "project/" prefix)
+    resource_pattern = f"project/{project_name}/{package_name}%"
+    query = db.query(AuditLog).filter(AuditLog.resource.like(resource_pattern))
+
+    if action:
+        query = query.filter(AuditLog.action == action)
+    if from_date:
+        query = query.filter(AuditLog.timestamp >= from_date)
+    if to_date:
+        query = query.filter(AuditLog.timestamp <= to_date)
+
+    total = query.count()
+    total_pages = math.ceil(total / limit) if total > 0 else 1
+
+    logs
= ( + query.order_by(AuditLog.timestamp.desc()) + .offset((page - 1) * limit) + .limit(limit) + .all() + ) + + return PaginatedResponse( + items=logs, + pagination=PaginationMeta( + page=page, + limit=limit, + total=total, + total_pages=total_pages, + has_more=page < total_pages, + ), + ) + + +# ============================================================================= +# Upload History Endpoints +# ============================================================================= + + +@router.get( + "/api/v1/uploads", + response_model=PaginatedResponse[UploadHistoryResponse], +) +def list_all_uploads( + request: Request, + project: Optional[str] = Query(None, description="Filter by project name"), + package: Optional[str] = Query(None, description="Filter by package name"), + uploaded_by: Optional[str] = Query(None, description="Filter by uploader"), + from_date: Optional[datetime] = Query(None, alias="from", description="Start date"), + to_date: Optional[datetime] = Query(None, alias="to", description="End date"), + deduplicated: Optional[bool] = Query( + None, description="Filter by deduplication status" + ), + search: Optional[str] = Query(None, description="Search by original filename"), + tag: Optional[str] = Query( + None, + description="Filter by tag name. Supports wildcards (*) and comma-separated values", + ), + sort: Optional[str] = Query( + None, description="Sort field: uploaded_at, original_name, size" + ), + order: Optional[str] = Query("desc", description="Sort order: asc or desc"), + page: int = Query(1, ge=1), + limit: int = Query(20, ge=1, le=100), + db: Session = Depends(get_db), +): + """ + List all upload events globally (admin endpoint). 
+ + Supports filtering by: + - project: Filter by project name + - package: Filter by package name (requires project) + - uploaded_by: Filter by user ID + - from/to: Filter by timestamp range + - deduplicated: Filter by deduplication status + - search: Search by original filename (case-insensitive) + - tag: Filter by tag name + """ + query = ( + db.query(Upload, Package, Project, Artifact) + .join(Package, Upload.package_id == Package.id) + .join(Project, Package.project_id == Project.id) + .join(Artifact, Upload.artifact_id == Artifact.id) + ) + + # Apply filters + if project: + query = query.filter(Project.name == project) + if package: + query = query.filter(Package.name == package) + if uploaded_by: + query = query.filter(Upload.uploaded_by == uploaded_by) + if from_date: + query = query.filter(Upload.uploaded_at >= from_date) + if to_date: + query = query.filter(Upload.uploaded_at <= to_date) + if deduplicated is not None: + query = query.filter(Upload.deduplicated == deduplicated) + if search: + query = query.filter(Upload.original_name.ilike(f"%{search}%")) + if tag: + # Support multiple values (comma-separated) and wildcards (*) + tag_values = [t.strip() for t in tag.split(",") if t.strip()] + if len(tag_values) == 1: + tag_val = tag_values[0] + if "*" in tag_val: + query = query.filter(Upload.tag_name.ilike(tag_val.replace("*", "%"))) + else: + query = query.filter(Upload.tag_name == tag_val) + else: + tag_conditions = [] + for tag_val in tag_values: + if "*" in tag_val: + tag_conditions.append( + Upload.tag_name.ilike(tag_val.replace("*", "%")) + ) + else: + tag_conditions.append(Upload.tag_name == tag_val) + query = query.filter(or_(*tag_conditions)) + + # Validate and apply sorting + valid_sort_fields = { + "uploaded_at": Upload.uploaded_at, + "original_name": Upload.original_name, + "size": Artifact.size, + } + if sort and sort not in valid_sort_fields: + raise HTTPException( + status_code=400, + detail=f"Invalid sort field. 
Valid options: {', '.join(valid_sort_fields.keys())}", + ) + sort_column = valid_sort_fields.get(sort, Upload.uploaded_at) + if order and order.lower() not in ("asc", "desc"): + raise HTTPException( + status_code=400, detail="Invalid order. Valid options: asc, desc" + ) + sort_order = ( + sort_column.asc() if order and order.lower() == "asc" else sort_column.desc() + ) + + total = query.count() + total_pages = math.ceil(total / limit) if total > 0 else 1 + + results = query.order_by(sort_order).offset((page - 1) * limit).limit(limit).all() + + items = [ + UploadHistoryResponse( + id=upload.id, + artifact_id=upload.artifact_id, + package_id=upload.package_id, + package_name=pkg.name, + project_name=proj.name, + original_name=upload.original_name, + tag_name=upload.tag_name, + uploaded_at=upload.uploaded_at, + uploaded_by=upload.uploaded_by, + source_ip=upload.source_ip, + deduplicated=upload.deduplicated or False, + artifact_size=artifact.size, + artifact_content_type=artifact.content_type, + ) + for upload, pkg, proj, artifact in results + ] + + return PaginatedResponse( + items=items, + pagination=PaginationMeta( + page=page, + limit=limit, + total=total, + total_pages=total_pages, + has_more=page < total_pages, + ), + ) + + +@router.get( + "/api/v1/project/{project_name}/uploads", + response_model=PaginatedResponse[UploadHistoryResponse], +) +def list_project_uploads( + project_name: str, + package: Optional[str] = Query(None, description="Filter by package name"), + uploaded_by: Optional[str] = Query(None, description="Filter by uploader"), + from_date: Optional[datetime] = Query(None, alias="from", description="Start date"), + to_date: Optional[datetime] = Query(None, alias="to", description="End date"), + deduplicated: Optional[bool] = Query( + None, description="Filter by deduplication status" + ), + page: int = Query(1, ge=1), + limit: int = Query(20, ge=1, le=100), + db: Session = Depends(get_db), +): + """ + List upload events for a specific project. 
+ + Supports filtering by: + - package: Filter by package name within the project + - uploaded_by: Filter by user ID + - from/to: Filter by timestamp range + - deduplicated: Filter by deduplication status + """ + project = db.query(Project).filter(Project.name == project_name).first() + if not project: + raise HTTPException(status_code=404, detail="Project not found") + + # Get all package IDs for this project + package_ids_query = db.query(Package.id).filter(Package.project_id == project.id) + + if package: + package_ids_query = package_ids_query.filter(Package.name == package) + + package_ids = package_ids_query.subquery() + + query = ( + db.query(Upload, Package, Artifact) + .join(Package, Upload.package_id == Package.id) + .join(Artifact, Upload.artifact_id == Artifact.id) + .filter(Upload.package_id.in_(package_ids)) + ) + + if uploaded_by: + query = query.filter(Upload.uploaded_by == uploaded_by) + if from_date: + query = query.filter(Upload.uploaded_at >= from_date) + if to_date: + query = query.filter(Upload.uploaded_at <= to_date) + if deduplicated is not None: + query = query.filter(Upload.deduplicated == deduplicated) + + total = query.count() + total_pages = math.ceil(total / limit) if total > 0 else 1 + + results = ( + query.order_by(Upload.uploaded_at.desc()) + .offset((page - 1) * limit) + .limit(limit) + .all() + ) + + items = [ + UploadHistoryResponse( + id=upload.id, + artifact_id=upload.artifact_id, + package_id=upload.package_id, + package_name=pkg.name, + project_name=project_name, + original_name=upload.original_name, + tag_name=upload.tag_name, + uploaded_at=upload.uploaded_at, + uploaded_by=upload.uploaded_by, + source_ip=upload.source_ip, + deduplicated=upload.deduplicated or False, + artifact_size=artifact.size, + artifact_content_type=artifact.content_type, + ) + for upload, pkg, artifact in results + ] + + return PaginatedResponse( + items=items, + pagination=PaginationMeta( + page=page, + limit=limit, + total=total, + 
total_pages=total_pages, + has_more=page < total_pages, + ), + ) + + +@router.get( + "/api/v1/project/{project_name}/{package_name}/uploads", + response_model=PaginatedResponse[UploadHistoryResponse], +) +def list_package_uploads( + project_name: str, + package_name: str, + from_date: Optional[datetime] = Query(None, alias="from", description="Start date"), + to_date: Optional[datetime] = Query(None, alias="to", description="End date"), + page: int = Query(1, ge=1), + limit: int = Query(20, ge=1, le=100), + db: Session = Depends(get_db), +): + """List upload events for a specific package.""" + project = db.query(Project).filter(Project.name == project_name).first() + if not project: + raise HTTPException(status_code=404, detail="Project not found") + + package = ( + db.query(Package) + .filter(Package.project_id == project.id, Package.name == package_name) + .first() + ) + if not package: + raise HTTPException(status_code=404, detail="Package not found") + + query = db.query(Upload).filter(Upload.package_id == package.id) + + if from_date: + query = query.filter(Upload.uploaded_at >= from_date) + if to_date: + query = query.filter(Upload.uploaded_at <= to_date) + + total = query.count() + total_pages = math.ceil(total / limit) if total > 0 else 1 + + uploads = ( + query.order_by(Upload.uploaded_at.desc()) + .offset((page - 1) * limit) + .limit(limit) + .all() + ) + + # Build response with artifact metadata + items = [] + for upload in uploads: + artifact = db.query(Artifact).filter(Artifact.id == upload.artifact_id).first() + items.append( + UploadHistoryResponse( + id=upload.id, + artifact_id=upload.artifact_id, + package_id=upload.package_id, + package_name=package_name, + project_name=project_name, + original_name=upload.original_name, + tag_name=upload.tag_name, + uploaded_at=upload.uploaded_at, + uploaded_by=upload.uploaded_by, + source_ip=upload.source_ip, + deduplicated=upload.deduplicated or False, + artifact_size=artifact.size if artifact else 0, + 
artifact_content_type=artifact.content_type if artifact else None, + ) + ) + + return PaginatedResponse( + items=items, + pagination=PaginationMeta( + page=page, + limit=limit, + total=total, + total_pages=total_pages, + has_more=page < total_pages, + ), + ) + + +@router.get( + "/api/v1/artifact/{artifact_id}/uploads", + response_model=PaginatedResponse[UploadHistoryResponse], +) +def list_artifact_uploads( + artifact_id: str, + page: int = Query(1, ge=1), + limit: int = Query(20, ge=1, le=100), + db: Session = Depends(get_db), +): + """List all upload events for a specific artifact.""" + artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first() + if not artifact: + raise HTTPException(status_code=404, detail="Artifact not found") + + query = db.query(Upload).filter(Upload.artifact_id == artifact_id) + + total = query.count() + total_pages = math.ceil(total / limit) if total > 0 else 1 + + uploads = ( + query.order_by(Upload.uploaded_at.desc()) + .offset((page - 1) * limit) + .limit(limit) + .all() + ) + + # Build response with package/project metadata + items = [] + for upload in uploads: + package = db.query(Package).filter(Package.id == upload.package_id).first() + project = ( + db.query(Project).filter(Project.id == package.project_id).first() + if package + else None + ) + items.append( + UploadHistoryResponse( + id=upload.id, + artifact_id=upload.artifact_id, + package_id=upload.package_id, + package_name=package.name if package else "unknown", + project_name=project.name if project else "unknown", + original_name=upload.original_name, + tag_name=upload.tag_name, + uploaded_at=upload.uploaded_at, + uploaded_by=upload.uploaded_by, + source_ip=upload.source_ip, + deduplicated=upload.deduplicated or False, + artifact_size=artifact.size, + artifact_content_type=artifact.content_type, + ) + ) + + return PaginatedResponse( + items=items, + pagination=PaginationMeta( + page=page, + limit=limit, + total=total, + total_pages=total_pages, + has_more=page 
< total_pages, + ), + ) + + +# ============================================================================= +# Artifact Provenance/History Endpoint +# ============================================================================= + + +@router.get( + "/api/v1/artifact/{artifact_id}/history", response_model=ArtifactProvenanceResponse +) +def get_artifact_provenance( + artifact_id: str, + db: Session = Depends(get_db), +): + """ + Get full provenance/history of an artifact. + + Returns: + - Artifact metadata + - First upload information + - All packages/tags referencing the artifact + - Complete upload history + """ + artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first() + if not artifact: + raise HTTPException(status_code=404, detail="Artifact not found") + + # Get all uploads for this artifact + uploads = ( + db.query(Upload) + .filter(Upload.artifact_id == artifact_id) + .order_by(Upload.uploaded_at.asc()) + .all() + ) + + # Get first upload info + first_upload = uploads[0] if uploads else None + + # Get all tags referencing this artifact + tags = db.query(Tag).filter(Tag.artifact_id == artifact_id).all() + + # Build package list with tags + package_map = {} # package_id -> {project_name, package_name, tag_names} + tag_list = [] + + for tag in tags: + package = db.query(Package).filter(Package.id == tag.package_id).first() + if package: + project = db.query(Project).filter(Project.id == package.project_id).first() + project_name = project.name if project else "unknown" + + # Add to package map + pkg_key = str(package.id) + if pkg_key not in package_map: + package_map[pkg_key] = { + "project_name": project_name, + "package_name": package.name, + "tag_names": [], + } + package_map[pkg_key]["tag_names"].append(tag.name) + + # Add to tag list + tag_list.append( + { + "project_name": project_name, + "package_name": package.name, + "tag_name": tag.name, + "created_at": tag.created_at.isoformat() + if tag.created_at + else None, + } + ) + + # Build 
upload history + upload_history = [] + for upload in uploads: + package = db.query(Package).filter(Package.id == upload.package_id).first() + project = ( + db.query(Project).filter(Project.id == package.project_id).first() + if package + else None + ) + upload_history.append( + { + "upload_id": str(upload.id), + "project_name": project.name if project else "unknown", + "package_name": package.name if package else "unknown", + "original_name": upload.original_name, + "tag_name": upload.tag_name, + "uploaded_at": upload.uploaded_at.isoformat() + if upload.uploaded_at + else None, + "uploaded_by": upload.uploaded_by, + "deduplicated": upload.deduplicated or False, + } + ) + + return ArtifactProvenanceResponse( + artifact_id=artifact.id, + sha256=artifact.id, + size=artifact.size, + content_type=artifact.content_type, + original_name=artifact.original_name, + created_at=artifact.created_at, + created_by=artifact.created_by, + ref_count=artifact.ref_count, + first_uploaded_at=first_upload.uploaded_at + if first_upload + else artifact.created_at, + first_uploaded_by=first_upload.uploaded_by + if first_upload + else artifact.created_by, + upload_count=len(uploads), + packages=list(package_map.values()), + tags=tag_list, + uploads=upload_history, + ) diff --git a/backend/app/schemas.py b/backend/app/schemas.py index 4c7db29..9bd3701 100644 --- a/backend/app/schemas.py +++ b/backend/app/schemas.py @@ -12,6 +12,7 @@ class PaginationMeta(BaseModel): limit: int total: int total_pages: int + has_more: bool = False # True if there are more pages after current page class PaginatedResponse(BaseModel, Generic[T]): @@ -39,6 +40,13 @@ class ProjectResponse(BaseModel): from_attributes = True +class ProjectUpdate(BaseModel): + """Schema for updating a project""" + + description: Optional[str] = None + is_public: Optional[bool] = None + + # Package format and platform enums PACKAGE_FORMATS = [ "generic", @@ -86,6 +94,14 @@ class PackageResponse(BaseModel): from_attributes = True +class 
PackageUpdate(BaseModel): + """Schema for updating a package""" + + description: Optional[str] = None + format: Optional[str] = None + platform: Optional[str] = None + + class TagSummary(BaseModel): """Lightweight tag info for embedding in package responses""" @@ -189,6 +205,93 @@ class TagHistoryResponse(BaseModel): from_attributes = True +class TagHistoryDetailResponse(BaseModel): + """Tag history with artifact metadata for each version""" + + id: UUID + tag_id: UUID + tag_name: str + old_artifact_id: Optional[str] + new_artifact_id: str + changed_at: datetime + changed_by: str + # Artifact metadata for new artifact + artifact_size: int + artifact_original_name: Optional[str] + artifact_content_type: Optional[str] + + class Config: + from_attributes = True + + +# Audit log schemas +class AuditLogResponse(BaseModel): + """Audit log entry response""" + + id: UUID + action: str + resource: str + user_id: str + details: Optional[Dict[str, Any]] + timestamp: datetime + source_ip: Optional[str] + + class Config: + from_attributes = True + + +# Upload history schemas +class UploadHistoryResponse(BaseModel): + """Upload event with artifact details""" + + id: UUID + artifact_id: str + package_id: UUID + package_name: str + project_name: str + original_name: Optional[str] + tag_name: Optional[str] + uploaded_at: datetime + uploaded_by: str + source_ip: Optional[str] + deduplicated: bool + # Artifact metadata + artifact_size: int + artifact_content_type: Optional[str] + + class Config: + from_attributes = True + + +# Artifact provenance schemas +class ArtifactProvenanceResponse(BaseModel): + """Full provenance/history of an artifact""" + + artifact_id: str + sha256: str + size: int + content_type: Optional[str] + original_name: Optional[str] + created_at: datetime + created_by: str + ref_count: int + # First upload info + first_uploaded_at: datetime + first_uploaded_by: str + # Usage statistics + upload_count: int + # References + packages: List[Dict[str, Any]] # List of 
{project_name, package_name, tag_names} + tags: List[ + Dict[str, Any] + ] # List of {project_name, package_name, tag_name, created_at} + # Upload history + uploads: List[Dict[str, Any]] # List of upload events + + class Config: + from_attributes = True + + class ArtifactTagInfo(BaseModel): """Tag info for embedding in artifact responses""" @@ -240,6 +343,44 @@ class PackageArtifactResponse(BaseModel): from_attributes = True +class GlobalArtifactResponse(BaseModel): + """Artifact with project/package context for global listing""" + + id: str + sha256: str + size: int + content_type: Optional[str] + original_name: Optional[str] + created_at: datetime + created_by: str + format_metadata: Optional[Dict[str, Any]] = None + ref_count: int = 0 + # Context from tags/packages + projects: List[str] = [] # List of project names containing this artifact + packages: List[str] = [] # List of "project/package" paths + tags: List[str] = [] # List of "project/package:tag" references + + class Config: + from_attributes = True + + +class GlobalTagResponse(BaseModel): + """Tag with project/package context for global listing""" + + id: UUID + name: str + artifact_id: str + created_at: datetime + created_by: str + project_name: str + package_name: str + artifact_size: Optional[int] = None + artifact_content_type: Optional[str] = None + + class Config: + from_attributes = True + + # Upload response class UploadResponse(BaseModel): artifact_id: str @@ -254,6 +395,11 @@ class UploadResponse(BaseModel): format_metadata: Optional[Dict[str, Any]] = None deduplicated: bool = False ref_count: int = 1 # Current reference count after this upload + # Enhanced metadata (Issue #19) + upload_id: Optional[UUID] = None # UUID of the upload record + content_type: Optional[str] = None + original_name: Optional[str] = None + created_at: Optional[datetime] = None # Resumable upload schemas @@ -440,6 +586,19 @@ class StorageStatsResponse(BaseModel): storage_saved_bytes: int # Bytes saved through 
deduplication +class ConsistencyCheckResponse(BaseModel): + """Result of S3/Database consistency check""" + + total_artifacts_checked: int + orphaned_s3_objects: int # Objects in S3 but not in DB + missing_s3_objects: int # Records in DB but not in S3 + size_mismatches: int # Records where DB size != S3 size + healthy: bool + orphaned_s3_keys: List[str] = [] # Limited list of orphaned S3 keys + missing_s3_keys: List[str] = [] # Limited list of missing S3 keys + size_mismatch_artifacts: List[Dict[str, Any]] = [] # Limited list of mismatches + + class DeduplicationStatsResponse(BaseModel): """Deduplication effectiveness statistics""" diff --git a/backend/app/services/artifact_cleanup.py b/backend/app/services/artifact_cleanup.py index d1e807d..0857155 100644 --- a/backend/app/services/artifact_cleanup.py +++ b/backend/app/services/artifact_cleanup.py @@ -6,7 +6,7 @@ from typing import List, Optional, Tuple from sqlalchemy.orm import Session import logging -from ..models import Artifact, Tag, Upload, Package +from ..models import Artifact, Tag from ..repositories.artifact import ArtifactRepository from ..repositories.tag import TagRepository from ..storage import S3Storage @@ -40,10 +40,14 @@ class ArtifactCleanupService: artifact = self.artifact_repo.get_by_sha256(artifact_id) if artifact: artifact = self.artifact_repo.decrement_ref_count(artifact) - logger.info(f"Decremented ref_count for artifact {artifact_id}: now {artifact.ref_count}") + logger.info( + f"Decremented ref_count for artifact {artifact_id}: now {artifact.ref_count}" + ) return artifact - def on_tag_updated(self, old_artifact_id: str, new_artifact_id: str) -> Tuple[Optional[Artifact], Optional[Artifact]]: + def on_tag_updated( + self, old_artifact_id: str, new_artifact_id: str + ) -> Tuple[Optional[Artifact], Optional[Artifact]]: """ Called when a tag is updated to point to a different artifact. Decrements ref_count for old artifact, increments for new (if different). 
@@ -58,13 +62,17 @@ class ArtifactCleanupService: old_artifact = self.artifact_repo.get_by_sha256(old_artifact_id) if old_artifact: old_artifact = self.artifact_repo.decrement_ref_count(old_artifact) - logger.info(f"Decremented ref_count for old artifact {old_artifact_id}: now {old_artifact.ref_count}") + logger.info( + f"Decremented ref_count for old artifact {old_artifact_id}: now {old_artifact.ref_count}" + ) # Increment new artifact ref_count new_artifact = self.artifact_repo.get_by_sha256(new_artifact_id) if new_artifact: new_artifact = self.artifact_repo.increment_ref_count(new_artifact) - logger.info(f"Incremented ref_count for new artifact {new_artifact_id}: now {new_artifact.ref_count}") + logger.info( + f"Incremented ref_count for new artifact {new_artifact_id}: now {new_artifact.ref_count}" + ) return old_artifact, new_artifact @@ -84,11 +92,15 @@ class ArtifactCleanupService: if artifact: self.artifact_repo.decrement_ref_count(artifact) affected_artifacts.append(tag.artifact_id) - logger.info(f"Decremented ref_count for artifact {tag.artifact_id} (package delete)") + logger.info( + f"Decremented ref_count for artifact {tag.artifact_id} (package delete)" + ) return affected_artifacts - def cleanup_orphaned_artifacts(self, batch_size: int = 100, dry_run: bool = False) -> List[str]: + def cleanup_orphaned_artifacts( + self, batch_size: int = 100, dry_run: bool = False + ) -> List[str]: """ Find and delete artifacts with ref_count = 0. 
@@ -116,7 +128,9 @@ class ArtifactCleanupService: # Then delete from database self.artifact_repo.delete(artifact) deleted_ids.append(artifact.id) - logger.info(f"Deleted orphaned artifact from database: {artifact.id}") + logger.info( + f"Deleted orphaned artifact from database: {artifact.id}" + ) except Exception as e: logger.error(f"Failed to delete artifact {artifact.id}: {e}") @@ -128,10 +142,12 @@ class ArtifactCleanupService: def get_orphaned_count(self) -> int: """Get count of artifacts with ref_count = 0.""" from sqlalchemy import func + return ( self.db.query(func.count(Artifact.id)) .filter(Artifact.ref_count == 0) - .scalar() or 0 + .scalar() + or 0 ) def verify_ref_counts(self, fix: bool = False) -> List[dict]: @@ -173,7 +189,9 @@ class ArtifactCleanupService: if fix: artifact.ref_count = max(actual_count, 1) - logger.warning(f"Fixed ref_count for artifact {artifact.id}: {mismatch['stored_ref_count']} -> {artifact.ref_count}") + logger.warning( + f"Fixed ref_count for artifact {artifact.id}: {mismatch['stored_ref_count']} -> {artifact.ref_count}" + ) if fix and mismatches: self.db.commit() diff --git a/backend/app/storage.py b/backend/app/storage.py index 99b4783..440dbaf 100644 --- a/backend/app/storage.py +++ b/backend/app/storage.py @@ -202,6 +202,9 @@ class StorageResult(NamedTuple): md5: Optional[str] = None sha1: Optional[str] = None s3_etag: Optional[str] = None + already_existed: bool = ( + False # True if artifact was deduplicated (S3 object already existed) + ) class S3StorageUnavailableError(StorageError): @@ -354,6 +357,7 @@ class S3Storage: md5=md5_hash, sha1=sha1_hash, s3_etag=s3_etag, + already_existed=exists, ) def _store_multipart(self, file: BinaryIO, content_length: int) -> StorageResult: @@ -433,6 +437,7 @@ class S3Storage: md5=md5_hash, sha1=sha1_hash, s3_etag=s3_etag, + already_existed=True, ) # Seek back to start for upload @@ -486,6 +491,7 @@ class S3Storage: md5=md5_hash, sha1=sha1_hash, s3_etag=s3_etag, + already_existed=False, 
) except Exception as e: @@ -535,6 +541,7 @@ class S3Storage: md5=md5_hash, sha1=sha1_hash, s3_etag=s3_etag, + already_existed=True, ) # Upload based on size @@ -615,6 +622,7 @@ class S3Storage: md5=md5_hash, sha1=sha1_hash, s3_etag=s3_etag, + already_existed=False, ) def initiate_resumable_upload(self, expected_hash: str) -> Dict[str, Any]: diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py index 605dfe3..34111d8 100644 --- a/backend/tests/conftest.py +++ b/backend/tests/conftest.py @@ -4,15 +4,14 @@ Test configuration and fixtures for Orchard backend tests. This module provides: - Database fixtures with test isolation - Mock S3 storage using moto -- Test data factories for common scenarios +- Shared pytest fixtures """ import os import pytest -import hashlib -from typing import Generator, BinaryIO -from unittest.mock import MagicMock, patch import io +from typing import Generator +from unittest.mock import MagicMock # Set test environment defaults before importing app modules # Use setdefault to NOT override existing env vars (from docker-compose) @@ -26,54 +25,27 @@ os.environ.setdefault("ORCHARD_S3_BUCKET", "test-bucket") os.environ.setdefault("ORCHARD_S3_ACCESS_KEY_ID", "test") os.environ.setdefault("ORCHARD_S3_SECRET_ACCESS_KEY", "test") - -# ============================================================================= -# Test Data Factories -# ============================================================================= - - -def create_test_file(content: bytes = None, size: int = 1024) -> io.BytesIO: - """ - Create a test file with known content. 
- - Args: - content: Specific content to use, or None to generate random-ish content - size: Size of generated content if content is None - - Returns: - BytesIO object with the content - """ - if content is None: - content = os.urandom(size) - return io.BytesIO(content) - - -def compute_sha256(content: bytes) -> str: - """Compute SHA256 hash of content as lowercase hex string.""" - return hashlib.sha256(content).hexdigest() - - -def compute_md5(content: bytes) -> str: - """Compute MD5 hash of content as lowercase hex string.""" - return hashlib.md5(content).hexdigest() - - -def compute_sha1(content: bytes) -> str: - """Compute SHA1 hash of content as lowercase hex string.""" - return hashlib.sha1(content).hexdigest() - - -# Known test data with pre-computed hashes -TEST_CONTENT_HELLO = b"Hello, World!" -TEST_HASH_HELLO = "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f" -TEST_MD5_HELLO = "65a8e27d8879283831b664bd8b7f0ad4" -TEST_SHA1_HELLO = "0a0a9f2a6772942557ab5355d76af442f8f65e01" - -TEST_CONTENT_EMPTY = b"" -# Note: Empty content should be rejected by the storage layer - -TEST_CONTENT_BINARY = bytes(range(256)) -TEST_HASH_BINARY = compute_sha256(TEST_CONTENT_BINARY) +# Re-export factory functions for backward compatibility +from tests.factories import ( + create_test_file, + compute_sha256, + compute_md5, + compute_sha1, + upload_test_file, + TEST_CONTENT_HELLO, + TEST_HASH_HELLO, + TEST_MD5_HELLO, + TEST_SHA1_HELLO, + TEST_CONTENT_EMPTY, + TEST_CONTENT_BINARY, + TEST_HASH_BINARY, + get_s3_client, + get_s3_bucket, + list_s3_objects_by_hash, + count_s3_objects_by_prefix, + s3_object_exists, + delete_s3_object_by_hash, +) # ============================================================================= @@ -289,126 +261,3 @@ def test_content(): content = f"test-content-{uuid.uuid4().hex}".encode() sha256 = compute_sha256(content) return (content, sha256) - - -def upload_test_file( - client, - project: str, - package: str, - content: bytes, - 
filename: str = "test.bin", - tag: str = None, -) -> dict: - """ - Helper function to upload a test file. - - Returns the upload response as a dict. - """ - files = {"file": (filename, io.BytesIO(content), "application/octet-stream")} - data = {} - if tag: - data["tag"] = tag - - response = client.post( - f"/api/v1/project/{project}/{package}/upload", - files=files, - data=data if data else None, - ) - assert response.status_code == 200, f"Upload failed: {response.text}" - return response.json() - - -# ============================================================================= -# S3 Direct Access Helpers (for integration tests) -# ============================================================================= - - -def get_s3_client(): - """ - Create a boto3 S3 client for direct S3 access in integration tests. - - Uses environment variables for configuration (same as the app). - Note: When running in container, S3 endpoint should be 'minio:9000' not 'localhost:9000'. - """ - import boto3 - from botocore.config import Config - - config = Config(s3={"addressing_style": "path"}) - - # Use the same endpoint as the app (minio:9000 in container, localhost:9000 locally) - endpoint = os.environ.get("ORCHARD_S3_ENDPOINT", "http://minio:9000") - - return boto3.client( - "s3", - endpoint_url=endpoint, - region_name=os.environ.get("ORCHARD_S3_REGION", "us-east-1"), - aws_access_key_id=os.environ.get("ORCHARD_S3_ACCESS_KEY_ID", "minioadmin"), - aws_secret_access_key=os.environ.get( - "ORCHARD_S3_SECRET_ACCESS_KEY", "minioadmin" - ), - config=config, - ) - - -def get_s3_bucket(): - """Get the S3 bucket name from environment.""" - return os.environ.get("ORCHARD_S3_BUCKET", "orchard-artifacts") - - -def list_s3_objects_by_hash(sha256_hash: str) -> list: - """ - List S3 objects that match a specific SHA256 hash. - - Uses the fruits/{hash[:2]}/{hash[2:4]}/{hash} key pattern. - Returns list of matching object keys. 
- """ - client = get_s3_client() - bucket = get_s3_bucket() - prefix = f"fruits/{sha256_hash[:2]}/{sha256_hash[2:4]}/{sha256_hash}" - - response = client.list_objects_v2(Bucket=bucket, Prefix=prefix) - - if "Contents" not in response: - return [] - - return [obj["Key"] for obj in response["Contents"]] - - -def count_s3_objects_by_prefix(prefix: str) -> int: - """ - Count S3 objects with a given prefix. - - Useful for checking if duplicate uploads created multiple objects. - """ - client = get_s3_client() - bucket = get_s3_bucket() - - response = client.list_objects_v2(Bucket=bucket, Prefix=prefix) - - if "Contents" not in response: - return 0 - - return len(response["Contents"]) - - -def s3_object_exists(sha256_hash: str) -> bool: - """ - Check if an S3 object exists for a given SHA256 hash. - """ - objects = list_s3_objects_by_hash(sha256_hash) - return len(objects) > 0 - - -def delete_s3_object_by_hash(sha256_hash: str) -> bool: - """ - Delete an S3 object by its SHA256 hash (for test cleanup). - """ - client = get_s3_client() - bucket = get_s3_bucket() - s3_key = f"fruits/{sha256_hash[:2]}/{sha256_hash[2:4]}/{sha256_hash}" - - try: - client.delete_object(Bucket=bucket, Key=s3_key) - return True - except Exception: - return False diff --git a/backend/tests/factories.py b/backend/tests/factories.py new file mode 100644 index 0000000..cd58f2a --- /dev/null +++ b/backend/tests/factories.py @@ -0,0 +1,288 @@ +""" +Test data factories for Orchard backend tests. + +This module provides factory functions for creating test data, +including test files, pre-computed hashes, and helper utilities. 
+""" + +import hashlib +import io +import os +import uuid +from typing import Optional + + +# ============================================================================= +# Hash Computation Utilities +# ============================================================================= + + +def compute_sha256(content: bytes) -> str: + """Compute SHA256 hash of content as lowercase hex string.""" + return hashlib.sha256(content).hexdigest() + + +def compute_md5(content: bytes) -> str: + """Compute MD5 hash of content as lowercase hex string.""" + return hashlib.md5(content).hexdigest() + + +def compute_sha1(content: bytes) -> str: + """Compute SHA1 hash of content as lowercase hex string.""" + return hashlib.sha1(content).hexdigest() + + +# ============================================================================= +# Test File Factories +# ============================================================================= + + +def create_test_file(content: Optional[bytes] = None, size: int = 1024) -> io.BytesIO: + """ + Create a test file with known content. + + Args: + content: Specific content to use, or None to generate random-ish content + size: Size of generated content if content is None + + Returns: + BytesIO object with the content + """ + if content is None: + content = os.urandom(size) + return io.BytesIO(content) + + +def create_unique_content(prefix: str = "test-content") -> tuple[bytes, str]: + """ + Create unique test content with its SHA256 hash. + + Args: + prefix: Prefix for the content string + + Returns: + Tuple of (content_bytes, sha256_hash) + """ + content = f"{prefix}-{uuid.uuid4().hex}".encode() + sha256 = compute_sha256(content) + return content, sha256 + + +# ============================================================================= +# Known Test Data (Pre-computed hashes for deterministic tests) +# ============================================================================= + + +TEST_CONTENT_HELLO = b"Hello, World!" 
+TEST_HASH_HELLO = "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f" +TEST_MD5_HELLO = "65a8e27d8879283831b664bd8b7f0ad4" +TEST_SHA1_HELLO = "0a0a9f2a6772942557ab5355d76af442f8f65e01" + +TEST_CONTENT_EMPTY = b"" +# Note: Empty content should be rejected by the storage layer + +TEST_CONTENT_BINARY = bytes(range(256)) +TEST_HASH_BINARY = compute_sha256(TEST_CONTENT_BINARY) + + +# ============================================================================= +# API Test Helpers +# ============================================================================= + + +def upload_test_file( + client, + project: str, + package: str, + content: bytes, + filename: str = "test.bin", + tag: Optional[str] = None, +) -> dict: + """ + Helper function to upload a test file via the API. + + Args: + client: HTTP client (httpx or TestClient) + project: Project name + package: Package name + content: File content as bytes + filename: Original filename + tag: Optional tag to assign + + Returns: + The upload response as a dict + """ + files = {"file": (filename, io.BytesIO(content), "application/octet-stream")} + data = {} + if tag: + data["tag"] = tag + + response = client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data=data if data else None, + ) + assert response.status_code == 200, f"Upload failed: {response.text}" + return response.json() + + +# ============================================================================= +# Project/Package Factories +# ============================================================================= + + +def create_test_project(client, unique_id: Optional[str] = None) -> str: + """ + Create a test project via the API. 
+ + Args: + client: HTTP client + unique_id: Unique identifier for the project name + + Returns: + Project name + """ + if unique_id is None: + unique_id = uuid.uuid4().hex[:8] + + project_name = f"test-project-{unique_id}" + response = client.post( + "/api/v1/projects", + json={"name": project_name, "description": "Test project", "is_public": True}, + ) + assert response.status_code == 200, f"Failed to create project: {response.text}" + return project_name + + +def create_test_package(client, project: str, unique_id: Optional[str] = None) -> str: + """ + Create a test package via the API. + + Args: + client: HTTP client + project: Project name + unique_id: Unique identifier for the package name + + Returns: + Package name + """ + if unique_id is None: + unique_id = uuid.uuid4().hex[:8] + + package_name = f"test-package-{unique_id}" + response = client.post( + f"/api/v1/project/{project}/packages", + json={"name": package_name, "description": "Test package"}, + ) + assert response.status_code == 200, f"Failed to create package: {response.text}" + return package_name + + +def delete_test_project(client, project: str) -> None: + """ + Delete a test project (cleanup helper). + + Args: + client: HTTP client + project: Project name to delete + """ + try: + client.delete(f"/api/v1/projects/{project}") + except Exception: + pass # Ignore cleanup errors + + +# ============================================================================= +# S3 Test Helpers +# ============================================================================= + + +def get_s3_client(): + """ + Create a boto3 S3 client for direct S3 access in integration tests. + + Uses environment variables for configuration (same as the app). + Note: When running in container, S3 endpoint should be 'minio:9000' not 'localhost:9000'. 
+ """ + import boto3 + from botocore.config import Config + + config = Config(s3={"addressing_style": "path"}) + + # Use the same endpoint as the app (minio:9000 in container, localhost:9000 locally) + endpoint = os.environ.get("ORCHARD_S3_ENDPOINT", "http://minio:9000") + + return boto3.client( + "s3", + endpoint_url=endpoint, + region_name=os.environ.get("ORCHARD_S3_REGION", "us-east-1"), + aws_access_key_id=os.environ.get("ORCHARD_S3_ACCESS_KEY_ID", "minioadmin"), + aws_secret_access_key=os.environ.get( + "ORCHARD_S3_SECRET_ACCESS_KEY", "minioadmin" + ), + config=config, + ) + + +def get_s3_bucket() -> str: + """Get the S3 bucket name from environment.""" + return os.environ.get("ORCHARD_S3_BUCKET", "orchard-artifacts") + + +def list_s3_objects_by_hash(sha256_hash: str) -> list: + """ + List S3 objects that match a specific SHA256 hash. + + Uses the fruits/{hash[:2]}/{hash[2:4]}/{hash} key pattern. + Returns list of matching object keys. + """ + client = get_s3_client() + bucket = get_s3_bucket() + prefix = f"fruits/{sha256_hash[:2]}/{sha256_hash[2:4]}/{sha256_hash}" + + response = client.list_objects_v2(Bucket=bucket, Prefix=prefix) + + if "Contents" not in response: + return [] + + return [obj["Key"] for obj in response["Contents"]] + + +def count_s3_objects_by_prefix(prefix: str) -> int: + """ + Count S3 objects with a given prefix. + + Useful for checking if duplicate uploads created multiple objects. + """ + client = get_s3_client() + bucket = get_s3_bucket() + + response = client.list_objects_v2(Bucket=bucket, Prefix=prefix) + + if "Contents" not in response: + return 0 + + return len(response["Contents"]) + + +def s3_object_exists(sha256_hash: str) -> bool: + """ + Check if an S3 object exists for a given SHA256 hash. + """ + objects = list_s3_objects_by_hash(sha256_hash) + return len(objects) > 0 + + +def delete_s3_object_by_hash(sha256_hash: str) -> bool: + """ + Delete an S3 object by its SHA256 hash (for test cleanup). 
+ """ + client = get_s3_client() + bucket = get_s3_bucket() + s3_key = f"fruits/{sha256_hash[:2]}/{sha256_hash[2:4]}/{sha256_hash}" + + try: + client.delete_object(Bucket=bucket, Key=s3_key) + return True + except Exception: + return False diff --git a/backend/tests/integration/__init__.py b/backend/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/tests/integration/test_artifacts_api.py b/backend/tests/integration/test_artifacts_api.py new file mode 100644 index 0000000..f9b0841 --- /dev/null +++ b/backend/tests/integration/test_artifacts_api.py @@ -0,0 +1,638 @@ +""" +Integration tests for artifact API endpoints. + +Tests cover: +- Artifact retrieval by ID +- Artifact stats endpoint +- Artifact provenance/history +- Artifact uploads listing +- Garbage collection endpoints +- Orphaned artifacts management +""" + +import pytest +from tests.factories import compute_sha256, upload_test_file + + +class TestArtifactRetrieval: + """Tests for artifact retrieval endpoints.""" + + @pytest.mark.integration + def test_get_artifact_by_id(self, integration_client, test_package): + """Test retrieving an artifact by its SHA256 ID.""" + project_name, package_name = test_package + content = b"artifact retrieval test" + expected_hash = compute_sha256(content) + + upload_test_file( + integration_client, project_name, package_name, content, tag="v1" + ) + + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.status_code == 200 + + data = response.json() + assert data["id"] == expected_hash + assert data["sha256"] == expected_hash + assert data["size"] == len(content) + assert "ref_count" in data + assert "created_at" in data + + @pytest.mark.integration + def test_get_nonexistent_artifact(self, integration_client): + """Test getting a non-existent artifact returns 404.""" + fake_hash = "a" * 64 + response = integration_client.get(f"/api/v1/artifact/{fake_hash}") + assert response.status_code == 404 + + 
@pytest.mark.integration + def test_artifact_includes_tags(self, integration_client, test_package): + """Test artifact response includes tags pointing to it.""" + project_name, package_name = test_package + content = b"artifact with tags test" + expected_hash = compute_sha256(content) + + upload_test_file( + integration_client, project_name, package_name, content, tag="tagged-v1" + ) + + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.status_code == 200 + + data = response.json() + assert "tags" in data + assert len(data["tags"]) >= 1 + + tag = data["tags"][0] + assert "name" in tag + assert "package_name" in tag + assert "project_name" in tag + + +class TestArtifactStats: + """Tests for artifact statistics endpoint.""" + + @pytest.mark.integration + def test_artifact_stats_returns_valid_response( + self, integration_client, test_package, unique_test_id + ): + """Test artifact stats returns expected fields.""" + project, package = test_package + content = f"artifact stats test {unique_test_id}".encode() + expected_hash = compute_sha256(content) + + upload_test_file( + integration_client, project, package, content, tag=f"art-{unique_test_id}" + ) + + response = integration_client.get(f"/api/v1/artifact/{expected_hash}/stats") + assert response.status_code == 200 + + data = response.json() + assert "artifact_id" in data + assert "sha256" in data + assert "size" in data + assert "ref_count" in data + assert "storage_savings" in data + assert "tags" in data + assert "projects" in data + assert "packages" in data + + @pytest.mark.integration + def test_artifact_stats_not_found(self, integration_client): + """Test artifact stats returns 404 for non-existent artifact.""" + fake_hash = "0" * 64 + response = integration_client.get(f"/api/v1/artifact/{fake_hash}/stats") + assert response.status_code == 404 + + @pytest.mark.integration + def test_artifact_stats_shows_correct_projects( + self, integration_client, unique_test_id + ): + 
"""Test artifact stats shows all projects using the artifact.""" + content = f"multi-project artifact {unique_test_id}".encode() + expected_hash = compute_sha256(content) + + proj1 = f"art-stats-a-{unique_test_id}" + proj2 = f"art-stats-b-{unique_test_id}" + + try: + # Create projects and packages + integration_client.post( + "/api/v1/projects", + json={"name": proj1, "description": "Test", "is_public": True}, + ) + integration_client.post( + "/api/v1/projects", + json={"name": proj2, "description": "Test", "is_public": True}, + ) + integration_client.post( + f"/api/v1/project/{proj1}/packages", + json={"name": "pkg", "description": "Test"}, + ) + integration_client.post( + f"/api/v1/project/{proj2}/packages", + json={"name": "pkg", "description": "Test"}, + ) + + # Upload same content to both projects + upload_test_file(integration_client, proj1, "pkg", content, tag="v1") + upload_test_file(integration_client, proj2, "pkg", content, tag="v1") + + # Check artifact stats + response = integration_client.get(f"/api/v1/artifact/{expected_hash}/stats") + assert response.status_code == 200 + + data = response.json() + assert len(data["projects"]) == 2 + assert proj1 in data["projects"] + assert proj2 in data["projects"] + + finally: + integration_client.delete(f"/api/v1/projects/{proj1}") + integration_client.delete(f"/api/v1/projects/{proj2}") + + +class TestArtifactProvenance: + """Tests for artifact provenance/history endpoint.""" + + @pytest.mark.integration + def test_artifact_history_returns_200(self, integration_client, test_package): + """Test artifact history endpoint returns 200.""" + project_name, package_name = test_package + + upload_result = upload_test_file( + integration_client, + project_name, + package_name, + b"provenance test content", + "prov.txt", + ) + artifact_id = upload_result["artifact_id"] + + response = integration_client.get(f"/api/v1/artifact/{artifact_id}/history") + assert response.status_code == 200 + + @pytest.mark.integration + def 
test_artifact_history_has_required_fields( + self, integration_client, test_package + ): + """Test artifact history has all required fields.""" + project_name, package_name = test_package + + upload_result = upload_test_file( + integration_client, + project_name, + package_name, + b"provenance fields test", + "fields.txt", + ) + artifact_id = upload_result["artifact_id"] + + response = integration_client.get(f"/api/v1/artifact/{artifact_id}/history") + assert response.status_code == 200 + + data = response.json() + assert "artifact_id" in data + assert "sha256" in data + assert "size" in data + assert "created_at" in data + assert "created_by" in data + assert "ref_count" in data + assert "first_uploaded_at" in data + assert "first_uploaded_by" in data + assert "upload_count" in data + assert "packages" in data + assert "tags" in data + assert "uploads" in data + + @pytest.mark.integration + def test_artifact_history_not_found(self, integration_client): + """Test non-existent artifact returns 404.""" + fake_hash = "b" * 64 + response = integration_client.get(f"/api/v1/artifact/{fake_hash}/history") + assert response.status_code == 404 + + @pytest.mark.integration + def test_artifact_history_with_tag(self, integration_client, test_package): + """Test artifact history includes tag information when tagged.""" + project_name, package_name = test_package + + upload_result = upload_test_file( + integration_client, + project_name, + package_name, + b"tagged provenance test", + "tagged.txt", + tag="v1.0.0", + ) + artifact_id = upload_result["artifact_id"] + + response = integration_client.get(f"/api/v1/artifact/{artifact_id}/history") + assert response.status_code == 200 + + data = response.json() + assert len(data["tags"]) >= 1 + + tag = data["tags"][0] + assert "project_name" in tag + assert "package_name" in tag + assert "tag_name" in tag + + +class TestArtifactUploads: + """Tests for artifact uploads listing endpoint.""" + + @pytest.mark.integration + def 
test_artifact_uploads_returns_200(self, integration_client, test_package): + """Test artifact uploads endpoint returns 200.""" + project_name, package_name = test_package + + upload_result = upload_test_file( + integration_client, + project_name, + package_name, + b"artifact upload test", + "artifact.txt", + ) + artifact_id = upload_result["artifact_id"] + + response = integration_client.get(f"/api/v1/artifact/{artifact_id}/uploads") + assert response.status_code == 200 + + data = response.json() + assert "items" in data + assert "pagination" in data + assert len(data["items"]) >= 1 + + @pytest.mark.integration + def test_artifact_uploads_not_found(self, integration_client): + """Test non-existent artifact returns 404.""" + fake_hash = "a" * 64 + response = integration_client.get(f"/api/v1/artifact/{fake_hash}/uploads") + assert response.status_code == 404 + + +class TestOrphanedArtifacts: + """Tests for orphaned artifacts management.""" + + @pytest.mark.integration + def test_list_orphaned_artifacts_returns_list(self, integration_client): + """Test orphaned artifacts endpoint returns a list.""" + response = integration_client.get("/api/v1/admin/orphaned-artifacts") + assert response.status_code == 200 + assert isinstance(response.json(), list) + + @pytest.mark.integration + def test_orphaned_artifact_has_required_fields(self, integration_client): + """Test orphaned artifact response has required fields.""" + response = integration_client.get("/api/v1/admin/orphaned-artifacts?limit=1") + assert response.status_code == 200 + + data = response.json() + if len(data) > 0: + artifact = data[0] + assert "id" in artifact + assert "size" in artifact + assert "created_at" in artifact + assert "created_by" in artifact + assert "original_name" in artifact + + @pytest.mark.integration + def test_orphaned_artifacts_respects_limit(self, integration_client): + """Test orphaned artifacts endpoint respects limit parameter.""" + response = 
integration_client.get("/api/v1/admin/orphaned-artifacts?limit=5") + assert response.status_code == 200 + assert len(response.json()) <= 5 + + @pytest.mark.integration + def test_artifact_becomes_orphaned_when_tag_deleted( + self, integration_client, test_package, unique_test_id + ): + """Test artifact appears in orphaned list after tag is deleted.""" + project, package = test_package + content = f"orphan test {unique_test_id}".encode() + expected_hash = compute_sha256(content) + + # Upload with tag + upload_test_file(integration_client, project, package, content, tag="temp-tag") + + # Verify not in orphaned list + response = integration_client.get("/api/v1/admin/orphaned-artifacts?limit=1000") + orphaned_ids = [a["id"] for a in response.json()] + assert expected_hash not in orphaned_ids + + # Delete the tag + integration_client.delete(f"/api/v1/project/{project}/{package}/tags/temp-tag") + + # Verify now in orphaned list + response = integration_client.get("/api/v1/admin/orphaned-artifacts?limit=1000") + orphaned_ids = [a["id"] for a in response.json()] + assert expected_hash in orphaned_ids + + +class TestGarbageCollection: + """Tests for garbage collection endpoint.""" + + @pytest.mark.integration + def test_garbage_collect_dry_run_returns_response(self, integration_client): + """Test garbage collection dry run returns valid response.""" + response = integration_client.post("/api/v1/admin/garbage-collect?dry_run=true") + assert response.status_code == 200 + + data = response.json() + assert "artifacts_deleted" in data + assert "bytes_freed" in data + assert "artifact_ids" in data + assert "dry_run" in data + assert data["dry_run"] is True + + @pytest.mark.integration + def test_garbage_collect_dry_run_doesnt_delete( + self, integration_client, test_package, unique_test_id + ): + """Test garbage collection dry run doesn't actually delete artifacts.""" + project, package = test_package + content = f"dry run test {unique_test_id}".encode() + expected_hash = 
compute_sha256(content) + + # Upload and delete tag to create orphan + upload_test_file(integration_client, project, package, content, tag="dry-run") + integration_client.delete(f"/api/v1/project/{project}/{package}/tags/dry-run") + + # Verify artifact exists + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.status_code == 200 + + # Run garbage collection in dry-run mode + gc_response = integration_client.post( + "/api/v1/admin/garbage-collect?dry_run=true&limit=1000" + ) + assert gc_response.status_code == 200 + assert expected_hash in gc_response.json()["artifact_ids"] + + # Verify artifact STILL exists + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.status_code == 200 + + @pytest.mark.integration + def test_garbage_collect_preserves_referenced_artifacts( + self, integration_client, test_package, unique_test_id + ): + """Test garbage collection doesn't delete artifacts with ref_count > 0.""" + project, package = test_package + content = f"preserve test {unique_test_id}".encode() + expected_hash = compute_sha256(content) + + # Upload with tag (ref_count=1) + upload_test_file(integration_client, project, package, content, tag="keep-this") + + # Verify artifact exists with ref_count=1 + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.status_code == 200 + assert response.json()["ref_count"] == 1 + + # Run garbage collection (dry_run to not affect other tests) + gc_response = integration_client.post( + "/api/v1/admin/garbage-collect?dry_run=true&limit=1000" + ) + assert gc_response.status_code == 200 + + # Verify artifact was NOT in delete list + assert expected_hash not in gc_response.json()["artifact_ids"] + + # Verify artifact still exists + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.status_code == 200 + assert response.json()["ref_count"] == 1 + + @pytest.mark.integration + def 
test_garbage_collect_respects_limit(self, integration_client): + """Test garbage collection respects limit parameter.""" + response = integration_client.post( + "/api/v1/admin/garbage-collect?dry_run=true&limit=5" + ) + assert response.status_code == 200 + assert response.json()["artifacts_deleted"] <= 5 + + @pytest.mark.integration + def test_garbage_collect_returns_bytes_freed(self, integration_client): + """Test garbage collection returns accurate bytes_freed.""" + response = integration_client.post("/api/v1/admin/garbage-collect?dry_run=true") + assert response.status_code == 200 + + data = response.json() + assert data["bytes_freed"] >= 0 + assert isinstance(data["bytes_freed"], int) + + +class TestGlobalUploads: + """Tests for global uploads endpoint.""" + + @pytest.mark.integration + def test_global_uploads_returns_200(self, integration_client): + """Test global uploads endpoint returns 200.""" + response = integration_client.get("/api/v1/uploads") + assert response.status_code == 200 + + data = response.json() + assert "items" in data + assert "pagination" in data + + @pytest.mark.integration + def test_global_uploads_pagination(self, integration_client): + """Test global uploads endpoint respects pagination.""" + response = integration_client.get("/api/v1/uploads?limit=5&page=1") + assert response.status_code == 200 + + data = response.json() + assert len(data["items"]) <= 5 + assert data["pagination"]["limit"] == 5 + assert data["pagination"]["page"] == 1 + + @pytest.mark.integration + def test_global_uploads_filter_by_project(self, integration_client, test_package): + """Test filtering global uploads by project name.""" + project_name, package_name = test_package + + # Upload a file + upload_test_file( + integration_client, + project_name, + package_name, + b"global filter test", + "global.txt", + ) + + response = integration_client.get(f"/api/v1/uploads?project={project_name}") + assert response.status_code == 200 + + data = response.json() + for item 
in data["items"]: + assert item["project_name"] == project_name + + @pytest.mark.integration + def test_global_uploads_has_more_field(self, integration_client): + """Test pagination includes has_more field.""" + response = integration_client.get("/api/v1/uploads?limit=1") + assert response.status_code == 200 + + data = response.json() + assert "has_more" in data["pagination"] + assert isinstance(data["pagination"]["has_more"], bool) + + +class TestGlobalArtifacts: + """Tests for global artifacts endpoint.""" + + @pytest.mark.integration + def test_global_artifacts_returns_200(self, integration_client): + """Test global artifacts endpoint returns 200.""" + response = integration_client.get("/api/v1/artifacts") + assert response.status_code == 200 + + data = response.json() + assert "items" in data + assert "pagination" in data + + @pytest.mark.integration + def test_global_artifacts_pagination(self, integration_client): + """Test global artifacts endpoint respects pagination.""" + response = integration_client.get("/api/v1/artifacts?limit=5&page=1") + assert response.status_code == 200 + + data = response.json() + assert len(data["items"]) <= 5 + assert data["pagination"]["limit"] == 5 + + @pytest.mark.integration + def test_global_artifacts_filter_by_size(self, integration_client): + """Test filtering global artifacts by size range.""" + response = integration_client.get( + "/api/v1/artifacts?min_size=1&max_size=1000000" + ) + assert response.status_code == 200 + + data = response.json() + for item in data["items"]: + assert 1 <= item["size"] <= 1000000 + + @pytest.mark.integration + def test_global_artifacts_sort_by_size(self, integration_client): + """Test sorting global artifacts by size.""" + response = integration_client.get("/api/v1/artifacts?sort=size&order=desc") + assert response.status_code == 200 + data = response.json() + if len(data["items"]) > 1: + sizes = [item["size"] for item in data["items"]] + assert sizes == sorted(sizes, reverse=True) + + 
@pytest.mark.integration + def test_global_artifacts_invalid_sort_returns_400(self, integration_client): + """Test invalid sort field returns 400.""" + response = integration_client.get("/api/v1/artifacts?sort=invalid_field") + assert response.status_code == 400 + + +class TestGlobalTags: + """Tests for global tags endpoint.""" + + @pytest.mark.integration + def test_global_tags_returns_200(self, integration_client): + """Test global tags endpoint returns 200.""" + response = integration_client.get("/api/v1/tags") + assert response.status_code == 200 + + data = response.json() + assert "items" in data + assert "pagination" in data + + @pytest.mark.integration + def test_global_tags_pagination(self, integration_client): + """Test global tags endpoint respects pagination.""" + response = integration_client.get("/api/v1/tags?limit=5&page=1") + assert response.status_code == 200 + + data = response.json() + assert len(data["items"]) <= 5 + assert data["pagination"]["limit"] == 5 + + @pytest.mark.integration + def test_global_tags_has_project_context(self, integration_client): + """Test global tags response includes project/package context.""" + response = integration_client.get("/api/v1/tags?limit=1") + assert response.status_code == 200 + + data = response.json() + if len(data["items"]) > 0: + item = data["items"][0] + assert "project_name" in item + assert "package_name" in item + assert "artifact_id" in item + + @pytest.mark.integration + def test_global_tags_search_with_wildcard(self, integration_client): + """Test global tags search supports wildcards.""" + response = integration_client.get("/api/v1/tags?search=v*") + assert response.status_code == 200 + # Just verify it doesn't error; results may vary + + +class TestAuditLogs: + """Tests for global audit logs endpoint.""" + + @pytest.mark.integration + def test_list_audit_logs_returns_valid_response(self, integration_client): + """Test audit logs endpoint returns valid paginated response.""" + response = 
integration_client.get("/api/v1/audit-logs") + assert response.status_code == 200 + + data = response.json() + assert "items" in data + assert "pagination" in data + assert isinstance(data["items"], list) + + pagination = data["pagination"] + assert "page" in pagination + assert "limit" in pagination + assert "total" in pagination + assert "total_pages" in pagination + + @pytest.mark.integration + def test_audit_logs_respects_pagination(self, integration_client): + """Test audit logs endpoint respects limit parameter.""" + response = integration_client.get("/api/v1/audit-logs?limit=5") + assert response.status_code == 200 + + data = response.json() + assert len(data["items"]) <= 5 + assert data["pagination"]["limit"] == 5 + + @pytest.mark.integration + def test_audit_logs_filter_by_action(self, integration_client, test_package): + """Test filtering audit logs by action type.""" + project_name, package_name = test_package + + response = integration_client.get("/api/v1/audit-logs?action=project.create") + assert response.status_code == 200 + + data = response.json() + for item in data["items"]: + assert item["action"] == "project.create" + + @pytest.mark.integration + def test_audit_log_entry_has_required_fields( + self, integration_client, test_project + ): + """Test audit log entries have all required fields.""" + response = integration_client.get("/api/v1/audit-logs?limit=10") + assert response.status_code == 200 + + data = response.json() + if data["items"]: + item = data["items"][0] + assert "id" in item + assert "action" in item + assert "resource" in item + assert "user_id" in item + assert "timestamp" in item diff --git a/backend/tests/integration/test_packages_api.py b/backend/tests/integration/test_packages_api.py new file mode 100644 index 0000000..60af55a --- /dev/null +++ b/backend/tests/integration/test_packages_api.py @@ -0,0 +1,345 @@ +""" +Integration tests for package API endpoints. 
+ +Tests cover: +- Package CRUD operations +- Package listing with pagination, search, filtering +- Package stats endpoint +- Package-level audit logs +- Cascade delete behavior +""" + +import pytest +from tests.factories import compute_sha256, upload_test_file + + +class TestPackageCRUD: + """Tests for package create, read, update, delete operations.""" + + @pytest.mark.integration + def test_create_package(self, integration_client, test_project, unique_test_id): + """Test creating a new package.""" + package_name = f"test-create-pkg-{unique_test_id}" + + response = integration_client.post( + f"/api/v1/project/{test_project}/packages", + json={ + "name": package_name, + "description": "Test package", + "format": "npm", + "platform": "linux", + }, + ) + assert response.status_code == 200 + + data = response.json() + assert data["name"] == package_name + assert data["description"] == "Test package" + assert data["format"] == "npm" + assert data["platform"] == "linux" + + @pytest.mark.integration + def test_get_package(self, integration_client, test_package): + """Test getting a package by name.""" + project_name, package_name = test_package + + response = integration_client.get( + f"/api/v1/project/{project_name}/packages/{package_name}" + ) + assert response.status_code == 200 + + data = response.json() + assert data["name"] == package_name + + @pytest.mark.integration + def test_get_nonexistent_package(self, integration_client, test_project): + """Test getting a non-existent package returns 404.""" + response = integration_client.get( + f"/api/v1/project/{test_project}/packages/nonexistent-pkg" + ) + assert response.status_code == 404 + + @pytest.mark.integration + def test_list_packages(self, integration_client, test_package): + """Test listing packages includes created package.""" + project_name, package_name = test_package + + response = integration_client.get(f"/api/v1/project/{project_name}/packages") + assert response.status_code == 200 + + data = 
response.json() + assert "items" in data + assert "pagination" in data + + package_names = [p["name"] for p in data["items"]] + assert package_name in package_names + + @pytest.mark.integration + def test_delete_package(self, integration_client, test_project, unique_test_id): + """Test deleting a package.""" + package_name = f"test-delete-pkg-{unique_test_id}" + + # Create package + integration_client.post( + f"/api/v1/project/{test_project}/packages", + json={"name": package_name, "description": "To be deleted"}, + ) + + # Delete package + response = integration_client.delete( + f"/api/v1/project/{test_project}/packages/{package_name}" + ) + assert response.status_code == 204 + + # Verify deleted + response = integration_client.get( + f"/api/v1/project/{test_project}/packages/{package_name}" + ) + assert response.status_code == 404 + + +class TestPackageListingFilters: + """Tests for package listing with filters and pagination.""" + + @pytest.mark.integration + def test_packages_pagination(self, integration_client, test_project): + """Test package listing respects pagination parameters.""" + response = integration_client.get( + f"/api/v1/project/{test_project}/packages?page=1&limit=5" + ) + assert response.status_code == 200 + + data = response.json() + assert len(data["items"]) <= 5 + assert data["pagination"]["limit"] == 5 + assert data["pagination"]["page"] == 1 + + @pytest.mark.integration + def test_packages_filter_by_format( + self, integration_client, test_project, unique_test_id + ): + """Test package filtering by format.""" + # Create a package with specific format + package_name = f"npm-pkg-{unique_test_id}" + integration_client.post( + f"/api/v1/project/{test_project}/packages", + json={"name": package_name, "format": "npm"}, + ) + + response = integration_client.get( + f"/api/v1/project/{test_project}/packages?format=npm" + ) + assert response.status_code == 200 + + data = response.json() + for pkg in data["items"]: + assert pkg["format"] == "npm" + + 
@pytest.mark.integration + def test_packages_filter_by_platform( + self, integration_client, test_project, unique_test_id + ): + """Test package filtering by platform.""" + # Create a package with specific platform + package_name = f"linux-pkg-{unique_test_id}" + integration_client.post( + f"/api/v1/project/{test_project}/packages", + json={"name": package_name, "platform": "linux"}, + ) + + response = integration_client.get( + f"/api/v1/project/{test_project}/packages?platform=linux" + ) + assert response.status_code == 200 + + data = response.json() + for pkg in data["items"]: + assert pkg["platform"] == "linux" + + +class TestPackageStats: + """Tests for package statistics endpoint.""" + + @pytest.mark.integration + def test_package_stats_returns_valid_response( + self, integration_client, test_package + ): + """Test package stats endpoint returns expected fields.""" + project, package = test_package + response = integration_client.get( + f"/api/v1/project/{project}/packages/{package}/stats" + ) + assert response.status_code == 200 + + data = response.json() + assert "package_id" in data + assert "package_name" in data + assert "project_name" in data + assert "tag_count" in data + assert "artifact_count" in data + assert "total_size_bytes" in data + assert "upload_count" in data + assert "deduplicated_uploads" in data + assert "storage_saved_bytes" in data + assert "deduplication_ratio" in data + + @pytest.mark.integration + def test_package_stats_not_found(self, integration_client, test_project): + """Test package stats returns 404 for non-existent package.""" + response = integration_client.get( + f"/api/v1/project/{test_project}/packages/nonexistent-package/stats" + ) + assert response.status_code == 404 + + +class TestPackageAuditLogs: + """Tests for package-level audit logs endpoint.""" + + @pytest.mark.integration + def test_package_audit_logs_returns_200(self, integration_client, test_package): + """Test package audit logs endpoint returns 200.""" + 
project_name, package_name = test_package + response = integration_client.get( + f"/api/v1/project/{project_name}/{package_name}/audit-logs" + ) + assert response.status_code == 200 + + data = response.json() + assert "items" in data + assert "pagination" in data + + @pytest.mark.integration + def test_package_audit_logs_project_not_found(self, integration_client): + """Test non-existent project returns 404.""" + response = integration_client.get( + "/api/v1/project/nonexistent/nonexistent/audit-logs" + ) + assert response.status_code == 404 + + @pytest.mark.integration + def test_package_audit_logs_package_not_found( + self, integration_client, test_project + ): + """Test non-existent package returns 404.""" + response = integration_client.get( + f"/api/v1/project/{test_project}/nonexistent-package/audit-logs" + ) + assert response.status_code == 404 + + +class TestPackageCascadeDelete: + """Tests for cascade delete behavior when deleting packages.""" + + @pytest.mark.integration + def test_ref_count_decrements_on_package_delete( + self, integration_client, unique_test_id + ): + """Test ref_count decrements for all tags when package is deleted.""" + project_name = f"cascade-pkg-{unique_test_id}" + package_name = f"test-pkg-{unique_test_id}" + + # Create project + response = integration_client.post( + "/api/v1/projects", + json={ + "name": project_name, + "description": "Test project", + "is_public": True, + }, + ) + assert response.status_code == 200 + + # Create package + response = integration_client.post( + f"/api/v1/project/{project_name}/packages", + json={"name": package_name, "description": "Test package"}, + ) + assert response.status_code == 200 + + # Upload content with multiple tags + content = f"cascade delete test {unique_test_id}".encode() + expected_hash = compute_sha256(content) + + upload_test_file( + integration_client, project_name, package_name, content, tag="v1" + ) + upload_test_file( + integration_client, project_name, package_name, content, 
tag="v2" + ) + upload_test_file( + integration_client, project_name, package_name, content, tag="v3" + ) + + # Verify ref_count is 3 + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.json()["ref_count"] == 3 + + # Delete the package + delete_response = integration_client.delete( + f"/api/v1/project/{project_name}/packages/{package_name}" + ) + assert delete_response.status_code == 204 + + # Verify ref_count is 0 + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.json()["ref_count"] == 0 + + # Cleanup + integration_client.delete(f"/api/v1/projects/{project_name}") + + +class TestPackageUploads: + """Tests for package-level uploads endpoint.""" + + @pytest.mark.integration + def test_package_uploads_returns_200(self, integration_client, test_package): + """Test package uploads endpoint returns 200.""" + project_name, package_name = test_package + response = integration_client.get( + f"/api/v1/project/{project_name}/{package_name}/uploads" + ) + assert response.status_code == 200 + + data = response.json() + assert "items" in data + assert "pagination" in data + + @pytest.mark.integration + def test_package_uploads_after_upload(self, integration_client, test_package): + """Test uploads are recorded after file upload.""" + project_name, package_name = test_package + + # Upload a file + upload_result = upload_test_file( + integration_client, + project_name, + package_name, + b"test upload content", + "test.txt", + ) + assert upload_result["artifact_id"] + + # Check uploads endpoint + response = integration_client.get( + f"/api/v1/project/{project_name}/{package_name}/uploads" + ) + assert response.status_code == 200 + + data = response.json() + assert len(data["items"]) >= 1 + + # Verify upload record fields + upload = data["items"][0] + assert "artifact_id" in upload + assert "package_name" in upload + assert "project_name" in upload + assert "uploaded_at" in upload + assert "uploaded_by" 
in upload + + @pytest.mark.integration + def test_package_uploads_project_not_found(self, integration_client): + """Test non-existent project returns 404.""" + response = integration_client.get( + "/api/v1/project/nonexistent/nonexistent/uploads" + ) + assert response.status_code == 404 diff --git a/backend/tests/integration/test_projects_api.py b/backend/tests/integration/test_projects_api.py new file mode 100644 index 0000000..0de9554 --- /dev/null +++ b/backend/tests/integration/test_projects_api.py @@ -0,0 +1,322 @@ +""" +Integration tests for project API endpoints. + +Tests cover: +- Project CRUD operations +- Project listing with pagination, search, and sorting +- Project stats endpoint +- Project-level audit logs +- Cascade delete behavior +""" + +import pytest +from tests.factories import compute_sha256, upload_test_file + + +class TestProjectCRUD: + """Tests for project create, read, update, delete operations.""" + + @pytest.mark.integration + def test_create_project(self, integration_client, unique_test_id): + """Test creating a new project.""" + project_name = f"test-create-{unique_test_id}" + + try: + response = integration_client.post( + "/api/v1/projects", + json={ + "name": project_name, + "description": "Test project", + "is_public": True, + }, + ) + assert response.status_code == 200 + + data = response.json() + assert data["name"] == project_name + assert data["description"] == "Test project" + assert data["is_public"] is True + assert "id" in data + assert "created_at" in data + finally: + integration_client.delete(f"/api/v1/projects/{project_name}") + + @pytest.mark.integration + def test_get_project(self, integration_client, test_project): + """Test getting a project by name.""" + response = integration_client.get(f"/api/v1/projects/{test_project}") + assert response.status_code == 200 + + data = response.json() + assert data["name"] == test_project + + @pytest.mark.integration + def test_get_nonexistent_project(self, integration_client): + 
"""Test getting a non-existent project returns 404.""" + response = integration_client.get("/api/v1/projects/nonexistent-project-xyz") + assert response.status_code == 404 + + @pytest.mark.integration + def test_list_projects(self, integration_client, test_project): + """Test listing projects includes created project.""" + response = integration_client.get("/api/v1/projects") + assert response.status_code == 200 + + data = response.json() + assert "items" in data + assert "pagination" in data + + project_names = [p["name"] for p in data["items"]] + assert test_project in project_names + + @pytest.mark.integration + def test_delete_project(self, integration_client, unique_test_id): + """Test deleting a project.""" + project_name = f"test-delete-{unique_test_id}" + + # Create project + integration_client.post( + "/api/v1/projects", + json={"name": project_name, "description": "To be deleted"}, + ) + + # Delete project + response = integration_client.delete(f"/api/v1/projects/{project_name}") + assert response.status_code == 204 + + # Verify deleted + response = integration_client.get(f"/api/v1/projects/{project_name}") + assert response.status_code == 404 + + +class TestProjectListingFilters: + """Tests for project listing with filters and pagination.""" + + @pytest.mark.integration + def test_projects_pagination(self, integration_client): + """Test project listing respects pagination parameters.""" + response = integration_client.get("/api/v1/projects?page=1&limit=5") + assert response.status_code == 200 + + data = response.json() + assert len(data["items"]) <= 5 + assert data["pagination"]["limit"] == 5 + assert data["pagination"]["page"] == 1 + assert "has_more" in data["pagination"] + + @pytest.mark.integration + def test_projects_search(self, integration_client, test_project): + """Test project search by name.""" + # Search for our test project + response = integration_client.get( + f"/api/v1/projects?search={test_project[:10]}" + ) + assert response.status_code 
== 200 + + data = response.json() + # Our project should be in results + project_names = [p["name"] for p in data["items"]] + assert test_project in project_names + + @pytest.mark.integration + def test_projects_sort_by_name(self, integration_client): + """Test project sorting by name.""" + response = integration_client.get("/api/v1/projects?sort=name&order=asc") + assert response.status_code == 200 + + data = response.json() + names = [p["name"] for p in data["items"]] + assert names == sorted(names) + + +class TestProjectStats: + """Tests for project statistics endpoint.""" + + @pytest.mark.integration + def test_project_stats_returns_valid_response( + self, integration_client, test_project + ): + """Test project stats endpoint returns expected fields.""" + response = integration_client.get(f"/api/v1/projects/{test_project}/stats") + assert response.status_code == 200 + + data = response.json() + assert "project_id" in data + assert "project_name" in data + assert "package_count" in data + assert "tag_count" in data + assert "artifact_count" in data + assert "total_size_bytes" in data + assert "upload_count" in data + assert "deduplicated_uploads" in data + assert "storage_saved_bytes" in data + assert "deduplication_ratio" in data + + @pytest.mark.integration + def test_project_stats_not_found(self, integration_client): + """Test project stats returns 404 for non-existent project.""" + response = integration_client.get("/api/v1/projects/nonexistent-project/stats") + assert response.status_code == 404 + + +class TestProjectAuditLogs: + """Tests for project-level audit logs endpoint.""" + + @pytest.mark.integration + def test_project_audit_logs_returns_200(self, integration_client, test_project): + """Test project audit logs endpoint returns 200.""" + response = integration_client.get(f"/api/v1/projects/{test_project}/audit-logs") + assert response.status_code == 200 + + data = response.json() + assert "items" in data + assert "pagination" in data + + 
@pytest.mark.integration + def test_project_audit_logs_not_found(self, integration_client): + """Test non-existent project returns 404.""" + response = integration_client.get( + "/api/v1/projects/nonexistent-project/audit-logs" + ) + assert response.status_code == 404 + + +class TestProjectCascadeDelete: + """Tests for cascade delete behavior when deleting projects.""" + + @pytest.mark.integration + def test_project_delete_cascades_to_packages( + self, integration_client, unique_test_id + ): + """Test deleting project cascades to packages.""" + project_name = f"cascade-proj-{unique_test_id}" + package_name = f"cascade-pkg-{unique_test_id}" + + try: + # Create project and package + integration_client.post( + "/api/v1/projects", + json={"name": project_name, "description": "Test", "is_public": True}, + ) + integration_client.post( + f"/api/v1/project/{project_name}/packages", + json={"name": package_name, "description": "Test package"}, + ) + + # Verify package exists + response = integration_client.get( + f"/api/v1/project/{project_name}/packages/{package_name}" + ) + assert response.status_code == 200 + + # Delete project + integration_client.delete(f"/api/v1/projects/{project_name}") + + # Verify project is deleted (and package with it) + response = integration_client.get(f"/api/v1/projects/{project_name}") + assert response.status_code == 404 + except Exception: + # Cleanup if test fails + integration_client.delete(f"/api/v1/projects/{project_name}") + raise + + @pytest.mark.integration + def test_ref_count_decrements_on_project_delete( + self, integration_client, unique_test_id + ): + """Test ref_count decrements for all tags when project is deleted.""" + project_name = f"cascade-proj-{unique_test_id}" + package1_name = f"pkg1-{unique_test_id}" + package2_name = f"pkg2-{unique_test_id}" + + # Create project + response = integration_client.post( + "/api/v1/projects", + json={ + "name": project_name, + "description": "Test project", + "is_public": True, + }, + ) + 
assert response.status_code == 200 + + # Create two packages + for pkg_name in [package1_name, package2_name]: + response = integration_client.post( + f"/api/v1/project/{project_name}/packages", + json={"name": pkg_name, "description": "Test package"}, + ) + assert response.status_code == 200 + + # Upload same content with tags in both packages + content = f"project cascade test {unique_test_id}".encode() + expected_hash = compute_sha256(content) + + upload_test_file( + integration_client, project_name, package1_name, content, tag="v1" + ) + upload_test_file( + integration_client, project_name, package1_name, content, tag="v2" + ) + upload_test_file( + integration_client, project_name, package2_name, content, tag="latest" + ) + upload_test_file( + integration_client, project_name, package2_name, content, tag="stable" + ) + + # Verify ref_count is 4 (2 tags in each of 2 packages) + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.json()["ref_count"] == 4 + + # Delete the project + delete_response = integration_client.delete(f"/api/v1/projects/{project_name}") + assert delete_response.status_code == 204 + + # Verify ref_count is 0 + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.json()["ref_count"] == 0 + + +class TestProjectUploads: + """Tests for project-level uploads endpoint.""" + + @pytest.mark.integration + def test_project_uploads_returns_200(self, integration_client, test_project): + """Test project uploads endpoint returns 200.""" + response = integration_client.get(f"/api/v1/project/{test_project}/uploads") + assert response.status_code == 200 + + data = response.json() + assert "items" in data + assert "pagination" in data + + @pytest.mark.integration + def test_project_uploads_after_upload(self, integration_client, test_package): + """Test uploads are recorded in project uploads.""" + project_name, package_name = test_package + + # Upload a file + upload_test_file( + 
integration_client, + project_name, + package_name, + b"project uploads test", + "project.txt", + ) + + response = integration_client.get(f"/api/v1/project/{project_name}/uploads") + assert response.status_code == 200 + + data = response.json() + assert len(data["items"]) >= 1 + + # Verify project name matches + for item in data["items"]: + assert item["project_name"] == project_name + + @pytest.mark.integration + def test_project_uploads_not_found(self, integration_client): + """Test non-existent project returns 404.""" + response = integration_client.get("/api/v1/project/nonexistent/uploads") + assert response.status_code == 404 diff --git a/backend/tests/integration/test_tags_api.py b/backend/tests/integration/test_tags_api.py new file mode 100644 index 0000000..2b8db6e --- /dev/null +++ b/backend/tests/integration/test_tags_api.py @@ -0,0 +1,403 @@ +""" +Integration tests for tag API endpoints. + +Tests cover: +- Tag CRUD operations +- Tag listing with pagination and search +- Tag history tracking +- ref_count behavior with tag operations +""" + +import pytest +from tests.factories import compute_sha256, upload_test_file + + +class TestTagCRUD: + """Tests for tag create, read, delete operations.""" + + @pytest.mark.integration + def test_create_tag_via_upload(self, integration_client, test_package): + """Test creating a tag via upload endpoint.""" + project_name, package_name = test_package + + result = upload_test_file( + integration_client, + project_name, + package_name, + b"tag create test", + tag="v1.0.0", + ) + + assert result["tag"] == "v1.0.0" + assert result["artifact_id"] + + @pytest.mark.integration + def test_create_tag_via_post( + self, integration_client, test_package, unique_test_id + ): + """Test creating a tag via POST /tags endpoint.""" + project_name, package_name = test_package + + # First upload an artifact + result = upload_test_file( + integration_client, + project_name, + package_name, + b"artifact for tag", + ) + artifact_id = 
result["artifact_id"] + + # Create tag via POST + tag_name = f"post-tag-{unique_test_id}" + response = integration_client.post( + f"/api/v1/project/{project_name}/{package_name}/tags", + json={"name": tag_name, "artifact_id": artifact_id}, + ) + assert response.status_code == 200 + + data = response.json() + assert data["name"] == tag_name + assert data["artifact_id"] == artifact_id + + @pytest.mark.integration + def test_get_tag(self, integration_client, test_package): + """Test getting a tag by name.""" + project_name, package_name = test_package + + upload_test_file( + integration_client, + project_name, + package_name, + b"get tag test", + tag="get-tag", + ) + + response = integration_client.get( + f"/api/v1/project/{project_name}/{package_name}/tags/get-tag" + ) + assert response.status_code == 200 + + data = response.json() + assert data["name"] == "get-tag" + assert "artifact_id" in data + assert "artifact_size" in data + assert "artifact_content_type" in data + + @pytest.mark.integration + def test_list_tags(self, integration_client, test_package): + """Test listing tags for a package.""" + project_name, package_name = test_package + + # Create some tags + upload_test_file( + integration_client, + project_name, + package_name, + b"list tags test", + tag="list-v1", + ) + + response = integration_client.get( + f"/api/v1/project/{project_name}/{package_name}/tags" + ) + assert response.status_code == 200 + + data = response.json() + assert "items" in data + assert "pagination" in data + + tag_names = [t["name"] for t in data["items"]] + assert "list-v1" in tag_names + + @pytest.mark.integration + def test_delete_tag(self, integration_client, test_package): + """Test deleting a tag.""" + project_name, package_name = test_package + + upload_test_file( + integration_client, + project_name, + package_name, + b"delete tag test", + tag="to-delete", + ) + + # Delete tag + response = integration_client.delete( + 
f"/api/v1/project/{project_name}/{package_name}/tags/to-delete" + ) + assert response.status_code == 204 + + # Verify deleted + response = integration_client.get( + f"/api/v1/project/{project_name}/{package_name}/tags/to-delete" + ) + assert response.status_code == 404 + + +class TestTagListingFilters: + """Tests for tag listing with filters and search.""" + + @pytest.mark.integration + def test_tags_pagination(self, integration_client, test_package): + """Test tag listing respects pagination.""" + project_name, package_name = test_package + + response = integration_client.get( + f"/api/v1/project/{project_name}/{package_name}/tags?limit=5" + ) + assert response.status_code == 200 + + data = response.json() + assert len(data["items"]) <= 5 + assert data["pagination"]["limit"] == 5 + + @pytest.mark.integration + def test_tags_search(self, integration_client, test_package, unique_test_id): + """Test tag search by name.""" + project_name, package_name = test_package + + tag_name = f"searchable-{unique_test_id}" + upload_test_file( + integration_client, + project_name, + package_name, + b"search test", + tag=tag_name, + ) + + response = integration_client.get( + f"/api/v1/project/{project_name}/{package_name}/tags?search=searchable" + ) + assert response.status_code == 200 + + data = response.json() + tag_names = [t["name"] for t in data["items"]] + assert tag_name in tag_names + + +class TestTagHistory: + """Tests for tag history tracking.""" + + @pytest.mark.integration + def test_tag_history_on_create(self, integration_client, test_package): + """Test tag history is created when tag is created.""" + project_name, package_name = test_package + + upload_test_file( + integration_client, + project_name, + package_name, + b"history create test", + tag="history-create", + ) + + response = integration_client.get( + f"/api/v1/project/{project_name}/{package_name}/tags/history-create/history" + ) + assert response.status_code == 200 + + data = response.json() + assert 
len(data) >= 1 + + @pytest.mark.integration + def test_tag_history_on_update( + self, integration_client, test_package, unique_test_id + ): + """Test tag history is created when tag is updated.""" + project_name, package_name = test_package + + tag_name = f"history-update-{unique_test_id}" + + # Create tag with first artifact + upload_test_file( + integration_client, + project_name, + package_name, + b"first content", + tag=tag_name, + ) + + # Update tag with second artifact + upload_test_file( + integration_client, + project_name, + package_name, + b"second content", + tag=tag_name, + ) + + response = integration_client.get( + f"/api/v1/project/{project_name}/{package_name}/tags/{tag_name}/history" + ) + assert response.status_code == 200 + + data = response.json() + # Should have at least 2 history entries (create + update) + assert len(data) >= 2 + + +class TestTagRefCount: + """Tests for ref_count behavior with tag operations.""" + + @pytest.mark.integration + def test_ref_count_decrements_on_tag_delete(self, integration_client, test_package): + """Test ref_count decrements when a tag is deleted.""" + project_name, package_name = test_package + content = b"ref count delete test" + expected_hash = compute_sha256(content) + + # Upload with two tags + upload_test_file( + integration_client, project_name, package_name, content, tag="rc-v1" + ) + upload_test_file( + integration_client, project_name, package_name, content, tag="rc-v2" + ) + + # Verify ref_count is 2 + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.json()["ref_count"] == 2 + + # Delete one tag + delete_response = integration_client.delete( + f"/api/v1/project/{project_name}/{package_name}/tags/rc-v1" + ) + assert delete_response.status_code == 204 + + # Verify ref_count is now 1 + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.json()["ref_count"] == 1 + + @pytest.mark.integration + def 
test_ref_count_zero_after_all_tags_deleted( + self, integration_client, test_package + ): + """Test ref_count goes to 0 when all tags are deleted.""" + project_name, package_name = test_package + content = b"orphan test content" + expected_hash = compute_sha256(content) + + # Upload with one tag + upload_test_file( + integration_client, project_name, package_name, content, tag="only-tag" + ) + + # Delete the tag + integration_client.delete( + f"/api/v1/project/{project_name}/{package_name}/tags/only-tag" + ) + + # Verify ref_count is 0 + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.json()["ref_count"] == 0 + + @pytest.mark.integration + def test_ref_count_adjusts_on_tag_update( + self, integration_client, test_package, unique_test_id + ): + """Test ref_count adjusts when a tag is updated to point to different artifact.""" + project_name, package_name = test_package + + # Upload two different artifacts + content1 = f"artifact one {unique_test_id}".encode() + content2 = f"artifact two {unique_test_id}".encode() + hash1 = compute_sha256(content1) + hash2 = compute_sha256(content2) + + # Upload first artifact with tag "latest" + upload_test_file( + integration_client, project_name, package_name, content1, tag="latest" + ) + + # Verify first artifact has ref_count 1 + response = integration_client.get(f"/api/v1/artifact/{hash1}") + assert response.json()["ref_count"] == 1 + + # Upload second artifact with different tag + upload_test_file( + integration_client, project_name, package_name, content2, tag="stable" + ) + + # Now update "latest" tag to point to second artifact + upload_test_file( + integration_client, project_name, package_name, content2, tag="latest" + ) + + # Verify first artifact ref_count decreased to 0 + response = integration_client.get(f"/api/v1/artifact/{hash1}") + assert response.json()["ref_count"] == 0 + + # Verify second artifact ref_count increased to 2 + response = 
integration_client.get(f"/api/v1/artifact/{hash2}") + assert response.json()["ref_count"] == 2 + + @pytest.mark.integration + def test_ref_count_unchanged_when_tag_same_artifact( + self, integration_client, test_package, unique_test_id + ): + """Test ref_count doesn't change when tag is 'updated' to same artifact.""" + project_name, package_name = test_package + + content = f"same artifact {unique_test_id}".encode() + expected_hash = compute_sha256(content) + + # Upload with tag + upload_test_file( + integration_client, project_name, package_name, content, tag="same-v1" + ) + + # Verify ref_count is 1 + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.json()["ref_count"] == 1 + + # Upload same content with same tag (no-op) + upload_test_file( + integration_client, project_name, package_name, content, tag="same-v1" + ) + + # Verify ref_count is still 1 + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.json()["ref_count"] == 1 + + @pytest.mark.integration + def test_tag_via_post_endpoint_increments_ref_count( + self, integration_client, test_package, unique_test_id + ): + """Test creating tag via POST /tags endpoint increments ref_count.""" + project_name, package_name = test_package + + content = f"tag endpoint test {unique_test_id}".encode() + expected_hash = compute_sha256(content) + + # Upload artifact without tag + result = upload_test_file( + integration_client, project_name, package_name, content, filename="test.bin" + ) + artifact_id = result["artifact_id"] + + # Verify ref_count is 0 (no tags yet) + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.json()["ref_count"] == 0 + + # Create tag via POST endpoint + tag_response = integration_client.post( + f"/api/v1/project/{project_name}/{package_name}/tags", + json={"name": "post-v1", "artifact_id": artifact_id}, + ) + assert tag_response.status_code == 200 + + # Verify ref_count is now 1 + 
response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.json()["ref_count"] == 1 + + # Create another tag via POST endpoint + tag_response = integration_client.post( + f"/api/v1/project/{project_name}/{package_name}/tags", + json={"name": "post-latest", "artifact_id": artifact_id}, + ) + assert tag_response.status_code == 200 + + # Verify ref_count is now 2 + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.json()["ref_count"] == 2 diff --git a/backend/tests/test_integration_uploads.py b/backend/tests/integration/test_upload_download_api.py similarity index 56% rename from backend/tests/test_integration_uploads.py rename to backend/tests/integration/test_upload_download_api.py index d354390..dfa25f9 100644 --- a/backend/tests/test_integration_uploads.py +++ b/backend/tests/integration/test_upload_download_api.py @@ -1,33 +1,109 @@ """ -Integration tests for duplicate uploads and storage verification. - -These tests require the full stack to be running (docker-compose.local.yml). +Integration tests for upload and download API endpoints. 
Tests cover: -- Duplicate upload scenarios across packages and projects -- Storage verification (single S3 object, single artifact row) -- Upload table tracking -- Content integrity verification +- Upload functionality and deduplication +- Download by tag and artifact ID - Concurrent upload handling -- Failure cleanup +- File size validation +- Upload failure cleanup +- S3 storage verification """ import pytest import io import threading -import time from concurrent.futures import ThreadPoolExecutor, as_completed -from tests.conftest import ( +from tests.factories import ( compute_sha256, upload_test_file, list_s3_objects_by_hash, s3_object_exists, - delete_s3_object_by_hash, ) -class TestDuplicateUploadScenarios: - """Integration tests for duplicate upload behavior.""" +class TestUploadBasics: + """Tests for basic upload functionality.""" + + @pytest.mark.integration + def test_upload_returns_artifact_id(self, integration_client, test_package): + """Test upload returns the artifact ID (SHA256 hash).""" + project_name, package_name = test_package + content = b"basic upload test" + expected_hash = compute_sha256(content) + + result = upload_test_file( + integration_client, project_name, package_name, content, tag="v1" + ) + + assert result["artifact_id"] == expected_hash + + @pytest.mark.integration + def test_upload_response_has_upload_id(self, integration_client, test_package): + """Test upload response includes upload_id.""" + project_name, package_name = test_package + + result = upload_test_file( + integration_client, + project_name, + package_name, + b"upload id test", + "uploadid.txt", + ) + + assert "upload_id" in result + assert result["upload_id"] is not None + + @pytest.mark.integration + def test_upload_response_has_content_type(self, integration_client, test_package): + """Test upload response includes content_type.""" + project_name, package_name = test_package + + result = upload_test_file( + integration_client, + project_name, + package_name, + 
b"content type test", + "content.txt", + ) + + assert "content_type" in result + + @pytest.mark.integration + def test_upload_response_has_original_name(self, integration_client, test_package): + """Test upload response includes original_name.""" + project_name, package_name = test_package + + result = upload_test_file( + integration_client, + project_name, + package_name, + b"original name test", + "originalname.txt", + ) + + assert "original_name" in result + assert result["original_name"] == "originalname.txt" + + @pytest.mark.integration + def test_upload_response_has_created_at(self, integration_client, test_package): + """Test upload response includes created_at.""" + project_name, package_name = test_package + + result = upload_test_file( + integration_client, + project_name, + package_name, + b"created at test", + "createdat.txt", + ) + + assert "created_at" in result + assert result["created_at"] is not None + + +class TestDuplicateUploads: + """Tests for duplicate upload deduplication behavior.""" @pytest.mark.integration def test_same_file_twice_returns_same_artifact_id( @@ -103,62 +179,11 @@ class TestDuplicateUploadScenarios: assert result2["artifact_id"] == expected_hash assert result2["deduplicated"] is True - @pytest.mark.integration - def test_same_file_different_projects_shares_artifact( - self, integration_client, unique_test_id - ): - """Test uploading same file to different projects shares artifact.""" - content = f"content shared across projects {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # Create two projects with packages - proj1 = f"project-x-{unique_test_id}" - proj2 = f"project-y-{unique_test_id}" - pkg_name = "shared-pkg" - - try: - # Create projects and packages - integration_client.post( - "/api/v1/projects", - json={"name": proj1, "description": "Project X", "is_public": True}, - ) - integration_client.post( - "/api/v1/projects", - json={"name": proj2, "description": "Project Y", "is_public": True}, - ) - 
integration_client.post( - f"/api/v1/project/{proj1}/packages", - json={"name": pkg_name, "description": "Package"}, - ) - integration_client.post( - f"/api/v1/project/{proj2}/packages", - json={"name": pkg_name, "description": "Package"}, - ) - - # Upload to first project - result1 = upload_test_file( - integration_client, proj1, pkg_name, content, tag="v1" - ) - assert result1["artifact_id"] == expected_hash - assert result1["deduplicated"] is False - - # Upload to second project - result2 = upload_test_file( - integration_client, proj2, pkg_name, content, tag="v1" - ) - assert result2["artifact_id"] == expected_hash - assert result2["deduplicated"] is True - - finally: - # Cleanup - integration_client.delete(f"/api/v1/projects/{proj1}") - integration_client.delete(f"/api/v1/projects/{proj2}") - @pytest.mark.integration def test_same_file_different_filenames_shares_artifact( self, integration_client, test_package ): - """Test uploading same file with different original filenames shares artifact.""" + """Test uploading same file with different filenames shares artifact.""" project, package = test_package content = b"content with different filenames" expected_hash = compute_sha256(content) @@ -186,110 +211,68 @@ class TestDuplicateUploadScenarios: assert result2["artifact_id"] == expected_hash assert result2["deduplicated"] is True - @pytest.mark.integration - def test_same_file_different_tags_shares_artifact( - self, integration_client, test_package, unique_test_id - ): - """Test uploading same file with different tags shares artifact.""" - project, package = test_package - content = f"content with different tags {unique_test_id}".encode() - expected_hash = compute_sha256(content) - tags = ["latest", "stable", "v1.0.0", "release"] - for i, tag in enumerate(tags): - result = upload_test_file( - integration_client, project, package, content, tag=tag - ) - assert result["artifact_id"] == expected_hash - if i == 0: - assert result["deduplicated"] is False - else: - 
assert result["deduplicated"] is True - - -class TestStorageVerification: - """Tests to verify storage behavior after duplicate uploads.""" +class TestDownload: + """Tests for download functionality.""" @pytest.mark.integration - def test_artifact_table_single_row_after_duplicates( - self, integration_client, test_package - ): - """Test artifact table contains only one row after duplicate uploads.""" + def test_download_by_tag(self, integration_client, test_package): + """Test downloading artifact by tag name.""" project, package = test_package - content = b"content for single row test" - expected_hash = compute_sha256(content) + original_content = b"download by tag test" - # Upload same content multiple times with different tags - for tag in ["v1", "v2", "v3"]: - upload_test_file(integration_client, project, package, content, tag=tag) + upload_test_file( + integration_client, project, package, original_content, tag="download-tag" + ) - # Query artifact - should exist and be unique - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.status_code == 200 - artifact = response.json() - assert artifact["id"] == expected_hash - assert artifact["ref_count"] == 3 - - @pytest.mark.integration - def test_upload_table_multiple_rows_for_duplicates( - self, integration_client, test_package - ): - """Test upload table contains multiple rows for duplicate uploads (event tracking).""" - project, package = test_package - content = b"content for upload tracking test" - - # Upload same content 3 times - for tag in ["upload1", "upload2", "upload3"]: - upload_test_file(integration_client, project, package, content, tag=tag) - - # Check package stats - should show 3 uploads but fewer unique artifacts response = integration_client.get( - f"/api/v1/project/{project}/packages/{package}" + f"/api/v1/project/{project}/{package}/+/download-tag", + params={"mode": "proxy"}, ) assert response.status_code == 200 - pkg_info = response.json() - assert 
pkg_info["tag_count"] == 3 + assert response.content == original_content @pytest.mark.integration - def test_artifact_content_matches_original(self, integration_client, test_package): - """Test artifact content retrieved matches original content exactly.""" + def test_download_by_artifact_id(self, integration_client, test_package): + """Test downloading artifact by artifact ID.""" + project, package = test_package + original_content = b"download by id test" + expected_hash = compute_sha256(original_content) + + upload_test_file(integration_client, project, package, original_content) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/artifact:{expected_hash}", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == original_content + + @pytest.mark.integration + def test_download_nonexistent_tag(self, integration_client, test_package): + """Test downloading nonexistent tag returns 404.""" + project, package = test_package + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/nonexistent-tag" + ) + assert response.status_code == 404 + + @pytest.mark.integration + def test_content_matches_original(self, integration_client, test_package): + """Test downloaded content matches original exactly.""" project, package = test_package original_content = b"exact content verification test data 12345" - # Upload - result = upload_test_file( + upload_test_file( integration_client, project, package, original_content, tag="verify" ) - # Download and compare - download_response = integration_client.get( + response = integration_client.get( f"/api/v1/project/{project}/{package}/+/verify", params={"mode": "proxy"} ) - assert download_response.status_code == 200 - downloaded_content = download_response.content - assert downloaded_content == original_content - - @pytest.mark.integration - def test_storage_stats_reflect_deduplication( - self, integration_client, test_package - ): - """Test 
total storage size matches single artifact size after duplicates.""" - project, package = test_package - content = b"content for storage stats test - should only count once" - content_size = len(content) - - # Upload same content 5 times - for tag in ["a", "b", "c", "d", "e"]: - upload_test_file(integration_client, project, package, content, tag=tag) - - # Check global stats - response = integration_client.get("/api/v1/stats") assert response.status_code == 200 - stats = response.json() - - # Deduplication should show savings - assert stats["deduplicated_uploads"] > 0 - assert stats["storage_saved_bytes"] > 0 + assert response.content == original_content class TestConcurrentUploads: @@ -308,7 +291,6 @@ class TestConcurrentUploads: def upload_worker(tag_suffix): try: - # Create a new client for this thread from httpx import Client base_url = "http://localhost:8080" @@ -332,13 +314,11 @@ class TestConcurrentUploads: except Exception as e: errors.append(str(e)) - # Run concurrent uploads with ThreadPoolExecutor(max_workers=num_concurrent) as executor: futures = [executor.submit(upload_worker, i) for i in range(num_concurrent)] for future in as_completed(futures): - pass # Wait for all to complete + pass - # Verify results assert len(errors) == 0, f"Errors during concurrent uploads: {errors}" assert len(results) == num_concurrent @@ -353,227 +333,27 @@ class TestConcurrentUploads: assert response.json()["ref_count"] == num_concurrent -class TestDeduplicationAcrossRestarts: - """Tests for deduplication persistence.""" - - @pytest.mark.integration - def test_deduplication_persists( - self, integration_client, test_package, unique_test_id - ): - """ - Test deduplication works with persisted data. - - This test uploads content, then uploads the same content again. - Since the database persists, the second upload should detect - the existing artifact even without server restart. 
- """ - project, package = test_package - content = f"persisted content for dedup test {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # First upload - result1 = upload_test_file( - integration_client, project, package, content, tag="persist1" - ) - assert result1["artifact_id"] == expected_hash - assert result1["deduplicated"] is False - - # Second upload (simulating after restart - data is persisted) - result2 = upload_test_file( - integration_client, project, package, content, tag="persist2" - ) - assert result2["artifact_id"] == expected_hash - assert result2["deduplicated"] is True - - # Verify artifact exists with correct ref_count - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.status_code == 200 - assert response.json()["ref_count"] == 2 - - -class TestS3ObjectVerification: - """Tests to verify S3 storage behavior directly.""" - - @pytest.mark.integration - def test_s3_bucket_single_object_after_duplicates( - self, integration_client, test_package, unique_test_id - ): - """Test S3 bucket contains only one object after duplicate uploads.""" - project, package = test_package - content = f"content for s3 object count test {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # Upload same content multiple times with different tags - for tag in ["s3test1", "s3test2", "s3test3"]: - upload_test_file(integration_client, project, package, content, tag=tag) - - # Verify only one S3 object exists for this hash - s3_objects = list_s3_objects_by_hash(expected_hash) - assert len(s3_objects) == 1, ( - f"Expected 1 S3 object, found {len(s3_objects)}: {s3_objects}" - ) - - # Verify the object key follows expected pattern - expected_key = ( - f"fruits/{expected_hash[:2]}/{expected_hash[2:4]}/{expected_hash}" - ) - assert s3_objects[0] == expected_key - - -class TestUploadFailureCleanup: - """Tests for cleanup when uploads fail.""" - - @pytest.mark.integration - def 
test_upload_failure_invalid_project_no_orphaned_s3( - self, integration_client, unique_test_id - ): - """Test upload to non-existent project doesn't leave orphaned S3 objects.""" - content = f"content for orphan s3 test {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # Attempt upload to non-existent project - files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} - response = integration_client.post( - f"/api/v1/project/nonexistent-project-{unique_test_id}/nonexistent-pkg/upload", - files=files, - data={"tag": "test"}, - ) - - # Upload should fail - assert response.status_code == 404 - - # Verify no S3 object was created - assert not s3_object_exists(expected_hash), ( - "Orphaned S3 object found after failed upload" - ) - - @pytest.mark.integration - def test_upload_failure_invalid_package_no_orphaned_s3( - self, integration_client, test_project, unique_test_id - ): - """Test upload to non-existent package doesn't leave orphaned S3 objects.""" - content = f"content for orphan s3 test pkg {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # Attempt upload to non-existent package - files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} - response = integration_client.post( - f"/api/v1/project/{test_project}/nonexistent-package-{unique_test_id}/upload", - files=files, - data={"tag": "test"}, - ) - - # Upload should fail - assert response.status_code == 404 - - # Verify no S3 object was created - assert not s3_object_exists(expected_hash), ( - "Orphaned S3 object found after failed upload" - ) - - @pytest.mark.integration - def test_upload_failure_empty_file_no_orphaned_s3( - self, integration_client, test_package, unique_test_id - ): - """Test upload of empty file doesn't leave orphaned S3 objects or DB records.""" - project, package = test_package - content = b"" # Empty content - - # Attempt upload of empty file - files = {"file": ("empty.bin", io.BytesIO(content), 
"application/octet-stream")} - response = integration_client.post( - f"/api/v1/project/{project}/{package}/upload", - files=files, - data={"tag": f"empty-{unique_test_id}"}, - ) - - # Upload should fail (empty files are rejected) - assert response.status_code in (400, 422), ( - f"Expected 400/422, got {response.status_code}" - ) - - @pytest.mark.integration - def test_upload_failure_no_orphaned_database_records( - self, integration_client, test_project, unique_test_id - ): - """Test failed upload doesn't leave orphaned database records.""" - content = f"content for db orphan test {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # Attempt upload to non-existent package (should fail before DB insert) - files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} - response = integration_client.post( - f"/api/v1/project/{test_project}/nonexistent-package-{unique_test_id}/upload", - files=files, - data={"tag": "test"}, - ) - - # Upload should fail - assert response.status_code == 404 - - # Verify no artifact record was created - artifact_response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert artifact_response.status_code == 404, ( - "Orphaned artifact record found after failed upload" - ) - - @pytest.mark.integration - def test_duplicate_tag_upload_handles_gracefully( - self, integration_client, test_package, unique_test_id - ): - """Test uploading with duplicate tag is handled without orphaned data.""" - project, package = test_package - content1 = f"content version 1 {unique_test_id}".encode() - content2 = f"content version 2 {unique_test_id}".encode() - tag = f"duplicate-tag-{unique_test_id}" - - # First upload with tag - result1 = upload_test_file( - integration_client, project, package, content1, tag=tag - ) - hash1 = result1["artifact_id"] - - # Second upload with same tag (should update the tag to point to new artifact) - result2 = upload_test_file( - integration_client, project, package, 
content2, tag=tag - ) - hash2 = result2["artifact_id"] - - # Both artifacts should exist - assert integration_client.get(f"/api/v1/artifact/{hash1}").status_code == 200 - assert integration_client.get(f"/api/v1/artifact/{hash2}").status_code == 200 - - # Tag should point to the second artifact - tag_response = integration_client.get( - f"/api/v1/project/{project}/{package}/tags/{tag}" - ) - assert tag_response.status_code == 200 - assert tag_response.json()["artifact_id"] == hash2 - - class TestFileSizeValidation: """Tests for file size limits and empty file rejection.""" @pytest.mark.integration def test_empty_file_rejected(self, integration_client, test_package): - """Test that empty files are rejected with appropriate error.""" + """Test empty files are rejected with appropriate error.""" project, package = test_package - # Try to upload empty content files = {"file": ("empty.txt", io.BytesIO(b""), "application/octet-stream")} response = integration_client.post( f"/api/v1/project/{project}/{package}/upload", files=files, ) - # Should be rejected (422 from storage layer or validation) assert response.status_code in [422, 400] @pytest.mark.integration def test_small_valid_file_accepted(self, integration_client, test_package): - """Test that small (1 byte) files are accepted.""" + """Test small (1 byte) files are accepted.""" project, package = test_package - content = b"X" # Single byte + content = b"X" result = upload_test_file( integration_client, project, package, content, tag="tiny" @@ -586,7 +366,7 @@ class TestFileSizeValidation: def test_file_size_reported_correctly( self, integration_client, test_package, unique_test_id ): - """Test that file size is correctly reported in response.""" + """Test file size is correctly reported in response.""" project, package = test_package content = f"Test content for size check {unique_test_id}".encode() expected_size = len(content) @@ -602,3 +382,121 @@ class TestFileSizeValidation: 
f"/api/v1/artifact/{result['artifact_id']}" ) assert artifact_response.json()["size"] == expected_size + + +class TestUploadFailureCleanup: + """Tests for cleanup when uploads fail.""" + + @pytest.mark.integration + def test_upload_failure_invalid_project_no_orphaned_s3( + self, integration_client, unique_test_id + ): + """Test upload to non-existent project doesn't leave orphaned S3 objects.""" + content = f"content for orphan s3 test {unique_test_id}".encode() + expected_hash = compute_sha256(content) + + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/nonexistent-project-{unique_test_id}/nonexistent-pkg/upload", + files=files, + data={"tag": "test"}, + ) + + assert response.status_code == 404 + + # Verify no S3 object was created + assert not s3_object_exists(expected_hash), ( + "Orphaned S3 object found after failed upload" + ) + + @pytest.mark.integration + def test_upload_failure_invalid_package_no_orphaned_s3( + self, integration_client, test_project, unique_test_id + ): + """Test upload to non-existent package doesn't leave orphaned S3 objects.""" + content = f"content for orphan s3 test pkg {unique_test_id}".encode() + expected_hash = compute_sha256(content) + + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{test_project}/nonexistent-package-{unique_test_id}/upload", + files=files, + data={"tag": "test"}, + ) + + assert response.status_code == 404 + + assert not s3_object_exists(expected_hash), ( + "Orphaned S3 object found after failed upload" + ) + + @pytest.mark.integration + def test_upload_failure_no_orphaned_database_records( + self, integration_client, test_project, unique_test_id + ): + """Test failed upload doesn't leave orphaned database records.""" + content = f"content for db orphan test {unique_test_id}".encode() + expected_hash = compute_sha256(content) + + files 
= {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{test_project}/nonexistent-package-{unique_test_id}/upload", + files=files, + data={"tag": "test"}, + ) + + assert response.status_code == 404 + + artifact_response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert artifact_response.status_code == 404, ( + "Orphaned artifact record found after failed upload" + ) + + +class TestS3StorageVerification: + """Tests to verify S3 storage behavior.""" + + @pytest.mark.integration + def test_s3_single_object_after_duplicates( + self, integration_client, test_package, unique_test_id + ): + """Test S3 bucket contains only one object after duplicate uploads.""" + project, package = test_package + content = f"content for s3 object count test {unique_test_id}".encode() + expected_hash = compute_sha256(content) + + # Upload same content multiple times + for tag in ["s3test1", "s3test2", "s3test3"]: + upload_test_file(integration_client, project, package, content, tag=tag) + + # Verify only one S3 object exists + s3_objects = list_s3_objects_by_hash(expected_hash) + assert len(s3_objects) == 1, ( + f"Expected 1 S3 object, found {len(s3_objects)}: {s3_objects}" + ) + + # Verify object key follows expected pattern + expected_key = ( + f"fruits/{expected_hash[:2]}/{expected_hash[2:4]}/{expected_hash}" + ) + assert s3_objects[0] == expected_key + + @pytest.mark.integration + def test_artifact_table_single_row_after_duplicates( + self, integration_client, test_package + ): + """Test artifact table contains only one row after duplicate uploads.""" + project, package = test_package + content = b"content for single row test" + expected_hash = compute_sha256(content) + + # Upload same content multiple times + for tag in ["v1", "v2", "v3"]: + upload_test_file(integration_client, project, package, content, tag=tag) + + # Query artifact + response = 
integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.status_code == 200 + artifact = response.json() + assert artifact["id"] == expected_hash + assert artifact["ref_count"] == 3 diff --git a/backend/tests/test_duplicate_detection.py b/backend/tests/test_duplicate_detection.py deleted file mode 100644 index b2284b3..0000000 --- a/backend/tests/test_duplicate_detection.py +++ /dev/null @@ -1,207 +0,0 @@ -""" -Unit tests for duplicate detection and deduplication logic. - -Tests cover: -- _exists() method correctly identifies existing S3 keys -- S3 key generation follows expected pattern -- Storage layer skips upload when artifact already exists -- Storage layer performs upload when artifact does not exist -""" - -import pytest -import io -from unittest.mock import MagicMock, patch -from tests.conftest import ( - compute_sha256, - TEST_CONTENT_HELLO, - TEST_HASH_HELLO, -) - - -class TestExistsMethod: - """Tests for the _exists() method that checks S3 object existence.""" - - @pytest.mark.unit - def test_exists_returns_true_for_existing_key(self, mock_storage, mock_s3_client): - """Test _exists() returns True when object exists.""" - # Pre-populate the mock storage - test_key = "fruits/df/fd/test-hash" - mock_s3_client.objects[test_key] = b"content" - - result = mock_storage._exists(test_key) - - assert result is True - - @pytest.mark.unit - def test_exists_returns_false_for_nonexistent_key(self, mock_storage): - """Test _exists() returns False when object doesn't exist.""" - result = mock_storage._exists("fruits/no/ne/nonexistent-key") - - assert result is False - - @pytest.mark.unit - def test_exists_handles_404_error(self, mock_storage): - """Test _exists() handles 404 errors gracefully.""" - # The mock client raises ClientError for nonexistent keys - result = mock_storage._exists("fruits/xx/yy/does-not-exist") - - assert result is False - - -class TestS3KeyGeneration: - """Tests for S3 key pattern generation.""" - - @pytest.mark.unit - def 
test_s3_key_pattern(self): - """Test S3 key follows pattern: fruits/{hash[:2]}/{hash[2:4]}/{hash}""" - test_hash = "abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890" - - expected_key = f"fruits/{test_hash[:2]}/{test_hash[2:4]}/{test_hash}" - # Expected: fruits/ab/cd/abcdef1234567890... - - assert expected_key == f"fruits/ab/cd/{test_hash}" - - @pytest.mark.unit - def test_s3_key_generation_in_storage(self, mock_storage): - """Test storage layer generates correct S3 key.""" - content = TEST_CONTENT_HELLO - file_obj = io.BytesIO(content) - - result = mock_storage._store_simple(file_obj) - - expected_key = ( - f"fruits/{TEST_HASH_HELLO[:2]}/{TEST_HASH_HELLO[2:4]}/{TEST_HASH_HELLO}" - ) - assert result.s3_key == expected_key - - @pytest.mark.unit - def test_s3_key_uses_sha256_hash(self, mock_storage): - """Test S3 key is derived from SHA256 hash.""" - content = b"unique test content for key test" - file_obj = io.BytesIO(content) - expected_hash = compute_sha256(content) - - result = mock_storage._store_simple(file_obj) - - # Key should contain the hash - assert expected_hash in result.s3_key - - -class TestDeduplicationBehavior: - """Tests for deduplication (skip upload when exists).""" - - @pytest.mark.unit - def test_skips_upload_when_exists(self, mock_storage, mock_s3_client): - """Test storage skips S3 upload when artifact already exists.""" - content = TEST_CONTENT_HELLO - s3_key = ( - f"fruits/{TEST_HASH_HELLO[:2]}/{TEST_HASH_HELLO[2:4]}/{TEST_HASH_HELLO}" - ) - - # Pre-populate storage (simulate existing artifact) - mock_s3_client.objects[s3_key] = content - - # Track put_object calls - original_put = mock_s3_client.put_object - put_called = [] - - def tracked_put(*args, **kwargs): - put_called.append(True) - return original_put(*args, **kwargs) - - mock_s3_client.put_object = tracked_put - - # Store the same content - file_obj = io.BytesIO(content) - result = mock_storage._store_simple(file_obj) - - # put_object should NOT have been called 
(deduplication) - assert len(put_called) == 0 - assert result.sha256 == TEST_HASH_HELLO - - @pytest.mark.unit - def test_uploads_when_not_exists(self, mock_storage, mock_s3_client): - """Test storage uploads to S3 when artifact doesn't exist.""" - content = b"brand new unique content" - content_hash = compute_sha256(content) - s3_key = f"fruits/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - - # Ensure object doesn't exist - assert s3_key not in mock_s3_client.objects - - # Store the content - file_obj = io.BytesIO(content) - result = mock_storage._store_simple(file_obj) - - # Object should now exist in mock storage - assert s3_key in mock_s3_client.objects - assert mock_s3_client.objects[s3_key] == content - - @pytest.mark.unit - def test_returns_same_hash_for_duplicate(self, mock_storage, mock_s3_client): - """Test storing same content twice returns same hash.""" - content = b"content to be stored twice" - - # First store - file1 = io.BytesIO(content) - result1 = mock_storage._store_simple(file1) - - # Second store (duplicate) - file2 = io.BytesIO(content) - result2 = mock_storage._store_simple(file2) - - assert result1.sha256 == result2.sha256 - assert result1.s3_key == result2.s3_key - - @pytest.mark.unit - def test_different_content_different_keys(self, mock_storage): - """Test different content produces different S3 keys.""" - content1 = b"first content" - content2 = b"second content" - - file1 = io.BytesIO(content1) - result1 = mock_storage._store_simple(file1) - - file2 = io.BytesIO(content2) - result2 = mock_storage._store_simple(file2) - - assert result1.sha256 != result2.sha256 - assert result1.s3_key != result2.s3_key - - -class TestDeduplicationEdgeCases: - """Edge case tests for deduplication.""" - - @pytest.mark.unit - def test_same_content_different_filenames(self, mock_storage): - """Test same content with different metadata is deduplicated.""" - content = b"identical content" - - # Store with "filename1" - file1 = io.BytesIO(content) - 
result1 = mock_storage._store_simple(file1) - - # Store with "filename2" (same content) - file2 = io.BytesIO(content) - result2 = mock_storage._store_simple(file2) - - # Both should have same hash (content-addressable) - assert result1.sha256 == result2.sha256 - - @pytest.mark.unit - def test_whitespace_only_difference(self, mock_storage): - """Test content differing only by whitespace produces different hashes.""" - content1 = b"test content" - content2 = b"test content" # Extra space - content3 = b"test content " # Trailing space - - file1 = io.BytesIO(content1) - file2 = io.BytesIO(content2) - file3 = io.BytesIO(content3) - - result1 = mock_storage._store_simple(file1) - result2 = mock_storage._store_simple(file2) - result3 = mock_storage._store_simple(file3) - - # All should be different (content-addressable) - assert len({result1.sha256, result2.sha256, result3.sha256}) == 3 diff --git a/backend/tests/test_garbage_collection.py b/backend/tests/test_garbage_collection.py deleted file mode 100644 index 698f98b..0000000 --- a/backend/tests/test_garbage_collection.py +++ /dev/null @@ -1,168 +0,0 @@ -""" -Integration tests for garbage collection functionality. 
- -Tests cover: -- Listing orphaned artifacts (ref_count=0) -- Garbage collection in dry-run mode -- Garbage collection actual deletion -- Verifying artifacts with refs are not deleted -""" - -import pytest -from tests.conftest import ( - compute_sha256, - upload_test_file, -) - - -class TestOrphanedArtifactsEndpoint: - """Tests for GET /api/v1/admin/orphaned-artifacts endpoint.""" - - @pytest.mark.integration - def test_list_orphaned_artifacts_returns_list(self, integration_client): - """Test orphaned artifacts endpoint returns a list.""" - response = integration_client.get("/api/v1/admin/orphaned-artifacts") - assert response.status_code == 200 - assert isinstance(response.json(), list) - - @pytest.mark.integration - def test_orphaned_artifact_has_required_fields(self, integration_client): - """Test orphaned artifact response has required fields.""" - response = integration_client.get("/api/v1/admin/orphaned-artifacts?limit=1") - assert response.status_code == 200 - - data = response.json() - if len(data) > 0: - artifact = data[0] - assert "id" in artifact - assert "size" in artifact - assert "created_at" in artifact - assert "created_by" in artifact - assert "original_name" in artifact - - @pytest.mark.integration - def test_orphaned_artifacts_respects_limit(self, integration_client): - """Test orphaned artifacts endpoint respects limit parameter.""" - response = integration_client.get("/api/v1/admin/orphaned-artifacts?limit=5") - assert response.status_code == 200 - assert len(response.json()) <= 5 - - @pytest.mark.integration - def test_artifact_becomes_orphaned_when_tag_deleted( - self, integration_client, test_package, unique_test_id - ): - """Test artifact appears in orphaned list after tag is deleted.""" - project, package = test_package - content = f"orphan test {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # Upload with tag - upload_test_file(integration_client, project, package, content, tag="temp-tag") - - # Verify not in 
orphaned list (has ref_count=1) - response = integration_client.get("/api/v1/admin/orphaned-artifacts?limit=1000") - orphaned_ids = [a["id"] for a in response.json()] - assert expected_hash not in orphaned_ids - - # Delete the tag - integration_client.delete(f"/api/v1/project/{project}/{package}/tags/temp-tag") - - # Verify now in orphaned list (ref_count=0) - response = integration_client.get("/api/v1/admin/orphaned-artifacts?limit=1000") - orphaned_ids = [a["id"] for a in response.json()] - assert expected_hash in orphaned_ids - - -class TestGarbageCollectionEndpoint: - """Tests for POST /api/v1/admin/garbage-collect endpoint.""" - - @pytest.mark.integration - def test_garbage_collect_dry_run_returns_response(self, integration_client): - """Test garbage collection dry run returns valid response.""" - response = integration_client.post("/api/v1/admin/garbage-collect?dry_run=true") - assert response.status_code == 200 - - data = response.json() - assert "artifacts_deleted" in data - assert "bytes_freed" in data - assert "artifact_ids" in data - assert "dry_run" in data - assert data["dry_run"] is True - - @pytest.mark.integration - def test_garbage_collect_dry_run_doesnt_delete( - self, integration_client, test_package, unique_test_id - ): - """Test garbage collection dry run doesn't actually delete artifacts.""" - project, package = test_package - content = f"dry run test {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # Upload and delete tag to create orphan - upload_test_file(integration_client, project, package, content, tag="dry-run") - integration_client.delete(f"/api/v1/project/{project}/{package}/tags/dry-run") - - # Verify artifact exists - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.status_code == 200 - - # Run garbage collection in dry-run mode - gc_response = integration_client.post( - "/api/v1/admin/garbage-collect?dry_run=true&limit=1000" - ) - assert gc_response.status_code == 200 
- assert expected_hash in gc_response.json()["artifact_ids"] - - # Verify artifact STILL exists (dry run didn't delete) - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.status_code == 200 - - @pytest.mark.integration - def test_garbage_collect_preserves_referenced_artifacts( - self, integration_client, test_package, unique_test_id - ): - """Test garbage collection doesn't delete artifacts with ref_count > 0.""" - project, package = test_package - content = f"preserve test {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # Upload with tag (ref_count=1) - upload_test_file(integration_client, project, package, content, tag="keep-this") - - # Verify artifact exists with ref_count=1 - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.status_code == 200 - assert response.json()["ref_count"] == 1 - - # Run garbage collection (dry_run to not affect other tests) - gc_response = integration_client.post( - "/api/v1/admin/garbage-collect?dry_run=true&limit=1000" - ) - assert gc_response.status_code == 200 - - # Verify artifact was NOT in delete list (has ref_count > 0) - assert expected_hash not in gc_response.json()["artifact_ids"] - - # Verify artifact still exists - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.status_code == 200 - assert response.json()["ref_count"] == 1 - - @pytest.mark.integration - def test_garbage_collect_respects_limit(self, integration_client): - """Test garbage collection respects limit parameter.""" - response = integration_client.post( - "/api/v1/admin/garbage-collect?dry_run=true&limit=5" - ) - assert response.status_code == 200 - assert response.json()["artifacts_deleted"] <= 5 - - @pytest.mark.integration - def test_garbage_collect_returns_bytes_freed(self, integration_client): - """Test garbage collection returns accurate bytes_freed.""" - response = 
integration_client.post("/api/v1/admin/garbage-collect?dry_run=true") - assert response.status_code == 200 - - data = response.json() - assert data["bytes_freed"] >= 0 - assert isinstance(data["bytes_freed"], int) diff --git a/backend/tests/test_hash_calculation.py b/backend/tests/test_hash_calculation.py deleted file mode 100644 index 309065e..0000000 --- a/backend/tests/test_hash_calculation.py +++ /dev/null @@ -1,215 +0,0 @@ -""" -Unit tests for SHA256 hash calculation and deduplication logic. - -Tests cover: -- Hash computation produces consistent results -- Hash is always 64 character lowercase hexadecimal -- Different content produces different hashes -- Binary content handling -- Large file handling (streaming) -""" - -import pytest -import hashlib -import io -from tests.conftest import ( - create_test_file, - compute_sha256, - TEST_CONTENT_HELLO, - TEST_HASH_HELLO, - TEST_CONTENT_BINARY, - TEST_HASH_BINARY, -) - - -class TestHashComputation: - """Unit tests for hash calculation functionality.""" - - @pytest.mark.unit - def test_sha256_consistent_results(self): - """Test SHA256 hash produces consistent results for identical content.""" - content = b"test content for hashing" - - # Compute hash multiple times - hash1 = compute_sha256(content) - hash2 = compute_sha256(content) - hash3 = compute_sha256(content) - - assert hash1 == hash2 == hash3 - - @pytest.mark.unit - def test_sha256_different_content_different_hash(self): - """Test SHA256 produces different hashes for different content.""" - content1 = b"content version 1" - content2 = b"content version 2" - - hash1 = compute_sha256(content1) - hash2 = compute_sha256(content2) - - assert hash1 != hash2 - - @pytest.mark.unit - def test_sha256_format_64_char_hex(self): - """Test SHA256 hash is always 64 character lowercase hexadecimal.""" - test_cases = [ - b"", # Empty - b"a", # Single char - b"Hello, World!", # Normal string - bytes(range(256)), # All byte values - b"x" * 10000, # Larger content - ] - - for 
content in test_cases: - hash_value = compute_sha256(content) - - # Check length - assert len(hash_value) == 64, ( - f"Hash length should be 64, got {len(hash_value)}" - ) - - # Check lowercase - assert hash_value == hash_value.lower(), "Hash should be lowercase" - - # Check hexadecimal - assert all(c in "0123456789abcdef" for c in hash_value), ( - "Hash should be hex" - ) - - @pytest.mark.unit - def test_sha256_known_value(self): - """Test SHA256 produces expected hash for known input.""" - assert compute_sha256(TEST_CONTENT_HELLO) == TEST_HASH_HELLO - - @pytest.mark.unit - def test_sha256_binary_content(self): - """Test SHA256 handles binary content correctly.""" - assert compute_sha256(TEST_CONTENT_BINARY) == TEST_HASH_BINARY - - # Test with null bytes - content_with_nulls = b"\x00\x00test\x00\x00" - hash_value = compute_sha256(content_with_nulls) - assert len(hash_value) == 64 - - @pytest.mark.unit - def test_sha256_streaming_computation(self): - """Test SHA256 can be computed in chunks (streaming).""" - # Large content - chunk_size = 8192 - total_size = chunk_size * 10 # 80KB - content = b"x" * total_size - - # Direct computation - direct_hash = compute_sha256(content) - - # Streaming computation - hasher = hashlib.sha256() - for i in range(0, total_size, chunk_size): - hasher.update(content[i : i + chunk_size]) - streaming_hash = hasher.hexdigest() - - assert direct_hash == streaming_hash - - @pytest.mark.unit - def test_sha256_order_matters(self): - """Test that content order affects hash (not just content set).""" - content1 = b"AB" - content2 = b"BA" - - assert compute_sha256(content1) != compute_sha256(content2) - - -class TestStorageHashComputation: - """Tests for hash computation in the storage layer.""" - - @pytest.mark.unit - def test_storage_computes_sha256(self, mock_storage): - """Test storage layer correctly computes SHA256 hash.""" - content = TEST_CONTENT_HELLO - file_obj = io.BytesIO(content) - - result = mock_storage._store_simple(file_obj) - 
- assert result.sha256 == TEST_HASH_HELLO - - @pytest.mark.unit - def test_storage_computes_md5(self, mock_storage): - """Test storage layer also computes MD5 hash.""" - content = TEST_CONTENT_HELLO - file_obj = io.BytesIO(content) - - result = mock_storage._store_simple(file_obj) - - expected_md5 = hashlib.md5(content).hexdigest() - assert result.md5 == expected_md5 - - @pytest.mark.unit - def test_storage_computes_sha1(self, mock_storage): - """Test storage layer also computes SHA1 hash.""" - content = TEST_CONTENT_HELLO - file_obj = io.BytesIO(content) - - result = mock_storage._store_simple(file_obj) - - expected_sha1 = hashlib.sha1(content).hexdigest() - assert result.sha1 == expected_sha1 - - @pytest.mark.unit - def test_storage_returns_correct_size(self, mock_storage): - """Test storage layer returns correct file size.""" - content = b"test content with known size" - file_obj = io.BytesIO(content) - - result = mock_storage._store_simple(file_obj) - - assert result.size == len(content) - - @pytest.mark.unit - def test_storage_generates_correct_s3_key(self, mock_storage): - """Test storage layer generates correct S3 key pattern.""" - content = TEST_CONTENT_HELLO - file_obj = io.BytesIO(content) - - result = mock_storage._store_simple(file_obj) - - # Key should be: fruits/{hash[:2]}/{hash[2:4]}/{hash} - expected_key = ( - f"fruits/{TEST_HASH_HELLO[:2]}/{TEST_HASH_HELLO[2:4]}/{TEST_HASH_HELLO}" - ) - assert result.s3_key == expected_key - - -class TestHashEdgeCases: - """Edge case tests for hash computation.""" - - @pytest.mark.unit - def test_hash_empty_content_rejected(self, mock_storage): - """Test that empty content is rejected.""" - from app.storage import HashComputationError - - file_obj = io.BytesIO(b"") - - with pytest.raises(HashComputationError): - mock_storage._store_simple(file_obj) - - @pytest.mark.unit - def test_hash_large_file_streaming(self, mock_storage): - """Test hash computation for large files uses streaming.""" - # Create a 10MB file - 
size = 10 * 1024 * 1024 - content = b"x" * size - file_obj = io.BytesIO(content) - - result = mock_storage._store_simple(file_obj) - - expected_hash = compute_sha256(content) - assert result.sha256 == expected_hash - - @pytest.mark.unit - def test_hash_special_bytes(self): - """Test hash handles all byte values correctly.""" - # All possible byte values - content = bytes(range(256)) - hash_value = compute_sha256(content) - - assert len(hash_value) == 64 - assert hash_value == TEST_HASH_BINARY diff --git a/backend/tests/test_ref_count.py b/backend/tests/test_ref_count.py deleted file mode 100644 index 6a59995..0000000 --- a/backend/tests/test_ref_count.py +++ /dev/null @@ -1,458 +0,0 @@ -""" -Unit and integration tests for reference counting behavior. - -Tests cover: -- ref_count is set correctly for new artifacts -- ref_count increments on duplicate uploads -- ref_count query correctly identifies existing artifacts -- Artifact lookup by SHA256 hash works correctly -""" - -import pytest -import io -from tests.conftest import ( - compute_sha256, - upload_test_file, - TEST_CONTENT_HELLO, - TEST_HASH_HELLO, -) - - -class TestRefCountQuery: - """Tests for ref_count querying and artifact lookup.""" - - @pytest.mark.integration - def test_artifact_lookup_by_sha256(self, integration_client, test_package): - """Test artifact lookup by SHA256 hash (primary key) works correctly.""" - project, package = test_package - content = b"unique content for lookup test" - expected_hash = compute_sha256(content) - - # Upload a file - upload_result = upload_test_file( - integration_client, project, package, content, tag="v1" - ) - assert upload_result["artifact_id"] == expected_hash - - # Look up artifact by ID (SHA256) - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.status_code == 200 - - artifact = response.json() - assert artifact["id"] == expected_hash - assert artifact["sha256"] == expected_hash - assert artifact["size"] == len(content) - - 
@pytest.mark.integration - def test_ref_count_query_identifies_existing_artifact( - self, integration_client, test_package - ): - """Test ref_count query correctly identifies existing artifacts by hash.""" - project, package = test_package - content = b"content for ref count query test" - expected_hash = compute_sha256(content) - - # Upload a file with a tag - upload_result = upload_test_file( - integration_client, project, package, content, tag="v1" - ) - - # Query artifact and check ref_count - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.status_code == 200 - - artifact = response.json() - assert artifact["ref_count"] >= 1 # At least 1 from the tag - - @pytest.mark.integration - def test_ref_count_set_to_1_for_new_artifact_with_tag( - self, integration_client, test_package, unique_test_id - ): - """Test ref_count is set to 1 for new artifacts when created with a tag.""" - project, package = test_package - content = f"brand new content for ref count test {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # Upload a new file with a tag - upload_result = upload_test_file( - integration_client, project, package, content, tag="initial" - ) - - assert upload_result["artifact_id"] == expected_hash - assert upload_result["ref_count"] == 1 - assert upload_result["deduplicated"] is False - - @pytest.mark.integration - def test_ref_count_increments_on_duplicate_upload_with_tag( - self, integration_client, test_package, unique_test_id - ): - """Test ref_count is incremented when duplicate content is uploaded with a new tag.""" - project, package = test_package - content = f"content that will be uploaded twice {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # First upload with tag - result1 = upload_test_file( - integration_client, project, package, content, tag="v1" - ) - assert result1["ref_count"] == 1 - assert result1["deduplicated"] is False - - # Second upload with different tag 
(same content) - result2 = upload_test_file( - integration_client, project, package, content, tag="v2" - ) - assert result2["artifact_id"] == expected_hash - assert result2["ref_count"] == 2 - assert result2["deduplicated"] is True - - @pytest.mark.integration - def test_ref_count_after_multiple_tags(self, integration_client, test_package): - """Test ref_count correctly reflects number of tags pointing to artifact.""" - project, package = test_package - content = b"content for multiple tag test" - expected_hash = compute_sha256(content) - - # Upload with multiple tags - tags = ["v1", "v2", "v3", "latest"] - for i, tag in enumerate(tags): - result = upload_test_file( - integration_client, project, package, content, tag=tag - ) - assert result["artifact_id"] == expected_hash - assert result["ref_count"] == i + 1 - - # Verify final ref_count via artifact endpoint - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.status_code == 200 - assert response.json()["ref_count"] == len(tags) - - -class TestRefCountWithDeletion: - """Tests for ref_count behavior when tags are deleted.""" - - @pytest.mark.integration - def test_ref_count_decrements_on_tag_delete(self, integration_client, test_package): - """Test ref_count decrements when a tag is deleted.""" - project, package = test_package - content = b"content for delete test" - expected_hash = compute_sha256(content) - - # Upload with two tags - upload_test_file(integration_client, project, package, content, tag="v1") - upload_test_file(integration_client, project, package, content, tag="v2") - - # Verify ref_count is 2 - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 2 - - # Delete one tag - delete_response = integration_client.delete( - f"/api/v1/project/{project}/{package}/tags/v1" - ) - assert delete_response.status_code == 204 - - # Verify ref_count is now 1 - response = 
integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 1 - - @pytest.mark.integration - def test_ref_count_zero_after_all_tags_deleted( - self, integration_client, test_package - ): - """Test ref_count goes to 0 when all tags are deleted.""" - project, package = test_package - content = b"content that will be orphaned" - expected_hash = compute_sha256(content) - - # Upload with one tag - upload_test_file(integration_client, project, package, content, tag="only-tag") - - # Delete the tag - integration_client.delete(f"/api/v1/project/{project}/{package}/tags/only-tag") - - # Verify ref_count is 0 - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 0 - - -class TestRefCountCascadeDelete: - """Tests for ref_count behavior during cascade deletions.""" - - @pytest.mark.integration - def test_ref_count_decrements_on_package_delete( - self, integration_client, unique_test_id - ): - """Test ref_count decrements for all tags when package is deleted.""" - # Create a project and package manually (not using fixtures to control cleanup) - project_name = f"cascade-pkg-{unique_test_id}" - package_name = f"test-pkg-{unique_test_id}" - - # Create project - response = integration_client.post( - "/api/v1/projects", - json={ - "name": project_name, - "description": "Test project", - "is_public": True, - }, - ) - assert response.status_code == 200 - - # Create package - response = integration_client.post( - f"/api/v1/project/{project_name}/packages", - json={"name": package_name, "description": "Test package"}, - ) - assert response.status_code == 200 - - # Upload content with multiple tags - content = f"cascade delete test {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - upload_test_file( - integration_client, project_name, package_name, content, tag="v1" - ) - upload_test_file( - integration_client, project_name, package_name, content, tag="v2" - ) - 
upload_test_file( - integration_client, project_name, package_name, content, tag="v3" - ) - - # Verify ref_count is 3 - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 3 - - # Delete the package (should cascade delete all tags and decrement ref_count) - delete_response = integration_client.delete( - f"/api/v1/project/{project_name}/packages/{package_name}" - ) - assert delete_response.status_code == 204 - - # Verify ref_count is 0 (all tags were deleted) - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 0 - - # Cleanup: delete the project - integration_client.delete(f"/api/v1/projects/{project_name}") - - @pytest.mark.integration - def test_ref_count_decrements_on_project_delete( - self, integration_client, unique_test_id - ): - """Test ref_count decrements for all tags in all packages when project is deleted.""" - # Create a project manually (not using fixtures to control cleanup) - project_name = f"cascade-proj-{unique_test_id}" - package1_name = f"pkg1-{unique_test_id}" - package2_name = f"pkg2-{unique_test_id}" - - # Create project - response = integration_client.post( - "/api/v1/projects", - json={ - "name": project_name, - "description": "Test project", - "is_public": True, - }, - ) - assert response.status_code == 200 - - # Create two packages - for pkg_name in [package1_name, package2_name]: - response = integration_client.post( - f"/api/v1/project/{project_name}/packages", - json={"name": pkg_name, "description": "Test package"}, - ) - assert response.status_code == 200 - - # Upload same content with tags in both packages - content = f"project cascade test {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - upload_test_file( - integration_client, project_name, package1_name, content, tag="v1" - ) - upload_test_file( - integration_client, project_name, package1_name, content, tag="v2" - ) - upload_test_file( - 
integration_client, project_name, package2_name, content, tag="latest" - ) - upload_test_file( - integration_client, project_name, package2_name, content, tag="stable" - ) - - # Verify ref_count is 4 (2 tags in each of 2 packages) - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 4 - - # Delete the project (should cascade delete all packages, tags, and decrement ref_count) - delete_response = integration_client.delete(f"/api/v1/projects/{project_name}") - assert delete_response.status_code == 204 - - # Verify ref_count is 0 - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 0 - - @pytest.mark.integration - def test_shared_artifact_ref_count_partial_decrement( - self, integration_client, unique_test_id - ): - """Test ref_count correctly decrements when artifact is shared across packages.""" - # Create project with two packages - project_name = f"shared-artifact-{unique_test_id}" - package1_name = f"pkg1-{unique_test_id}" - package2_name = f"pkg2-{unique_test_id}" - - # Create project - response = integration_client.post( - "/api/v1/projects", - json={ - "name": project_name, - "description": "Test project", - "is_public": True, - }, - ) - assert response.status_code == 200 - - # Create two packages - for pkg_name in [package1_name, package2_name]: - response = integration_client.post( - f"/api/v1/project/{project_name}/packages", - json={"name": pkg_name, "description": "Test package"}, - ) - assert response.status_code == 200 - - # Upload same content to both packages - content = f"shared artifact {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - upload_test_file( - integration_client, project_name, package1_name, content, tag="v1" - ) - upload_test_file( - integration_client, project_name, package2_name, content, tag="v1" - ) - - # Verify ref_count is 2 - response = 
integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 2 - - # Delete only package1 (package2 still references the artifact) - delete_response = integration_client.delete( - f"/api/v1/project/{project_name}/packages/{package1_name}" - ) - assert delete_response.status_code == 204 - - # Verify ref_count is 1 (only package2's tag remains) - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 1 - - # Cleanup - integration_client.delete(f"/api/v1/projects/{project_name}") - - -class TestRefCountTagUpdate: - """Tests for ref_count behavior when tags are updated to point to different artifacts.""" - - @pytest.mark.integration - def test_ref_count_adjusts_on_tag_update( - self, integration_client, test_package, unique_test_id - ): - """Test ref_count adjusts when a tag is updated to point to a different artifact.""" - project, package = test_package - - # Upload two different artifacts - content1 = f"artifact one {unique_test_id}".encode() - content2 = f"artifact two {unique_test_id}".encode() - hash1 = compute_sha256(content1) - hash2 = compute_sha256(content2) - - # Upload first artifact with tag "latest" - upload_test_file(integration_client, project, package, content1, tag="latest") - - # Verify first artifact has ref_count 1 - response = integration_client.get(f"/api/v1/artifact/{hash1}") - assert response.json()["ref_count"] == 1 - - # Upload second artifact with different tag - upload_test_file(integration_client, project, package, content2, tag="stable") - - # Now update "latest" tag to point to second artifact - # This is done by uploading the same content with the same tag - upload_test_file(integration_client, project, package, content2, tag="latest") - - # Verify first artifact ref_count decreased to 0 (tag moved away) - response = integration_client.get(f"/api/v1/artifact/{hash1}") - assert response.json()["ref_count"] == 0 - - # Verify second artifact 
ref_count increased to 2 (stable + latest) - response = integration_client.get(f"/api/v1/artifact/{hash2}") - assert response.json()["ref_count"] == 2 - - @pytest.mark.integration - def test_ref_count_unchanged_when_tag_same_artifact( - self, integration_client, test_package, unique_test_id - ): - """Test ref_count doesn't change when tag is 'updated' to same artifact.""" - project, package = test_package - - content = f"same artifact {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # Upload with tag - upload_test_file(integration_client, project, package, content, tag="v1") - - # Verify ref_count is 1 - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 1 - - # Upload same content with same tag (no-op) - upload_test_file(integration_client, project, package, content, tag="v1") - - # Verify ref_count is still 1 (no double-counting) - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 1 - - @pytest.mark.integration - def test_tag_via_post_endpoint_increments_ref_count( - self, integration_client, test_package, unique_test_id - ): - """Test creating tag via POST /tags endpoint increments ref_count.""" - project, package = test_package - - content = f"tag endpoint test {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # Upload artifact without tag - result = upload_test_file( - integration_client, project, package, content, filename="test.bin", tag=None - ) - artifact_id = result["artifact_id"] - - # Verify ref_count is 0 (no tags yet) - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 0 - - # Create tag via POST endpoint - tag_response = integration_client.post( - f"/api/v1/project/{project}/{package}/tags", - json={"name": "v1.0.0", "artifact_id": artifact_id}, - ) - assert tag_response.status_code == 200 - - # Verify ref_count is now 1 - 
response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 1 - - # Create another tag via POST endpoint - tag_response = integration_client.post( - f"/api/v1/project/{project}/{package}/tags", - json={"name": "latest", "artifact_id": artifact_id}, - ) - assert tag_response.status_code == 200 - - # Verify ref_count is now 2 - response = integration_client.get(f"/api/v1/artifact/{expected_hash}") - assert response.json()["ref_count"] == 2 diff --git a/backend/tests/test_stats_endpoints.py b/backend/tests/test_stats_endpoints.py deleted file mode 100644 index ce4da69..0000000 --- a/backend/tests/test_stats_endpoints.py +++ /dev/null @@ -1,488 +0,0 @@ -""" -Integration tests for statistics endpoints. - -Tests cover: -- Global stats endpoint -- Deduplication stats endpoint -- Cross-project deduplication -- Timeline stats -- Export and report endpoints -- Package and artifact stats -""" - -import pytest -from tests.conftest import compute_sha256, upload_test_file - - -class TestGlobalStats: - """Tests for GET /api/v1/stats endpoint.""" - - @pytest.mark.integration - def test_stats_returns_valid_response(self, integration_client): - """Test stats endpoint returns expected fields.""" - response = integration_client.get("/api/v1/stats") - assert response.status_code == 200 - - data = response.json() - # Check all required fields exist - assert "total_artifacts" in data - assert "total_size_bytes" in data - assert "unique_artifacts" in data - assert "orphaned_artifacts" in data - assert "orphaned_size_bytes" in data - assert "total_uploads" in data - assert "deduplicated_uploads" in data - assert "deduplication_ratio" in data - assert "storage_saved_bytes" in data - - @pytest.mark.integration - def test_stats_values_are_non_negative(self, integration_client): - """Test all stat values are non-negative.""" - response = integration_client.get("/api/v1/stats") - assert response.status_code == 200 - - data = response.json() - 
assert data["total_artifacts"] >= 0 - assert data["total_size_bytes"] >= 0 - assert data["unique_artifacts"] >= 0 - assert data["orphaned_artifacts"] >= 0 - assert data["total_uploads"] >= 0 - assert data["deduplicated_uploads"] >= 0 - assert data["deduplication_ratio"] >= 0 - assert data["storage_saved_bytes"] >= 0 - - @pytest.mark.integration - def test_stats_update_after_upload( - self, integration_client, test_package, unique_test_id - ): - """Test stats update after uploading an artifact.""" - project, package = test_package - - # Get initial stats - initial_response = integration_client.get("/api/v1/stats") - initial_stats = initial_response.json() - - # Upload a new file - content = f"stats test content {unique_test_id}".encode() - upload_test_file( - integration_client, project, package, content, tag=f"stats-{unique_test_id}" - ) - - # Get updated stats - updated_response = integration_client.get("/api/v1/stats") - updated_stats = updated_response.json() - - # Verify stats increased - assert updated_stats["total_uploads"] >= initial_stats["total_uploads"] - - -class TestDeduplicationStats: - """Tests for GET /api/v1/stats/deduplication endpoint.""" - - @pytest.mark.integration - def test_dedup_stats_returns_valid_response(self, integration_client): - """Test deduplication stats returns expected fields.""" - response = integration_client.get("/api/v1/stats/deduplication") - assert response.status_code == 200 - - data = response.json() - assert "total_logical_bytes" in data - assert "total_physical_bytes" in data - assert "bytes_saved" in data - assert "savings_percentage" in data - assert "total_uploads" in data - assert "unique_artifacts" in data - assert "duplicate_uploads" in data - assert "average_ref_count" in data - assert "max_ref_count" in data - assert "most_referenced_artifacts" in data - - @pytest.mark.integration - def test_most_referenced_artifacts_format(self, integration_client): - """Test most_referenced_artifacts has correct structure.""" - 
response = integration_client.get("/api/v1/stats/deduplication") - assert response.status_code == 200 - - data = response.json() - artifacts = data["most_referenced_artifacts"] - assert isinstance(artifacts, list) - - if len(artifacts) > 0: - artifact = artifacts[0] - assert "artifact_id" in artifact - assert "ref_count" in artifact - assert "size" in artifact - assert "storage_saved" in artifact - - @pytest.mark.integration - def test_dedup_stats_with_top_n_param(self, integration_client): - """Test deduplication stats respects top_n parameter.""" - response = integration_client.get("/api/v1/stats/deduplication?top_n=3") - assert response.status_code == 200 - - data = response.json() - assert len(data["most_referenced_artifacts"]) <= 3 - - @pytest.mark.integration - def test_savings_percentage_valid_range(self, integration_client): - """Test savings percentage is between 0 and 100.""" - response = integration_client.get("/api/v1/stats/deduplication") - assert response.status_code == 200 - - data = response.json() - assert 0 <= data["savings_percentage"] <= 100 - - -class TestCrossProjectStats: - """Tests for GET /api/v1/stats/cross-project endpoint.""" - - @pytest.mark.integration - def test_cross_project_returns_valid_response(self, integration_client): - """Test cross-project stats returns expected fields.""" - response = integration_client.get("/api/v1/stats/cross-project") - assert response.status_code == 200 - - data = response.json() - assert "shared_artifacts_count" in data - assert "total_cross_project_savings" in data - assert "shared_artifacts" in data - assert isinstance(data["shared_artifacts"], list) - - @pytest.mark.integration - def test_cross_project_respects_limit(self, integration_client): - """Test cross-project stats respects limit parameter.""" - response = integration_client.get("/api/v1/stats/cross-project?limit=5") - assert response.status_code == 200 - - data = response.json() - assert len(data["shared_artifacts"]) <= 5 - - 
@pytest.mark.integration - def test_cross_project_detects_shared_artifacts( - self, integration_client, unique_test_id - ): - """Test cross-project deduplication is detected.""" - content = f"shared across projects {unique_test_id}".encode() - - # Create two projects - proj1 = f"cross-proj-a-{unique_test_id}" - proj2 = f"cross-proj-b-{unique_test_id}" - - try: - # Create projects and packages - integration_client.post( - "/api/v1/projects", - json={"name": proj1, "description": "Test", "is_public": True}, - ) - integration_client.post( - "/api/v1/projects", - json={"name": proj2, "description": "Test", "is_public": True}, - ) - integration_client.post( - f"/api/v1/project/{proj1}/packages", - json={"name": "pkg", "description": "Test"}, - ) - integration_client.post( - f"/api/v1/project/{proj2}/packages", - json={"name": "pkg", "description": "Test"}, - ) - - # Upload same content to both projects - upload_test_file(integration_client, proj1, "pkg", content, tag="v1") - upload_test_file(integration_client, proj2, "pkg", content, tag="v1") - - # Check cross-project stats - response = integration_client.get("/api/v1/stats/cross-project") - assert response.status_code == 200 - - data = response.json() - assert data["shared_artifacts_count"] >= 1 - - finally: - # Cleanup - integration_client.delete(f"/api/v1/projects/{proj1}") - integration_client.delete(f"/api/v1/projects/{proj2}") - - -class TestTimelineStats: - """Tests for GET /api/v1/stats/timeline endpoint.""" - - @pytest.mark.integration - def test_timeline_returns_valid_response(self, integration_client): - """Test timeline stats returns expected fields.""" - response = integration_client.get("/api/v1/stats/timeline") - assert response.status_code == 200 - - data = response.json() - assert "period" in data - assert "start_date" in data - assert "end_date" in data - assert "data_points" in data - assert isinstance(data["data_points"], list) - - @pytest.mark.integration - def test_timeline_daily_period(self, 
integration_client): - """Test timeline with daily period.""" - response = integration_client.get("/api/v1/stats/timeline?period=daily") - assert response.status_code == 200 - - data = response.json() - assert data["period"] == "daily" - - @pytest.mark.integration - def test_timeline_weekly_period(self, integration_client): - """Test timeline with weekly period.""" - response = integration_client.get("/api/v1/stats/timeline?period=weekly") - assert response.status_code == 200 - - data = response.json() - assert data["period"] == "weekly" - - @pytest.mark.integration - def test_timeline_monthly_period(self, integration_client): - """Test timeline with monthly period.""" - response = integration_client.get("/api/v1/stats/timeline?period=monthly") - assert response.status_code == 200 - - data = response.json() - assert data["period"] == "monthly" - - @pytest.mark.integration - def test_timeline_invalid_period_rejected(self, integration_client): - """Test timeline rejects invalid period.""" - response = integration_client.get("/api/v1/stats/timeline?period=invalid") - assert response.status_code == 422 - - @pytest.mark.integration - def test_timeline_data_point_structure(self, integration_client): - """Test timeline data points have correct structure.""" - response = integration_client.get("/api/v1/stats/timeline") - assert response.status_code == 200 - - data = response.json() - if len(data["data_points"]) > 0: - point = data["data_points"][0] - assert "date" in point - assert "total_uploads" in point - assert "unique_artifacts" in point - assert "duplicated_uploads" in point - assert "bytes_saved" in point - - -class TestExportEndpoint: - """Tests for GET /api/v1/stats/export endpoint.""" - - @pytest.mark.integration - def test_export_json_format(self, integration_client): - """Test export with JSON format.""" - response = integration_client.get("/api/v1/stats/export?format=json") - assert response.status_code == 200 - - data = response.json() - assert 
"total_artifacts" in data - assert "generated_at" in data - - @pytest.mark.integration - def test_export_csv_format(self, integration_client): - """Test export with CSV format.""" - response = integration_client.get("/api/v1/stats/export?format=csv") - assert response.status_code == 200 - assert "text/csv" in response.headers.get("content-type", "") - - content = response.text - assert "Metric,Value" in content - assert "total_artifacts" in content - - @pytest.mark.integration - def test_export_invalid_format_rejected(self, integration_client): - """Test export rejects invalid format.""" - response = integration_client.get("/api/v1/stats/export?format=xml") - assert response.status_code == 422 - - -class TestReportEndpoint: - """Tests for GET /api/v1/stats/report endpoint.""" - - @pytest.mark.integration - def test_report_markdown_format(self, integration_client): - """Test report with markdown format.""" - response = integration_client.get("/api/v1/stats/report?format=markdown") - assert response.status_code == 200 - - data = response.json() - assert data["format"] == "markdown" - assert "generated_at" in data - assert "content" in data - assert "# Orchard Storage Report" in data["content"] - - @pytest.mark.integration - def test_report_json_format(self, integration_client): - """Test report with JSON format.""" - response = integration_client.get("/api/v1/stats/report?format=json") - assert response.status_code == 200 - - data = response.json() - assert data["format"] == "json" - assert "content" in data - - @pytest.mark.integration - def test_report_contains_sections(self, integration_client): - """Test markdown report contains expected sections.""" - response = integration_client.get("/api/v1/stats/report?format=markdown") - assert response.status_code == 200 - - content = response.json()["content"] - assert "## Overview" in content - assert "## Storage" in content - assert "## Uploads" in content - - -class TestProjectStats: - """Tests for GET 
/api/v1/projects/:project/stats endpoint.""" - - @pytest.mark.integration - def test_project_stats_returns_valid_response( - self, integration_client, test_project - ): - """Test project stats returns expected fields.""" - response = integration_client.get(f"/api/v1/projects/{test_project}/stats") - assert response.status_code == 200 - - data = response.json() - assert "project_id" in data - assert "project_name" in data - assert "package_count" in data - assert "tag_count" in data - assert "artifact_count" in data - assert "total_size_bytes" in data - assert "upload_count" in data - assert "deduplicated_uploads" in data - assert "storage_saved_bytes" in data - assert "deduplication_ratio" in data - - @pytest.mark.integration - def test_project_stats_not_found(self, integration_client): - """Test project stats returns 404 for non-existent project.""" - response = integration_client.get("/api/v1/projects/nonexistent-project/stats") - assert response.status_code == 404 - - -class TestPackageStats: - """Tests for GET /api/v1/project/:project/packages/:package/stats endpoint.""" - - @pytest.mark.integration - def test_package_stats_returns_valid_response( - self, integration_client, test_package - ): - """Test package stats returns expected fields.""" - project, package = test_package - response = integration_client.get( - f"/api/v1/project/{project}/packages/{package}/stats" - ) - assert response.status_code == 200 - - data = response.json() - assert "package_id" in data - assert "package_name" in data - assert "project_name" in data - assert "tag_count" in data - assert "artifact_count" in data - assert "total_size_bytes" in data - assert "upload_count" in data - assert "deduplicated_uploads" in data - assert "storage_saved_bytes" in data - assert "deduplication_ratio" in data - - @pytest.mark.integration - def test_package_stats_not_found(self, integration_client, test_project): - """Test package stats returns 404 for non-existent package.""" - response = 
integration_client.get( - f"/api/v1/project/{test_project}/packages/nonexistent-package/stats" - ) - assert response.status_code == 404 - - -class TestArtifactStats: - """Tests for GET /api/v1/artifact/:id/stats endpoint.""" - - @pytest.mark.integration - def test_artifact_stats_returns_valid_response( - self, integration_client, test_package, unique_test_id - ): - """Test artifact stats returns expected fields.""" - project, package = test_package - content = f"artifact stats test {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - # Upload artifact - upload_test_file( - integration_client, project, package, content, tag=f"art-{unique_test_id}" - ) - - # Get artifact stats - response = integration_client.get(f"/api/v1/artifact/{expected_hash}/stats") - assert response.status_code == 200 - - data = response.json() - assert "artifact_id" in data - assert "sha256" in data - assert "size" in data - assert "ref_count" in data - assert "storage_savings" in data - assert "tags" in data - assert "projects" in data - assert "packages" in data - - @pytest.mark.integration - def test_artifact_stats_not_found(self, integration_client): - """Test artifact stats returns 404 for non-existent artifact.""" - fake_hash = "0" * 64 - response = integration_client.get(f"/api/v1/artifact/{fake_hash}/stats") - assert response.status_code == 404 - - @pytest.mark.integration - def test_artifact_stats_shows_correct_projects( - self, integration_client, unique_test_id - ): - """Test artifact stats shows all projects using the artifact.""" - content = f"multi-project artifact {unique_test_id}".encode() - expected_hash = compute_sha256(content) - - proj1 = f"art-stats-a-{unique_test_id}" - proj2 = f"art-stats-b-{unique_test_id}" - - try: - # Create projects and packages - integration_client.post( - "/api/v1/projects", - json={"name": proj1, "description": "Test", "is_public": True}, - ) - integration_client.post( - "/api/v1/projects", - json={"name": proj2, "description": 
"Test", "is_public": True}, - ) - integration_client.post( - f"/api/v1/project/{proj1}/packages", - json={"name": "pkg", "description": "Test"}, - ) - integration_client.post( - f"/api/v1/project/{proj2}/packages", - json={"name": "pkg", "description": "Test"}, - ) - - # Upload same content to both projects - upload_test_file(integration_client, proj1, "pkg", content, tag="v1") - upload_test_file(integration_client, proj2, "pkg", content, tag="v1") - - # Check artifact stats - response = integration_client.get(f"/api/v1/artifact/{expected_hash}/stats") - assert response.status_code == 200 - - data = response.json() - assert len(data["projects"]) == 2 - assert proj1 in data["projects"] - assert proj2 in data["projects"] - - finally: - integration_client.delete(f"/api/v1/projects/{proj1}") - integration_client.delete(f"/api/v1/projects/{proj2}") diff --git a/backend/tests/unit/__init__.py b/backend/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/tests/unit/test_models.py b/backend/tests/unit/test_models.py new file mode 100644 index 0000000..ae85605 --- /dev/null +++ b/backend/tests/unit/test_models.py @@ -0,0 +1,271 @@ +""" +Unit tests for SQLAlchemy models. 
+ +Tests cover: +- Model instantiation and defaults +- Property aliases (sha256, format_metadata) +- Relationship definitions +- Constraint definitions +""" + +import pytest +import uuid +from datetime import datetime + + +class TestArtifactModel: + """Tests for the Artifact model.""" + + @pytest.mark.unit + def test_artifact_sha256_property(self): + """Test sha256 property is an alias for id.""" + from app.models import Artifact + + artifact = Artifact( + id="a" * 64, + size=1024, + created_by="test-user", + s3_key="fruits/aa/aa/test", + ) + + assert artifact.sha256 == artifact.id + assert artifact.sha256 == "a" * 64 + + @pytest.mark.unit + def test_artifact_format_metadata_alias(self): + """Test format_metadata is an alias for artifact_metadata.""" + from app.models import Artifact + + test_metadata = {"format": "tarball", "version": "1.0.0"} + artifact = Artifact( + id="b" * 64, + size=2048, + created_by="test-user", + s3_key="fruits/bb/bb/test", + artifact_metadata=test_metadata, + ) + + assert artifact.format_metadata == test_metadata + assert artifact.format_metadata == artifact.artifact_metadata + + @pytest.mark.unit + def test_artifact_format_metadata_setter(self): + """Test format_metadata setter updates artifact_metadata.""" + from app.models import Artifact + + artifact = Artifact( + id="c" * 64, + size=512, + created_by="test-user", + s3_key="fruits/cc/cc/test", + ) + + new_metadata = {"type": "rpm", "arch": "x86_64"} + artifact.format_metadata = new_metadata + + assert artifact.artifact_metadata == new_metadata + assert artifact.format_metadata == new_metadata + + @pytest.mark.unit + def test_artifact_default_ref_count(self): + """Test artifact ref_count column has default value of 1.""" + from app.models import Artifact + + # Check the column definition has the right default + ref_count_col = Artifact.__table__.columns["ref_count"] + assert ref_count_col.default is not None + assert ref_count_col.default.arg == 1 + + @pytest.mark.unit + def 
test_artifact_default_metadata_is_dict(self): + """Test artifact default metadata is an empty dict.""" + from app.models import Artifact + + artifact = Artifact( + id="e" * 64, + size=100, + created_by="test-user", + s3_key="fruits/ee/ee/test", + ) + + # Default might be None until saved, but the column default is dict + assert artifact.artifact_metadata is None or isinstance( + artifact.artifact_metadata, dict + ) + + +class TestProjectModel: + """Tests for the Project model.""" + + @pytest.mark.unit + def test_project_default_is_public(self): + """Test project is_public column has default value of True.""" + from app.models import Project + + # Check the column definition has the right default + is_public_col = Project.__table__.columns["is_public"] + assert is_public_col.default is not None + assert is_public_col.default.arg is True + + @pytest.mark.unit + def test_project_uuid_generation(self): + """Test project generates UUID by default.""" + from app.models import Project + + project = Project( + name="uuid-test-project", + created_by="test-user", + ) + + # UUID should be set by default function + assert project.id is not None or hasattr(Project.id, "default") + + +class TestPackageModel: + """Tests for the Package model.""" + + @pytest.mark.unit + def test_package_default_format(self): + """Test package format column has default value of 'generic'.""" + from app.models import Package + + # Check the column definition has the right default + format_col = Package.__table__.columns["format"] + assert format_col.default is not None + assert format_col.default.arg == "generic" + + @pytest.mark.unit + def test_package_default_platform(self): + """Test package platform column has default value of 'any'.""" + from app.models import Package + + # Check the column definition has the right default + platform_col = Package.__table__.columns["platform"] + assert platform_col.default is not None + assert platform_col.default.arg == "any" + + +class TestTagModel: + 
"""Tests for the Tag model.""" + + @pytest.mark.unit + def test_tag_requires_package_id(self): + """Test tag requires package_id.""" + from app.models import Tag + + tag = Tag( + name="v1.0.0", + package_id=uuid.uuid4(), + artifact_id="f" * 64, + created_by="test-user", + ) + + assert tag.package_id is not None + assert tag.artifact_id == "f" * 64 + + +class TestTagHistoryModel: + """Tests for the TagHistory model.""" + + @pytest.mark.unit + def test_tag_history_default_change_type(self): + """Test tag history change_type column has default value of 'update'.""" + from app.models import TagHistory + + # Check the column definition has the right default + change_type_col = TagHistory.__table__.columns["change_type"] + assert change_type_col.default is not None + assert change_type_col.default.arg == "update" + + @pytest.mark.unit + def test_tag_history_allows_null_old_artifact(self): + """Test tag history allows null old_artifact_id (for create events).""" + from app.models import TagHistory + + history = TagHistory( + tag_id=uuid.uuid4(), + old_artifact_id=None, + new_artifact_id="h" * 64, + change_type="create", + changed_by="test-user", + ) + + assert history.old_artifact_id is None + + +class TestUploadModel: + """Tests for the Upload model.""" + + @pytest.mark.unit + def test_upload_default_deduplicated_is_false(self): + """Test upload deduplicated column has default value of False.""" + from app.models import Upload + + # Check the column definition has the right default + deduplicated_col = Upload.__table__.columns["deduplicated"] + assert deduplicated_col.default is not None + assert deduplicated_col.default.arg is False + + @pytest.mark.unit + def test_upload_default_checksum_verified_is_true(self): + """Test upload checksum_verified column has default value of True.""" + from app.models import Upload + + # Check the column definition has the right default + checksum_verified_col = Upload.__table__.columns["checksum_verified"] + assert 
checksum_verified_col.default is not None + assert checksum_verified_col.default.arg is True + + +class TestAccessPermissionModel: + """Tests for the AccessPermission model.""" + + @pytest.mark.unit + def test_access_permission_levels(self): + """Test valid access permission levels.""" + from app.models import AccessPermission + + # This tests the check constraint values + valid_levels = ["read", "write", "admin"] + + for level in valid_levels: + permission = AccessPermission( + project_id=uuid.uuid4(), + user_id="test-user", + level=level, + ) + assert permission.level == level + + +class TestAuditLogModel: + """Tests for the AuditLog model.""" + + @pytest.mark.unit + def test_audit_log_required_fields(self): + """Test audit log has all required fields.""" + from app.models import AuditLog + + log = AuditLog( + action="project.create", + resource="/projects/test-project", + user_id="test-user", + ) + + assert log.action == "project.create" + assert log.resource == "/projects/test-project" + assert log.user_id == "test-user" + + @pytest.mark.unit + def test_audit_log_optional_details(self): + """Test audit log can have optional details JSON.""" + from app.models import AuditLog + + details = {"old_value": "v1", "new_value": "v2"} + log = AuditLog( + action="tag.update", + resource="/projects/test/packages/pkg/tags/latest", + user_id="test-user", + details=details, + ) + + assert log.details == details diff --git a/backend/tests/unit/test_storage.py b/backend/tests/unit/test_storage.py new file mode 100644 index 0000000..3fbe6eb --- /dev/null +++ b/backend/tests/unit/test_storage.py @@ -0,0 +1,439 @@ +""" +Unit tests for S3 storage layer. 
"""
Unit tests for S3 storage layer.

Tests cover:
- SHA256 hash calculation and consistency
- Hash format validation (64-char hex)
- S3 key generation pattern
- Deduplication behavior (_exists method)
- Storage result computation (MD5, SHA1, size)
- Edge cases (empty files, large files, binary content)
"""

import pytest
import hashlib
import io
from tests.factories import (
    compute_sha256,
    TEST_CONTENT_HELLO,
    TEST_HASH_HELLO,
    TEST_CONTENT_BINARY,
    TEST_HASH_BINARY,
)


def _expected_key(digest: str) -> str:
    """Return the content-addressed S3 key for a SHA256 digest."""
    return f"fruits/{digest[:2]}/{digest[2:4]}/{digest}"


# =============================================================================
# Hash Computation Tests
# =============================================================================


class TestHashComputation:
    """Unit tests for hash calculation functionality."""

    @pytest.mark.unit
    def test_sha256_consistent_results(self):
        """Hashing identical bytes repeatedly yields a single unique digest."""
        payload = b"test content for hashing"

        digests = {compute_sha256(payload) for _ in range(3)}

        assert len(digests) == 1

    @pytest.mark.unit
    def test_sha256_different_content_different_hash(self):
        """Distinct byte strings hash to distinct digests."""
        first = compute_sha256(b"content version 1")
        second = compute_sha256(b"content version 2")

        assert first != second

    @pytest.mark.unit
    def test_sha256_format_64_char_hex(self):
        """Every digest is 64 lowercase hexadecimal characters."""
        samples = [
            b"",  # Empty
            b"a",  # Single char
            b"Hello, World!",  # Normal string
            bytes(range(256)),  # All byte values
            b"x" * 10000,  # Larger content
        ]
        hex_alphabet = set("0123456789abcdef")

        for sample in samples:
            digest = compute_sha256(sample)

            assert len(digest) == 64, (
                f"Hash length should be 64, got {len(digest)}"
            )
            assert digest == digest.lower(), "Hash should be lowercase"
            assert set(digest) <= hex_alphabet, "Hash should be hex"

    @pytest.mark.unit
    def test_sha256_known_value(self):
        """A known input produces its published digest."""
        assert compute_sha256(TEST_CONTENT_HELLO) == TEST_HASH_HELLO

    @pytest.mark.unit
    def test_sha256_binary_content(self):
        """Binary content (including NUL bytes) hashes correctly."""
        assert compute_sha256(TEST_CONTENT_BINARY) == TEST_HASH_BINARY

        nul_laden = b"\x00\x00test\x00\x00"
        assert len(compute_sha256(nul_laden)) == 64

    @pytest.mark.unit
    def test_sha256_streaming_computation(self):
        """Chunked (streaming) hashing equals one-shot hashing."""
        block = 8192
        payload = b"x" * (block * 10)  # 80KB

        one_shot = compute_sha256(payload)

        hasher = hashlib.sha256()
        reader = io.BytesIO(payload)
        while chunk := reader.read(block):
            hasher.update(chunk)

        assert hasher.hexdigest() == one_shot

    @pytest.mark.unit
    def test_sha256_order_matters(self):
        """Byte order changes the digest (not just the byte multiset)."""
        assert compute_sha256(b"AB") != compute_sha256(b"BA")


# =============================================================================
# Storage Hash Computation Tests
# =============================================================================


class TestStorageHashComputation:
    """Tests for hash computation in the storage layer."""

    @pytest.mark.unit
    def test_storage_computes_sha256(self, mock_storage):
        """Storage layer reports the correct SHA256 digest."""
        outcome = mock_storage._store_simple(io.BytesIO(TEST_CONTENT_HELLO))

        assert outcome.sha256 == TEST_HASH_HELLO

    @pytest.mark.unit
    def test_storage_computes_md5(self, mock_storage):
        """Storage layer also reports an MD5 digest."""
        outcome = mock_storage._store_simple(io.BytesIO(TEST_CONTENT_HELLO))

        assert outcome.md5 == hashlib.md5(TEST_CONTENT_HELLO).hexdigest()

    @pytest.mark.unit
    def test_storage_computes_sha1(self, mock_storage):
        """Storage layer also reports a SHA1 digest."""
        outcome = mock_storage._store_simple(io.BytesIO(TEST_CONTENT_HELLO))

        assert outcome.sha1 == hashlib.sha1(TEST_CONTENT_HELLO).hexdigest()

    @pytest.mark.unit
    def test_storage_returns_correct_size(self, mock_storage):
        """Storage layer reports the exact byte count."""
        payload = b"test content with known size"

        outcome = mock_storage._store_simple(io.BytesIO(payload))

        assert outcome.size == len(payload)

    @pytest.mark.unit
    def test_storage_generates_correct_s3_key(self, mock_storage):
        """Storage layer derives fruits/{h[:2]}/{h[2:4]}/{h} from the digest."""
        outcome = mock_storage._store_simple(io.BytesIO(TEST_CONTENT_HELLO))

        assert outcome.s3_key == _expected_key(TEST_HASH_HELLO)


# =============================================================================
# Hash Edge Cases
# =============================================================================


class TestHashEdgeCases:
    """Edge case tests for hash computation."""

    @pytest.mark.unit
    def test_hash_empty_content_rejected(self, mock_storage):
        """Empty content is refused with HashComputationError."""
        from app.storage import HashComputationError

        with pytest.raises(HashComputationError):
            mock_storage._store_simple(io.BytesIO(b""))

    @pytest.mark.unit
    def test_hash_large_file_streaming(self, mock_storage):
        """A 10MB file is hashed correctly end to end."""
        payload = b"x" * (10 * 1024 * 1024)

        outcome = mock_storage._store_simple(io.BytesIO(payload))

        assert outcome.sha256 == compute_sha256(payload)

    @pytest.mark.unit
    def test_hash_special_bytes(self):
        """All 256 byte values hash to the known reference digest."""
        digest = compute_sha256(bytes(range(256)))

        assert len(digest) == 64
        assert digest == TEST_HASH_BINARY


# =============================================================================
# S3 Existence Check Tests
# =============================================================================


class TestExistsMethod:
    """Tests for the _exists() method that checks S3 object existence."""

    @pytest.mark.unit
    def test_exists_returns_true_for_existing_key(self, mock_storage, mock_s3_client):
        """_exists() is True for a pre-populated key."""
        known_key = "fruits/df/fd/test-hash"
        mock_s3_client.objects[known_key] = b"content"

        assert mock_storage._exists(known_key) is True

    @pytest.mark.unit
    def test_exists_returns_false_for_nonexistent_key(self, mock_storage):
        """_exists() is False when the object is absent."""
        assert mock_storage._exists("fruits/no/ne/nonexistent-key") is False

    @pytest.mark.unit
    def test_exists_handles_404_error(self, mock_storage):
        """_exists() converts the mock client's 404 ClientError into False."""
        assert mock_storage._exists("fruits/xx/yy/does-not-exist") is False


# =============================================================================
# S3 Key Generation Tests
# =============================================================================


class TestS3KeyGeneration:
    """Tests for S3 key pattern generation."""

    @pytest.mark.unit
    def test_s3_key_pattern(self):
        """Key pattern is fruits/{hash[:2]}/{hash[2:4]}/{hash}."""
        sample_digest = "abcdef1234567890" * 4  # 64 hex chars

        assert _expected_key(sample_digest) == f"fruits/ab/cd/{sample_digest}"

    @pytest.mark.unit
    def test_s3_key_generation_in_storage(self, mock_storage):
        """Storage layer emits the expected key for a known digest."""
        outcome = mock_storage._store_simple(io.BytesIO(TEST_CONTENT_HELLO))

        assert outcome.s3_key == _expected_key(TEST_HASH_HELLO)

    @pytest.mark.unit
    def test_s3_key_uses_sha256_hash(self, mock_storage):
        """The key embeds the SHA256 digest of the content."""
        payload = b"unique test content for key test"
        digest = compute_sha256(payload)

        outcome = mock_storage._store_simple(io.BytesIO(payload))

        assert digest in outcome.s3_key


# =============================================================================
# Deduplication Behavior Tests
# =============================================================================


class TestDeduplicationBehavior:
    """Tests for deduplication (skip upload when exists)."""

    @pytest.mark.unit
    def test_skips_upload_when_exists(self, mock_storage, mock_s3_client):
        """No put_object call is made when the artifact already exists."""
        existing_key = _expected_key(TEST_HASH_HELLO)
        mock_s3_client.objects[existing_key] = TEST_CONTENT_HELLO

        # Wrap put_object so every invocation is recorded.
        real_put = mock_s3_client.put_object
        recorded_calls = []

        def counting_put(*args, **kwargs):
            recorded_calls.append(True)
            return real_put(*args, **kwargs)

        mock_s3_client.put_object = counting_put

        outcome = mock_storage._store_simple(io.BytesIO(TEST_CONTENT_HELLO))

        # Deduplication: the upload must have been skipped entirely.
        assert len(recorded_calls) == 0
        assert outcome.sha256 == TEST_HASH_HELLO

    @pytest.mark.unit
    def test_uploads_when_not_exists(self, mock_storage, mock_s3_client):
        """A genuinely new artifact is written to S3."""
        payload = b"brand new unique content"
        target_key = _expected_key(compute_sha256(payload))

        # Precondition: nothing stored under this key yet.
        assert target_key not in mock_s3_client.objects

        mock_storage._store_simple(io.BytesIO(payload))

        # The object must now exist with exactly the uploaded bytes.
        assert mock_s3_client.objects.get(target_key) == payload

    @pytest.mark.unit
    def test_returns_same_hash_for_duplicate(self, mock_storage, mock_s3_client):
        """Storing the same bytes twice yields identical hash and key."""
        payload = b"content to be stored twice"

        first = mock_storage._store_simple(io.BytesIO(payload))
        second = mock_storage._store_simple(io.BytesIO(payload))

        assert (first.sha256, first.s3_key) == (second.sha256, second.s3_key)

    @pytest.mark.unit
    def test_different_content_different_keys(self, mock_storage):
        """Different bytes produce different hashes and keys."""
        first = mock_storage._store_simple(io.BytesIO(b"first content"))
        second = mock_storage._store_simple(io.BytesIO(b"second content"))

        assert first.sha256 != second.sha256
        assert first.s3_key != second.s3_key


# =============================================================================
# Deduplication Edge Cases
# =============================================================================


class TestDeduplicationEdgeCases:
    """Edge case tests for deduplication."""

    @pytest.mark.unit
    def test_same_content_different_filenames(self, mock_storage):
        """Identical content dedupes regardless of associated filename."""
        payload = b"identical content"

        first = mock_storage._store_simple(io.BytesIO(payload))
        second = mock_storage._store_simple(io.BytesIO(payload))

        # Content-addressable: same bytes, same hash.
        assert first.sha256 == second.sha256

    @pytest.mark.unit
    def test_whitespace_only_difference(self, mock_storage):
        """Whitespace-only differences still yield distinct hashes."""
        variants = (
            b"test content",
            b"test  content",  # Extra space
            b"test content ",  # Trailing space
        )

        digests = {
            mock_storage._store_simple(io.BytesIO(v)).sha256 for v in variants
        }

        # All three must differ (content-addressable).
        assert len(digests) == 3
-- Migration 004: Project and Package History Tables
-- Adds history tracking tables for project and package metadata changes.
--
-- Note on changed_by: current_setting('app.current_user', true) returns NULL
-- only when the custom GUC has never been defined; if the application set it
-- to '' (or it was reset to an empty string), current_setting returns '' and
-- a plain COALESCE would record an empty author.  NULLIF(..., '') normalizes
-- that case to the 'system' fallback.

-- ============================================
-- Project History Table
-- ============================================
CREATE TABLE IF NOT EXISTS project_history (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    project_id UUID NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
    field_name VARCHAR(100) NOT NULL,
    old_value TEXT,
    new_value TEXT,
    changed_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    changed_by VARCHAR(255) NOT NULL
);

CREATE INDEX IF NOT EXISTS idx_project_history_project_id ON project_history(project_id);
CREATE INDEX IF NOT EXISTS idx_project_history_changed_at ON project_history(changed_at);
CREATE INDEX IF NOT EXISTS idx_project_history_project_changed_at ON project_history(project_id, changed_at);

-- ============================================
-- Package History Table
-- ============================================
CREATE TABLE IF NOT EXISTS package_history (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    package_id UUID NOT NULL REFERENCES packages(id) ON DELETE CASCADE,
    field_name VARCHAR(100) NOT NULL,
    old_value TEXT,
    new_value TEXT,
    changed_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    changed_by VARCHAR(255) NOT NULL
);

CREATE INDEX IF NOT EXISTS idx_package_history_package_id ON package_history(package_id);
CREATE INDEX IF NOT EXISTS idx_package_history_changed_at ON package_history(changed_at);
CREATE INDEX IF NOT EXISTS idx_package_history_package_changed_at ON package_history(package_id, changed_at);

-- ============================================
-- Project Update Trigger
-- ============================================
CREATE OR REPLACE FUNCTION log_project_changes()
RETURNS TRIGGER AS $$
BEGIN
    -- Log description change
    IF OLD.description IS DISTINCT FROM NEW.description THEN
        INSERT INTO project_history (project_id, field_name, old_value, new_value, changed_by)
        VALUES (NEW.id, 'description', OLD.description, NEW.description,
                COALESCE(NULLIF(current_setting('app.current_user', true), ''), 'system'));
    END IF;

    -- Log is_public change
    IF OLD.is_public IS DISTINCT FROM NEW.is_public THEN
        INSERT INTO project_history (project_id, field_name, old_value, new_value, changed_by)
        VALUES (NEW.id, 'is_public', OLD.is_public::text, NEW.is_public::text,
                COALESCE(NULLIF(current_setting('app.current_user', true), ''), 'system'));
    END IF;

    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

DROP TRIGGER IF EXISTS project_changes_trigger ON projects;
CREATE TRIGGER project_changes_trigger
    AFTER UPDATE ON projects
    FOR EACH ROW
    EXECUTE FUNCTION log_project_changes();

-- ============================================
-- Package Update Trigger
-- ============================================
CREATE OR REPLACE FUNCTION log_package_changes()
RETURNS TRIGGER AS $$
BEGIN
    -- Log description change
    IF OLD.description IS DISTINCT FROM NEW.description THEN
        INSERT INTO package_history (package_id, field_name, old_value, new_value, changed_by)
        VALUES (NEW.id, 'description', OLD.description, NEW.description,
                COALESCE(NULLIF(current_setting('app.current_user', true), ''), 'system'));
    END IF;

    -- Log format change
    IF OLD.format IS DISTINCT FROM NEW.format THEN
        INSERT INTO package_history (package_id, field_name, old_value, new_value, changed_by)
        VALUES (NEW.id, 'format', OLD.format, NEW.format,
                COALESCE(NULLIF(current_setting('app.current_user', true), ''), 'system'));
    END IF;

    -- Log platform change
    IF OLD.platform IS DISTINCT FROM NEW.platform THEN
        INSERT INTO package_history (package_id, field_name, old_value, new_value, changed_by)
        VALUES (NEW.id, 'platform', OLD.platform, NEW.platform,
                COALESCE(NULLIF(current_setting('app.current_user', true), ''), 'system'));
    END IF;

    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

DROP TRIGGER IF EXISTS package_changes_trigger ON packages;
CREATE TRIGGER package_changes_trigger
    AFTER UPDATE ON packages
    FOR EACH ROW
    EXECUTE FUNCTION log_package_changes();
-- Migration 005: Upload Workflow Enhancements
-- Adds status tracking and error handling for uploads

-- ============================================
-- Add status column to uploads table
-- ============================================
DO $$
BEGIN
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'uploads' AND column_name = 'status') THEN
        ALTER TABLE uploads ADD COLUMN status VARCHAR(20) DEFAULT 'completed' NOT NULL;
    END IF;
END $$;

-- ============================================
-- Add error_message column for failed uploads
-- ============================================
DO $$
BEGIN
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'uploads' AND column_name = 'error_message') THEN
        ALTER TABLE uploads ADD COLUMN error_message TEXT;
    END IF;
END $$;

-- ============================================
-- Add client_checksum column for verification
-- ============================================
DO $$
BEGIN
    IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                   WHERE table_name = 'uploads' AND column_name = 'client_checksum') THEN
        ALTER TABLE uploads ADD COLUMN client_checksum VARCHAR(64);
    END IF;
END $$;

-- ============================================
-- Add indexes for upload status queries
-- ============================================
CREATE INDEX IF NOT EXISTS idx_uploads_status ON uploads(status);
CREATE INDEX IF NOT EXISTS idx_uploads_status_uploaded_at ON uploads(status, uploaded_at);

-- ============================================
-- Add constraint to validate status values
-- ============================================
-- Probe pg_constraint (scoped to the uploads table via conrelid) instead of
-- information_schema.constraint_column_usage: that view only contains rows
-- for tables owned by the current role, so the previous guard could miss an
-- existing constraint and make this migration fail on re-run.
DO $$
BEGIN
    IF NOT EXISTS (SELECT 1 FROM pg_constraint
                   WHERE conname = 'check_upload_status'
                     AND conrelid = 'uploads'::regclass) THEN
        ALTER TABLE uploads ADD CONSTRAINT check_upload_status
            CHECK (status IN ('pending', 'completed', 'failed'));
    END IF;
END $$;

-- ============================================
-- Create table for tracking in-progress uploads (for 409 conflict detection)
-- ============================================
CREATE TABLE IF NOT EXISTS upload_locks (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    sha256_hash VARCHAR(64) NOT NULL,
    package_id UUID NOT NULL REFERENCES packages(id) ON DELETE CASCADE,
    locked_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    locked_by VARCHAR(255) NOT NULL,
    expires_at TIMESTAMP WITH TIME ZONE NOT NULL,
    UNIQUE(sha256_hash, package_id)
);

CREATE INDEX IF NOT EXISTS idx_upload_locks_expires_at ON upload_locks(expires_at);
CREATE INDEX IF NOT EXISTS idx_upload_locks_hash_package ON upload_locks(sha256_hash, package_id);

-- ============================================
-- Function to clean up expired upload locks
-- ============================================
CREATE OR REPLACE FUNCTION cleanup_expired_upload_locks()
RETURNS INTEGER AS $$
DECLARE
    deleted_count INTEGER;
BEGIN
    DELETE FROM upload_locks WHERE expires_at < NOW();
    GET DIAGNOSTICS deleted_count = ROW_COUNT;
    RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;