1 Commit

Author: Mondo Diaz
SHA1: 9296f7eb54
Date: 2025-12-12 15:14:44 -06:00

Add schema enhancements for uploads, artifacts, and audit tracking

- Add format and platform fields to packages table
- Add checksum_md5 and metadata JSONB to artifacts with CHECK constraints
- Add updated_at and composite index to tags table
- Add tag_name, user_agent, duration_ms, deduplicated, checksum_verified to uploads
- Add change_type field to tag_history table
- Add composite indexes and GIN index to audit_logs
- Add partial index for public projects
- Add triggers for ref_count accuracy and updated_at timestamps
- Create migration script (002) for existing databases
12 changed files with 47 additions and 872 deletions

View File

@@ -6,18 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Added
- Added integrity verification workflow design document (#24)
- Added `sha256` field to API responses for clarity (alias of `id`) (#25)
- Added `checksum_sha1` field to artifacts table for compatibility (#25)
- Added `s3_etag` field to artifacts table for S3 verification (#25)
- Compute and store MD5, SHA1, and S3 ETag alongside SHA256 during upload (#25)
- Added `Dockerfile.local` and `docker-compose.local.yml` for local development (#25)
- Added migration script `003_checksum_fields.sql` for existing databases (#25)
## [0.2.0] - 2025-12-15
### Changed
- Updated images to use internal container BSF proxy (#46)
### Added
- Added `format` and `platform` fields to packages table (#16)
- Added `checksum_md5` and `metadata` JSONB fields to artifacts table (#16)

View File

@@ -1,5 +1,5 @@
# Frontend build stage
FROM containers.global.bsf.tools/node:20-alpine AS frontend-builder
FROM node:20-alpine AS frontend-builder
ARG NPM_REGISTRY=https://deps.global.bsf.tools/artifactory/api/npm/registry.npmjs.org/
@@ -19,7 +19,7 @@ COPY frontend/ ./
RUN npm run build
# Runtime stage
FROM containers.global.bsf.tools/python:3.12-slim
FROM python:3.12-slim
# Disable proxy cache
RUN echo 'Acquire::http::Pipeline-Depth 0;\nAcquire::http::No-Cache true;\nAcquire::BrokenProxy true;\n' > /etc/apt/apt.conf.d/99fixbadproxy

View File

@@ -1,50 +0,0 @@
# Frontend build stage
FROM node:20-alpine AS frontend-builder
WORKDIR /app/frontend
# Copy package files
COPY frontend/package*.json ./
RUN npm install
# Copy frontend source
COPY frontend/ ./
# Build frontend
RUN npm run build
# Runtime stage
FROM python:3.12-slim
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create non-root user
RUN groupadd -g 1000 orchard && \
useradd -u 1000 -g orchard -s /bin/bash -m orchard
WORKDIR /app
# Copy requirements and install Python dependencies
COPY backend/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy backend source
COPY backend/ ./backend/
# Copy frontend build
COPY --from=frontend-builder /app/frontend/dist ./frontend/dist
# Set ownership
RUN chown -R orchard:orchard /app
USER orchard
EXPOSE 8080
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8080/health || exit 1
CMD ["uvicorn", "backend.app.main:app", "--host", "0.0.0.0", "--port", "8080"]

View File

@@ -74,9 +74,7 @@ class Artifact(Base):
content_type = Column(String(255))
original_name = Column(String(1024))
checksum_md5 = Column(String(32)) # MD5 hash for additional verification
checksum_sha1 = Column(String(40)) # SHA1 hash for compatibility
s3_etag = Column(String(64)) # S3 ETag for verification
artifact_metadata = Column("metadata", JSON, default=dict) # Format-specific metadata (column name is 'metadata')
metadata = Column(JSON, default=dict) # Format-specific metadata
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
created_by = Column(String(255), nullable=False)
ref_count = Column(Integer, default=1)
@@ -85,21 +83,6 @@ class Artifact(Base):
tags = relationship("Tag", back_populates="artifact")
uploads = relationship("Upload", back_populates="artifact")
@property
def sha256(self) -> str:
"""Alias for id - the SHA256 hash of the artifact content"""
return self.id
@property
def format_metadata(self):
"""Alias for artifact_metadata - backward compatibility"""
return self.artifact_metadata
@format_metadata.setter
def format_metadata(self, value):
"""Alias setter for artifact_metadata - backward compatibility"""
self.artifact_metadata = value
__table_args__ = (
Index("idx_artifacts_created_at", "created_at"),
Index("idx_artifacts_created_by", "created_by"),

View File

@@ -520,51 +520,40 @@ def upload_artifact(
)
# Store file (uses multipart for large files)
storage_result = storage.store(file.file, content_length)
sha256_hash, size, s3_key = storage.store(file.file, content_length)
# Check if this is a deduplicated upload
deduplicated = False
# Create or update artifact record
artifact = db.query(Artifact).filter(Artifact.id == storage_result.sha256).first()
artifact = db.query(Artifact).filter(Artifact.id == sha256_hash).first()
if artifact:
artifact.ref_count += 1
deduplicated = True
# Merge metadata if new metadata was extracted
if file_metadata and artifact.artifact_metadata:
artifact.artifact_metadata = {**artifact.artifact_metadata, **file_metadata}
if file_metadata and artifact.format_metadata:
artifact.format_metadata = {**artifact.format_metadata, **file_metadata}
elif file_metadata:
artifact.artifact_metadata = file_metadata
# Update checksums if not already set
if not artifact.checksum_md5 and storage_result.md5:
artifact.checksum_md5 = storage_result.md5
if not artifact.checksum_sha1 and storage_result.sha1:
artifact.checksum_sha1 = storage_result.sha1
if not artifact.s3_etag and storage_result.s3_etag:
artifact.s3_etag = storage_result.s3_etag
artifact.format_metadata = file_metadata
else:
artifact = Artifact(
id=storage_result.sha256,
size=storage_result.size,
id=sha256_hash,
size=size,
content_type=file.content_type,
original_name=file.filename,
checksum_md5=storage_result.md5,
checksum_sha1=storage_result.sha1,
s3_etag=storage_result.s3_etag,
created_by=user_id,
s3_key=storage_result.s3_key,
artifact_metadata=file_metadata or {},
s3_key=s3_key,
format_metadata=file_metadata or {},
)
db.add(artifact)
# Record upload
upload = Upload(
artifact_id=storage_result.sha256,
artifact_id=sha256_hash,
package_id=package.id,
original_name=file.filename,
uploaded_by=user_id,
source_ip=request.client.host if request.client else None,
deduplicated=deduplicated,
)
db.add(upload)
@@ -572,13 +561,13 @@ def upload_artifact(
if tag:
existing_tag = db.query(Tag).filter(Tag.package_id == package.id, Tag.name == tag).first()
if existing_tag:
existing_tag.artifact_id = storage_result.sha256
existing_tag.artifact_id = sha256_hash
existing_tag.created_by = user_id
else:
new_tag = Tag(
package_id=package.id,
name=tag,
artifact_id=storage_result.sha256,
artifact_id=sha256_hash,
created_by=user_id,
)
db.add(new_tag)
@@ -586,16 +575,12 @@ def upload_artifact(
db.commit()
return UploadResponse(
artifact_id=storage_result.sha256,
sha256=storage_result.sha256,
size=storage_result.size,
artifact_id=sha256_hash,
size=size,
project=project_name,
package=package_name,
tag=tag,
checksum_md5=storage_result.md5,
checksum_sha1=storage_result.sha1,
s3_etag=storage_result.s3_etag,
format_metadata=artifact.artifact_metadata,
format_metadata=artifact.format_metadata,
deduplicated=deduplicated,
)

View File

@@ -99,13 +99,9 @@ class PackageDetailResponse(BaseModel):
# Artifact schemas
class ArtifactResponse(BaseModel):
id: str
sha256: str # Explicit SHA256 field (same as id)
size: int
content_type: Optional[str]
original_name: Optional[str]
checksum_md5: Optional[str] = None
checksum_sha1: Optional[str] = None
s3_etag: Optional[str] = None
created_at: datetime
created_by: str
ref_count: int
@@ -177,13 +173,9 @@ class ArtifactTagInfo(BaseModel):
class ArtifactDetailResponse(BaseModel):
"""Artifact with list of tags/packages referencing it"""
id: str
sha256: str # Explicit SHA256 field (same as id)
size: int
content_type: Optional[str]
original_name: Optional[str]
checksum_md5: Optional[str] = None
checksum_sha1: Optional[str] = None
s3_etag: Optional[str] = None
created_at: datetime
created_by: str
ref_count: int
@@ -197,13 +189,9 @@ class ArtifactDetailResponse(BaseModel):
class PackageArtifactResponse(BaseModel):
"""Artifact with tags for package artifact listing"""
id: str
sha256: str # Explicit SHA256 field (same as id)
size: int
content_type: Optional[str]
original_name: Optional[str]
checksum_md5: Optional[str] = None
checksum_sha1: Optional[str] = None
s3_etag: Optional[str] = None
created_at: datetime
created_by: str
format_metadata: Optional[Dict[str, Any]] = None
@@ -216,14 +204,10 @@ class PackageArtifactResponse(BaseModel):
# Upload response
class UploadResponse(BaseModel):
artifact_id: str
sha256: str # Explicit SHA256 field (same as artifact_id)
size: int
project: str
package: str
tag: Optional[str]
checksum_md5: Optional[str] = None
checksum_sha1: Optional[str] = None
s3_etag: Optional[str] = None
format_metadata: Optional[Dict[str, Any]] = None
deduplicated: bool = False

View File

@@ -1,6 +1,6 @@
import hashlib
import logging
from typing import BinaryIO, Tuple, Optional, Dict, Any, Generator, NamedTuple
from typing import BinaryIO, Tuple, Optional, Dict, Any, Generator
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError
@@ -18,16 +18,6 @@ MULTIPART_CHUNK_SIZE = 10 * 1024 * 1024
HASH_CHUNK_SIZE = 8 * 1024 * 1024
class StorageResult(NamedTuple):
"""Result of storing a file with all computed checksums"""
sha256: str
size: int
s3_key: str
md5: Optional[str] = None
sha1: Optional[str] = None
s3_etag: Optional[str] = None
class S3Storage:
def __init__(self):
config = Config(s3={"addressing_style": "path"} if settings.s3_use_path_style else {})
@@ -44,9 +34,9 @@ class S3Storage:
# Store active multipart uploads for resumable support
self._active_uploads: Dict[str, Dict[str, Any]] = {}
def store(self, file: BinaryIO, content_length: Optional[int] = None) -> StorageResult:
def store(self, file: BinaryIO, content_length: Optional[int] = None) -> Tuple[str, int, str]:
"""
Store a file and return StorageResult with all checksums.
Store a file and return its SHA256 hash, size, and s3_key.
Content-addressable: if the file already exists, just return the hash.
Uses multipart upload for files larger than MULTIPART_THRESHOLD.
"""
@@ -56,76 +46,45 @@ class S3Storage:
else:
return self._store_multipart(file, content_length)
def _store_simple(self, file: BinaryIO) -> StorageResult:
def _store_simple(self, file: BinaryIO) -> Tuple[str, int, str]:
"""Store a small file using simple put_object"""
# Read file and compute all hashes
# Read file and compute hash
content = file.read()
sha256_hash = hashlib.sha256(content).hexdigest()
md5_hash = hashlib.md5(content).hexdigest()
sha1_hash = hashlib.sha1(content).hexdigest()
size = len(content)
# Check if already exists
s3_key = f"fruits/{sha256_hash[:2]}/{sha256_hash[2:4]}/{sha256_hash}"
s3_etag = None
if not self._exists(s3_key):
response = self.client.put_object(
self.client.put_object(
Bucket=self.bucket,
Key=s3_key,
Body=content,
)
s3_etag = response.get("ETag", "").strip('"')
else:
# Get existing ETag
obj_info = self.get_object_info(s3_key)
if obj_info:
s3_etag = obj_info.get("etag", "").strip('"')
return StorageResult(
sha256=sha256_hash,
size=size,
s3_key=s3_key,
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
)
return sha256_hash, size, s3_key
def _store_multipart(self, file: BinaryIO, content_length: int) -> StorageResult:
def _store_multipart(self, file: BinaryIO, content_length: int) -> Tuple[str, int, str]:
"""Store a large file using S3 multipart upload with streaming hash computation"""
# First pass: compute all hashes by streaming through file
sha256_hasher = hashlib.sha256()
md5_hasher = hashlib.md5()
sha1_hasher = hashlib.sha1()
# First pass: compute hash by streaming through file
hasher = hashlib.sha256()
size = 0
# Read file in chunks to compute hashes
# Read file in chunks to compute hash
while True:
chunk = file.read(HASH_CHUNK_SIZE)
if not chunk:
break
sha256_hasher.update(chunk)
md5_hasher.update(chunk)
sha1_hasher.update(chunk)
hasher.update(chunk)
size += len(chunk)
sha256_hash = sha256_hasher.hexdigest()
md5_hash = md5_hasher.hexdigest()
sha1_hash = sha1_hasher.hexdigest()
sha256_hash = hasher.hexdigest()
s3_key = f"fruits/{sha256_hash[:2]}/{sha256_hash[2:4]}/{sha256_hash}"
# Check if already exists (deduplication)
if self._exists(s3_key):
obj_info = self.get_object_info(s3_key)
s3_etag = obj_info.get("etag", "").strip('"') if obj_info else None
return StorageResult(
sha256=sha256_hash,
size=size,
s3_key=s3_key,
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
)
return sha256_hash, size, s3_key
# Seek back to start for upload
file.seek(0)
@@ -157,22 +116,14 @@ class S3Storage:
part_number += 1
# Complete multipart upload
complete_response = self.client.complete_multipart_upload(
self.client.complete_multipart_upload(
Bucket=self.bucket,
Key=s3_key,
UploadId=upload_id,
MultipartUpload={"Parts": parts},
)
s3_etag = complete_response.get("ETag", "").strip('"')
return StorageResult(
sha256=sha256_hash,
size=size,
s3_key=s3_key,
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
)
return sha256_hash, size, s3_key
except Exception as e:
# Abort multipart upload on failure
@@ -184,50 +135,33 @@ class S3Storage:
)
raise
def store_streaming(self, chunks: Generator[bytes, None, None]) -> StorageResult:
def store_streaming(self, chunks: Generator[bytes, None, None]) -> Tuple[str, int, str]:
"""
Store a file from a stream of chunks.
First accumulates to compute hash, then uploads.
For truly large files, consider using initiate_resumable_upload instead.
"""
# Accumulate chunks and compute all hashes
sha256_hasher = hashlib.sha256()
md5_hasher = hashlib.md5()
sha1_hasher = hashlib.sha1()
# Accumulate chunks and compute hash
hasher = hashlib.sha256()
all_chunks = []
size = 0
for chunk in chunks:
sha256_hasher.update(chunk)
md5_hasher.update(chunk)
sha1_hasher.update(chunk)
hasher.update(chunk)
all_chunks.append(chunk)
size += len(chunk)
sha256_hash = sha256_hasher.hexdigest()
md5_hash = md5_hasher.hexdigest()
sha1_hash = sha1_hasher.hexdigest()
sha256_hash = hasher.hexdigest()
s3_key = f"fruits/{sha256_hash[:2]}/{sha256_hash[2:4]}/{sha256_hash}"
s3_etag = None
# Check if already exists
if self._exists(s3_key):
obj_info = self.get_object_info(s3_key)
s3_etag = obj_info.get("etag", "").strip('"') if obj_info else None
return StorageResult(
sha256=sha256_hash,
size=size,
s3_key=s3_key,
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
)
return sha256_hash, size, s3_key
# Upload based on size
if size < MULTIPART_THRESHOLD:
content = b"".join(all_chunks)
response = self.client.put_object(Bucket=self.bucket, Key=s3_key, Body=content)
s3_etag = response.get("ETag", "").strip('"')
self.client.put_object(Bucket=self.bucket, Key=s3_key, Body=content)
else:
# Use multipart for large files
mpu = self.client.create_multipart_upload(Bucket=self.bucket, Key=s3_key)
@@ -271,13 +205,12 @@ class S3Storage:
"ETag": response["ETag"],
})
complete_response = self.client.complete_multipart_upload(
self.client.complete_multipart_upload(
Bucket=self.bucket,
Key=s3_key,
UploadId=upload_id,
MultipartUpload={"Parts": parts},
)
s3_etag = complete_response.get("ETag", "").strip('"')
except Exception as e:
logger.error(f"Streaming multipart upload failed: {e}")
@@ -288,14 +221,7 @@ class S3Storage:
)
raise
return StorageResult(
sha256=sha256_hash,
size=size,
s3_key=s3_key,
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
)
return sha256_hash, size, s3_key
def initiate_resumable_upload(self, expected_hash: str) -> Dict[str, Any]:
"""

View File

@@ -1,122 +0,0 @@
version: '3.8'
services:
orchard-server:
build:
context: .
dockerfile: Dockerfile.local
ports:
- "8080:8080"
environment:
- ORCHARD_SERVER_HOST=0.0.0.0
- ORCHARD_SERVER_PORT=8080
- ORCHARD_DATABASE_HOST=postgres
- ORCHARD_DATABASE_PORT=5432
- ORCHARD_DATABASE_USER=orchard
- ORCHARD_DATABASE_PASSWORD=orchard_secret
- ORCHARD_DATABASE_DBNAME=orchard
- ORCHARD_DATABASE_SSLMODE=disable
- ORCHARD_S3_ENDPOINT=http://minio:9000
- ORCHARD_S3_REGION=us-east-1
- ORCHARD_S3_BUCKET=orchard-artifacts
- ORCHARD_S3_ACCESS_KEY_ID=minioadmin
- ORCHARD_S3_SECRET_ACCESS_KEY=minioadmin
- ORCHARD_S3_USE_PATH_STYLE=true
- ORCHARD_REDIS_HOST=redis
- ORCHARD_REDIS_PORT=6379
depends_on:
postgres:
condition: service_healthy
minio:
condition: service_healthy
redis:
condition: service_healthy
networks:
- orchard-network
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
interval: 30s
timeout: 3s
start_period: 10s
retries: 3
postgres:
image: postgres:16-alpine
environment:
- POSTGRES_USER=orchard
- POSTGRES_PASSWORD=orchard_secret
- POSTGRES_DB=orchard
volumes:
- postgres-data-local:/var/lib/postgresql/data
- ./migrations:/docker-entrypoint-initdb.d:ro
ports:
- "5432:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U orchard -d orchard"]
interval: 10s
timeout: 5s
retries: 5
networks:
- orchard-network
restart: unless-stopped
minio:
image: minio/minio:latest
command: server /data --console-address ":9001"
environment:
- MINIO_ROOT_USER=minioadmin
- MINIO_ROOT_PASSWORD=minioadmin
volumes:
- minio-data-local:/data
ports:
- "9000:9000"
- "9001:9001"
healthcheck:
test: ["CMD", "mc", "ready", "local"]
interval: 10s
timeout: 5s
retries: 5
networks:
- orchard-network
restart: unless-stopped
minio-init:
image: minio/mc:latest
depends_on:
minio:
condition: service_healthy
entrypoint: >
/bin/sh -c "
mc alias set myminio http://minio:9000 minioadmin minioadmin;
mc mb myminio/orchard-artifacts --ignore-existing;
mc anonymous set download myminio/orchard-artifacts;
exit 0;
"
networks:
- orchard-network
redis:
image: redis:7-alpine
command: redis-server --appendonly yes
volumes:
- redis-data-local:/data
ports:
- "6379:6379"
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
timeout: 5s
retries: 5
networks:
- orchard-network
restart: unless-stopped
volumes:
postgres-data-local:
minio-data-local:
redis-data-local:
networks:
orchard-network:
driver: bridge

View File

@@ -36,7 +36,7 @@ services:
restart: unless-stopped
postgres:
image: containers.global.bsf.tools/postgres:16-alpine
image: postgres:16-alpine
environment:
- POSTGRES_USER=orchard
- POSTGRES_PASSWORD=orchard_secret
@@ -56,7 +56,7 @@ services:
restart: unless-stopped
minio:
image: containers.global.bsf.tools/minio/minio:latest
image: minio/minio:latest
command: server /data --console-address ":9001"
environment:
- MINIO_ROOT_USER=minioadmin
@@ -76,7 +76,7 @@ services:
restart: unless-stopped
minio-init:
image: containers.global.bsf.tools/minio/mc:latest
image: minio/mc:latest
depends_on:
minio:
condition: service_healthy
@@ -91,7 +91,7 @@ services:
- orchard-network
redis:
image: containers.global.bsf.tools/redis:7-alpine
image: redis:7-alpine
command: redis-server --appendonly yes
volumes:
- redis-data:/data

View File

@@ -1,504 +0,0 @@
# Integrity Verification Workflow Design
This document defines the process for SHA256 checksum verification on artifact downloads, including failure handling and retry mechanisms.
## Overview
Orchard uses content-addressable storage where the artifact ID is the SHA256 hash of the content. This design leverages that property to provide configurable integrity verification during downloads.
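As a small illustration of that property (not part of the design itself), the storage key is derived directly from the content hash, mirroring the `fruits/<aa>/<bb>/<sha256>` layout used in the S3 storage backend; the helper name below is illustrative.
```python
import hashlib

def s3_key_for(content: bytes) -> str:
    """Derive the content-addressed S3 key from the SHA256 of the payload."""
    sha256_hash = hashlib.sha256(content).hexdigest()
    # Two levels of prefix fan-out, then the full hash, matching the storage layout.
    return f"fruits/{sha256_hash[:2]}/{sha256_hash[2:4]}/{sha256_hash}"
```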
## Current State
| Aspect | Status |
|--------|--------|
| Download streams content directly from S3 | ✅ Implemented |
| Artifact ID is the SHA256 hash | ✅ Implemented |
| S3 key derived from SHA256 hash | ✅ Implemented |
| Verification during download | ❌ Not implemented |
| Checksum headers in response | ❌ Not implemented |
| Retry mechanism on failure | ❌ Not implemented |
| Failure handling beyond S3 errors | ❌ Not implemented |
## Verification Modes
The verification mode is selected per request via the `?verify=<mode>` query parameter, or falls back to the server-wide default set by `ORCHARD_VERIFY_MODE`. A small mode-resolution sketch follows the table below.
| Mode | Performance | Integrity | Use Case |
|------|-------------|-----------|----------|
| `none` | ⚡ Fastest | Client-side | Trusted networks, high throughput |
| `header` | ⚡ Fast | Client-side | Standard downloads, client verification |
| `stream` | 🔄 Moderate | Post-hoc server | Logging/auditing, non-blocking |
| `pre` | 🐢 Slower | Guaranteed | Critical downloads, untrusted storage |
| `strict` | 🐢 Slower | Guaranteed + Alert | Security-sensitive, compliance |
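A minimal sketch of that mode resolution; the helper name and signature are illustrative, not existing code:
```python
from typing import Optional

VALID_MODES = {"none", "header", "stream", "pre", "strict"}

def resolve_verify_mode(query_mode: Optional[str], default_mode: str, allow_override: bool) -> str:
    """Pick the effective verification mode for a download request."""
    # A per-request override only applies when the server allows it and the value is valid.
    if allow_override and query_mode in VALID_MODES:
        return query_mode
    return default_mode
```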
### Mode: None (Default)
**Behavior:**
- Stream content directly from S3 with no server-side processing
- Maximum download performance
- Client is responsible for verification
**Headers Returned:**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
```
**Flow:**
```
Client Request → Lookup Artifact → Stream from S3 → Client
```
### Mode: Header
**Behavior:**
- Stream content directly from S3
- Include comprehensive checksum headers
- Client performs verification using headers
**Headers Returned:**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
Digest: sha-256=<base64_encoded_hash>
ETag: "<sha256_hash>"
X-Content-SHA256: <expected_hash>
```
**Flow:**
```
Client Request → Lookup Artifact → Add Headers → Stream from S3 → Client Verifies
```
**Client Verification Example:**
```bash
# Download and verify
curl -OJ https://orchard/project/foo/bar/+/v1.0.0
EXPECTED=$(curl -sI https://orchard/project/foo/bar/+/v1.0.0 | grep -i x-checksum-sha256 | tr -d '\r' | cut -d' ' -f2)
ACTUAL=$(sha256sum downloaded_file | cut -d' ' -f1)
[ "$EXPECTED" = "$ACTUAL" ] && echo "OK" || echo "MISMATCH"
```
### Mode: Stream (Post-Hoc Verification)
**Behavior:**
- Wrap S3 stream with `HashingStreamWrapper`
- Compute SHA256 incrementally while streaming to client
- Verify hash after stream completes
- Log verification result
- Cannot reject content (already sent to client)
**Headers Returned:**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
X-Verify-Mode: stream
Trailer: X-Verified
```
**Trailers (if client supports):**
```
X-Verified: true|false
X-Computed-SHA256: <computed_hash>
```
**Flow:**
```
Client Request → Lookup Artifact → Wrap Stream → Stream to Client
                                   (compute hash incrementally while streaming)
Stream complete → Verify computed hash → Log result
```
**Implementation:**
```python
import hashlib
from typing import Callable

class HashingStreamWrapper:
def __init__(self, stream, expected_hash: str, on_complete: Callable):
self.stream = stream
self.hasher = hashlib.sha256()
self.expected_hash = expected_hash
self.on_complete = on_complete
def __iter__(self):
for chunk in self.stream:
self.hasher.update(chunk)
yield chunk
# Stream complete, verify
computed = self.hasher.hexdigest()
self.on_complete(computed == self.expected_hash, computed)
```
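For context, a hedged sketch of how the wrapper above might be attached to a streaming response, assuming a FastAPI/Starlette `StreamingResponse`; the function and logging names are illustrative:
```python
import logging

from fastapi.responses import StreamingResponse

logger = logging.getLogger(__name__)

def stream_with_posthoc_verification(s3_stream, artifact) -> StreamingResponse:
    """Stream the artifact while checking its SHA256 after the last chunk."""
    def on_complete(verified: bool, computed: str) -> None:
        # Post-hoc only: the bytes are already on the wire, so just record the outcome.
        if verified:
            logger.info("verification.success artifact=%s", artifact.id)
        else:
            logger.error("verification.failure artifact=%s computed=%s", artifact.id, computed)

    wrapped = HashingStreamWrapper(s3_stream, expected_hash=artifact.id, on_complete=on_complete)
    return StreamingResponse(
        wrapped,
        media_type=artifact.content_type or "application/octet-stream",
        headers={"X-Checksum-SHA256": artifact.id, "X-Verify-Mode": "stream"},
    )
```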
### Mode: Pre-Verify (Blocking)
**Behavior:**
- Download entire content from S3 to memory/temp file
- Compute SHA256 hash before sending to client
- On match: stream verified content to client
- On mismatch: retry from S3 (up to N times)
- If retries exhausted: return 500 error
**Headers Returned:**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
X-Verify-Mode: pre
X-Verified: true
```
**Flow:**
```
Client Request → Lookup Artifact → Download from S3 → Compute Hash
  Hash matches?
    Yes → Stream to Client
    No  → Retries remaining? Yes → retry download
                             No  → 500 Error
```
**Memory Considerations:**
- For files < `ORCHARD_VERIFY_MEMORY_LIMIT` (default 100MB): buffer in memory
- For larger files: use temporary file with streaming hash computation
- Cleanup temp files after response sent
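A minimal sketch of that buffering decision, assuming a boto3-style body object with a file-like `read()`; the function name and return shape are illustrative:
```python
import hashlib
import os
import tempfile

# Default mirrors ORCHARD_VERIFY_MEMORY_LIMIT (100MB).
MEMORY_LIMIT = int(os.environ.get("ORCHARD_VERIFY_MEMORY_LIMIT", 100 * 1024 * 1024))

def preverify(s3_body, expected_hash: str, expected_size: int):
    """Hash the object before anything is sent to the client.

    Small objects are buffered in memory; larger ones spill to a temp file while
    the SHA256 is computed incrementally. Returns (verified, source), where
    source is the in-memory bytes or a temp file rewound to the start.
    """
    if expected_size <= MEMORY_LIMIT:
        content = s3_body.read()
        return hashlib.sha256(content).hexdigest() == expected_hash, content

    hasher = hashlib.sha256()
    tmp = tempfile.TemporaryFile()
    for chunk in iter(lambda: s3_body.read(8 * 1024 * 1024), b""):
        hasher.update(chunk)
        tmp.write(chunk)
    tmp.seek(0)
    return hasher.hexdigest() == expected_hash, tmp
```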
### Mode: Strict
**Behavior:**
- Same as pre-verify but with no retries
- Fail immediately on any mismatch
- Quarantine artifact on failure (mark as potentially corrupted)
- Trigger alert/notification on failure
- For security-critical downloads
**Headers Returned (on success):**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
X-Verify-Mode: strict
X-Verified: true
```
**Error Response (on failure):**
```json
{
"error": "integrity_verification_failed",
"message": "Artifact content does not match expected checksum",
"expected_hash": "<expected>",
"computed_hash": "<computed>",
"artifact_id": "<id>",
"action_taken": "quarantined"
}
```
**Quarantine Process:**
1. Mark artifact `status = 'quarantined'` in database
2. Log security event to audit_logs
3. Optionally notify via webhook/email
4. Artifact becomes unavailable for download until resolved
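A rough sketch of steps 1–2, assuming a `status` column on the Artifact model and an `AuditLog` ORM model (both hypothetical here, named only for illustration):
```python
def quarantine_artifact(db, artifact, computed_hash: str) -> None:
    """Flag the artifact as quarantined and record a security audit event."""
    artifact.status = "quarantined"  # assumed column, not an existing field
    db.add(AuditLog(                 # assumed model mapping the audit_logs table
        action="artifact.quarantined",
        resource=f"artifact/{artifact.id}",
        details={"expected_hash": artifact.id, "computed_hash": computed_hash},
    ))
    db.commit()
```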
## Failure Detection
### Failure Types
| Failure Type | Detection Method | Severity |
|--------------|------------------|----------|
| Hash mismatch | Computed SHA256 ≠ Expected | Critical |
| Size mismatch | Actual bytes ≠ `Content-Length` | High |
| S3 read error | boto3 exception | Medium |
| Truncated content | Stream ends early | High |
| S3 object missing | `NoSuchKey` error | Critical |
| ETag mismatch | S3 ETag ≠ expected | Medium |
### Detection Implementation
```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class VerificationResult:
success: bool
failure_type: Optional[str] # hash_mismatch, size_mismatch, etc.
expected_hash: str
computed_hash: Optional[str]
expected_size: int
actual_size: Optional[int]
error_message: Optional[str]
retry_count: int
```
## Retry Mechanism
### Configuration
| Environment Variable | Default | Description |
|---------------------|---------|-------------|
| `ORCHARD_VERIFY_MAX_RETRIES` | 3 | Maximum retry attempts |
| `ORCHARD_VERIFY_RETRY_DELAY_MS` | 100 | Base delay between retries |
| `ORCHARD_VERIFY_RETRY_BACKOFF` | 2.0 | Exponential backoff multiplier |
| `ORCHARD_VERIFY_RETRY_MAX_DELAY_MS` | 5000 | Maximum delay cap |
### Backoff Formula
```
delay = min(base_delay * (backoff ^ attempt), max_delay)
```
Example with defaults:
- Attempt 1: 100ms
- Attempt 2: 200ms
- Attempt 3: 400ms
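A small helper implementing this formula (the `calculate_backoff` referenced in the retry flow below); the defaults mirror the configuration table above, and `attempt` is zero-based:
```python
def calculate_backoff(attempt: int, base_delay_ms: float = 100.0,
                      backoff: float = 2.0, max_delay_ms: float = 5000.0) -> float:
    """Exponential backoff with a cap, returned in milliseconds."""
    return min(base_delay_ms * (backoff ** attempt), max_delay_ms)
```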
### Retry Flow
```python
import asyncio

# fetch_from_s3, compute_sha256, IntegrityError, and S3Error are helpers assumed
# to exist elsewhere in the service; calculate_backoff is sketched above.
async def download_with_retry(artifact, max_retries=3):
for attempt in range(max_retries + 1):
try:
content = await fetch_from_s3(artifact.s3_key)
computed_hash = compute_sha256(content)
if computed_hash == artifact.id:
return content # Success
# Hash mismatch
log.warning(f"Verification failed, attempt {attempt + 1}/{max_retries + 1}")
if attempt < max_retries:
delay = calculate_backoff(attempt)
await asyncio.sleep(delay / 1000)
else:
raise IntegrityError("Max retries exceeded")
except S3Error as e:
if attempt < max_retries:
delay = calculate_backoff(attempt)
await asyncio.sleep(delay / 1000)
else:
raise
```
### Retryable vs Non-Retryable Failures
**Retryable:**
- S3 read timeout
- S3 connection error
- Hash mismatch (may be transient S3 issue)
- Truncated content
**Non-Retryable:**
- S3 object not found (404)
- S3 access denied (403)
- Artifact not in database
- Strict mode failures
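One way to encode this classification, sketched under the assumption that failures are tagged with `failure_type` strings like those in `VerificationResult` above (the exact tags are illustrative):
```python
RETRYABLE_FAILURES = {
    "s3_timeout",
    "s3_connection_error",
    "hash_mismatch",      # may be a transient S3 issue
    "truncated_content",
}

def is_retryable(failure_type: str, verify_mode: str) -> bool:
    """Strict mode never retries; otherwise consult the failure classification."""
    if verify_mode == "strict":
        return False
    return failure_type in RETRYABLE_FAILURES
```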
## Configuration Reference
### Environment Variables
```bash
# Verification mode (none, header, stream, pre, strict)
ORCHARD_VERIFY_MODE=none
# Retry settings
ORCHARD_VERIFY_MAX_RETRIES=3
ORCHARD_VERIFY_RETRY_DELAY_MS=100
ORCHARD_VERIFY_RETRY_BACKOFF=2.0
ORCHARD_VERIFY_RETRY_MAX_DELAY_MS=5000
# Memory limit for pre-verify buffering (bytes)
ORCHARD_VERIFY_MEMORY_LIMIT=104857600 # 100MB
# Strict mode settings
ORCHARD_VERIFY_QUARANTINE_ON_FAILURE=true
ORCHARD_VERIFY_ALERT_WEBHOOK=https://alerts.example.com/webhook
# Allow per-request mode override
ORCHARD_VERIFY_ALLOW_OVERRIDE=true
```
### Per-Request Override
When `ORCHARD_VERIFY_ALLOW_OVERRIDE=true`, clients can specify verification mode:
```
GET /api/v1/project/foo/bar/+/v1.0.0?verify=pre
GET /api/v1/project/foo/bar/+/v1.0.0?verify=none
```
## API Changes
### Download Endpoint
**Request:**
```
GET /api/v1/project/{project}/{package}/+/{ref}?verify={mode}
```
**New Query Parameters:**
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `verify` | string | from config | Verification mode |
**New Response Headers:**
| Header | Description |
|--------|-------------|
| `X-Checksum-SHA256` | Expected SHA256 hash |
| `X-Verify-Mode` | Active verification mode |
| `X-Verified` | `true` if server verified content |
| `Digest` | RFC 3230 digest header |
### New Endpoint: Verify Artifact
**Request:**
```
POST /api/v1/project/{project}/{package}/+/{ref}/verify
```
**Response:**
```json
{
"artifact_id": "abc123...",
"verified": true,
"expected_hash": "abc123...",
"computed_hash": "abc123...",
"size_match": true,
"expected_size": 1048576,
"actual_size": 1048576,
"verification_time_ms": 45
}
```
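A hedged sketch of what the handler behind this endpoint could look like, assuming FastAPI routing and a streaming read from S3; `resolve_artifact` and `storage.stream` are assumed helpers, not existing functions:
```python
import hashlib
import time

@router.post("/api/v1/project/{project}/{package}/+/{ref}/verify")
def verify_artifact(project: str, package: str, ref: str, db=Depends(get_db)):
    artifact = resolve_artifact(db, project, package, ref)  # assumed lookup helper
    started = time.monotonic()
    hasher = hashlib.sha256()
    actual_size = 0
    for chunk in storage.stream(artifact.s3_key):  # assumed streaming read from S3
        hasher.update(chunk)
        actual_size += len(chunk)
    computed = hasher.hexdigest()
    return {
        "artifact_id": artifact.id,
        "verified": computed == artifact.id,
        "expected_hash": artifact.id,
        "computed_hash": computed,
        "size_match": actual_size == artifact.size,
        "expected_size": artifact.size,
        "actual_size": actual_size,
        "verification_time_ms": int((time.monotonic() - started) * 1000),
    }
```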
## Logging and Monitoring
### Log Events
| Event | Level | When |
|-------|-------|------|
| `verification.success` | INFO | Hash verified successfully |
| `verification.failure` | ERROR | Hash mismatch detected |
| `verification.retry` | WARN | Retry attempt initiated |
| `verification.quarantine` | ERROR | Artifact quarantined |
| `verification.skip` | DEBUG | Verification skipped (mode=none) |
### Metrics
| Metric | Type | Description |
|--------|------|-------------|
| `orchard_verification_total` | Counter | Total verification attempts |
| `orchard_verification_failures` | Counter | Failed verifications |
| `orchard_verification_retries` | Counter | Retry attempts |
| `orchard_verification_duration_ms` | Histogram | Verification time |
### Audit Log Entry
```json
{
"action": "artifact.download.verified",
"resource": "project/foo/package/bar/artifact/abc123",
"user_id": "user@example.com",
"details": {
"verification_mode": "pre",
"verified": true,
"retry_count": 0,
"duration_ms": 45
}
}
```
## Security Considerations
1. **Strict Mode for Sensitive Data**: Use strict mode for artifacts containing credentials, certificates, or security-critical code.
2. **Quarantine Isolation**: Quarantined artifacts should be moved to a separate S3 prefix or bucket for forensic analysis.
3. **Alert on Repeated Failures**: Multiple verification failures for the same artifact may indicate storage corruption or tampering.
4. **Audit Trail**: All verification events should be logged for compliance and forensic purposes.
5. **Client Trust**: In `none` and `header` modes, clients must implement their own verification for security guarantees.
## Implementation Phases
### Phase 1: Headers Only
- Add `X-Checksum-SHA256` header to all downloads
- Add `verify=header` mode support
- Add configuration options
### Phase 2: Stream Verification
- Implement `HashingStreamWrapper`
- Add `verify=stream` mode
- Add verification logging
### Phase 3: Pre-Verification
- Implement buffered verification
- Add retry mechanism
- Add `verify=pre` mode
### Phase 4: Strict Mode
- Implement quarantine mechanism
- Add alerting integration
- Add `verify=strict` mode
## Client Integration Examples
### curl with Verification
```bash
#!/bin/bash
URL="https://orchard.example.com/api/v1/project/myproject/mypackage/+/v1.0.0"
# Get expected hash from headers
EXPECTED=$(curl -sI "$URL" | grep -i "X-Checksum-SHA256" | tr -d '\r' | cut -d' ' -f2)
# Download file
curl -sO "$URL"
FILENAME=$(basename "$URL")
# Verify
ACTUAL=$(sha256sum "$FILENAME" | cut -d' ' -f1)
if [ "$EXPECTED" = "$ACTUAL" ]; then
echo "✓ Verification passed"
else
echo "✗ Verification FAILED"
echo " Expected: $EXPECTED"
echo " Actual: $ACTUAL"
exit 1
fi
```
### Python Client
```python
import hashlib
import requests
def download_verified(url: str) -> bytes:
# Get headers first
head = requests.head(url)
expected_hash = head.headers.get('X-Checksum-SHA256')
expected_size = int(head.headers.get('Content-Length', 0))
# Download content
response = requests.get(url)
content = response.content
# Verify size
if len(content) != expected_size:
raise ValueError(f"Size mismatch: {len(content)} != {expected_size}")
# Verify hash
actual_hash = hashlib.sha256(content).hexdigest()
if actual_hash != expected_hash:
raise ValueError(f"Hash mismatch: {actual_hash} != {expected_hash}")
return content
```
### Server-Side Verification
```bash
# Force server to verify before sending
curl -O "https://orchard.example.com/api/v1/project/myproject/mypackage/+/v1.0.0?verify=pre"
# Check if verification was performed
curl -I "https://orchard.example.com/api/v1/project/myproject/mypackage/+/v1.0.0?verify=pre" | grep X-Verified
# X-Verified: true
```

View File

@@ -41,8 +41,6 @@ CREATE TABLE IF NOT EXISTS artifacts (
content_type VARCHAR(255),
original_name VARCHAR(1024),
checksum_md5 VARCHAR(32), -- MD5 hash for additional verification
checksum_sha1 VARCHAR(40), -- SHA1 hash for compatibility
s3_etag VARCHAR(64), -- S3 ETag for verification
metadata JSONB, -- format-specific metadata
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
created_by VARCHAR(255) NOT NULL,
@@ -53,8 +51,6 @@ CREATE TABLE IF NOT EXISTS artifacts (
CREATE INDEX idx_artifacts_created_at ON artifacts(created_at);
CREATE INDEX idx_artifacts_created_by ON artifacts(created_by);
CREATE INDEX idx_artifacts_metadata ON artifacts USING GIN (metadata);
CREATE INDEX idx_artifacts_checksum_md5 ON artifacts(checksum_md5) WHERE checksum_md5 IS NOT NULL;
CREATE INDEX idx_artifacts_checksum_sha1 ON artifacts(checksum_sha1) WHERE checksum_sha1 IS NOT NULL;
-- Tags (Aliases pointing to artifacts)
CREATE TABLE IF NOT EXISTS tags (

View File

@@ -1,12 +0,0 @@
-- Migration 003: Additional Checksum Fields
-- Adds checksum_sha1 and s3_etag fields to artifacts table
-- ============================================
-- Artifacts: Add checksum_sha1 and s3_etag fields
-- ============================================
ALTER TABLE artifacts ADD COLUMN IF NOT EXISTS checksum_sha1 VARCHAR(40);
ALTER TABLE artifacts ADD COLUMN IF NOT EXISTS s3_etag VARCHAR(64);
-- Create indexes for checksum lookups (optional, for verification queries)
CREATE INDEX IF NOT EXISTS idx_artifacts_checksum_md5 ON artifacts(checksum_md5) WHERE checksum_md5 IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_artifacts_checksum_sha1 ON artifacts(checksum_sha1) WHERE checksum_sha1 IS NOT NULL;