1 Commit

Author: Mondo Diaz
SHA1: 9296f7eb54
Date: 2025-12-12 15:14:44 -06:00

Add schema enhancements for uploads, artifacts, and audit tracking

- Add format and platform fields to packages table
- Add checksum_md5 and metadata JSONB to artifacts with CHECK constraints
- Add updated_at and composite index to tags table
- Add tag_name, user_agent, duration_ms, deduplicated, checksum_verified to uploads
- Add change_type field to tag_history table
- Add composite indexes and GIN index to audit_logs
- Add partial index for public projects
- Add triggers for ref_count accuracy and updated_at timestamps
- Create migration script (002) for existing databases
12 changed files with 47 additions and 872 deletions

View File

@@ -6,18 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Added
- Added integrity verification workflow design document (#24)
- Added `sha256` field to API responses for clarity (alias of `id`) (#25)
- Added `checksum_sha1` field to artifacts table for compatibility (#25)
- Added `s3_etag` field to artifacts table for S3 verification (#25)
- Compute and store MD5, SHA1, and S3 ETag alongside SHA256 during upload (#25)
- Added `Dockerfile.local` and `docker-compose.local.yml` for local development (#25)
- Added migration script `003_checksum_fields.sql` for existing databases (#25)
## [0.2.0] - 2025-12-15
### Changed
- Updated images to use internal container BSF proxy (#46)
### Added
- Added `format` and `platform` fields to packages table (#16)
- Added `checksum_md5` and `metadata` JSONB fields to artifacts table (#16)

View File

@@ -1,5 +1,5 @@
# Frontend build stage
FROM containers.global.bsf.tools/node:20-alpine AS frontend-builder
FROM node:20-alpine AS frontend-builder
ARG NPM_REGISTRY=https://deps.global.bsf.tools/artifactory/api/npm/registry.npmjs.org/
@@ -19,7 +19,7 @@ COPY frontend/ ./
RUN npm run build
# Runtime stage
FROM containers.global.bsf.tools/python:3.12-slim
FROM python:3.12-slim
# Disable proxy cache
RUN echo 'Acquire::http::Pipeline-Depth 0;\nAcquire::http::No-Cache true;\nAcquire::BrokenProxy true;\n' > /etc/apt/apt.conf.d/99fixbadproxy

View File

@@ -1,50 +0,0 @@
# Frontend build stage
FROM node:20-alpine AS frontend-builder
WORKDIR /app/frontend
# Copy package files
COPY frontend/package*.json ./
RUN npm install
# Copy frontend source
COPY frontend/ ./
# Build frontend
RUN npm run build
# Runtime stage
FROM python:3.12-slim
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create non-root user
RUN groupadd -g 1000 orchard && \
useradd -u 1000 -g orchard -s /bin/bash -m orchard
WORKDIR /app
# Copy requirements and install Python dependencies
COPY backend/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy backend source
COPY backend/ ./backend/
# Copy frontend build
COPY --from=frontend-builder /app/frontend/dist ./frontend/dist
# Set ownership
RUN chown -R orchard:orchard /app
USER orchard
EXPOSE 8080
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8080/health || exit 1
CMD ["uvicorn", "backend.app.main:app", "--host", "0.0.0.0", "--port", "8080"]

View File

@@ -74,9 +74,7 @@ class Artifact(Base):
content_type = Column(String(255))
original_name = Column(String(1024))
checksum_md5 = Column(String(32)) # MD5 hash for additional verification
checksum_sha1 = Column(String(40)) # SHA1 hash for compatibility
s3_etag = Column(String(64)) # S3 ETag for verification
artifact_metadata = Column("metadata", JSON, default=dict) # Format-specific metadata (column name is 'metadata')
metadata = Column(JSON, default=dict) # Format-specific metadata
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
created_by = Column(String(255), nullable=False)
ref_count = Column(Integer, default=1)
@@ -85,21 +83,6 @@ class Artifact(Base):
tags = relationship("Tag", back_populates="artifact")
uploads = relationship("Upload", back_populates="artifact")
@property
def sha256(self) -> str:
"""Alias for id - the SHA256 hash of the artifact content"""
return self.id
@property
def format_metadata(self):
"""Alias for artifact_metadata - backward compatibility"""
return self.artifact_metadata
@format_metadata.setter
def format_metadata(self, value):
"""Alias setter for artifact_metadata - backward compatibility"""
self.artifact_metadata = value
__table_args__ = (
Index("idx_artifacts_created_at", "created_at"),
Index("idx_artifacts_created_by", "created_by"),

View File

@@ -520,51 +520,40 @@ def upload_artifact(
)
# Store file (uses multipart for large files)
storage_result = storage.store(file.file, content_length)
sha256_hash, size, s3_key = storage.store(file.file, content_length)
# Check if this is a deduplicated upload
deduplicated = False
# Create or update artifact record
artifact = db.query(Artifact).filter(Artifact.id == storage_result.sha256).first()
artifact = db.query(Artifact).filter(Artifact.id == sha256_hash).first()
if artifact:
artifact.ref_count += 1
deduplicated = True
# Merge metadata if new metadata was extracted
if file_metadata and artifact.artifact_metadata:
artifact.artifact_metadata = {**artifact.artifact_metadata, **file_metadata}
if file_metadata and artifact.format_metadata:
artifact.format_metadata = {**artifact.format_metadata, **file_metadata}
elif file_metadata:
artifact.artifact_metadata = file_metadata
# Update checksums if not already set
if not artifact.checksum_md5 and storage_result.md5:
artifact.checksum_md5 = storage_result.md5
if not artifact.checksum_sha1 and storage_result.sha1:
artifact.checksum_sha1 = storage_result.sha1
if not artifact.s3_etag and storage_result.s3_etag:
artifact.s3_etag = storage_result.s3_etag
artifact.format_metadata = file_metadata
else:
artifact = Artifact(
id=storage_result.sha256,
size=storage_result.size,
id=sha256_hash,
size=size,
content_type=file.content_type,
original_name=file.filename,
checksum_md5=storage_result.md5,
checksum_sha1=storage_result.sha1,
s3_etag=storage_result.s3_etag,
created_by=user_id,
s3_key=storage_result.s3_key,
artifact_metadata=file_metadata or {},
s3_key=s3_key,
format_metadata=file_metadata or {},
)
db.add(artifact)
# Record upload
upload = Upload(
artifact_id=storage_result.sha256,
artifact_id=sha256_hash,
package_id=package.id,
original_name=file.filename,
uploaded_by=user_id,
source_ip=request.client.host if request.client else None,
deduplicated=deduplicated,
)
db.add(upload)
@@ -572,13 +561,13 @@ def upload_artifact(
if tag:
existing_tag = db.query(Tag).filter(Tag.package_id == package.id, Tag.name == tag).first()
if existing_tag:
existing_tag.artifact_id = storage_result.sha256
existing_tag.artifact_id = sha256_hash
existing_tag.created_by = user_id
else:
new_tag = Tag(
package_id=package.id,
name=tag,
artifact_id=storage_result.sha256,
artifact_id=sha256_hash,
created_by=user_id,
)
db.add(new_tag)
@@ -586,16 +575,12 @@ def upload_artifact(
db.commit()
return UploadResponse(
artifact_id=storage_result.sha256,
sha256=storage_result.sha256,
size=storage_result.size,
artifact_id=sha256_hash,
size=size,
project=project_name,
package=package_name,
tag=tag,
checksum_md5=storage_result.md5,
checksum_sha1=storage_result.sha1,
s3_etag=storage_result.s3_etag,
format_metadata=artifact.artifact_metadata,
format_metadata=artifact.format_metadata,
deduplicated=deduplicated,
)

View File

@@ -99,13 +99,9 @@ class PackageDetailResponse(BaseModel):
# Artifact schemas
class ArtifactResponse(BaseModel):
id: str
sha256: str # Explicit SHA256 field (same as id)
size: int
content_type: Optional[str]
original_name: Optional[str]
checksum_md5: Optional[str] = None
checksum_sha1: Optional[str] = None
s3_etag: Optional[str] = None
created_at: datetime
created_by: str
ref_count: int
@@ -177,13 +173,9 @@ class ArtifactTagInfo(BaseModel):
class ArtifactDetailResponse(BaseModel):
"""Artifact with list of tags/packages referencing it"""
id: str
sha256: str # Explicit SHA256 field (same as id)
size: int
content_type: Optional[str]
original_name: Optional[str]
checksum_md5: Optional[str] = None
checksum_sha1: Optional[str] = None
s3_etag: Optional[str] = None
created_at: datetime
created_by: str
ref_count: int
@@ -197,13 +189,9 @@ class ArtifactDetailResponse(BaseModel):
class PackageArtifactResponse(BaseModel):
"""Artifact with tags for package artifact listing"""
id: str
sha256: str # Explicit SHA256 field (same as id)
size: int
content_type: Optional[str]
original_name: Optional[str]
checksum_md5: Optional[str] = None
checksum_sha1: Optional[str] = None
s3_etag: Optional[str] = None
created_at: datetime
created_by: str
format_metadata: Optional[Dict[str, Any]] = None
@@ -216,14 +204,10 @@ class PackageArtifactResponse(BaseModel):
# Upload response
class UploadResponse(BaseModel):
artifact_id: str
sha256: str # Explicit SHA256 field (same as artifact_id)
size: int
project: str
package: str
tag: Optional[str]
checksum_md5: Optional[str] = None
checksum_sha1: Optional[str] = None
s3_etag: Optional[str] = None
format_metadata: Optional[Dict[str, Any]] = None
deduplicated: bool = False

View File

@@ -1,6 +1,6 @@
import hashlib
import logging
from typing import BinaryIO, Tuple, Optional, Dict, Any, Generator, NamedTuple
from typing import BinaryIO, Tuple, Optional, Dict, Any, Generator
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError
@@ -18,16 +18,6 @@ MULTIPART_CHUNK_SIZE = 10 * 1024 * 1024
HASH_CHUNK_SIZE = 8 * 1024 * 1024
class StorageResult(NamedTuple):
"""Result of storing a file with all computed checksums"""
sha256: str
size: int
s3_key: str
md5: Optional[str] = None
sha1: Optional[str] = None
s3_etag: Optional[str] = None
class S3Storage:
def __init__(self):
config = Config(s3={"addressing_style": "path"} if settings.s3_use_path_style else {})
@@ -44,9 +34,9 @@ class S3Storage:
# Store active multipart uploads for resumable support
self._active_uploads: Dict[str, Dict[str, Any]] = {}
def store(self, file: BinaryIO, content_length: Optional[int] = None) -> StorageResult:
def store(self, file: BinaryIO, content_length: Optional[int] = None) -> Tuple[str, int, str]:
"""
Store a file and return StorageResult with all checksums.
Store a file and return its SHA256 hash, size, and s3_key.
Content-addressable: if the file already exists, just return the hash.
Uses multipart upload for files larger than MULTIPART_THRESHOLD.
"""
@@ -56,76 +46,45 @@ class S3Storage:
else:
return self._store_multipart(file, content_length)
def _store_simple(self, file: BinaryIO) -> StorageResult:
def _store_simple(self, file: BinaryIO) -> Tuple[str, int, str]:
"""Store a small file using simple put_object"""
# Read file and compute all hashes
# Read file and compute hash
content = file.read()
sha256_hash = hashlib.sha256(content).hexdigest()
md5_hash = hashlib.md5(content).hexdigest()
sha1_hash = hashlib.sha1(content).hexdigest()
size = len(content)
# Check if already exists
s3_key = f"fruits/{sha256_hash[:2]}/{sha256_hash[2:4]}/{sha256_hash}"
s3_etag = None
if not self._exists(s3_key):
response = self.client.put_object(
self.client.put_object(
Bucket=self.bucket,
Key=s3_key,
Body=content,
)
s3_etag = response.get("ETag", "").strip('"')
else:
# Get existing ETag
obj_info = self.get_object_info(s3_key)
if obj_info:
s3_etag = obj_info.get("etag", "").strip('"')
return StorageResult(
sha256=sha256_hash,
size=size,
s3_key=s3_key,
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
)
return sha256_hash, size, s3_key
def _store_multipart(self, file: BinaryIO, content_length: int) -> StorageResult:
def _store_multipart(self, file: BinaryIO, content_length: int) -> Tuple[str, int, str]:
"""Store a large file using S3 multipart upload with streaming hash computation"""
# First pass: compute all hashes by streaming through file
sha256_hasher = hashlib.sha256()
md5_hasher = hashlib.md5()
sha1_hasher = hashlib.sha1()
# First pass: compute hash by streaming through file
hasher = hashlib.sha256()
size = 0
# Read file in chunks to compute hashes
# Read file in chunks to compute hash
while True:
chunk = file.read(HASH_CHUNK_SIZE)
if not chunk:
break
sha256_hasher.update(chunk)
md5_hasher.update(chunk)
sha1_hasher.update(chunk)
hasher.update(chunk)
size += len(chunk)
sha256_hash = sha256_hasher.hexdigest()
md5_hash = md5_hasher.hexdigest()
sha1_hash = sha1_hasher.hexdigest()
sha256_hash = hasher.hexdigest()
s3_key = f"fruits/{sha256_hash[:2]}/{sha256_hash[2:4]}/{sha256_hash}"
# Check if already exists (deduplication)
if self._exists(s3_key):
obj_info = self.get_object_info(s3_key)
s3_etag = obj_info.get("etag", "").strip('"') if obj_info else None
return StorageResult(
sha256=sha256_hash,
size=size,
s3_key=s3_key,
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
)
return sha256_hash, size, s3_key
# Seek back to start for upload
file.seek(0)
@@ -157,22 +116,14 @@ class S3Storage:
part_number += 1
# Complete multipart upload
complete_response = self.client.complete_multipart_upload(
self.client.complete_multipart_upload(
Bucket=self.bucket,
Key=s3_key,
UploadId=upload_id,
MultipartUpload={"Parts": parts},
)
s3_etag = complete_response.get("ETag", "").strip('"')
return StorageResult(
sha256=sha256_hash,
size=size,
s3_key=s3_key,
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
)
return sha256_hash, size, s3_key
except Exception as e:
# Abort multipart upload on failure
@@ -184,50 +135,33 @@ class S3Storage:
)
raise
def store_streaming(self, chunks: Generator[bytes, None, None]) -> StorageResult:
def store_streaming(self, chunks: Generator[bytes, None, None]) -> Tuple[str, int, str]:
"""
Store a file from a stream of chunks.
First accumulates to compute hash, then uploads.
For truly large files, consider using initiate_resumable_upload instead.
"""
# Accumulate chunks and compute all hashes
sha256_hasher = hashlib.sha256()
md5_hasher = hashlib.md5()
sha1_hasher = hashlib.sha1()
# Accumulate chunks and compute hash
hasher = hashlib.sha256()
all_chunks = []
size = 0
for chunk in chunks:
sha256_hasher.update(chunk)
md5_hasher.update(chunk)
sha1_hasher.update(chunk)
hasher.update(chunk)
all_chunks.append(chunk)
size += len(chunk)
sha256_hash = sha256_hasher.hexdigest()
md5_hash = md5_hasher.hexdigest()
sha1_hash = sha1_hasher.hexdigest()
sha256_hash = hasher.hexdigest()
s3_key = f"fruits/{sha256_hash[:2]}/{sha256_hash[2:4]}/{sha256_hash}"
s3_etag = None
# Check if already exists
if self._exists(s3_key):
obj_info = self.get_object_info(s3_key)
s3_etag = obj_info.get("etag", "").strip('"') if obj_info else None
return StorageResult(
sha256=sha256_hash,
size=size,
s3_key=s3_key,
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
)
return sha256_hash, size, s3_key
# Upload based on size
if size < MULTIPART_THRESHOLD:
content = b"".join(all_chunks)
response = self.client.put_object(Bucket=self.bucket, Key=s3_key, Body=content)
s3_etag = response.get("ETag", "").strip('"')
self.client.put_object(Bucket=self.bucket, Key=s3_key, Body=content)
else:
# Use multipart for large files
mpu = self.client.create_multipart_upload(Bucket=self.bucket, Key=s3_key)
@@ -271,13 +205,12 @@ class S3Storage:
"ETag": response["ETag"],
})
complete_response = self.client.complete_multipart_upload(
self.client.complete_multipart_upload(
Bucket=self.bucket,
Key=s3_key,
UploadId=upload_id,
MultipartUpload={"Parts": parts},
)
s3_etag = complete_response.get("ETag", "").strip('"')
except Exception as e:
logger.error(f"Streaming multipart upload failed: {e}")
@@ -288,14 +221,7 @@ class S3Storage:
)
raise
return StorageResult(
sha256=sha256_hash,
size=size,
s3_key=s3_key,
md5=md5_hash,
sha1=sha1_hash,
s3_etag=s3_etag,
)
return sha256_hash, size, s3_key
def initiate_resumable_upload(self, expected_hash: str) -> Dict[str, Any]:
"""

View File

@@ -1,122 +0,0 @@
version: '3.8'
services:
orchard-server:
build:
context: .
dockerfile: Dockerfile.local
ports:
- "8080:8080"
environment:
- ORCHARD_SERVER_HOST=0.0.0.0
- ORCHARD_SERVER_PORT=8080
- ORCHARD_DATABASE_HOST=postgres
- ORCHARD_DATABASE_PORT=5432
- ORCHARD_DATABASE_USER=orchard
- ORCHARD_DATABASE_PASSWORD=orchard_secret
- ORCHARD_DATABASE_DBNAME=orchard
- ORCHARD_DATABASE_SSLMODE=disable
- ORCHARD_S3_ENDPOINT=http://minio:9000
- ORCHARD_S3_REGION=us-east-1
- ORCHARD_S3_BUCKET=orchard-artifacts
- ORCHARD_S3_ACCESS_KEY_ID=minioadmin
- ORCHARD_S3_SECRET_ACCESS_KEY=minioadmin
- ORCHARD_S3_USE_PATH_STYLE=true
- ORCHARD_REDIS_HOST=redis
- ORCHARD_REDIS_PORT=6379
depends_on:
postgres:
condition: service_healthy
minio:
condition: service_healthy
redis:
condition: service_healthy
networks:
- orchard-network
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
interval: 30s
timeout: 3s
start_period: 10s
retries: 3
postgres:
image: postgres:16-alpine
environment:
- POSTGRES_USER=orchard
- POSTGRES_PASSWORD=orchard_secret
- POSTGRES_DB=orchard
volumes:
- postgres-data-local:/var/lib/postgresql/data
- ./migrations:/docker-entrypoint-initdb.d:ro
ports:
- "5432:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U orchard -d orchard"]
interval: 10s
timeout: 5s
retries: 5
networks:
- orchard-network
restart: unless-stopped
minio:
image: minio/minio:latest
command: server /data --console-address ":9001"
environment:
- MINIO_ROOT_USER=minioadmin
- MINIO_ROOT_PASSWORD=minioadmin
volumes:
- minio-data-local:/data
ports:
- "9000:9000"
- "9001:9001"
healthcheck:
test: ["CMD", "mc", "ready", "local"]
interval: 10s
timeout: 5s
retries: 5
networks:
- orchard-network
restart: unless-stopped
minio-init:
image: minio/mc:latest
depends_on:
minio:
condition: service_healthy
entrypoint: >
/bin/sh -c "
mc alias set myminio http://minio:9000 minioadmin minioadmin;
mc mb myminio/orchard-artifacts --ignore-existing;
mc anonymous set download myminio/orchard-artifacts;
exit 0;
"
networks:
- orchard-network
redis:
image: redis:7-alpine
command: redis-server --appendonly yes
volumes:
- redis-data-local:/data
ports:
- "6379:6379"
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
timeout: 5s
retries: 5
networks:
- orchard-network
restart: unless-stopped
volumes:
postgres-data-local:
minio-data-local:
redis-data-local:
networks:
orchard-network:
driver: bridge

View File

@@ -36,7 +36,7 @@ services:
restart: unless-stopped
postgres:
image: containers.global.bsf.tools/postgres:16-alpine
image: postgres:16-alpine
environment:
- POSTGRES_USER=orchard
- POSTGRES_PASSWORD=orchard_secret
@@ -56,7 +56,7 @@ services:
restart: unless-stopped
minio:
image: containers.global.bsf.tools/minio/minio:latest
image: minio/minio:latest
command: server /data --console-address ":9001"
environment:
- MINIO_ROOT_USER=minioadmin
@@ -76,7 +76,7 @@ services:
restart: unless-stopped
minio-init:
image: containers.global.bsf.tools/minio/mc:latest
image: minio/mc:latest
depends_on:
minio:
condition: service_healthy
@@ -91,7 +91,7 @@ services:
- orchard-network
redis:
image: containers.global.bsf.tools/redis:7-alpine
image: redis:7-alpine
command: redis-server --appendonly yes
volumes:
- redis-data:/data

View File

@@ -1,504 +0,0 @@
# Integrity Verification Workflow Design
This document defines the process for SHA256 checksum verification on artifact downloads, including failure handling and retry mechanisms.
## Overview
Orchard uses content-addressable storage where the artifact ID is the SHA256 hash of the content. This design leverages that property to provide configurable integrity verification during downloads.
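As a small illustration of that property (not part of the design itself), the storage key is derived directly from the content hash, mirroring the `fruits/<aa>/<bb>/<sha256>` layout used in the S3 storage backend; the helper name below is illustrative.
```python
import hashlib

def s3_key_for(content: bytes) -> str:
    """Derive the content-addressed S3 key from the SHA256 of the payload."""
    sha256_hash = hashlib.sha256(content).hexdigest()
    # Two levels of prefix fan-out, then the full hash, matching the storage layout.
    return f"fruits/{sha256_hash[:2]}/{sha256_hash[2:4]}/{sha256_hash}"
```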
## Current State
| Aspect | Status |
|--------|--------|
| Download streams content directly from S3 | ✅ Implemented |
| Artifact ID is the SHA256 hash | ✅ Implemented |
| S3 key derived from SHA256 hash | ✅ Implemented |
| Verification during download | ❌ Not implemented |
| Checksum headers in response | ❌ Not implemented |
| Retry mechanism on failure | ❌ Not implemented |
| Failure handling beyond S3 errors | ❌ Not implemented |
## Verification Modes
The verification mode is selected per request via the `?verify=<mode>` query parameter, or falls back to the server-wide default set by `ORCHARD_VERIFY_MODE`. A small mode-resolution sketch follows the table below.
| Mode | Performance | Integrity | Use Case |
|------|-------------|-----------|----------|
| `none` | ⚡ Fastest | Client-side | Trusted networks, high throughput |
| `header` | ⚡ Fast | Client-side | Standard downloads, client verification |
| `stream` | 🔄 Moderate | Post-hoc server | Logging/auditing, non-blocking |
| `pre` | 🐢 Slower | Guaranteed | Critical downloads, untrusted storage |
| `strict` | 🐢 Slower | Guaranteed + Alert | Security-sensitive, compliance |
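A minimal sketch of that mode resolution; the helper name and signature are illustrative, not existing code:
```python
from typing import Optional

VALID_MODES = {"none", "header", "stream", "pre", "strict"}

def resolve_verify_mode(query_mode: Optional[str], default_mode: str, allow_override: bool) -> str:
    """Pick the effective verification mode for a download request."""
    # A per-request override only applies when the server allows it and the value is valid.
    if allow_override and query_mode in VALID_MODES:
        return query_mode
    return default_mode
```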
### Mode: None (Default)
**Behavior:**
- Stream content directly from S3 with no server-side processing
- Maximum download performance
- Client is responsible for verification
**Headers Returned:**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
```
**Flow:**
```
Client Request → Lookup Artifact → Stream from S3 → Client
```
### Mode: Header
**Behavior:**
- Stream content directly from S3
- Include comprehensive checksum headers
- Client performs verification using headers
**Headers Returned:**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
Digest: sha-256=<base64_encoded_hash>
ETag: "<sha256_hash>"
X-Content-SHA256: <expected_hash>
```
**Flow:**
```
Client Request → Lookup Artifact → Add Headers → Stream from S3 → Client Verifies
```
**Client Verification Example:**
```bash
# Download and verify
curl -OJ https://orchard/project/foo/bar/+/v1.0.0
EXPECTED=$(curl -sI https://orchard/project/foo/bar/+/v1.0.0 | grep -i x-checksum-sha256 | tr -d '\r' | cut -d' ' -f2)
ACTUAL=$(sha256sum downloaded_file | cut -d' ' -f1)
[ "$EXPECTED" = "$ACTUAL" ] && echo "OK" || echo "MISMATCH"
```
### Mode: Stream (Post-Hoc Verification)
**Behavior:**
- Wrap S3 stream with `HashingStreamWrapper`
- Compute SHA256 incrementally while streaming to client
- Verify hash after stream completes
- Log verification result
- Cannot reject content (already sent to client)
**Headers Returned:**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
X-Verify-Mode: stream
Trailer: X-Verified
```
**Trailers (if client supports):**
```
X-Verified: true|false
X-Computed-SHA256: <computed_hash>
```
**Flow:**
```
Client Request → Lookup Artifact → Wrap Stream → Stream to Client
                                   (compute hash incrementally while streaming)
Stream complete → Verify computed hash → Log result
```
**Implementation:**
```python
import hashlib
from typing import Callable

class HashingStreamWrapper:
def __init__(self, stream, expected_hash: str, on_complete: Callable):
self.stream = stream
self.hasher = hashlib.sha256()
self.expected_hash = expected_hash
self.on_complete = on_complete
def __iter__(self):
for chunk in self.stream:
self.hasher.update(chunk)
yield chunk
# Stream complete, verify
computed = self.hasher.hexdigest()
self.on_complete(computed == self.expected_hash, computed)
```
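For context, a hedged sketch of how the wrapper above might be attached to a streaming response, assuming a FastAPI/Starlette `StreamingResponse`; the function and logging names are illustrative:
```python
import logging

from fastapi.responses import StreamingResponse

logger = logging.getLogger(__name__)

def stream_with_posthoc_verification(s3_stream, artifact) -> StreamingResponse:
    """Stream the artifact while checking its SHA256 after the last chunk."""
    def on_complete(verified: bool, computed: str) -> None:
        # Post-hoc only: the bytes are already on the wire, so just record the outcome.
        if verified:
            logger.info("verification.success artifact=%s", artifact.id)
        else:
            logger.error("verification.failure artifact=%s computed=%s", artifact.id, computed)

    wrapped = HashingStreamWrapper(s3_stream, expected_hash=artifact.id, on_complete=on_complete)
    return StreamingResponse(
        wrapped,
        media_type=artifact.content_type or "application/octet-stream",
        headers={"X-Checksum-SHA256": artifact.id, "X-Verify-Mode": "stream"},
    )
```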
### Mode: Pre-Verify (Blocking)
**Behavior:**
- Download entire content from S3 to memory/temp file
- Compute SHA256 hash before sending to client
- On match: stream verified content to client
- On mismatch: retry from S3 (up to N times)
- If retries exhausted: return 500 error
**Headers Returned:**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
X-Verify-Mode: pre
X-Verified: true
```
**Flow:**
```
Client Request → Lookup Artifact → Download from S3 → Compute Hash
  Hash matches?
    Yes → Stream to Client
    No  → Retries remaining? Yes → retry download
                             No  → 500 Error
```
**Memory Considerations:**
- For files < `ORCHARD_VERIFY_MEMORY_LIMIT` (default 100MB): buffer in memory
- For larger files: use temporary file with streaming hash computation
- Cleanup temp files after response sent
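A minimal sketch of that buffering decision, assuming a boto3-style body object with a file-like `read()`; the function name and return shape are illustrative:
```python
import hashlib
import os
import tempfile

# Default mirrors ORCHARD_VERIFY_MEMORY_LIMIT (100MB).
MEMORY_LIMIT = int(os.environ.get("ORCHARD_VERIFY_MEMORY_LIMIT", 100 * 1024 * 1024))

def preverify(s3_body, expected_hash: str, expected_size: int):
    """Hash the object before anything is sent to the client.

    Small objects are buffered in memory; larger ones spill to a temp file while
    the SHA256 is computed incrementally. Returns (verified, source), where
    source is the in-memory bytes or a temp file rewound to the start.
    """
    if expected_size <= MEMORY_LIMIT:
        content = s3_body.read()
        return hashlib.sha256(content).hexdigest() == expected_hash, content

    hasher = hashlib.sha256()
    tmp = tempfile.TemporaryFile()
    for chunk in iter(lambda: s3_body.read(8 * 1024 * 1024), b""):
        hasher.update(chunk)
        tmp.write(chunk)
    tmp.seek(0)
    return hasher.hexdigest() == expected_hash, tmp
```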
### Mode: Strict
**Behavior:**
- Same as pre-verify but with no retries
- Fail immediately on any mismatch
- Quarantine artifact on failure (mark as potentially corrupted)
- Trigger alert/notification on failure
- For security-critical downloads
**Headers Returned (on success):**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
X-Verify-Mode: strict
X-Verified: true
```
**Error Response (on failure):**
```json
{
"error": "integrity_verification_failed",
"message": "Artifact content does not match expected checksum",
"expected_hash": "<expected>",
"computed_hash": "<computed>",
"artifact_id": "<id>",
"action_taken": "quarantined"
}
```
**Quarantine Process:**
1. Mark artifact `status = 'quarantined'` in database
2. Log security event to audit_logs
3. Optionally notify via webhook/email
4. Artifact becomes unavailable for download until resolved
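A rough sketch of steps 1–2, assuming a `status` column on the Artifact model and an `AuditLog` ORM model (both hypothetical here, named only for illustration):
```python
def quarantine_artifact(db, artifact, computed_hash: str) -> None:
    """Flag the artifact as quarantined and record a security audit event."""
    artifact.status = "quarantined"  # assumed column, not an existing field
    db.add(AuditLog(                 # assumed model mapping the audit_logs table
        action="artifact.quarantined",
        resource=f"artifact/{artifact.id}",
        details={"expected_hash": artifact.id, "computed_hash": computed_hash},
    ))
    db.commit()
```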
## Failure Detection
### Failure Types
| Failure Type | Detection Method | Severity |
|--------------|------------------|----------|
| Hash mismatch | Computed SHA256 ≠ Expected | Critical |
| Size mismatch | Actual bytes ≠ `Content-Length` | High |
| S3 read error | boto3 exception | Medium |
| Truncated content | Stream ends early | High |
| S3 object missing | `NoSuchKey` error | Critical |
| ETag mismatch | S3 ETag ≠ expected | Medium |
### Detection Implementation
```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class VerificationResult:
success: bool
failure_type: Optional[str] # hash_mismatch, size_mismatch, etc.
expected_hash: str
computed_hash: Optional[str]
expected_size: int
actual_size: Optional[int]
error_message: Optional[str]
retry_count: int
```
## Retry Mechanism
### Configuration
| Environment Variable | Default | Description |
|---------------------|---------|-------------|
| `ORCHARD_VERIFY_MAX_RETRIES` | 3 | Maximum retry attempts |
| `ORCHARD_VERIFY_RETRY_DELAY_MS` | 100 | Base delay between retries |
| `ORCHARD_VERIFY_RETRY_BACKOFF` | 2.0 | Exponential backoff multiplier |
| `ORCHARD_VERIFY_RETRY_MAX_DELAY_MS` | 5000 | Maximum delay cap |
### Backoff Formula
```
delay = min(base_delay * (backoff ^ attempt), max_delay)
```
Example with defaults:
- Attempt 1: 100ms
- Attempt 2: 200ms
- Attempt 3: 400ms
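A small helper implementing this formula (the `calculate_backoff` referenced in the retry flow below); the defaults mirror the configuration table above, and `attempt` is zero-based:
```python
def calculate_backoff(attempt: int, base_delay_ms: float = 100.0,
                      backoff: float = 2.0, max_delay_ms: float = 5000.0) -> float:
    """Exponential backoff with a cap, returned in milliseconds."""
    return min(base_delay_ms * (backoff ** attempt), max_delay_ms)
```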
### Retry Flow
```python
import asyncio

# fetch_from_s3, compute_sha256, IntegrityError, and S3Error are helpers assumed
# to exist elsewhere in the service; calculate_backoff is sketched above.
async def download_with_retry(artifact, max_retries=3):
for attempt in range(max_retries + 1):
try:
content = await fetch_from_s3(artifact.s3_key)
computed_hash = compute_sha256(content)
if computed_hash == artifact.id:
return content # Success
# Hash mismatch
log.warning(f"Verification failed, attempt {attempt + 1}/{max_retries + 1}")
if attempt < max_retries:
delay = calculate_backoff(attempt)
await asyncio.sleep(delay / 1000)
else:
raise IntegrityError("Max retries exceeded")
except S3Error as e:
if attempt < max_retries:
delay = calculate_backoff(attempt)
await asyncio.sleep(delay / 1000)
else:
raise
```
### Retryable vs Non-Retryable Failures
**Retryable:**
- S3 read timeout
- S3 connection error
- Hash mismatch (may be transient S3 issue)
- Truncated content
**Non-Retryable:**
- S3 object not found (404)
- S3 access denied (403)
- Artifact not in database
- Strict mode failures
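One way to encode this classification, sketched under the assumption that failures are tagged with `failure_type` strings like those in `VerificationResult` above (the exact tags are illustrative):
```python
RETRYABLE_FAILURES = {
    "s3_timeout",
    "s3_connection_error",
    "hash_mismatch",      # may be a transient S3 issue
    "truncated_content",
}

def is_retryable(failure_type: str, verify_mode: str) -> bool:
    """Strict mode never retries; otherwise consult the failure classification."""
    if verify_mode == "strict":
        return False
    return failure_type in RETRYABLE_FAILURES
```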
## Configuration Reference
### Environment Variables
```bash
# Verification mode (none, header, stream, pre, strict)
ORCHARD_VERIFY_MODE=none
# Retry settings
ORCHARD_VERIFY_MAX_RETRIES=3
ORCHARD_VERIFY_RETRY_DELAY_MS=100
ORCHARD_VERIFY_RETRY_BACKOFF=2.0
ORCHARD_VERIFY_RETRY_MAX_DELAY_MS=5000
# Memory limit for pre-verify buffering (bytes)
ORCHARD_VERIFY_MEMORY_LIMIT=104857600 # 100MB
# Strict mode settings
ORCHARD_VERIFY_QUARANTINE_ON_FAILURE=true
ORCHARD_VERIFY_ALERT_WEBHOOK=https://alerts.example.com/webhook
# Allow per-request mode override
ORCHARD_VERIFY_ALLOW_OVERRIDE=true
```
### Per-Request Override
When `ORCHARD_VERIFY_ALLOW_OVERRIDE=true`, clients can specify verification mode:
```
GET /api/v1/project/foo/bar/+/v1.0.0?verify=pre
GET /api/v1/project/foo/bar/+/v1.0.0?verify=none
```
## API Changes
### Download Endpoint
**Request:**
```
GET /api/v1/project/{project}/{package}/+/{ref}?verify={mode}
```
**New Query Parameters:**
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `verify` | string | from config | Verification mode |
**New Response Headers:**
| Header | Description |
|--------|-------------|
| `X-Checksum-SHA256` | Expected SHA256 hash |
| `X-Verify-Mode` | Active verification mode |
| `X-Verified` | `true` if server verified content |
| `Digest` | RFC 3230 digest header |
### New Endpoint: Verify Artifact
**Request:**
```
POST /api/v1/project/{project}/{package}/+/{ref}/verify
```
**Response:**
```json
{
"artifact_id": "abc123...",
"verified": true,
"expected_hash": "abc123...",
"computed_hash": "abc123...",
"size_match": true,
"expected_size": 1048576,
"actual_size": 1048576,
"verification_time_ms": 45
}
```
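A hedged sketch of what the handler behind this endpoint could look like, assuming FastAPI routing and a streaming read from S3; `resolve_artifact` and `storage.stream` are assumed helpers, not existing functions:
```python
import hashlib
import time

@router.post("/api/v1/project/{project}/{package}/+/{ref}/verify")
def verify_artifact(project: str, package: str, ref: str, db=Depends(get_db)):
    artifact = resolve_artifact(db, project, package, ref)  # assumed lookup helper
    started = time.monotonic()
    hasher = hashlib.sha256()
    actual_size = 0
    for chunk in storage.stream(artifact.s3_key):  # assumed streaming read from S3
        hasher.update(chunk)
        actual_size += len(chunk)
    computed = hasher.hexdigest()
    return {
        "artifact_id": artifact.id,
        "verified": computed == artifact.id,
        "expected_hash": artifact.id,
        "computed_hash": computed,
        "size_match": actual_size == artifact.size,
        "expected_size": artifact.size,
        "actual_size": actual_size,
        "verification_time_ms": int((time.monotonic() - started) * 1000),
    }
```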
## Logging and Monitoring
### Log Events
| Event | Level | When |
|-------|-------|------|
| `verification.success` | INFO | Hash verified successfully |
| `verification.failure` | ERROR | Hash mismatch detected |
| `verification.retry` | WARN | Retry attempt initiated |
| `verification.quarantine` | ERROR | Artifact quarantined |
| `verification.skip` | DEBUG | Verification skipped (mode=none) |
### Metrics
| Metric | Type | Description |
|--------|------|-------------|
| `orchard_verification_total` | Counter | Total verification attempts |
| `orchard_verification_failures` | Counter | Failed verifications |
| `orchard_verification_retries` | Counter | Retry attempts |
| `orchard_verification_duration_ms` | Histogram | Verification time |
### Audit Log Entry
```json
{
"action": "artifact.download.verified",
"resource": "project/foo/package/bar/artifact/abc123",
"user_id": "user@example.com",
"details": {
"verification_mode": "pre",
"verified": true,
"retry_count": 0,
"duration_ms": 45
}
}
```
## Security Considerations
1. **Strict Mode for Sensitive Data**: Use strict mode for artifacts containing credentials, certificates, or security-critical code.
2. **Quarantine Isolation**: Quarantined artifacts should be moved to a separate S3 prefix or bucket for forensic analysis.
3. **Alert on Repeated Failures**: Multiple verification failures for the same artifact may indicate storage corruption or tampering.
4. **Audit Trail**: All verification events should be logged for compliance and forensic purposes.
5. **Client Trust**: In `none` and `header` modes, clients must implement their own verification for security guarantees.
## Implementation Phases
### Phase 1: Headers Only
- Add `X-Checksum-SHA256` header to all downloads
- Add `verify=header` mode support
- Add configuration options
### Phase 2: Stream Verification
- Implement `HashingStreamWrapper`
- Add `verify=stream` mode
- Add verification logging
### Phase 3: Pre-Verification
- Implement buffered verification
- Add retry mechanism
- Add `verify=pre` mode
### Phase 4: Strict Mode
- Implement quarantine mechanism
- Add alerting integration
- Add `verify=strict` mode
## Client Integration Examples
### curl with Verification
```bash
#!/bin/bash
URL="https://orchard.example.com/api/v1/project/myproject/mypackage/+/v1.0.0"
# Get expected hash from headers
EXPECTED=$(curl -sI "$URL" | grep -i "X-Checksum-SHA256" | tr -d '\r' | cut -d' ' -f2)
# Download file
curl -sO "$URL"
FILENAME=$(basename "$URL")
# Verify
ACTUAL=$(sha256sum "$FILENAME" | cut -d' ' -f1)
if [ "$EXPECTED" = "$ACTUAL" ]; then
echo "✓ Verification passed"
else
echo "✗ Verification FAILED"
echo " Expected: $EXPECTED"
echo " Actual: $ACTUAL"
exit 1
fi
```
### Python Client
```python
import hashlib
import requests
def download_verified(url: str) -> bytes:
# Get headers first
head = requests.head(url)
expected_hash = head.headers.get('X-Checksum-SHA256')
expected_size = int(head.headers.get('Content-Length', 0))
# Download content
response = requests.get(url)
content = response.content
# Verify size
if len(content) != expected_size:
raise ValueError(f"Size mismatch: {len(content)} != {expected_size}")
# Verify hash
actual_hash = hashlib.sha256(content).hexdigest()
if actual_hash != expected_hash:
raise ValueError(f"Hash mismatch: {actual_hash} != {expected_hash}")
return content
```
### Server-Side Verification
```bash
# Force server to verify before sending
curl -O "https://orchard.example.com/api/v1/project/myproject/mypackage/+/v1.0.0?verify=pre"
# Check if verification was performed
curl -I "https://orchard.example.com/api/v1/project/myproject/mypackage/+/v1.0.0?verify=pre" | grep X-Verified
# X-Verified: true
```

View File

@@ -41,8 +41,6 @@ CREATE TABLE IF NOT EXISTS artifacts (
content_type VARCHAR(255),
original_name VARCHAR(1024),
checksum_md5 VARCHAR(32), -- MD5 hash for additional verification
checksum_sha1 VARCHAR(40), -- SHA1 hash for compatibility
s3_etag VARCHAR(64), -- S3 ETag for verification
metadata JSONB, -- format-specific metadata
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
created_by VARCHAR(255) NOT NULL,
@@ -53,8 +51,6 @@ CREATE TABLE IF NOT EXISTS artifacts (
CREATE INDEX idx_artifacts_created_at ON artifacts(created_at);
CREATE INDEX idx_artifacts_created_by ON artifacts(created_by);
CREATE INDEX idx_artifacts_metadata ON artifacts USING GIN (metadata);
CREATE INDEX idx_artifacts_checksum_md5 ON artifacts(checksum_md5) WHERE checksum_md5 IS NOT NULL;
CREATE INDEX idx_artifacts_checksum_sha1 ON artifacts(checksum_sha1) WHERE checksum_sha1 IS NOT NULL;
-- Tags (Aliases pointing to artifacts)
CREATE TABLE IF NOT EXISTS tags (

View File

@@ -1,12 +0,0 @@
-- Migration 003: Additional Checksum Fields
-- Adds checksum_sha1 and s3_etag fields to artifacts table
-- ============================================
-- Artifacts: Add checksum_sha1 and s3_etag fields
-- ============================================
ALTER TABLE artifacts ADD COLUMN IF NOT EXISTS checksum_sha1 VARCHAR(40);
ALTER TABLE artifacts ADD COLUMN IF NOT EXISTS s3_etag VARCHAR(64);
-- Create indexes for checksum lookups (optional, for verification queries)
CREATE INDEX IF NOT EXISTS idx_artifacts_checksum_md5 ON artifacts(checksum_md5) WHERE checksum_md5 IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_artifacts_checksum_sha1 ON artifacts(checksum_sha1) WHERE checksum_sha1 IS NOT NULL;