6 Commits

Author SHA1 Message Date
Mondo Diaz
ebcd1944bf Merge remote-tracking branch 'origin/main' into feature/integrity-verification-design 2025-12-15 12:43:05 -06:00
Mondo Diaz
b0d65f3509 Add integrity verification workflow design document
Define SHA256 checksum verification process for artifact downloads:
- Five verification modes: none, header, stream, pre, strict
- Failure detection for hash/size mismatch, S3 errors, truncation
- Retry mechanism with exponential backoff
- Quarantine process for strict mode failures
- Configuration options and client integration examples
2025-12-15 12:30:18 -06:00
Dane Moss
0eb2deb4ca Merge branch 'update_urls' into 'main'
update URLs to point to BSF

Closes #46

See merge request esv/bsf/bsf-integration/orchard/orchard-mvp!14
2025-12-15 11:30:07 -07:00
Dane Moss
3fe421f31d update URLs to point to BSF 2025-12-15 11:30:07 -07:00
Mondo Diaz
68660eacf6 Merge branch 'feature/schema-enhancements' into 'main'
Add schema enhancements for uploads, artifacts, and audit tracking

Closes #16

See merge request esv/bsf/bsf-integration/orchard/orchard-mvp!13
2025-12-12 15:23:50 -06:00
Mondo Diaz
b52c8840f1 Add schema enhancements for uploads, artifacts, and audit tracking 2025-12-12 15:23:50 -06:00
7 changed files with 808 additions and 11 deletions

View File

@@ -7,6 +7,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
## [0.2.0] - 2025-12-15
### Changed
- Updated images to use internal container BSF proxy (#46)
### Added
- Added integrity verification workflow design document (#24)
- Added `format` and `platform` fields to packages table (#16)
- Added `checksum_md5` and `metadata` JSONB fields to artifacts table (#16)
- Added `updated_at` field to tags table (#16)
- Added `tag_name`, `user_agent`, `duration_ms`, `deduplicated`, `checksum_verified` fields to uploads table (#16)
- Added `change_type` field to tag_history table (#16)
- Added composite indexes for common query patterns (#16)
- Added GIN indexes on JSONB fields for efficient JSON queries (#16)
- Added partial index for public projects (#16)
- Added database triggers for `updated_at` timestamps (#16)
- Added database triggers for maintaining artifact `ref_count` accuracy (#16)
- Added CHECK constraints for data integrity (`size > 0`, `ref_count >= 0`) (#16)
- Added migration script `002_schema_enhancements.sql` for existing databases (#16)
## [0.1.0] - 2025-12-12
### Changed
- Changed the Dockerfile npm build arg to use the deps.global.bsf.tools URL as the default registry (#45)

View File

@@ -1,5 +1,5 @@
# Frontend build stage
FROM node:20-alpine AS frontend-builder
FROM containers.global.bsf.tools/node:20-alpine AS frontend-builder
ARG NPM_REGISTRY=https://deps.global.bsf.tools/artifactory/api/npm/registry.npmjs.org/
@@ -19,7 +19,7 @@ COPY frontend/ ./
RUN npm run build
# Runtime stage
FROM python:3.12-slim
FROM containers.global.bsf.tools/python:3.12-slim
# Disable proxy cache
RUN echo 'Acquire::http::Pipeline-Depth 0;\nAcquire::http::No-Cache true;\nAcquire::BrokenProxy true;\n' > /etc/apt/apt.conf.d/99fixbadproxy

View File

@@ -73,11 +73,12 @@ class Artifact(Base):
size = Column(BigInteger, nullable=False)
content_type = Column(String(255))
original_name = Column(String(1024))
checksum_md5 = Column(String(32)) # MD5 hash for additional verification
metadata = Column(JSON, default=dict) # Format-specific metadata
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
created_by = Column(String(255), nullable=False)
ref_count = Column(Integer, default=1)
s3_key = Column(String(1024), nullable=False)
format_metadata = Column(JSON, default=dict) # Format-specific metadata (version, etc.)
tags = relationship("Tag", back_populates="artifact")
uploads = relationship("Upload", back_populates="artifact")
@@ -99,6 +100,7 @@ class Tag(Base):
name = Column(String(255), nullable=False)
artifact_id = Column(String(64), ForeignKey("artifacts.id"), nullable=False)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow)
created_by = Column(String(255), nullable=False)
package = relationship("Package", back_populates="tags")
@@ -120,6 +122,7 @@ class TagHistory(Base):
tag_id = Column(UUID(as_uuid=True), ForeignKey("tags.id", ondelete="CASCADE"), nullable=False)
old_artifact_id = Column(String(64), ForeignKey("artifacts.id"))
new_artifact_id = Column(String(64), ForeignKey("artifacts.id"), nullable=False)
change_type = Column(String(20), nullable=False, default="update")
changed_at = Column(DateTime(timezone=True), default=datetime.utcnow)
changed_by = Column(String(255), nullable=False)
@@ -127,6 +130,8 @@ class TagHistory(Base):
__table_args__ = (
Index("idx_tag_history_tag_id", "tag_id"),
Index("idx_tag_history_changed_at", "changed_at"),
CheckConstraint("change_type IN ('create', 'update', 'delete')", name="check_change_type"),
)
@@ -137,6 +142,11 @@ class Upload(Base):
artifact_id = Column(String(64), ForeignKey("artifacts.id"), nullable=False)
package_id = Column(UUID(as_uuid=True), ForeignKey("packages.id"), nullable=False)
original_name = Column(String(1024))
tag_name = Column(String(255)) # Tag assigned during upload
user_agent = Column(String(512)) # Client identification
duration_ms = Column(Integer) # Upload timing in milliseconds
deduplicated = Column(Boolean, default=False) # Whether artifact was deduplicated
checksum_verified = Column(Boolean, default=True) # Whether checksum was verified
uploaded_at = Column(DateTime(timezone=True), default=datetime.utcnow)
uploaded_by = Column(String(255), nullable=False)
source_ip = Column(String(45))
@@ -148,6 +158,8 @@ class Upload(Base):
Index("idx_uploads_artifact_id", "artifact_id"),
Index("idx_uploads_package_id", "package_id"),
Index("idx_uploads_uploaded_at", "uploaded_at"),
Index("idx_uploads_package_uploaded_at", "package_id", "uploaded_at"),
Index("idx_uploads_uploaded_by_at", "uploaded_by", "uploaded_at"),
)
@@ -220,4 +232,6 @@ class AuditLog(Base):
Index("idx_audit_logs_resource", "resource"),
Index("idx_audit_logs_user_id", "user_id"),
Index("idx_audit_logs_timestamp", "timestamp"),
Index("idx_audit_logs_resource_timestamp", "resource", "timestamp"),
Index("idx_audit_logs_user_timestamp", "user_id", "timestamp"),
)

View File

@@ -36,7 +36,7 @@ services:
restart: unless-stopped
postgres:
image: postgres:16-alpine
image: containers.global.bsf.tools/postgres:16-alpine
environment:
- POSTGRES_USER=orchard
- POSTGRES_PASSWORD=orchard_secret
@@ -56,7 +56,7 @@ services:
restart: unless-stopped
minio:
image: minio/minio:latest
image: containers.global.bsf.tools/minio/minio:latest
command: server /data --console-address ":9001"
environment:
- MINIO_ROOT_USER=minioadmin
@@ -76,7 +76,7 @@ services:
restart: unless-stopped
minio-init:
image: minio/mc:latest
image: containers.global.bsf.tools/minio/mc:latest
depends_on:
minio:
condition: service_healthy
@@ -91,7 +91,7 @@ services:
- orchard-network
redis:
image: redis:7-alpine
image: containers.global.bsf.tools/redis:7-alpine
command: redis-server --appendonly yes
volumes:
- redis-data:/data

View File

@@ -0,0 +1,504 @@
# Integrity Verification Workflow Design
This document defines the process for SHA256 checksum verification on artifact downloads, including failure handling and retry mechanisms.
## Overview
Orchard uses content-addressable storage where the artifact ID is the SHA256 hash of the content. This design leverages that property to provide configurable integrity verification during downloads.
## Current State
| Aspect | Status |
|--------|--------|
| Download streams content directly from S3 | ✅ Implemented |
| Artifact ID is the SHA256 hash | ✅ Implemented |
| S3 key derived from SHA256 hash | ✅ Implemented |
| Verification during download | ❌ Not implemented |
| Checksum headers in response | ❌ Not implemented |
| Retry mechanism on failure | ❌ Not implemented |
| Failure handling beyond S3 errors | ❌ Not implemented |
## Verification Modes
The verification mode is selected via query parameter `?verify=<mode>` or server-wide default via `ORCHARD_VERIFY_MODE`.
| Mode | Performance | Integrity | Use Case |
|------|-------------|-----------|----------|
| `none` | ⚡ Fastest | Client-side | Trusted networks, high throughput |
| `header` | ⚡ Fast | Client-side | Standard downloads, client verification |
| `stream` | 🔄 Moderate | Post-hoc server | Logging/auditing, non-blocking |
| `pre` | 🐢 Slower | Guaranteed | Critical downloads, untrusted storage |
| `strict` | 🐢 Slower | Guaranteed + Alert | Security-sensitive, compliance |
### Mode: None (Default)
**Behavior:**
- Stream content directly from S3 with no server-side processing
- Maximum download performance
- Client is responsible for verification
**Headers Returned:**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
```
**Flow:**
```
Client Request → Lookup Artifact → Stream from S3 → Client
```
### Mode: Header
**Behavior:**
- Stream content directly from S3
- Include comprehensive checksum headers
- Client performs verification using headers
**Headers Returned:**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
Digest: sha-256=<base64_encoded_hash>
ETag: "<sha256_hash>"
X-Content-SHA256: <expected_hash>
```
**Flow:**
```
Client Request → Lookup Artifact → Add Headers → Stream from S3 → Client Verifies
```
**Client Verification Example:**
```bash
# Download to a known filename, then compare against the advertised checksum.
URL="https://orchard/project/foo/bar/+/v1.0.0"
# -f: fail on HTTP errors instead of saving the error body as the artifact.
curl -fsS -o downloaded_file "$URL"
# Header names are case-insensitive and header lines end in CRLF, so match
# case-insensitively and strip the carriage return before comparing.
EXPECTED=$(curl -fsSI "$URL" | grep -i '^X-Checksum-SHA256:' | tr -d '\r' | cut -d' ' -f2)
ACTUAL=$(sha256sum downloaded_file | cut -d' ' -f1)
# An empty EXPECTED (header missing) is treated as a mismatch.
[ -n "$EXPECTED" ] && [ "$EXPECTED" = "$ACTUAL" ] && echo "OK" || echo "MISMATCH"
```
### Mode: Stream (Post-Hoc Verification)
**Behavior:**
- Wrap S3 stream with `HashingStreamWrapper`
- Compute SHA256 incrementally while streaming to client
- Verify hash after stream completes
- Log verification result
- Cannot reject content (already sent to client)
**Headers Returned:**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
X-Verify-Mode: stream
Trailer: X-Verified, X-Computed-SHA256
```
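**Trailers (if client supports; over HTTP/1.1 trailers require chunked transfer encoding, so `Content-Length` cannot be sent alongside them):**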
```
X-Verified: true|false
X-Computed-SHA256: <computed_hash>
```
**Flow:**
```
Client Request → Lookup Artifact → Wrap Stream → Stream to Client
Compute Hash Incrementally
Verify After Complete → Log Result
```
**Implementation:**
```python
class HashingStreamWrapper:
    """Pass chunks of an upstream byte stream through while hashing them.

    The SHA256 digest is updated for every chunk yielded. When iteration
    finishes, ``on_complete(matched, computed_hash)`` is called exactly once.
    If the upstream raises mid-stream, ``on_complete(False, partial_hash)`` is
    called before the exception propagates, so a failed read is still reported
    rather than silently dropped.

    Args:
        stream: Iterable of ``bytes`` chunks (for example an S3 body iterator).
        expected_hash: Hex SHA256 digest the content is expected to have.
        on_complete: Callback taking ``(matched: bool, computed_hash: str)``.
    """

    def __init__(self, stream, expected_hash: str, on_complete: Callable):
        self.stream = stream
        self.hasher = hashlib.sha256()
        self.expected_hash = expected_hash
        self.on_complete = on_complete

    def __iter__(self):
        try:
            for chunk in self.stream:
                self.hasher.update(chunk)
                yield chunk
        except Exception:
            # Upstream failed part-way: the client received truncated content.
            # Report the failure with the digest of what was actually sent.
            # (GeneratorExit from a consumer that stops early is not an
            # Exception subclass, so it is deliberately not reported here.)
            self.on_complete(False, self.hasher.hexdigest())
            raise
        # Stream complete, verify
        computed = self.hasher.hexdigest()
        self.on_complete(computed == self.expected_hash, computed)
```
### Mode: Pre-Verify (Blocking)
**Behavior:**
- Download entire content from S3 to memory/temp file
- Compute SHA256 hash before sending to client
- On match: stream verified content to client
- On mismatch: retry from S3 (up to N times)
- If retries exhausted: return 500 error
**Headers Returned:**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
X-Verify-Mode: pre
X-Verified: true
```
**Flow:**
```
Client Request → Lookup Artifact → Download from S3 → Compute Hash
Hash Matches?
↓ ↓
Yes No
↓ ↓
Stream to Client Retry?
Yes → Loop
No → 500 Error
```
**Memory Considerations:**
- For files < `ORCHARD_VERIFY_MEMORY_LIMIT` (default 100MB): buffer in memory
- For larger files: use temporary file with streaming hash computation
- Cleanup temp files after response sent
### Mode: Strict
**Behavior:**
- Same as pre-verify but with no retries
- Fail immediately on any mismatch
- Quarantine artifact on failure (mark as potentially corrupted)
- Trigger alert/notification on failure
- For security-critical downloads
**Headers Returned (on success):**
```
X-Checksum-SHA256: <expected_hash>
Content-Length: <expected_size>
X-Verify-Mode: strict
X-Verified: true
```
**Error Response (on failure):**
```json
{
"error": "integrity_verification_failed",
"message": "Artifact content does not match expected checksum",
"expected_hash": "<expected>",
"computed_hash": "<computed>",
"artifact_id": "<id>",
"action_taken": "quarantined"
}
```
**Quarantine Process:**
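1. Mark artifact `status = 'quarantined'` in database (this requires adding a `status` column to the `artifacts` table; the current schema does not define one)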
2. Log security event to audit_logs
3. Optionally notify via webhook/email
4. Artifact becomes unavailable for download until resolved
## Failure Detection
### Failure Types
| Failure Type | Detection Method | Severity |
|--------------|------------------|----------|
| Hash mismatch | Computed SHA256 ≠ Expected | Critical |
| Size mismatch | Actual bytes ≠ `Content-Length` | High |
| S3 read error | boto3 exception | Medium |
| Truncated content | Stream ends early | High |
| S3 object missing | `NoSuchKey` error | Critical |
| ETag mismatch | S3 ETag ≠ expected | Medium |
### Detection Implementation
```python
class VerificationResult:
success: bool
failure_type: Optional[str] # hash_mismatch, size_mismatch, etc.
expected_hash: str
computed_hash: Optional[str]
expected_size: int
actual_size: Optional[int]
error_message: Optional[str]
retry_count: int
```
## Retry Mechanism
### Configuration
| Environment Variable | Default | Description |
|---------------------|---------|-------------|
| `ORCHARD_VERIFY_MAX_RETRIES` | 3 | Maximum retry attempts |
| `ORCHARD_VERIFY_RETRY_DELAY_MS` | 100 | Base delay between retries |
| `ORCHARD_VERIFY_RETRY_BACKOFF` | 2.0 | Exponential backoff multiplier |
| `ORCHARD_VERIFY_RETRY_MAX_DELAY_MS` | 5000 | Maximum delay cap |
### Backoff Formula
```
delay = min(base_delay * (backoff ^ attempt), max_delay)
```
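Example with defaults (`attempt` is zero-based: the number of attempts that have already failed minus one, matching `calculate_backoff(attempt)` in the retry flow below):
- Retry 1 (attempt = 0): 100ms
- Retry 2 (attempt = 1): 200ms
- Retry 3 (attempt = 2): 400ms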
### Retry Flow
```python
async def download_with_retry(artifact, max_retries=3):
    """Fetch an artifact from S3 and return it only if its SHA256 matches.

    Makes up to ``max_retries + 1`` attempts. After a failed attempt that is
    not the last one, sleeps for ``calculate_backoff(attempt)`` milliseconds
    before trying again.

    Args:
        artifact: Object exposing ``s3_key`` and ``id`` (the expected hash).
        max_retries: Number of retries after the first attempt; must be >= 0.

    Returns:
        The fetched content whose computed hash equals ``artifact.id``.

    Raises:
        ValueError: If ``max_retries`` is negative (previously this silently
            returned ``None`` without attempting a download).
        IntegrityError: If the hash still mismatches on the final attempt.
        S3Error: Re-raised if the final attempt fails with an S3 error.
    """
    if max_retries < 0:
        raise ValueError("max_retries must be >= 0")

    total_attempts = max_retries + 1
    for attempt in range(total_attempts):
        is_last_attempt = attempt == max_retries
        try:
            content = await fetch_from_s3(artifact.s3_key)
        except S3Error as e:
            # NOTE(review): the design lists 404/403 as non-retryable, but this
            # handler retries every S3Error — narrow it once the error type
            # exposes a status/code to distinguish them.
            if is_last_attempt:
                raise
            log.warning(f"S3 read failed ({e}), attempt {attempt + 1}/{total_attempts}")
        else:
            computed_hash = compute_sha256(content)
            if computed_hash == artifact.id:
                return content  # Success
            # Hash mismatch
            log.warning(f"Verification failed, attempt {attempt + 1}/{max_retries + 1}")
            if is_last_attempt:
                raise IntegrityError("Max retries exceeded")
        # Shared backoff for both failure kinds; delay is in milliseconds.
        delay = calculate_backoff(attempt)
        await asyncio.sleep(delay / 1000)
```
### Retryable vs Non-Retryable Failures
**Retryable:**
- S3 read timeout
- S3 connection error
- Hash mismatch (may be transient S3 issue)
- Truncated content
**Non-Retryable:**
- S3 object not found (404)
- S3 access denied (403)
- Artifact not in database
- Strict mode failures
## Configuration Reference
### Environment Variables
```bash
# Verification mode (none, header, stream, pre, strict)
ORCHARD_VERIFY_MODE=none
# Retry settings
ORCHARD_VERIFY_MAX_RETRIES=3
ORCHARD_VERIFY_RETRY_DELAY_MS=100
ORCHARD_VERIFY_RETRY_BACKOFF=2.0
ORCHARD_VERIFY_RETRY_MAX_DELAY_MS=5000
# Memory limit for pre-verify buffering (bytes)
ORCHARD_VERIFY_MEMORY_LIMIT=104857600 # 100MB
# Strict mode settings
ORCHARD_VERIFY_QUARANTINE_ON_FAILURE=true
ORCHARD_VERIFY_ALERT_WEBHOOK=https://alerts.example.com/webhook
# Allow per-request mode override
ORCHARD_VERIFY_ALLOW_OVERRIDE=true
```
### Per-Request Override
When `ORCHARD_VERIFY_ALLOW_OVERRIDE=true`, clients can specify verification mode:
```
GET /api/v1/project/foo/bar/+/v1.0.0?verify=pre
GET /api/v1/project/foo/bar/+/v1.0.0?verify=none
```
## API Changes
### Download Endpoint
**Request:**
```
GET /api/v1/project/{project}/{package}/+/{ref}?verify={mode}
```
**New Query Parameters:**
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `verify` | string | from config | Verification mode |
**New Response Headers:**
| Header | Description |
|--------|-------------|
| `X-Checksum-SHA256` | Expected SHA256 hash |
| `X-Verify-Mode` | Active verification mode |
| `X-Verified` | `true` if server verified content |
| `Digest` | RFC 3230 digest header |
### New Endpoint: Verify Artifact
**Request:**
```
POST /api/v1/project/{project}/{package}/+/{ref}/verify
```
**Response:**
```json
{
"artifact_id": "abc123...",
"verified": true,
"expected_hash": "abc123...",
"computed_hash": "abc123...",
"size_match": true,
"expected_size": 1048576,
"actual_size": 1048576,
"verification_time_ms": 45
}
```
## Logging and Monitoring
### Log Events
| Event | Level | When |
|-------|-------|------|
| `verification.success` | INFO | Hash verified successfully |
| `verification.failure` | ERROR | Hash mismatch detected |
| `verification.retry` | WARN | Retry attempt initiated |
| `verification.quarantine` | ERROR | Artifact quarantined |
| `verification.skip` | DEBUG | Verification skipped (mode=none) |
### Metrics
| Metric | Type | Description |
|--------|------|-------------|
| `orchard_verification_total` | Counter | Total verification attempts |
| `orchard_verification_failures` | Counter | Failed verifications |
| `orchard_verification_retries` | Counter | Retry attempts |
| `orchard_verification_duration_ms` | Histogram | Verification time |
### Audit Log Entry
```json
{
"action": "artifact.download.verified",
"resource": "project/foo/package/bar/artifact/abc123",
"user_id": "user@example.com",
"details": {
"verification_mode": "pre",
"verified": true,
"retry_count": 0,
"duration_ms": 45
}
}
```
## Security Considerations
1. **Strict Mode for Sensitive Data**: Use strict mode for artifacts containing credentials, certificates, or security-critical code.
2. **Quarantine Isolation**: Quarantined artifacts should be moved to a separate S3 prefix or bucket for forensic analysis.
3. **Alert on Repeated Failures**: Multiple verification failures for the same artifact may indicate storage corruption or tampering.
4. **Audit Trail**: All verification events should be logged for compliance and forensic purposes.
5. **Client Trust**: In `none` and `header` modes, clients must implement their own verification for security guarantees.
## Implementation Phases
### Phase 1: Headers Only
- Add `X-Checksum-SHA256` header to all downloads
- Add `verify=header` mode support
- Add configuration options
### Phase 2: Stream Verification
- Implement `HashingStreamWrapper`
- Add `verify=stream` mode
- Add verification logging
### Phase 3: Pre-Verification
- Implement buffered verification
- Add retry mechanism
- Add `verify=pre` mode
### Phase 4: Strict Mode
- Implement quarantine mechanism
- Add alerting integration
- Add `verify=strict` mode
## Client Integration Examples
### curl with Verification
```bash
#!/bin/bash
URL="https://orchard.example.com/api/v1/project/myproject/mypackage/+/v1.0.0"
FILENAME=$(basename "$URL")
# Get expected hash from headers (case-insensitive name, CR stripped).
# -f makes curl fail on HTTP errors instead of printing the error response.
EXPECTED=$(curl -fsSI "$URL" | grep -i "^X-Checksum-SHA256:" | tr -d '\r' | cut -d' ' -f2)
if [ -z "$EXPECTED" ]; then
echo "✗ No X-Checksum-SHA256 header returned" >&2
exit 1
fi
# Download file; with -f an HTTP error is not saved as if it were the artifact
curl -fsS -o "$FILENAME" "$URL" || exit 1
# Verify
ACTUAL=$(sha256sum "$FILENAME" | cut -d' ' -f1)
if [ "$EXPECTED" = "$ACTUAL" ]; then
echo "✓ Verification passed"
else
echo "✗ Verification FAILED"
echo " Expected: $EXPECTED"
echo " Actual: $ACTUAL"
exit 1
fi
```
### Python Client
```python
import hashlib
import requests
def download_verified(url: str, timeout: float = 30.0) -> bytes:
    """Download ``url`` and return its body only if it passes verification.

    The expected checksum and size are read from the headers of the same GET
    response that carries the content, so they cannot describe a different
    artifact than the one downloaded (a separate HEAD request could, if the
    reference moved in between).

    Args:
        url: Artifact download URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The verified response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the checksum header is missing, or the size or SHA256
            of the body does not match what the server advertised.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx rather than hashing an error page.
    response.raise_for_status()
    expected_hash = response.headers.get('X-Checksum-SHA256')
    if not expected_hash:
        raise ValueError("Response is missing the X-Checksum-SHA256 header")
    content = response.content
    # Verify size. Skipped when the header is absent, or when the body was
    # transfer-compressed: requests decodes it, so Content-Length (the encoded
    # size) would not equal len(content).
    content_length = response.headers.get('Content-Length')
    if content_length is not None and 'Content-Encoding' not in response.headers:
        expected_size = int(content_length)
        if len(content) != expected_size:
            raise ValueError(f"Size mismatch: {len(content)} != {expected_size}")
    # Verify hash
    actual_hash = hashlib.sha256(content).hexdigest()
    if actual_hash != expected_hash.strip().lower():
        raise ValueError(f"Hash mismatch: {actual_hash} != {expected_hash}")
    return content
```
### Server-Side Verification
```bash
# Force server to verify before sending.
# -o names the output explicitly: with -O the saved filename would include
# the "?verify=pre" query string.
curl -fsS -o v1.0.0 "https://orchard.example.com/api/v1/project/myproject/mypackage/+/v1.0.0?verify=pre"
# Check if verification was performed (-s hides the progress meter;
# header names are case-insensitive, hence grep -i)
curl -sI "https://orchard.example.com/api/v1/project/myproject/mypackage/+/v1.0.0?verify=pre" | grep -i X-Verified
# X-Verified: true
```

View File

@@ -14,6 +14,7 @@ CREATE TABLE IF NOT EXISTS projects (
CREATE INDEX idx_projects_name ON projects(name);
CREATE INDEX idx_projects_created_by ON projects(created_by);
CREATE INDEX idx_projects_public ON projects(name) WHERE is_public = true;
-- Packages (collections within projects)
CREATE TABLE IF NOT EXISTS packages (
@@ -21,6 +22,8 @@ CREATE TABLE IF NOT EXISTS packages (
project_id UUID NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
name VARCHAR(255) NOT NULL,
description TEXT,
format VARCHAR(50) DEFAULT 'generic', -- package type: generic, npm, pypi, docker, etc.
platform VARCHAR(50) DEFAULT 'any', -- target platform: any, linux, darwin, windows, etc.
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
UNIQUE(project_id, name)
@@ -28,21 +31,26 @@ CREATE TABLE IF NOT EXISTS packages (
CREATE INDEX idx_packages_project_id ON packages(project_id);
CREATE INDEX idx_packages_name ON packages(name);
CREATE INDEX idx_packages_format ON packages(format);
CREATE INDEX idx_packages_platform ON packages(platform);
-- Artifacts (Content-Addressable)
CREATE TABLE IF NOT EXISTS artifacts (
id VARCHAR(64) PRIMARY KEY, -- SHA256 hash
size BIGINT NOT NULL,
size BIGINT NOT NULL CHECK (size > 0),
content_type VARCHAR(255),
original_name VARCHAR(1024),
checksum_md5 VARCHAR(32), -- MD5 hash for additional verification
metadata JSONB, -- format-specific metadata
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
created_by VARCHAR(255) NOT NULL,
ref_count INTEGER DEFAULT 1,
ref_count INTEGER DEFAULT 1 CHECK (ref_count >= 0),
s3_key VARCHAR(1024) NOT NULL
);
CREATE INDEX idx_artifacts_created_at ON artifacts(created_at);
CREATE INDEX idx_artifacts_created_by ON artifacts(created_by);
CREATE INDEX idx_artifacts_metadata ON artifacts USING GIN (metadata);
-- Tags (Aliases pointing to artifacts)
CREATE TABLE IF NOT EXISTS tags (
@@ -51,12 +59,14 @@ CREATE TABLE IF NOT EXISTS tags (
name VARCHAR(255) NOT NULL,
artifact_id VARCHAR(64) NOT NULL REFERENCES artifacts(id),
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
created_by VARCHAR(255) NOT NULL,
UNIQUE(package_id, name)
);
CREATE INDEX idx_tags_package_id ON tags(package_id);
CREATE INDEX idx_tags_artifact_id ON tags(artifact_id);
CREATE INDEX idx_tags_package_created_at ON tags(package_id, created_at DESC);
-- Tag History (for rollback capability)
CREATE TABLE IF NOT EXISTS tag_history (
@@ -64,11 +74,13 @@ CREATE TABLE IF NOT EXISTS tag_history (
tag_id UUID NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
old_artifact_id VARCHAR(64) REFERENCES artifacts(id),
new_artifact_id VARCHAR(64) NOT NULL REFERENCES artifacts(id),
change_type VARCHAR(20) NOT NULL DEFAULT 'update' CHECK (change_type IN ('create', 'update', 'delete')),
changed_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
changed_by VARCHAR(255) NOT NULL
);
CREATE INDEX idx_tag_history_tag_id ON tag_history(tag_id);
CREATE INDEX idx_tag_history_changed_at ON tag_history(changed_at);
-- Uploads (upload event records)
CREATE TABLE IF NOT EXISTS uploads (
@@ -76,6 +88,11 @@ CREATE TABLE IF NOT EXISTS uploads (
artifact_id VARCHAR(64) NOT NULL REFERENCES artifacts(id),
package_id UUID NOT NULL REFERENCES packages(id),
original_name VARCHAR(1024),
tag_name VARCHAR(255), -- tag assigned during upload
user_agent VARCHAR(512), -- client identification
duration_ms INTEGER, -- upload timing in milliseconds
deduplicated BOOLEAN DEFAULT false, -- whether artifact was deduplicated
checksum_verified BOOLEAN DEFAULT true, -- whether checksum was verified
uploaded_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
uploaded_by VARCHAR(255) NOT NULL,
source_ip VARCHAR(45)
@@ -84,6 +101,8 @@ CREATE TABLE IF NOT EXISTS uploads (
CREATE INDEX idx_uploads_artifact_id ON uploads(artifact_id);
CREATE INDEX idx_uploads_package_id ON uploads(package_id);
CREATE INDEX idx_uploads_uploaded_at ON uploads(uploaded_at);
CREATE INDEX idx_uploads_package_uploaded_at ON uploads(package_id, uploaded_at DESC);
CREATE INDEX idx_uploads_uploaded_by_at ON uploads(uploaded_by, uploaded_at DESC);
-- Consumers (Dependency tracking)
CREATE TABLE IF NOT EXISTS consumers (
@@ -141,14 +160,17 @@ CREATE INDEX idx_audit_logs_action ON audit_logs(action);
CREATE INDEX idx_audit_logs_resource ON audit_logs(resource);
CREATE INDEX idx_audit_logs_user_id ON audit_logs(user_id);
CREATE INDEX idx_audit_logs_timestamp ON audit_logs(timestamp);
CREATE INDEX idx_audit_logs_resource_timestamp ON audit_logs(resource, timestamp DESC);
CREATE INDEX idx_audit_logs_user_timestamp ON audit_logs(user_id, timestamp DESC);
CREATE INDEX idx_audit_logs_details ON audit_logs USING GIN (details);
-- Trigger to update tag history on changes
CREATE OR REPLACE FUNCTION track_tag_changes()
RETURNS TRIGGER AS $$
BEGIN
IF TG_OP = 'UPDATE' AND OLD.artifact_id != NEW.artifact_id THEN
INSERT INTO tag_history (id, tag_id, old_artifact_id, new_artifact_id, changed_at, changed_by)
VALUES (gen_random_uuid(), NEW.id, OLD.artifact_id, NEW.artifact_id, NOW(), NEW.created_by);
INSERT INTO tag_history (id, tag_id, old_artifact_id, new_artifact_id, change_type, changed_at, changed_by)
VALUES (gen_random_uuid(), NEW.id, OLD.artifact_id, NEW.artifact_id, 'update', NOW(), NEW.created_by);
END IF;
RETURN NEW;
END;
@@ -158,3 +180,72 @@ CREATE TRIGGER tag_changes_trigger
AFTER UPDATE ON tags
FOR EACH ROW
EXECUTE FUNCTION track_tag_changes();
-- Row trigger function: stamp updated_at with the current transaction
-- timestamp and hand the modified row back to the triggering statement.
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    NEW.updated_at := CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$;
-- Attach update_updated_at_column() as a BEFORE UPDATE row trigger on
-- projects, packages and tags.
-- NOTE(review): plain CREATE TRIGGER raises an error if a trigger of the same
-- name already exists on the table, so this part is not re-runnable as-is.
CREATE TRIGGER projects_updated_at_trigger
BEFORE UPDATE ON projects
FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();
CREATE TRIGGER packages_updated_at_trigger
BEFORE UPDATE ON packages
FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();
CREATE TRIGGER tags_updated_at_trigger
BEFORE UPDATE ON tags
FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();
-- ref_count maintenance: add one reference to the artifact that the
-- newly written row points at.
CREATE OR REPLACE FUNCTION increment_artifact_ref_count()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    UPDATE artifacts
       SET ref_count = ref_count + 1
     WHERE id = NEW.artifact_id;
    RETURN NEW;
END;
$$;
-- ref_count maintenance: remove one reference from the artifact that the
-- removed row pointed at.
CREATE OR REPLACE FUNCTION decrement_artifact_ref_count()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    UPDATE artifacts
       SET ref_count = ref_count - 1
     WHERE id = OLD.artifact_id;
    RETURN OLD;
END;
$$;
-- ref_count maintenance: when a row is re-pointed to a different artifact,
-- move one reference from the old artifact to the new one.
CREATE OR REPLACE FUNCTION update_artifact_ref_count()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    IF OLD.artifact_id <> NEW.artifact_id THEN
        UPDATE artifacts
           SET ref_count = ref_count - 1
         WHERE id = OLD.artifact_id;
        UPDATE artifacts
           SET ref_count = ref_count + 1
         WHERE id = NEW.artifact_id;
    END IF;
    RETURN NEW;
END;
$$;
-- Note: ref_count triggers on tags table
-- These track how many tags reference each artifact
-- Insert -> increment, delete -> decrement, update -> move the reference
-- when artifact_id changes.
-- NOTE(review): artifacts.ref_count is declared with DEFAULT 1 and the insert
-- trigger adds 1 per tag, so confirm application code does not also count
-- the first reference itself.
CREATE TRIGGER tags_ref_count_insert_trigger
AFTER INSERT ON tags
FOR EACH ROW
EXECUTE FUNCTION increment_artifact_ref_count();
CREATE TRIGGER tags_ref_count_delete_trigger
AFTER DELETE ON tags
FOR EACH ROW
EXECUTE FUNCTION decrement_artifact_ref_count();
CREATE TRIGGER tags_ref_count_update_trigger
AFTER UPDATE ON tags
FOR EACH ROW
EXECUTE FUNCTION update_artifact_ref_count();

View File

@@ -0,0 +1,170 @@
-- Migration 002: Schema Enhancements
-- Adds new fields, indexes, and triggers for improved functionality
-- ============================================
-- Packages: Add format and platform fields
-- ============================================
-- Both statements are guarded with IF NOT EXISTS, so re-running is a no-op.
-- The new columns are nullable; they carry defaults 'generic' and 'any'.
ALTER TABLE packages ADD COLUMN IF NOT EXISTS format VARCHAR(50) DEFAULT 'generic';
ALTER TABLE packages ADD COLUMN IF NOT EXISTS platform VARCHAR(50) DEFAULT 'any';
CREATE INDEX IF NOT EXISTS idx_packages_format ON packages(format);
CREATE INDEX IF NOT EXISTS idx_packages_platform ON packages(platform);
-- ============================================
-- Artifacts: Add checksum_md5, metadata, and CHECK constraints
-- ============================================
ALTER TABLE artifacts ADD COLUMN IF NOT EXISTS checksum_md5 VARCHAR(32);
ALTER TABLE artifacts ADD COLUMN IF NOT EXISTS metadata JSONB;
-- Add CHECK constraints (will fail if data violates them).
-- Constraint names are not unique across tables, so each existence check is
-- scoped to the artifacts table via conrelid; matching on conname alone could
-- be satisfied by a same-named constraint elsewhere and skip the ADD.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'artifacts_ref_count_check'
          AND conrelid = 'artifacts'::regclass
    ) THEN
        ALTER TABLE artifacts ADD CONSTRAINT artifacts_ref_count_check CHECK (ref_count >= 0);
    END IF;
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'artifacts_size_check'
          AND conrelid = 'artifacts'::regclass
    ) THEN
        ALTER TABLE artifacts ADD CONSTRAINT artifacts_size_check CHECK (size > 0);
    END IF;
END $$;
CREATE INDEX IF NOT EXISTS idx_artifacts_metadata ON artifacts USING GIN (metadata);
-- ============================================
-- Tags: Add updated_at and composite index
-- ============================================
-- NOTE(review): rows that already exist are filled from the column default
-- (NOW() at migration time), not from created_at — confirm that is acceptable.
ALTER TABLE tags ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW();
CREATE INDEX IF NOT EXISTS idx_tags_package_created_at ON tags(package_id, created_at DESC);
-- ============================================
-- Tag History: Add change_type and index
-- ============================================
-- change_type is NOT NULL in the base schema and the ORM model; declare it the
-- same way here so migrated databases do not end up with a nullable column.
ALTER TABLE tag_history ADD COLUMN IF NOT EXISTS change_type VARCHAR(20) NOT NULL DEFAULT 'update';
-- If the column already existed as nullable (ADD COLUMN IF NOT EXISTS was then
-- skipped), backfill and enforce NOT NULL. Both statements are idempotent.
UPDATE tag_history SET change_type = 'update' WHERE change_type IS NULL;
ALTER TABLE tag_history ALTER COLUMN change_type SET NOT NULL;
-- Existence check is scoped to tag_history via conrelid because constraint
-- names are not unique across tables.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = 'tag_history_change_type_check'
          AND conrelid = 'tag_history'::regclass
    ) THEN
        ALTER TABLE tag_history ADD CONSTRAINT tag_history_change_type_check
            CHECK (change_type IN ('create', 'update', 'delete'));
    END IF;
END $$;
CREATE INDEX IF NOT EXISTS idx_tag_history_changed_at ON tag_history(changed_at);
-- ============================================
-- Uploads: Add new fields and composite indexes
-- ============================================
-- tag_name, user_agent and duration_ms are added without defaults;
-- deduplicated defaults to false and checksum_verified defaults to true.
-- NOTE(review): pre-existing upload rows will therefore read
-- checksum_verified = true even though no verification was recorded for them.
ALTER TABLE uploads ADD COLUMN IF NOT EXISTS tag_name VARCHAR(255);
ALTER TABLE uploads ADD COLUMN IF NOT EXISTS user_agent VARCHAR(512);
ALTER TABLE uploads ADD COLUMN IF NOT EXISTS duration_ms INTEGER;
ALTER TABLE uploads ADD COLUMN IF NOT EXISTS deduplicated BOOLEAN DEFAULT false;
ALTER TABLE uploads ADD COLUMN IF NOT EXISTS checksum_verified BOOLEAN DEFAULT true;
CREATE INDEX IF NOT EXISTS idx_uploads_package_uploaded_at ON uploads(package_id, uploaded_at DESC);
CREATE INDEX IF NOT EXISTS idx_uploads_uploaded_by_at ON uploads(uploaded_by, uploaded_at DESC);
-- ============================================
-- Audit Logs: Add composite indexes and GIN index
-- ============================================
-- Composite indexes pair a filter column with timestamp in descending order.
-- NOTE(review): the GIN index presumably requires audit_logs.details to be
-- JSONB (not JSON) — the column definition is not visible here; confirm.
CREATE INDEX IF NOT EXISTS idx_audit_logs_resource_timestamp ON audit_logs(resource, timestamp DESC);
CREATE INDEX IF NOT EXISTS idx_audit_logs_user_timestamp ON audit_logs(user_id, timestamp DESC);
CREATE INDEX IF NOT EXISTS idx_audit_logs_details ON audit_logs USING GIN (details);
-- ============================================
-- Projects: Add partial index for public projects
-- ============================================
-- Partial index: only rows with is_public = true are indexed, keyed by name.
CREATE INDEX IF NOT EXISTS idx_projects_public ON projects(name) WHERE is_public = true;
-- ============================================
-- Triggers: Update tag_changes trigger for change_type
-- ============================================
-- Writes one tag_history row with change_type 'update' whenever an UPDATE
-- changes a tag's artifact_id; other updates pass through untouched.
-- NOTE(review): changed_by is taken from NEW.created_by, i.e. the tag row's
-- creator column, not necessarily the user performing this change — confirm.
-- NOTE(review): this function only ever writes 'update'; the 'create' and
-- 'delete' values allowed by the CHECK constraint are not produced here.
CREATE OR REPLACE FUNCTION track_tag_changes()
RETURNS TRIGGER AS $$
BEGIN
IF TG_OP = 'UPDATE' AND OLD.artifact_id != NEW.artifact_id THEN
INSERT INTO tag_history (id, tag_id, old_artifact_id, new_artifact_id, change_type, changed_at, changed_by)
VALUES (gen_random_uuid(), NEW.id, OLD.artifact_id, NEW.artifact_id, 'update', NOW(), NEW.created_by);
END IF;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- ============================================
-- Triggers: Auto-update updated_at timestamps
-- ============================================
-- Row trigger function: stamp updated_at with the current transaction
-- timestamp and hand the modified row back to the triggering statement.
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    NEW.updated_at := CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$;
-- Drop triggers if they exist, then recreate
-- (the DROP ... IF EXISTS / CREATE pairing keeps this section re-runnable).
-- Each trigger runs update_updated_at_column() BEFORE UPDATE, once per row.
DROP TRIGGER IF EXISTS projects_updated_at_trigger ON projects;
CREATE TRIGGER projects_updated_at_trigger
BEFORE UPDATE ON projects
FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();
DROP TRIGGER IF EXISTS packages_updated_at_trigger ON packages;
CREATE TRIGGER packages_updated_at_trigger
BEFORE UPDATE ON packages
FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();
DROP TRIGGER IF EXISTS tags_updated_at_trigger ON tags;
CREATE TRIGGER tags_updated_at_trigger
BEFORE UPDATE ON tags
FOR EACH ROW
EXECUTE FUNCTION update_updated_at_column();
-- ============================================
-- Triggers: Maintain artifact ref_count accuracy
-- ============================================
-- ref_count maintenance: add one reference to the artifact that the
-- newly written row points at.
CREATE OR REPLACE FUNCTION increment_artifact_ref_count()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    UPDATE artifacts
       SET ref_count = ref_count + 1
     WHERE id = NEW.artifact_id;
    RETURN NEW;
END;
$$;
-- ref_count maintenance: remove one reference from the artifact that the
-- removed row pointed at.
CREATE OR REPLACE FUNCTION decrement_artifact_ref_count()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    UPDATE artifacts
       SET ref_count = ref_count - 1
     WHERE id = OLD.artifact_id;
    RETURN OLD;
END;
$$;
-- ref_count maintenance: when a row is re-pointed to a different artifact,
-- move one reference from the old artifact to the new one.
CREATE OR REPLACE FUNCTION update_artifact_ref_count()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    IF OLD.artifact_id <> NEW.artifact_id THEN
        UPDATE artifacts
           SET ref_count = ref_count - 1
         WHERE id = OLD.artifact_id;
        UPDATE artifacts
           SET ref_count = ref_count + 1
         WHERE id = NEW.artifact_id;
    END IF;
    RETURN NEW;
END;
$$;
-- Note: ref_count triggers on tags table
-- These track how many tags reference each artifact
-- Insert -> increment, delete -> decrement, update -> move the reference
-- when artifact_id changes.
-- NOTE(review): this migration does not recompute ref_count for tags that
-- already exist; the triggers only account for changes made after it runs.
-- Confirm existing ref_count values are consistent with the tag rows.
DROP TRIGGER IF EXISTS tags_ref_count_insert_trigger ON tags;
CREATE TRIGGER tags_ref_count_insert_trigger
AFTER INSERT ON tags
FOR EACH ROW
EXECUTE FUNCTION increment_artifact_ref_count();
DROP TRIGGER IF EXISTS tags_ref_count_delete_trigger ON tags;
CREATE TRIGGER tags_ref_count_delete_trigger
AFTER DELETE ON tags
FOR EACH ROW
EXECUTE FUNCTION decrement_artifact_ref_count();
DROP TRIGGER IF EXISTS tags_ref_count_update_trigger ON tags;
CREATE TRIGGER tags_ref_count_update_trigger
AFTER UPDATE ON tags
FOR EACH ROW
EXECUTE FUNCTION update_artifact_ref_count();