Upload workflow enhancements: S3 verification, timing, client checksum support (#19)

- Add S3 object verification after upload (size validation before DB commit)
- Add cleanup of S3 objects if DB commit fails
- Record upload duration_ms and user_agent
- Support X-Checksum-SHA256 header for client-side checksum verification (see the client sketch after this list)
- Add already_existed flag to StorageResult for deduplication tracking
- Add status, error_message, client_checksum columns to Upload model
- Add UploadLock model for future 409 conflict detection
- Add consistency-check admin endpoint for detecting orphaned S3 objects
- Add migration 005_upload_enhancements.sql
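
A minimal client-side sketch of the checksum flow described above. Only the X-Checksum-SHA256 header name and the 422-on-mismatch behaviour come from this commit; the endpoint URL, the multipart field name, and the use of `requests` are illustrative assumptions:

```python
# Hypothetical client: compute a SHA-256 locally and send it with the upload so
# the server can verify it against the hash it computes while writing to S3.
# The URL, auth, and the "file" field name are assumptions, not part of this commit.
import hashlib
import requests

def upload_with_checksum(path: str, url: str) -> dict:
    with open(path, "rb") as f:
        data = f.read()
    checksum = hashlib.sha256(data).hexdigest()  # 64 lowercase hex characters
    response = requests.post(
        url,
        files={"file": (path, data)},
        headers={"X-Checksum-SHA256": checksum},
    )
    # On a mismatch the server responds 422 and deletes the newly stored object.
    response.raise_for_status()
    return response.json()
```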
Author: Mondo Diaz
Date: 2026-01-06 15:31:59 -06:00
parent 3056747f39
commit c184272cec
5 changed files with 350 additions and 4 deletions


@@ -39,6 +39,7 @@ from .models import (
    Tag,
    TagHistory,
    Upload,
    UploadLock,
    Consumer,
    AuditLog,
)
@@ -82,6 +83,7 @@ from .schemas import (
    PresignedUrlResponse,
    GarbageCollectionResponse,
    OrphanedArtifactResponse,
    ConsistencyCheckResponse,
    StorageStatsResponse,
    DeduplicationStatsResponse,
    ProjectStatsResponse,
@@ -121,6 +123,7 @@ def get_user_id(request: Request) -> str:
import logging
import time
logger = logging.getLogger(__name__)
@@ -1138,8 +1141,20 @@ def upload_artifact(
    db: Session = Depends(get_db),
    storage: S3Storage = Depends(get_storage),
    content_length: Optional[int] = Header(None, alias="Content-Length"),
    user_agent: Optional[str] = Header(None, alias="User-Agent"),
    client_checksum: Optional[str] = Header(None, alias="X-Checksum-SHA256"),
):
    """
    Upload an artifact to a package.

    Headers:
    - X-Checksum-SHA256: Optional client-provided SHA256 for verification
    - User-Agent: Captured for audit purposes
    """
    start_time = time.time()
    user_id = get_user_id(request)
    settings = get_settings()
    storage_result = None

    # Get project and package
    project = db.query(Project).filter(Project.name == project_name).first()
@@ -1155,7 +1170,6 @@ def upload_artifact(
        raise HTTPException(status_code=404, detail="Package not found")

    # Validate file size
    settings = get_settings()
    if content_length is not None:
        if content_length > settings.max_file_size:
            raise HTTPException(
@@ -1168,6 +1182,17 @@ def upload_artifact(
                detail="Empty files are not allowed",
            )

    # Validate client checksum format if provided
    if client_checksum:
        client_checksum = client_checksum.lower().strip()
        if len(client_checksum) != 64 or not all(
            c in "0123456789abcdef" for c in client_checksum
        ):
            raise HTTPException(
                status_code=400,
                detail="Invalid X-Checksum-SHA256 header. Must be 64 hex characters.",
            )

    # Extract format-specific metadata before storing
    file_metadata = {}
    if file.filename:
@@ -1224,6 +1249,55 @@ def upload_artifact(
        logger.error(f"Storage error during upload: {e}")
        raise HTTPException(status_code=500, detail="Internal storage error")

    # Verify client-provided checksum if present
    checksum_verified = True  # a mismatch below aborts the request, so any stored record is verified
    if client_checksum and client_checksum != storage_result.sha256:
        # Checksum mismatch - clean up S3 object if it was newly uploaded
        logger.warning(
            f"Client checksum mismatch: expected {client_checksum}, got {storage_result.sha256}"
        )
        # Attempt cleanup of the uploaded object
        try:
            if not storage_result.already_existed:
                storage.delete(storage_result.s3_key)
                logger.info(
                    f"Cleaned up S3 object after checksum mismatch: {storage_result.s3_key}"
                )
        except Exception as cleanup_error:
            logger.error(
                f"Failed to clean up S3 object after checksum mismatch: {cleanup_error}"
            )
        raise HTTPException(
            status_code=422,
            detail=f"Checksum verification failed. Expected {client_checksum}, got {storage_result.sha256}",
        )

    # Verify S3 object exists and size matches before proceeding
    try:
        s3_info = storage.get_object_info(storage_result.s3_key)
        if s3_info is None:
            raise HTTPException(
                status_code=500,
                detail="Failed to verify uploaded object in storage",
            )
        if s3_info.get("size") != storage_result.size:
            logger.error(
                f"Size mismatch after upload: expected {storage_result.size}, "
                f"got {s3_info.get('size')}"
            )
            raise HTTPException(
                status_code=500,
                detail="Upload verification failed: size mismatch",
            )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to verify S3 object: {e}")
        raise HTTPException(
            status_code=500,
            detail="Failed to verify uploaded object",
        )

    # Check if this is a deduplicated upload
    deduplicated = False
    saved_bytes = 0
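
The verification step above relies on storage.get_object_info() and storage.delete(), which are not part of this diff. A minimal sketch of what a boto3-backed get_object_info() might look like, assuming it returns None for a missing key; the class and method bodies are illustrative assumptions (the real S3Storage may differ), and it exposes both the "size" key read by the upload path and the "ContentLength" key read by the consistency check below:

```python
# Hypothetical sketch of the storage helper used above; not part of this commit.
from typing import Optional

import boto3
from botocore.exceptions import ClientError

class S3StorageSketch:
    def __init__(self, bucket: str):
        self.client = boto3.client("s3")
        self.bucket = bucket

    def get_object_info(self, key: str) -> Optional[dict]:
        """Return basic object metadata, or None if the object does not exist."""
        try:
            head = self.client.head_object(Bucket=self.bucket, Key=key)
        except ClientError as e:
            if e.response["Error"]["Code"] in ("404", "NoSuchKey", "NotFound"):
                return None
            raise
        size = head["ContentLength"]
        # Expose both key names used by the two call sites in this diff.
        return {"size": size, "ContentLength": size}

    def delete(self, key: str) -> None:
        """Best-effort removal used by the cleanup paths above."""
        self.client.delete_object(Bucket=self.bucket, Key=key)
```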
@@ -1275,14 +1349,23 @@ def upload_artifact(
        )
        db.add(artifact)

    # Record upload
    # Calculate upload duration
    duration_ms = int((time.time() - start_time) * 1000)

    # Record upload with enhanced metadata
    upload = Upload(
        artifact_id=storage_result.sha256,
        package_id=package.id,
        original_name=file.filename,
        tag_name=tag,
        user_agent=user_agent[:512] if user_agent else None,  # Truncate if too long
        duration_ms=duration_ms,
        deduplicated=deduplicated,
        checksum_verified=checksum_verified,
        client_checksum=client_checksum,
        status="completed",
        uploaded_by=user_id,
        source_ip=request.client.host if request.client else None,
        deduplicated=deduplicated,
    )
    db.add(upload)
    db.flush()  # Flush to get upload ID
@@ -1311,10 +1394,32 @@ def upload_artifact(
            "deduplicated": deduplicated,
            "saved_bytes": saved_bytes,
            "tag": tag,
            "duration_ms": duration_ms,
            "client_checksum_provided": client_checksum is not None,
        },
    )

    db.commit()
    # Commit with cleanup on failure
    try:
        db.commit()
    except Exception as commit_error:
        logger.error(f"Database commit failed after upload: {commit_error}")
        db.rollback()
        # Attempt to clean up newly uploaded S3 object
        if storage_result and not storage_result.already_existed:
            try:
                storage.delete(storage_result.s3_key)
                logger.info(
                    f"Cleaned up S3 object after commit failure: {storage_result.s3_key}"
                )
            except Exception as cleanup_error:
                logger.error(
                    f"Failed to clean up S3 object after commit failure: {cleanup_error}"
                )
        raise HTTPException(
            status_code=500,
            detail="Failed to save upload record. Please retry.",
        )

    return UploadResponse(
        artifact_id=storage_result.sha256,
@@ -2597,6 +2702,109 @@ def garbage_collect(
    )


@router.get(
    "/api/v1/admin/consistency-check",
    response_model=ConsistencyCheckResponse,
)
def check_consistency(
    limit: int = Query(
        default=100, ge=1, le=1000, description="Max items to report per category"
    ),
    db: Session = Depends(get_db),
    storage: S3Storage = Depends(get_storage),
):
    """
    Check consistency between database records and S3 storage.

    Reports:
    - Orphaned S3 objects (in S3 but not in database)
    - Missing S3 objects (in database but not in S3)
    - Size mismatches (database size != S3 size)

    This is a read-only operation. Use garbage-collect to clean up issues.
    """
    orphaned_s3_keys = []
    missing_s3_keys = []
    size_mismatches = []

    # Get all artifacts from database
    artifacts = db.query(Artifact).all()
    total_checked = len(artifacts)

    # Check each artifact exists in S3 and sizes match
    for artifact in artifacts:
        try:
            s3_info = storage.get_object_info(artifact.s3_key)
            if s3_info is None:
                if len(missing_s3_keys) < limit:
                    missing_s3_keys.append(artifact.s3_key)
            else:
                s3_size = s3_info.get("ContentLength", 0)
                if s3_size != artifact.size:
                    if len(size_mismatches) < limit:
                        size_mismatches.append(
                            {
                                "artifact_id": artifact.id,
                                "s3_key": artifact.s3_key,
                                "db_size": artifact.size,
                                "s3_size": s3_size,
                            }
                        )
        except Exception as e:
            logger.error(f"Error checking S3 object {artifact.s3_key}: {e}")
            if len(missing_s3_keys) < limit:
                missing_s3_keys.append(artifact.s3_key)

    # Check for orphaned S3 objects (objects in S3 bucket but not in database)
    # Note: This is expensive for large buckets, so we limit the scan
    try:
        # List objects in the fruits/ prefix (where artifacts are stored)
        paginator = storage.client.get_paginator("list_objects_v2")
        artifact_ids_in_db = {a.id for a in artifacts}
        objects_checked = 0

        for page in paginator.paginate(
            Bucket=storage.bucket, Prefix="fruits/", MaxKeys=1000
        ):
            if "Contents" not in page:
                break
            for obj in page["Contents"]:
                objects_checked += 1
                # Extract hash from key: fruits/ab/cd/abcdef...
                key = obj["Key"]
                parts = key.split("/")
                if len(parts) == 4 and parts[0] == "fruits":
                    sha256_hash = parts[3]
                    if sha256_hash not in artifact_ids_in_db:
                        if len(orphaned_s3_keys) < limit:
                            orphaned_s3_keys.append(key)
                # Limit total objects checked
                if objects_checked >= 10000:
                    break
            if objects_checked >= 10000:
                break
    except Exception as e:
        logger.error(f"Error listing S3 objects for consistency check: {e}")

    healthy = (
        len(orphaned_s3_keys) == 0
        and len(missing_s3_keys) == 0
        and len(size_mismatches) == 0
    )

    return ConsistencyCheckResponse(
        total_artifacts_checked=total_checked,
        orphaned_s3_objects=len(orphaned_s3_keys),
        missing_s3_objects=len(missing_s3_keys),
        size_mismatches=len(size_mismatches),
        healthy=healthy,
        orphaned_s3_keys=orphaned_s3_keys,
        missing_s3_keys=missing_s3_keys,
        size_mismatch_artifacts=size_mismatches,
    )
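
A short usage sketch for the new admin endpoint. The path, the limit query parameter, and the response fields come from this diff; the base URL and the absence of auth headers are assumptions:

```python
# Hypothetical operator script for the consistency-check endpoint added above.
import requests

resp = requests.get(
    "http://localhost:8000/api/v1/admin/consistency-check",  # base URL is an assumption
    params={"limit": 100},
)
resp.raise_for_status()
report = resp.json()

if report["healthy"]:
    print(f"OK: {report['total_artifacts_checked']} artifacts checked, no issues")
else:
    # Counts per category; the key lists are capped at `limit` entries each.
    print("orphaned S3 objects:", report["orphaned_s3_objects"], report["orphaned_s3_keys"])
    print("missing S3 objects:", report["missing_s3_objects"], report["missing_s3_keys"])
    print("size mismatches:", report["size_mismatches"], report["size_mismatch_artifacts"])
```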
# =============================================================================
# Statistics Endpoints (ISSUE 34)
# =============================================================================