From e215ecabcd93f55834b0cea1324cba866d91d233 Mon Sep 17 00:00:00 2001
From: Mondo Diaz
Date: Mon, 5 Jan 2026 14:46:18 -0600
Subject: [PATCH] Add S3 configuration options and improved error handling

- Add s3_verify_ssl config option for SSL/TLS verification
- Add s3_connect_timeout and s3_read_timeout config options
- Add s3_max_retries config option with adaptive retry mode
- Add S3StorageUnavailableError for backend availability issues
- Add HashCollisionError for detecting extremely rare hash collisions
- Add hash collision detection by comparing file sizes on dedup
- Handle network interruption and timeout errors explicitly
- Update routes.py to handle new exception types with appropriate HTTP codes
---
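Reviewer notes (below the "---" cut, so not part of the commit message):

The four new S3 settings are ordinary pydantic fields on the existing
Settings class, so they should be overridable from the environment the same
way the current S3 options are. The snippet below is a minimal sketch of a
dev-only override; the environment variable names assume the default
BaseSettings field-name mapping with no env_prefix, and the "app.config"
import path is assumed from the file layout in this patch.

    import os

    # Dev-only example: point at a MinIO endpoint with a self-signed cert.
    os.environ["S3_VERIFY_SSL"] = "false"     # keep True in production
    os.environ["S3_CONNECT_TIMEOUT"] = "5"
    os.environ["S3_READ_TIMEOUT"] = "120"
    os.environ["S3_MAX_RETRIES"] = "5"

    from app.config import Settings

    settings = Settings()
    assert settings.s3_verify_ssl is False
    assert settings.s3_max_retries == 5
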
 backend/app/config.py  | 12 +++++--
 backend/app/routes.py  | 15 ++++++++
 backend/app/storage.py | 77 +++++++++++++++++++++++++++++++++++++++---
 3 files changed, 98 insertions(+), 6 deletions(-)

diff --git a/backend/app/config.py b/backend/app/config.py
index db396fb..a5b33d5 100644
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -22,7 +22,9 @@ class Settings(BaseSettings):
     database_pool_size: int = 5  # Number of connections to keep open
     database_max_overflow: int = 10  # Max additional connections beyond pool_size
     database_pool_timeout: int = 30  # Seconds to wait for a connection from pool
-    database_pool_recycle: int = 1800  # Recycle connections after this many seconds (30 min)
+    database_pool_recycle: int = (
+        1800  # Recycle connections after this many seconds (30 min)
+    )
 
     # S3
     s3_endpoint: str = ""
@@ -31,10 +33,16 @@ class Settings(BaseSettings):
     s3_access_key_id: str = ""
     s3_secret_access_key: str = ""
     s3_use_path_style: bool = True
+    s3_verify_ssl: bool = True  # Set to False for self-signed certs (dev only)
+    s3_connect_timeout: int = 10  # Connection timeout in seconds
+    s3_read_timeout: int = 60  # Read timeout in seconds
+    s3_max_retries: int = 3  # Max retry attempts for transient failures
 
     # Download settings
     download_mode: str = "presigned"  # "presigned", "redirect", or "proxy"
-    presigned_url_expiry: int = 3600  # Presigned URL expiry in seconds (default: 1 hour)
+    presigned_url_expiry: int = (
+        3600  # Presigned URL expiry in seconds (default: 1 hour)
+    )
 
     @property
     def database_url(self) -> str:
diff --git a/backend/app/routes.py b/backend/app/routes.py
index 513b440..aea8ae4 100644
--- a/backend/app/routes.py
+++ b/backend/app/routes.py
@@ -29,6 +29,8 @@ from .storage import (
     HashComputationError,
     S3ExistenceCheckError,
     S3UploadError,
+    S3StorageUnavailableError,
+    HashCollisionError,
 )
 from .models import (
     Project,
@@ -998,6 +1000,19 @@ def upload_artifact(
             status_code=503,
             detail="Storage service temporarily unavailable. Please retry.",
         )
+    except S3StorageUnavailableError as e:
+        logger.error(f"S3 storage unavailable: {e}")
+        raise HTTPException(
+            status_code=503,
+            detail="Storage backend is unavailable. Please retry later.",
+        )
+    except HashCollisionError as e:
+        # This is extremely rare - log critical alert
+        logger.critical(f"HASH COLLISION DETECTED: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail="Data integrity error detected. Please contact support.",
+        )
     except StorageError as e:
         logger.error(f"Storage error during upload: {e}")
         raise HTTPException(status_code=500, detail="Internal storage error")
diff --git a/backend/app/storage.py b/backend/app/storage.py
index a691d8d..6944855 100644
--- a/backend/app/storage.py
+++ b/backend/app/storage.py
@@ -14,7 +14,13 @@ from typing import (
 )
 import boto3
 from botocore.config import Config
-from botocore.exceptions import ClientError
+from botocore.exceptions import (
+    ClientError,
+    ConnectionError as BotoConnectionError,
+    EndpointConnectionError,
+    ReadTimeoutError,
+    ConnectTimeoutError,
+)
 
 from .config import get_settings
 
@@ -193,10 +199,33 @@ class StorageResult(NamedTuple):
     s3_etag: Optional[str] = None
 
 
+class S3StorageUnavailableError(StorageError):
+    """Raised when S3 storage backend is unavailable"""
+
+    pass
+
+
+class HashCollisionError(StorageError):
+    """Raised when a hash collision is detected (extremely rare)"""
+
+    pass
+
+
 class S3Storage:
     def __init__(self):
+        # Build config with retry and timeout settings
+        s3_config = {}
+        if settings.s3_use_path_style:
+            s3_config["addressing_style"] = "path"
+
         config = Config(
-            s3={"addressing_style": "path"} if settings.s3_use_path_style else {}
+            s3=s3_config if s3_config else None,
+            connect_timeout=settings.s3_connect_timeout,
+            read_timeout=settings.s3_read_timeout,
+            retries={
+                "max_attempts": settings.s3_max_retries,
+                "mode": "adaptive",  # Adaptive retry mode for better handling
+            },
         )
 
         self.client = boto3.client(
@@ -206,6 +235,7 @@ class S3Storage:
             aws_access_key_id=settings.s3_access_key_id,
             aws_secret_access_key=settings.s3_secret_access_key,
             config=config,
+            verify=settings.s3_verify_ssl,  # SSL/TLS verification
         )
         self.bucket = settings.s3_bucket
         # Store active multipart uploads for resumable support
@@ -271,14 +301,38 @@ class S3Storage:
                     Body=content,
                 )
                 s3_etag = response.get("ETag", "").strip('"')
+            except (EndpointConnectionError, BotoConnectionError) as e:
+                logger.error(f"S3 storage unavailable: {e}")
+                raise S3StorageUnavailableError(
+                    f"Storage backend unavailable: {e}"
+                ) from e
+            except (ReadTimeoutError, ConnectTimeoutError) as e:
+                logger.error(f"S3 operation timed out: {e}")
+                raise S3UploadError(f"Upload timed out: {e}") from e
             except ClientError as e:
+                error_code = e.response.get("Error", {}).get("Code", "")
+                if error_code == "ServiceUnavailable":
+                    logger.error(f"S3 service unavailable: {e}")
+                    raise S3StorageUnavailableError(
+                        f"Storage service unavailable: {e}"
+                    ) from e
                 logger.error(f"S3 upload failed: {e}")
                 raise S3UploadError(f"Failed to upload to S3: {e}") from e
         else:
-            # Get existing ETag
+            # Get existing ETag and verify integrity (detect potential hash collision)
            obj_info = self.get_object_info(s3_key)
             if obj_info:
                 s3_etag = obj_info.get("etag", "").strip('"')
+                # Check for hash collision by comparing size
+                existing_size = obj_info.get("size", 0)
+                if existing_size != size:
+                    logger.critical(
+                        f"HASH COLLISION DETECTED! Hash {sha256_hash} has size mismatch: "
+                        f"existing={existing_size}, new={size}. This is extremely rare."
+                    )
+                    raise HashCollisionError(
+                        f"Hash collision detected for {sha256_hash}: size mismatch"
+                    )
 
         return StorageResult(
             sha256=sha256_hash,
@@ -341,6 +395,17 @@ class S3Storage:
         if exists:
             obj_info = self.get_object_info(s3_key)
             s3_etag = obj_info.get("etag", "").strip('"') if obj_info else None
+            # Check for hash collision by comparing size
+            if obj_info:
+                existing_size = obj_info.get("size", 0)
+                if existing_size != size:
+                    logger.critical(
+                        f"HASH COLLISION DETECTED! Hash {sha256_hash} has size mismatch: "
+                        f"existing={existing_size}, new={size}. This is extremely rare."
+                    )
+                    raise HashCollisionError(
+                        f"Hash collision detected for {sha256_hash}: size mismatch"
+                    )
             return StorageResult(
                 sha256=sha256_hash,
                 size=size,
@@ -354,7 +419,11 @@
         file.seek(0)
 
         # Start multipart upload
-        mpu = self.client.create_multipart_upload(Bucket=self.bucket, Key=s3_key)
+        try:
+            mpu = self.client.create_multipart_upload(Bucket=self.bucket, Key=s3_key)
+        except (EndpointConnectionError, BotoConnectionError) as e:
+            logger.error(f"S3 storage unavailable for multipart upload: {e}")
+            raise S3StorageUnavailableError(f"Storage backend unavailable: {e}") from e
         upload_id = mpu["UploadId"]
 
         try:
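
Post-patch note (not part of the diff above): both new exception types
subclass StorageError, so the existing broad "except StorageError" fallback
in upload_artifact still catches them; the new, more specific except clauses
are placed above that fallback so the 503 and 500 mappings take precedence.
Below is a minimal pytest-style sketch of that assumption; the test names
and the "app.storage" import path are placeholders, not part of this patch.

    from app.storage import (
        StorageError,
        S3StorageUnavailableError,
        HashCollisionError,
    )


    def test_new_errors_are_storage_errors():
        # The new types must stay StorageError subclasses so existing
        # callers that only know StorageError keep working.
        assert issubclass(S3StorageUnavailableError, StorageError)
        assert issubclass(HashCollisionError, StorageError)


    def test_storage_error_fallback_also_catches_new_types():
        # This is why routes.py lists the specific except clauses before
        # the StorageError fallback: a broad handler would otherwise
        # swallow them and return a generic 500.
        try:
            raise HashCollisionError("size mismatch for <sha256>")
        except StorageError as exc:
            assert isinstance(exc, HashCollisionError)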