From 584acd1e900c799dc41083f6534c6b99b2134b1a Mon Sep 17 00:00:00 2001 From: Mondo Diaz Date: Wed, 21 Jan 2026 09:35:12 -0600 Subject: [PATCH] Add comprehensive upload/download tests and streaming enhancements (#38, #40, #42, #43) --- .gitlab-ci.yml | 133 +-- .gitleaksignore | 4 + CHANGELOG.md | 45 + backend/app/database.py | 58 +- backend/app/routes.py | 357 +++++++- backend/app/schemas.py | 18 + backend/app/storage.py | 139 +++- backend/pytest.ini | 2 + backend/tests/conftest.py | 126 ++- backend/tests/factories.py | 35 + backend/tests/integration/test_auth_api.py | 473 ++++------- .../integration/test_concurrent_operations.py | 737 +++++++++++++++++ .../tests/integration/test_error_handling.py | 322 ++++++++ .../test_integrity_verification.py | 768 ++++++++++++++++++ .../tests/integration/test_large_uploads.py | 552 +++++++++++++ .../tests/integration/test_size_boundary.py | 583 +++++++++++++ .../integration/test_streaming_download.py | 535 ++++++++++++ .../integration/test_upload_download_api.py | 246 +++++- backend/tests/integration/test_version_api.py | 347 ++++++++ docs/integrity-verification.md | 294 +++++++ helm/orchard/templates/deployment.yaml | 6 + helm/orchard/values-dev.yaml | 5 + helm/orchard/values-stage.yaml | 5 + 23 files changed, 5385 insertions(+), 405 deletions(-) create mode 100644 backend/tests/integration/test_concurrent_operations.py create mode 100644 backend/tests/integration/test_error_handling.py create mode 100644 backend/tests/integration/test_integrity_verification.py create mode 100644 backend/tests/integration/test_large_uploads.py create mode 100644 backend/tests/integration/test_size_boundary.py create mode 100644 backend/tests/integration/test_streaming_download.py create mode 100644 backend/tests/integration/test_version_api.py create mode 100644 docs/integrity-verification.md diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 38a6514..5ff403b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,6 +8,11 @@ variables: PROSPER_VERSION: v0.64.1 # Use internal PyPI proxy instead of public internet PIP_INDEX_URL: https://deps.global.bsf.tools/artifactory/api/pypi/pypi.org/simple + # Environment URLs (used by deploy and test jobs) + STAGE_URL: https://orchard-stage.common.global.bsf.tools + PROD_URL: https://orchard.common.global.bsf.tools + # Shared pip cache directory + PIP_CACHE_DIR: "$CI_PROJECT_DIR/.pip-cache" # Prevent duplicate pipelines for MRs workflow: @@ -29,11 +34,47 @@ kics: variables: KICS_CONFIG: kics.config -# Post-deployment integration tests template +# Full integration test suite template (for feature/stage deployments) +# Runs the complete pytest integration test suite against the deployed environment .integration_test_template: &integration_test_template stage: deploy # Runs in deploy stage, but after deployment due to 'needs' image: deps.global.bsf.tools/docker/python:3.12-slim - timeout: 10m + timeout: 20m # Full suite takes longer than smoke tests + interruptible: true # Cancel if new pipeline starts + retry: 1 # Retry once on failure (network flakiness) + cache: + key: pip-$CI_COMMIT_REF_SLUG + paths: + - .pip-cache/ + policy: pull-push + before_script: + - pip install --index-url "$PIP_INDEX_URL" -r backend/requirements.txt + - pip install --index-url "$PIP_INDEX_URL" pytest pytest-asyncio httpx + script: + - cd backend + # Run full integration test suite, excluding: + # - large/slow tests + # - requires_direct_s3 tests (can't access MinIO from outside K8s cluster) + # ORCHARD_TEST_URL tells the tests which server to connect to + 
# Note: Auth tests work because dev/stage deployments have relaxed rate limits + - | + python -m pytest tests/integration/ -v \ + --junitxml=integration-report.xml \ + -m "not large and not slow and not requires_direct_s3" \ + --tb=short + artifacts: + when: always + expire_in: 1 week + paths: + - backend/integration-report.xml + reports: + junit: backend/integration-report.xml + +# Lightweight smoke test template (for production - no test data creation) +.smoke_test_template: &smoke_test_template + stage: deploy + image: deps.global.bsf.tools/docker/python:3.12-slim + timeout: 5m before_script: - pip install --index-url "$PIP_INDEX_URL" httpx script: @@ -43,12 +84,12 @@ kics: import os import sys - BASE_URL = os.environ.get("BASE_URL") + BASE_URL = os.environ.get("ORCHARD_TEST_URL") if not BASE_URL: - print("ERROR: BASE_URL not set") + print("ERROR: ORCHARD_TEST_URL not set") sys.exit(1) - print(f"Running integration tests against {BASE_URL}") + print(f"Running smoke tests against {BASE_URL}") client = httpx.Client(base_url=BASE_URL, timeout=30.0) errors = [] @@ -86,38 +127,37 @@ kics: print(f" FAIL: {e}") sys.exit(1) else: - print("SUCCESS: All integration tests passed!") + print("SUCCESS: All smoke tests passed!") sys.exit(0) PYTEST_SCRIPT -# Integration tests for stage deployment +# Integration tests for stage deployment (full suite) integration_test_stage: <<: *integration_test_template needs: [deploy_stage] variables: - BASE_URL: https://orchard-stage.common.global.bsf.tools + ORCHARD_TEST_URL: $STAGE_URL rules: - if: '$CI_COMMIT_BRANCH == "main"' when: on_success -# Integration tests for feature deployment +# Integration tests for feature deployment (full suite) integration_test_feature: <<: *integration_test_template needs: [deploy_feature] variables: - BASE_URL: https://orchard-$CI_COMMIT_REF_SLUG.common.global.bsf.tools + ORCHARD_TEST_URL: https://orchard-$CI_COMMIT_REF_SLUG.common.global.bsf.tools rules: - if: '$CI_COMMIT_BRANCH && $CI_COMMIT_BRANCH != "main"' when: on_success -# Run Python backend tests -python_tests: +# Run Python backend unit tests +python_unit_tests: stage: test needs: [] # Run in parallel with build image: deps.global.bsf.tools/docker/python:3.12-slim timeout: 15m - variables: - PIP_CACHE_DIR: "$CI_PROJECT_DIR/.pip-cache" + interruptible: true # Cancel if new pipeline starts cache: key: pip-$CI_COMMIT_REF_SLUG paths: @@ -128,7 +168,7 @@ python_tests: - pip install --index-url "$PIP_INDEX_URL" pytest pytest-asyncio pytest-cov httpx script: - cd backend - # Only run unit tests - integration tests require Docker Compose services + # Run unit tests (integration tests run post-deployment against live environment) - python -m pytest tests/unit/ -v --cov=app --cov-report=term --cov-report=xml:coverage.xml --cov-report=html:coverage_html --junitxml=pytest-report.xml artifacts: when: always @@ -150,6 +190,7 @@ frontend_tests: needs: [] # Run in parallel with build image: deps.global.bsf.tools/docker/node:20-alpine timeout: 15m + interruptible: true # Cancel if new pipeline starts cache: key: npm-$CI_COMMIT_REF_SLUG paths: @@ -175,7 +216,7 @@ frontend_tests: # Shared deploy configuration .deploy_template: &deploy_template stage: deploy - needs: [build_image, test_image, kics, hadolint, python_tests, frontend_tests, secrets, app_deps_scan, cve_scan, cve_sbom_analysis, app_sbom_analysis] + needs: [build_image, test_image, kics, hadolint, python_unit_tests, frontend_tests, secrets, app_deps_scan, cve_scan, cve_sbom_analysis, app_sbom_analysis] image: 
deps.global.bsf.tools/registry-1.docker.io/alpine/k8s:1.29.12 .helm_setup: &helm_setup @@ -184,47 +225,21 @@ frontend_tests: # OCI-based charts from internal registry - no repo add needed - helm dependency update +# Simplified deployment verification - just health check +# Full API/frontend checks are done by integration tests post-deployment .verify_deployment: &verify_deployment | echo "=== Waiting for health endpoint (certs may take a few minutes) ===" for i in $(seq 1 30); do if curl -sf --max-time 10 "$BASE_URL/health" > /dev/null 2>&1; then echo "Health check passed!" - break + echo "Deployment URL: $BASE_URL" + exit 0 fi echo "Attempt $i/30 - waiting 10s..." sleep 10 done - - # Verify health endpoint - echo "" - echo "=== Health Check ===" - curl -sf "$BASE_URL/health" || { echo "Health check failed"; exit 1; } - echo "" - - # Verify API is responding - echo "" - echo "=== API Check (GET /api/v1/projects) ===" - HTTP_CODE=$(curl -sf -o /dev/null -w "%{http_code}" "$BASE_URL/api/v1/projects") - if [ "$HTTP_CODE" = "200" ]; then - echo "API responding: HTTP $HTTP_CODE" - else - echo "API check failed: HTTP $HTTP_CODE" - exit 1 - fi - - # Verify frontend is served - echo "" - echo "=== Frontend Check ===" - if curl -sf "$BASE_URL/" | grep -q ""; then - echo "Frontend is being served" - else - echo "Frontend check failed" - exit 1 - fi - - echo "" - echo "=== All checks passed! ===" - echo "Deployment URL: $BASE_URL" + echo "Health check failed after 30 attempts" + exit 1 # Deploy to stage (main branch) deploy_stage: @@ -232,7 +247,7 @@ deploy_stage: variables: NAMESPACE: orch-stage-namespace VALUES_FILE: helm/orchard/values-stage.yaml - BASE_URL: https://orchard-stage.common.global.bsf.tools + BASE_URL: $STAGE_URL before_script: - kubectl config use-context esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage - *helm_setup @@ -251,7 +266,7 @@ deploy_stage: - *verify_deployment environment: name: stage - url: https://orchard-stage.common.global.bsf.tools + url: $STAGE_URL kubernetes: agent: esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage rules: @@ -297,10 +312,12 @@ deploy_feature: - if: '$CI_COMMIT_BRANCH && $CI_COMMIT_BRANCH != "main"' when: on_success -# Cleanup feature branch deployment +# Cleanup feature branch deployment (standalone - doesn't need deploy dependencies) cleanup_feature: - <<: *deploy_template + stage: deploy needs: [] + image: deps.global.bsf.tools/registry-1.docker.io/alpine/k8s:1.29.12 + timeout: 5m variables: NAMESPACE: orch-dev-namespace GIT_STRATEGY: none # No source needed, branch may be deleted @@ -329,7 +346,7 @@ deploy_prod: variables: NAMESPACE: orch-prod-namespace VALUES_FILE: helm/orchard/values-prod.yaml - BASE_URL: https://orchard.common.global.bsf.tools + BASE_URL: $PROD_URL before_script: - kubectl config use-context esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-prod - *helm_setup @@ -348,7 +365,7 @@ deploy_prod: - *verify_deployment environment: name: production - url: https://orchard.common.global.bsf.tools + url: $PROD_URL kubernetes: agent: esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-prod rules: @@ -357,12 +374,12 @@ deploy_prod: when: manual # Require manual approval for prod allow_failure: false -# Integration tests for production deployment -integration_test_prod: - <<: *integration_test_template +# Smoke tests for production deployment (read-only, no test data creation) +smoke_test_prod: + <<: *smoke_test_template needs: [deploy_prod] variables: - BASE_URL: https://orchard.common.global.bsf.tools + 
ORCHARD_TEST_URL: $PROD_URL rules: - if: '$CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+$/' when: on_success diff --git a/.gitleaksignore b/.gitleaksignore index c57613c..c6bc836 100644 --- a/.gitleaksignore +++ b/.gitleaksignore @@ -11,3 +11,7 @@ bccbc71c13570d14b8b26a11335c45f102fe3072:backend/tests/unit/test_storage.py:gene 90bb2a3a393d2361dc3136ee8d761debb0726d8a:backend/tests/unit/test_storage.py:generic-api-key:381 37666e41a72d2a4f34447c0d1a8728e1d7271d24:backend/tests/unit/test_storage.py:generic-api-key:381 0cc4f253621a9601c5193f6ae1e7ae33f0e7fc9b:backend/tests/unit/test_storage.py:generic-api-key:381 +35fda65d381acc5ab59bc592ee3013f75906c197:backend/tests/unit/test_storage.py:generic-api-key:381 +08dce6cbb836b687002751fed4159bfc2da61f8b:backend/tests/unit/test_storage.py:generic-api-key:381 +617bcbe89cff9a009d77e4f1f1864efed1820e63:backend/tests/unit/test_storage.py:generic-api-key:381 +1cbd33544388e0fe6db752fa8886fab33cf9ce7c:backend/tests/unit/test_storage.py:generic-api-key:381 diff --git a/CHANGELOG.md b/CHANGELOG.md index d28e608..d945122 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,36 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Added comprehensive upload/download tests for size boundaries (1B to 1GB) (#38) +- Added concurrent upload/download tests (2, 5, 10 parallel operations) (#38) +- Added data integrity tests (binary, text, unicode, compressed content) (#38) +- Added chunk boundary tests for edge cases (#38) +- Added `@pytest.mark.large` and `@pytest.mark.concurrent` test markers (#38) +- Added `generate_content()` and `generate_content_with_hash()` test helpers (#38) +- Added `sized_content` fixture for generating test content of specific sizes (#38) +- Added upload API tests: upload without tag, artifact creation verification, S3 object creation (#38) +- Added download API tests: tag: prefix resolution, 404 for nonexistent project/package/artifact (#38) +- Added download header tests: Content-Type, Content-Length, Content-Disposition, ETag, X-Checksum-SHA256 (#38) +- Added error handling tests: timeout behavior, checksum validation, resource cleanup, graceful error responses (#38) +- Added version API tests: version creation, auto-detection, listing, download by version prefix (#38) +- Added integrity verification tests: round-trip hash verification, client-side verification workflow, size variants (1KB-10MB) (#40) +- Added consistency check endpoint tests with response format validation (#40) +- Added corruption detection tests: bit flip, truncation, appended content, size mismatch, missing S3 objects (#40) +- Added Digest header tests (RFC 3230) and verification mode tests (#40) +- Added integrity verification documentation (`docs/integrity-verification.md`) (#40) +- Added conditional request support for downloads (If-None-Match, If-Modified-Since) returning 304 Not Modified (#42) +- Added caching headers to downloads: Cache-Control (immutable), Last-Modified (#42) +- Added 416 Range Not Satisfiable response for invalid range requests (#42) +- Added download completion logging with bytes transferred and throughput (#42) +- Added client disconnect handling during streaming downloads (#42) +- Added streaming download tests: range requests, conditional requests, caching headers, download resume (#42) +- Added upload duration and throughput metrics (`duration_ms`, `throughput_mbps`) to upload response (#43) +- Added upload progress logging for large files (hash computation and multipart upload phases) 
(#43) +- Added client disconnect handling during uploads with proper cleanup (#43) +- Added upload progress tracking endpoint `GET /upload/{upload_id}/progress` for resumable uploads (#43) +- Added large file upload tests (10MB, 100MB, 1GB) with multipart upload verification (#43) +- Added upload cancellation and timeout handling tests (#43) +- Added comprehensive API documentation for upload endpoints with curl, Python, and JavaScript examples (#43) - Added `package_versions` table for immutable version tracking separate from mutable tags (#56) - Versions are set at upload time via explicit `version` parameter or auto-detected from filename/metadata - Version detection priority: explicit parameter > package metadata > filename pattern @@ -31,6 +61,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added internal proxy configuration for npm, pip, helm, and apt (#51) ### Changed +- CI integration tests now run full pytest suite (~350 tests) against deployed environment instead of 3 smoke tests +- CI production deployment uses lightweight smoke tests only (no test data creation in prod) +- CI pipeline improvements: shared pip cache, `interruptible` flag on test jobs, retry on integration tests +- Simplified deploy verification to health check only (full checks done by integration tests) +- Extracted environment URLs to global variables for maintainability +- Made `cleanup_feature` job standalone (no longer inherits deploy template dependencies) +- Renamed `integration_test_prod` to `smoke_test_prod` for clarity - Updated download ref resolution to check versions before tags (version → tag → artifact ID) (#56) - Deploy jobs now require all security scans to pass before deployment (added test_image, app_deps_scan, cve_scan, cve_sbom_analysis, app_sbom_analysis to dependencies) (#63) - Increased deploy job timeout from 5m to 10m (#63) @@ -44,6 +81,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Improved pod naming: Orchard pods now named `orchard-{env}-server-*` for clarity (#51) ### Fixed +- Fixed CI integration test rate limiting: added configurable `ORCHARD_LOGIN_RATE_LIMIT` env var, relaxed to 1000/minute for dev/stage +- Fixed duplicate `TestSecurityEdgeCases` class definition in test_auth_api.py +- Fixed integration tests auth: session-scoped client, configurable credentials via env vars, fail-fast on auth errors +- Fixed 413 Request Entity Too Large errors on uploads by adding `proxy-body-size: "0"` nginx annotation to Orchard ingress +- Fixed CI tests that require direct S3 access: added `@pytest.mark.requires_direct_s3` marker and excluded from CI +- Fixed ref_count triggers not being created: added auto-migration for tags ref_count trigger functions +- Fixed Content-Disposition header encoding for non-ASCII filenames using RFC 5987 (#38) - Fixed deploy jobs running even when tests or security scans fail (changed rules from `when: always` to `when: on_success`) (#63) - Fixed python_tests job not using internal PyPI proxy (#63) - Fixed `cleanup_feature` job failing when branch is deleted (`GIT_STRATEGY: none`) (#51) @@ -53,6 +97,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed deploy jobs running when secrets scan fails (added `secrets` to deploy dependencies) - Fixed dev environment memory requests to equal limits per cluster Kyverno policy - Fixed init containers missing resource limits (Kyverno policy compliance) +- Fixed Python SyntaxWarning for invalid escape 
sequence in database migration regex pattern ### Removed - Removed unused `store_streaming()` method from storage.py (#51) diff --git a/backend/app/database.py b/backend/app/database.py index 1c21541..3a686d9 100644 --- a/backend/app/database.py +++ b/backend/app/database.py @@ -170,6 +170,62 @@ def _run_migrations(): END IF; END $$; """, + # Create ref_count trigger functions for tags (ensures triggers exist even if initial migration wasn't run) + """ + CREATE OR REPLACE FUNCTION increment_artifact_ref_count() + RETURNS TRIGGER AS $$ + BEGIN + UPDATE artifacts SET ref_count = ref_count + 1 WHERE id = NEW.artifact_id; + RETURN NEW; + END; + $$ LANGUAGE plpgsql; + """, + """ + CREATE OR REPLACE FUNCTION decrement_artifact_ref_count() + RETURNS TRIGGER AS $$ + BEGIN + UPDATE artifacts SET ref_count = ref_count - 1 WHERE id = OLD.artifact_id; + RETURN OLD; + END; + $$ LANGUAGE plpgsql; + """, + """ + CREATE OR REPLACE FUNCTION update_artifact_ref_count() + RETURNS TRIGGER AS $$ + BEGIN + IF OLD.artifact_id != NEW.artifact_id THEN + UPDATE artifacts SET ref_count = ref_count - 1 WHERE id = OLD.artifact_id; + UPDATE artifacts SET ref_count = ref_count + 1 WHERE id = NEW.artifact_id; + END IF; + RETURN NEW; + END; + $$ LANGUAGE plpgsql; + """, + # Create triggers for tags ref_count management + """ + DO $$ + BEGIN + -- Drop and recreate triggers to ensure they're current + DROP TRIGGER IF EXISTS tags_ref_count_insert_trigger ON tags; + CREATE TRIGGER tags_ref_count_insert_trigger + AFTER INSERT ON tags + FOR EACH ROW + EXECUTE FUNCTION increment_artifact_ref_count(); + + DROP TRIGGER IF EXISTS tags_ref_count_delete_trigger ON tags; + CREATE TRIGGER tags_ref_count_delete_trigger + AFTER DELETE ON tags + FOR EACH ROW + EXECUTE FUNCTION decrement_artifact_ref_count(); + + DROP TRIGGER IF EXISTS tags_ref_count_update_trigger ON tags; + CREATE TRIGGER tags_ref_count_update_trigger + AFTER UPDATE ON tags + FOR EACH ROW + WHEN (OLD.artifact_id IS DISTINCT FROM NEW.artifact_id) + EXECUTE FUNCTION update_artifact_ref_count(); + END $$; + """, # Create ref_count trigger functions for package_versions """ CREATE OR REPLACE FUNCTION increment_version_ref_count() @@ -210,7 +266,7 @@ def _run_migrations(): END $$; """, # Migrate existing semver tags to package_versions - """ + r""" DO $$ BEGIN IF EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'package_versions') THEN diff --git a/backend/app/routes.py b/backend/app/routes.py index 91fcb5b..2cb08d3 100644 --- a/backend/app/routes.py +++ b/backend/app/routes.py @@ -82,6 +82,7 @@ from .schemas import ( ResumableUploadCompleteRequest, ResumableUploadCompleteResponse, ResumableUploadStatusResponse, + UploadProgressResponse, GlobalSearchResponse, SearchResultProject, SearchResultPackage, @@ -143,6 +144,31 @@ def sanitize_filename(filename: str) -> str: return re.sub(r'[\r\n"]', "", filename) +def build_content_disposition(filename: str) -> str: + """Build a Content-Disposition header value with proper encoding. + + For ASCII filenames, uses simple: attachment; filename="name" + For non-ASCII filenames, uses RFC 5987 encoding with UTF-8. 
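+
+    Illustrative examples (the ASCII fallback replaces non-ASCII
+    characters with '?'):
+        "report.pdf" -> attachment; filename="report.pdf"
+        "résumé.pdf" -> attachment; filename="r?sum?.pdf"; filename*=UTF-8''r%C3%A9sum%C3%A9.pdf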
+    """
+    from urllib.parse import quote
+
+    sanitized = sanitize_filename(filename)
+
+    # Check if filename is pure ASCII
+    try:
+        sanitized.encode('ascii')
+        # Pure ASCII - simple format
+        return f'attachment; filename="{sanitized}"'
+    except UnicodeEncodeError:
+        # Non-ASCII - use RFC 5987 encoding
+        # Provide both filename (ASCII fallback) and filename* (UTF-8 encoded)
+        ascii_fallback = sanitized.encode('ascii', errors='replace').decode('ascii')
+        # RFC 5987: filename*=charset'language'encoded_value
+        # We use UTF-8 encoding and percent-encode non-ASCII chars
+        encoded = quote(sanitized, safe='')
+        return f'attachment; filename="{ascii_fallback}"; filename*=UTF-8\'\'{encoded}'
+
+
 def get_user_id_from_request(
     request: Request,
     db: Session,
@@ -2258,10 +2284,56 @@ def upload_artifact(
     """
     Upload an artifact to a package.
 
-    Headers:
-    - X-Checksum-SHA256: Optional client-provided SHA256 for verification
-    - User-Agent: Captured for audit purposes
-    - Authorization: Bearer <token> for authentication
+    **Size Limits:**
+    - Minimum: 1 byte (empty files rejected)
+    - Maximum: 10GB (configurable via ORCHARD_MAX_FILE_SIZE)
+    - Files > 100MB automatically use S3 multipart upload
+
+    **Headers:**
+    - `X-Checksum-SHA256`: Optional SHA256 hash for server-side verification
+    - `Content-Length`: File size (required for early rejection of oversized files)
+    - `Authorization`: Bearer <token> for authentication
+
+    **Deduplication:**
+    Content-addressable storage automatically deduplicates identical files.
+    If the same content is uploaded multiple times, only one copy is stored.
+
+    **Response Metrics:**
+    - `duration_ms`: Upload duration in milliseconds
+    - `throughput_mbps`: Upload throughput in MB/s
+    - `deduplicated`: True if content already existed
+
+    **Example (curl):**
+    ```bash
+    curl -X POST "http://localhost:8080/api/v1/project/myproject/mypackage/upload" \\
+      -H "Authorization: Bearer <token>" \\
+      -F "file=@myfile.tar.gz" \\
+      -F "tag=v1.0.0"
+    ```
+
+    **Example (Python requests):**
+    ```python
+    import requests
+    with open('myfile.tar.gz', 'rb') as f:
+        response = requests.post(
+            'http://localhost:8080/api/v1/project/myproject/mypackage/upload',
+            files={'file': f},
+            data={'tag': 'v1.0.0'},
+            headers={'Authorization': 'Bearer <token>'}
+        )
+    ```
+
+    **Example (JavaScript fetch):**
+    ```javascript
+    const formData = new FormData();
+    formData.append('file', fileInput.files[0]);
+    formData.append('tag', 'v1.0.0');
+    const response = await fetch('/api/v1/project/myproject/mypackage/upload', {
+        method: 'POST',
+        headers: { 'Authorization': 'Bearer <token>' },
+        body: formData
+    });
+    ```
     """
     start_time = time.time()
     settings = get_settings()
@@ -2363,6 +2435,30 @@ def upload_artifact(
     except StorageError as e:
         logger.error(f"Storage error during upload: {e}")
         raise HTTPException(status_code=500, detail="Internal storage error")
+    except (ConnectionResetError, BrokenPipeError) as e:
+        # Client disconnected during upload
+        logger.warning(
+            f"Client disconnected during upload: project={project_name} "
+            f"package={package_name} filename={file.filename} error={e}"
+        )
+        raise HTTPException(
+            status_code=499,  # Client Closed Request (nginx convention)
+            detail="Client disconnected during upload",
+        )
+    except Exception as e:
+        # Catch-all for unexpected errors including client disconnects
+        error_str = str(e).lower()
+        if "connection" in error_str or "broken pipe" in error_str or "reset" in error_str:
+            logger.warning(
+                f"Client connection error during upload: project={project_name} "
+                f"package={package_name} 
filename={file.filename} error={e}" + ) + raise HTTPException( + status_code=499, + detail="Client connection error during upload", + ) + logger.error(f"Unexpected error during upload: {e}", exc_info=True) + raise HTTPException(status_code=500, detail="Internal server error during upload") # Verify client-provided checksum if present checksum_verified = True @@ -2555,6 +2651,12 @@ def upload_artifact( detail="Failed to save upload record. Please retry.", ) + # Calculate throughput + throughput_mbps = None + if duration_ms > 0: + duration_seconds = duration_ms / 1000.0 + throughput_mbps = round((storage_result.size / (1024 * 1024)) / duration_seconds, 2) + return UploadResponse( artifact_id=storage_result.sha256, sha256=storage_result.sha256, @@ -2574,6 +2676,8 @@ def upload_artifact( content_type=artifact.content_type, original_name=artifact.original_name, created_at=artifact.created_at, + duration_ms=duration_ms, + throughput_mbps=throughput_mbps, ) @@ -2591,8 +2695,46 @@ def init_resumable_upload( storage: S3Storage = Depends(get_storage), ): """ - Initialize a resumable upload session. - Client must provide the SHA256 hash of the file in advance. + Initialize a resumable upload session for large files. + + Resumable uploads allow uploading large files in chunks, with the ability + to resume after interruption. The client must compute the SHA256 hash + of the entire file before starting. + + **Workflow:** + 1. POST /upload/init - Initialize upload session (this endpoint) + 2. PUT /upload/{upload_id}/part/{part_number} - Upload each part + 3. GET /upload/{upload_id}/progress - Check upload progress (optional) + 4. POST /upload/{upload_id}/complete - Finalize upload + 5. DELETE /upload/{upload_id} - Abort upload (if needed) + + **Chunk Size:** + Use the `chunk_size` returned in the response (10MB default). + Each part except the last must be exactly this size. + + **Deduplication:** + If the expected_hash already exists in storage, the response will include + `already_exists: true` and no upload session is created. 
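+
+    **Example (Python, illustrative):**
+    A minimal client-side sketch of the workflow above, assuming the
+    `requests` library and the default localhost URL; a real client would
+    stream parts from disk rather than hold the file in memory. See also
+    the curl example below.
+    ```python
+    import hashlib
+    import requests
+
+    BASE = "http://localhost:8080/api/v1/project/myproject/mypackage"
+    HEADERS = {"Authorization": "Bearer <token>"}
+    CHUNK = 10 * 1024 * 1024  # use the chunk_size returned by /upload/init
+
+    data = open("large.tar.gz", "rb").read()
+    init = requests.post(f"{BASE}/upload/init", headers=HEADERS, json={
+        "expected_hash": hashlib.sha256(data).hexdigest(),
+        "filename": "large.tar.gz",
+        "size": len(data),
+    }).json()
+
+    if not init["already_exists"]:
+        uid = init["upload_id"]
+        # Every part except the last must be exactly CHUNK bytes
+        for part, off in enumerate(range(0, len(data), CHUNK), start=1):
+            requests.put(f"{BASE}/upload/{uid}/part/{part}",
+                         headers=HEADERS, data=data[off:off + CHUNK])
+        # Optional: check progress, then finalize
+        requests.get(f"{BASE}/upload/{uid}/progress", headers=HEADERS)
+        requests.post(f"{BASE}/upload/{uid}/complete", headers=HEADERS,
+                      json={"tag": "v1.0.0"})
+    ```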
+
+    **Example (curl):**
+    ```bash
+    # Step 1: Initialize
+    curl -X POST "http://localhost:8080/api/v1/project/myproject/mypackage/upload/init" \\
+      -H "Authorization: Bearer <token>" \\
+      -H "Content-Type: application/json" \\
+      -d '{"expected_hash": "<sha256>", "filename": "large.tar.gz", "size": 104857600}'
+
+    # Step 2: Upload parts
+    curl -X PUT "http://localhost:8080/api/v1/project/myproject/mypackage/upload/<upload_id>/part/1" \\
+      -H "Authorization: Bearer <token>" \\
+      --data-binary @part1.bin
+
+    # Step 3: Complete
+    curl -X POST "http://localhost:8080/api/v1/project/myproject/mypackage/upload/<upload_id>/complete" \\
+      -H "Authorization: Bearer <token>" \\
+      -H "Content-Type: application/json" \\
+      -d '{"tag": "v1.0.0"}'
+    ```
     """
     user_id = get_user_id(request)
 
@@ -2686,6 +2828,10 @@ def init_resumable_upload(
     # Initialize resumable upload
     session = storage.initiate_resumable_upload(init_request.expected_hash)
 
+    # Set expected size for progress tracking
+    if session["upload_id"] and init_request.size:
+        storage.set_upload_expected_size(session["upload_id"], init_request.size)
+
     return ResumableUploadInitResponse(
         upload_id=session["upload_id"],
         already_exists=False,
@@ -2752,6 +2898,64 @@ def upload_part(
         raise HTTPException(status_code=404, detail=str(e))
 
 
+@router.get(
+    "/api/v1/project/{project_name}/{package_name}/upload/{upload_id}/progress",
+    response_model=UploadProgressResponse,
+)
+def get_upload_progress(
+    project_name: str,
+    package_name: str,
+    upload_id: str,
+    db: Session = Depends(get_db),
+    storage: S3Storage = Depends(get_storage),
+):
+    """
+    Get progress information for an in-flight resumable upload.
+
+    Returns progress metrics including bytes uploaded, percent complete,
+    elapsed time, and throughput.
+    """
+    # Validate project and package exist
+    project = db.query(Project).filter(Project.name == project_name).first()
+    if not project:
+        raise HTTPException(status_code=404, detail="Project not found")
+
+    package = (
+        db.query(Package)
+        .filter(Package.project_id == project.id, Package.name == package_name)
+        .first()
+    )
+    if not package:
+        raise HTTPException(status_code=404, detail="Package not found")
+
+    progress = storage.get_upload_progress(upload_id)
+    if not progress:
+        # Return not_found status instead of 404 to allow polling
+        return UploadProgressResponse(
+            upload_id=upload_id,
+            status="not_found",
+            bytes_uploaded=0,
+        )
+
+    from datetime import datetime, timezone
+    started_at_dt = None
+    if progress.get("started_at"):
+        started_at_dt = datetime.fromtimestamp(progress["started_at"], tz=timezone.utc)
+
+    return UploadProgressResponse(
+        upload_id=upload_id,
+        status=progress.get("status", "in_progress"),
+        bytes_uploaded=progress.get("bytes_uploaded", 0),
+        bytes_total=progress.get("bytes_total"),
+        percent_complete=progress.get("percent_complete"),
+        parts_uploaded=progress.get("parts_uploaded", 0),
+        parts_total=progress.get("parts_total"),
+        started_at=started_at_dt,
+        elapsed_seconds=progress.get("elapsed_seconds"),
+        throughput_mbps=progress.get("throughput_mbps"),
+    )
+
+
 @router.post(
     "/api/v1/project/{project_name}/{package_name}/upload/{upload_id}/complete"
 )
@@ -2947,6 +3151,8 @@ def download_artifact(
     storage: S3Storage = Depends(get_storage),
     current_user: Optional[User] = Depends(get_current_user_optional),
     range: Optional[str] = Header(None),
+    if_none_match: Optional[str] = Header(None, alias="If-None-Match"),
+    if_modified_since: Optional[str] = Header(None, alias="If-Modified-Since"),
     mode: Optional[Literal["proxy", "redirect", "presigned"]] = Query(
         default=None,
description="Download mode: proxy (stream through backend), redirect (302 to presigned URL), presigned (return JSON with URL)", @@ -2963,6 +3169,15 @@ def download_artifact( """ Download an artifact by reference (tag name, artifact:hash, tag:name). + Supports conditional requests: + - If-None-Match: Returns 304 Not Modified if ETag matches + - If-Modified-Since: Returns 304 Not Modified if not modified since date + + Supports range requests for partial downloads and resume: + - Range: bytes=0-1023 (first 1KB) + - Range: bytes=-1024 (last 1KB) + - Returns 206 Partial Content with Content-Range header + Verification modes: - verify=false (default): No verification, maximum performance - verify=true&verify_mode=stream: Compute hash while streaming, verify after completion. @@ -2975,6 +3190,9 @@ def download_artifact( - X-Content-Length: File size in bytes - ETag: Artifact ID (SHA256) - Digest: RFC 3230 format sha-256 hash + - Last-Modified: Artifact creation timestamp + - Cache-Control: Immutable caching for content-addressable storage + - Accept-Ranges: bytes (advertises range request support) When verify=true: - X-Verified: 'true' if verified, 'false' if verification failed @@ -2999,6 +3217,52 @@ def download_artifact( filename = sanitize_filename(artifact.original_name or f"{artifact.id}") + # Format Last-Modified header (RFC 7231 format) + last_modified = None + last_modified_str = None + if artifact.created_at: + last_modified = artifact.created_at + if last_modified.tzinfo is None: + last_modified = last_modified.replace(tzinfo=timezone.utc) + last_modified_str = last_modified.strftime("%a, %d %b %Y %H:%M:%S GMT") + + # Handle conditional requests (If-None-Match, If-Modified-Since) + # Return 304 Not Modified if content hasn't changed + artifact_etag = f'"{artifact.id}"' + + if if_none_match: + # Strip quotes and compare with artifact ETag + client_etag = if_none_match.strip().strip('"') + if client_etag == artifact.id or if_none_match == artifact_etag: + return Response( + status_code=304, + headers={ + "ETag": artifact_etag, + "Cache-Control": "public, max-age=31536000, immutable", + **({"Last-Modified": last_modified_str} if last_modified_str else {}), + }, + ) + + if if_modified_since and last_modified: + try: + # Parse If-Modified-Since header + from email.utils import parsedate_to_datetime + client_date = parsedate_to_datetime(if_modified_since) + if client_date.tzinfo is None: + client_date = client_date.replace(tzinfo=timezone.utc) + # If artifact hasn't been modified since client's date, return 304 + if last_modified <= client_date: + return Response( + status_code=304, + headers={ + "ETag": artifact_etag, + "Cache-Control": "public, max-age=31536000, immutable", + **({"Last-Modified": last_modified_str} if last_modified_str else {}), + }, + ) + except (ValueError, TypeError): + pass # Invalid date format, ignore and continue with download + # Audit log download user_id = get_user_id(request) _log_audit( @@ -3016,22 +3280,28 @@ def download_artifact( ) db.commit() - # Build common checksum headers (always included) - checksum_headers = { + # Build common headers (always included) + common_headers = { "X-Checksum-SHA256": artifact.id, "X-Content-Length": str(artifact.size), - "ETag": f'"{artifact.id}"', + "ETag": artifact_etag, + # Cache-Control: content-addressable storage is immutable + "Cache-Control": "public, max-age=31536000, immutable", } + # Add Last-Modified header + if last_modified_str: + common_headers["Last-Modified"] = last_modified_str + # Add RFC 3230 Digest 
header try: digest_base64 = sha256_to_base64(artifact.id) - checksum_headers["Digest"] = f"sha-256={digest_base64}" + common_headers["Digest"] = f"sha-256={digest_base64}" except Exception: pass # Skip if conversion fails # Add MD5 checksum if available if artifact.checksum_md5: - checksum_headers["X-Checksum-MD5"] = artifact.checksum_md5 + common_headers["X-Checksum-MD5"] = artifact.checksum_md5 # Determine download mode (query param overrides server default) download_mode = mode or settings.download_mode @@ -3071,15 +3341,29 @@ def download_artifact( # Proxy mode (default fallback) - stream through backend # Handle range requests (verification not supported for partial downloads) if range: - stream, content_length, content_range = storage.get_stream( - artifact.s3_key, range - ) + try: + stream, content_length, content_range = storage.get_stream( + artifact.s3_key, range + ) + except Exception as e: + # S3 returns InvalidRange error for unsatisfiable ranges + error_str = str(e).lower() + if "invalidrange" in error_str or "range" in error_str: + raise HTTPException( + status_code=416, + detail="Range Not Satisfiable", + headers={ + "Content-Range": f"bytes */{artifact.size}", + "Accept-Ranges": "bytes", + }, + ) + raise headers = { - "Content-Disposition": f'attachment; filename="{filename}"', + "Content-Disposition": build_content_disposition(filename), "Accept-Ranges": "bytes", "Content-Length": str(content_length), - **checksum_headers, + **common_headers, } if content_range: headers["Content-Range"] = content_range @@ -3094,9 +3378,9 @@ def download_artifact( # Full download with optional verification base_headers = { - "Content-Disposition": f'attachment; filename="{filename}"', + "Content-Disposition": build_content_disposition(filename), "Accept-Ranges": "bytes", - **checksum_headers, + **common_headers, } # Pre-verification mode: verify before streaming @@ -3164,11 +3448,42 @@ def download_artifact( }, ) - # No verification - direct streaming + # No verification - direct streaming with completion logging stream, content_length, _ = storage.get_stream(artifact.s3_key) + def logged_stream(): + """Generator that yields chunks and logs completion/disconnection.""" + import time + start_time = time.time() + bytes_sent = 0 + try: + for chunk in stream: + bytes_sent += len(chunk) + yield chunk + # Download completed successfully + duration = time.time() - start_time + throughput_mbps = (bytes_sent / (1024 * 1024)) / duration if duration > 0 else 0 + logger.info( + f"Download completed: artifact={artifact.id[:16]}... " + f"bytes={bytes_sent} duration={duration:.2f}s throughput={throughput_mbps:.2f}MB/s" + ) + except GeneratorExit: + # Client disconnected before download completed + duration = time.time() - start_time + logger.warning( + f"Download interrupted: artifact={artifact.id[:16]}... " + f"bytes_sent={bytes_sent}/{content_length} duration={duration:.2f}s" + ) + except Exception as e: + duration = time.time() - start_time + logger.error( + f"Download error: artifact={artifact.id[:16]}... 
" + f"bytes_sent={bytes_sent} duration={duration:.2f}s error={e}" + ) + raise + return StreamingResponse( - stream, + logged_stream(), media_type=artifact.content_type or "application/octet-stream", headers={ **base_headers, @@ -3276,7 +3591,7 @@ def head_artifact( # Build headers with checksum information headers = { - "Content-Disposition": f'attachment; filename="{filename}"', + "Content-Disposition": build_content_disposition(filename), "Accept-Ranges": "bytes", "Content-Length": str(artifact.size), "X-Artifact-Id": artifact.id, diff --git a/backend/app/schemas.py b/backend/app/schemas.py index 1fe395b..275f827 100644 --- a/backend/app/schemas.py +++ b/backend/app/schemas.py @@ -412,6 +412,9 @@ class UploadResponse(BaseModel): content_type: Optional[str] = None original_name: Optional[str] = None created_at: Optional[datetime] = None + # Upload metrics (Issue #43) + duration_ms: Optional[int] = None # Upload duration in milliseconds + throughput_mbps: Optional[float] = None # Upload throughput in MB/s # Resumable upload schemas @@ -478,6 +481,21 @@ class ResumableUploadStatusResponse(BaseModel): total_uploaded_bytes: int +class UploadProgressResponse(BaseModel): + """Progress information for an in-flight upload""" + + upload_id: str + status: str # 'in_progress', 'completed', 'failed', 'not_found' + bytes_uploaded: int = 0 + bytes_total: Optional[int] = None + percent_complete: Optional[float] = None + parts_uploaded: int = 0 + parts_total: Optional[int] = None + started_at: Optional[datetime] = None + elapsed_seconds: Optional[float] = None + throughput_mbps: Optional[float] = None + + # Consumer schemas class ConsumerResponse(BaseModel): id: UUID diff --git a/backend/app/storage.py b/backend/app/storage.py index ca3ffe3..cb7dbd4 100644 --- a/backend/app/storage.py +++ b/backend/app/storage.py @@ -378,10 +378,16 @@ class S3Storage: """ # First pass: compute all hashes by streaming through file try: + import time sha256_hasher = hashlib.sha256() md5_hasher = hashlib.md5() sha1_hasher = hashlib.sha1() size = 0 + hash_start_time = time.time() + last_log_time = hash_start_time + log_interval_seconds = 5 # Log progress every 5 seconds + + logger.info(f"Computing hashes for large file: expected_size={content_length}") # Read file in chunks to compute hashes while True: @@ -393,6 +399,18 @@ class S3Storage: sha1_hasher.update(chunk) size += len(chunk) + # Log hash computation progress periodically + current_time = time.time() + if current_time - last_log_time >= log_interval_seconds: + elapsed = current_time - hash_start_time + percent = (size / content_length) * 100 if content_length > 0 else 0 + throughput = (size / (1024 * 1024)) / elapsed if elapsed > 0 else 0 + logger.info( + f"Hash computation progress: bytes={size}/{content_length} ({percent:.1f}%) " + f"throughput={throughput:.2f}MB/s" + ) + last_log_time = current_time + # Enforce file size limit during streaming (protection against spoofing) if size > settings.max_file_size: raise FileSizeExceededError( @@ -405,6 +423,14 @@ class S3Storage: sha256_hash = sha256_hasher.hexdigest() md5_hash = md5_hasher.hexdigest() sha1_hash = sha1_hasher.hexdigest() + + # Log hash computation completion + hash_elapsed = time.time() - hash_start_time + hash_throughput = (size / (1024 * 1024)) / hash_elapsed if hash_elapsed > 0 else 0 + logger.info( + f"Hash computation completed: hash={sha256_hash[:16]}... 
" + f"size={size} duration={hash_elapsed:.2f}s throughput={hash_throughput:.2f}MB/s" + ) except (HashComputationError, FileSizeExceededError): raise except Exception as e: @@ -458,8 +484,19 @@ class S3Storage: upload_id = mpu["UploadId"] try: + import time parts = [] part_number = 1 + bytes_uploaded = 0 + upload_start_time = time.time() + last_log_time = upload_start_time + log_interval_seconds = 5 # Log progress every 5 seconds + + total_parts = (content_length + MULTIPART_CHUNK_SIZE - 1) // MULTIPART_CHUNK_SIZE + logger.info( + f"Starting multipart upload: hash={sha256_hash[:16]}... " + f"size={content_length} parts={total_parts}" + ) while True: chunk = file.read(MULTIPART_CHUNK_SIZE) @@ -479,8 +516,32 @@ class S3Storage: "ETag": response["ETag"], } ) + bytes_uploaded += len(chunk) + + # Log progress periodically + current_time = time.time() + if current_time - last_log_time >= log_interval_seconds: + elapsed = current_time - upload_start_time + percent = (bytes_uploaded / content_length) * 100 + throughput = (bytes_uploaded / (1024 * 1024)) / elapsed if elapsed > 0 else 0 + logger.info( + f"Upload progress: hash={sha256_hash[:16]}... " + f"part={part_number}/{total_parts} " + f"bytes={bytes_uploaded}/{content_length} ({percent:.1f}%) " + f"throughput={throughput:.2f}MB/s" + ) + last_log_time = current_time + part_number += 1 + # Log completion + total_elapsed = time.time() - upload_start_time + final_throughput = (content_length / (1024 * 1024)) / total_elapsed if total_elapsed > 0 else 0 + logger.info( + f"Multipart upload completed: hash={sha256_hash[:16]}... " + f"size={content_length} duration={total_elapsed:.2f}s throughput={final_throughput:.2f}MB/s" + ) + # Complete multipart upload complete_response = self.client.complete_multipart_upload( Bucket=self.bucket, @@ -502,12 +563,28 @@ class S3Storage: except Exception as e: # Abort multipart upload on failure - logger.error(f"Multipart upload failed: {e}") - self.client.abort_multipart_upload( - Bucket=self.bucket, - Key=s3_key, - UploadId=upload_id, + error_str = str(e).lower() + is_client_disconnect = ( + isinstance(e, (ConnectionResetError, BrokenPipeError)) or + "connection" in error_str or "broken pipe" in error_str or "reset" in error_str ) + if is_client_disconnect: + logger.warning( + f"Multipart upload aborted (client disconnect): hash={sha256_hash[:16]}... " + f"parts_uploaded={len(parts)} bytes_uploaded={bytes_uploaded}" + ) + else: + logger.error(f"Multipart upload failed: hash={sha256_hash[:16]}... 
error={e}") + + try: + self.client.abort_multipart_upload( + Bucket=self.bucket, + Key=s3_key, + UploadId=upload_id, + ) + logger.info(f"Multipart upload aborted and cleaned up: upload_id={upload_id[:16]}...") + except Exception as abort_error: + logger.error(f"Failed to abort multipart upload: {abort_error}") raise def initiate_resumable_upload(self, expected_hash: str) -> Dict[str, Any]: @@ -529,12 +606,17 @@ class S3Storage: mpu = self.client.create_multipart_upload(Bucket=self.bucket, Key=s3_key) upload_id = mpu["UploadId"] + import time session = { "upload_id": upload_id, "s3_key": s3_key, "already_exists": False, "parts": [], "expected_hash": expected_hash, + "started_at": time.time(), + "bytes_uploaded": 0, + "expected_size": None, # Set when init provides size + "status": "in_progress", } self._active_uploads[upload_id] = session return session @@ -561,10 +643,57 @@ class S3Storage: part_info = { "PartNumber": part_number, "ETag": response["ETag"], + "size": len(data), } session["parts"].append(part_info) + session["bytes_uploaded"] = session.get("bytes_uploaded", 0) + len(data) return part_info + def get_upload_progress(self, upload_id: str) -> Optional[Dict[str, Any]]: + """ + Get progress information for a resumable upload. + Returns None if upload not found. + """ + import time + session = self._active_uploads.get(upload_id) + if not session: + return None + + bytes_uploaded = session.get("bytes_uploaded", 0) + expected_size = session.get("expected_size") + started_at = session.get("started_at") + + progress = { + "upload_id": upload_id, + "status": session.get("status", "in_progress"), + "bytes_uploaded": bytes_uploaded, + "bytes_total": expected_size, + "parts_uploaded": len(session.get("parts", [])), + "parts_total": None, + "started_at": started_at, + "elapsed_seconds": None, + "percent_complete": None, + "throughput_mbps": None, + } + + if expected_size and expected_size > 0: + progress["percent_complete"] = round((bytes_uploaded / expected_size) * 100, 2) + progress["parts_total"] = (expected_size + MULTIPART_CHUNK_SIZE - 1) // MULTIPART_CHUNK_SIZE + + if started_at: + elapsed = time.time() - started_at + progress["elapsed_seconds"] = round(elapsed, 2) + if elapsed > 0 and bytes_uploaded > 0: + progress["throughput_mbps"] = round((bytes_uploaded / (1024 * 1024)) / elapsed, 2) + + return progress + + def set_upload_expected_size(self, upload_id: str, size: int): + """Set the expected size for an upload (for progress tracking).""" + session = self._active_uploads.get(upload_id) + if session: + session["expected_size"] = size + def complete_resumable_upload(self, upload_id: str) -> Tuple[str, str]: """ Complete a resumable upload. 
diff --git a/backend/pytest.ini b/backend/pytest.ini
index 4480451..13f1367 100644
--- a/backend/pytest.ini
+++ b/backend/pytest.ini
@@ -12,6 +12,8 @@ markers =
     unit: Unit tests (no external dependencies)
     integration: Integration tests (require database/storage)
     slow: Slow tests (skip with -m "not slow")
+    large: Large file tests (100MB+, skip with -m "not large")
+    concurrent: Concurrent operation tests
 
 # Coverage configuration
 [coverage:run]
diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py
index 9064602..71fb905 100644
--- a/backend/tests/conftest.py
+++ b/backend/tests/conftest.py
@@ -9,6 +9,37 @@ This module provides:
 
 import os
 import pytest
+
+
+# =============================================================================
+# Pytest Markers
+# =============================================================================
+
+
+def pytest_configure(config):
+    """Register custom pytest markers."""
+    config.addinivalue_line(
+        "markers",
+        "auth_intensive: marks tests that make many login requests (runs in CI because dev/stage deployments relax the login rate limit)",
+    )
+    config.addinivalue_line(
+        "markers",
+        "integration: marks tests as integration tests",
+    )
+    config.addinivalue_line(
+        "markers",
+        "large: marks tests that handle large files (slow)",
+    )
+    config.addinivalue_line(
+        "markers",
+        "slow: marks tests as slow running",
+    )
+    config.addinivalue_line(
+        "markers",
+        "requires_direct_s3: marks tests that require direct S3/MinIO access (skipped in CI where S3 is not directly accessible)",
+    )
+
+
 import io
 from typing import Generator
 from unittest.mock import MagicMock
@@ -32,6 +63,8 @@ from tests.factories import (
     compute_md5,
     compute_sha1,
     upload_test_file,
+    generate_content,
+    generate_content_with_hash,
     TEST_CONTENT_HELLO,
     TEST_HASH_HELLO,
     TEST_MD5_HELLO,
@@ -179,29 +212,64 @@ def test_app():
 # =============================================================================
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def integration_client():
     """
     Create an authenticated test client for integration tests.
 
-    Uses the real database and MinIO from docker-compose.local.yml.
-    Authenticates as admin for write operations.
+    Uses the real database and MinIO from docker-compose.local.yml or deployed environment.
+    Authenticates as admin for write operations. Session-scoped to reuse login across tests.
+ + Environment variables: + ORCHARD_TEST_URL: Base URL of the Orchard server (default: http://localhost:8080) + ORCHARD_TEST_USERNAME: Admin username for authentication (default: admin) + ORCHARD_TEST_PASSWORD: Admin password for authentication (default: changeme123) """ - from httpx import Client + import httpx - # Connect to the running orchard-server container + # Connect to the running orchard-server container or deployed environment base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + username = os.environ.get("ORCHARD_TEST_USERNAME", "admin") + password = os.environ.get("ORCHARD_TEST_PASSWORD", "changeme123") - with Client(base_url=base_url, timeout=30.0) as client: + with httpx.Client(base_url=base_url, timeout=30.0) as client: # Login as admin to enable write operations login_response = client.post( "/api/v1/auth/login", - json={"username": "admin", "password": "changeme123"}, + json={"username": username, "password": password}, ) - # If login fails, tests will fail - that's expected if auth is broken if login_response.status_code != 200: - # Try to continue without auth for backward compatibility - pass + pytest.fail( + f"Authentication failed against {base_url}: {login_response.status_code} - {login_response.text}. " + f"Set ORCHARD_TEST_USERNAME and ORCHARD_TEST_PASSWORD environment variables if using non-default credentials." + ) + + # Verify cookie was set + if not client.cookies: + pytest.fail( + f"Login succeeded but no session cookie was set. Response headers: {login_response.headers}" + ) + + yield client + + +@pytest.fixture +def auth_client(): + """ + Create a function-scoped test client for authentication tests. + + Unlike integration_client (session-scoped), this creates a fresh client + for each test. Use this for tests that manipulate authentication state + (login, logout, cookie clearing) to avoid polluting other tests. + + Environment variables: + ORCHARD_TEST_URL: Base URL of the Orchard server (default: http://localhost:8080) + """ + import httpx + + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with httpx.Client(base_url=base_url, timeout=30.0) as client: yield client @@ -271,3 +339,41 @@ def test_content(): content = f"test-content-{uuid.uuid4().hex}".encode() sha256 = compute_sha256(content) return (content, sha256) + + +@pytest.fixture +def sized_content(): + """ + Factory fixture for generating content of specific sizes. 
+ + Usage: + def test_example(sized_content): + content, hash = sized_content(1024) # 1KB + content, hash = sized_content(1024 * 1024) # 1MB + """ + def _generate(size: int, seed: int = None): + return generate_content_with_hash(size, seed) + return _generate + + +# ============================================================================= +# Size Constants for Tests +# ============================================================================= + +# Common file sizes for boundary testing +SIZE_1B = 1 +SIZE_1KB = 1024 +SIZE_10KB = 10 * 1024 +SIZE_100KB = 100 * 1024 +SIZE_1MB = 1024 * 1024 +SIZE_5MB = 5 * 1024 * 1024 +SIZE_10MB = 10 * 1024 * 1024 +SIZE_50MB = 50 * 1024 * 1024 +SIZE_100MB = 100 * 1024 * 1024 +SIZE_250MB = 250 * 1024 * 1024 +SIZE_500MB = 500 * 1024 * 1024 +SIZE_1GB = 1024 * 1024 * 1024 + +# Chunk size boundaries (based on typical S3 multipart chunk sizes) +CHUNK_SIZE = 64 * 1024 # 64KB typical chunk +MULTIPART_THRESHOLD = 100 * 1024 * 1024 # 100MB multipart threshold diff --git a/backend/tests/factories.py b/backend/tests/factories.py index a37acce..50112ea 100644 --- a/backend/tests/factories.py +++ b/backend/tests/factories.py @@ -130,6 +130,41 @@ def upload_test_file( return response.json() +def generate_content(size: int, seed: Optional[int] = None) -> bytes: + """ + Generate deterministic or random content of a specified size. + + Args: + size: Size of content in bytes + seed: Optional seed for reproducible content (None for random) + + Returns: + Bytes of the specified size + """ + if size == 0: + return b"" + if seed is not None: + import random + rng = random.Random(seed) + return bytes(rng.randint(0, 255) for _ in range(size)) + return os.urandom(size) + + +def generate_content_with_hash(size: int, seed: Optional[int] = None) -> tuple[bytes, str]: + """ + Generate content of specified size and compute its SHA256 hash. + + Args: + size: Size of content in bytes + seed: Optional seed for reproducible content + + Returns: + Tuple of (content_bytes, sha256_hash) + """ + content = generate_content(size, seed) + return content, compute_sha256(content) + + # ============================================================================= # Project/Package Factories # ============================================================================= diff --git a/backend/tests/integration/test_auth_api.py b/backend/tests/integration/test_auth_api.py index 817694a..13e5259 100644 --- a/backend/tests/integration/test_auth_api.py +++ b/backend/tests/integration/test_auth_api.py @@ -1,16 +1,25 @@ -"""Integration tests for authentication API endpoints.""" +"""Integration tests for authentication API endpoints. + +Note: These tests are marked as auth_intensive because they make many login +requests. Dev/stage deployments have relaxed rate limits (1000/minute) to +allow these tests to run. Production uses strict rate limits (5/minute). 
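+The login rate limit is configurable via the ORCHARD_LOGIN_RATE_LIMIT
+environment variable.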
+""" import pytest from uuid import uuid4 +# Mark all tests in this module as auth_intensive (informational, not excluded from CI) +pytestmark = pytest.mark.auth_intensive + + class TestAuthLogin: """Tests for login endpoint.""" @pytest.mark.integration - def test_login_success(self, integration_client): + def test_login_success(self, auth_client): """Test successful login with default admin credentials.""" - response = integration_client.post( + response = auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) @@ -21,9 +30,9 @@ class TestAuthLogin: assert "orchard_session" in response.cookies @pytest.mark.integration - def test_login_invalid_password(self, integration_client): + def test_login_invalid_password(self, auth_client): """Test login with wrong password.""" - response = integration_client.post( + response = auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "wrongpassword"}, ) @@ -31,9 +40,9 @@ class TestAuthLogin: assert "Invalid username or password" in response.json()["detail"] @pytest.mark.integration - def test_login_nonexistent_user(self, integration_client): + def test_login_nonexistent_user(self, auth_client): """Test login with non-existent user.""" - response = integration_client.post( + response = auth_client.post( "/api/v1/auth/login", json={"username": "nonexistent", "password": "password"}, ) @@ -44,24 +53,24 @@ class TestAuthLogout: """Tests for logout endpoint.""" @pytest.mark.integration - def test_logout_success(self, integration_client): + def test_logout_success(self, auth_client): """Test successful logout.""" # First login - login_response = integration_client.post( + login_response = auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) assert login_response.status_code == 200 # Then logout - logout_response = integration_client.post("/api/v1/auth/logout") + logout_response = auth_client.post("/api/v1/auth/logout") assert logout_response.status_code == 200 assert "Logged out successfully" in logout_response.json()["message"] @pytest.mark.integration - def test_logout_without_session(self, integration_client): + def test_logout_without_session(self, auth_client): """Test logout without being logged in.""" - response = integration_client.post("/api/v1/auth/logout") + response = auth_client.post("/api/v1/auth/logout") # Should succeed even without session assert response.status_code == 200 @@ -70,15 +79,15 @@ class TestAuthMe: """Tests for get current user endpoint.""" @pytest.mark.integration - def test_get_me_authenticated(self, integration_client): + def test_get_me_authenticated(self, auth_client): """Test getting current user when authenticated.""" # Login first - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) - response = integration_client.get("/api/v1/auth/me") + response = auth_client.get("/api/v1/auth/me") assert response.status_code == 200 data = response.json() assert data["username"] == "admin" @@ -87,67 +96,88 @@ class TestAuthMe: assert "created_at" in data @pytest.mark.integration - def test_get_me_unauthenticated(self, integration_client): + def test_get_me_unauthenticated(self, auth_client): """Test getting current user without authentication.""" # Clear any existing cookies - integration_client.cookies.clear() + auth_client.cookies.clear() - response = integration_client.get("/api/v1/auth/me") + response = auth_client.get("/api/v1/auth/me") assert 
response.status_code == 401 assert "Not authenticated" in response.json()["detail"] class TestAuthChangePassword: - """Tests for change password endpoint.""" + """Tests for change password endpoint. + + Note: These tests use dedicated test users instead of admin to avoid + invalidating the integration_client session (which uses admin). + """ @pytest.mark.integration - def test_change_password_success(self, integration_client): + def test_change_password_success(self, auth_client): """Test successful password change.""" - # Login first - integration_client.post( + # Login as admin to create a test user + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) + test_username = f"pwchange_{uuid4().hex[:8]}" + auth_client.post( + "/api/v1/admin/users", + json={"username": test_username, "password": "oldpassword123"}, + ) + + # Login as test user + auth_client.cookies.clear() + auth_client.post( + "/api/v1/auth/login", + json={"username": test_username, "password": "oldpassword123"}, + ) # Change password - response = integration_client.post( + response = auth_client.post( "/api/v1/auth/change-password", - json={"current_password": "changeme123", "new_password": "newpassword123"}, + json={"current_password": "oldpassword123", "new_password": "newpassword123"}, ) assert response.status_code == 200 # Verify old password no longer works - integration_client.cookies.clear() - response = integration_client.post( + auth_client.cookies.clear() + response = auth_client.post( "/api/v1/auth/login", - json={"username": "admin", "password": "changeme123"}, + json={"username": test_username, "password": "oldpassword123"}, ) assert response.status_code == 401 # Verify new password works - response = integration_client.post( + response = auth_client.post( "/api/v1/auth/login", - json={"username": "admin", "password": "newpassword123"}, + json={"username": test_username, "password": "newpassword123"}, ) assert response.status_code == 200 - # Reset password back to original for other tests - reset_response = integration_client.post( - "/api/v1/auth/change-password", - json={"current_password": "newpassword123", "new_password": "changeme123"}, - ) - assert reset_response.status_code == 200, "Failed to reset admin password back to default" - @pytest.mark.integration - def test_change_password_wrong_current(self, integration_client): + def test_change_password_wrong_current(self, auth_client): """Test password change with wrong current password.""" - # Login first - integration_client.post( + # Login as admin to create a test user + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) + test_username = f"pwwrong_{uuid4().hex[:8]}" + auth_client.post( + "/api/v1/admin/users", + json={"username": test_username, "password": "password123"}, + ) - response = integration_client.post( + # Login as test user + auth_client.cookies.clear() + auth_client.post( + "/api/v1/auth/login", + json={"username": test_username, "password": "password123"}, + ) + + response = auth_client.post( "/api/v1/auth/change-password", json={"current_password": "wrongpassword", "new_password": "newpassword"}, ) @@ -159,16 +189,16 @@ class TestAPIKeys: """Tests for API key management endpoints.""" @pytest.mark.integration - def test_create_and_list_api_key(self, integration_client): + def test_create_and_list_api_key(self, auth_client): """Test creating and listing API keys.""" # Login first - integration_client.post( + auth_client.post( "/api/v1/auth/login", 
json={"username": "admin", "password": "changeme123"}, ) # Create API key - create_response = integration_client.post( + create_response = auth_client.post( "/api/v1/auth/keys", json={"name": "test-key", "description": "Test API key"}, ) @@ -182,23 +212,23 @@ class TestAPIKeys: api_key = data["key"] # List API keys - list_response = integration_client.get("/api/v1/auth/keys") + list_response = auth_client.get("/api/v1/auth/keys") assert list_response.status_code == 200 keys = list_response.json() assert any(k["id"] == key_id for k in keys) # Clean up - delete the key - integration_client.delete(f"/api/v1/auth/keys/{key_id}") + auth_client.delete(f"/api/v1/auth/keys/{key_id}") @pytest.mark.integration - def test_use_api_key_for_auth(self, integration_client): + def test_use_api_key_for_auth(self, auth_client): """Test using API key for authentication.""" # Login and create API key - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) - create_response = integration_client.post( + create_response = auth_client.post( "/api/v1/auth/keys", json={"name": "auth-test-key"}, ) @@ -206,8 +236,8 @@ class TestAPIKeys: key_id = create_response.json()["id"] # Clear cookies and use API key - integration_client.cookies.clear() - response = integration_client.get( + auth_client.cookies.clear() + response = auth_client.get( "/api/v1/auth/me", headers={"Authorization": f"Bearer {api_key}"}, ) @@ -215,21 +245,21 @@ class TestAPIKeys: assert response.json()["username"] == "admin" # Clean up - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) - integration_client.delete(f"/api/v1/auth/keys/{key_id}") + auth_client.delete(f"/api/v1/auth/keys/{key_id}") @pytest.mark.integration - def test_delete_api_key(self, integration_client): + def test_delete_api_key(self, auth_client): """Test revoking an API key.""" # Login and create API key - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) - create_response = integration_client.post( + create_response = auth_client.post( "/api/v1/auth/keys", json={"name": "delete-test-key"}, ) @@ -237,12 +267,12 @@ class TestAPIKeys: api_key = create_response.json()["key"] # Delete the key - delete_response = integration_client.delete(f"/api/v1/auth/keys/{key_id}") + delete_response = auth_client.delete(f"/api/v1/auth/keys/{key_id}") assert delete_response.status_code == 200 # Verify key no longer works - integration_client.cookies.clear() - response = integration_client.get( + auth_client.cookies.clear() + response = auth_client.get( "/api/v1/auth/me", headers={"Authorization": f"Bearer {api_key}"}, ) @@ -253,32 +283,32 @@ class TestAdminUserManagement: """Tests for admin user management endpoints.""" @pytest.mark.integration - def test_list_users(self, integration_client): + def test_list_users(self, auth_client): """Test listing users as admin.""" # Login as admin - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) - response = integration_client.get("/api/v1/admin/users") + response = auth_client.get("/api/v1/admin/users") assert response.status_code == 200 users = response.json() assert len(users) >= 1 assert any(u["username"] == "admin" for u in users) @pytest.mark.integration - def test_create_user(self, integration_client): + def test_create_user(self, auth_client): """Test creating a new 
user as admin.""" # Login as admin - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) # Create new user test_username = f"testuser_{uuid4().hex[:8]}" - response = integration_client.post( + response = auth_client.post( "/api/v1/admin/users", json={ "username": test_username, @@ -293,31 +323,31 @@ class TestAdminUserManagement: assert data["is_admin"] is False # Verify new user can login - integration_client.cookies.clear() - login_response = integration_client.post( + auth_client.cookies.clear() + login_response = auth_client.post( "/api/v1/auth/login", json={"username": test_username, "password": "testpassword"}, ) assert login_response.status_code == 200 @pytest.mark.integration - def test_update_user(self, integration_client): + def test_update_user(self, auth_client): """Test updating a user as admin.""" # Login as admin - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) # Create a test user test_username = f"updateuser_{uuid4().hex[:8]}" - integration_client.post( + auth_client.post( "/api/v1/admin/users", json={"username": test_username, "password": "password"}, ) # Update the user - response = integration_client.put( + response = auth_client.put( f"/api/v1/admin/users/{test_username}", json={"email": "updated@example.com", "is_admin": True}, ) @@ -327,59 +357,59 @@ class TestAdminUserManagement: assert data["is_admin"] is True @pytest.mark.integration - def test_reset_user_password(self, integration_client): + def test_reset_user_password(self, auth_client): """Test resetting a user's password as admin.""" # Login as admin - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) # Create a test user test_username = f"resetuser_{uuid4().hex[:8]}" - integration_client.post( + auth_client.post( "/api/v1/admin/users", json={"username": test_username, "password": "oldpassword"}, ) # Reset password - response = integration_client.post( + response = auth_client.post( f"/api/v1/admin/users/{test_username}/reset-password", json={"new_password": "newpassword"}, ) assert response.status_code == 200 # Verify new password works - integration_client.cookies.clear() - login_response = integration_client.post( + auth_client.cookies.clear() + login_response = auth_client.post( "/api/v1/auth/login", json={"username": test_username, "password": "newpassword"}, ) assert login_response.status_code == 200 @pytest.mark.integration - def test_non_admin_cannot_access_admin_endpoints(self, integration_client): + def test_non_admin_cannot_access_admin_endpoints(self, auth_client): """Test that non-admin users cannot access admin endpoints.""" # Login as admin and create non-admin user - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) test_username = f"nonadmin_{uuid4().hex[:8]}" - integration_client.post( + auth_client.post( "/api/v1/admin/users", json={"username": test_username, "password": "password", "is_admin": False}, ) # Login as non-admin - integration_client.cookies.clear() - integration_client.post( + auth_client.cookies.clear() + auth_client.post( "/api/v1/auth/login", json={"username": test_username, "password": "password"}, ) # Try to access admin endpoints - response = integration_client.get("/api/v1/admin/users") + response = auth_client.get("/api/v1/admin/users") assert response.status_code == 403 assert 
"Admin privileges required" in response.json()["detail"] @@ -388,28 +418,28 @@ class TestSecurityEdgeCases: """Tests for security edge cases and validation.""" @pytest.mark.integration - def test_login_inactive_user(self, integration_client): + def test_login_inactive_user(self, auth_client): """Test that inactive users cannot login.""" # Login as admin and create a user - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) test_username = f"inactive_{uuid4().hex[:8]}" - integration_client.post( + auth_client.post( "/api/v1/admin/users", json={"username": test_username, "password": "password123"}, ) # Deactivate the user - integration_client.put( + auth_client.put( f"/api/v1/admin/users/{test_username}", json={"is_active": False}, ) # Try to login as inactive user - integration_client.cookies.clear() - response = integration_client.post( + auth_client.cookies.clear() + response = auth_client.post( "/api/v1/auth/login", json={"username": test_username, "password": "password123"}, ) @@ -417,14 +447,14 @@ class TestSecurityEdgeCases: assert "Invalid username or password" in response.json()["detail"] @pytest.mark.integration - def test_password_too_short_on_create(self, integration_client): + def test_password_too_short_on_create(self, auth_client): """Test that short passwords are rejected when creating users.""" - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) - response = integration_client.post( + response = auth_client.post( "/api/v1/admin/users", json={"username": f"shortpw_{uuid4().hex[:8]}", "password": "short"}, ) @@ -432,36 +462,49 @@ class TestSecurityEdgeCases: assert "at least 8 characters" in response.json()["detail"] @pytest.mark.integration - def test_password_too_short_on_change(self, integration_client): + def test_password_too_short_on_change(self, auth_client): """Test that short passwords are rejected when changing password.""" - integration_client.post( + # Create test user + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) + test_username = f"shortchange_{uuid4().hex[:8]}" + auth_client.post( + "/api/v1/admin/users", + json={"username": test_username, "password": "password123"}, + ) - response = integration_client.post( + # Login as test user + auth_client.cookies.clear() + auth_client.post( + "/api/v1/auth/login", + json={"username": test_username, "password": "password123"}, + ) + + response = auth_client.post( "/api/v1/auth/change-password", - json={"current_password": "changeme123", "new_password": "short"}, + json={"current_password": "password123", "new_password": "short"}, ) assert response.status_code == 400 assert "at least 8 characters" in response.json()["detail"] @pytest.mark.integration - def test_password_too_short_on_reset(self, integration_client): + def test_password_too_short_on_reset(self, auth_client): """Test that short passwords are rejected when resetting password.""" - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) # Create a test user first test_username = f"resetshort_{uuid4().hex[:8]}" - integration_client.post( + auth_client.post( "/api/v1/admin/users", json={"username": test_username, "password": "password123"}, ) - response = integration_client.post( + response = auth_client.post( f"/api/v1/admin/users/{test_username}/reset-password", json={"new_password": "short"}, ) @@ 
-469,23 +512,23 @@ class TestSecurityEdgeCases: assert "at least 8 characters" in response.json()["detail"] @pytest.mark.integration - def test_duplicate_username_rejected(self, integration_client): + def test_duplicate_username_rejected(self, auth_client): """Test that duplicate usernames are rejected.""" - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) test_username = f"duplicate_{uuid4().hex[:8]}" # Create user first time - response1 = integration_client.post( + response1 = auth_client.post( "/api/v1/admin/users", json={"username": test_username, "password": "password123"}, ) assert response1.status_code == 200 # Try to create same username again - response2 = integration_client.post( + response2 = auth_client.post( "/api/v1/admin/users", json={"username": test_username, "password": "password456"}, ) @@ -493,14 +536,14 @@ class TestSecurityEdgeCases: assert "already exists" in response2.json()["detail"] @pytest.mark.integration - def test_cannot_delete_other_users_api_key(self, integration_client): + def test_cannot_delete_other_users_api_key(self, auth_client): """Test that users cannot delete API keys owned by other users.""" # Login as admin and create an API key - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) - create_response = integration_client.post( + create_response = auth_client.post( "/api/v1/auth/keys", json={"name": "admin-key"}, ) @@ -508,253 +551,65 @@ class TestSecurityEdgeCases: # Create a non-admin user test_username = f"nonadmin_{uuid4().hex[:8]}" - integration_client.post( + auth_client.post( "/api/v1/admin/users", json={"username": test_username, "password": "password123"}, ) # Login as non-admin - integration_client.cookies.clear() - integration_client.post( + auth_client.cookies.clear() + auth_client.post( "/api/v1/auth/login", json={"username": test_username, "password": "password123"}, ) # Try to delete admin's API key - response = integration_client.delete(f"/api/v1/auth/keys/{admin_key_id}") + response = auth_client.delete(f"/api/v1/auth/keys/{admin_key_id}") assert response.status_code == 403 assert "Cannot delete another user's API key" in response.json()["detail"] # Cleanup: login as admin and delete the key - integration_client.cookies.clear() - integration_client.post( + auth_client.cookies.clear() + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) - integration_client.delete(f"/api/v1/auth/keys/{admin_key_id}") + auth_client.delete(f"/api/v1/auth/keys/{admin_key_id}") @pytest.mark.integration - def test_sessions_invalidated_on_password_change(self, integration_client): + def test_sessions_invalidated_on_password_change(self, auth_client): """Test that all sessions are invalidated when password is changed.""" # Create a test user - integration_client.post( + auth_client.post( "/api/v1/auth/login", json={"username": "admin", "password": "changeme123"}, ) test_username = f"sessiontest_{uuid4().hex[:8]}" - integration_client.post( + auth_client.post( "/api/v1/admin/users", json={"username": test_username, "password": "password123"}, ) # Login as test user - integration_client.cookies.clear() - login_response = integration_client.post( + auth_client.cookies.clear() + login_response = auth_client.post( "/api/v1/auth/login", json={"username": test_username, "password": "password123"}, ) assert login_response.status_code == 200 # Verify session works - 
me_response = integration_client.get("/api/v1/auth/me") + me_response = auth_client.get("/api/v1/auth/me") assert me_response.status_code == 200 # Change password - integration_client.post( + auth_client.post( "/api/v1/auth/change-password", json={"current_password": "password123", "new_password": "newpassword123"}, ) # Old session should be invalidated - try to access /me # (note: the change-password call itself may have cleared the session cookie) - me_response2 = integration_client.get("/api/v1/auth/me") - # This should fail because all sessions were invalidated - assert me_response2.status_code == 401 - - -class TestSecurityEdgeCases: - """Tests for security edge cases and validation.""" - - @pytest.mark.integration - def test_login_inactive_user(self, integration_client): - """Test that inactive users cannot login.""" - # Login as admin and create a user - integration_client.post( - "/api/v1/auth/login", - json={"username": "admin", "password": "changeme123"}, - ) - test_username = f"inactive_{uuid4().hex[:8]}" - integration_client.post( - "/api/v1/admin/users", - json={"username": test_username, "password": "password123"}, - ) - - # Deactivate the user - integration_client.put( - f"/api/v1/admin/users/{test_username}", - json={"is_active": False}, - ) - - # Try to login as inactive user - integration_client.cookies.clear() - response = integration_client.post( - "/api/v1/auth/login", - json={"username": test_username, "password": "password123"}, - ) - assert response.status_code == 401 - assert "Invalid username or password" in response.json()["detail"] - - @pytest.mark.integration - def test_password_too_short_on_create(self, integration_client): - """Test that short passwords are rejected when creating users.""" - integration_client.post( - "/api/v1/auth/login", - json={"username": "admin", "password": "changeme123"}, - ) - - response = integration_client.post( - "/api/v1/admin/users", - json={"username": f"shortpw_{uuid4().hex[:8]}", "password": "short"}, - ) - assert response.status_code == 400 - assert "at least 8 characters" in response.json()["detail"] - - @pytest.mark.integration - def test_password_too_short_on_change(self, integration_client): - """Test that short passwords are rejected when changing password.""" - integration_client.post( - "/api/v1/auth/login", - json={"username": "admin", "password": "changeme123"}, - ) - - response = integration_client.post( - "/api/v1/auth/change-password", - json={"current_password": "changeme123", "new_password": "short"}, - ) - assert response.status_code == 400 - assert "at least 8 characters" in response.json()["detail"] - - @pytest.mark.integration - def test_password_too_short_on_reset(self, integration_client): - """Test that short passwords are rejected when resetting password.""" - integration_client.post( - "/api/v1/auth/login", - json={"username": "admin", "password": "changeme123"}, - ) - - # Create a test user first - test_username = f"resetshort_{uuid4().hex[:8]}" - integration_client.post( - "/api/v1/admin/users", - json={"username": test_username, "password": "password123"}, - ) - - response = integration_client.post( - f"/api/v1/admin/users/{test_username}/reset-password", - json={"new_password": "short"}, - ) - assert response.status_code == 400 - assert "at least 8 characters" in response.json()["detail"] - - @pytest.mark.integration - def test_duplicate_username_rejected(self, integration_client): - """Test that duplicate usernames are rejected.""" - integration_client.post( - "/api/v1/auth/login", - 
json={"username": "admin", "password": "changeme123"}, - ) - - test_username = f"duplicate_{uuid4().hex[:8]}" - # Create user first time - response1 = integration_client.post( - "/api/v1/admin/users", - json={"username": test_username, "password": "password123"}, - ) - assert response1.status_code == 200 - - # Try to create same username again - response2 = integration_client.post( - "/api/v1/admin/users", - json={"username": test_username, "password": "password456"}, - ) - assert response2.status_code == 409 - assert "already exists" in response2.json()["detail"] - - @pytest.mark.integration - def test_cannot_delete_other_users_api_key(self, integration_client): - """Test that users cannot delete API keys owned by other users.""" - # Login as admin and create an API key - integration_client.post( - "/api/v1/auth/login", - json={"username": "admin", "password": "changeme123"}, - ) - create_response = integration_client.post( - "/api/v1/auth/keys", - json={"name": "admin-key"}, - ) - admin_key_id = create_response.json()["id"] - - # Create a non-admin user - test_username = f"nonadmin_{uuid4().hex[:8]}" - integration_client.post( - "/api/v1/admin/users", - json={"username": test_username, "password": "password123"}, - ) - - # Login as non-admin - integration_client.cookies.clear() - integration_client.post( - "/api/v1/auth/login", - json={"username": test_username, "password": "password123"}, - ) - - # Try to delete admin's API key - response = integration_client.delete(f"/api/v1/auth/keys/{admin_key_id}") - assert response.status_code == 403 - assert "Cannot delete another user's API key" in response.json()["detail"] - - # Cleanup: login as admin and delete the key - integration_client.cookies.clear() - integration_client.post( - "/api/v1/auth/login", - json={"username": "admin", "password": "changeme123"}, - ) - integration_client.delete(f"/api/v1/auth/keys/{admin_key_id}") - - @pytest.mark.integration - def test_sessions_invalidated_on_password_change(self, integration_client): - """Test that all sessions are invalidated when password is changed.""" - # Create a test user - integration_client.post( - "/api/v1/auth/login", - json={"username": "admin", "password": "changeme123"}, - ) - test_username = f"sessiontest_{uuid4().hex[:8]}" - integration_client.post( - "/api/v1/admin/users", - json={"username": test_username, "password": "password123"}, - ) - - # Login as test user - integration_client.cookies.clear() - login_response = integration_client.post( - "/api/v1/auth/login", - json={"username": test_username, "password": "password123"}, - ) - assert login_response.status_code == 200 - - # Verify session works - me_response = integration_client.get("/api/v1/auth/me") - assert me_response.status_code == 200 - - # Change password - integration_client.post( - "/api/v1/auth/change-password", - json={"current_password": "password123", "new_password": "newpassword123"}, - ) - - # Old session should be invalidated - try to access /me - # (note: the change-password call itself may have cleared the session cookie) - me_response2 = integration_client.get("/api/v1/auth/me") + me_response2 = auth_client.get("/api/v1/auth/me") # This should fail because all sessions were invalidated assert me_response2.status_code == 401 diff --git a/backend/tests/integration/test_concurrent_operations.py b/backend/tests/integration/test_concurrent_operations.py new file mode 100644 index 0000000..4237cf4 --- /dev/null +++ b/backend/tests/integration/test_concurrent_operations.py @@ -0,0 +1,737 @@ +""" +Integration 
tests for concurrent upload and download operations. + +Tests cover: +- Concurrent uploads of different files +- Concurrent uploads of same file (deduplication race) +- Concurrent downloads of same artifact +- Concurrent downloads of different artifacts +- Mixed concurrent uploads and downloads +- Data corruption prevention under concurrency +""" + +import pytest +import io +import os +from concurrent.futures import ThreadPoolExecutor, as_completed +from tests.factories import ( + compute_sha256, + upload_test_file, + generate_content_with_hash, +) + + +def get_api_key(integration_client): + """Create an API key for concurrent test workers.""" + import uuid + response = integration_client.post( + "/api/v1/auth/keys", + json={"name": f"concurrent-test-{uuid.uuid4().hex[:8]}"}, + ) + if response.status_code == 200: + return response.json()["key"] + return None + + +class TestConcurrentUploads: + """Tests for concurrent upload operations.""" + + @pytest.mark.integration + @pytest.mark.concurrent + def test_2_concurrent_uploads_different_files(self, integration_client, test_package): + """Test 2 concurrent uploads of different files.""" + project, package = test_package + api_key = get_api_key(integration_client) + assert api_key, "Failed to create API key" + + files_data = [ + generate_content_with_hash(1024, seed=i) for i in range(2) + ] + + results = [] + errors = [] + + def upload_worker(idx, content, expected_hash): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with Client(base_url=base_url, timeout=60.0) as client: + files = { + "file": (f"file-{idx}.bin", io.BytesIO(content), "application/octet-stream") + } + response = client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"tag": f"concurrent-{idx}"}, + headers={"Authorization": f"Bearer {api_key}"}, + ) + if response.status_code == 200: + result = response.json() + results.append((idx, result, expected_hash)) + else: + errors.append(f"Worker {idx}: Status {response.status_code}: {response.text}") + except Exception as e: + errors.append(f"Worker {idx}: {str(e)}") + + with ThreadPoolExecutor(max_workers=2) as executor: + futures = [ + executor.submit(upload_worker, i, content, hash) + for i, (content, hash) in enumerate(files_data) + ] + for future in as_completed(futures): + pass + + assert len(errors) == 0, f"Errors: {errors}" + assert len(results) == 2 + + # Verify each upload returned correct artifact_id + for idx, result, expected_hash in results: + assert result["artifact_id"] == expected_hash + + @pytest.mark.integration + @pytest.mark.concurrent + def test_5_concurrent_uploads_different_files(self, integration_client, test_package): + """Test 5 concurrent uploads of different files.""" + project, package = test_package + api_key = get_api_key(integration_client) + assert api_key, "Failed to create API key" + + num_files = 5 + files_data = [ + generate_content_with_hash(2048, seed=100 + i) for i in range(num_files) + ] + + results = [] + errors = [] + + def upload_worker(idx, content, expected_hash): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with Client(base_url=base_url, timeout=60.0) as client: + files = { + "file": (f"file-{idx}.bin", io.BytesIO(content), "application/octet-stream") + } + response = client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"tag": f"concurrent5-{idx}"}, + headers={"Authorization": f"Bearer {api_key}"}, + ) 
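+                    # Record the outcome for the main thread to assert on after
+                    # the pool joins; list.append is atomic under the CPython
+                    # GIL, so the shared results/errors lists need no locking.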
+ if response.status_code == 200: + result = response.json() + results.append((idx, result, expected_hash)) + else: + errors.append(f"Worker {idx}: Status {response.status_code}") + except Exception as e: + errors.append(f"Worker {idx}: {str(e)}") + + with ThreadPoolExecutor(max_workers=num_files) as executor: + futures = [ + executor.submit(upload_worker, i, content, hash) + for i, (content, hash) in enumerate(files_data) + ] + for future in as_completed(futures): + pass + + assert len(errors) == 0, f"Errors: {errors}" + assert len(results) == num_files + + # Verify all uploads have unique artifact_ids + artifact_ids = set(r[1]["artifact_id"] for r in results) + assert len(artifact_ids) == num_files + + @pytest.mark.integration + @pytest.mark.concurrent + def test_10_concurrent_uploads_different_files(self, integration_client, test_package): + """Test 10 concurrent uploads of different files.""" + project, package = test_package + api_key = get_api_key(integration_client) + assert api_key, "Failed to create API key" + + num_files = 10 + files_data = [ + generate_content_with_hash(1024, seed=200 + i) for i in range(num_files) + ] + + results = [] + errors = [] + + def upload_worker(idx, content, expected_hash): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with Client(base_url=base_url, timeout=60.0) as client: + files = { + "file": (f"file-{idx}.bin", io.BytesIO(content), "application/octet-stream") + } + response = client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"tag": f"concurrent10-{idx}"}, + headers={"Authorization": f"Bearer {api_key}"}, + ) + if response.status_code == 200: + result = response.json() + results.append((idx, result, expected_hash)) + else: + errors.append(f"Worker {idx}: Status {response.status_code}") + except Exception as e: + errors.append(f"Worker {idx}: {str(e)}") + + with ThreadPoolExecutor(max_workers=num_files) as executor: + futures = [ + executor.submit(upload_worker, i, content, hash) + for i, (content, hash) in enumerate(files_data) + ] + for future in as_completed(futures): + pass + + assert len(errors) == 0, f"Errors: {errors}" + assert len(results) == num_files + + @pytest.mark.integration + @pytest.mark.concurrent + def test_concurrent_uploads_same_file_deduplication(self, integration_client, test_package): + """Test concurrent uploads of same file handle deduplication correctly.""" + project, package = test_package + api_key = get_api_key(integration_client) + assert api_key, "Failed to create API key" + + content, expected_hash = generate_content_with_hash(4096, seed=999) + num_concurrent = 5 + + results = [] + errors = [] + + def upload_worker(idx): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with Client(base_url=base_url, timeout=60.0) as client: + files = { + "file": (f"same-{idx}.bin", io.BytesIO(content), "application/octet-stream") + } + response = client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"tag": f"dedup-{idx}"}, + headers={"Authorization": f"Bearer {api_key}"}, + ) + if response.status_code == 200: + results.append(response.json()) + else: + errors.append(f"Worker {idx}: Status {response.status_code}") + except Exception as e: + errors.append(f"Worker {idx}: {str(e)}") + + with ThreadPoolExecutor(max_workers=num_concurrent) as executor: + futures = [executor.submit(upload_worker, i) for i in range(num_concurrent)] + for future in 
as_completed(futures): + pass + + assert len(errors) == 0, f"Errors: {errors}" + assert len(results) == num_concurrent + + # All should have same artifact_id + artifact_ids = set(r["artifact_id"] for r in results) + assert len(artifact_ids) == 1 + assert expected_hash in artifact_ids + + # Verify final ref_count equals number of uploads + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.status_code == 200 + assert response.json()["ref_count"] == num_concurrent + + @pytest.mark.integration + @pytest.mark.concurrent + def test_concurrent_uploads_to_different_packages(self, integration_client, test_project, unique_test_id): + """Test concurrent uploads to different packages.""" + project = test_project + api_key = get_api_key(integration_client) + assert api_key, "Failed to create API key" + + num_packages = 3 + package_names = [] + + # Create multiple packages + for i in range(num_packages): + pkg_name = f"pkg-{unique_test_id}-{i}" + response = integration_client.post( + f"/api/v1/project/{project}/packages", + json={"name": pkg_name, "description": f"Package {i}"}, + ) + assert response.status_code == 200 + package_names.append(pkg_name) + + files_data = [ + generate_content_with_hash(1024, seed=300 + i) for i in range(num_packages) + ] + + results = [] + errors = [] + + def upload_worker(idx, package, content, expected_hash): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with Client(base_url=base_url, timeout=60.0) as client: + files = { + "file": (f"file-{idx}.bin", io.BytesIO(content), "application/octet-stream") + } + response = client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"tag": "latest"}, + headers={"Authorization": f"Bearer {api_key}"}, + ) + if response.status_code == 200: + result = response.json() + results.append((package, result, expected_hash)) + else: + errors.append(f"Worker {idx}: Status {response.status_code}") + except Exception as e: + errors.append(f"Worker {idx}: {str(e)}") + + with ThreadPoolExecutor(max_workers=num_packages) as executor: + futures = [ + executor.submit(upload_worker, i, package_names[i], content, hash) + for i, (content, hash) in enumerate(files_data) + ] + for future in as_completed(futures): + pass + + assert len(errors) == 0, f"Errors: {errors}" + assert len(results) == num_packages + + +class TestConcurrentDownloads: + """Tests for concurrent download operations.""" + + @pytest.mark.integration + @pytest.mark.concurrent + def test_2_concurrent_downloads_same_artifact(self, integration_client, test_package): + """Test 2 concurrent downloads of same artifact.""" + project, package = test_package + content, expected_hash = generate_content_with_hash(2048, seed=400) + + # Upload first + upload_test_file(integration_client, project, package, content, tag="download-test") + + results = [] + errors = [] + + def download_worker(idx): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with Client(base_url=base_url, timeout=60.0) as client: + response = client.get( + f"/api/v1/project/{project}/{package}/+/download-test", + params={"mode": "proxy"}, + ) + if response.status_code == 200: + results.append((idx, response.content)) + else: + errors.append(f"Worker {idx}: Status {response.status_code}") + except Exception as e: + errors.append(f"Worker {idx}: {str(e)}") + + with ThreadPoolExecutor(max_workers=2) as executor: + futures = 
[executor.submit(download_worker, i) for i in range(2)] + for future in as_completed(futures): + pass + + assert len(errors) == 0, f"Errors: {errors}" + assert len(results) == 2 + + # All downloads should match original + for idx, downloaded in results: + assert downloaded == content + + @pytest.mark.integration + @pytest.mark.concurrent + def test_5_concurrent_downloads_same_artifact(self, integration_client, test_package): + """Test 5 concurrent downloads of same artifact.""" + project, package = test_package + content, expected_hash = generate_content_with_hash(4096, seed=500) + + upload_test_file(integration_client, project, package, content, tag="download5-test") + + num_downloads = 5 + results = [] + errors = [] + + def download_worker(idx): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with Client(base_url=base_url, timeout=60.0) as client: + response = client.get( + f"/api/v1/project/{project}/{package}/+/download5-test", + params={"mode": "proxy"}, + ) + if response.status_code == 200: + results.append((idx, response.content)) + else: + errors.append(f"Worker {idx}: Status {response.status_code}") + except Exception as e: + errors.append(f"Worker {idx}: {str(e)}") + + with ThreadPoolExecutor(max_workers=num_downloads) as executor: + futures = [executor.submit(download_worker, i) for i in range(num_downloads)] + for future in as_completed(futures): + pass + + assert len(errors) == 0, f"Errors: {errors}" + assert len(results) == num_downloads + + for idx, downloaded in results: + assert downloaded == content + + @pytest.mark.integration + @pytest.mark.concurrent + def test_10_concurrent_downloads_same_artifact(self, integration_client, test_package): + """Test 10 concurrent downloads of same artifact.""" + project, package = test_package + content, expected_hash = generate_content_with_hash(8192, seed=600) + + upload_test_file(integration_client, project, package, content, tag="download10-test") + + num_downloads = 10 + results = [] + errors = [] + + def download_worker(idx): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with Client(base_url=base_url, timeout=60.0) as client: + response = client.get( + f"/api/v1/project/{project}/{package}/+/download10-test", + params={"mode": "proxy"}, + ) + if response.status_code == 200: + results.append((idx, response.content)) + else: + errors.append(f"Worker {idx}: Status {response.status_code}") + except Exception as e: + errors.append(f"Worker {idx}: {str(e)}") + + with ThreadPoolExecutor(max_workers=num_downloads) as executor: + futures = [executor.submit(download_worker, i) for i in range(num_downloads)] + for future in as_completed(futures): + pass + + assert len(errors) == 0, f"Errors: {errors}" + assert len(results) == num_downloads + + for idx, downloaded in results: + assert downloaded == content + + @pytest.mark.integration + @pytest.mark.concurrent + def test_concurrent_downloads_different_artifacts(self, integration_client, test_package): + """Test concurrent downloads of different artifacts.""" + project, package = test_package + + # Upload multiple files + num_files = 5 + uploads = [] + for i in range(num_files): + content, expected_hash = generate_content_with_hash(1024, seed=700 + i) + upload_test_file( + integration_client, project, package, content, + tag=f"multi-download-{i}" + ) + uploads.append((f"multi-download-{i}", content)) + + results = [] + errors = [] + + def download_worker(tag, 
expected_content): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with Client(base_url=base_url, timeout=60.0) as client: + response = client.get( + f"/api/v1/project/{project}/{package}/+/{tag}", + params={"mode": "proxy"}, + ) + if response.status_code == 200: + results.append((tag, response.content, expected_content)) + else: + errors.append(f"Tag {tag}: Status {response.status_code}") + except Exception as e: + errors.append(f"Tag {tag}: {str(e)}") + + with ThreadPoolExecutor(max_workers=num_files) as executor: + futures = [ + executor.submit(download_worker, tag, content) + for tag, content in uploads + ] + for future in as_completed(futures): + pass + + assert len(errors) == 0, f"Errors: {errors}" + assert len(results) == num_files + + for tag, downloaded, expected in results: + assert downloaded == expected, f"Content mismatch for {tag}" + + +class TestMixedConcurrentOperations: + """Tests for mixed concurrent upload and download operations.""" + + @pytest.mark.integration + @pytest.mark.concurrent + def test_upload_while_download_in_progress(self, integration_client, test_package): + """Test uploading while a download is in progress.""" + project, package = test_package + api_key = get_api_key(integration_client) + assert api_key, "Failed to create API key" + + # Upload initial content + content1, hash1 = generate_content_with_hash(10240, seed=800) # 10KB + upload_test_file(integration_client, project, package, content1, tag="initial") + + # New content for upload during download + content2, hash2 = generate_content_with_hash(10240, seed=801) + + results = {"downloads": [], "uploads": []} + errors = [] + + def download_worker(): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with Client(base_url=base_url, timeout=60.0) as client: + response = client.get( + f"/api/v1/project/{project}/{package}/+/initial", + params={"mode": "proxy"}, + ) + if response.status_code == 200: + results["downloads"].append(response.content) + else: + errors.append(f"Download: Status {response.status_code}") + except Exception as e: + errors.append(f"Download: {str(e)}") + + def upload_worker(): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with Client(base_url=base_url, timeout=60.0) as client: + files = { + "file": ("new.bin", io.BytesIO(content2), "application/octet-stream") + } + response = client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"tag": "during-download"}, + headers={"Authorization": f"Bearer {api_key}"}, + ) + if response.status_code == 200: + results["uploads"].append(response.json()) + else: + errors.append(f"Upload: Status {response.status_code}") + except Exception as e: + errors.append(f"Upload: {str(e)}") + + with ThreadPoolExecutor(max_workers=2) as executor: + futures = [ + executor.submit(download_worker), + executor.submit(upload_worker), + ] + for future in as_completed(futures): + pass + + assert len(errors) == 0, f"Errors: {errors}" + assert len(results["downloads"]) == 1 + assert len(results["uploads"]) == 1 + + # Verify download got correct content + assert results["downloads"][0] == content1 + + # Verify upload succeeded + assert results["uploads"][0]["artifact_id"] == hash2 + + @pytest.mark.integration + @pytest.mark.concurrent + def test_multiple_uploads_and_downloads_simultaneously(self, integration_client, test_package): + """Test multiple uploads 
and downloads running simultaneously.""" + project, package = test_package + api_key = get_api_key(integration_client) + assert api_key, "Failed to create API key" + + # Pre-upload some files for downloading + existing_files = [] + for i in range(3): + content, hash = generate_content_with_hash(2048, seed=900 + i) + upload_test_file(integration_client, project, package, content, tag=f"existing-{i}") + existing_files.append((f"existing-{i}", content)) + + # New files for uploading + new_files = [ + generate_content_with_hash(2048, seed=910 + i) for i in range(3) + ] + + results = {"downloads": [], "uploads": []} + errors = [] + + def download_worker(tag, expected): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with Client(base_url=base_url, timeout=60.0) as client: + response = client.get( + f"/api/v1/project/{project}/{package}/+/{tag}", + params={"mode": "proxy"}, + ) + if response.status_code == 200: + results["downloads"].append((tag, response.content, expected)) + else: + errors.append(f"Download {tag}: Status {response.status_code}") + except Exception as e: + errors.append(f"Download {tag}: {str(e)}") + + def upload_worker(idx, content, expected_hash): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with Client(base_url=base_url, timeout=60.0) as client: + files = { + "file": (f"new-{idx}.bin", io.BytesIO(content), "application/octet-stream") + } + response = client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"tag": f"new-{idx}"}, + headers={"Authorization": f"Bearer {api_key}"}, + ) + if response.status_code == 200: + results["uploads"].append((idx, response.json(), expected_hash)) + else: + errors.append(f"Upload {idx}: Status {response.status_code}") + except Exception as e: + errors.append(f"Upload {idx}: {str(e)}") + + with ThreadPoolExecutor(max_workers=6) as executor: + futures = [] + + # Submit downloads + for tag, content in existing_files: + futures.append(executor.submit(download_worker, tag, content)) + + # Submit uploads + for i, (content, hash) in enumerate(new_files): + futures.append(executor.submit(upload_worker, i, content, hash)) + + for future in as_completed(futures): + pass + + assert len(errors) == 0, f"Errors: {errors}" + assert len(results["downloads"]) == 3 + assert len(results["uploads"]) == 3 + + # Verify downloads + for tag, downloaded, expected in results["downloads"]: + assert downloaded == expected, f"Download mismatch for {tag}" + + # Verify uploads + for idx, result, expected_hash in results["uploads"]: + assert result["artifact_id"] == expected_hash + + @pytest.mark.integration + @pytest.mark.concurrent + def test_no_data_corruption_under_concurrency(self, integration_client, test_package): + """Test that no data corruption occurs under concurrent operations.""" + project, package = test_package + api_key = get_api_key(integration_client) + assert api_key, "Failed to create API key" + + # Create content with recognizable patterns + num_files = 5 + files_data = [] + for i in range(num_files): + # Each file has unique repeating pattern for easy corruption detection + pattern = bytes([i] * 256) + content = pattern * 40 # 10KB each + hash = compute_sha256(content) + files_data.append((content, hash)) + + results = [] + errors = [] + + def upload_and_verify(idx, content, expected_hash): + try: + from httpx import Client + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + 
with Client(base_url=base_url, timeout=60.0) as client: + # Upload + files = { + "file": (f"pattern-{idx}.bin", io.BytesIO(content), "application/octet-stream") + } + upload_resp = client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"tag": f"pattern-{idx}"}, + headers={"Authorization": f"Bearer {api_key}"}, + ) + if upload_resp.status_code != 200: + errors.append(f"Upload {idx}: Status {upload_resp.status_code}") + return + + upload_result = upload_resp.json() + if upload_result["artifact_id"] != expected_hash: + errors.append(f"Upload {idx}: Hash mismatch") + return + + # Immediately download and verify + download_resp = client.get( + f"/api/v1/project/{project}/{package}/+/pattern-{idx}", + params={"mode": "proxy"}, + ) + if download_resp.status_code != 200: + errors.append(f"Download {idx}: Status {download_resp.status_code}") + return + + if download_resp.content != content: + errors.append(f"Worker {idx}: DATA CORRUPTION DETECTED") + return + + # Verify the downloaded content hash + downloaded_hash = compute_sha256(download_resp.content) + if downloaded_hash != expected_hash: + errors.append(f"Worker {idx}: Hash verification failed") + return + + results.append(idx) + + except Exception as e: + errors.append(f"Worker {idx}: {str(e)}") + + with ThreadPoolExecutor(max_workers=num_files) as executor: + futures = [ + executor.submit(upload_and_verify, i, content, hash) + for i, (content, hash) in enumerate(files_data) + ] + for future in as_completed(futures): + pass + + assert len(errors) == 0, f"Errors: {errors}" + assert len(results) == num_files diff --git a/backend/tests/integration/test_error_handling.py b/backend/tests/integration/test_error_handling.py new file mode 100644 index 0000000..ce1f767 --- /dev/null +++ b/backend/tests/integration/test_error_handling.py @@ -0,0 +1,322 @@ +""" +Integration tests for error handling in upload and download operations. 
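+
+The checksum cases below exercise the X-Checksum-SHA256 request header. A
+hypothetical client-side sketch of producing it (names are illustrative,
+not part of this suite):
+
+    import hashlib
+    digest = hashlib.sha256(payload).hexdigest()  # payload: bytes to upload
+    headers = {"X-Checksum-SHA256": digest}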
+ +Tests cover: +- Timeout handling +- Invalid request handling +- Resource cleanup on failures +- Graceful error responses +""" + +import pytest +import io +import time +from tests.factories import ( + compute_sha256, + upload_test_file, + generate_content_with_hash, +) + + +class TestUploadErrorHandling: + """Tests for upload error handling.""" + + @pytest.mark.integration + def test_upload_to_nonexistent_project_returns_404( + self, integration_client, unique_test_id + ): + """Test upload to nonexistent project returns 404.""" + content = b"test content for nonexistent project" + + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/nonexistent-project-{unique_test_id}/nonexistent-pkg/upload", + files=files, + ) + assert response.status_code == 404 + + @pytest.mark.integration + def test_upload_to_nonexistent_package_returns_404( + self, integration_client, test_project, unique_test_id + ): + """Test upload to nonexistent package returns 404.""" + content = b"test content for nonexistent package" + + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{test_project}/nonexistent-package-{unique_test_id}/upload", + files=files, + ) + assert response.status_code == 404 + + @pytest.mark.integration + def test_upload_empty_file_rejected(self, integration_client, test_package): + """Test empty file upload is rejected.""" + project, package = test_package + + files = {"file": ("empty.bin", io.BytesIO(b""), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + ) + assert response.status_code in [400, 422] + + @pytest.mark.integration + def test_upload_missing_file_returns_422(self, integration_client, test_package): + """Test upload without file field returns 422.""" + project, package = test_package + + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + data={"tag": "no-file-provided"}, + ) + assert response.status_code == 422 + + @pytest.mark.integration + def test_upload_invalid_checksum_format_returns_400( + self, integration_client, test_package + ): + """Test upload with invalid checksum format returns 400.""" + project, package = test_package + content = b"checksum format test" + + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + headers={"X-Checksum-SHA256": "invalid-hash-format"}, + ) + assert response.status_code == 400 + + @pytest.mark.integration + def test_upload_checksum_mismatch_returns_422( + self, integration_client, test_package + ): + """Test upload with mismatched checksum returns 422.""" + project, package = test_package + content = b"checksum mismatch test" + wrong_hash = "0" * 64 # Valid format but wrong hash + + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + headers={"X-Checksum-SHA256": wrong_hash}, + ) + assert response.status_code == 422 + + @pytest.mark.integration + def test_upload_with_correct_checksum_succeeds( + self, integration_client, test_package + ): + """Test upload with correct checksum succeeds.""" + project, package = test_package + content = b"correct checksum test" + correct_hash = 
compute_sha256(content) + + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + headers={"X-Checksum-SHA256": correct_hash}, + ) + assert response.status_code == 200 + assert response.json()["artifact_id"] == correct_hash + + +class TestDownloadErrorHandling: + """Tests for download error handling.""" + + @pytest.mark.integration + def test_download_nonexistent_tag_returns_404( + self, integration_client, test_package + ): + """Test download of nonexistent tag returns 404.""" + project, package = test_package + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/nonexistent-tag-xyz" + ) + assert response.status_code == 404 + + @pytest.mark.integration + def test_download_nonexistent_artifact_returns_404( + self, integration_client, test_package + ): + """Test download of nonexistent artifact ID returns 404.""" + project, package = test_package + fake_hash = "a" * 64 + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/artifact:{fake_hash}" + ) + assert response.status_code == 404 + + @pytest.mark.integration + def test_download_invalid_artifact_id_format( + self, integration_client, test_package + ): + """Test download with invalid artifact ID format.""" + project, package = test_package + + # Too short + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/artifact:abc123" + ) + assert response.status_code == 404 + + @pytest.mark.integration + def test_download_from_nonexistent_project_returns_404( + self, integration_client, unique_test_id + ): + """Test download from nonexistent project returns 404.""" + response = integration_client.get( + f"/api/v1/project/nonexistent-{unique_test_id}/pkg/+/tag" + ) + assert response.status_code == 404 + + @pytest.mark.integration + def test_download_from_nonexistent_package_returns_404( + self, integration_client, test_project, unique_test_id + ): + """Test download from nonexistent package returns 404.""" + response = integration_client.get( + f"/api/v1/project/{test_project}/nonexistent-{unique_test_id}/+/tag" + ) + assert response.status_code == 404 + + +class TestTimeoutBehavior: + """Tests for timeout behavior (integration level).""" + + @pytest.mark.integration + @pytest.mark.slow + def test_large_upload_completes_within_reasonable_time( + self, integration_client, test_package, sized_content + ): + """Test that a 10MB upload completes within reasonable time.""" + project, package = test_package + content, expected_hash = sized_content(10 * 1024 * 1024, seed=999) # 10MB + + start_time = time.time() + result = upload_test_file( + integration_client, project, package, content, tag="timeout-test" + ) + elapsed = time.time() - start_time + + assert result["artifact_id"] == expected_hash + # Should complete within 60 seconds for 10MB on local docker + assert elapsed < 60, f"Upload took too long: {elapsed:.2f}s" + + @pytest.mark.integration + @pytest.mark.slow + def test_large_download_completes_within_reasonable_time( + self, integration_client, test_package, sized_content + ): + """Test that a 10MB download completes within reasonable time.""" + project, package = test_package + content, expected_hash = sized_content(10 * 1024 * 1024, seed=998) # 10MB + + # First upload + upload_test_file( + integration_client, project, package, content, tag="download-timeout-test" + ) + + # Then download and time it + start_time = time.time() + response = 
integration_client.get( + f"/api/v1/project/{project}/{package}/+/download-timeout-test", + params={"mode": "proxy"}, + ) + elapsed = time.time() - start_time + + assert response.status_code == 200 + assert len(response.content) == len(content) + # Should complete within 60 seconds for 10MB on local docker + assert elapsed < 60, f"Download took too long: {elapsed:.2f}s" + + +class TestResourceCleanup: + """Tests for proper resource cleanup on failures. + + Note: More comprehensive cleanup tests are in test_upload_download_api.py + (TestUploadFailureCleanup class) including S3 object cleanup verification. + """ + + @pytest.mark.integration + def test_checksum_mismatch_no_orphaned_artifact( + self, integration_client, test_package, unique_test_id + ): + """Test checksum mismatch doesn't leave orphaned artifact.""" + project, package = test_package + # Use unique content to ensure artifact doesn't exist from prior tests + content = f"checksum mismatch orphan test {unique_test_id}".encode() + wrong_hash = "0" * 64 + actual_hash = compute_sha256(content) + + # Verify artifact doesn't exist before test + pre_check = integration_client.get(f"/api/v1/artifact/{actual_hash}") + assert pre_check.status_code == 404, "Artifact should not exist before test" + + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + headers={"X-Checksum-SHA256": wrong_hash}, + ) + assert response.status_code == 422 + + # Verify no artifact was created with either hash + response1 = integration_client.get(f"/api/v1/artifact/{wrong_hash}") + response2 = integration_client.get(f"/api/v1/artifact/{actual_hash}") + assert response1.status_code == 404 + assert response2.status_code == 404 + + +class TestGracefulErrorResponses: + """Tests for graceful and informative error responses.""" + + @pytest.mark.integration + def test_404_response_has_detail_message( + self, integration_client, test_package + ): + """Test 404 responses include a detail message.""" + project, package = test_package + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/nonexistent-tag" + ) + assert response.status_code == 404 + data = response.json() + assert "detail" in data + assert len(data["detail"]) > 0 + + @pytest.mark.integration + def test_422_response_has_detail_message(self, integration_client, test_package): + """Test 422 responses include a detail message.""" + project, package = test_package + + # Upload with mismatched checksum + content = b"detail message test" + wrong_hash = "0" * 64 + + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + headers={"X-Checksum-SHA256": wrong_hash}, + ) + assert response.status_code == 422 + data = response.json() + assert "detail" in data + + @pytest.mark.integration + def test_error_response_is_json(self, integration_client, unique_test_id): + """Test error responses are valid JSON.""" + response = integration_client.get( + f"/api/v1/project/nonexistent-{unique_test_id}/pkg/+/tag" + ) + assert response.status_code == 404 + # Should not raise exception - valid JSON + data = response.json() + assert isinstance(data, dict) diff --git a/backend/tests/integration/test_integrity_verification.py b/backend/tests/integration/test_integrity_verification.py new file mode 100644 index 0000000..504bc8c --- /dev/null +++ 
b/backend/tests/integration/test_integrity_verification.py @@ -0,0 +1,768 @@ +""" +Integration tests for artifact integrity verification. + +Tests cover: +- Round-trip verification (upload -> download -> verify hash) +- Consistency check endpoint +- Header-based verification +- Integrity verification across file sizes +- Client-side verification workflow +""" + +import pytest +import io +import hashlib +from tests.factories import ( + compute_sha256, + upload_test_file, + generate_content_with_hash, + s3_object_exists, + get_s3_client, + get_s3_bucket, +) +from tests.conftest import ( + SIZE_1KB, + SIZE_10KB, + SIZE_100KB, + SIZE_1MB, + SIZE_10MB, +) + + +class TestRoundTripVerification: + """Tests for complete round-trip integrity verification.""" + + @pytest.mark.integration + def test_upload_download_hash_matches(self, integration_client, test_package): + """Test that upload -> download round trip preserves content integrity.""" + project, package = test_package + content = b"Round trip integrity test content" + expected_hash = compute_sha256(content) + + # Upload and capture returned hash + result = upload_test_file( + integration_client, project, package, content, tag="roundtrip" + ) + uploaded_hash = result["artifact_id"] + + # Verify upload returned correct hash + assert uploaded_hash == expected_hash + + # Download artifact + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/roundtrip", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + + # Compute hash of downloaded content + downloaded_hash = compute_sha256(response.content) + + # All three hashes should match + assert downloaded_hash == expected_hash + assert downloaded_hash == uploaded_hash + + @pytest.mark.integration + def test_upload_response_contains_hash(self, integration_client, test_package): + """Test upload response contains artifact_id which is the SHA256 hash.""" + project, package = test_package + content = b"Upload response hash test" + expected_hash = compute_sha256(content) + + result = upload_test_file(integration_client, project, package, content) + + assert "artifact_id" in result + assert result["artifact_id"] == expected_hash + assert len(result["artifact_id"]) == 64 + assert all(c in "0123456789abcdef" for c in result["artifact_id"]) + + @pytest.mark.integration + def test_download_header_matches_artifact_id(self, integration_client, test_package): + """Test X-Checksum-SHA256 header matches artifact ID.""" + project, package = test_package + content = b"Header verification test" + expected_hash = compute_sha256(content) + + upload_test_file( + integration_client, project, package, content, tag="header-check" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/header-check", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.headers.get("X-Checksum-SHA256") == expected_hash + + @pytest.mark.integration + def test_etag_matches_artifact_id(self, integration_client, test_package): + """Test ETag header matches artifact ID.""" + project, package = test_package + content = b"ETag verification test" + expected_hash = compute_sha256(content) + + upload_test_file( + integration_client, project, package, content, tag="etag-check" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/etag-check", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + etag = response.headers.get("ETag", "").strip('"') + assert etag == expected_hash + + 
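+    # Illustrative sketch (not part of the suite's assertions): the header
+    # tests above imply a client can verify a proxied download locally, with
+    # `resp` standing in for a completed httpx download response:
+    #
+    #     import hashlib
+    #     ok = hashlib.sha256(resp.content).hexdigest() == \
+    #         resp.headers["X-Checksum-SHA256"]
+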
@pytest.mark.integration + def test_artifact_endpoint_returns_correct_hash(self, integration_client, test_package): + """Test artifact endpoint returns correct hash/ID.""" + project, package = test_package + content = b"Artifact endpoint hash test" + expected_hash = compute_sha256(content) + + upload_test_file(integration_client, project, package, content) + + # Query artifact directly + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.status_code == 200 + data = response.json() + assert data["id"] == expected_hash + assert data.get("sha256") == expected_hash + + +class TestClientSideVerificationWorkflow: + """Tests for client-side verification workflow.""" + + @pytest.mark.integration + def test_client_can_verify_before_upload(self, integration_client, test_package): + """Test client can compute hash before upload and verify response matches.""" + project, package = test_package + content = b"Client pre-upload verification test" + + # Client computes hash locally before upload + client_hash = compute_sha256(content) + + # Upload + result = upload_test_file(integration_client, project, package, content) + + # Client verifies server returned the same hash + assert result["artifact_id"] == client_hash + + @pytest.mark.integration + def test_client_can_provide_checksum_header(self, integration_client, test_package): + """Test client can provide X-Checksum-SHA256 header for verification.""" + project, package = test_package + content = b"Client checksum header test" + client_hash = compute_sha256(content) + + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + headers={"X-Checksum-SHA256": client_hash}, + ) + assert response.status_code == 200 + assert response.json()["artifact_id"] == client_hash + + @pytest.mark.integration + def test_checksum_mismatch_rejected(self, integration_client, test_package): + """Test upload with wrong client checksum is rejected.""" + project, package = test_package + content = b"Checksum mismatch test" + wrong_hash = "0" * 64 + + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + headers={"X-Checksum-SHA256": wrong_hash}, + ) + assert response.status_code == 422 + + @pytest.mark.integration + def test_client_can_verify_after_download(self, integration_client, test_package): + """Test client can verify downloaded content matches header hash.""" + project, package = test_package + content = b"Client post-download verification" + + upload_test_file( + integration_client, project, package, content, tag="verify-after" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/verify-after", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + + # Client gets hash from header + header_hash = response.headers.get("X-Checksum-SHA256") + + # Client computes hash of downloaded content + downloaded_hash = compute_sha256(response.content) + + # Client verifies they match + assert downloaded_hash == header_hash + + +class TestIntegritySizeVariants: + """Tests for integrity verification across different file sizes.""" + + @pytest.mark.integration + def test_integrity_1kb(self, integration_client, test_package, sized_content): + """Test integrity verification for 1KB file.""" + project, package = test_package + content, expected_hash = 
sized_content(SIZE_1KB, seed=100) + + result = upload_test_file( + integration_client, project, package, content, tag="int-1kb" + ) + assert result["artifact_id"] == expected_hash + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/int-1kb", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert compute_sha256(response.content) == expected_hash + assert response.headers.get("X-Checksum-SHA256") == expected_hash + + @pytest.mark.integration + def test_integrity_100kb(self, integration_client, test_package, sized_content): + """Test integrity verification for 100KB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_100KB, seed=101) + + result = upload_test_file( + integration_client, project, package, content, tag="int-100kb" + ) + assert result["artifact_id"] == expected_hash + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/int-100kb", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert compute_sha256(response.content) == expected_hash + assert response.headers.get("X-Checksum-SHA256") == expected_hash + + @pytest.mark.integration + def test_integrity_1mb(self, integration_client, test_package, sized_content): + """Test integrity verification for 1MB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_1MB, seed=102) + + result = upload_test_file( + integration_client, project, package, content, tag="int-1mb" + ) + assert result["artifact_id"] == expected_hash + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/int-1mb", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert compute_sha256(response.content) == expected_hash + assert response.headers.get("X-Checksum-SHA256") == expected_hash + + @pytest.mark.integration + @pytest.mark.slow + def test_integrity_10mb(self, integration_client, test_package, sized_content): + """Test integrity verification for 10MB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_10MB, seed=103) + + result = upload_test_file( + integration_client, project, package, content, tag="int-10mb" + ) + assert result["artifact_id"] == expected_hash + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/int-10mb", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert compute_sha256(response.content) == expected_hash + assert response.headers.get("X-Checksum-SHA256") == expected_hash + + +class TestConsistencyCheck: + """Tests for the admin consistency check endpoint.""" + + @pytest.mark.integration + def test_consistency_check_returns_200(self, integration_client): + """Test consistency check endpoint returns 200.""" + response = integration_client.get("/api/v1/admin/consistency-check") + assert response.status_code == 200 + + @pytest.mark.integration + def test_consistency_check_response_format(self, integration_client): + """Test consistency check returns expected response format.""" + response = integration_client.get("/api/v1/admin/consistency-check") + assert response.status_code == 200 + data = response.json() + + # Check expected fields + assert "total_artifacts_checked" in data + assert "orphaned_s3_objects" in data + assert "missing_s3_objects" in data + assert "size_mismatches" in data + assert "healthy" in data + assert "orphaned_s3_keys" in data + assert "missing_s3_keys" in data + assert "size_mismatch_artifacts" in data + # Verify types + 
assert isinstance(data["total_artifacts_checked"], int) + assert isinstance(data["orphaned_s3_objects"], int) + assert isinstance(data["missing_s3_objects"], int) + assert isinstance(data["size_mismatches"], int) + assert isinstance(data["healthy"], bool) + assert isinstance(data["orphaned_s3_keys"], list) + assert isinstance(data["missing_s3_keys"], list) + assert isinstance(data["size_mismatch_artifacts"], list) + + @pytest.mark.integration + def test_consistency_check_after_upload(self, integration_client, test_package): + """Test consistency check passes after valid upload.""" + project, package = test_package + content = b"Consistency check test content" + + # Upload artifact + upload_test_file(integration_client, project, package, content) + + # Run consistency check + response = integration_client.get("/api/v1/admin/consistency-check") + assert response.status_code == 200 + data = response.json() + + # Verify check ran and no issues + assert data["total_artifacts_checked"] >= 1 + assert data["healthy"] is True + + @pytest.mark.integration + def test_consistency_check_limit_parameter(self, integration_client): + """Test consistency check respects limit parameter.""" + response = integration_client.get( + "/api/v1/admin/consistency-check", + params={"limit": 10} + ) + assert response.status_code == 200 + data = response.json() + + # Lists should not exceed limit + assert len(data["orphaned_s3_keys"]) <= 10 + assert len(data["missing_s3_keys"]) <= 10 + assert len(data["size_mismatch_artifacts"]) <= 10 + + +class TestDigestHeader: + """Tests for RFC 3230 Digest header.""" + + @pytest.mark.integration + def test_download_includes_digest_header(self, integration_client, test_package): + """Test download includes Digest header in RFC 3230 format.""" + project, package = test_package + content = b"Digest header test" + expected_hash = compute_sha256(content) + + upload_test_file( + integration_client, project, package, content, tag="digest-test" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/digest-test", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert "Digest" in response.headers + + # Verify Digest format (sha-256=base64hash) + digest = response.headers["Digest"] + assert digest.startswith("sha-256=") + + @pytest.mark.integration + def test_digest_header_base64_valid(self, integration_client, test_package): + """Test Digest header contains valid base64 encoding.""" + import base64 + + project, package = test_package + content = b"Digest base64 test" + expected_hash = compute_sha256(content) + + upload_test_file( + integration_client, project, package, content, tag="digest-b64" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/digest-b64", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + + digest = response.headers["Digest"] + base64_part = digest.split("=", 1)[1] + + # Should be valid base64 + try: + decoded = base64.b64decode(base64_part) + assert len(decoded) == 32 # SHA256 is 32 bytes + except Exception as e: + pytest.fail(f"Invalid base64 in Digest header: {e}") + + +class TestVerificationModes: + """Tests for download verification modes.""" + + @pytest.mark.integration + def test_pre_verification_mode(self, integration_client, test_package): + """Test pre-verification mode verifies before streaming.""" + project, package = test_package + content = b"Pre-verification mode test" + + upload_test_file( + integration_client, project, package, content, 
tag="pre-verify" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/pre-verify", + params={"mode": "proxy", "verify": "true", "verify_mode": "pre"}, + ) + assert response.status_code == 200 + assert response.content == content + + # X-Verified header should be true + assert response.headers.get("X-Verified") == "true" + + @pytest.mark.integration + def test_stream_verification_mode(self, integration_client, test_package): + """Test streaming verification mode.""" + project, package = test_package + content = b"Stream verification mode test" + + upload_test_file( + integration_client, project, package, content, tag="stream-verify" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/stream-verify", + params={"mode": "proxy", "verify": "true", "verify_mode": "stream"}, + ) + assert response.status_code == 200 + assert response.content == content + + +class TestArtifactIntegrityEndpoint: + """Tests for artifact-specific integrity operations.""" + + @pytest.mark.integration + def test_artifact_size_matches(self, integration_client, test_package): + """Test artifact endpoint returns correct size.""" + project, package = test_package + content = b"Artifact size test content" + expected_size = len(content) + + result = upload_test_file(integration_client, project, package, content) + artifact_id = result["artifact_id"] + + response = integration_client.get(f"/api/v1/artifact/{artifact_id}") + assert response.status_code == 200 + data = response.json() + assert data["size"] == expected_size + + @pytest.mark.integration + def test_content_length_header_matches_size(self, integration_client, test_package): + """Test Content-Length header matches artifact size.""" + project, package = test_package + content = b"Content-Length header test" + expected_size = len(content) + + upload_test_file( + integration_client, project, package, content, tag="content-len" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/content-len", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert int(response.headers.get("Content-Length", 0)) == expected_size + assert len(response.content) == expected_size + + +@pytest.mark.requires_direct_s3 +class TestCorruptionDetection: + """Tests for detecting corrupted S3 objects. + + These tests directly manipulate S3 objects to simulate corruption + and verify that the system can detect hash mismatches. + + Note: These tests require direct S3/MinIO access and are skipped in CI + where S3 is not directly accessible from the test runner. + """ + + @pytest.mark.integration + def test_detection_of_corrupted_content(self, integration_client, test_package): + """Test that corrupted S3 content is detected via hash mismatch. + + Uploads content, then directly modifies the S3 object, then + verifies that the downloaded content hash doesn't match. + """ + project, package = test_package + content = b"Original content for corruption test" + expected_hash = compute_sha256(content) + + # Upload original content + result = upload_test_file( + integration_client, project, package, content, tag="corrupt-test" + ) + assert result["artifact_id"] == expected_hash + + # Get the S3 object and corrupt it + s3_client = get_s3_client() + bucket = get_s3_bucket() + s3_key = f"fruits/{expected_hash[:2]}/{expected_hash[2:4]}/{expected_hash}" + + # Replace with corrupted content + corrupted_content = b"Corrupted content - different from original!" 
+ s3_client.put_object(Bucket=bucket, Key=s3_key, Body=corrupted_content) + + # Download via proxy (bypasses hash verification) + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/corrupt-test", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + + # Verify the downloaded content doesn't match original hash + downloaded_hash = compute_sha256(response.content) + assert downloaded_hash != expected_hash, "Corruption was not detected - hashes match" + assert response.content == corrupted_content + + # The X-Checksum-SHA256 header should still show the original hash (from DB) + # but the actual content hash is different + header_hash = response.headers.get("X-Checksum-SHA256") + assert header_hash == expected_hash # Header shows expected hash + assert downloaded_hash != header_hash # But content is corrupted + + # Restore original content for cleanup + s3_client.put_object(Bucket=bucket, Key=s3_key, Body=content) + + @pytest.mark.integration + def test_detection_of_single_bit_flip(self, integration_client, test_package): + """Test detection of a single bit flip in S3 object content.""" + project, package = test_package + content = b"Content for single bit flip detection test" + expected_hash = compute_sha256(content) + + result = upload_test_file( + integration_client, project, package, content, tag="bitflip-test" + ) + assert result["artifact_id"] == expected_hash + + # Get S3 object and flip a single bit + s3_client = get_s3_client() + bucket = get_s3_bucket() + s3_key = f"fruits/{expected_hash[:2]}/{expected_hash[2:4]}/{expected_hash}" + + # Flip the first bit of the first byte + corrupted_content = bytearray(content) + corrupted_content[0] ^= 0x01 + corrupted_content = bytes(corrupted_content) + + s3_client.put_object(Bucket=bucket, Key=s3_key, Body=corrupted_content) + + # Download and verify hash mismatch + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/bitflip-test", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + + downloaded_hash = compute_sha256(response.content) + assert downloaded_hash != expected_hash, "Single bit flip not detected" + + # Restore original + s3_client.put_object(Bucket=bucket, Key=s3_key, Body=content) + + @pytest.mark.integration + def test_detection_of_truncated_content(self, integration_client, test_package): + """Test detection of truncated S3 object.""" + project, package = test_package + content = b"This is content that will be truncated for testing purposes" + expected_hash = compute_sha256(content) + + result = upload_test_file( + integration_client, project, package, content, tag="truncate-test" + ) + assert result["artifact_id"] == expected_hash + + # Get S3 object and truncate it + s3_client = get_s3_client() + bucket = get_s3_bucket() + s3_key = f"fruits/{expected_hash[:2]}/{expected_hash[2:4]}/{expected_hash}" + + # Truncate to half the original size + truncated_content = content[: len(content) // 2] + s3_client.put_object(Bucket=bucket, Key=s3_key, Body=truncated_content) + + # Download and verify hash mismatch + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/truncate-test", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + + downloaded_hash = compute_sha256(response.content) + assert downloaded_hash != expected_hash, "Truncation not detected" + assert len(response.content) < len(content), "Content was not truncated" + + # Restore original + s3_client.put_object(Bucket=bucket, Key=s3_key, Body=content) + 
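+    # Note: these tests rebuild the content-addressed key by hand. A helper
+    # like the sketch below would keep that layout in one place (assuming the
+    # fruits/<first2>/<next2>/<sha256> layout used throughout this file).
+    @staticmethod
+    def _s3_key_for(sha256_hex: str) -> str:
+        """Content-addressed S3 key for an artifact hash."""
+        return f"fruits/{sha256_hex[:2]}/{sha256_hex[2:4]}/{sha256_hex}"
+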
+ @pytest.mark.integration + def test_detection_of_appended_content(self, integration_client, test_package): + """Test detection of content with extra bytes appended.""" + project, package = test_package + content = b"Original content" + expected_hash = compute_sha256(content) + + result = upload_test_file( + integration_client, project, package, content, tag="append-test" + ) + assert result["artifact_id"] == expected_hash + + # Get S3 object and append extra bytes + s3_client = get_s3_client() + bucket = get_s3_bucket() + s3_key = f"fruits/{expected_hash[:2]}/{expected_hash[2:4]}/{expected_hash}" + + appended_content = content + b" - extra bytes appended" + s3_client.put_object(Bucket=bucket, Key=s3_key, Body=appended_content) + + # Download and verify hash mismatch + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/append-test", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + + downloaded_hash = compute_sha256(response.content) + assert downloaded_hash != expected_hash, "Appended content not detected" + assert len(response.content) > len(content), "Content was not extended" + + # Restore original + s3_client.put_object(Bucket=bucket, Key=s3_key, Body=content) + + @pytest.mark.integration + def test_client_detects_hash_mismatch_post_download( + self, integration_client, test_package + ): + """Test that a client can detect hash mismatch after downloading corrupted content. + + This simulates the full client verification workflow: + 1. Download content + 2. Get expected hash from header + 3. Compute actual hash of content + 4. Verify they match (or detect corruption) + """ + project, package = test_package + content = b"Content for client-side corruption detection" + expected_hash = compute_sha256(content) + + result = upload_test_file( + integration_client, project, package, content, tag="client-detect" + ) + + # Corrupt the S3 object + s3_client = get_s3_client() + bucket = get_s3_bucket() + s3_key = f"fruits/{expected_hash[:2]}/{expected_hash[2:4]}/{expected_hash}" + corrupted = b"This is completely different content" + s3_client.put_object(Bucket=bucket, Key=s3_key, Body=corrupted) + + # Simulate client download and verification + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/client-detect", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + + # Client gets expected hash from header + header_hash = response.headers.get("X-Checksum-SHA256") + + # Client computes hash of downloaded content + actual_hash = compute_sha256(response.content) + + # Client detects the mismatch + corruption_detected = actual_hash != header_hash + assert corruption_detected, "Client should detect hash mismatch" + + # Restore original + s3_client.put_object(Bucket=bucket, Key=s3_key, Body=content) + + @pytest.mark.integration + def test_consistency_check_detects_size_mismatch( + self, integration_client, test_package, unique_test_id + ): + """Test that consistency check detects size mismatches. + + Uploads content, modifies S3 object size, then runs consistency check. 
+ """ + project, package = test_package + content = b"Content for size mismatch consistency check test " + unique_test_id.encode() + expected_hash = compute_sha256(content) + + result = upload_test_file( + integration_client, project, package, content, tag="size-mismatch" + ) + + # Modify S3 object to have different size + s3_client = get_s3_client() + bucket = get_s3_bucket() + s3_key = f"fruits/{expected_hash[:2]}/{expected_hash[2:4]}/{expected_hash}" + different_size_content = content + b"extra extra extra" + s3_client.put_object(Bucket=bucket, Key=s3_key, Body=different_size_content) + + # Run consistency check + response = integration_client.get("/api/v1/admin/consistency-check") + assert response.status_code == 200 + data = response.json() + + # Should detect the size mismatch + assert data["size_mismatches"] >= 1 or len(data["size_mismatch_artifacts"]) >= 1 + + # Restore original + s3_client.put_object(Bucket=bucket, Key=s3_key, Body=content) + + @pytest.mark.integration + def test_consistency_check_detects_missing_s3_object( + self, integration_client, test_package, unique_test_id + ): + """Test that consistency check detects missing S3 objects. + + Uploads content, deletes S3 object, then runs consistency check. + """ + project, package = test_package + content = b"Content for missing S3 object test " + unique_test_id.encode() + expected_hash = compute_sha256(content) + + result = upload_test_file( + integration_client, project, package, content, tag="missing-s3" + ) + + # Delete the S3 object + s3_client = get_s3_client() + bucket = get_s3_bucket() + s3_key = f"fruits/{expected_hash[:2]}/{expected_hash[2:4]}/{expected_hash}" + s3_client.delete_object(Bucket=bucket, Key=s3_key) + + # Run consistency check + response = integration_client.get("/api/v1/admin/consistency-check") + assert response.status_code == 200 + data = response.json() + + # Should detect the missing S3 object + assert data["missing_s3_objects"] >= 1 or len(data["missing_s3_keys"]) >= 1 + + # Restore the object for cleanup + s3_client.put_object(Bucket=bucket, Key=s3_key, Body=content) diff --git a/backend/tests/integration/test_large_uploads.py b/backend/tests/integration/test_large_uploads.py new file mode 100644 index 0000000..e18c7fc --- /dev/null +++ b/backend/tests/integration/test_large_uploads.py @@ -0,0 +1,552 @@ +""" +Integration tests for large file upload functionality. + +Tests cover: +- Large file uploads (100MB, 1GB) +- Multipart upload behavior +- Upload metrics (duration, throughput) +- Memory efficiency during uploads +- Upload progress tracking + +Note: Large tests are marked with @pytest.mark.slow and will be skipped +by default. Run with `pytest --run-slow` to include them. 
+""" + +import os +import pytest +import io +import time +from tests.factories import ( + compute_sha256, + upload_test_file, + s3_object_exists, +) +from tests.conftest import ( + SIZE_1KB, + SIZE_100KB, + SIZE_1MB, + SIZE_10MB, + SIZE_100MB, + SIZE_1GB, +) + + +class TestUploadMetrics: + """Tests for upload duration and throughput metrics.""" + + @pytest.mark.integration + def test_upload_response_includes_duration_ms(self, integration_client, test_package): + """Test upload response includes duration_ms field.""" + project, package = test_package + content = b"duration test content" + + result = upload_test_file( + integration_client, project, package, content, tag="duration-test" + ) + + assert "duration_ms" in result + assert result["duration_ms"] is not None + assert result["duration_ms"] >= 0 + + @pytest.mark.integration + def test_upload_response_includes_throughput(self, integration_client, test_package): + """Test upload response includes throughput_mbps field.""" + project, package = test_package + content = b"throughput test content" + + result = upload_test_file( + integration_client, project, package, content, tag="throughput-test" + ) + + assert "throughput_mbps" in result + # For small files throughput may be very high or None + # Just verify the field exists + + @pytest.mark.integration + def test_upload_duration_reasonable( + self, integration_client, test_package, sized_content + ): + """Test upload duration is reasonable for file size.""" + project, package = test_package + content, _ = sized_content(SIZE_1MB, seed=100) + + start = time.time() + result = upload_test_file( + integration_client, project, package, content, tag="duration-check" + ) + actual_duration = (time.time() - start) * 1000 # ms + + # Reported duration should be close to actual + assert result["duration_ms"] is not None + # Allow some variance (network overhead) + assert result["duration_ms"] <= actual_duration + 1000 # Within 1s + + +class TestLargeFileUploads: + """Tests for large file uploads using multipart.""" + + @pytest.mark.integration + def test_upload_10mb_file(self, integration_client, test_package, sized_content): + """Test uploading a 10MB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_10MB, seed=200) + + result = upload_test_file( + integration_client, project, package, content, tag="large-10mb" + ) + + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_10MB + assert result["duration_ms"] is not None + assert result["throughput_mbps"] is not None + + @pytest.mark.integration + @pytest.mark.slow + @pytest.mark.requires_direct_s3 + def test_upload_100mb_file(self, integration_client, test_package, sized_content): + """Test uploading a 100MB file (triggers multipart upload).""" + project, package = test_package + content, expected_hash = sized_content(SIZE_100MB, seed=300) + + result = upload_test_file( + integration_client, project, package, content, tag="large-100mb" + ) + + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_100MB + # Verify S3 object exists + assert s3_object_exists(expected_hash) + + @pytest.mark.integration + @pytest.mark.slow + @pytest.mark.large + def test_upload_1gb_file(self, integration_client, test_package, sized_content): + """Test uploading a 1GB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_1GB, seed=400) + + result = upload_test_file( + integration_client, project, package, content, tag="large-1gb" + ) + + assert 
result["artifact_id"] == expected_hash + assert result["size"] == SIZE_1GB + # Should have measurable throughput + assert result["throughput_mbps"] is not None + assert result["throughput_mbps"] > 0 + + @pytest.mark.integration + def test_large_file_deduplication( + self, integration_client, test_package, sized_content, unique_test_id + ): + """Test deduplication works for large files.""" + project, package = test_package + # Use unique_test_id to ensure unique content per test run + seed = hash(unique_test_id) % 10000 + content, expected_hash = sized_content(SIZE_10MB, seed=seed) + + # First upload + result1 = upload_test_file( + integration_client, project, package, content, tag=f"dedup-{unique_test_id}-1" + ) + # Note: may be True if previous test uploaded same content + first_dedupe = result1["deduplicated"] + + # Second upload of same content + result2 = upload_test_file( + integration_client, project, package, content, tag=f"dedup-{unique_test_id}-2" + ) + assert result2["artifact_id"] == expected_hash + # Second upload MUST be deduplicated + assert result2["deduplicated"] is True + + +class TestUploadProgress: + """Tests for upload progress tracking endpoint.""" + + @pytest.mark.integration + def test_progress_endpoint_returns_not_found_for_invalid_id( + self, integration_client, test_package + ): + """Test progress endpoint returns not_found status for invalid upload ID.""" + project, package = test_package + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/upload/invalid-upload-id/progress" + ) + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "not_found" + assert data["upload_id"] == "invalid-upload-id" + + @pytest.mark.integration + def test_progress_endpoint_requires_valid_project( + self, integration_client, unique_test_id + ): + """Test progress endpoint validates project exists.""" + response = integration_client.get( + f"/api/v1/project/nonexistent-{unique_test_id}/pkg/upload/upload-id/progress" + ) + + assert response.status_code == 404 + + @pytest.mark.integration + def test_progress_endpoint_requires_valid_package( + self, integration_client, test_project, unique_test_id + ): + """Test progress endpoint validates package exists.""" + response = integration_client.get( + f"/api/v1/project/{test_project}/nonexistent-{unique_test_id}/upload/upload-id/progress" + ) + + assert response.status_code == 404 + + +class TestResumableUploadProgress: + """Tests for progress tracking during resumable uploads.""" + + @pytest.mark.integration + def test_resumable_upload_init_and_progress( + self, integration_client, test_package, sized_content + ): + """Test initializing resumable upload and checking progress.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_100KB, seed=600) + + # Get API key for auth + api_key_response = integration_client.post( + "/api/v1/auth/keys", + json={"name": "progress-test-key"}, + ) + assert api_key_response.status_code == 200 + api_key = api_key_response.json()["key"] + + # Initialize resumable upload + init_response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload/init", + json={ + "expected_hash": expected_hash, + "filename": "progress-test.bin", + "size": SIZE_100KB, + }, + headers={"Authorization": f"Bearer {api_key}"}, + ) + assert init_response.status_code == 200 + upload_id = init_response.json().get("upload_id") + + if upload_id: + # Check initial progress + progress_response = integration_client.get( + 
f"/api/v1/project/{project}/{package}/upload/{upload_id}/progress", + headers={"Authorization": f"Bearer {api_key}"}, + ) + assert progress_response.status_code == 200 + progress = progress_response.json() + assert progress["status"] == "in_progress" + assert progress["bytes_uploaded"] == 0 + assert progress["bytes_total"] == SIZE_100KB + + # Abort to clean up + integration_client.delete( + f"/api/v1/project/{project}/{package}/upload/{upload_id}", + headers={"Authorization": f"Bearer {api_key}"}, + ) + + +class TestUploadSizeLimits: + """Tests for upload size limit enforcement.""" + + @pytest.mark.integration + def test_empty_file_rejected(self, integration_client, test_package): + """Test empty files are rejected.""" + project, package = test_package + + files = {"file": ("empty.txt", io.BytesIO(b""), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + ) + + assert response.status_code in [400, 422] + + @pytest.mark.integration + def test_minimum_size_accepted(self, integration_client, test_package): + """Test 1-byte file is accepted.""" + project, package = test_package + content = b"X" + + result = upload_test_file( + integration_client, project, package, content, tag="min-size" + ) + + assert result["size"] == 1 + + @pytest.mark.integration + def test_content_length_header_used_in_response(self, integration_client, test_package): + """Test that upload response size matches Content-Length.""" + project, package = test_package + content = b"content length verification test" + + result = upload_test_file( + integration_client, project, package, content, tag="content-length-test" + ) + + # Size in response should match actual content length + assert result["size"] == len(content) + + +class TestUploadErrorHandling: + """Tests for upload error handling.""" + + @pytest.mark.integration + def test_upload_to_nonexistent_project_returns_404( + self, integration_client, unique_test_id + ): + """Test upload to nonexistent project returns 404.""" + content = b"test content" + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + + response = integration_client.post( + f"/api/v1/project/nonexistent-{unique_test_id}/pkg/upload", + files=files, + ) + + assert response.status_code == 404 + + @pytest.mark.integration + def test_upload_to_nonexistent_package_returns_404( + self, integration_client, test_project, unique_test_id + ): + """Test upload to nonexistent package returns 404.""" + content = b"test content" + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + + response = integration_client.post( + f"/api/v1/project/{test_project}/nonexistent-{unique_test_id}/upload", + files=files, + ) + + assert response.status_code == 404 + + @pytest.mark.integration + def test_upload_without_file_returns_422(self, integration_client, test_package): + """Test upload without file field returns 422.""" + project, package = test_package + + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + data={"tag": "no-file"}, + ) + + assert response.status_code == 422 + + @pytest.mark.integration + def test_upload_with_invalid_checksum_rejected( + self, integration_client, test_package + ): + """Test upload with invalid checksum header format is rejected.""" + project, package = test_package + content = b"checksum test" + + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + 
f"/api/v1/project/{project}/{package}/upload", + files=files, + headers={"X-Checksum-SHA256": "invalid-checksum"}, + ) + + assert response.status_code == 400 + + @pytest.mark.integration + def test_upload_with_mismatched_checksum_rejected( + self, integration_client, test_package + ): + """Test upload with wrong checksum is rejected.""" + project, package = test_package + content = b"mismatch test" + wrong_hash = "0" * 64 + + files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + headers={"X-Checksum-SHA256": wrong_hash}, + ) + + assert response.status_code == 422 + assert "verification failed" in response.json().get("detail", "").lower() + + +class TestResumableUploadCancellation: + """Tests for resumable upload cancellation.""" + + @pytest.mark.integration + def test_abort_resumable_upload(self, integration_client, test_package, sized_content): + """Test aborting a resumable upload cleans up properly.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_100KB, seed=700) + + # Get API key for auth + api_key_response = integration_client.post( + "/api/v1/auth/keys", + json={"name": "abort-test-key"}, + ) + assert api_key_response.status_code == 200 + api_key = api_key_response.json()["key"] + + # Initialize resumable upload + init_response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload/init", + json={ + "expected_hash": expected_hash, + "filename": "abort-test.bin", + "size": SIZE_100KB, + }, + headers={"Authorization": f"Bearer {api_key}"}, + ) + assert init_response.status_code == 200 + upload_id = init_response.json().get("upload_id") + + if upload_id: + # Abort the upload (without uploading any parts) + abort_response = integration_client.delete( + f"/api/v1/project/{project}/{package}/upload/{upload_id}", + headers={"Authorization": f"Bearer {api_key}"}, + ) + assert abort_response.status_code in [200, 204] + + # Verify progress shows not_found after abort + progress_response = integration_client.get( + f"/api/v1/project/{project}/{package}/upload/{upload_id}/progress", + headers={"Authorization": f"Bearer {api_key}"}, + ) + assert progress_response.status_code == 200 + assert progress_response.json()["status"] == "not_found" + + @pytest.mark.integration + def test_abort_nonexistent_upload(self, integration_client, test_package): + """Test aborting nonexistent upload returns appropriate error.""" + project, package = test_package + + # Get API key for auth + api_key_response = integration_client.post( + "/api/v1/auth/keys", + json={"name": "abort-nonexistent-key"}, + ) + assert api_key_response.status_code == 200 + api_key = api_key_response.json()["key"] + + response = integration_client.delete( + f"/api/v1/project/{project}/{package}/upload/nonexistent-upload-id", + headers={"Authorization": f"Bearer {api_key}"}, + ) + + # Should return 404 or 200 (idempotent delete) + assert response.status_code in [200, 204, 404] + + +class TestUploadTimeout: + """Tests for upload timeout handling.""" + + @pytest.mark.integration + def test_upload_with_short_timeout_succeeds_for_small_file( + self, integration_client, test_package + ): + """Test small file upload succeeds with reasonable timeout.""" + project, package = test_package + content = b"small timeout test" + + # httpx client should handle this quickly + result = upload_test_file( + integration_client, project, package, content, tag="timeout-small" + ) + + 
assert result["artifact_id"] is not None + + @pytest.mark.integration + def test_upload_response_duration_under_timeout( + self, integration_client, test_package, sized_content + ): + """Test upload completes within reasonable time.""" + project, package = test_package + content, _ = sized_content(SIZE_1MB, seed=800) + + start = time.time() + result = upload_test_file( + integration_client, project, package, content, tag="timeout-check" + ) + duration = time.time() - start + + # 1MB should upload in well under 60 seconds on local + assert duration < 60 + assert result["artifact_id"] is not None + + +class TestConcurrentUploads: + """Tests for concurrent upload handling.""" + + @pytest.mark.integration + def test_concurrent_different_files( + self, integration_client, test_package, sized_content + ): + """Test concurrent uploads of different files succeed.""" + from concurrent.futures import ThreadPoolExecutor, as_completed + + project, package = test_package + + # Get API key for auth + api_key_response = integration_client.post( + "/api/v1/auth/keys", + json={"name": "concurrent-diff-key"}, + ) + assert api_key_response.status_code == 200 + api_key = api_key_response.json()["key"] + + num_uploads = 3 + results = [] + errors = [] + + def upload_unique_file(idx): + try: + from httpx import Client + + content, expected_hash = sized_content(SIZE_100KB, seed=900 + idx) + + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + with Client(base_url=base_url, timeout=30.0) as client: + files = { + "file": ( + f"concurrent-{idx}.bin", + io.BytesIO(content), + "application/octet-stream", + ) + } + response = client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"tag": f"concurrent-diff-{idx}"}, + headers={"Authorization": f"Bearer {api_key}"}, + ) + if response.status_code == 200: + results.append((idx, response.json(), expected_hash)) + else: + errors.append(f"Upload {idx}: {response.status_code} - {response.text}") + except Exception as e: + errors.append(f"Upload {idx}: {str(e)}") + + with ThreadPoolExecutor(max_workers=num_uploads) as executor: + futures = [executor.submit(upload_unique_file, i) for i in range(num_uploads)] + for future in as_completed(futures): + pass + + assert len(errors) == 0, f"Concurrent upload errors: {errors}" + assert len(results) == num_uploads + + # Each upload should have unique artifact ID + artifact_ids = set(r[1]["artifact_id"] for r in results) + assert len(artifact_ids) == num_uploads + + # Each should match expected hash + for idx, result, expected_hash in results: + assert result["artifact_id"] == expected_hash diff --git a/backend/tests/integration/test_size_boundary.py b/backend/tests/integration/test_size_boundary.py new file mode 100644 index 0000000..49ed3d2 --- /dev/null +++ b/backend/tests/integration/test_size_boundary.py @@ -0,0 +1,583 @@ +""" +Integration tests for upload/download with various file sizes. 
+ +Tests cover: +- Small files (0B - 100KB) +- Medium files (1MB - 50MB) +- Large files (100MB - 1GB) - marked as slow/large +- Exact chunk boundaries +- Data integrity verification across all sizes +""" + +import pytest +import io +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from tests.factories import ( + compute_sha256, + upload_test_file, + generate_content, + generate_content_with_hash, +) +from tests.conftest import ( + SIZE_1B, + SIZE_1KB, + SIZE_10KB, + SIZE_100KB, + SIZE_1MB, + SIZE_5MB, + SIZE_10MB, + SIZE_50MB, + SIZE_100MB, + SIZE_250MB, + SIZE_500MB, + SIZE_1GB, + CHUNK_SIZE, + MULTIPART_THRESHOLD, +) + + +class TestSmallFileSizes: + """Tests for small file uploads/downloads (0B - 100KB).""" + + @pytest.mark.integration + def test_upload_download_1_byte(self, integration_client, test_package, sized_content): + """Test upload/download of 1 byte file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_1B, seed=1) + + result = upload_test_file( + integration_client, project, package, content, + filename="1byte.bin", tag="1byte" + ) + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_1B + + # Download and verify + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/1byte", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + assert len(response.content) == SIZE_1B + + @pytest.mark.integration + def test_upload_download_1kb(self, integration_client, test_package, sized_content): + """Test upload/download of 1KB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_1KB, seed=2) + + result = upload_test_file( + integration_client, project, package, content, + filename="1kb.bin", tag="1kb" + ) + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_1KB + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/1kb", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + + @pytest.mark.integration + def test_upload_download_10kb(self, integration_client, test_package, sized_content): + """Test upload/download of 10KB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_10KB, seed=3) + + result = upload_test_file( + integration_client, project, package, content, + filename="10kb.bin", tag="10kb" + ) + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_10KB + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/10kb", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + + @pytest.mark.integration + def test_upload_download_100kb(self, integration_client, test_package, sized_content): + """Test upload/download of 100KB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_100KB, seed=4) + + result = upload_test_file( + integration_client, project, package, content, + filename="100kb.bin", tag="100kb" + ) + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_100KB + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/100kb", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + + +class TestMediumFileSizes: + """Tests for medium file uploads/downloads (1MB - 50MB).""" + + @pytest.mark.integration + def 
test_upload_download_1mb(self, integration_client, test_package, sized_content): + """Test upload/download of 1MB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_1MB, seed=10) + + result = upload_test_file( + integration_client, project, package, content, + filename="1mb.bin", tag="1mb" + ) + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_1MB + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/1mb", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert len(response.content) == SIZE_1MB + assert compute_sha256(response.content) == expected_hash + + @pytest.mark.integration + def test_upload_download_5mb(self, integration_client, test_package, sized_content): + """Test upload/download of 5MB file (multipart threshold boundary area).""" + project, package = test_package + content, expected_hash = sized_content(SIZE_5MB, seed=11) + + result = upload_test_file( + integration_client, project, package, content, + filename="5mb.bin", tag="5mb" + ) + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_5MB + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/5mb", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert len(response.content) == SIZE_5MB + assert compute_sha256(response.content) == expected_hash + + @pytest.mark.integration + @pytest.mark.slow + def test_upload_download_10mb(self, integration_client, test_package, sized_content): + """Test upload/download of 10MB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_10MB, seed=12) + + result = upload_test_file( + integration_client, project, package, content, + filename="10mb.bin", tag="10mb" + ) + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_10MB + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/10mb", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert len(response.content) == SIZE_10MB + assert compute_sha256(response.content) == expected_hash + + @pytest.mark.integration + @pytest.mark.slow + def test_upload_download_50mb(self, integration_client, test_package, sized_content): + """Test upload/download of 50MB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_50MB, seed=13) + + start_time = time.time() + result = upload_test_file( + integration_client, project, package, content, + filename="50mb.bin", tag="50mb" + ) + upload_time = time.time() - start_time + + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_50MB + + start_time = time.time() + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/50mb", + params={"mode": "proxy"}, + ) + download_time = time.time() - start_time + + assert response.status_code == 200 + assert len(response.content) == SIZE_50MB + assert compute_sha256(response.content) == expected_hash + + # Log timing for performance tracking + print(f"\n50MB upload: {upload_time:.2f}s, download: {download_time:.2f}s") + + +class TestLargeFileSizes: + """Tests for large file uploads/downloads (100MB - 1GB). + + These tests are marked as slow and large, skipped by default. + Run with: pytest -m "large" to include these tests. 
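+
+    Each test prints its upload/download timing; pass -s (or -rP) to pytest
+    to see that output, since stdout from passing tests is captured by default.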
+ """ + + @pytest.mark.integration + @pytest.mark.slow + @pytest.mark.large + def test_upload_download_100mb(self, integration_client, test_package, sized_content): + """Test upload/download of 100MB file (multipart threshold).""" + project, package = test_package + content, expected_hash = sized_content(SIZE_100MB, seed=100) + + start_time = time.time() + result = upload_test_file( + integration_client, project, package, content, + filename="100mb.bin", tag="100mb" + ) + upload_time = time.time() - start_time + + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_100MB + + start_time = time.time() + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/100mb", + params={"mode": "proxy"}, + ) + download_time = time.time() - start_time + + assert response.status_code == 200 + assert len(response.content) == SIZE_100MB + assert compute_sha256(response.content) == expected_hash + + print(f"\n100MB upload: {upload_time:.2f}s, download: {download_time:.2f}s") + + @pytest.mark.integration + @pytest.mark.slow + @pytest.mark.large + def test_upload_download_250mb(self, integration_client, test_package, sized_content): + """Test upload/download of 250MB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_250MB, seed=250) + + start_time = time.time() + result = upload_test_file( + integration_client, project, package, content, + filename="250mb.bin", tag="250mb" + ) + upload_time = time.time() - start_time + + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_250MB + + start_time = time.time() + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/250mb", + params={"mode": "proxy"}, + ) + download_time = time.time() - start_time + + assert response.status_code == 200 + assert len(response.content) == SIZE_250MB + assert compute_sha256(response.content) == expected_hash + + print(f"\n250MB upload: {upload_time:.2f}s, download: {download_time:.2f}s") + + @pytest.mark.integration + @pytest.mark.slow + @pytest.mark.large + def test_upload_download_500mb(self, integration_client, test_package, sized_content): + """Test upload/download of 500MB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_500MB, seed=500) + + start_time = time.time() + result = upload_test_file( + integration_client, project, package, content, + filename="500mb.bin", tag="500mb" + ) + upload_time = time.time() - start_time + + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_500MB + + start_time = time.time() + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/500mb", + params={"mode": "proxy"}, + ) + download_time = time.time() - start_time + + assert response.status_code == 200 + assert len(response.content) == SIZE_500MB + assert compute_sha256(response.content) == expected_hash + + print(f"\n500MB upload: {upload_time:.2f}s, download: {download_time:.2f}s") + + @pytest.mark.integration + @pytest.mark.slow + @pytest.mark.large + def test_upload_download_1gb(self, integration_client, test_package, sized_content): + """Test upload/download of 1GB file. + + This test may take several minutes depending on network/disk speed. 
+ """ + project, package = test_package + content, expected_hash = sized_content(SIZE_1GB, seed=1024) + + start_time = time.time() + result = upload_test_file( + integration_client, project, package, content, + filename="1gb.bin", tag="1gb" + ) + upload_time = time.time() - start_time + + assert result["artifact_id"] == expected_hash + assert result["size"] == SIZE_1GB + + start_time = time.time() + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/1gb", + params={"mode": "proxy"}, + ) + download_time = time.time() - start_time + + assert response.status_code == 200 + assert len(response.content) == SIZE_1GB + assert compute_sha256(response.content) == expected_hash + + print(f"\n1GB upload: {upload_time:.2f}s, download: {download_time:.2f}s") + + +class TestChunkBoundaries: + """Tests for exact chunk size boundaries.""" + + @pytest.mark.integration + def test_upload_download_at_chunk_size(self, integration_client, test_package, sized_content): + """Test upload/download at exact chunk size (64KB).""" + project, package = test_package + content, expected_hash = sized_content(CHUNK_SIZE, seed=64) + + result = upload_test_file( + integration_client, project, package, content, + filename="chunk.bin", tag="chunk-exact" + ) + assert result["artifact_id"] == expected_hash + assert result["size"] == CHUNK_SIZE + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/chunk-exact", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + + @pytest.mark.integration + def test_upload_download_chunk_size_plus_1(self, integration_client, test_package, sized_content): + """Test upload/download at chunk size + 1 byte.""" + project, package = test_package + size = CHUNK_SIZE + 1 + content, expected_hash = sized_content(size, seed=65) + + result = upload_test_file( + integration_client, project, package, content, + filename="chunk_plus.bin", tag="chunk-plus" + ) + assert result["artifact_id"] == expected_hash + assert result["size"] == size + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/chunk-plus", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + + @pytest.mark.integration + def test_upload_download_chunk_size_minus_1(self, integration_client, test_package, sized_content): + """Test upload/download at chunk size - 1 byte.""" + project, package = test_package + size = CHUNK_SIZE - 1 + content, expected_hash = sized_content(size, seed=63) + + result = upload_test_file( + integration_client, project, package, content, + filename="chunk_minus.bin", tag="chunk-minus" + ) + assert result["artifact_id"] == expected_hash + assert result["size"] == size + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/chunk-minus", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + + @pytest.mark.integration + def test_upload_download_multiple_chunks(self, integration_client, test_package, sized_content): + """Test upload/download spanning multiple chunks.""" + project, package = test_package + size = CHUNK_SIZE * 3 + 1000 # 3 full chunks + partial + content, expected_hash = sized_content(size, seed=300) + + result = upload_test_file( + integration_client, project, package, content, + filename="multi_chunk.bin", tag="multi-chunk" + ) + assert result["artifact_id"] == expected_hash + assert result["size"] == size + + response = integration_client.get( + 
f"/api/v1/project/{project}/{package}/+/multi-chunk", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + + +class TestDataIntegrity: + """Tests for data integrity with various content types.""" + + @pytest.mark.integration + def test_binary_content_integrity(self, integration_client, test_package): + """Test binary content (all byte values 0-255) integrity.""" + project, package = test_package + # Content with all 256 possible byte values + content = bytes(range(256)) * 100 # 25.6KB + expected_hash = compute_sha256(content) + + result = upload_test_file( + integration_client, project, package, content, + filename="binary.bin", tag="binary" + ) + assert result["artifact_id"] == expected_hash + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/binary", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + + @pytest.mark.integration + def test_text_content_integrity(self, integration_client, test_package): + """Test UTF-8 text content integrity.""" + project, package = test_package + content = "Hello, World! 你好世界 🌍 مرحبا العالم".encode("utf-8") + expected_hash = compute_sha256(content) + + result = upload_test_file( + integration_client, project, package, content, + filename="text.txt", tag="text" + ) + assert result["artifact_id"] == expected_hash + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/text", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + assert response.content.decode("utf-8") == "Hello, World! 你好世界 🌍 مرحبا العالم" + + @pytest.mark.integration + def test_null_bytes_content_integrity(self, integration_client, test_package): + """Test content with null bytes.""" + project, package = test_package + content = b"before\x00null\x00bytes\x00after" + expected_hash = compute_sha256(content) + + result = upload_test_file( + integration_client, project, package, content, + filename="nulls.bin", tag="nulls" + ) + assert result["artifact_id"] == expected_hash + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/nulls", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + assert b"\x00" in response.content + + @pytest.mark.integration + def test_unicode_filename_integrity(self, integration_client, test_package): + """Test file with unicode filename.""" + project, package = test_package + content = b"unicode filename test" + expected_hash = compute_sha256(content) + + result = upload_test_file( + integration_client, project, package, content, + filename="文件名.txt", tag="unicode-name" + ) + assert result["artifact_id"] == expected_hash + assert result["original_name"] == "文件名.txt" + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/unicode-name", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + + @pytest.mark.integration + def test_compressed_content_integrity(self, integration_client, test_package): + """Test gzip-compressed content integrity.""" + import gzip + + project, package = test_package + original = b"This is some text that will be compressed " * 100 + content = gzip.compress(original) + expected_hash = compute_sha256(content) + + result = upload_test_file( + integration_client, project, package, content, + filename="data.gz", tag="compressed" + ) + assert result["artifact_id"] == 
expected_hash + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/compressed", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + # Verify we can decompress + assert gzip.decompress(response.content) == original + + @pytest.mark.integration + def test_hash_verification_matches(self, integration_client, test_package, sized_content): + """Test that computed hash matches artifact_id for various sizes.""" + project, package = test_package + + sizes = [SIZE_1B, SIZE_1KB, SIZE_10KB, SIZE_100KB, SIZE_1MB] + + for i, size in enumerate(sizes): + content, expected_hash = sized_content(size, seed=1000 + i) + + result = upload_test_file( + integration_client, project, package, content, + filename=f"hash_test_{size}.bin", tag=f"hash-{size}" + ) + + # Verify artifact_id matches expected hash + assert result["artifact_id"] == expected_hash + + # Download and verify hash of downloaded content + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/hash-{size}", + params={"mode": "proxy"}, + ) + downloaded_hash = compute_sha256(response.content) + assert downloaded_hash == expected_hash diff --git a/backend/tests/integration/test_streaming_download.py b/backend/tests/integration/test_streaming_download.py new file mode 100644 index 0000000..b6163ad --- /dev/null +++ b/backend/tests/integration/test_streaming_download.py @@ -0,0 +1,535 @@ +""" +Integration tests for streaming download functionality. + +Tests cover: +- HTTP Range requests (partial downloads, resume) +- Conditional requests (If-None-Match, If-Modified-Since) +- Caching headers (Cache-Control, Last-Modified, Accept-Ranges) +- Large file streaming +- Download modes (proxy, redirect, presigned) +""" + +import pytest +import io +import time +from email.utils import formatdate +from tests.factories import ( + compute_sha256, + upload_test_file, +) +from tests.conftest import ( + SIZE_1KB, + SIZE_100KB, + SIZE_1MB, +) + + +class TestRangeRequests: + """Tests for HTTP Range request support (partial downloads).""" + + @pytest.mark.integration + def test_range_request_first_bytes(self, integration_client, test_package): + """Test range request for first N bytes.""" + project, package = test_package + content = b"0123456789" * 100 # 1000 bytes + upload_test_file(integration_client, project, package, content, tag="range-test") + + # Request first 10 bytes + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/range-test", + params={"mode": "proxy"}, + headers={"Range": "bytes=0-9"}, + ) + assert response.status_code == 206 # Partial Content + assert response.content == b"0123456789" + assert "Content-Range" in response.headers + assert response.headers["Content-Range"].startswith("bytes 0-9/") + + @pytest.mark.integration + def test_range_request_middle_bytes(self, integration_client, test_package): + """Test range request for bytes in the middle.""" + project, package = test_package + content = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ" + upload_test_file(integration_client, project, package, content, tag="range-mid") + + # Request bytes 10-19 (KLMNOPQRST) + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/range-mid", + params={"mode": "proxy"}, + headers={"Range": "bytes=10-19"}, + ) + assert response.status_code == 206 + assert response.content == b"KLMNOPQRST" + + @pytest.mark.integration + def test_range_request_suffix_bytes(self, integration_client, test_package): + """Test range request for last N 
bytes (suffix range).""" + project, package = test_package + content = b"0123456789ABCDEF" # 16 bytes + upload_test_file(integration_client, project, package, content, tag="range-suffix") + + # Request last 4 bytes + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/range-suffix", + params={"mode": "proxy"}, + headers={"Range": "bytes=-4"}, + ) + assert response.status_code == 206 + assert response.content == b"CDEF" + + @pytest.mark.integration + def test_range_request_open_ended(self, integration_client, test_package): + """Test range request from offset to end.""" + project, package = test_package + content = b"0123456789" + upload_test_file(integration_client, project, package, content, tag="range-open") + + # Request from byte 5 to end + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/range-open", + params={"mode": "proxy"}, + headers={"Range": "bytes=5-"}, + ) + assert response.status_code == 206 + assert response.content == b"56789" + + @pytest.mark.integration + def test_range_request_includes_accept_ranges_header( + self, integration_client, test_package + ): + """Test that range requests include Accept-Ranges header.""" + project, package = test_package + content = b"test content" + upload_test_file(integration_client, project, package, content, tag="accept-ranges") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/accept-ranges", + params={"mode": "proxy"}, + headers={"Range": "bytes=0-4"}, + ) + assert response.status_code == 206 + assert response.headers.get("Accept-Ranges") == "bytes" + + @pytest.mark.integration + def test_full_download_advertises_accept_ranges( + self, integration_client, test_package + ): + """Test that full downloads advertise range support.""" + project, package = test_package + content = b"test content" + upload_test_file(integration_client, project, package, content, tag="full-accept") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/full-accept", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.headers.get("Accept-Ranges") == "bytes" + + +class TestConditionalRequests: + """Tests for conditional request handling (304 Not Modified).""" + + @pytest.mark.integration + def test_if_none_match_returns_304(self, integration_client, test_package): + """Test If-None-Match with matching ETag returns 304.""" + project, package = test_package + content = b"conditional request test content" + expected_hash = compute_sha256(content) + upload_test_file(integration_client, project, package, content, tag="cond-etag") + + # Request with matching ETag + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/cond-etag", + params={"mode": "proxy"}, + headers={"If-None-Match": f'"{expected_hash}"'}, + ) + assert response.status_code == 304 + assert response.content == b"" # No body for 304 + + @pytest.mark.integration + def test_if_none_match_without_quotes(self, integration_client, test_package): + """Test If-None-Match works with or without quotes.""" + project, package = test_package + content = b"etag no quotes test" + expected_hash = compute_sha256(content) + upload_test_file(integration_client, project, package, content, tag="cond-noquote") + + # Request with ETag without quotes + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/cond-noquote", + params={"mode": "proxy"}, + headers={"If-None-Match": expected_hash}, + ) + assert response.status_code == 304 + + 
@pytest.mark.integration + def test_if_none_match_mismatch_returns_200(self, integration_client, test_package): + """Test If-None-Match with non-matching ETag returns 200.""" + project, package = test_package + content = b"etag mismatch test" + upload_test_file(integration_client, project, package, content, tag="cond-mismatch") + + # Request with different ETag + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/cond-mismatch", + params={"mode": "proxy"}, + headers={"If-None-Match": '"different-etag-value"'}, + ) + assert response.status_code == 200 + assert response.content == content + + @pytest.mark.integration + def test_if_modified_since_returns_304(self, integration_client, test_package): + """Test If-Modified-Since with future date returns 304.""" + project, package = test_package + content = b"modified since test" + upload_test_file(integration_client, project, package, content, tag="cond-modified") + + # Request with future date (artifact was definitely created before this) + future_date = formatdate(time.time() + 86400, usegmt=True) # Tomorrow + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/cond-modified", + params={"mode": "proxy"}, + headers={"If-Modified-Since": future_date}, + ) + assert response.status_code == 304 + + @pytest.mark.integration + def test_if_modified_since_old_date_returns_200( + self, integration_client, test_package + ): + """Test If-Modified-Since with old date returns 200.""" + project, package = test_package + content = b"old date test" + upload_test_file(integration_client, project, package, content, tag="cond-old") + + # Request with old date (2020-01-01) + old_date = "Wed, 01 Jan 2020 00:00:00 GMT" + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/cond-old", + params={"mode": "proxy"}, + headers={"If-Modified-Since": old_date}, + ) + assert response.status_code == 200 + assert response.content == content + + @pytest.mark.integration + def test_304_includes_etag(self, integration_client, test_package): + """Test 304 response includes ETag header.""" + project, package = test_package + content = b"304 etag test" + expected_hash = compute_sha256(content) + upload_test_file(integration_client, project, package, content, tag="304-etag") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/304-etag", + params={"mode": "proxy"}, + headers={"If-None-Match": f'"{expected_hash}"'}, + ) + assert response.status_code == 304 + assert response.headers.get("ETag") == f'"{expected_hash}"' + + @pytest.mark.integration + def test_304_includes_cache_control(self, integration_client, test_package): + """Test 304 response includes Cache-Control header.""" + project, package = test_package + content = b"304 cache test" + expected_hash = compute_sha256(content) + upload_test_file(integration_client, project, package, content, tag="304-cache") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/304-cache", + params={"mode": "proxy"}, + headers={"If-None-Match": f'"{expected_hash}"'}, + ) + assert response.status_code == 304 + assert "immutable" in response.headers.get("Cache-Control", "") + + +class TestCachingHeaders: + """Tests for caching headers on download responses.""" + + @pytest.mark.integration + def test_download_includes_cache_control(self, integration_client, test_package): + """Test download response includes Cache-Control header.""" + project, package = test_package + content = b"cache control test" + 
upload_test_file(integration_client, project, package, content, tag="cache-ctl") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/cache-ctl", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + cache_control = response.headers.get("Cache-Control", "") + assert "public" in cache_control + assert "immutable" in cache_control + assert "max-age" in cache_control + + @pytest.mark.integration + def test_download_includes_last_modified(self, integration_client, test_package): + """Test download response includes Last-Modified header.""" + project, package = test_package + content = b"last modified test" + upload_test_file(integration_client, project, package, content, tag="last-mod") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/last-mod", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert "Last-Modified" in response.headers + # Should be in RFC 7231 format + last_modified = response.headers["Last-Modified"] + assert "GMT" in last_modified + + @pytest.mark.integration + def test_download_includes_etag(self, integration_client, test_package): + """Test download response includes ETag header.""" + project, package = test_package + content = b"etag header test" + expected_hash = compute_sha256(content) + upload_test_file(integration_client, project, package, content, tag="etag-hdr") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/etag-hdr", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.headers.get("ETag") == f'"{expected_hash}"' + + +class TestDownloadResume: + """Tests for download resume functionality using range requests.""" + + @pytest.mark.integration + def test_resume_download_after_partial(self, integration_client, test_package): + """Test resuming download from where it left off.""" + project, package = test_package + content = b"ABCDEFGHIJ" * 100 # 1000 bytes + upload_test_file(integration_client, project, package, content, tag="resume-test") + + # Simulate partial download (first 500 bytes) + response1 = integration_client.get( + f"/api/v1/project/{project}/{package}/+/resume-test", + params={"mode": "proxy"}, + headers={"Range": "bytes=0-499"}, + ) + assert response1.status_code == 206 + first_half = response1.content + assert len(first_half) == 500 + + # Resume from byte 500 + response2 = integration_client.get( + f"/api/v1/project/{project}/{package}/+/resume-test", + params={"mode": "proxy"}, + headers={"Range": "bytes=500-"}, + ) + assert response2.status_code == 206 + second_half = response2.content + assert len(second_half) == 500 + + # Combine and verify + combined = first_half + second_half + assert combined == content + + @pytest.mark.integration + def test_resume_with_etag_verification(self, integration_client, test_package): + """Test that resumed download can verify content hasn't changed.""" + project, package = test_package + content = b"resume etag verification test content" + expected_hash = compute_sha256(content) + upload_test_file(integration_client, project, package, content, tag="resume-etag") + + # Get ETag from first request + response1 = integration_client.get( + f"/api/v1/project/{project}/{package}/+/resume-etag", + params={"mode": "proxy"}, + headers={"Range": "bytes=0-9"}, + ) + assert response1.status_code == 206 + etag = response1.headers.get("ETag") + assert etag == f'"{expected_hash}"' + + # Resume with If-Match to ensure content hasn't changed + # (Note: If-Match would 
fail and return 412 if content changed) + response2 = integration_client.get( + f"/api/v1/project/{project}/{package}/+/resume-etag", + params={"mode": "proxy"}, + headers={"Range": "bytes=10-"}, + ) + assert response2.status_code == 206 + # ETag should be the same + assert response2.headers.get("ETag") == etag + + +class TestLargeFileStreaming: + """Tests for streaming large files.""" + + @pytest.mark.integration + def test_stream_1mb_file(self, integration_client, test_package, sized_content): + """Test streaming a 1MB file.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_1MB, seed=500) + + upload_test_file(integration_client, project, package, content, tag="stream-1mb") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/stream-1mb", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert len(response.content) == SIZE_1MB + assert compute_sha256(response.content) == expected_hash + + @pytest.mark.integration + def test_stream_large_file_has_correct_headers( + self, integration_client, test_package, sized_content + ): + """Test that large file streaming has correct headers.""" + project, package = test_package + content, expected_hash = sized_content(SIZE_100KB, seed=501) + + upload_test_file(integration_client, project, package, content, tag="stream-hdr") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/stream-hdr", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert int(response.headers.get("Content-Length", 0)) == SIZE_100KB + assert response.headers.get("X-Checksum-SHA256") == expected_hash + assert response.headers.get("Accept-Ranges") == "bytes" + + @pytest.mark.integration + def test_range_request_on_large_file( + self, integration_client, test_package, sized_content + ): + """Test range request on a larger file.""" + project, package = test_package + content, _ = sized_content(SIZE_100KB, seed=502) + + upload_test_file(integration_client, project, package, content, tag="range-large") + + # Request a slice from the middle + start = 50000 + end = 50999 + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/range-large", + params={"mode": "proxy"}, + headers={"Range": f"bytes={start}-{end}"}, + ) + assert response.status_code == 206 + assert len(response.content) == 1000 + assert response.content == content[start : end + 1] + + +class TestDownloadModes: + """Tests for different download modes.""" + + @pytest.mark.integration + def test_proxy_mode_streams_content(self, integration_client, test_package): + """Test proxy mode streams content through backend.""" + project, package = test_package + content = b"proxy mode test content" + upload_test_file(integration_client, project, package, content, tag="mode-proxy") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/mode-proxy", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + + @pytest.mark.integration + def test_presigned_mode_returns_url(self, integration_client, test_package): + """Test presigned mode returns JSON with URL.""" + project, package = test_package + content = b"presigned mode test" + upload_test_file(integration_client, project, package, content, tag="mode-presign") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/mode-presign", + params={"mode": "presigned"}, + ) + assert response.status_code == 200 + data = response.json() + assert 
"url" in data + assert "expires_at" in data + assert data["url"].startswith("http") + + @pytest.mark.integration + def test_redirect_mode_returns_302(self, integration_client, test_package): + """Test redirect mode returns 302 to presigned URL.""" + project, package = test_package + content = b"redirect mode test" + upload_test_file(integration_client, project, package, content, tag="mode-redir") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/mode-redir", + params={"mode": "redirect"}, + follow_redirects=False, + ) + assert response.status_code == 302 + assert "Location" in response.headers + + +class TestIntegrityDuringStreaming: + """Tests for data integrity during streaming downloads.""" + + @pytest.mark.integration + def test_checksum_header_matches_content(self, integration_client, test_package): + """Test X-Checksum-SHA256 header matches actual downloaded content.""" + project, package = test_package + content = b"integrity check content" + expected_hash = compute_sha256(content) + upload_test_file(integration_client, project, package, content, tag="integrity") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/integrity", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + + header_hash = response.headers.get("X-Checksum-SHA256") + actual_hash = compute_sha256(response.content) + + assert header_hash == expected_hash + assert actual_hash == expected_hash + assert header_hash == actual_hash + + @pytest.mark.integration + def test_etag_matches_content_hash(self, integration_client, test_package): + """Test ETag header matches content hash.""" + project, package = test_package + content = b"etag integrity test" + expected_hash = compute_sha256(content) + upload_test_file(integration_client, project, package, content, tag="etag-int") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/etag-int", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + + etag = response.headers.get("ETag", "").strip('"') + actual_hash = compute_sha256(response.content) + + assert etag == expected_hash + assert actual_hash == expected_hash + + @pytest.mark.integration + def test_digest_header_present(self, integration_client, test_package): + """Test Digest header is present in RFC 3230 format.""" + project, package = test_package + content = b"digest header test" + upload_test_file(integration_client, project, package, content, tag="digest") + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/digest", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert "Digest" in response.headers + assert response.headers["Digest"].startswith("sha-256=") diff --git a/backend/tests/integration/test_upload_download_api.py b/backend/tests/integration/test_upload_download_api.py index 4d9b8b2..936a4ca 100644 --- a/backend/tests/integration/test_upload_download_api.py +++ b/backend/tests/integration/test_upload_download_api.py @@ -10,6 +10,7 @@ Tests cover: - S3 storage verification """ +import os import pytest import io import threading @@ -25,6 +26,19 @@ from tests.factories import ( class TestUploadBasics: """Tests for basic upload functionality.""" + @pytest.mark.integration + def test_upload_returns_200(self, integration_client, test_package): + """Test upload with valid file returns 200.""" + project, package = test_package + content = b"valid file upload test" + + files = {"file": ("test.bin", io.BytesIO(content), 
"application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + ) + assert response.status_code == 200 + @pytest.mark.integration def test_upload_returns_artifact_id(self, integration_client, test_package): """Test upload returns the artifact ID (SHA256 hash).""" @@ -101,6 +115,83 @@ class TestUploadBasics: assert "created_at" in result assert result["created_at"] is not None + @pytest.mark.integration + def test_upload_without_tag_succeeds(self, integration_client, test_package): + """Test upload without tag succeeds (no tag created).""" + project, package = test_package + content = b"upload without tag test" + expected_hash = compute_sha256(content) + + files = {"file": ("no_tag.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + # No tag parameter + ) + assert response.status_code == 200 + result = response.json() + assert result["artifact_id"] == expected_hash + + # Verify no tag was created - list tags and check + tags_response = integration_client.get( + f"/api/v1/project/{project}/{package}/tags" + ) + assert tags_response.status_code == 200 + tags = tags_response.json() + # Filter for tags pointing to this artifact + artifact_tags = [t for t in tags.get("items", tags) if t.get("artifact_id") == expected_hash] + assert len(artifact_tags) == 0, "Tag should not be created when not specified" + + @pytest.mark.integration + def test_upload_creates_artifact_in_database(self, integration_client, test_package): + """Test upload creates artifact record in database.""" + project, package = test_package + content = b"database artifact test" + expected_hash = compute_sha256(content) + + upload_test_file(integration_client, project, package, content) + + # Verify artifact exists via API + response = integration_client.get(f"/api/v1/artifact/{expected_hash}") + assert response.status_code == 200 + artifact = response.json() + assert artifact["id"] == expected_hash + assert artifact["size"] == len(content) + + @pytest.mark.integration + @pytest.mark.requires_direct_s3 + def test_upload_creates_object_in_s3(self, integration_client, test_package): + """Test upload creates object in S3 storage.""" + project, package = test_package + content = b"s3 object creation test" + expected_hash = compute_sha256(content) + + upload_test_file(integration_client, project, package, content) + + # Verify S3 object exists + assert s3_object_exists(expected_hash), "S3 object should exist after upload" + + @pytest.mark.integration + def test_upload_with_tag_creates_tag_record(self, integration_client, test_package): + """Test upload with tag creates tag record.""" + project, package = test_package + content = b"tag creation test" + expected_hash = compute_sha256(content) + tag_name = "my-tag-v1" + + upload_test_file( + integration_client, project, package, content, tag=tag_name + ) + + # Verify tag exists + tags_response = integration_client.get( + f"/api/v1/project/{project}/{package}/tags" + ) + assert tags_response.status_code == 200 + tags = tags_response.json() + tag_names = [t["name"] for t in tags.get("items", tags)] + assert tag_name in tag_names + class TestDuplicateUploads: """Tests for duplicate upload deduplication behavior.""" @@ -248,6 +339,23 @@ class TestDownload: assert response.status_code == 200 assert response.content == original_content + @pytest.mark.integration + def test_download_by_tag_prefix(self, integration_client, 
test_package): + """Test downloading artifact using tag: prefix.""" + project, package = test_package + original_content = b"download by tag prefix test" + + upload_test_file( + integration_client, project, package, original_content, tag="prefix-tag" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/tag:prefix-tag", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == original_content + @pytest.mark.integration def test_download_nonexistent_tag(self, integration_client, test_package): """Test downloading nonexistent tag returns 404.""" @@ -258,6 +366,33 @@ class TestDownload: ) assert response.status_code == 404 + @pytest.mark.integration + def test_download_nonexistent_artifact(self, integration_client, test_package): + """Test downloading nonexistent artifact ID returns 404.""" + project, package = test_package + fake_hash = "0" * 64 + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/artifact:{fake_hash}" + ) + assert response.status_code == 404 + + @pytest.mark.integration + def test_download_from_nonexistent_project(self, integration_client, unique_test_id): + """Test downloading from nonexistent project returns 404.""" + response = integration_client.get( + f"/api/v1/project/nonexistent-project-{unique_test_id}/somepackage/+/sometag" + ) + assert response.status_code == 404 + + @pytest.mark.integration + def test_download_from_nonexistent_package(self, integration_client, test_project, unique_test_id): + """Test downloading from nonexistent package returns 404.""" + response = integration_client.get( + f"/api/v1/project/{test_project}/nonexistent-package-{unique_test_id}/+/sometag" + ) + assert response.status_code == 404 + @pytest.mark.integration def test_content_matches_original(self, integration_client, test_package): """Test downloaded content matches original exactly.""" @@ -275,6 +410,111 @@ class TestDownload: assert response.content == original_content + +class TestDownloadHeaders: + """Tests for download response headers.""" + + @pytest.mark.integration + def test_download_content_type_header(self, integration_client, test_package): + """Test download returns correct Content-Type header.""" + project, package = test_package + content = b"content type header test" + + upload_test_file( + integration_client, project, package, content, + filename="test.txt", tag="content-type-test" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/content-type-test", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + # Content-Type should be set (either text/plain or application/octet-stream) + assert "content-type" in response.headers + + @pytest.mark.integration + def test_download_content_length_header(self, integration_client, test_package): + """Test download returns correct Content-Length header.""" + project, package = test_package + content = b"content length header test - exactly 46 bytes!"
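+ # The asserted length is derived from the raw bytes; the server's header must echo it exactly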
+ expected_length = len(content) + + upload_test_file( + integration_client, project, package, content, tag="content-length-test" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/content-length-test", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert "content-length" in response.headers + assert int(response.headers["content-length"]) == expected_length + + @pytest.mark.integration + def test_download_content_disposition_header(self, integration_client, test_package): + """Test download returns correct Content-Disposition header.""" + project, package = test_package + content = b"content disposition test" + filename = "my-test-file.bin" + + upload_test_file( + integration_client, project, package, content, + filename=filename, tag="disposition-test" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/disposition-test", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert "content-disposition" in response.headers + disposition = response.headers["content-disposition"] + assert "attachment" in disposition + assert filename in disposition + + @pytest.mark.integration + def test_download_checksum_headers(self, integration_client, test_package): + """Test download returns checksum headers.""" + project, package = test_package + content = b"checksum header test content" + expected_hash = compute_sha256(content) + + upload_test_file( + integration_client, project, package, content, tag="checksum-headers" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/checksum-headers", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + # Check for checksum headers + assert "x-checksum-sha256" in response.headers + assert response.headers["x-checksum-sha256"] == expected_hash + + @pytest.mark.integration + def test_download_etag_header(self, integration_client, test_package): + """Test download returns ETag header (artifact ID).""" + project, package = test_package + content = b"etag header test" + expected_hash = compute_sha256(content) + + upload_test_file( + integration_client, project, package, content, tag="etag-test" + ) + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/etag-test", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert "etag" in response.headers + # ETag should contain the artifact ID (hash) + etag = response.headers["etag"].strip('"') + assert etag == expected_hash + + class TestConcurrentUploads: """Tests for concurrent upload handling.""" @@ -301,7 +541,7 @@ class TestConcurrentUploads: try: from httpx import Client - base_url = "http://localhost:8080" + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") with Client(base_url=base_url, timeout=30.0) as client: files = { "file": ( @@ -397,6 +637,7 @@ class TestUploadFailureCleanup: """Tests for cleanup when uploads fail.""" @pytest.mark.integration + @pytest.mark.requires_direct_s3 def test_upload_failure_invalid_project_no_orphaned_s3( self, integration_client, unique_test_id ): @@ -419,6 +660,7 @@ class TestUploadFailureCleanup: ) @pytest.mark.integration + @pytest.mark.requires_direct_s3 def test_upload_failure_invalid_package_no_orphaned_s3( self, integration_client, test_project, unique_test_id ): @@ -466,6 +708,7 @@ class TestS3StorageVerification: """Tests to verify S3 storage behavior.""" @pytest.mark.integration + @pytest.mark.requires_direct_s3 def 
test_s3_single_object_after_duplicates( self, integration_client, test_package, unique_test_id ): @@ -521,6 +764,7 @@ class TestSecurityPathTraversal: """ @pytest.mark.integration + @pytest.mark.requires_direct_s3 def test_path_traversal_in_filename_stored_safely( self, integration_client, test_package ): diff --git a/backend/tests/integration/test_version_api.py b/backend/tests/integration/test_version_api.py new file mode 100644 index 0000000..42b63f2 --- /dev/null +++ b/backend/tests/integration/test_version_api.py @@ -0,0 +1,347 @@ +""" +Integration tests for package version API endpoints. + +Tests cover: +- Version creation via upload +- Version auto-detection from filename +- Version listing and retrieval +- Download by version prefix +- Version deletion +""" + +import pytest +import io +from tests.factories import ( + compute_sha256, + upload_test_file, +) + + +class TestVersionCreation: + """Tests for creating versions via upload.""" + + @pytest.mark.integration + def test_upload_with_explicit_version(self, integration_client, test_package): + """Test upload with explicit version parameter creates version record.""" + project, package = test_package + content = b"version creation test" + expected_hash = compute_sha256(content) + + files = {"file": ("app.tar.gz", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"version": "1.0.0"}, + ) + assert response.status_code == 200 + result = response.json() + assert result["artifact_id"] == expected_hash + assert result.get("version") == "1.0.0" + assert result.get("version_source") == "explicit" + + @pytest.mark.integration + def test_upload_with_version_and_tag(self, integration_client, test_package): + """Test upload with both version and tag creates both records.""" + project, package = test_package + content = b"version and tag test" + + files = {"file": ("app.tar.gz", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"version": "2.0.0", "tag": "latest"}, + ) + assert response.status_code == 200 + result = response.json() + assert result.get("version") == "2.0.0" + + # Verify tag was also created + tags_response = integration_client.get( + f"/api/v1/project/{project}/{package}/tags" + ) + assert tags_response.status_code == 200 + tags = tags_response.json() + tag_names = [t["name"] for t in tags.get("items", tags)] + assert "latest" in tag_names + + @pytest.mark.integration + def test_duplicate_version_same_content_succeeds(self, integration_client, test_package): + """Test uploading same version with same content succeeds (deduplication).""" + project, package = test_package + content = b"version dedup test" + + # First upload with version + files1 = {"file": ("app1.tar.gz", io.BytesIO(content), "application/octet-stream")} + response1 = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files1, + data={"version": "3.0.0"}, + ) + assert response1.status_code == 200 + + # Second upload with same version and same content succeeds + files2 = {"file": ("app2.tar.gz", io.BytesIO(content), "application/octet-stream")} + response2 = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files2, + data={"version": "3.0.0"}, + ) + # This succeeds because it's the same artifact (deduplication) + assert response2.status_code == 200 + + +class TestVersionAutoDetection: + 
"""Tests for automatic version detection from filename.""" + + @pytest.mark.integration + def test_version_detected_from_filename_tarball(self, integration_client, test_package): + """Test version is auto-detected from tarball filename or metadata.""" + project, package = test_package + content = b"auto detect version tarball" + + files = {"file": ("myapp-1.2.3.tar.gz", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + ) + assert response.status_code == 200 + result = response.json() + assert result.get("version") == "1.2.3" + # Version source can be 'filename' or 'metadata' depending on detection order + assert result.get("version_source") in ["filename", "metadata"] + + @pytest.mark.integration + def test_version_detected_from_filename_zip(self, integration_client, test_package): + """Test version is auto-detected from zip filename.""" + project, package = test_package + content = b"auto detect version zip" + + files = {"file": ("package-2.0.0.zip", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + ) + assert response.status_code == 200 + result = response.json() + assert result.get("version") == "2.0.0" + assert result.get("version_source") == "filename" + + @pytest.mark.integration + def test_explicit_version_overrides_filename(self, integration_client, test_package): + """Test explicit version parameter overrides filename detection.""" + project, package = test_package + content = b"explicit override test" + + files = {"file": ("myapp-1.0.0.tar.gz", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"version": "9.9.9"}, + ) + assert response.status_code == 200 + result = response.json() + assert result.get("version") == "9.9.9" + assert result.get("version_source") == "explicit" + + @pytest.mark.integration + def test_no_version_detected_from_plain_filename(self, integration_client, test_package): + """Test no version is created for filenames without version pattern.""" + project, package = test_package + content = b"no version in filename" + + files = {"file": ("plain-file.bin", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + ) + assert response.status_code == 200 + result = response.json() + # Version should be None or not present + assert result.get("version") is None + + +class TestVersionListing: + """Tests for listing and retrieving versions.""" + + @pytest.mark.integration + def test_list_versions(self, integration_client, test_package): + """Test listing all versions for a package.""" + project, package = test_package + + # Create multiple versions + for ver in ["1.0.0", "1.1.0", "2.0.0"]: + content = f"version {ver} content".encode() + files = {"file": (f"app-{ver}.tar.gz", io.BytesIO(content), "application/octet-stream")} + response = integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"version": ver}, + ) + assert response.status_code == 200 + + # List versions + response = integration_client.get( + f"/api/v1/project/{project}/{package}/versions" + ) + assert response.status_code == 200 + data = response.json() + versions = [v["version"] for v in data.get("items", data)] + assert "1.0.0" in versions + assert 
"1.1.0" in versions + assert "2.0.0" in versions + + @pytest.mark.integration + def test_get_specific_version(self, integration_client, test_package): + """Test getting details for a specific version.""" + project, package = test_package + content = b"specific version test" + expected_hash = compute_sha256(content) + + # Create version + files = {"file": ("app-4.0.0.tar.gz", io.BytesIO(content), "application/octet-stream")} + integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"version": "4.0.0"}, + ) + + # Get version details + response = integration_client.get( + f"/api/v1/project/{project}/{package}/versions/4.0.0" + ) + assert response.status_code == 200 + data = response.json() + assert data["version"] == "4.0.0" + assert data["artifact_id"] == expected_hash + + @pytest.mark.integration + def test_get_nonexistent_version_returns_404(self, integration_client, test_package): + """Test getting nonexistent version returns 404.""" + project, package = test_package + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/versions/99.99.99" + ) + assert response.status_code == 404 + + +class TestDownloadByVersion: + """Tests for downloading artifacts by version.""" + + @pytest.mark.integration + def test_download_by_version_prefix(self, integration_client, test_package): + """Test downloading artifact using version: prefix.""" + project, package = test_package + content = b"download by version test" + expected_hash = compute_sha256(content) + + # Upload with version + files = {"file": ("app.tar.gz", io.BytesIO(content), "application/octet-stream")} + integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"version": "5.0.0"}, + ) + + # Download by version prefix + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/version:5.0.0", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == content + + @pytest.mark.integration + def test_download_nonexistent_version_returns_404(self, integration_client, test_package): + """Test downloading nonexistent version returns 404.""" + project, package = test_package + + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/version:99.0.0" + ) + assert response.status_code == 404 + + @pytest.mark.integration + def test_version_resolution_priority(self, integration_client, test_package): + """Test that version: prefix explicitly resolves to version, not tag.""" + project, package = test_package + version_content = b"this is the version content" + tag_content = b"this is the tag content" + + # Create a version 6.0.0 + files1 = {"file": ("app-v.tar.gz", io.BytesIO(version_content), "application/octet-stream")} + integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files1, + data={"version": "6.0.0"}, + ) + + # Create a tag named "6.0.0" pointing to different content + files2 = {"file": ("app-t.tar.gz", io.BytesIO(tag_content), "application/octet-stream")} + integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files2, + data={"tag": "6.0.0"}, + ) + + # Download with version: prefix should get version content + response = integration_client.get( + f"/api/v1/project/{project}/{package}/+/version:6.0.0", + params={"mode": "proxy"}, + ) + assert response.status_code == 200 + assert response.content == version_content + + # Download with tag: prefix should get tag content + response2 = integration_client.get( + 
f"/api/v1/project/{project}/{package}/+/tag:6.0.0", + params={"mode": "proxy"}, + ) + assert response2.status_code == 200 + assert response2.content == tag_content + + +class TestVersionDeletion: + """Tests for deleting versions.""" + + @pytest.mark.integration + def test_delete_version(self, integration_client, test_package): + """Test deleting a version.""" + project, package = test_package + content = b"delete version test" + + # Create version + files = {"file": ("app.tar.gz", io.BytesIO(content), "application/octet-stream")} + integration_client.post( + f"/api/v1/project/{project}/{package}/upload", + files=files, + data={"version": "7.0.0"}, + ) + + # Verify version exists + response = integration_client.get( + f"/api/v1/project/{project}/{package}/versions/7.0.0" + ) + assert response.status_code == 200 + + # Delete version - returns 204 No Content on success + delete_response = integration_client.delete( + f"/api/v1/project/{project}/{package}/versions/7.0.0" + ) + assert delete_response.status_code == 204 + + # Verify version no longer exists + response2 = integration_client.get( + f"/api/v1/project/{project}/{package}/versions/7.0.0" + ) + assert response2.status_code == 404 + + @pytest.mark.integration + def test_delete_nonexistent_version_returns_404(self, integration_client, test_package): + """Test deleting nonexistent version returns 404.""" + project, package = test_package + + response = integration_client.delete( + f"/api/v1/project/{project}/{package}/versions/99.0.0" + ) + assert response.status_code == 404 diff --git a/docs/integrity-verification.md b/docs/integrity-verification.md new file mode 100644 index 0000000..ced5b3d --- /dev/null +++ b/docs/integrity-verification.md @@ -0,0 +1,294 @@ +# Integrity Verification + +Orchard uses content-addressable storage with SHA256 hashing to ensure artifact integrity. This document describes how integrity verification works and how to use it. + +## How It Works + +### Content-Addressable Storage + +Orchard stores artifacts using their SHA256 hash as the unique identifier. This provides several benefits: + +1. **Automatic deduplication**: Identical content is stored only once +2. **Built-in integrity**: The artifact ID *is* the content hash +3. **Tamper detection**: Any modification changes the hash, making corruption detectable + +When you upload a file: +1. Orchard computes the SHA256 hash of the content +2. The hash becomes the artifact ID (64-character hex string) +3. The file is stored in S3 at `fruits/{hash[0:2]}/{hash[2:4]}/{hash}` +4. The hash and metadata are recorded in the database + +### Hash Format + +- Algorithm: SHA256 +- Format: 64-character lowercase hexadecimal string +- Example: `dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f` + +## Client-Side Verification + +### Before Upload + +Compute the hash locally before uploading to verify the server received your content correctly: + +```python +import hashlib + +def compute_sha256(content: bytes) -> str: + return hashlib.sha256(content).hexdigest() + +# Compute hash before upload +content = open("myfile.tar.gz", "rb").read() +local_hash = compute_sha256(content) + +# Upload the file +response = requests.post( + f"{base_url}/api/v1/project/{project}/{package}/upload", + files={"file": ("myfile.tar.gz", content)}, +) +result = response.json() + +# Verify server computed the same hash +assert result["artifact_id"] == local_hash, "Hash mismatch!" +``` + +### Providing Expected Hash on Upload + +You can provide the expected hash in the upload request. 
The server will reject the upload if the computed hash doesn't match: + +```python +response = requests.post( + f"{base_url}/api/v1/project/{project}/{package}/upload", + files={"file": ("myfile.tar.gz", content)}, + headers={"X-Checksum-SHA256": local_hash}, +) + +# Returns 422 if hash doesn't match +if response.status_code == 422: + print("Checksum mismatch - upload rejected") +``` + +### After Download + +Verify downloaded content matches the expected hash using response headers: + +```python +response = requests.get( + f"{base_url}/api/v1/project/{project}/{package}/+/{tag}", + params={"mode": "proxy"}, +) + +# Get expected hash from header +expected_hash = response.headers.get("X-Checksum-SHA256") + +# Compute hash of downloaded content +actual_hash = compute_sha256(response.content) + +# Verify +if actual_hash != expected_hash: + raise Exception(f"Integrity check failed! Expected {expected_hash}, got {actual_hash}") +``` + +### Response Headers for Verification + +Download responses include multiple headers for verification: + +| Header | Format | Description | +|--------|--------|-------------| +| `X-Checksum-SHA256` | Hex string | SHA256 hash (64 chars) | +| `ETag` | `"<sha256-hex>"` | SHA256 hash in quotes | +| `Digest` | `sha-256=<base64>` | RFC 3230 format (base64-encoded) | +| `Content-Length` | Integer | File size in bytes | + +### Server-Side Verification on Download + +Request server-side verification during download: + +```bash +# Pre-verification: Server verifies before streaming (returns 500 if corrupt) +curl "${base_url}/api/v1/project/${project}/${package}/+/${tag}?mode=proxy&verify=true&verify_mode=pre" + +# Stream verification: Server verifies while streaming (logs error if corrupt) +curl "${base_url}/api/v1/project/${project}/${package}/+/${tag}?mode=proxy&verify=true&verify_mode=stream" +``` + +The `X-Verified` header indicates whether server-side verification was performed: +- `X-Verified: true` - Content was verified by the server + +## Server-Side Consistency Check + +### Consistency Check Endpoint + +Administrators can run a consistency check to verify all stored artifacts: + +```bash +curl "${base_url}/api/v1/admin/consistency-check" +``` + +Response: +```json +{ + "total_artifacts_checked": 1234, + "healthy": true, + "orphaned_s3_objects": 0, + "missing_s3_objects": 0, + "size_mismatches": 0, + "orphaned_s3_keys": [], + "missing_s3_keys": [], + "size_mismatch_artifacts": [] +} +``` + +### What the Check Verifies + +1. **Missing S3 objects**: Database records with no corresponding S3 object +2. **Orphaned S3 objects**: S3 objects with no database record +3. **Size mismatches**: S3 object size doesn't match database record + +### Running Consistency Checks + +**Manual check:** +```bash +# Check all artifacts +curl "${base_url}/api/v1/admin/consistency-check" + +# Limit results (for large deployments) +curl "${base_url}/api/v1/admin/consistency-check?limit=100" +``` + +**Scheduled checks (recommended):** + +Set up a cron job or Kubernetes CronJob to run periodic checks: + +```yaml +# Kubernetes CronJob example +apiVersion: batch/v1 +kind: CronJob +metadata: + name: orchard-consistency-check +spec: + schedule: "0 2 * * *" # Daily at 2 AM + jobTemplate: + spec: + template: + spec: + containers: + - name: check + image: curlimages/curl + command: + - /bin/sh + - -c + - | + response=$(curl -s "${ORCHARD_URL}/api/v1/admin/consistency-check") + # The curlimages/curl image ships busybox grep but not jq, so match the JSON directly + if ! echo "$response" | grep -q '"healthy": *true'; then + echo "ALERT: Consistency check failed!"
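+ # Surface the full consistency report in the job log before failing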
+ echo "$response" + exit 1 + fi + echo "Consistency check passed" + restartPolicy: OnFailure +``` + +## Recovery Procedures + +### Corrupted Artifact (Size Mismatch) + +If the consistency check reports size mismatches: + +1. **Identify affected artifacts:** + ```bash + curl "${base_url}/api/v1/admin/consistency-check" | jq '.size_mismatch_artifacts' + ``` + +2. **Check if artifact can be re-uploaded:** + - If the original content is available, delete the corrupted artifact and re-upload + - The same content will produce the same artifact ID + +3. **If original content is lost:** + - The artifact data is corrupted and cannot be recovered + - Delete the artifact record and notify affected users + - Consider restoring from backup if available + +### Missing S3 Object + +If database records exist but S3 objects are missing: + +1. **Identify affected artifacts:** + ```bash + curl "${base_url}/api/v1/admin/consistency-check" | jq '.missing_s3_keys' + ``` + +2. **Check S3 bucket:** + - Verify the S3 bucket exists and is accessible + - Check S3 access logs for deletion events + - Check if objects were moved or lifecycle-deleted + +3. **Recovery options:** + - Restore from S3 versioning (if enabled) + - Restore from backup + - Re-upload original content (if available) + - Delete orphaned database records + +### Orphaned S3 Objects + +If S3 objects exist without database records: + +1. **Identify orphaned objects:** + ```bash + curl "${base_url}/api/v1/admin/consistency-check" | jq '.orphaned_s3_keys' + ``` + +2. **Investigate cause:** + - Upload interrupted before database commit? + - Database record deleted but S3 cleanup failed? + +3. **Resolution:** + - If content is needed, create database record manually + - If content is not needed, delete the S3 object to reclaim storage + +### Preventive Measures + +1. **Enable S3 versioning** to recover from accidental deletions +2. **Regular backups** of both database and S3 bucket +3. **Scheduled consistency checks** to detect issues early +4. **Monitoring and alerting** on consistency check failures +5. **Audit logging** to track all artifact operations + +## Verification in CI/CD + +### Verifying Artifacts in Pipelines + +```bash +#!/bin/bash +# Download and verify artifact in CI pipeline + +ARTIFACT_URL="${ORCHARD_URL}/api/v1/project/${PROJECT}/${PACKAGE}/+/${TAG}" + +# Download with verification headers +response=$(curl -s -D - "${ARTIFACT_URL}?mode=proxy" -o artifact.tar.gz) +expected_hash=$(echo "$response" | grep -i "X-Checksum-SHA256" | cut -d: -f2 | tr -d ' \r') + +# Compute actual hash +actual_hash=$(sha256sum artifact.tar.gz | cut -d' ' -f1) + +# Verify +if [ "$actual_hash" != "$expected_hash" ]; then + echo "ERROR: Integrity check failed!" + echo "Expected: $expected_hash" + echo "Actual: $actual_hash" + exit 1 +fi + +echo "Integrity verified: $actual_hash" +``` + +### Using Server-Side Verification + +For critical deployments, use server-side pre-verification: + +```bash +# Server verifies before streaming - returns 500 if corrupt +curl -f "${ARTIFACT_URL}?mode=proxy&verify=true&verify_mode=pre" -o artifact.tar.gz +``` + +This ensures the artifact is verified before any bytes are streamed to your pipeline. 
diff --git a/helm/orchard/templates/deployment.yaml b/helm/orchard/templates/deployment.yaml index 1353547..9c6acfc 100644 --- a/helm/orchard/templates/deployment.yaml +++ b/helm/orchard/templates/deployment.yaml @@ -110,6 +110,12 @@ spec: value: {{ .Values.orchard.download.mode | quote }} - name: ORCHARD_PRESIGNED_URL_EXPIRY value: {{ .Values.orchard.download.presignedUrlExpiry | quote }} + {{- if .Values.orchard.rateLimit }} + {{- if .Values.orchard.rateLimit.login }} + - name: ORCHARD_LOGIN_RATE_LIMIT + value: {{ .Values.orchard.rateLimit.login | quote }} + {{- end }} + {{- end }} livenessProbe: {{- toYaml .Values.livenessProbe | nindent 12 }} readinessProbe: diff --git a/helm/orchard/values-dev.yaml b/helm/orchard/values-dev.yaml index 2b461df..2b11bbf 100644 --- a/helm/orchard/values-dev.yaml +++ b/helm/orchard/values-dev.yaml @@ -42,6 +42,7 @@ ingress: className: "nginx" annotations: cert-manager.io/cluster-issuer: "letsencrypt" + nginx.ingress.kubernetes.io/proxy-body-size: "0" # Disable body size limit for uploads hosts: - host: orchard-dev.common.global.bsf.tools # Overridden by CI paths: @@ -113,6 +114,10 @@ orchard: mode: "presigned" presignedUrlExpiry: 3600 + # Relaxed rate limits for dev/feature environments (allows integration tests to run) + rateLimit: + login: "1000/minute" # Default is 5/minute, relaxed for CI integration tests + # PostgreSQL - ephemeral, no persistence postgresql: enabled: true diff --git a/helm/orchard/values-stage.yaml b/helm/orchard/values-stage.yaml index c702bcb..5ac66bb 100644 --- a/helm/orchard/values-stage.yaml +++ b/helm/orchard/values-stage.yaml @@ -41,6 +41,7 @@ ingress: className: "nginx" annotations: cert-manager.io/cluster-issuer: "letsencrypt" + nginx.ingress.kubernetes.io/proxy-body-size: "0" # Disable body size limit for uploads hosts: - host: orchard-stage.common.global.bsf.tools paths: @@ -120,6 +121,10 @@ orchard: mode: "presigned" # presigned, redirect, or proxy presignedUrlExpiry: 3600 # Presigned URL expiry in seconds + # Relaxed rate limits for stage (allows CI integration tests to run) + rateLimit: + login: "1000/minute" # Default is 5/minute, relaxed for CI integration tests + # PostgreSQL subchart configuration postgresql: enabled: true