# backend/tests/integration/test_large_uploads.py
"""
|
||||
Integration tests for large file upload functionality.
|
||||
|
||||
Tests cover:
|
||||
- Large file uploads (100MB, 1GB)
|
||||
- Multipart upload behavior
|
||||
- Upload metrics (duration, throughput)
|
||||
- Memory efficiency during uploads
|
||||
- Upload progress tracking
|
||||
|
||||
Note: Large tests are marked with @pytest.mark.slow and will be skipped
|
||||
by default. Run with `pytest --run-slow` to include them.
|
||||
"""
|
||||
|
||||
import os
|
||||
import pytest
|
||||
import io
|
||||
import time
|
||||
from tests.factories import (
|
||||
compute_sha256,
|
||||
upload_test_file,
|
||||
s3_object_exists,
|
||||
)
|
||||
from tests.conftest import (
|
||||
SIZE_1KB,
|
||||
SIZE_100KB,
|
||||
SIZE_1MB,
|
||||
SIZE_10MB,
|
||||
SIZE_100MB,
|
||||
SIZE_1GB,
|
||||
)
|
||||
|
||||
|
||||
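

# Hypothetical helper, not part of the original suite: several tests below
# repeat the same API-key bootstrap against /api/v1/auth/keys. A module-level
# helper along these lines could consolidate that duplication. Sketch only,
# assuming the endpoint behavior shown in the tests below.
def _create_api_key(client, name):
    """Create an API key via the auth endpoint and return the bearer token."""
    response = client.post("/api/v1/auth/keys", json={"name": name})
    assert response.status_code == 200
    return response.json()["key"]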


class TestUploadMetrics:
    """Tests for upload duration and throughput metrics."""

    @pytest.mark.integration
    def test_upload_response_includes_duration_ms(self, integration_client, test_package):
        """Test upload response includes duration_ms field."""
        project, package = test_package
        content = b"duration test content"

        result = upload_test_file(
            integration_client, project, package, content, tag="duration-test"
        )

        assert "duration_ms" in result
        assert result["duration_ms"] is not None
        assert result["duration_ms"] >= 0

    @pytest.mark.integration
    def test_upload_response_includes_throughput(self, integration_client, test_package):
        """Test upload response includes throughput_mbps field."""
        project, package = test_package
        content = b"throughput test content"

        result = upload_test_file(
            integration_client, project, package, content, tag="throughput-test"
        )

        # For small files throughput may be very high or None, so only
        # verify the field exists.
        assert "throughput_mbps" in result

    @pytest.mark.integration
    def test_upload_duration_reasonable(
        self, integration_client, test_package, sized_content
    ):
        """Test upload duration is reasonable for the file size."""
        project, package = test_package
        content, _ = sized_content(SIZE_1MB, seed=100)

        start = time.time()
        result = upload_test_file(
            integration_client, project, package, content, tag="duration-check"
        )
        actual_duration = (time.time() - start) * 1000  # ms

        # The reported duration should be close to the measured wall-clock
        # time; allow 1s of variance for network overhead.
        assert result["duration_ms"] is not None
        assert result["duration_ms"] <= actual_duration + 1000


class TestLargeFileUploads:
    """Tests for large file uploads using multipart."""

    @pytest.mark.integration
    def test_upload_10mb_file(self, integration_client, test_package, sized_content):
        """Test uploading a 10MB file."""
        project, package = test_package
        content, expected_hash = sized_content(SIZE_10MB, seed=200)

        result = upload_test_file(
            integration_client, project, package, content, tag="large-10mb"
        )

        assert result["artifact_id"] == expected_hash
        assert result["size"] == SIZE_10MB
        assert result["duration_ms"] is not None
        assert result["throughput_mbps"] is not None

    @pytest.mark.integration
    @pytest.mark.slow
    @pytest.mark.requires_direct_s3
    def test_upload_100mb_file(self, integration_client, test_package, sized_content):
        """Test uploading a 100MB file (triggers multipart upload)."""
        project, package = test_package
        content, expected_hash = sized_content(SIZE_100MB, seed=300)

        result = upload_test_file(
            integration_client, project, package, content, tag="large-100mb"
        )

        assert result["artifact_id"] == expected_hash
        assert result["size"] == SIZE_100MB
        # Verify the object actually landed in S3.
        assert s3_object_exists(expected_hash)

    @pytest.mark.integration
    @pytest.mark.slow
    @pytest.mark.large
    def test_upload_1gb_file(self, integration_client, test_package, sized_content):
        """Test uploading a 1GB file."""
        project, package = test_package
        content, expected_hash = sized_content(SIZE_1GB, seed=400)

        result = upload_test_file(
            integration_client, project, package, content, tag="large-1gb"
        )

        assert result["artifact_id"] == expected_hash
        assert result["size"] == SIZE_1GB
        # Should have measurable throughput.
        assert result["throughput_mbps"] is not None
        assert result["throughput_mbps"] > 0

    @pytest.mark.integration
    def test_large_file_deduplication(
        self, integration_client, test_package, sized_content, unique_test_id
    ):
        """Test deduplication works for large files."""
        project, package = test_package
        # Use unique_test_id to ensure unique content per test run.
        seed = hash(unique_test_id) % 10000
        content, expected_hash = sized_content(SIZE_10MB, seed=seed)

        # First upload; this may already be deduplicated if a previous run
        # uploaded the same content, so don't assert on its dedup flag.
        result1 = upload_test_file(
            integration_client, project, package, content, tag=f"dedup-{unique_test_id}-1"
        )
        assert result1["artifact_id"] == expected_hash

        # Second upload of the same content MUST be deduplicated.
        result2 = upload_test_file(
            integration_client, project, package, content, tag=f"dedup-{unique_test_id}-2"
        )
        assert result2["artifact_id"] == expected_hash
        assert result2["deduplicated"] is True


class TestUploadProgress:
    """Tests for upload progress tracking endpoint."""

    @pytest.mark.integration
    def test_progress_endpoint_returns_not_found_for_invalid_id(
        self, integration_client, test_package
    ):
        """Test progress endpoint returns not_found status for invalid upload ID."""
        project, package = test_package

        response = integration_client.get(
            f"/api/v1/project/{project}/{package}/upload/invalid-upload-id/progress"
        )

        assert response.status_code == 200
        data = response.json()
        assert data["status"] == "not_found"
        assert data["upload_id"] == "invalid-upload-id"

    @pytest.mark.integration
    def test_progress_endpoint_requires_valid_project(
        self, integration_client, unique_test_id
    ):
        """Test progress endpoint validates project exists."""
        response = integration_client.get(
            f"/api/v1/project/nonexistent-{unique_test_id}/pkg/upload/upload-id/progress"
        )

        assert response.status_code == 404

    @pytest.mark.integration
    def test_progress_endpoint_requires_valid_package(
        self, integration_client, test_project, unique_test_id
    ):
        """Test progress endpoint validates package exists."""
        response = integration_client.get(
            f"/api/v1/project/{test_project}/nonexistent-{unique_test_id}/upload/upload-id/progress"
        )

        assert response.status_code == 404


class TestResumableUploadProgress:
    """Tests for progress tracking during resumable uploads."""

    @pytest.mark.integration
    def test_resumable_upload_init_and_progress(
        self, integration_client, test_package, sized_content
    ):
        """Test initializing a resumable upload and checking progress."""
        project, package = test_package
        content, expected_hash = sized_content(SIZE_100KB, seed=600)

        # Get an API key for auth.
        api_key_response = integration_client.post(
            "/api/v1/auth/keys",
            json={"name": "progress-test-key"},
        )
        assert api_key_response.status_code == 200
        api_key = api_key_response.json()["key"]

        # Initialize the resumable upload.
        init_response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload/init",
            json={
                "expected_hash": expected_hash,
                "filename": "progress-test.bin",
                "size": SIZE_100KB,
            },
            headers={"Authorization": f"Bearer {api_key}"},
        )
        assert init_response.status_code == 200
        upload_id = init_response.json().get("upload_id")

        # upload_id may be absent, e.g. if the server short-circuits
        # because the content already exists.
        if upload_id:
            # Check initial progress.
            progress_response = integration_client.get(
                f"/api/v1/project/{project}/{package}/upload/{upload_id}/progress",
                headers={"Authorization": f"Bearer {api_key}"},
            )
            assert progress_response.status_code == 200
            progress = progress_response.json()
            assert progress["status"] == "in_progress"
            assert progress["bytes_uploaded"] == 0
            assert progress["bytes_total"] == SIZE_100KB

            # Abort to clean up.
            integration_client.delete(
                f"/api/v1/project/{project}/{package}/upload/{upload_id}",
                headers={"Authorization": f"Bearer {api_key}"},
            )


class TestUploadSizeLimits:
    """Tests for upload size limit enforcement."""

    @pytest.mark.integration
    def test_empty_file_rejected(self, integration_client, test_package):
        """Test empty files are rejected."""
        project, package = test_package

        files = {"file": ("empty.txt", io.BytesIO(b""), "application/octet-stream")}
        response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload",
            files=files,
        )

        assert response.status_code in [400, 422]

    @pytest.mark.integration
    def test_minimum_size_accepted(self, integration_client, test_package):
        """Test a 1-byte file is accepted."""
        project, package = test_package
        content = b"X"

        result = upload_test_file(
            integration_client, project, package, content, tag="min-size"
        )

        assert result["size"] == 1

    @pytest.mark.integration
    def test_content_length_header_used_in_response(self, integration_client, test_package):
        """Test that the upload response size matches the uploaded content length."""
        project, package = test_package
        content = b"content length verification test"

        result = upload_test_file(
            integration_client, project, package, content, tag="content-length-test"
        )

        # Size in the response should match the actual content length.
        assert result["size"] == len(content)


class TestUploadErrorHandling:
    """Tests for upload error handling."""

    @pytest.mark.integration
    def test_upload_to_nonexistent_project_returns_404(
        self, integration_client, unique_test_id
    ):
        """Test upload to a nonexistent project returns 404."""
        content = b"test content"
        files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")}

        response = integration_client.post(
            f"/api/v1/project/nonexistent-{unique_test_id}/pkg/upload",
            files=files,
        )

        assert response.status_code == 404

    @pytest.mark.integration
    def test_upload_to_nonexistent_package_returns_404(
        self, integration_client, test_project, unique_test_id
    ):
        """Test upload to a nonexistent package returns 404."""
        content = b"test content"
        files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")}

        response = integration_client.post(
            f"/api/v1/project/{test_project}/nonexistent-{unique_test_id}/upload",
            files=files,
        )

        assert response.status_code == 404

    @pytest.mark.integration
    def test_upload_without_file_returns_422(self, integration_client, test_package):
        """Test upload without a file field returns 422."""
        project, package = test_package

        response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload",
            data={"tag": "no-file"},
        )

        assert response.status_code == 422

    @pytest.mark.integration
    def test_upload_with_invalid_checksum_rejected(
        self, integration_client, test_package
    ):
        """Test upload with a malformed checksum header is rejected."""
        project, package = test_package
        content = b"checksum test"

        files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")}
        response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload",
            files=files,
            headers={"X-Checksum-SHA256": "invalid-checksum"},
        )

        assert response.status_code == 400

    @pytest.mark.integration
    def test_upload_with_mismatched_checksum_rejected(
        self, integration_client, test_package
    ):
        """Test upload with a wrong (but well-formed) checksum is rejected."""
        project, package = test_package
        content = b"mismatch test"
        wrong_hash = "0" * 64  # valid hex length, guaranteed not to match

        files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")}
        response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload",
            files=files,
            headers={"X-Checksum-SHA256": wrong_hash},
        )

        assert response.status_code == 422
        assert "verification failed" in response.json().get("detail", "").lower()


class TestResumableUploadCancellation:
    """Tests for resumable upload cancellation."""

    @pytest.mark.integration
    def test_abort_resumable_upload(self, integration_client, test_package, sized_content):
        """Test aborting a resumable upload cleans up properly."""
        project, package = test_package
        content, expected_hash = sized_content(SIZE_100KB, seed=700)

        # Get an API key for auth.
        api_key_response = integration_client.post(
            "/api/v1/auth/keys",
            json={"name": "abort-test-key"},
        )
        assert api_key_response.status_code == 200
        api_key = api_key_response.json()["key"]

        # Initialize the resumable upload.
        init_response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload/init",
            json={
                "expected_hash": expected_hash,
                "filename": "abort-test.bin",
                "size": SIZE_100KB,
            },
            headers={"Authorization": f"Bearer {api_key}"},
        )
        assert init_response.status_code == 200
        upload_id = init_response.json().get("upload_id")

        if upload_id:
            # Abort the upload without uploading any parts.
            abort_response = integration_client.delete(
                f"/api/v1/project/{project}/{package}/upload/{upload_id}",
                headers={"Authorization": f"Bearer {api_key}"},
            )
            assert abort_response.status_code in [200, 204]

            # Verify progress shows not_found after the abort.
            progress_response = integration_client.get(
                f"/api/v1/project/{project}/{package}/upload/{upload_id}/progress",
                headers={"Authorization": f"Bearer {api_key}"},
            )
            assert progress_response.status_code == 200
            assert progress_response.json()["status"] == "not_found"

    @pytest.mark.integration
    def test_abort_nonexistent_upload(self, integration_client, test_package):
        """Test aborting a nonexistent upload returns an appropriate status."""
        project, package = test_package

        # Get an API key for auth.
        api_key_response = integration_client.post(
            "/api/v1/auth/keys",
            json={"name": "abort-nonexistent-key"},
        )
        assert api_key_response.status_code == 200
        api_key = api_key_response.json()["key"]

        response = integration_client.delete(
            f"/api/v1/project/{project}/{package}/upload/nonexistent-upload-id",
            headers={"Authorization": f"Bearer {api_key}"},
        )

        # Should return 404, or 200/204 if deletes are idempotent.
        assert response.status_code in [200, 204, 404]
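
    # Hedged sketch, not in the original suite: aborting the same upload
    # twice should degrade to the nonexistent-upload case above, assuming
    # deletes are idempotent as the previous test allows.
    @pytest.mark.integration
    def test_abort_twice(self, integration_client, test_package, sized_content):
        """Test aborting an already-aborted upload behaves like a missing one."""
        project, package = test_package
        content, expected_hash = sized_content(SIZE_100KB, seed=710)

        api_key_response = integration_client.post(
            "/api/v1/auth/keys",
            json={"name": "abort-twice-key"},
        )
        assert api_key_response.status_code == 200
        api_key = api_key_response.json()["key"]

        init_response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload/init",
            json={
                "expected_hash": expected_hash,
                "filename": "abort-twice.bin",
                "size": SIZE_100KB,
            },
            headers={"Authorization": f"Bearer {api_key}"},
        )
        assert init_response.status_code == 200
        upload_id = init_response.json().get("upload_id")

        if upload_id:
            auth = {"Authorization": f"Bearer {api_key}"}
            first = integration_client.delete(
                f"/api/v1/project/{project}/{package}/upload/{upload_id}", headers=auth
            )
            assert first.status_code in [200, 204]
            second = integration_client.delete(
                f"/api/v1/project/{project}/{package}/upload/{upload_id}", headers=auth
            )
            assert second.status_code in [200, 204, 404]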


class TestUploadTimeout:
    """Tests for upload timeout handling."""

    @pytest.mark.integration
    def test_upload_with_short_timeout_succeeds_for_small_file(
        self, integration_client, test_package
    ):
        """Test a small file upload succeeds within a reasonable timeout."""
        project, package = test_package
        content = b"small timeout test"

        # The httpx client should handle this well within its timeout.
        result = upload_test_file(
            integration_client, project, package, content, tag="timeout-small"
        )

        assert result["artifact_id"] is not None

    @pytest.mark.integration
    def test_upload_response_duration_under_timeout(
        self, integration_client, test_package, sized_content
    ):
        """Test upload completes within a reasonable time."""
        project, package = test_package
        content, _ = sized_content(SIZE_1MB, seed=800)

        start = time.time()
        result = upload_test_file(
            integration_client, project, package, content, tag="timeout-check"
        )
        duration = time.time() - start

        # 1MB should upload in well under 60 seconds locally.
        assert duration < 60
        assert result["artifact_id"] is not None


class TestConcurrentUploads:
    """Tests for concurrent upload handling."""

    @pytest.mark.integration
    def test_concurrent_different_files(
        self, integration_client, test_package, sized_content
    ):
        """Test concurrent uploads of different files succeed."""
        from concurrent.futures import ThreadPoolExecutor, as_completed

        from httpx import Client

        project, package = test_package

        # Get an API key for auth.
        api_key_response = integration_client.post(
            "/api/v1/auth/keys",
            json={"name": "concurrent-diff-key"},
        )
        assert api_key_response.status_code == 200
        api_key = api_key_response.json()["key"]

        num_uploads = 3
        results = []
        errors = []

        def upload_unique_file(idx):
            try:
                content, expected_hash = sized_content(SIZE_100KB, seed=900 + idx)

                # Each worker uses its own short-lived client.
                base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080")
                with Client(base_url=base_url, timeout=30.0) as client:
                    files = {
                        "file": (
                            f"concurrent-{idx}.bin",
                            io.BytesIO(content),
                            "application/octet-stream",
                        )
                    }
                    response = client.post(
                        f"/api/v1/project/{project}/{package}/upload",
                        files=files,
                        data={"tag": f"concurrent-diff-{idx}"},
                        headers={"Authorization": f"Bearer {api_key}"},
                    )
                    if response.status_code == 200:
                        results.append((idx, response.json(), expected_hash))
                    else:
                        errors.append(f"Upload {idx}: {response.status_code} - {response.text}")
            except Exception as e:
                errors.append(f"Upload {idx}: {e}")

        with ThreadPoolExecutor(max_workers=num_uploads) as executor:
            futures = [executor.submit(upload_unique_file, i) for i in range(num_uploads)]
            # Wait for completion; worker errors are collected in `errors`.
            for future in as_completed(futures):
                future.result()

        assert len(errors) == 0, f"Concurrent upload errors: {errors}"
        assert len(results) == num_uploads

        # Each upload should have a unique artifact ID...
        artifact_ids = {r[1]["artifact_id"] for r in results}
        assert len(artifact_ids) == num_uploads

        # ...and each should match its expected hash.
        for idx, result, expected_hash in results:
            assert result["artifact_id"] == expected_hash