- Remove Tag/TagHistory model tests from unit tests
- Update CacheSettings tests to remove allow_public_internet field
- Replace tag= with version= in upload_test_file calls
- Update test assertions to use versions instead of tags
- Remove tests for tag: prefix downloads (now uses version:)
- Update dependency tests for version-only schema

"""
|
|
Integration tests for large file upload functionality.
|
|
|
|
Tests cover:
|
|
- Large file uploads (100MB, 1GB)
|
|
- Multipart upload behavior
|
|
- Upload metrics (duration, throughput)
|
|
- Memory efficiency during uploads
|
|
- Upload progress tracking
|
|
|
|
Note: Large tests are marked with @pytest.mark.slow and will be skipped
|
|
by default. Run with `pytest --run-slow` to include them.
|
|
"""
|
|
|
|
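# How the `--run-slow` gate mentioned above is typically wired up in pytest.
# This is a minimal sketch of the conftest.py hooks (an illustration under
# assumptions, not necessarily this repo's actual conftest):
#
#     def pytest_addoption(parser):
#         parser.addoption("--run-slow", action="store_true", default=False,
#                          help="also run tests marked @pytest.mark.slow")
#
#     def pytest_collection_modifyitems(config, items):
#         if config.getoption("--run-slow"):
#             return
#         skip_slow = pytest.mark.skip(reason="needs --run-slow")
#         for item in items:
#             if "slow" in item.keywords:
#                 item.add_marker(skip_slow)
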
import io
import os
import time

import pytest

from tests.factories import (
    compute_sha256,
    s3_object_exists,
    upload_test_file,
)
from tests.conftest import (
    SIZE_1KB,
    SIZE_100KB,
    SIZE_1MB,
    SIZE_10MB,
    SIZE_100MB,
    SIZE_1GB,
)

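# Assumed contract of the upload_test_file() helper, inferred from the
# assertions in this module (the helper lives in tests.factories; its exact
# signature is an assumption): it POSTs the given bytes to
# /api/v1/project/{project}/{package}/upload with the given version= value
# and returns the parsed JSON response, which these tests expect to contain
# "artifact_id" (the SHA-256 of the content), "size", "deduplicated",
# "duration_ms", and "throughput_mbps".
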
class TestUploadMetrics:
    """Tests for upload duration and throughput metrics."""

    @pytest.mark.integration
    def test_upload_response_includes_duration_ms(self, integration_client, test_package):
        """Test upload response includes duration_ms field."""
        project, package = test_package
        content = b"duration test content"

        result = upload_test_file(
            integration_client, project, package, content, version="duration-test"
        )

        assert "duration_ms" in result
        assert result["duration_ms"] is not None
        assert result["duration_ms"] >= 0

    @pytest.mark.integration
    def test_upload_response_includes_throughput(self, integration_client, test_package):
        """Test upload response includes throughput_mbps field."""
        project, package = test_package
        content = b"throughput test content"

        result = upload_test_file(
            integration_client, project, package, content, version="throughput-test"
        )

        assert "throughput_mbps" in result
        # For small files throughput may be very high or None;
        # just verify the field exists.

    @pytest.mark.integration
    def test_upload_duration_reasonable(
        self, integration_client, test_package, sized_content
    ):
        """Test upload duration is reasonable for file size."""
        project, package = test_package
        content, _ = sized_content(SIZE_1MB, seed=100)

        start = time.time()
        result = upload_test_file(
            integration_client, project, package, content, version="duration-check"
        )
        actual_duration = (time.time() - start) * 1000  # ms

        # Reported duration should be close to actual
        assert result["duration_ms"] is not None
        # Allow some variance (network overhead)
        assert result["duration_ms"] <= actual_duration + 1000  # Within 1s

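# For reference, the duration/throughput fields asserted above are presumably
# derived along these lines (a sketch of the assumed server-side calculation,
# not its actual code; whether "mbps" means megabits or megabytes per second
# is not pinned down by these tests):
#
#     elapsed_s = duration_ms / 1000
#     throughput_mbps = (size_bytes * 8 / 1_000_000) / elapsed_s if elapsed_s else None
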
class TestLargeFileUploads:
    """Tests for large file uploads using multipart."""

    @pytest.mark.integration
    def test_upload_10mb_file(self, integration_client, test_package, sized_content):
        """Test uploading a 10MB file."""
        project, package = test_package
        content, expected_hash = sized_content(SIZE_10MB, seed=200)

        result = upload_test_file(
            integration_client, project, package, content, version="large-10mb"
        )

        assert result["artifact_id"] == expected_hash
        assert result["size"] == SIZE_10MB
        assert result["duration_ms"] is not None
        assert result["throughput_mbps"] is not None

    @pytest.mark.integration
    @pytest.mark.slow
    @pytest.mark.requires_direct_s3
    def test_upload_100mb_file(self, integration_client, test_package, sized_content):
        """Test uploading a 100MB file (triggers multipart upload)."""
        project, package = test_package
        content, expected_hash = sized_content(SIZE_100MB, seed=300)

        result = upload_test_file(
            integration_client, project, package, content, version="large-100mb"
        )

        assert result["artifact_id"] == expected_hash
        assert result["size"] == SIZE_100MB
        # Verify the S3 object exists
        assert s3_object_exists(expected_hash)

    @pytest.mark.integration
    @pytest.mark.slow
    @pytest.mark.large
    def test_upload_1gb_file(self, integration_client, test_package, sized_content):
        """Test uploading a 1GB file."""
        project, package = test_package
        content, expected_hash = sized_content(SIZE_1GB, seed=400)

        result = upload_test_file(
            integration_client, project, package, content, version="large-1gb"
        )

        assert result["artifact_id"] == expected_hash
        assert result["size"] == SIZE_1GB
        # Should have measurable throughput
        assert result["throughput_mbps"] is not None
        assert result["throughput_mbps"] > 0

    @pytest.mark.integration
    def test_large_file_deduplication(
        self, integration_client, test_package, sized_content, unique_test_id
    ):
        """Test deduplication works for large files."""
        project, package = test_package
        # Use unique_test_id to ensure unique content per test run
        seed = hash(unique_test_id) % 10000
        content, expected_hash = sized_content(SIZE_10MB, seed=seed)

        # First upload. Note: "deduplicated" may already be True here if a
        # previous test run uploaded the same content.
        result1 = upload_test_file(
            integration_client, project, package, content, version=f"dedup-{unique_test_id}-1"
        )
        assert result1["artifact_id"] == expected_hash

        # Second upload of the same content
        result2 = upload_test_file(
            integration_client, project, package, content, version=f"dedup-{unique_test_id}-2"
        )
        assert result2["artifact_id"] == expected_hash
        # Second upload MUST be deduplicated
        assert result2["deduplicated"] is True

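# The 100MB case above is expected to cross the server's multipart threshold.
# For illustration, with boto3 (an assumption about the backend; the actual
# client library and threshold are not visible from these tests) multipart
# behavior is driven by TransferConfig:
#
#     from boto3.s3.transfer import TransferConfig
#
#     config = TransferConfig(
#         multipart_threshold=8 * 1024 * 1024,  # boto3's default: 8 MiB
#         multipart_chunksize=8 * 1024 * 1024,
#     )
#     s3.upload_fileobj(fileobj, bucket, key, Config=config)
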
class TestUploadProgress:
    """Tests for upload progress tracking endpoint."""

    @pytest.mark.integration
    def test_progress_endpoint_returns_not_found_for_invalid_id(
        self, integration_client, test_package
    ):
        """Test progress endpoint returns not_found status for invalid upload ID."""
        project, package = test_package

        response = integration_client.get(
            f"/api/v1/project/{project}/{package}/upload/invalid-upload-id/progress"
        )

        assert response.status_code == 200
        data = response.json()
        assert data["status"] == "not_found"
        assert data["upload_id"] == "invalid-upload-id"

    @pytest.mark.integration
    def test_progress_endpoint_requires_valid_project(
        self, integration_client, unique_test_id
    ):
        """Test progress endpoint validates project exists."""
        response = integration_client.get(
            f"/api/v1/project/nonexistent-{unique_test_id}/pkg/upload/upload-id/progress"
        )

        assert response.status_code == 404

    @pytest.mark.integration
    def test_progress_endpoint_requires_valid_package(
        self, integration_client, test_project, unique_test_id
    ):
        """Test progress endpoint validates package exists."""
        response = integration_client.get(
            f"/api/v1/project/{test_project}/nonexistent-{unique_test_id}/upload/upload-id/progress"
        )

        assert response.status_code == 404

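# Progress payload shape as exercised by the assertions in this module
# (inferred from the tests themselves, not from an API spec):
#
#     {
#         "upload_id": "<id>",
#         "status": "in_progress" | "not_found",
#         "bytes_uploaded": 0,
#         "bytes_total": <total size in bytes>,
#     }
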
class TestResumableUploadProgress:
    """Tests for progress tracking during resumable uploads."""

    @pytest.mark.integration
    def test_resumable_upload_init_and_progress(
        self, integration_client, test_package, sized_content
    ):
        """Test initializing resumable upload and checking progress."""
        project, package = test_package
        _content, expected_hash = sized_content(SIZE_100KB, seed=600)

        # Get API key for auth
        api_key_response = integration_client.post(
            "/api/v1/auth/keys",
            json={"name": "progress-test-key"},
        )
        assert api_key_response.status_code == 200
        api_key = api_key_response.json()["key"]

        # Initialize resumable upload
        init_response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload/init",
            json={
                "expected_hash": expected_hash,
                "filename": "progress-test.bin",
                "size": SIZE_100KB,
            },
            headers={"Authorization": f"Bearer {api_key}"},
        )
        assert init_response.status_code == 200
        upload_id = init_response.json().get("upload_id")

        # upload_id may be absent (e.g. if the server short-circuits because
        # the content already exists), so only check progress when we got one.
        if upload_id:
            # Check initial progress
            progress_response = integration_client.get(
                f"/api/v1/project/{project}/{package}/upload/{upload_id}/progress",
                headers={"Authorization": f"Bearer {api_key}"},
            )
            assert progress_response.status_code == 200
            progress = progress_response.json()
            assert progress["status"] == "in_progress"
            assert progress["bytes_uploaded"] == 0
            assert progress["bytes_total"] == SIZE_100KB

            # Abort to clean up
            integration_client.delete(
                f"/api/v1/project/{project}/{package}/upload/{upload_id}",
                headers={"Authorization": f"Bearer {api_key}"},
            )

class TestUploadSizeLimits:
    """Tests for upload size limit enforcement."""

    @pytest.mark.integration
    def test_empty_file_rejected(self, integration_client, test_package):
        """Test empty files are rejected."""
        project, package = test_package

        files = {"file": ("empty.txt", io.BytesIO(b""), "application/octet-stream")}
        response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload",
            files=files,
        )

        assert response.status_code in [400, 422]

    @pytest.mark.integration
    def test_minimum_size_accepted(self, integration_client, test_package):
        """Test 1-byte file is accepted."""
        project, package = test_package
        content = b"X"

        result = upload_test_file(
            integration_client, project, package, content, version="min-size"
        )

        assert result["size"] == 1

    @pytest.mark.integration
    def test_content_length_header_used_in_response(self, integration_client, test_package):
        """Test that the upload response size matches the actual content length."""
        project, package = test_package
        content = b"content length verification test"

        result = upload_test_file(
            integration_client, project, package, content, version="content-length-test"
        )

        # Size in response should match actual content length
        assert result["size"] == len(content)

class TestUploadErrorHandling:
    """Tests for upload error handling."""

    @pytest.mark.integration
    def test_upload_to_nonexistent_project_returns_404(
        self, integration_client, unique_test_id
    ):
        """Test upload to nonexistent project returns 404."""
        content = b"test content"
        files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")}

        response = integration_client.post(
            f"/api/v1/project/nonexistent-{unique_test_id}/pkg/upload",
            files=files,
        )

        assert response.status_code == 404

    @pytest.mark.integration
    def test_upload_to_nonexistent_package_returns_404(
        self, integration_client, test_project, unique_test_id
    ):
        """Test upload to nonexistent package returns 404."""
        content = b"test content"
        files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")}

        response = integration_client.post(
            f"/api/v1/project/{test_project}/nonexistent-{unique_test_id}/upload",
            files=files,
        )

        assert response.status_code == 404

    @pytest.mark.integration
    def test_upload_without_file_returns_422(self, integration_client, test_package):
        """Test upload without file field returns 422."""
        project, package = test_package

        response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload",
            data={"version": "no-file"},
        )

        assert response.status_code == 422

    @pytest.mark.integration
    def test_upload_with_invalid_checksum_rejected(
        self, integration_client, test_package
    ):
        """Test upload with invalid checksum header format is rejected."""
        project, package = test_package
        content = b"checksum test"

        files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")}
        response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload",
            files=files,
            headers={"X-Checksum-SHA256": "invalid-checksum"},
        )

        assert response.status_code == 400

    @pytest.mark.integration
    def test_upload_with_mismatched_checksum_rejected(
        self, integration_client, test_package
    ):
        """Test upload with wrong checksum is rejected."""
        project, package = test_package
        content = b"mismatch test"
        wrong_hash = "0" * 64  # syntactically valid SHA-256, wrong value

        files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")}
        response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload",
            files=files,
            headers={"X-Checksum-SHA256": wrong_hash},
        )

        assert response.status_code == 422
        assert "verification failed" in response.json().get("detail", "").lower()

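# For contrast with the rejection cases above, a correct checksum upload
# would presumably look like this, using compute_sha256 imported from
# tests.factories (a sketch; the helper's exact signature is assumed):
#
#     content = b"checksum test"
#     files = {"file": ("test.bin", io.BytesIO(content), "application/octet-stream")}
#     response = integration_client.post(
#         f"/api/v1/project/{project}/{package}/upload",
#         files=files,
#         headers={"X-Checksum-SHA256": compute_sha256(content)},
#     )
#     assert response.status_code == 200
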
class TestResumableUploadCancellation:
    """Tests for resumable upload cancellation."""

    @pytest.mark.integration
    def test_abort_resumable_upload(self, integration_client, test_package, sized_content):
        """Test aborting a resumable upload cleans up properly."""
        project, package = test_package
        _content, expected_hash = sized_content(SIZE_100KB, seed=700)

        # Get API key for auth
        api_key_response = integration_client.post(
            "/api/v1/auth/keys",
            json={"name": "abort-test-key"},
        )
        assert api_key_response.status_code == 200
        api_key = api_key_response.json()["key"]

        # Initialize resumable upload
        init_response = integration_client.post(
            f"/api/v1/project/{project}/{package}/upload/init",
            json={
                "expected_hash": expected_hash,
                "filename": "abort-test.bin",
                "size": SIZE_100KB,
            },
            headers={"Authorization": f"Bearer {api_key}"},
        )
        assert init_response.status_code == 200
        upload_id = init_response.json().get("upload_id")

        if upload_id:
            # Abort the upload (without uploading any parts)
            abort_response = integration_client.delete(
                f"/api/v1/project/{project}/{package}/upload/{upload_id}",
                headers={"Authorization": f"Bearer {api_key}"},
            )
            assert abort_response.status_code in [200, 204]

            # Verify progress shows not_found after abort
            progress_response = integration_client.get(
                f"/api/v1/project/{project}/{package}/upload/{upload_id}/progress",
                headers={"Authorization": f"Bearer {api_key}"},
            )
            assert progress_response.status_code == 200
            assert progress_response.json()["status"] == "not_found"

    @pytest.mark.integration
    def test_abort_nonexistent_upload(self, integration_client, test_package):
        """Test aborting a nonexistent upload returns an appropriate response."""
        project, package = test_package

        # Get API key for auth
        api_key_response = integration_client.post(
            "/api/v1/auth/keys",
            json={"name": "abort-nonexistent-key"},
        )
        assert api_key_response.status_code == 200
        api_key = api_key_response.json()["key"]

        response = integration_client.delete(
            f"/api/v1/project/{project}/{package}/upload/nonexistent-upload-id",
            headers={"Authorization": f"Bearer {api_key}"},
        )

        # Should return 404, or 200/204 if the delete is idempotent
        assert response.status_code in [200, 204, 404]

class TestUploadTimeout:
    """Tests for upload timeout handling."""

    @pytest.mark.integration
    def test_upload_with_short_timeout_succeeds_for_small_file(
        self, integration_client, test_package
    ):
        """Test small file upload succeeds with reasonable timeout."""
        project, package = test_package
        content = b"small timeout test"

        # httpx client should handle this quickly
        result = upload_test_file(
            integration_client, project, package, content, version="timeout-small"
        )

        assert result["artifact_id"] is not None

    @pytest.mark.integration
    def test_upload_response_duration_under_timeout(
        self, integration_client, test_package, sized_content
    ):
        """Test upload completes within reasonable time."""
        project, package = test_package
        content, _ = sized_content(SIZE_1MB, seed=800)

        start = time.time()
        result = upload_test_file(
            integration_client, project, package, content, version="timeout-check"
        )
        duration = time.time() - start

        # 1MB should upload in well under 60 seconds on local
        assert duration < 60
        assert result["artifact_id"] is not None

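# The client-side timeout itself is configured on the httpx client. A sketch
# of such a configuration (the integration_client fixture's actual settings
# live elsewhere and are an assumption here):
#
#     import httpx
#
#     client = httpx.Client(
#         base_url="http://localhost:8080",
#         timeout=httpx.Timeout(60.0, connect=5.0),  # 60s overall, 5s connect
#     )
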
class TestConcurrentUploads:
    """Tests for concurrent upload handling."""

    @pytest.mark.integration
    def test_concurrent_different_files(
        self, integration_client, test_package, sized_content
    ):
        """Test concurrent uploads of different files succeed."""
        from concurrent.futures import ThreadPoolExecutor, as_completed

        project, package = test_package

        # Get API key for auth
        api_key_response = integration_client.post(
            "/api/v1/auth/keys",
            json={"name": "concurrent-diff-key"},
        )
        assert api_key_response.status_code == 200
        api_key = api_key_response.json()["key"]

        num_uploads = 3
        results = []
        errors = []

        def upload_unique_file(idx):
            try:
                from httpx import Client

                content, expected_hash = sized_content(SIZE_100KB, seed=900 + idx)

                # Each worker uses its own httpx client rather than sharing
                # the test fixture across threads.
                base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080")
                with Client(base_url=base_url, timeout=30.0) as client:
                    files = {
                        "file": (
                            f"concurrent-{idx}.bin",
                            io.BytesIO(content),
                            "application/octet-stream",
                        )
                    }
                    response = client.post(
                        f"/api/v1/project/{project}/{package}/upload",
                        files=files,
                        data={"version": f"concurrent-diff-{idx}"},
                        headers={"Authorization": f"Bearer {api_key}"},
                    )
                    if response.status_code == 200:
                        results.append((idx, response.json(), expected_hash))
                    else:
                        errors.append(f"Upload {idx}: {response.status_code} - {response.text}")
            except Exception as e:
                errors.append(f"Upload {idx}: {e}")

        with ThreadPoolExecutor(max_workers=num_uploads) as executor:
            futures = [executor.submit(upload_unique_file, i) for i in range(num_uploads)]
            for future in as_completed(futures):
                future.result()  # wait; exceptions are captured in `errors` above

        assert len(errors) == 0, f"Concurrent upload errors: {errors}"
        assert len(results) == num_uploads

        # Each upload should have a unique artifact ID
        artifact_ids = {r[1]["artifact_id"] for r in results}
        assert len(artifact_ids) == num_uploads

        # Each should match its expected hash
        for idx, result, expected_hash in results:
            assert result["artifact_id"] == expected_hash
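
# Typical invocations (the file path and marker registration are assumptions
# about the repo layout, not taken from this module):
#
#     pytest -m integration tests/integration/test_large_uploads.py
#     pytest -m integration --run-slow    # also run @pytest.mark.slow tests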