- Add test_integration_uploads.py with 12 tests for duplicate upload scenarios
- Add test_ref_count.py with 7 tests for ref_count management
- Fix ArtifactDetailResponse to include sha256 and checksum fields
- Fix health check SQL warning by wrapping the raw SQL in text() (see the sketch below)
- Update tests to use unique content per test run for idempotency
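For reference on the text() item above: passing a bare SQL string to `execute()` is deprecated in SQLAlchemy 1.4 and unsupported in 2.0, so the health-check query has to be declared explicitly. A minimal sketch of the pattern, assuming a SQLAlchemy session; the actual health-check endpoint is not shown here, so the function and parameter names are illustrative:

```python
from sqlalchemy import text


def database_is_healthy(session) -> bool:
    # A bare string ("SELECT 1") warns on SQLAlchemy 1.4 and is rejected on 2.0;
    # wrapping it in text() declares it as an executable statement.
    result = session.execute(text("SELECT 1"))
    return result.scalar() == 1
```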
test_integration_uploads.py (389 lines, 14 KiB, Python)
"""
|
|
Integration tests for duplicate uploads and storage verification.
|
|
|
|
These tests require the full stack to be running (docker-compose.local.yml).
|
|
|
|
Tests cover:
|
|
- Duplicate upload scenarios across packages and projects
|
|
- Storage verification (single S3 object, single artifact row)
|
|
- Upload table tracking
|
|
- Content integrity verification
|
|
- Concurrent upload handling
|
|
- Failure cleanup
|
|
"""
|
|
|
|
import pytest
|
|
import io
|
|
import threading
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from tests.conftest import (
|
|
compute_sha256,
|
|
upload_test_file,
|
|
)
|
|
|
|
|
|
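# Note on the helpers imported from tests.conftest (their implementations are not
# shown here; the descriptions below are assumptions based on how the tests use them):
#   - compute_sha256(data: bytes) -> str: presumably hashlib.sha256(data).hexdigest(),
#     the same value the API uses as the artifact_id.
#   - upload_test_file(client, project, package, content, tag=..., filename=...) -> dict:
#     presumably POSTs the content to /api/v1/project/{project}/{package}/upload
#     (as the concurrent test below does inline) and returns the parsed JSON response
#     containing artifact_id, ref_count, and deduplicated.
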
class TestDuplicateUploadScenarios:
    """Integration tests for duplicate upload behavior."""

    @pytest.mark.integration
    def test_same_file_twice_returns_same_artifact_id(
        self, integration_client, test_package
    ):
        """Test uploading same file twice returns same artifact_id."""
        project, package = test_package
        content = b"content uploaded twice for same artifact test"
        expected_hash = compute_sha256(content)

        # First upload
        result1 = upload_test_file(
            integration_client, project, package, content, tag="first"
        )
        assert result1["artifact_id"] == expected_hash

        # Second upload
        result2 = upload_test_file(
            integration_client, project, package, content, tag="second"
        )
        assert result2["artifact_id"] == expected_hash
        assert result1["artifact_id"] == result2["artifact_id"]

    @pytest.mark.integration
    def test_same_file_twice_increments_ref_count(
        self, integration_client, test_package
    ):
        """Test uploading same file twice increments ref_count to 2."""
        project, package = test_package
        content = b"content for ref count increment test"

        # First upload
        result1 = upload_test_file(
            integration_client, project, package, content, tag="v1"
        )
        assert result1["ref_count"] == 1

        # Second upload
        result2 = upload_test_file(
            integration_client, project, package, content, tag="v2"
        )
        assert result2["ref_count"] == 2

    @pytest.mark.integration
    def test_same_file_different_packages_shares_artifact(
        self, integration_client, test_project, unique_test_id
    ):
        """Test uploading same file to different packages shares artifact."""
        project = test_project
        content = f"content shared across packages {unique_test_id}".encode()
        expected_hash = compute_sha256(content)

        # Create two packages
        pkg1 = f"package-a-{unique_test_id}"
        pkg2 = f"package-b-{unique_test_id}"

        integration_client.post(
            f"/api/v1/project/{project}/packages",
            json={"name": pkg1, "description": "Package A"},
        )
        integration_client.post(
            f"/api/v1/project/{project}/packages",
            json={"name": pkg2, "description": "Package B"},
        )

        # Upload to first package
        result1 = upload_test_file(integration_client, project, pkg1, content, tag="v1")
        assert result1["artifact_id"] == expected_hash
        assert result1["deduplicated"] is False

        # Upload to second package
        result2 = upload_test_file(integration_client, project, pkg2, content, tag="v1")
        assert result2["artifact_id"] == expected_hash
        assert result2["deduplicated"] is True

    @pytest.mark.integration
    def test_same_file_different_projects_shares_artifact(
        self, integration_client, unique_test_id
    ):
        """Test uploading same file to different projects shares artifact."""
        content = f"content shared across projects {unique_test_id}".encode()
        expected_hash = compute_sha256(content)

        # Create two projects with packages
        proj1 = f"project-x-{unique_test_id}"
        proj2 = f"project-y-{unique_test_id}"
        pkg_name = "shared-pkg"

        try:
            # Create projects and packages
            integration_client.post(
                "/api/v1/projects",
                json={"name": proj1, "description": "Project X", "is_public": True},
            )
            integration_client.post(
                "/api/v1/projects",
                json={"name": proj2, "description": "Project Y", "is_public": True},
            )
            integration_client.post(
                f"/api/v1/project/{proj1}/packages",
                json={"name": pkg_name, "description": "Package"},
            )
            integration_client.post(
                f"/api/v1/project/{proj2}/packages",
                json={"name": pkg_name, "description": "Package"},
            )

            # Upload to first project
            result1 = upload_test_file(
                integration_client, proj1, pkg_name, content, tag="v1"
            )
            assert result1["artifact_id"] == expected_hash
            assert result1["deduplicated"] is False

            # Upload to second project
            result2 = upload_test_file(
                integration_client, proj2, pkg_name, content, tag="v1"
            )
            assert result2["artifact_id"] == expected_hash
            assert result2["deduplicated"] is True

        finally:
            # Cleanup
            integration_client.delete(f"/api/v1/projects/{proj1}")
            integration_client.delete(f"/api/v1/projects/{proj2}")

    @pytest.mark.integration
    def test_same_file_different_filenames_shares_artifact(
        self, integration_client, test_package
    ):
        """Test uploading same file with different original filenames shares artifact."""
        project, package = test_package
        content = b"content with different filenames"
        expected_hash = compute_sha256(content)

        # Upload with filename1
        result1 = upload_test_file(
            integration_client,
            project,
            package,
            content,
            filename="file1.bin",
            tag="v1",
        )
        assert result1["artifact_id"] == expected_hash

        # Upload with filename2
        result2 = upload_test_file(
            integration_client,
            project,
            package,
            content,
            filename="file2.bin",
            tag="v2",
        )
        assert result2["artifact_id"] == expected_hash
        assert result2["deduplicated"] is True

    @pytest.mark.integration
    def test_same_file_different_tags_shares_artifact(
        self, integration_client, test_package, unique_test_id
    ):
        """Test uploading same file with different tags shares artifact."""
        project, package = test_package
        content = f"content with different tags {unique_test_id}".encode()
        expected_hash = compute_sha256(content)

        tags = ["latest", "stable", "v1.0.0", "release"]
        for i, tag in enumerate(tags):
            result = upload_test_file(
                integration_client, project, package, content, tag=tag
            )
            assert result["artifact_id"] == expected_hash
            if i == 0:
                assert result["deduplicated"] is False
            else:
                assert result["deduplicated"] is True


class TestStorageVerification:
    """Tests to verify storage behavior after duplicate uploads."""

    @pytest.mark.integration
    def test_artifact_table_single_row_after_duplicates(
        self, integration_client, test_package
    ):
        """Test artifact table contains only one row after duplicate uploads."""
        project, package = test_package
        content = b"content for single row test"
        expected_hash = compute_sha256(content)

        # Upload same content multiple times with different tags
        for tag in ["v1", "v2", "v3"]:
            upload_test_file(integration_client, project, package, content, tag=tag)

        # Query artifact - should exist and be unique
        response = integration_client.get(f"/api/v1/artifact/{expected_hash}")
        assert response.status_code == 200
        artifact = response.json()
        assert artifact["id"] == expected_hash
        assert artifact["ref_count"] == 3

    @pytest.mark.integration
    def test_upload_table_multiple_rows_for_duplicates(
        self, integration_client, test_package
    ):
        """Test upload table contains multiple rows for duplicate uploads (event tracking)."""
        project, package = test_package
        content = b"content for upload tracking test"

        # Upload same content 3 times
        for tag in ["upload1", "upload2", "upload3"]:
            upload_test_file(integration_client, project, package, content, tag=tag)

        # Check package stats - should show 3 uploads but fewer unique artifacts
        response = integration_client.get(
            f"/api/v1/project/{project}/packages/{package}"
        )
        assert response.status_code == 200
        pkg_info = response.json()
        assert pkg_info["tag_count"] == 3

    @pytest.mark.integration
    def test_artifact_content_matches_original(self, integration_client, test_package):
        """Test artifact content retrieved matches original content exactly."""
        project, package = test_package
        original_content = b"exact content verification test data 12345"

        # Upload
        result = upload_test_file(
            integration_client, project, package, original_content, tag="verify"
        )

        # Download and compare
        download_response = integration_client.get(
            f"/api/v1/project/{project}/{package}/+/verify", params={"mode": "proxy"}
        )
        assert download_response.status_code == 200
        downloaded_content = download_response.content
        assert downloaded_content == original_content

    @pytest.mark.integration
    def test_storage_stats_reflect_deduplication(
        self, integration_client, test_package
    ):
        """Test global stats report deduplication savings after duplicate uploads."""
        project, package = test_package
        content = b"content for storage stats test - should only count once"

        # Upload same content 5 times
        for tag in ["a", "b", "c", "d", "e"]:
            upload_test_file(integration_client, project, package, content, tag=tag)

        # Check global stats
        response = integration_client.get("/api/v1/stats")
        assert response.status_code == 200
        stats = response.json()

        # Deduplication should show savings
        assert stats["deduplicated_uploads"] > 0
        assert stats["storage_saved_bytes"] > 0


class TestConcurrentUploads:
    """Tests for concurrent upload handling."""

    @pytest.mark.integration
    def test_concurrent_uploads_same_file(self, integration_client, test_package):
        """Test concurrent uploads of same file handle deduplication correctly."""
        project, package = test_package
        content = b"content for concurrent upload test"
        expected_hash = compute_sha256(content)
        num_concurrent = 5

        results = []
        errors = []

        def upload_worker(tag_suffix):
            try:
                # Create a new client for this thread
                from httpx import Client

                base_url = "http://localhost:8080"
                with Client(base_url=base_url, timeout=30.0) as client:
                    files = {
                        "file": (
                            f"concurrent-{tag_suffix}.bin",
                            io.BytesIO(content),
                            "application/octet-stream",
                        )
                    }
                    response = client.post(
                        f"/api/v1/project/{project}/{package}/upload",
                        files=files,
                        data={"tag": f"concurrent-{tag_suffix}"},
                    )
                    if response.status_code == 200:
                        results.append(response.json())
                    else:
                        errors.append(f"Status {response.status_code}: {response.text}")
            except Exception as e:
                errors.append(str(e))

        # Run concurrent uploads
        with ThreadPoolExecutor(max_workers=num_concurrent) as executor:
            futures = [executor.submit(upload_worker, i) for i in range(num_concurrent)]
            for future in as_completed(futures):
                pass  # Wait for all to complete

        # Verify results
        assert len(errors) == 0, f"Errors during concurrent uploads: {errors}"
        assert len(results) == num_concurrent

        # All should have same artifact_id
        artifact_ids = {r["artifact_id"] for r in results}
        assert len(artifact_ids) == 1
        assert expected_hash in artifact_ids

        # Verify final ref_count
        response = integration_client.get(f"/api/v1/artifact/{expected_hash}")
        assert response.status_code == 200
        assert response.json()["ref_count"] == num_concurrent


class TestDeduplicationAcrossRestarts:
    """Tests for deduplication persistence."""

    @pytest.mark.integration
    def test_deduplication_persists(
        self, integration_client, test_package, unique_test_id
    ):
        """
        Test deduplication works with persisted data.

        This test uploads content, then uploads the same content again.
        Since the database persists, the second upload should detect
        the existing artifact even without server restart.
        """
        project, package = test_package
        content = f"persisted content for dedup test {unique_test_id}".encode()
        expected_hash = compute_sha256(content)

        # First upload
        result1 = upload_test_file(
            integration_client, project, package, content, tag="persist1"
        )
        assert result1["artifact_id"] == expected_hash
        assert result1["deduplicated"] is False

        # Second upload (simulating after restart - data is persisted)
        result2 = upload_test_file(
            integration_client, project, package, content, tag="persist2"
        )
        assert result2["artifact_id"] == expected_hash
        assert result2["deduplicated"] is True

        # Verify artifact exists with correct ref_count
        response = integration_client.get(f"/api/v1/artifact/{expected_hash}")
        assert response.status_code == 200
        assert response.json()["ref_count"] == 2