""" Unit tests for duplicate detection and deduplication logic. Tests cover: - _exists() method correctly identifies existing S3 keys - S3 key generation follows expected pattern - Storage layer skips upload when artifact already exists - Storage layer performs upload when artifact does not exist """ import pytest import io from unittest.mock import MagicMock, patch from tests.conftest import ( compute_sha256, TEST_CONTENT_HELLO, TEST_HASH_HELLO, ) class TestExistsMethod: """Tests for the _exists() method that checks S3 object existence.""" @pytest.mark.unit def test_exists_returns_true_for_existing_key(self, mock_storage, mock_s3_client): """Test _exists() returns True when object exists.""" # Pre-populate the mock storage test_key = "fruits/df/fd/test-hash" mock_s3_client.objects[test_key] = b"content" result = mock_storage._exists(test_key) assert result is True @pytest.mark.unit def test_exists_returns_false_for_nonexistent_key(self, mock_storage): """Test _exists() returns False when object doesn't exist.""" result = mock_storage._exists("fruits/no/ne/nonexistent-key") assert result is False @pytest.mark.unit def test_exists_handles_404_error(self, mock_storage): """Test _exists() handles 404 errors gracefully.""" # The mock client raises ClientError for nonexistent keys result = mock_storage._exists("fruits/xx/yy/does-not-exist") assert result is False class TestS3KeyGeneration: """Tests for S3 key pattern generation.""" @pytest.mark.unit def test_s3_key_pattern(self): """Test S3 key follows pattern: fruits/{hash[:2]}/{hash[2:4]}/{hash}""" test_hash = "abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890" expected_key = f"fruits/{test_hash[:2]}/{test_hash[2:4]}/{test_hash}" # Expected: fruits/ab/cd/abcdef1234567890... assert expected_key == f"fruits/ab/cd/{test_hash}" @pytest.mark.unit def test_s3_key_generation_in_storage(self, mock_storage): """Test storage layer generates correct S3 key.""" content = TEST_CONTENT_HELLO file_obj = io.BytesIO(content) result = mock_storage._store_simple(file_obj) expected_key = ( f"fruits/{TEST_HASH_HELLO[:2]}/{TEST_HASH_HELLO[2:4]}/{TEST_HASH_HELLO}" ) assert result.s3_key == expected_key @pytest.mark.unit def test_s3_key_uses_sha256_hash(self, mock_storage): """Test S3 key is derived from SHA256 hash.""" content = b"unique test content for key test" file_obj = io.BytesIO(content) expected_hash = compute_sha256(content) result = mock_storage._store_simple(file_obj) # Key should contain the hash assert expected_hash in result.s3_key class TestDeduplicationBehavior: """Tests for deduplication (skip upload when exists).""" @pytest.mark.unit def test_skips_upload_when_exists(self, mock_storage, mock_s3_client): """Test storage skips S3 upload when artifact already exists.""" content = TEST_CONTENT_HELLO s3_key = ( f"fruits/{TEST_HASH_HELLO[:2]}/{TEST_HASH_HELLO[2:4]}/{TEST_HASH_HELLO}" ) # Pre-populate storage (simulate existing artifact) mock_s3_client.objects[s3_key] = content # Track put_object calls original_put = mock_s3_client.put_object put_called = [] def tracked_put(*args, **kwargs): put_called.append(True) return original_put(*args, **kwargs) mock_s3_client.put_object = tracked_put # Store the same content file_obj = io.BytesIO(content) result = mock_storage._store_simple(file_obj) # put_object should NOT have been called (deduplication) assert len(put_called) == 0 assert result.sha256 == TEST_HASH_HELLO @pytest.mark.unit def test_uploads_when_not_exists(self, mock_storage, mock_s3_client): """Test storage uploads to S3 when artifact doesn't exist.""" content = b"brand new unique content" content_hash = compute_sha256(content) s3_key = f"fruits/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" # Ensure object doesn't exist assert s3_key not in mock_s3_client.objects # Store the content file_obj = io.BytesIO(content) result = mock_storage._store_simple(file_obj) # Object should now exist in mock storage assert s3_key in mock_s3_client.objects assert mock_s3_client.objects[s3_key] == content @pytest.mark.unit def test_returns_same_hash_for_duplicate(self, mock_storage, mock_s3_client): """Test storing same content twice returns same hash.""" content = b"content to be stored twice" # First store file1 = io.BytesIO(content) result1 = mock_storage._store_simple(file1) # Second store (duplicate) file2 = io.BytesIO(content) result2 = mock_storage._store_simple(file2) assert result1.sha256 == result2.sha256 assert result1.s3_key == result2.s3_key @pytest.mark.unit def test_different_content_different_keys(self, mock_storage): """Test different content produces different S3 keys.""" content1 = b"first content" content2 = b"second content" file1 = io.BytesIO(content1) result1 = mock_storage._store_simple(file1) file2 = io.BytesIO(content2) result2 = mock_storage._store_simple(file2) assert result1.sha256 != result2.sha256 assert result1.s3_key != result2.s3_key class TestDeduplicationEdgeCases: """Edge case tests for deduplication.""" @pytest.mark.unit def test_same_content_different_filenames(self, mock_storage): """Test same content with different metadata is deduplicated.""" content = b"identical content" # Store with "filename1" file1 = io.BytesIO(content) result1 = mock_storage._store_simple(file1) # Store with "filename2" (same content) file2 = io.BytesIO(content) result2 = mock_storage._store_simple(file2) # Both should have same hash (content-addressable) assert result1.sha256 == result2.sha256 @pytest.mark.unit def test_whitespace_only_difference(self, mock_storage): """Test content differing only by whitespace produces different hashes.""" content1 = b"test content" content2 = b"test content" # Extra space content3 = b"test content " # Trailing space file1 = io.BytesIO(content1) file2 = io.BytesIO(content2) file3 = io.BytesIO(content3) result1 = mock_storage._store_simple(file1) result2 = mock_storage._store_simple(file2) result3 = mock_storage._store_simple(file3) # All should be different (content-addressable) assert len({result1.sha256, result2.sha256, result3.sha256}) == 3