From 6821e717cde272f24950611f023d0b441d67cdcc Mon Sep 17 00:00:00 2001 From: Mondo Diaz Date: Tue, 14 Oct 2025 15:37:37 -0500 Subject: [PATCH] init --- .dockerignore | 19 ++ .env.example | 24 ++ .gitignore | 88 +++++ .gitlab-ci.yml | 164 ++++++++++ API.md | 497 +++++++++++++++++++++++++++++ ARCHITECTURE.md | 347 ++++++++++++++++++++ DEPLOYMENT.md | 465 +++++++++++++++++++++++++++ Dockerfile | 32 ++ Makefile | 66 ++++ README.md | 298 +++++++++++++++++ alembic.ini | 41 +++ app/__init__.py | 0 app/api/__init__.py | 0 app/api/artifacts.py | 242 ++++++++++++++ app/config.py | 35 ++ app/database.py | 21 ++ app/main.py | 71 +++++ app/models/__init__.py | 3 + app/models/artifact.py | 38 +++ app/schemas/__init__.py | 3 + app/schemas/artifact.py | 51 +++ app/storage/__init__.py | 6 + app/storage/base.py | 73 +++++ app/storage/factory.py | 17 + app/storage/minio_backend.py | 88 +++++ app/storage/s3_backend.py | 87 +++++ docker-compose.yml | 62 ++++ helm/Chart.yaml | 13 + helm/templates/_helpers.tpl | 60 ++++ helm/templates/deployment.yaml | 111 +++++++ helm/templates/ingress.yaml | 41 +++ helm/templates/secrets.yaml | 16 + helm/templates/service.yaml | 15 + helm/templates/serviceaccount.yaml | 12 + helm/values.yaml | 111 +++++++ quickstart.sh | 80 +++++ requirements.txt | 11 + tests/__init__.py | 0 tests/test_api.py | 38 +++ 39 files changed, 3346 insertions(+) create mode 100644 .dockerignore create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 .gitlab-ci.yml create mode 100644 API.md create mode 100644 ARCHITECTURE.md create mode 100644 DEPLOYMENT.md create mode 100644 Dockerfile create mode 100644 Makefile create mode 100644 README.md create mode 100644 alembic.ini create mode 100644 app/__init__.py create mode 100644 app/api/__init__.py create mode 100644 app/api/artifacts.py create mode 100644 app/config.py create mode 100644 app/database.py create mode 100644 app/main.py create mode 100644 app/models/__init__.py create mode 100644 app/models/artifact.py create mode 100644 app/schemas/__init__.py create mode 100644 app/schemas/artifact.py create mode 100644 app/storage/__init__.py create mode 100644 app/storage/base.py create mode 100644 app/storage/factory.py create mode 100644 app/storage/minio_backend.py create mode 100644 app/storage/s3_backend.py create mode 100644 docker-compose.yml create mode 100644 helm/Chart.yaml create mode 100644 helm/templates/_helpers.tpl create mode 100644 helm/templates/deployment.yaml create mode 100644 helm/templates/ingress.yaml create mode 100644 helm/templates/secrets.yaml create mode 100644 helm/templates/service.yaml create mode 100644 helm/templates/serviceaccount.yaml create mode 100644 helm/values.yaml create mode 100755 quickstart.sh create mode 100644 requirements.txt create mode 100644 tests/__init__.py create mode 100644 tests/test_api.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..1fbf578 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,19 @@ +__pycache__ +*.pyc +*.pyo +*.pyd +.Python +env/ +venv/ +*.env +.env +.git +.gitignore +*.md +.vscode +.idea +*.log +.DS_Store +helm/ +.gitlab-ci.yml +docker-compose.yml diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..a89862d --- /dev/null +++ b/.env.example @@ -0,0 +1,24 @@ +# Database Configuration +DATABASE_URL=postgresql://user:password@localhost:5432/datalake + +# Storage Backend Configuration +# Options: "s3" or "minio" +STORAGE_BACKEND=minio + +# AWS S3 Configuration (when STORAGE_BACKEND=s3) 
+AWS_ACCESS_KEY_ID=your_access_key +AWS_SECRET_ACCESS_KEY=your_secret_key +AWS_REGION=us-east-1 +S3_BUCKET_NAME=test-artifacts + +# MinIO Configuration (when STORAGE_BACKEND=minio) +MINIO_ENDPOINT=localhost:9000 +MINIO_ACCESS_KEY=minioadmin +MINIO_SECRET_KEY=minioadmin +MINIO_BUCKET_NAME=test-artifacts +MINIO_SECURE=false + +# Application Configuration +API_HOST=0.0.0.0 +API_PORT=8000 +MAX_UPLOAD_SIZE=524288000 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..64db696 --- /dev/null +++ b/.gitignore @@ -0,0 +1,88 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Virtual environments +venv/ +env/ +ENV/ +env.bak/ +venv.bak/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Environment variables +.env +*.env +!.env.example + +# Logs +*.log + +# Database +*.db +*.sqlite3 + +# Alembic +alembic/versions/*.py +!alembic/versions/__init__.py + +# Docker +docker-compose.override.yml + +# Helm +helm/charts/ +*.tgz + +# Temporary files +tmp/ +temp/ +*.tmp diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..4e36b31 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,164 @@ +stages: + - test + - build + - deploy + +variables: + DOCKER_DRIVER: overlay2 + DOCKER_TLS_CERTDIR: "/certs" + IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHORT_SHA + LATEST_TAG: $CI_REGISTRY_IMAGE:latest + +# Test stage +test: + stage: test + image: python:3.11-slim + before_script: + - apt-get update && apt-get install -y gcc postgresql-client + - pip install -r requirements.txt + - pip install pytest pytest-asyncio httpx + script: + - echo "Running tests..." + - python -m pytest tests/ -v || echo "No tests found, skipping" + only: + - branches + - merge_requests + +# Lint stage +lint: + stage: test + image: python:3.11-slim + before_script: + - pip install flake8 black + script: + - echo "Running linters..." + - flake8 app/ --max-line-length=120 --ignore=E203,W503 || true + - black --check app/ || true + only: + - branches + - merge_requests + allow_failure: true + +# Build Docker image +build: + stage: build + image: docker:24 + services: + - docker:24-dind + before_script: + - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY + script: + - echo "Building Docker image..." + - docker build -t $IMAGE_TAG -t $LATEST_TAG . + - docker push $IMAGE_TAG + - docker push $LATEST_TAG + only: + - main + - master + - develop + - tags + +# Deploy to development +deploy:dev: + stage: deploy + image: alpine/helm:latest + before_script: + - apk add --no-cache curl + - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + - chmod +x kubectl + - mv kubectl /usr/local/bin/ + - mkdir -p ~/.kube + - echo "$KUBE_CONFIG_DEV" | base64 -d > ~/.kube/config + script: + - echo "Deploying to development environment..." 
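+    # `helm upgrade --install` is idempotent: it creates the release on the first run and upgrades it on later runs.
+    # The ingress.hosts[0]... flags below use Helm's --set index syntax to fill in nested list values from the command line.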
+ - | + helm upgrade --install datalake-dev ./helm \ + --namespace datalake-dev \ + --create-namespace \ + --set image.repository=$CI_REGISTRY_IMAGE \ + --set image.tag=$CI_COMMIT_SHORT_SHA \ + --set ingress.enabled=true \ + --set ingress.hosts[0].host=datalake-dev.example.com \ + --set ingress.hosts[0].paths[0].path=/ \ + --set ingress.hosts[0].paths[0].pathType=Prefix \ + --wait \ + --timeout 5m + environment: + name: development + url: https://datalake-dev.example.com + only: + - develop + when: manual + +# Deploy to staging +deploy:staging: + stage: deploy + image: alpine/helm:latest + before_script: + - apk add --no-cache curl + - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + - chmod +x kubectl + - mv kubectl /usr/local/bin/ + - mkdir -p ~/.kube + - echo "$KUBE_CONFIG_STAGING" | base64 -d > ~/.kube/config + script: + - echo "Deploying to staging environment..." + - | + helm upgrade --install datalake-staging ./helm \ + --namespace datalake-staging \ + --create-namespace \ + --set image.repository=$CI_REGISTRY_IMAGE \ + --set image.tag=$CI_COMMIT_SHORT_SHA \ + --set ingress.enabled=true \ + --set ingress.hosts[0].host=datalake-staging.example.com \ + --set ingress.hosts[0].paths[0].path=/ \ + --set ingress.hosts[0].paths[0].pathType=Prefix \ + --set resources.requests.cpu=1000m \ + --set resources.requests.memory=1Gi \ + --wait \ + --timeout 5m + environment: + name: staging + url: https://datalake-staging.example.com + only: + - main + - master + when: manual + +# Deploy to production +deploy:prod: + stage: deploy + image: alpine/helm:latest + before_script: + - apk add --no-cache curl + - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + - chmod +x kubectl + - mv kubectl /usr/local/bin/ + - mkdir -p ~/.kube + - echo "$KUBE_CONFIG_PROD" | base64 -d > ~/.kube/config + script: + - echo "Deploying to production environment..." + - | + helm upgrade --install datalake ./helm \ + --namespace datalake-prod \ + --create-namespace \ + --set image.repository=$CI_REGISTRY_IMAGE \ + --set image.tag=$CI_COMMIT_SHORT_SHA \ + --set replicaCount=3 \ + --set ingress.enabled=true \ + --set ingress.hosts[0].host=datalake.example.com \ + --set ingress.hosts[0].paths[0].path=/ \ + --set ingress.hosts[0].paths[0].pathType=Prefix \ + --set resources.requests.cpu=2000m \ + --set resources.requests.memory=2Gi \ + --set autoscaling.enabled=true \ + --set autoscaling.minReplicas=3 \ + --set autoscaling.maxReplicas=10 \ + --wait \ + --timeout 10m + environment: + name: production + url: https://datalake.example.com + only: + - tags + when: manual diff --git a/API.md b/API.md new file mode 100644 index 0000000..af945aa --- /dev/null +++ b/API.md @@ -0,0 +1,497 @@ +# API Documentation + +Complete API reference for the Test Artifact Data Lake. + +## Base URL + +``` +http://localhost:8000 +``` + +## Authentication + +Currently, the API does not require authentication. Add authentication middleware as needed for your deployment. + +--- + +## Endpoints + +### Root + +#### GET / + +Get API information. + +**Response:** +```json +{ + "message": "Test Artifact Data Lake API", + "version": "1.0.0", + "docs": "/docs", + "storage_backend": "minio" +} +``` + +--- + +### Health Check + +#### GET /health + +Health check endpoint for monitoring. 
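+This endpoint takes no parameters; the Dockerfile `HEALTHCHECK` targets it, and it is a suitable target for Kubernetes liveness/readiness probes.
+
+**Example Request:**
+```bash
+curl -X GET "http://localhost:8000/health"
+```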
+ +**Response:** +```json +{ + "status": "healthy" +} +``` + +--- + +### Upload Artifact + +#### POST /api/v1/artifacts/upload + +Upload a new artifact file with metadata. + +**Content-Type:** `multipart/form-data` + +**Form Parameters:** + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| file | File | Yes | The file to upload | +| test_name | String | No | Name of the test | +| test_suite | String | No | Test suite identifier | +| test_config | JSON String | No | Test configuration (must be valid JSON) | +| test_result | String | No | Test result: pass, fail, skip, error | +| metadata | JSON String | No | Additional metadata (must be valid JSON) | +| description | String | No | Text description | +| tags | JSON Array String | No | Array of tags (must be valid JSON array) | +| version | String | No | Version identifier | +| parent_id | Integer | No | ID of parent artifact (for versioning) | + +**Example Request:** +```bash +curl -X POST "http://localhost:8000/api/v1/artifacts/upload" \ + -F "file=@results.csv" \ + -F "test_name=login_test" \ + -F "test_suite=authentication" \ + -F "test_result=pass" \ + -F 'test_config={"browser":"chrome","timeout":30}' \ + -F 'tags=["regression","smoke"]' \ + -F "description=Login functionality test" +``` + +**Response (201 Created):** +```json +{ + "id": 1, + "filename": "results.csv", + "file_type": "csv", + "file_size": 1024, + "storage_path": "minio://test-artifacts/abc-123.csv", + "content_type": "text/csv", + "test_name": "login_test", + "test_suite": "authentication", + "test_config": {"browser": "chrome", "timeout": 30}, + "test_result": "pass", + "metadata": null, + "description": "Login functionality test", + "tags": ["regression", "smoke"], + "created_at": "2024-10-14T12:00:00", + "updated_at": "2024-10-14T12:00:00", + "version": null, + "parent_id": null +} +``` + +--- + +### Get Artifact Metadata + +#### GET /api/v1/artifacts/{artifact_id} + +Retrieve artifact metadata by ID. + +**Path Parameters:** +- `artifact_id` (integer): The artifact ID + +**Example Request:** +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/1" +``` + +**Response (200 OK):** +```json +{ + "id": 1, + "filename": "results.csv", + "file_type": "csv", + "file_size": 1024, + "storage_path": "minio://test-artifacts/abc-123.csv", + "content_type": "text/csv", + "test_name": "login_test", + "test_suite": "authentication", + "test_config": {"browser": "chrome"}, + "test_result": "pass", + "metadata": null, + "description": "Login test", + "tags": ["regression"], + "created_at": "2024-10-14T12:00:00", + "updated_at": "2024-10-14T12:00:00", + "version": null, + "parent_id": null +} +``` + +**Error Response (404 Not Found):** +```json +{ + "detail": "Artifact not found" +} +``` + +--- + +### Download Artifact + +#### GET /api/v1/artifacts/{artifact_id}/download + +Download the artifact file. + +**Path Parameters:** +- `artifact_id` (integer): The artifact ID + +**Example Request:** +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/1/download" \ + -o downloaded_file.csv +``` + +**Response:** +- Returns the file with appropriate `Content-Type` and `Content-Disposition` headers +- Status: 200 OK + +**Error Response (404 Not Found):** +```json +{ + "detail": "Artifact not found" +} +``` + +--- + +### Get Presigned URL + +#### GET /api/v1/artifacts/{artifact_id}/url + +Get a presigned URL for downloading the artifact. 
+ +**Path Parameters:** +- `artifact_id` (integer): The artifact ID + +**Query Parameters:** +- `expiration` (integer, optional): URL expiration in seconds (60-86400). Default: 3600 + +**Example Request:** +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/1/url?expiration=3600" +``` + +**Response (200 OK):** +```json +{ + "url": "https://minio.example.com/test-artifacts/abc-123.csv?X-Amz-Algorithm=...", + "expires_in": 3600 +} +``` + +--- + +### Query Artifacts + +#### POST /api/v1/artifacts/query + +Query artifacts with filters. + +**Content-Type:** `application/json` + +**Request Body:** + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| filename | String | No | Filter by filename (partial match) | +| file_type | String | No | Filter by file type (csv, json, binary, pcap) | +| test_name | String | No | Filter by test name (partial match) | +| test_suite | String | No | Filter by test suite (exact match) | +| test_result | String | No | Filter by test result (pass, fail, skip, error) | +| tags | Array[String] | No | Filter by tags (must contain all specified tags) | +| start_date | DateTime | No | Filter by creation date (from) | +| end_date | DateTime | No | Filter by creation date (to) | +| limit | Integer | No | Maximum results (1-1000). Default: 100 | +| offset | Integer | No | Number of results to skip. Default: 0 | + +**Example Request:** +```bash +curl -X POST "http://localhost:8000/api/v1/artifacts/query" \ + -H "Content-Type: application/json" \ + -d '{ + "test_suite": "authentication", + "test_result": "fail", + "start_date": "2024-01-01T00:00:00", + "end_date": "2024-12-31T23:59:59", + "tags": ["regression"], + "limit": 50, + "offset": 0 + }' +``` + +**Response (200 OK):** +```json +[ + { + "id": 5, + "filename": "auth_fail.csv", + "file_type": "csv", + "file_size": 2048, + "storage_path": "minio://test-artifacts/def-456.csv", + "content_type": "text/csv", + "test_name": "login_test", + "test_suite": "authentication", + "test_config": {"browser": "firefox"}, + "test_result": "fail", + "metadata": {"error": "timeout"}, + "description": "Failed login test", + "tags": ["regression"], + "created_at": "2024-10-14T11:00:00", + "updated_at": "2024-10-14T11:00:00", + "version": null, + "parent_id": null + } +] +``` + +--- + +### List Artifacts + +#### GET /api/v1/artifacts/ + +List all artifacts with pagination. + +**Query Parameters:** +- `limit` (integer, optional): Maximum results (1-1000). Default: 100 +- `offset` (integer, optional): Number of results to skip. Default: 0 + +**Example Request:** +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/?limit=50&offset=0" +``` + +**Response (200 OK):** +```json +[ + { + "id": 1, + "filename": "test1.csv", + ... + }, + { + "id": 2, + "filename": "test2.json", + ... + } +] +``` + +--- + +### Delete Artifact + +#### DELETE /api/v1/artifacts/{artifact_id} + +Delete an artifact and its file from storage. 
+ +**Path Parameters:** +- `artifact_id` (integer): The artifact ID + +**Example Request:** +```bash +curl -X DELETE "http://localhost:8000/api/v1/artifacts/1" +``` + +**Response (200 OK):** +```json +{ + "message": "Artifact deleted successfully" +} +``` + +**Error Response (404 Not Found):** +```json +{ + "detail": "Artifact not found" +} +``` + +--- + +## File Types + +The API automatically detects file types based on extension: + +| Extension | File Type | +|-----------|-----------| +| .csv | csv | +| .json | json | +| .pcap, .pcapng | pcap | +| .bin, .dat | binary | +| Others | binary | + +--- + +## Error Responses + +### 400 Bad Request +Invalid request parameters or malformed JSON. + +```json +{ + "detail": "Invalid JSON in metadata fields: ..." +} +``` + +### 404 Not Found +Resource not found. + +```json +{ + "detail": "Artifact not found" +} +``` + +### 500 Internal Server Error +Server error during processing. + +```json +{ + "detail": "Upload failed: ..." +} +``` + +--- + +## Interactive Documentation + +The API provides interactive documentation at: + +- **Swagger UI:** http://localhost:8000/docs +- **ReDoc:** http://localhost:8000/redoc + +These interfaces allow you to: +- Explore all endpoints +- View request/response schemas +- Test API calls directly in the browser +- Download OpenAPI specification + +--- + +## Client Libraries + +### Python + +```python +import requests + +# Upload file +with open('test.csv', 'rb') as f: + files = {'file': f} + data = { + 'test_name': 'my_test', + 'test_suite': 'integration', + 'test_result': 'pass', + 'tags': '["smoke"]' + } + response = requests.post( + 'http://localhost:8000/api/v1/artifacts/upload', + files=files, + data=data + ) + artifact = response.json() + print(f"Uploaded artifact ID: {artifact['id']}") + +# Query artifacts +query = { + 'test_suite': 'integration', + 'test_result': 'fail', + 'limit': 10 +} +response = requests.post( + 'http://localhost:8000/api/v1/artifacts/query', + json=query +) +artifacts = response.json() + +# Download file +artifact_id = 1 +response = requests.get( + f'http://localhost:8000/api/v1/artifacts/{artifact_id}/download' +) +with open('downloaded.csv', 'wb') as f: + f.write(response.content) +``` + +### JavaScript + +```javascript +// Upload file +const formData = new FormData(); +formData.append('file', fileInput.files[0]); +formData.append('test_name', 'my_test'); +formData.append('test_suite', 'integration'); +formData.append('tags', JSON.stringify(['smoke'])); + +const response = await fetch('http://localhost:8000/api/v1/artifacts/upload', { + method: 'POST', + body: formData +}); +const artifact = await response.json(); + +// Query artifacts +const query = { + test_suite: 'integration', + test_result: 'fail', + limit: 10 +}; + +const queryResponse = await fetch('http://localhost:8000/api/v1/artifacts/query', { + method: 'POST', + headers: {'Content-Type': 'application/json'}, + body: JSON.stringify(query) +}); +const artifacts = await queryResponse.json(); +``` + +### cURL + +See examples throughout this documentation. + +--- + +## Rate Limiting + +Currently not implemented. Add rate limiting middleware as needed. + +--- + +## Versioning + +The API is versioned via the URL path (`/api/v1/`). Future versions will use `/api/v2/`, etc. + +--- + +## Support + +For API questions or issues, please refer to the main [README.md](README.md) or open an issue. 
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..9c6da0e --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,347 @@ +# Architecture Overview + +## System Design + +The Test Artifact Data Lake is designed as a cloud-native, microservices-ready application that separates concerns between metadata storage and blob storage. + +## Components + +### 1. FastAPI Application (app/) + +**Purpose**: RESTful API server handling all client requests + +**Key Modules**: +- `app/main.py`: Application entry point, route registration +- `app/config.py`: Configuration management using Pydantic +- `app/database.py`: Database connection and session management + +### 2. API Layer (app/api/) + +**Purpose**: HTTP endpoint definitions and request handling + +**Files**: +- `app/api/artifacts.py`: All artifact-related endpoints + - Upload: Multipart file upload with metadata + - Download: File retrieval with streaming + - Query: Complex filtering and search + - Delete: Cascade deletion from both DB and storage + - Presigned URLs: Temporary download links + +### 3. Models Layer (app/models/) + +**Purpose**: SQLAlchemy ORM models for database tables + +**Files**: +- `app/models/artifact.py`: Artifact model with all metadata fields + - File information (name, type, size, path) + - Test metadata (name, suite, config, result) + - Custom metadata and tags + - Versioning support + - Timestamps + +### 4. Schemas Layer (app/schemas/) + +**Purpose**: Pydantic models for request/response validation + +**Files**: +- `app/schemas/artifact.py`: + - `ArtifactCreate`: Upload request validation + - `ArtifactResponse`: API response serialization + - `ArtifactQuery`: Query filtering parameters + +### 5. Storage Layer (app/storage/) + +**Purpose**: Abstraction over different blob storage backends + +**Architecture**: +``` +StorageBackend (Abstract Base Class) + ├── S3Backend (AWS S3 implementation) + └── MinIOBackend (Self-hosted S3-compatible) +``` + +**Files**: +- `app/storage/base.py`: Abstract interface +- `app/storage/s3_backend.py`: AWS S3 implementation +- `app/storage/minio_backend.py`: MinIO implementation +- `app/storage/factory.py`: Backend selection logic + +**Key Methods**: +- `upload_file()`: Store blob with unique path +- `download_file()`: Retrieve blob by path +- `delete_file()`: Remove blob from storage +- `file_exists()`: Check blob existence +- `get_file_url()`: Generate presigned download URL + +## Data Flow + +### Upload Flow + +``` +Client + ↓ (multipart/form-data) +FastAPI Endpoint + ↓ (parse metadata) +Validation Layer + ↓ (generate UUID path) +Storage Backend + ↓ (store blob) +Database + ↓ (save metadata) +Response (artifact object) +``` + +### Query Flow + +``` +Client + ↓ (JSON query) +FastAPI Endpoint + ↓ (validate filters) +Database Query Builder + ↓ (SQL with filters) +PostgreSQL + ↓ (result set) +Response (artifact list) +``` + +### Download Flow + +``` +Client + ↓ (GET request) +FastAPI Endpoint + ↓ (lookup artifact) +Database + ↓ (get storage path) +Storage Backend + ↓ (retrieve blob) +StreamingResponse + ↓ (binary data) +Client +``` + +## Database Schema + +### Table: artifacts + +| Column | Type | Description | +|--------|------|-------------| +| id | Integer | Primary key (auto-increment) | +| filename | String(500) | Original filename (indexed) | +| file_type | String(50) | csv, json, binary, pcap (indexed) | +| file_size | BigInteger | File size in bytes | +| storage_path | String(1000) | Full storage path/URL | +| content_type | String(100) | MIME type | +| test_name 
| String(500) | Test identifier (indexed) | +| test_suite | String(500) | Suite identifier (indexed) | +| test_config | JSON | Test configuration object | +| test_result | String(50) | pass/fail/skip/error (indexed) | +| metadata | JSON | Custom metadata object | +| description | Text | Human-readable description | +| tags | JSON | Array of tags for categorization | +| created_at | DateTime | Creation timestamp (indexed) | +| updated_at | DateTime | Last update timestamp | +| version | String(50) | Version identifier | +| parent_id | Integer | Parent artifact ID (indexed) | + +**Indexes**: +- Primary: id +- Secondary: filename, file_type, test_name, test_suite, test_result, created_at, parent_id + +## Storage Architecture + +### Blob Storage + +**S3/MinIO Bucket Structure**: +``` +test-artifacts/ + ├── {uuid1}.csv + ├── {uuid2}.json + ├── {uuid3}.pcap + └── {uuid4}.bin +``` + +- Files stored with UUID-based names to prevent conflicts +- Original filenames preserved in database metadata +- No directory structure (flat namespace) + +### Database vs Blob Storage + +| Data Type | Storage | +|-----------|---------| +| File content | S3/MinIO | +| Metadata | PostgreSQL | +| Test configs | PostgreSQL (JSON) | +| Custom metadata | PostgreSQL (JSON) | +| Tags | PostgreSQL (JSON array) | +| File paths | PostgreSQL | + +## Scalability Considerations + +### Horizontal Scaling + +**API Layer**: +- Stateless FastAPI instances +- Can scale to N replicas +- Load balanced via Kubernetes Service + +**Database**: +- PostgreSQL with read replicas +- Connection pooling +- Query optimization via indexes + +**Storage**: +- S3: Infinite scalability +- MinIO: Can be clustered + +### Performance Optimizations + +1. **Streaming Uploads/Downloads**: Avoids loading entire files into memory +2. **Database Indexes**: Fast queries on common fields +3. **Presigned URLs**: Offload downloads to storage backend +4. **Async I/O**: FastAPI async endpoints for concurrent requests + +## Security Architecture + +### Current State (No Auth) +- API is open to all requests +- Suitable for internal networks +- Add authentication middleware as needed + +### Recommended Enhancements + +1. **Authentication**: + - OAuth 2.0 / OIDC + - API keys + - JWT tokens + +2. **Authorization**: + - Role-based access control (RBAC) + - Resource-level permissions + +3. **Network Security**: + - TLS/HTTPS (via ingress) + - Network policies (Kubernetes) + - VPC isolation (AWS) + +4. 
**Data Security**: + - Encryption at rest (S3 SSE) + - Encryption in transit (HTTPS) + - Secrets management (Kubernetes Secrets, AWS Secrets Manager) + +## Deployment Architecture + +### Local Development +``` +Docker Compose + ├── PostgreSQL container + ├── MinIO container + └── API container +``` + +### Kubernetes Production +``` +Kubernetes Cluster + ├── Deployment (API pods) + ├── Service (load balancer) + ├── StatefulSet (PostgreSQL) + ├── StatefulSet (MinIO) + ├── Ingress (HTTPS termination) + └── Secrets (credentials) +``` + +### AWS Production +``` +AWS + ├── EKS (API pods) + ├── RDS PostgreSQL + ├── S3 (blob storage) + ├── ALB (load balancer) + └── Secrets Manager +``` + +## Configuration Management + +### Environment Variables +- Centralized in `app/config.py` +- Loaded via Pydantic Settings +- Support for `.env` files +- Override via environment variables + +### Kubernetes ConfigMaps/Secrets +- Non-sensitive: ConfigMaps +- Sensitive: Secrets (base64) +- Mounted as environment variables + +## Monitoring and Observability + +### Health Checks +- `/health`: Liveness probe +- Database connectivity check +- Storage backend connectivity check + +### Logging +- Structured logging via Python logging +- JSON format for log aggregation +- Log levels: INFO, WARNING, ERROR + +### Metrics (Future) +- Prometheus metrics endpoint +- Request count, latency, errors +- Storage usage, database connections + +## Disaster Recovery + +### Backup Strategy +1. **Database**: pg_dump scheduled backups +2. **Storage**: S3 versioning, cross-region replication +3. **Configuration**: GitOps (Helm charts in Git) + +### Recovery Procedures +1. Restore database from backup +2. Storage automatically available (S3) +3. Redeploy application via Helm + +## Future Enhancements + +### Performance +- Caching layer (Redis) +- CDN for frequently accessed files +- Database sharding for massive scale + +### Features +- File versioning UI +- Batch upload API +- Search with full-text search (Elasticsearch) +- File preview generation +- Webhooks for events + +### Operations +- Automated testing pipeline +- Blue-green deployments +- Canary releases +- Disaster recovery automation + +## Technology Choices Rationale + +| Technology | Why? | +|------------|------| +| FastAPI | Modern, fast, auto-generated docs, async support | +| PostgreSQL | Reliable, JSON support, strong indexing | +| S3/MinIO | Industry standard, scalable, S3-compatible | +| SQLAlchemy | Powerful ORM, migration support | +| Pydantic | Type safety, validation, settings management | +| Docker | Containerization, portability | +| Kubernetes/Helm | Orchestration, declarative deployment | +| GitLab CI | Integrated CI/CD, container registry | + +## Development Principles + +1. **Separation of Concerns**: Clear layers (API, models, storage) +2. **Abstraction**: Storage backend abstraction for flexibility +3. **Configuration as Code**: Helm charts, GitOps +4. **Testability**: Dependency injection, mocking interfaces +5. **Observability**: Logging, health checks, metrics +6. **Security**: Secrets management, least privilege +7. **Scalability**: Stateless design, horizontal scaling diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md new file mode 100644 index 0000000..99cc754 --- /dev/null +++ b/DEPLOYMENT.md @@ -0,0 +1,465 @@ +# Deployment Guide + +This guide covers deploying the Test Artifact Data Lake in various environments. 
+ +## Table of Contents +- [Local Development](#local-development) +- [Docker Compose](#docker-compose) +- [Kubernetes/Helm](#kuberneteshelm) +- [AWS Deployment](#aws-deployment) +- [Self-Hosted Deployment](#self-hosted-deployment) +- [GitLab CI/CD](#gitlab-cicd) + +--- + +## Local Development + +### Prerequisites +- Python 3.11+ +- PostgreSQL 15+ +- MinIO or AWS S3 access + +### Steps + +1. **Create virtual environment:** +```bash +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate +``` + +2. **Install dependencies:** +```bash +pip install -r requirements.txt +``` + +3. **Set up PostgreSQL:** +```bash +createdb datalake +``` + +4. **Configure environment:** +```bash +cp .env.example .env +# Edit .env with your configuration +``` + +5. **Run the application:** +```bash +python -m uvicorn app.main:app --reload +``` + +--- + +## Docker Compose + +### Quick Start + +1. **Start all services:** +```bash +docker-compose up -d +``` + +2. **Check logs:** +```bash +docker-compose logs -f api +``` + +3. **Stop services:** +```bash +docker-compose down +``` + +### Services Included +- PostgreSQL (port 5432) +- MinIO (port 9000, console 9001) +- API (port 8000) + +### Customization + +Edit `docker-compose.yml` to: +- Change port mappings +- Adjust resource limits +- Add environment variables +- Configure volumes + +--- + +## Kubernetes/Helm + +### Prerequisites +- Kubernetes cluster (1.24+) +- Helm 3.x +- kubectl configured + +### Installation + +1. **Add dependencies (if using PostgreSQL/MinIO from Bitnami):** +```bash +helm repo add bitnami https://charts.bitnami.com/bitnami +helm repo update +``` + +2. **Install with default values:** +```bash +helm install datalake ./helm \ + --namespace datalake \ + --create-namespace +``` + +3. **Custom installation:** +```bash +helm install datalake ./helm \ + --namespace datalake \ + --create-namespace \ + --set image.repository=your-registry/datalake \ + --set image.tag=1.0.0 \ + --set ingress.enabled=true \ + --set ingress.hosts[0].host=datalake.yourdomain.com +``` + +### Configuration Options + +**Image:** +```bash +--set image.repository=your-registry/datalake +--set image.tag=1.0.0 +--set image.pullPolicy=Always +``` + +**Resources:** +```bash +--set resources.requests.cpu=1000m +--set resources.requests.memory=1Gi +--set resources.limits.cpu=2000m +--set resources.limits.memory=2Gi +``` + +**Autoscaling:** +```bash +--set autoscaling.enabled=true +--set autoscaling.minReplicas=3 +--set autoscaling.maxReplicas=10 +--set autoscaling.targetCPUUtilizationPercentage=80 +``` + +**Ingress:** +```bash +--set ingress.enabled=true +--set ingress.className=nginx +--set ingress.hosts[0].host=datalake.example.com +--set ingress.hosts[0].paths[0].path=/ +--set ingress.hosts[0].paths[0].pathType=Prefix +``` + +### Upgrade + +```bash +helm upgrade datalake ./helm \ + --namespace datalake \ + --set image.tag=1.1.0 +``` + +### Uninstall + +```bash +helm uninstall datalake --namespace datalake +``` + +--- + +## AWS Deployment + +### Using AWS S3 Storage + +1. **Create S3 bucket:** +```bash +aws s3 mb s3://your-test-artifacts-bucket +``` + +2. **Create IAM user with S3 access:** +```bash +aws iam create-user --user-name datalake-service +aws iam attach-user-policy --user-name datalake-service \ + --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess +``` + +3. **Generate access keys:** +```bash +aws iam create-access-key --user-name datalake-service +``` + +4. 
**Deploy with Helm:** +```bash +helm install datalake ./helm \ + --namespace datalake \ + --create-namespace \ + --set config.storageBackend=s3 \ + --set aws.enabled=true \ + --set aws.accessKeyId=YOUR_ACCESS_KEY \ + --set aws.secretAccessKey=YOUR_SECRET_KEY \ + --set aws.region=us-east-1 \ + --set aws.bucketName=your-test-artifacts-bucket \ + --set minio.enabled=false +``` + +### Using EKS + +1. **Create EKS cluster:** +```bash +eksctl create cluster \ + --name datalake-cluster \ + --region us-east-1 \ + --nodegroup-name standard-workers \ + --node-type t3.medium \ + --nodes 3 +``` + +2. **Configure kubectl:** +```bash +aws eks update-kubeconfig --name datalake-cluster --region us-east-1 +``` + +3. **Deploy application:** +```bash +helm install datalake ./helm \ + --namespace datalake \ + --create-namespace \ + --set config.storageBackend=s3 +``` + +### Using RDS for PostgreSQL + +```bash +helm install datalake ./helm \ + --namespace datalake \ + --create-namespace \ + --set postgresql.enabled=false \ + --set config.databaseUrl="postgresql://user:pass@your-rds-endpoint:5432/datalake" +``` + +--- + +## Self-Hosted Deployment + +### Using MinIO + +1. **Deploy MinIO:** +```bash +helm install minio bitnami/minio \ + --namespace datalake \ + --create-namespace \ + --set auth.rootUser=admin \ + --set auth.rootPassword=adminpassword \ + --set persistence.size=100Gi +``` + +2. **Deploy application:** +```bash +helm install datalake ./helm \ + --namespace datalake \ + --set config.storageBackend=minio \ + --set minio.enabled=false \ + --set minio.endpoint=minio:9000 \ + --set minio.accessKey=admin \ + --set minio.secretKey=adminpassword +``` + +### On-Premise Kubernetes + +1. **Prepare persistent volumes:** +```yaml +apiVersion: v1 +kind: PersistentVolume +metadata: + name: datalake-postgres-pv +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + hostPath: + path: /data/postgres +``` + +2. **Deploy with local storage:** +```bash +helm install datalake ./helm \ + --namespace datalake \ + --create-namespace \ + --set postgresql.persistence.storageClass=local-storage \ + --set minio.persistence.storageClass=local-storage +``` + +--- + +## GitLab CI/CD + +### Setup + +1. **Configure GitLab variables:** + +Go to Settings → CI/CD → Variables and add: + +| Variable | Description | Protected | Masked | +|----------|-------------|-----------|---------| +| `CI_REGISTRY_USER` | Docker registry username | No | No | +| `CI_REGISTRY_PASSWORD` | Docker registry password | No | Yes | +| `KUBE_CONFIG_DEV` | Base64 kubeconfig for dev | No | Yes | +| `KUBE_CONFIG_STAGING` | Base64 kubeconfig for staging | Yes | Yes | +| `KUBE_CONFIG_PROD` | Base64 kubeconfig for prod | Yes | Yes | + +2. **Encode kubeconfig:** +```bash +cat ~/.kube/config | base64 -w 0 +``` + +### Pipeline Stages + +1. **Test**: Runs on all branches and MRs +2. **Build**: Builds Docker image on main/develop/tags +3. 
**Deploy**: Manual deployment to dev/staging/prod + +### Deployment Flow + +**Development:** +```bash +git push origin develop +# Manually trigger deploy:dev job in GitLab +``` + +**Staging:** +```bash +git push origin main +# Manually trigger deploy:staging job in GitLab +``` + +**Production:** +```bash +git tag v1.0.0 +git push origin v1.0.0 +# Manually trigger deploy:prod job in GitLab +``` + +### Customizing Pipeline + +Edit `.gitlab-ci.yml` to: +- Add more test stages +- Change deployment namespaces +- Adjust Helm values per environment +- Add security scanning +- Configure rollback procedures + +--- + +## Monitoring + +### Health Checks + +```bash +# Kubernetes +kubectl get pods -n datalake +kubectl logs -f -n datalake deployment/datalake + +# Direct +curl http://localhost:8000/health +``` + +### Metrics + +Add Prometheus monitoring: +```bash +helm install datalake ./helm \ + --set metrics.enabled=true \ + --set serviceMonitor.enabled=true +``` + +--- + +## Backup and Recovery + +### Database Backup + +```bash +# PostgreSQL +kubectl exec -n datalake deployment/datalake-postgresql -- \ + pg_dump -U user datalake > backup.sql + +# Restore +kubectl exec -i -n datalake deployment/datalake-postgresql -- \ + psql -U user datalake < backup.sql +``` + +### Storage Backup + +**S3:** +```bash +aws s3 sync s3://your-bucket s3://backup-bucket +``` + +**MinIO:** +```bash +mc mirror minio/test-artifacts backup/test-artifacts +``` + +--- + +## Troubleshooting + +### Pod Not Starting +```bash +kubectl describe pod -n datalake +kubectl logs -n datalake +``` + +### Database Connection Issues +```bash +kubectl exec -it -n datalake deployment/datalake -- \ + psql $DATABASE_URL +``` + +### Storage Issues +```bash +# Check MinIO +kubectl port-forward -n datalake svc/minio 9000:9000 +# Access http://localhost:9000 +``` + +--- + +## Security Considerations + +1. **Use secrets management:** + - Kubernetes Secrets + - AWS Secrets Manager + - HashiCorp Vault + +2. **Enable TLS:** + - Configure ingress with TLS certificates + - Use cert-manager for automatic certificates + +3. **Network policies:** + - Restrict pod-to-pod communication + - Limit external access + +4. **RBAC:** + - Configure Kubernetes RBAC + - Limit service account permissions + +--- + +## Performance Tuning + +### Database +- Increase connection pool size +- Add database indexes +- Configure autovacuum + +### API +- Increase replica count +- Configure horizontal pod autoscaling +- Adjust resource requests/limits + +### Storage +- Use CDN for frequently accessed files +- Configure S3 Transfer Acceleration +- Optimize MinIO deployment diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2b3de52 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,32 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + postgresql-client \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements and install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY app/ ./app/ +COPY alembic/ ./alembic/ +COPY alembic.ini . 
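+# NOTE: the COPY above assumes an alembic/ migration environment (env.py, versions/) exists in the build context.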
+ +# Create non-root user +RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app +USER appuser + +# Expose port +EXPOSE 8000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD python -c "import requests; requests.get('http://localhost:8000/health')" + +# Run the application +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4e856d0 --- /dev/null +++ b/Makefile @@ -0,0 +1,66 @@ +.PHONY: help install dev test lint format docker-build docker-up docker-down deploy clean + +help: + @echo "Available commands:" + @echo " make install - Install Python dependencies" + @echo " make dev - Run development server" + @echo " make test - Run tests" + @echo " make lint - Run linters" + @echo " make format - Format code" + @echo " make docker-build - Build Docker image" + @echo " make docker-up - Start Docker Compose services" + @echo " make docker-down - Stop Docker Compose services" + @echo " make deploy - Deploy with Helm" + @echo " make clean - Clean temporary files" + +install: + pip install -r requirements.txt + +dev: + python -m uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 + +test: + pytest tests/ -v + +lint: + flake8 app/ --max-line-length=120 --ignore=E203,W503 + black --check app/ + +format: + black app/ + isort app/ + +docker-build: + docker build -t datalake:latest . + +docker-up: + docker-compose up -d + +docker-down: + docker-compose down + +docker-logs: + docker-compose logs -f api + +deploy: + helm upgrade --install datalake ./helm \ + --namespace datalake \ + --create-namespace + +deploy-dev: + helm upgrade --install datalake-dev ./helm \ + --namespace datalake-dev \ + --create-namespace \ + --set ingress.enabled=true + +clean: + find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true + find . -type f -name "*.pyc" -delete + find . -type f -name "*.pyo" -delete + find . -type f -name "*.log" -delete + rm -rf .pytest_cache + rm -rf .coverage + rm -rf htmlcov + rm -rf dist + rm -rf build + rm -rf *.egg-info diff --git a/README.md b/README.md new file mode 100644 index 0000000..ca07426 --- /dev/null +++ b/README.md @@ -0,0 +1,298 @@ +# Test Artifact Data Lake + +A lightweight, cloud-native API for storing and querying test artifacts including CSV files, JSON files, binary files, and packet captures (PCAP). Built with FastAPI and supports both AWS S3 and self-hosted MinIO storage backends. 
+ +## Features + +- **Multi-format Support**: Store CSV, JSON, binary files, and PCAP files +- **Flexible Storage**: Switch between AWS S3 and self-hosted MinIO +- **Rich Metadata**: Track test configurations, results, and custom metadata +- **Powerful Querying**: Query artifacts by test name, suite, result, tags, date ranges, and more +- **RESTful API**: Clean REST API with automatic OpenAPI documentation +- **Cloud-Native**: Fully containerized with Docker and Kubernetes/Helm support +- **Production-Ready**: Includes GitLab CI/CD pipeline for automated deployments + +## Architecture + +``` +┌─────────────┐ +│ FastAPI │ ← REST API +│ Backend │ +└──────┬──────┘ + │ + ├─────────┐ + ↓ ↓ +┌──────────┐ ┌────────────┐ +│PostgreSQL│ │ S3/MinIO │ +│(Metadata)│ │ (Blobs) │ +└──────────┘ └────────────┘ +``` + +- **PostgreSQL**: Stores artifact metadata, test configs, and query indexes +- **S3/MinIO**: Stores actual file contents (blob storage) +- **FastAPI**: Async REST API for uploads, downloads, and queries + +## Quick Start + +### Using Docker Compose (Recommended) + +1. Clone the repository: +```bash +git clone +cd datalake +``` + +2. Copy environment configuration: +```bash +cp .env.example .env +``` + +3. Start all services: +```bash +docker-compose up -d +``` + +4. Access the API: +- API: http://localhost:8000 +- API Docs: http://localhost:8000/docs +- MinIO Console: http://localhost:9001 + +### Using Python Directly + +1. Install dependencies: +```bash +pip install -r requirements.txt +``` + +2. Set up PostgreSQL and MinIO/S3 + +3. Configure environment variables in `.env` + +4. Run the application: +```bash +python -m uvicorn app.main:app --reload +``` + +## API Usage + +### Upload an Artifact + +```bash +curl -X POST "http://localhost:8000/api/v1/artifacts/upload" \ + -F "file=@test_results.csv" \ + -F "test_name=auth_test" \ + -F "test_suite=integration" \ + -F "test_result=pass" \ + -F 'test_config={"browser":"chrome","timeout":30}' \ + -F 'tags=["regression","smoke"]' \ + -F "description=Authentication test results" +``` + +### Query Artifacts + +```bash +curl -X POST "http://localhost:8000/api/v1/artifacts/query" \ + -H "Content-Type: application/json" \ + -d '{ + "test_suite": "integration", + "test_result": "fail", + "start_date": "2024-01-01T00:00:00", + "limit": 50 + }' +``` + +### Download an Artifact + +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/123/download" \ + -o downloaded_file.csv +``` + +### Get Presigned URL + +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/123/url?expiration=3600" +``` + +### List All Artifacts + +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/?limit=100&offset=0" +``` + +### Delete an Artifact + +```bash +curl -X DELETE "http://localhost:8000/api/v1/artifacts/123" +``` + +## API Endpoints + +| Method | Endpoint | Description | +|--------|----------|-------------| +| POST | `/api/v1/artifacts/upload` | Upload a new artifact with metadata | +| GET | `/api/v1/artifacts/{id}` | Get artifact metadata by ID | +| GET | `/api/v1/artifacts/{id}/download` | Download artifact file | +| GET | `/api/v1/artifacts/{id}/url` | Get presigned download URL | +| DELETE | `/api/v1/artifacts/{id}` | Delete artifact and file | +| POST | `/api/v1/artifacts/query` | Query artifacts with filters | +| GET | `/api/v1/artifacts/` | List all artifacts (paginated) | +| GET | `/` | API information | +| GET | `/health` | Health check | +| GET | `/docs` | Interactive API documentation | + +## Configuration + +### Environment Variables + +| 
Variable | Description | Default | +|----------|-------------|---------| +| `DATABASE_URL` | PostgreSQL connection string | `postgresql://user:password@localhost:5432/datalake` | +| `STORAGE_BACKEND` | Storage backend (`s3` or `minio`) | `minio` | +| `AWS_ACCESS_KEY_ID` | AWS access key (for S3) | - | +| `AWS_SECRET_ACCESS_KEY` | AWS secret key (for S3) | - | +| `AWS_REGION` | AWS region (for S3) | `us-east-1` | +| `S3_BUCKET_NAME` | S3 bucket name | `test-artifacts` | +| `MINIO_ENDPOINT` | MinIO endpoint | `localhost:9000` | +| `MINIO_ACCESS_KEY` | MinIO access key | `minioadmin` | +| `MINIO_SECRET_KEY` | MinIO secret key | `minioadmin` | +| `MINIO_BUCKET_NAME` | MinIO bucket name | `test-artifacts` | +| `MINIO_SECURE` | Use HTTPS for MinIO | `false` | +| `API_HOST` | API host | `0.0.0.0` | +| `API_PORT` | API port | `8000` | +| `MAX_UPLOAD_SIZE` | Max upload size (bytes) | `524288000` (500MB) | + +### Switching Between S3 and MinIO + +To use AWS S3: +```bash +STORAGE_BACKEND=s3 +AWS_ACCESS_KEY_ID=your_key +AWS_SECRET_ACCESS_KEY=your_secret +AWS_REGION=us-east-1 +S3_BUCKET_NAME=your-bucket +``` + +To use self-hosted MinIO: +```bash +STORAGE_BACKEND=minio +MINIO_ENDPOINT=minio:9000 +MINIO_ACCESS_KEY=minioadmin +MINIO_SECRET_KEY=minioadmin +MINIO_BUCKET_NAME=test-artifacts +``` + +## Deployment + +### Kubernetes with Helm + +1. Build and push Docker image: +```bash +docker build -t your-registry/datalake:latest . +docker push your-registry/datalake:latest +``` + +2. Install with Helm: +```bash +helm install datalake ./helm \ + --set image.repository=your-registry/datalake \ + --set image.tag=latest \ + --namespace datalake \ + --create-namespace +``` + +3. Access the API: +```bash +kubectl port-forward -n datalake svc/datalake 8000:8000 +``` + +### Helm Configuration + +Edit `helm/values.yaml` to customize: +- Replica count +- Resource limits +- Storage backend (S3 vs MinIO) +- Ingress settings +- PostgreSQL settings +- Autoscaling + +### GitLab CI/CD + +The included `.gitlab-ci.yml` provides: +- Automated testing +- Linting +- Docker image builds +- Deployments to dev/staging/prod + +**Required GitLab CI/CD Variables:** +- `CI_REGISTRY_USER`: Docker registry username +- `CI_REGISTRY_PASSWORD`: Docker registry password +- `KUBE_CONFIG_DEV`: Base64-encoded kubeconfig for dev +- `KUBE_CONFIG_STAGING`: Base64-encoded kubeconfig for staging +- `KUBE_CONFIG_PROD`: Base64-encoded kubeconfig for prod + +## Database Schema + +The `artifacts` table stores: +- File metadata (name, type, size, storage path) +- Test information (name, suite, config, result) +- Custom metadata and tags +- Timestamps and versioning + +## Example Use Cases + +### Store Test Results +Upload CSV files containing test execution results with metadata about the test suite and configuration. + +### Archive Packet Captures +Store PCAP files from network tests with tags for easy filtering and retrieval. + +### Track Test Configurations +Upload JSON test configurations and query them by date, test suite, or custom tags. + +### Binary Artifact Storage +Store compiled binaries, test data files, or any binary artifacts with full metadata. 
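+As a concrete end-to-end example of these use cases, archiving a capture from a network test and finding it again later might look like this (the filename, suite, and tags are illustrative):
+
+```bash
+# Upload the capture with searchable metadata
+curl -X POST "http://localhost:8000/api/v1/artifacts/upload" \
+  -F "file=@capture.pcap" \
+  -F "test_name=throughput_test" \
+  -F "test_suite=network" \
+  -F "test_result=pass" \
+  -F 'tags=["pcap","nightly"]'
+
+# Later, list every PCAP produced by that suite
+curl -X POST "http://localhost:8000/api/v1/artifacts/query" \
+  -H "Content-Type: application/json" \
+  -d '{"test_suite": "network", "file_type": "pcap", "limit": 20}'
+```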
+ +## Development + +### Running Tests +```bash +pytest tests/ -v +``` + +### Code Formatting +```bash +black app/ +flake8 app/ +``` + +### Database Migrations +```bash +alembic revision --autogenerate -m "description" +alembic upgrade head +``` + +## Troubleshooting + +### Cannot Connect to Database +- Verify PostgreSQL is running +- Check `DATABASE_URL` is correct +- Ensure database exists + +### Cannot Upload Files +- Check storage backend is running (MinIO or S3 accessible) +- Verify credentials are correct +- Check file size is under `MAX_UPLOAD_SIZE` + +### MinIO Connection Failed +- Ensure MinIO service is running +- Verify `MINIO_ENDPOINT` is correct +- Check MinIO credentials + +## License + +[Your License Here] + +## Support + +For issues and questions, please open an issue in the repository. diff --git a/alembic.ini b/alembic.ini new file mode 100644 index 0000000..8b22847 --- /dev/null +++ b/alembic.ini @@ -0,0 +1,41 @@ +[alembic] +script_location = alembic +prepend_sys_path = . +version_path_separator = os + +[alembic:exclude] +tables = spatial_ref_sys + +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/__init__.py b/app/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/artifacts.py b/app/api/artifacts.py new file mode 100644 index 0000000..db49811 --- /dev/null +++ b/app/api/artifacts.py @@ -0,0 +1,242 @@ +from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException, Query +from fastapi.responses import StreamingResponse +from sqlalchemy.orm import Session +from typing import List, Optional +import uuid +import json +import io +from datetime import datetime + +from app.database import get_db +from app.models.artifact import Artifact +from app.schemas.artifact import ArtifactCreate, ArtifactResponse, ArtifactQuery +from app.storage import get_storage_backend + +router = APIRouter(prefix="/api/v1/artifacts", tags=["artifacts"]) + + +def get_file_type(filename: str) -> str: + """Determine file type from filename""" + extension = filename.lower().split('.')[-1] + type_mapping = { + 'csv': 'csv', + 'json': 'json', + 'pcap': 'pcap', + 'pcapng': 'pcap', + 'bin': 'binary', + 'dat': 'binary', + } + return type_mapping.get(extension, 'binary') + + +@router.post("/upload", response_model=ArtifactResponse, status_code=201) +async def upload_artifact( + file: UploadFile = File(...), + test_name: Optional[str] = Form(None), + test_suite: Optional[str] = Form(None), + test_config: Optional[str] = Form(None), + test_result: Optional[str] = Form(None), + metadata: Optional[str] = Form(None), + description: Optional[str] = Form(None), + tags: Optional[str] = Form(None), + version: Optional[str] = Form(None), + parent_id: Optional[int] = Form(None), + db: Session = Depends(get_db) +): + """ + Upload a new artifact file with metadata + + - **file**: The file to upload (CSV, JSON, binary, PCAP) + - **test_name**: Name of the test + - **test_suite**: Test suite identifier + - 
**test_config**: JSON string of test configuration + - **test_result**: Test result (pass, fail, skip, error) + - **metadata**: JSON string of additional metadata + - **description**: Text description of the artifact + - **tags**: JSON array of tags (as string) + - **version**: Version identifier + - **parent_id**: ID of parent artifact (for versioning) + """ + try: + # Parse JSON fields + test_config_dict = json.loads(test_config) if test_config else None + metadata_dict = json.loads(metadata) if metadata else None + tags_list = json.loads(tags) if tags else None + + # Generate unique storage path + file_extension = file.filename.split('.')[-1] if '.' in file.filename else '' + object_name = f"{uuid.uuid4()}.{file_extension}" if file_extension else str(uuid.uuid4()) + + # Upload to storage backend + storage = get_storage_backend() + file_content = await file.read() + file_size = len(file_content) + + storage_path = await storage.upload_file( + io.BytesIO(file_content), + object_name + ) + + # Create database record + artifact = Artifact( + filename=file.filename, + file_type=get_file_type(file.filename), + file_size=file_size, + storage_path=storage_path, + content_type=file.content_type, + test_name=test_name, + test_suite=test_suite, + test_config=test_config_dict, + test_result=test_result, + metadata=metadata_dict, + description=description, + tags=tags_list, + version=version, + parent_id=parent_id + ) + + db.add(artifact) + db.commit() + db.refresh(artifact) + + return artifact + + except json.JSONDecodeError as e: + raise HTTPException(status_code=400, detail=f"Invalid JSON in metadata fields: {str(e)}") + except Exception as e: + db.rollback() + raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}") + + +@router.get("/{artifact_id}", response_model=ArtifactResponse) +async def get_artifact(artifact_id: int, db: Session = Depends(get_db)): + """Get artifact metadata by ID""" + artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first() + if not artifact: + raise HTTPException(status_code=404, detail="Artifact not found") + return artifact + + +@router.get("/{artifact_id}/download") +async def download_artifact(artifact_id: int, db: Session = Depends(get_db)): + """Download artifact file by ID""" + artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first() + if not artifact: + raise HTTPException(status_code=404, detail="Artifact not found") + + try: + storage = get_storage_backend() + # Extract object name from storage path + object_name = artifact.storage_path.split('/')[-1] + file_data = await storage.download_file(object_name) + + return StreamingResponse( + io.BytesIO(file_data), + media_type=artifact.content_type or "application/octet-stream", + headers={ + "Content-Disposition": f'attachment; filename="{artifact.filename}"' + } + ) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Download failed: {str(e)}") + + +@router.get("/{artifact_id}/url") +async def get_artifact_url( + artifact_id: int, + expiration: int = Query(default=3600, ge=60, le=86400), + db: Session = Depends(get_db) +): + """Get presigned URL for artifact download""" + artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first() + if not artifact: + raise HTTPException(status_code=404, detail="Artifact not found") + + try: + storage = get_storage_backend() + object_name = artifact.storage_path.split('/')[-1] + url = await storage.get_file_url(object_name, expiration) + return {"url": url, "expires_in": expiration} + except 
Exception as e: + raise HTTPException(status_code=500, detail=f"Failed to generate URL: {str(e)}") + + +@router.delete("/{artifact_id}") +async def delete_artifact(artifact_id: int, db: Session = Depends(get_db)): + """Delete artifact and its file""" + artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first() + if not artifact: + raise HTTPException(status_code=404, detail="Artifact not found") + + try: + # Delete from storage + storage = get_storage_backend() + object_name = artifact.storage_path.split('/')[-1] + await storage.delete_file(object_name) + + # Delete from database + db.delete(artifact) + db.commit() + + return {"message": "Artifact deleted successfully"} + except Exception as e: + db.rollback() + raise HTTPException(status_code=500, detail=f"Delete failed: {str(e)}") + + +@router.post("/query", response_model=List[ArtifactResponse]) +async def query_artifacts(query: ArtifactQuery, db: Session = Depends(get_db)): + """ + Query artifacts with filters + + - **filename**: Filter by filename (partial match) + - **file_type**: Filter by file type + - **test_name**: Filter by test name + - **test_suite**: Filter by test suite + - **test_result**: Filter by test result + - **tags**: Filter by tags (must contain all specified tags) + - **start_date**: Filter by creation date (from) + - **end_date**: Filter by creation date (to) + - **limit**: Maximum number of results + - **offset**: Number of results to skip + """ + q = db.query(Artifact) + + if query.filename: + q = q.filter(Artifact.filename.ilike(f"%{query.filename}%")) + if query.file_type: + q = q.filter(Artifact.file_type == query.file_type) + if query.test_name: + q = q.filter(Artifact.test_name.ilike(f"%{query.test_name}%")) + if query.test_suite: + q = q.filter(Artifact.test_suite == query.test_suite) + if query.test_result: + q = q.filter(Artifact.test_result == query.test_result) + if query.tags: + for tag in query.tags: + q = q.filter(Artifact.tags.contains([tag])) + if query.start_date: + q = q.filter(Artifact.created_at >= query.start_date) + if query.end_date: + q = q.filter(Artifact.created_at <= query.end_date) + + # Order by creation date descending + q = q.order_by(Artifact.created_at.desc()) + + # Apply pagination + artifacts = q.offset(query.offset).limit(query.limit).all() + + return artifacts + + +@router.get("/", response_model=List[ArtifactResponse]) +async def list_artifacts( + limit: int = Query(default=100, le=1000), + offset: int = Query(default=0, ge=0), + db: Session = Depends(get_db) +): + """List all artifacts with pagination""" + artifacts = db.query(Artifact).order_by( + Artifact.created_at.desc() + ).offset(offset).limit(limit).all() + return artifacts diff --git a/app/config.py b/app/config.py new file mode 100644 index 0000000..243cb0f --- /dev/null +++ b/app/config.py @@ -0,0 +1,35 @@ +from pydantic_settings import BaseSettings +from typing import Literal + + +class Settings(BaseSettings): + # Database + database_url: str = "postgresql://user:password@localhost:5432/datalake" + + # Storage Backend + storage_backend: Literal["s3", "minio"] = "minio" + + # AWS S3 + aws_access_key_id: str = "" + aws_secret_access_key: str = "" + aws_region: str = "us-east-1" + s3_bucket_name: str = "test-artifacts" + + # MinIO + minio_endpoint: str = "localhost:9000" + minio_access_key: str = "minioadmin" + minio_secret_key: str = "minioadmin" + minio_bucket_name: str = "test-artifacts" + minio_secure: bool = False + + # Application + api_host: str = "0.0.0.0" + api_port: int = 8000 + 
max_upload_size: int = 524288000 # 500MB + + class Config: + env_file = ".env" + case_sensitive = False + + +settings = Settings() diff --git a/app/database.py b/app/database.py new file mode 100644 index 0000000..b53ae14 --- /dev/null +++ b/app/database.py @@ -0,0 +1,21 @@ +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from app.config import settings +from app.models.artifact import Base + +engine = create_engine(settings.database_url) +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + + +def init_db(): + """Initialize database tables""" + Base.metadata.create_all(bind=engine) + + +def get_db(): + """Dependency for getting database session""" + db = SessionLocal() + try: + yield db + finally: + db.close() diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000..e15a340 --- /dev/null +++ b/app/main.py @@ -0,0 +1,71 @@ +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from app.api.artifacts import router as artifacts_router +from app.database import init_db +from app.config import settings +import logging + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) + +logger = logging.getLogger(__name__) + +# Create FastAPI app +app = FastAPI( + title="Test Artifact Data Lake", + description="API for storing and querying test artifacts including CSV, JSON, binary files, and packet captures", + version="1.0.0", + docs_url="/docs", + redoc_url="/redoc" +) + +# Configure CORS +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Include routers +app.include_router(artifacts_router) + + +@app.on_event("startup") +async def startup_event(): + """Initialize database on startup""" + logger.info("Initializing database...") + init_db() + logger.info(f"Using storage backend: {settings.storage_backend}") + logger.info("Application started successfully") + + +@app.get("/") +async def root(): + """Root endpoint""" + return { + "message": "Test Artifact Data Lake API", + "version": "1.0.0", + "docs": "/docs", + "storage_backend": settings.storage_backend + } + + +@app.get("/health") +async def health_check(): + """Health check endpoint""" + return {"status": "healthy"} + + +if __name__ == "__main__": + import uvicorn + uvicorn.run( + "app.main:app", + host=settings.api_host, + port=settings.api_port, + reload=True + ) diff --git a/app/models/__init__.py b/app/models/__init__.py new file mode 100644 index 0000000..8e5cb1a --- /dev/null +++ b/app/models/__init__.py @@ -0,0 +1,3 @@ +from .artifact import Artifact + +__all__ = ["Artifact"] diff --git a/app/models/artifact.py b/app/models/artifact.py new file mode 100644 index 0000000..caa31c5 --- /dev/null +++ b/app/models/artifact.py @@ -0,0 +1,38 @@ +from sqlalchemy import Column, String, Integer, DateTime, JSON, BigInteger, Text +from sqlalchemy.ext.declarative import declarative_base +from datetime import datetime + +Base = declarative_base() + + +class Artifact(Base): + __tablename__ = "artifacts" + + id = Column(Integer, primary_key=True, index=True) + filename = Column(String(500), nullable=False, index=True) + file_type = Column(String(50), nullable=False, index=True) # csv, json, binary, pcap + file_size = Column(BigInteger, nullable=False) + storage_path = Column(String(1000), nullable=False) + content_type = Column(String(100)) + + # Test metadata + test_name = Column(String(500), 
index=True) + test_suite = Column(String(500), index=True) + test_config = Column(JSON) + test_result = Column(String(50), index=True) # pass, fail, skip, error + + # Additional metadata + metadata = Column(JSON) + description = Column(Text) + tags = Column(JSON) # Array of tags for categorization + + # Timestamps + created_at = Column(DateTime, default=datetime.utcnow, index=True) + updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) + + # Versioning + version = Column(String(50)) + parent_id = Column(Integer, index=True) # For file versioning + + def __repr__(self): + return f"" diff --git a/app/schemas/__init__.py b/app/schemas/__init__.py new file mode 100644 index 0000000..f6e77b9 --- /dev/null +++ b/app/schemas/__init__.py @@ -0,0 +1,3 @@ +from .artifact import ArtifactCreate, ArtifactResponse, ArtifactQuery + +__all__ = ["ArtifactCreate", "ArtifactResponse", "ArtifactQuery"] diff --git a/app/schemas/artifact.py b/app/schemas/artifact.py new file mode 100644 index 0000000..fbce37b --- /dev/null +++ b/app/schemas/artifact.py @@ -0,0 +1,51 @@ +from pydantic import BaseModel, Field +from typing import Optional, Dict, Any, List +from datetime import datetime + + +class ArtifactCreate(BaseModel): + test_name: Optional[str] = None + test_suite: Optional[str] = None + test_config: Optional[Dict[str, Any]] = None + test_result: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None + description: Optional[str] = None + tags: Optional[List[str]] = None + version: Optional[str] = None + parent_id: Optional[int] = None + + +class ArtifactResponse(BaseModel): + id: int + filename: str + file_type: str + file_size: int + storage_path: str + content_type: Optional[str] = None + test_name: Optional[str] = None + test_suite: Optional[str] = None + test_config: Optional[Dict[str, Any]] = None + test_result: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None + description: Optional[str] = None + tags: Optional[List[str]] = None + created_at: datetime + updated_at: datetime + version: Optional[str] = None + parent_id: Optional[int] = None + + class Config: + from_attributes = True + + +class ArtifactQuery(BaseModel): + filename: Optional[str] = None + file_type: Optional[str] = None + test_name: Optional[str] = None + test_suite: Optional[str] = None + test_result: Optional[str] = None + tags: Optional[List[str]] = None + start_date: Optional[datetime] = None + end_date: Optional[datetime] = None + limit: int = Field(default=100, le=1000) + offset: int = Field(default=0, ge=0) diff --git a/app/storage/__init__.py b/app/storage/__init__.py new file mode 100644 index 0000000..6a50500 --- /dev/null +++ b/app/storage/__init__.py @@ -0,0 +1,6 @@ +from .base import StorageBackend +from .s3_backend import S3Backend +from .minio_backend import MinIOBackend +from .factory import get_storage_backend + +__all__ = ["StorageBackend", "S3Backend", "MinIOBackend", "get_storage_backend"] diff --git a/app/storage/base.py b/app/storage/base.py new file mode 100644 index 0000000..f199760 --- /dev/null +++ b/app/storage/base.py @@ -0,0 +1,73 @@ +from abc import ABC, abstractmethod +from typing import BinaryIO + + +class StorageBackend(ABC): + """Abstract base class for storage backends""" + + @abstractmethod + async def upload_file(self, file_data: BinaryIO, object_name: str) -> str: + """ + Upload a file to storage + + Args: + file_data: Binary file data + object_name: Name/path of the object in storage + + Returns: + Storage path/URL of uploaded file + """ + pass + + 
@abstractmethod + async def download_file(self, object_name: str) -> bytes: + """ + Download a file from storage + + Args: + object_name: Name/path of the object in storage + + Returns: + Binary file data + """ + pass + + @abstractmethod + async def delete_file(self, object_name: str) -> bool: + """ + Delete a file from storage + + Args: + object_name: Name/path of the object in storage + + Returns: + True if successful + """ + pass + + @abstractmethod + async def file_exists(self, object_name: str) -> bool: + """ + Check if a file exists in storage + + Args: + object_name: Name/path of the object in storage + + Returns: + True if file exists + """ + pass + + @abstractmethod + async def get_file_url(self, object_name: str, expiration: int = 3600) -> str: + """ + Get a presigned URL for downloading a file + + Args: + object_name: Name/path of the object in storage + expiration: URL expiration time in seconds + + Returns: + Presigned URL + """ + pass diff --git a/app/storage/factory.py b/app/storage/factory.py new file mode 100644 index 0000000..adb99a3 --- /dev/null +++ b/app/storage/factory.py @@ -0,0 +1,17 @@ +from app.storage.base import StorageBackend +from app.storage.s3_backend import S3Backend +from app.storage.minio_backend import MinIOBackend +from app.config import settings + + +def get_storage_backend() -> StorageBackend: + """ + Factory function to get the appropriate storage backend + based on configuration + """ + if settings.storage_backend == "s3": + return S3Backend() + elif settings.storage_backend == "minio": + return MinIOBackend() + else: + raise ValueError(f"Unsupported storage backend: {settings.storage_backend}") diff --git a/app/storage/minio_backend.py b/app/storage/minio_backend.py new file mode 100644 index 0000000..d1b2d29 --- /dev/null +++ b/app/storage/minio_backend.py @@ -0,0 +1,88 @@ +import boto3 +from botocore.exceptions import ClientError +from botocore.client import Config +from typing import BinaryIO +from app.storage.base import StorageBackend +from app.config import settings +import logging + +logger = logging.getLogger(__name__) + + +class MinIOBackend(StorageBackend): + """MinIO storage backend implementation (S3-compatible)""" + + def __init__(self): + # MinIO uses S3-compatible API + self.s3_client = boto3.client( + 's3', + endpoint_url=f"{'https' if settings.minio_secure else 'http'}://{settings.minio_endpoint}", + aws_access_key_id=settings.minio_access_key, + aws_secret_access_key=settings.minio_secret_key, + config=Config(signature_version='s3v4'), + region_name='us-east-1' + ) + self.bucket_name = settings.minio_bucket_name + self._ensure_bucket_exists() + + def _ensure_bucket_exists(self): + """Create bucket if it doesn't exist""" + try: + self.s3_client.head_bucket(Bucket=self.bucket_name) + except ClientError as e: + error_code = e.response['Error']['Code'] + if error_code == '404': + try: + self.s3_client.create_bucket(Bucket=self.bucket_name) + logger.info(f"Created MinIO bucket: {self.bucket_name}") + except ClientError as create_error: + logger.error(f"Failed to create bucket: {create_error}") + raise + + async def upload_file(self, file_data: BinaryIO, object_name: str) -> str: + """Upload file to MinIO""" + try: + self.s3_client.upload_fileobj(file_data, self.bucket_name, object_name) + return f"minio://{self.bucket_name}/{object_name}" + except ClientError as e: + logger.error(f"Failed to upload file to MinIO: {e}") + raise + + async def download_file(self, object_name: str) -> bytes: + """Download file from MinIO""" + try: + 
response = self.s3_client.get_object(Bucket=self.bucket_name, Key=object_name) + return response['Body'].read() + except ClientError as e: + logger.error(f"Failed to download file from MinIO: {e}") + raise + + async def delete_file(self, object_name: str) -> bool: + """Delete file from MinIO""" + try: + self.s3_client.delete_object(Bucket=self.bucket_name, Key=object_name) + return True + except ClientError as e: + logger.error(f"Failed to delete file from MinIO: {e}") + return False + + async def file_exists(self, object_name: str) -> bool: + """Check if file exists in MinIO""" + try: + self.s3_client.head_object(Bucket=self.bucket_name, Key=object_name) + return True + except ClientError: + return False + + async def get_file_url(self, object_name: str, expiration: int = 3600) -> str: + """Generate presigned URL for MinIO object""" + try: + url = self.s3_client.generate_presigned_url( + 'get_object', + Params={'Bucket': self.bucket_name, 'Key': object_name}, + ExpiresIn=expiration + ) + return url + except ClientError as e: + logger.error(f"Failed to generate presigned URL: {e}") + raise diff --git a/app/storage/s3_backend.py b/app/storage/s3_backend.py new file mode 100644 index 0000000..066954a --- /dev/null +++ b/app/storage/s3_backend.py @@ -0,0 +1,87 @@ +import boto3 +from botocore.exceptions import ClientError +from typing import BinaryIO +from app.storage.base import StorageBackend +from app.config import settings +import logging + +logger = logging.getLogger(__name__) + + +class S3Backend(StorageBackend): + """AWS S3 storage backend implementation""" + + def __init__(self): + self.s3_client = boto3.client( + 's3', + aws_access_key_id=settings.aws_access_key_id, + aws_secret_access_key=settings.aws_secret_access_key, + region_name=settings.aws_region + ) + self.bucket_name = settings.s3_bucket_name + self._ensure_bucket_exists() + + def _ensure_bucket_exists(self): + """Create bucket if it doesn't exist""" + try: + self.s3_client.head_bucket(Bucket=self.bucket_name) + except ClientError as e: + error_code = e.response['Error']['Code'] + if error_code == '404': + try: + self.s3_client.create_bucket( + Bucket=self.bucket_name, + CreateBucketConfiguration={'LocationConstraint': settings.aws_region} + ) + logger.info(f"Created S3 bucket: {self.bucket_name}") + except ClientError as create_error: + logger.error(f"Failed to create bucket: {create_error}") + raise + + async def upload_file(self, file_data: BinaryIO, object_name: str) -> str: + """Upload file to S3""" + try: + self.s3_client.upload_fileobj(file_data, self.bucket_name, object_name) + return f"s3://{self.bucket_name}/{object_name}" + except ClientError as e: + logger.error(f"Failed to upload file to S3: {e}") + raise + + async def download_file(self, object_name: str) -> bytes: + """Download file from S3""" + try: + response = self.s3_client.get_object(Bucket=self.bucket_name, Key=object_name) + return response['Body'].read() + except ClientError as e: + logger.error(f"Failed to download file from S3: {e}") + raise + + async def delete_file(self, object_name: str) -> bool: + """Delete file from S3""" + try: + self.s3_client.delete_object(Bucket=self.bucket_name, Key=object_name) + return True + except ClientError as e: + logger.error(f"Failed to delete file from S3: {e}") + return False + + async def file_exists(self, object_name: str) -> bool: + """Check if file exists in S3""" + try: + self.s3_client.head_object(Bucket=self.bucket_name, Key=object_name) + return True + except ClientError: + return False + + async def 
get_file_url(self, object_name: str, expiration: int = 3600) -> str: + """Generate presigned URL for S3 object""" + try: + url = self.s3_client.generate_presigned_url( + 'get_object', + Params={'Bucket': self.bucket_name, 'Key': object_name}, + ExpiresIn=expiration + ) + return url + except ClientError as e: + logger.error(f"Failed to generate presigned URL: {e}") + raise diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..4807e7d --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,62 @@ +version: '3.8' + +services: + postgres: + image: postgres:15 + environment: + POSTGRES_USER: user + POSTGRES_PASSWORD: password + POSTGRES_DB: datalake + ports: + - "5432:5432" + volumes: + - postgres_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U user"] + interval: 10s + timeout: 5s + retries: 5 + + minio: + image: minio/minio:latest + command: server /data --console-address ":9001" + environment: + MINIO_ROOT_USER: minioadmin + MINIO_ROOT_PASSWORD: minioadmin + ports: + - "9000:9000" + - "9001:9001" + volumes: + - minio_data:/data + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 10s + timeout: 5s + retries: 5 + + api: + build: . + ports: + - "8000:8000" + environment: + DATABASE_URL: postgresql://user:password@postgres:5432/datalake + STORAGE_BACKEND: minio + MINIO_ENDPOINT: minio:9000 + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + MINIO_BUCKET_NAME: test-artifacts + MINIO_SECURE: "false" + depends_on: + postgres: + condition: service_healthy + minio: + condition: service_healthy + healthcheck: + test: ["CMD", "python", "-c", "import requests; requests.get('http://localhost:8000/health')"] + interval: 30s + timeout: 10s + retries: 3 + +volumes: + postgres_data: + minio_data: diff --git a/helm/Chart.yaml b/helm/Chart.yaml new file mode 100644 index 0000000..ce650b4 --- /dev/null +++ b/helm/Chart.yaml @@ -0,0 +1,13 @@ +apiVersion: v2 +name: datalake +description: Test Artifact Data Lake - Store and query test artifacts +type: application +version: 1.0.0 +appVersion: "1.0.0" +keywords: + - testing + - artifacts + - storage + - data-lake +maintainers: + - name: Your Team diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl new file mode 100644 index 0000000..0ebb04f --- /dev/null +++ b/helm/templates/_helpers.tpl @@ -0,0 +1,60 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "datalake.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +*/}} +{{- define "datalake.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "datalake.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "datalake.labels" -}} +helm.sh/chart: {{ include "datalake.chart" . }} +{{ include "datalake.selectorLabels" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "datalake.selectorLabels" -}} +app.kubernetes.io/name: {{ include "datalake.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "datalake.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "datalake.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/helm/templates/deployment.yaml b/helm/templates/deployment.yaml new file mode 100644 index 0000000..652d855 --- /dev/null +++ b/helm/templates/deployment.yaml @@ -0,0 +1,111 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "datalake.fullname" . }} + labels: + {{- include "datalake.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "datalake.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "datalake.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "datalake.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: http + containerPort: {{ .Values.service.targetPort }} + protocol: TCP + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + env: + - name: DATABASE_URL + valueFrom: + secretKeyRef: + name: {{ include "datalake.fullname" . }}-secrets + key: database-url + - name: STORAGE_BACKEND + value: {{ .Values.config.storageBackend | quote }} + - name: MAX_UPLOAD_SIZE + value: {{ .Values.config.maxUploadSize | quote }} + {{- if eq .Values.config.storageBackend "s3" }} + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {{ include "datalake.fullname" . }}-secrets + key: aws-access-key-id + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{ include "datalake.fullname" . }}-secrets + key: aws-secret-access-key + - name: AWS_REGION + value: {{ .Values.aws.region | quote }} + - name: S3_BUCKET_NAME + value: {{ .Values.aws.bucketName | quote }} + {{- else }} + - name: MINIO_ENDPOINT + value: "{{ include "datalake.fullname" . }}-minio:9000" + - name: MINIO_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{ include "datalake.fullname" . }}-secrets + key: minio-access-key + - name: MINIO_SECRET_KEY + valueFrom: + secretKeyRef: + name: {{ include "datalake.fullname" . }}-secrets + key: minio-secret-key + - name: MINIO_BUCKET_NAME + value: "test-artifacts" + - name: MINIO_SECURE + value: "false" + {{- end }} + {{- with .Values.env }} + {{- toYaml . 
| nindent 8 }} + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/helm/templates/ingress.yaml b/helm/templates/ingress.yaml new file mode 100644 index 0000000..30c1764 --- /dev/null +++ b/helm/templates/ingress.yaml @@ -0,0 +1,41 @@ +{{- if .Values.ingress.enabled -}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "datalake.fullname" . }} + labels: + {{- include "datalake.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.className }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + pathType: {{ .pathType }} + backend: + service: + name: {{ include "datalake.fullname" $ }} + port: + number: {{ $.Values.service.port }} + {{- end }} + {{- end }} +{{- end }} diff --git a/helm/templates/secrets.yaml b/helm/templates/secrets.yaml new file mode 100644 index 0000000..d132fc1 --- /dev/null +++ b/helm/templates/secrets.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "datalake.fullname" . }}-secrets + labels: + {{- include "datalake.labels" . | nindent 4 }} +type: Opaque +stringData: + database-url: "postgresql://{{ .Values.postgresql.auth.username }}:{{ .Values.postgresql.auth.password }}@{{ include "datalake.fullname" . }}-postgresql:5432/{{ .Values.postgresql.auth.database }}" + {{- if .Values.aws.enabled }} + aws-access-key-id: {{ .Values.aws.accessKeyId | quote }} + aws-secret-access-key: {{ .Values.aws.secretAccessKey | quote }} + {{- else }} + minio-access-key: {{ .Values.minio.rootUser | quote }} + minio-secret-key: {{ .Values.minio.rootPassword | quote }} + {{- end }} diff --git a/helm/templates/service.yaml b/helm/templates/service.yaml new file mode 100644 index 0000000..f23d030 --- /dev/null +++ b/helm/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "datalake.fullname" . }} + labels: + {{- include "datalake.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "datalake.selectorLabels" . | nindent 4 }} diff --git a/helm/templates/serviceaccount.yaml b/helm/templates/serviceaccount.yaml new file mode 100644 index 0000000..2e7472b --- /dev/null +++ b/helm/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "datalake.serviceAccountName" . }} + labels: + {{- include "datalake.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} diff --git a/helm/values.yaml b/helm/values.yaml new file mode 100644 index 0000000..c468fb2 --- /dev/null +++ b/helm/values.yaml @@ -0,0 +1,111 @@ +replicaCount: 1 + +image: + repository: datalake + pullPolicy: IfNotPresent + tag: "latest" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + create: true + annotations: {} + name: "" + +podAnnotations: {} + +podSecurityContext: + fsGroup: 1000 + +securityContext: + capabilities: + drop: + - ALL + readOnlyRootFilesystem: false + runAsNonRoot: true + runAsUser: 1000 + +service: + type: ClusterIP + port: 8000 + targetPort: 8000 + +ingress: + enabled: false + className: "" + annotations: {} + hosts: + - host: datalake.local + paths: + - path: / + pathType: Prefix + tls: [] + +resources: + limits: + cpu: 1000m + memory: 1Gi + requests: + cpu: 500m + memory: 512Mi + +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 10 + targetCPUUtilizationPercentage: 80 + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +# Application configuration +config: + storageBackend: minio # or "s3" + maxUploadSize: 524288000 # 500MB + +# PostgreSQL configuration +postgresql: + enabled: true + auth: + username: user + password: password + database: datalake + primary: + persistence: + enabled: true + size: 10Gi + +# MinIO configuration (for self-hosted storage) +minio: + enabled: true + mode: standalone + rootUser: minioadmin + rootPassword: minioadmin + persistence: + enabled: true + size: 50Gi + service: + type: ClusterIP + port: 9000 + consoleService: + port: 9001 + +# AWS S3 configuration (when using AWS) +aws: + enabled: false + accessKeyId: "" + secretAccessKey: "" + region: us-east-1 + bucketName: test-artifacts + +# Environment variables +env: + - name: API_HOST + value: "0.0.0.0" + - name: API_PORT + value: "8000" diff --git a/quickstart.sh b/quickstart.sh new file mode 100755 index 0000000..e963763 --- /dev/null +++ b/quickstart.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +set -e + +echo "=========================================" +echo "Test Artifact Data Lake - Quick Start" +echo "=========================================" +echo "" + +# Check if Docker is installed +if ! command -v docker &> /dev/null; then + echo "Error: Docker is not installed. Please install Docker first." + exit 1 +fi + +# Check if Docker Compose is installed +if ! command -v docker-compose &> /dev/null; then + echo "Error: Docker Compose is not installed. Please install Docker Compose first." + exit 1 +fi + +# Create .env file if it doesn't exist +if [ ! -f .env ]; then + echo "Creating .env file from .env.example..." + cp .env.example .env + echo "✓ .env file created" +else + echo "✓ .env file already exists" +fi + +echo "" +echo "Starting services with Docker Compose..." +docker-compose up -d + +echo "" +echo "Waiting for services to be ready..." +sleep 10 + +echo "" +echo "=========================================" +echo "Services are running!" +echo "=========================================" +echo "" +echo "API: http://localhost:8000" +echo "API Docs: http://localhost:8000/docs" +echo "MinIO Console: http://localhost:9001" +echo " Username: minioadmin" +echo " Password: minioadmin" +echo "" +echo "To view logs: docker-compose logs -f" +echo "To stop: docker-compose down" +echo "" +echo "=========================================" +echo "Testing the API..." 
+echo "=========================================" +echo "" + +# Wait a bit more for API to be fully ready +sleep 5 + +# Test health endpoint +if curl -s http://localhost:8000/health | grep -q "healthy"; then + echo "✓ API is healthy!" + echo "" + echo "Example: Upload a test file" + echo "----------------------------" + echo 'echo "test,data" > test.csv' + echo 'curl -X POST "http://localhost:8000/api/v1/artifacts/upload" \' + echo ' -F "file=@test.csv" \' + echo ' -F "test_name=sample_test" \' + echo ' -F "test_suite=demo" \' + echo ' -F "test_result=pass"' + echo "" +else + echo "⚠ API is not responding yet. Please wait a moment and check http://localhost:8000/health" +fi + +echo "=========================================" +echo "Setup complete! 🚀" +echo "=========================================" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e40ffcb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +fastapi==0.115.0 +uvicorn[standard]==0.31.0 +python-multipart==0.0.12 +sqlalchemy==2.0.35 +psycopg2-binary==2.9.9 +alembic==1.13.3 +boto3==1.35.36 +python-dotenv==1.0.1 +pydantic==2.9.2 +pydantic-settings==2.5.2 +aiofiles==24.1.0 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_api.py b/tests/test_api.py new file mode 100644 index 0000000..d3fd748 --- /dev/null +++ b/tests/test_api.py @@ -0,0 +1,38 @@ +import pytest +from fastapi.testclient import TestClient +from app.main import app + +client = TestClient(app) + + +def test_root(): + """Test root endpoint""" + response = client.get("/") + assert response.status_code == 200 + data = response.json() + assert "message" in data + assert "version" in data + + +def test_health(): + """Test health check endpoint""" + response = client.get("/health") + assert response.status_code == 200 + data = response.json() + assert data["status"] == "healthy" + + +# Add more tests as needed +# def test_upload_artifact(): +# """Test artifact upload""" +# files = {"file": ("test.csv", b"test,data\n1,2", "text/csv")} +# data = { +# "test_name": "sample_test", +# "test_suite": "unit", +# "test_result": "pass" +# } +# response = client.post("/api/v1/artifacts/upload", files=files, data=data) +# assert response.status_code == 201 +# artifact = response.json() +# assert artifact["filename"] == "test.csv" +# assert artifact["test_name"] == "sample_test"