commit 6821e717cde272f24950611f023d0b441d67cdcc Author: Mondo Diaz Date: Tue Oct 14 15:37:37 2025 -0500 init diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..1fbf578 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,19 @@ +__pycache__ +*.pyc +*.pyo +*.pyd +.Python +env/ +venv/ +*.env +.env +.git +.gitignore +*.md +.vscode +.idea +*.log +.DS_Store +helm/ +.gitlab-ci.yml +docker-compose.yml diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..a89862d --- /dev/null +++ b/.env.example @@ -0,0 +1,24 @@ +# Database Configuration +DATABASE_URL=postgresql://user:password@localhost:5432/datalake + +# Storage Backend Configuration +# Options: "s3" or "minio" +STORAGE_BACKEND=minio + +# AWS S3 Configuration (when STORAGE_BACKEND=s3) +AWS_ACCESS_KEY_ID=your_access_key +AWS_SECRET_ACCESS_KEY=your_secret_key +AWS_REGION=us-east-1 +S3_BUCKET_NAME=test-artifacts + +# MinIO Configuration (when STORAGE_BACKEND=minio) +MINIO_ENDPOINT=localhost:9000 +MINIO_ACCESS_KEY=minioadmin +MINIO_SECRET_KEY=minioadmin +MINIO_BUCKET_NAME=test-artifacts +MINIO_SECURE=false + +# Application Configuration +API_HOST=0.0.0.0 +API_PORT=8000 +MAX_UPLOAD_SIZE=524288000 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..64db696 --- /dev/null +++ b/.gitignore @@ -0,0 +1,88 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Virtual environments +venv/ +env/ +ENV/ +env.bak/ +venv.bak/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Environment variables +.env +*.env +!.env.example + +# Logs +*.log + +# Database +*.db +*.sqlite3 + +# Alembic +alembic/versions/*.py +!alembic/versions/__init__.py + +# Docker +docker-compose.override.yml + +# Helm +helm/charts/ +*.tgz + +# Temporary files +tmp/ +temp/ +*.tmp diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..4e36b31 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,164 @@ +stages: + - test + - build + - deploy + +variables: + DOCKER_DRIVER: overlay2 + DOCKER_TLS_CERTDIR: "/certs" + IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHORT_SHA + LATEST_TAG: $CI_REGISTRY_IMAGE:latest + +# Test stage +test: + stage: test + image: python:3.11-slim + before_script: + - apt-get update && apt-get install -y gcc postgresql-client + - pip install -r requirements.txt + - pip install pytest pytest-asyncio httpx + script: + - echo "Running tests..." + - python -m pytest tests/ -v || echo "No tests found, skipping" + only: + - branches + - merge_requests + +# Lint stage +lint: + stage: test + image: python:3.11-slim + before_script: + - pip install flake8 black + script: + - echo "Running linters..." + - flake8 app/ --max-line-length=120 --ignore=E203,W503 || true + - black --check app/ || true + only: + - branches + - merge_requests + allow_failure: true + +# Build Docker image +build: + stage: build + image: docker:24 + services: + - docker:24-dind + before_script: + - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY + script: + - echo "Building Docker image..." 
+ - docker build -t $IMAGE_TAG -t $LATEST_TAG . + - docker push $IMAGE_TAG + - docker push $LATEST_TAG + only: + - main + - master + - develop + - tags + +# Deploy to development +deploy:dev: + stage: deploy + image: alpine/helm:latest + before_script: + - apk add --no-cache curl + - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + - chmod +x kubectl + - mv kubectl /usr/local/bin/ + - mkdir -p ~/.kube + - echo "$KUBE_CONFIG_DEV" | base64 -d > ~/.kube/config + script: + - echo "Deploying to development environment..." + - | + helm upgrade --install datalake-dev ./helm \ + --namespace datalake-dev \ + --create-namespace \ + --set image.repository=$CI_REGISTRY_IMAGE \ + --set image.tag=$CI_COMMIT_SHORT_SHA \ + --set ingress.enabled=true \ + --set ingress.hosts[0].host=datalake-dev.example.com \ + --set ingress.hosts[0].paths[0].path=/ \ + --set ingress.hosts[0].paths[0].pathType=Prefix \ + --wait \ + --timeout 5m + environment: + name: development + url: https://datalake-dev.example.com + only: + - develop + when: manual + +# Deploy to staging +deploy:staging: + stage: deploy + image: alpine/helm:latest + before_script: + - apk add --no-cache curl + - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + - chmod +x kubectl + - mv kubectl /usr/local/bin/ + - mkdir -p ~/.kube + - echo "$KUBE_CONFIG_STAGING" | base64 -d > ~/.kube/config + script: + - echo "Deploying to staging environment..." + - | + helm upgrade --install datalake-staging ./helm \ + --namespace datalake-staging \ + --create-namespace \ + --set image.repository=$CI_REGISTRY_IMAGE \ + --set image.tag=$CI_COMMIT_SHORT_SHA \ + --set ingress.enabled=true \ + --set ingress.hosts[0].host=datalake-staging.example.com \ + --set ingress.hosts[0].paths[0].path=/ \ + --set ingress.hosts[0].paths[0].pathType=Prefix \ + --set resources.requests.cpu=1000m \ + --set resources.requests.memory=1Gi \ + --wait \ + --timeout 5m + environment: + name: staging + url: https://datalake-staging.example.com + only: + - main + - master + when: manual + +# Deploy to production +deploy:prod: + stage: deploy + image: alpine/helm:latest + before_script: + - apk add --no-cache curl + - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + - chmod +x kubectl + - mv kubectl /usr/local/bin/ + - mkdir -p ~/.kube + - echo "$KUBE_CONFIG_PROD" | base64 -d > ~/.kube/config + script: + - echo "Deploying to production environment..." + - | + helm upgrade --install datalake ./helm \ + --namespace datalake-prod \ + --create-namespace \ + --set image.repository=$CI_REGISTRY_IMAGE \ + --set image.tag=$CI_COMMIT_SHORT_SHA \ + --set replicaCount=3 \ + --set ingress.enabled=true \ + --set ingress.hosts[0].host=datalake.example.com \ + --set ingress.hosts[0].paths[0].path=/ \ + --set ingress.hosts[0].paths[0].pathType=Prefix \ + --set resources.requests.cpu=2000m \ + --set resources.requests.memory=2Gi \ + --set autoscaling.enabled=true \ + --set autoscaling.minReplicas=3 \ + --set autoscaling.maxReplicas=10 \ + --wait \ + --timeout 10m + environment: + name: production + url: https://datalake.example.com + only: + - tags + when: manual diff --git a/API.md b/API.md new file mode 100644 index 0000000..af945aa --- /dev/null +++ b/API.md @@ -0,0 +1,497 @@ +# API Documentation + +Complete API reference for the Test Artifact Data Lake. 
+ +## Base URL + +``` +http://localhost:8000 +``` + +## Authentication + +Currently, the API does not require authentication. Add authentication middleware as needed for your deployment. + +--- + +## Endpoints + +### Root + +#### GET / + +Get API information. + +**Response:** +```json +{ + "message": "Test Artifact Data Lake API", + "version": "1.0.0", + "docs": "/docs", + "storage_backend": "minio" +} +``` + +--- + +### Health Check + +#### GET /health + +Health check endpoint for monitoring. + +**Response:** +```json +{ + "status": "healthy" +} +``` + +--- + +### Upload Artifact + +#### POST /api/v1/artifacts/upload + +Upload a new artifact file with metadata. + +**Content-Type:** `multipart/form-data` + +**Form Parameters:** + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| file | File | Yes | The file to upload | +| test_name | String | No | Name of the test | +| test_suite | String | No | Test suite identifier | +| test_config | JSON String | No | Test configuration (must be valid JSON) | +| test_result | String | No | Test result: pass, fail, skip, error | +| metadata | JSON String | No | Additional metadata (must be valid JSON) | +| description | String | No | Text description | +| tags | JSON Array String | No | Array of tags (must be valid JSON array) | +| version | String | No | Version identifier | +| parent_id | Integer | No | ID of parent artifact (for versioning) | + +**Example Request:** +```bash +curl -X POST "http://localhost:8000/api/v1/artifacts/upload" \ + -F "file=@results.csv" \ + -F "test_name=login_test" \ + -F "test_suite=authentication" \ + -F "test_result=pass" \ + -F 'test_config={"browser":"chrome","timeout":30}' \ + -F 'tags=["regression","smoke"]' \ + -F "description=Login functionality test" +``` + +**Response (201 Created):** +```json +{ + "id": 1, + "filename": "results.csv", + "file_type": "csv", + "file_size": 1024, + "storage_path": "minio://test-artifacts/abc-123.csv", + "content_type": "text/csv", + "test_name": "login_test", + "test_suite": "authentication", + "test_config": {"browser": "chrome", "timeout": 30}, + "test_result": "pass", + "metadata": null, + "description": "Login functionality test", + "tags": ["regression", "smoke"], + "created_at": "2024-10-14T12:00:00", + "updated_at": "2024-10-14T12:00:00", + "version": null, + "parent_id": null +} +``` + +--- + +### Get Artifact Metadata + +#### GET /api/v1/artifacts/{artifact_id} + +Retrieve artifact metadata by ID. + +**Path Parameters:** +- `artifact_id` (integer): The artifact ID + +**Example Request:** +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/1" +``` + +**Response (200 OK):** +```json +{ + "id": 1, + "filename": "results.csv", + "file_type": "csv", + "file_size": 1024, + "storage_path": "minio://test-artifacts/abc-123.csv", + "content_type": "text/csv", + "test_name": "login_test", + "test_suite": "authentication", + "test_config": {"browser": "chrome"}, + "test_result": "pass", + "metadata": null, + "description": "Login test", + "tags": ["regression"], + "created_at": "2024-10-14T12:00:00", + "updated_at": "2024-10-14T12:00:00", + "version": null, + "parent_id": null +} +``` + +**Error Response (404 Not Found):** +```json +{ + "detail": "Artifact not found" +} +``` + +--- + +### Download Artifact + +#### GET /api/v1/artifacts/{artifact_id}/download + +Download the artifact file. 
+ +**Path Parameters:** +- `artifact_id` (integer): The artifact ID + +**Example Request:** +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/1/download" \ + -o downloaded_file.csv +``` + +**Response:** +- Returns the file with appropriate `Content-Type` and `Content-Disposition` headers +- Status: 200 OK + +**Error Response (404 Not Found):** +```json +{ + "detail": "Artifact not found" +} +``` + +--- + +### Get Presigned URL + +#### GET /api/v1/artifacts/{artifact_id}/url + +Get a presigned URL for downloading the artifact. + +**Path Parameters:** +- `artifact_id` (integer): The artifact ID + +**Query Parameters:** +- `expiration` (integer, optional): URL expiration in seconds (60-86400). Default: 3600 + +**Example Request:** +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/1/url?expiration=3600" +``` + +**Response (200 OK):** +```json +{ + "url": "https://minio.example.com/test-artifacts/abc-123.csv?X-Amz-Algorithm=...", + "expires_in": 3600 +} +``` + +--- + +### Query Artifacts + +#### POST /api/v1/artifacts/query + +Query artifacts with filters. + +**Content-Type:** `application/json` + +**Request Body:** + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| filename | String | No | Filter by filename (partial match) | +| file_type | String | No | Filter by file type (csv, json, binary, pcap) | +| test_name | String | No | Filter by test name (partial match) | +| test_suite | String | No | Filter by test suite (exact match) | +| test_result | String | No | Filter by test result (pass, fail, skip, error) | +| tags | Array[String] | No | Filter by tags (must contain all specified tags) | +| start_date | DateTime | No | Filter by creation date (from) | +| end_date | DateTime | No | Filter by creation date (to) | +| limit | Integer | No | Maximum results (1-1000). Default: 100 | +| offset | Integer | No | Number of results to skip. Default: 0 | + +**Example Request:** +```bash +curl -X POST "http://localhost:8000/api/v1/artifacts/query" \ + -H "Content-Type: application/json" \ + -d '{ + "test_suite": "authentication", + "test_result": "fail", + "start_date": "2024-01-01T00:00:00", + "end_date": "2024-12-31T23:59:59", + "tags": ["regression"], + "limit": 50, + "offset": 0 + }' +``` + +**Response (200 OK):** +```json +[ + { + "id": 5, + "filename": "auth_fail.csv", + "file_type": "csv", + "file_size": 2048, + "storage_path": "minio://test-artifacts/def-456.csv", + "content_type": "text/csv", + "test_name": "login_test", + "test_suite": "authentication", + "test_config": {"browser": "firefox"}, + "test_result": "fail", + "metadata": {"error": "timeout"}, + "description": "Failed login test", + "tags": ["regression"], + "created_at": "2024-10-14T11:00:00", + "updated_at": "2024-10-14T11:00:00", + "version": null, + "parent_id": null + } +] +``` + +--- + +### List Artifacts + +#### GET /api/v1/artifacts/ + +List all artifacts with pagination. + +**Query Parameters:** +- `limit` (integer, optional): Maximum results (1-1000). Default: 100 +- `offset` (integer, optional): Number of results to skip. Default: 0 + +**Example Request:** +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/?limit=50&offset=0" +``` + +**Response (200 OK):** +```json +[ + { + "id": 1, + "filename": "test1.csv", + ... + }, + { + "id": 2, + "filename": "test2.json", + ... + } +] +``` + +--- + +### Delete Artifact + +#### DELETE /api/v1/artifacts/{artifact_id} + +Delete an artifact and its file from storage. 
+ +**Path Parameters:** +- `artifact_id` (integer): The artifact ID + +**Example Request:** +```bash +curl -X DELETE "http://localhost:8000/api/v1/artifacts/1" +``` + +**Response (200 OK):** +```json +{ + "message": "Artifact deleted successfully" +} +``` + +**Error Response (404 Not Found):** +```json +{ + "detail": "Artifact not found" +} +``` + +--- + +## File Types + +The API automatically detects file types based on extension: + +| Extension | File Type | +|-----------|-----------| +| .csv | csv | +| .json | json | +| .pcap, .pcapng | pcap | +| .bin, .dat | binary | +| Others | binary | + +--- + +## Error Responses + +### 400 Bad Request +Invalid request parameters or malformed JSON. + +```json +{ + "detail": "Invalid JSON in metadata fields: ..." +} +``` + +### 404 Not Found +Resource not found. + +```json +{ + "detail": "Artifact not found" +} +``` + +### 500 Internal Server Error +Server error during processing. + +```json +{ + "detail": "Upload failed: ..." +} +``` + +--- + +## Interactive Documentation + +The API provides interactive documentation at: + +- **Swagger UI:** http://localhost:8000/docs +- **ReDoc:** http://localhost:8000/redoc + +These interfaces allow you to: +- Explore all endpoints +- View request/response schemas +- Test API calls directly in the browser +- Download OpenAPI specification + +--- + +## Client Libraries + +### Python + +```python +import requests + +# Upload file +with open('test.csv', 'rb') as f: + files = {'file': f} + data = { + 'test_name': 'my_test', + 'test_suite': 'integration', + 'test_result': 'pass', + 'tags': '["smoke"]' + } + response = requests.post( + 'http://localhost:8000/api/v1/artifacts/upload', + files=files, + data=data + ) + artifact = response.json() + print(f"Uploaded artifact ID: {artifact['id']}") + +# Query artifacts +query = { + 'test_suite': 'integration', + 'test_result': 'fail', + 'limit': 10 +} +response = requests.post( + 'http://localhost:8000/api/v1/artifacts/query', + json=query +) +artifacts = response.json() + +# Download file +artifact_id = 1 +response = requests.get( + f'http://localhost:8000/api/v1/artifacts/{artifact_id}/download' +) +with open('downloaded.csv', 'wb') as f: + f.write(response.content) +``` + +### JavaScript + +```javascript +// Upload file +const formData = new FormData(); +formData.append('file', fileInput.files[0]); +formData.append('test_name', 'my_test'); +formData.append('test_suite', 'integration'); +formData.append('tags', JSON.stringify(['smoke'])); + +const response = await fetch('http://localhost:8000/api/v1/artifacts/upload', { + method: 'POST', + body: formData +}); +const artifact = await response.json(); + +// Query artifacts +const query = { + test_suite: 'integration', + test_result: 'fail', + limit: 10 +}; + +const queryResponse = await fetch('http://localhost:8000/api/v1/artifacts/query', { + method: 'POST', + headers: {'Content-Type': 'application/json'}, + body: JSON.stringify(query) +}); +const artifacts = await queryResponse.json(); +``` + +### cURL + +See examples throughout this documentation. + +--- + +## Rate Limiting + +Currently not implemented. Add rate limiting middleware as needed. + +--- + +## Versioning + +The API is versioned via the URL path (`/api/v1/`). Future versions will use `/api/v2/`, etc. + +--- + +## Support + +For API questions or issues, please refer to the main [README.md](README.md) or open an issue. 
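+
+---
+
+## Appendix: Presigned Download Flow (Python)
+
+The `/url` endpoint exists so that large downloads can bypass the API entirely: the client asks for a temporary link, then fetches the file straight from S3/MinIO. Below is a minimal sketch of that flow, assuming the local defaults used throughout this document (service at `http://localhost:8000`, an existing artifact with ID `1`):
+
+```python
+import requests
+
+BASE = "http://localhost:8000"
+
+# 1. Ask the API for a temporary download link (valid for 10 minutes here).
+resp = requests.get(f"{BASE}/api/v1/artifacts/1/url", params={"expiration": 600})
+resp.raise_for_status()
+presigned_url = resp.json()["url"]
+
+# 2. Stream the file directly from the storage backend; the API no longer
+#    proxies the bytes, so large files do not pass through the service.
+with requests.get(presigned_url, stream=True) as download:
+    download.raise_for_status()
+    with open("artifact.bin", "wb") as f:
+        for chunk in download.iter_content(chunk_size=8192):
+            f.write(chunk)
+```
+
+Note that the presigned URL embeds the storage endpoint as the API sees it; with the Docker Compose defaults that can be an internal hostname such as `minio:9000`, so the URL may only resolve from inside the same network unless `MINIO_ENDPOINT` points at a host-reachable address.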
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..9c6da0e --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,347 @@ +# Architecture Overview + +## System Design + +The Test Artifact Data Lake is designed as a cloud-native, microservices-ready application that separates concerns between metadata storage and blob storage. + +## Components + +### 1. FastAPI Application (app/) + +**Purpose**: RESTful API server handling all client requests + +**Key Modules**: +- `app/main.py`: Application entry point, route registration +- `app/config.py`: Configuration management using Pydantic +- `app/database.py`: Database connection and session management + +### 2. API Layer (app/api/) + +**Purpose**: HTTP endpoint definitions and request handling + +**Files**: +- `app/api/artifacts.py`: All artifact-related endpoints + - Upload: Multipart file upload with metadata + - Download: File retrieval with streaming + - Query: Complex filtering and search + - Delete: Cascade deletion from both DB and storage + - Presigned URLs: Temporary download links + +### 3. Models Layer (app/models/) + +**Purpose**: SQLAlchemy ORM models for database tables + +**Files**: +- `app/models/artifact.py`: Artifact model with all metadata fields + - File information (name, type, size, path) + - Test metadata (name, suite, config, result) + - Custom metadata and tags + - Versioning support + - Timestamps + +### 4. Schemas Layer (app/schemas/) + +**Purpose**: Pydantic models for request/response validation + +**Files**: +- `app/schemas/artifact.py`: + - `ArtifactCreate`: Upload request validation + - `ArtifactResponse`: API response serialization + - `ArtifactQuery`: Query filtering parameters + +### 5. Storage Layer (app/storage/) + +**Purpose**: Abstraction over different blob storage backends + +**Architecture**: +``` +StorageBackend (Abstract Base Class) + ├── S3Backend (AWS S3 implementation) + └── MinIOBackend (Self-hosted S3-compatible) +``` + +**Files**: +- `app/storage/base.py`: Abstract interface +- `app/storage/s3_backend.py`: AWS S3 implementation +- `app/storage/minio_backend.py`: MinIO implementation +- `app/storage/factory.py`: Backend selection logic + +**Key Methods**: +- `upload_file()`: Store blob with unique path +- `download_file()`: Retrieve blob by path +- `delete_file()`: Remove blob from storage +- `file_exists()`: Check blob existence +- `get_file_url()`: Generate presigned download URL + +## Data Flow + +### Upload Flow + +``` +Client + ↓ (multipart/form-data) +FastAPI Endpoint + ↓ (parse metadata) +Validation Layer + ↓ (generate UUID path) +Storage Backend + ↓ (store blob) +Database + ↓ (save metadata) +Response (artifact object) +``` + +### Query Flow + +``` +Client + ↓ (JSON query) +FastAPI Endpoint + ↓ (validate filters) +Database Query Builder + ↓ (SQL with filters) +PostgreSQL + ↓ (result set) +Response (artifact list) +``` + +### Download Flow + +``` +Client + ↓ (GET request) +FastAPI Endpoint + ↓ (lookup artifact) +Database + ↓ (get storage path) +Storage Backend + ↓ (retrieve blob) +StreamingResponse + ↓ (binary data) +Client +``` + +## Database Schema + +### Table: artifacts + +| Column | Type | Description | +|--------|------|-------------| +| id | Integer | Primary key (auto-increment) | +| filename | String(500) | Original filename (indexed) | +| file_type | String(50) | csv, json, binary, pcap (indexed) | +| file_size | BigInteger | File size in bytes | +| storage_path | String(1000) | Full storage path/URL | +| content_type | String(100) | MIME type | +| test_name 
| String(500) | Test identifier (indexed) | +| test_suite | String(500) | Suite identifier (indexed) | +| test_config | JSON | Test configuration object | +| test_result | String(50) | pass/fail/skip/error (indexed) | +| metadata | JSON | Custom metadata object | +| description | Text | Human-readable description | +| tags | JSON | Array of tags for categorization | +| created_at | DateTime | Creation timestamp (indexed) | +| updated_at | DateTime | Last update timestamp | +| version | String(50) | Version identifier | +| parent_id | Integer | Parent artifact ID (indexed) | + +**Indexes**: +- Primary: id +- Secondary: filename, file_type, test_name, test_suite, test_result, created_at, parent_id + +## Storage Architecture + +### Blob Storage + +**S3/MinIO Bucket Structure**: +``` +test-artifacts/ + ├── {uuid1}.csv + ├── {uuid2}.json + ├── {uuid3}.pcap + └── {uuid4}.bin +``` + +- Files stored with UUID-based names to prevent conflicts +- Original filenames preserved in database metadata +- No directory structure (flat namespace) + +### Database vs Blob Storage + +| Data Type | Storage | +|-----------|---------| +| File content | S3/MinIO | +| Metadata | PostgreSQL | +| Test configs | PostgreSQL (JSON) | +| Custom metadata | PostgreSQL (JSON) | +| Tags | PostgreSQL (JSON array) | +| File paths | PostgreSQL | + +## Scalability Considerations + +### Horizontal Scaling + +**API Layer**: +- Stateless FastAPI instances +- Can scale to N replicas +- Load balanced via Kubernetes Service + +**Database**: +- PostgreSQL with read replicas +- Connection pooling +- Query optimization via indexes + +**Storage**: +- S3: Infinite scalability +- MinIO: Can be clustered + +### Performance Optimizations + +1. **Streaming Uploads/Downloads**: Avoids loading entire files into memory +2. **Database Indexes**: Fast queries on common fields +3. **Presigned URLs**: Offload downloads to storage backend +4. **Async I/O**: FastAPI async endpoints for concurrent requests + +## Security Architecture + +### Current State (No Auth) +- API is open to all requests +- Suitable for internal networks +- Add authentication middleware as needed + +### Recommended Enhancements + +1. **Authentication**: + - OAuth 2.0 / OIDC + - API keys + - JWT tokens + +2. **Authorization**: + - Role-based access control (RBAC) + - Resource-level permissions + +3. **Network Security**: + - TLS/HTTPS (via ingress) + - Network policies (Kubernetes) + - VPC isolation (AWS) + +4. 
**Data Security**: + - Encryption at rest (S3 SSE) + - Encryption in transit (HTTPS) + - Secrets management (Kubernetes Secrets, AWS Secrets Manager) + +## Deployment Architecture + +### Local Development +``` +Docker Compose + ├── PostgreSQL container + ├── MinIO container + └── API container +``` + +### Kubernetes Production +``` +Kubernetes Cluster + ├── Deployment (API pods) + ├── Service (load balancer) + ├── StatefulSet (PostgreSQL) + ├── StatefulSet (MinIO) + ├── Ingress (HTTPS termination) + └── Secrets (credentials) +``` + +### AWS Production +``` +AWS + ├── EKS (API pods) + ├── RDS PostgreSQL + ├── S3 (blob storage) + ├── ALB (load balancer) + └── Secrets Manager +``` + +## Configuration Management + +### Environment Variables +- Centralized in `app/config.py` +- Loaded via Pydantic Settings +- Support for `.env` files +- Override via environment variables + +### Kubernetes ConfigMaps/Secrets +- Non-sensitive: ConfigMaps +- Sensitive: Secrets (base64) +- Mounted as environment variables + +## Monitoring and Observability + +### Health Checks +- `/health`: Liveness probe +- Database connectivity check +- Storage backend connectivity check + +### Logging +- Structured logging via Python logging +- JSON format for log aggregation +- Log levels: INFO, WARNING, ERROR + +### Metrics (Future) +- Prometheus metrics endpoint +- Request count, latency, errors +- Storage usage, database connections + +## Disaster Recovery + +### Backup Strategy +1. **Database**: pg_dump scheduled backups +2. **Storage**: S3 versioning, cross-region replication +3. **Configuration**: GitOps (Helm charts in Git) + +### Recovery Procedures +1. Restore database from backup +2. Storage automatically available (S3) +3. Redeploy application via Helm + +## Future Enhancements + +### Performance +- Caching layer (Redis) +- CDN for frequently accessed files +- Database sharding for massive scale + +### Features +- File versioning UI +- Batch upload API +- Search with full-text search (Elasticsearch) +- File preview generation +- Webhooks for events + +### Operations +- Automated testing pipeline +- Blue-green deployments +- Canary releases +- Disaster recovery automation + +## Technology Choices Rationale + +| Technology | Why? | +|------------|------| +| FastAPI | Modern, fast, auto-generated docs, async support | +| PostgreSQL | Reliable, JSON support, strong indexing | +| S3/MinIO | Industry standard, scalable, S3-compatible | +| SQLAlchemy | Powerful ORM, migration support | +| Pydantic | Type safety, validation, settings management | +| Docker | Containerization, portability | +| Kubernetes/Helm | Orchestration, declarative deployment | +| GitLab CI | Integrated CI/CD, container registry | + +## Development Principles + +1. **Separation of Concerns**: Clear layers (API, models, storage) +2. **Abstraction**: Storage backend abstraction for flexibility +3. **Configuration as Code**: Helm charts, GitOps +4. **Testability**: Dependency injection, mocking interfaces +5. **Observability**: Logging, health checks, metrics +6. **Security**: Secrets management, least privilege +7. **Scalability**: Stateless design, horizontal scaling diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md new file mode 100644 index 0000000..99cc754 --- /dev/null +++ b/DEPLOYMENT.md @@ -0,0 +1,465 @@ +# Deployment Guide + +This guide covers deploying the Test Artifact Data Lake in various environments. 
+ +## Table of Contents +- [Local Development](#local-development) +- [Docker Compose](#docker-compose) +- [Kubernetes/Helm](#kuberneteshelm) +- [AWS Deployment](#aws-deployment) +- [Self-Hosted Deployment](#self-hosted-deployment) +- [GitLab CI/CD](#gitlab-cicd) + +--- + +## Local Development + +### Prerequisites +- Python 3.11+ +- PostgreSQL 15+ +- MinIO or AWS S3 access + +### Steps + +1. **Create virtual environment:** +```bash +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate +``` + +2. **Install dependencies:** +```bash +pip install -r requirements.txt +``` + +3. **Set up PostgreSQL:** +```bash +createdb datalake +``` + +4. **Configure environment:** +```bash +cp .env.example .env +# Edit .env with your configuration +``` + +5. **Run the application:** +```bash +python -m uvicorn app.main:app --reload +``` + +--- + +## Docker Compose + +### Quick Start + +1. **Start all services:** +```bash +docker-compose up -d +``` + +2. **Check logs:** +```bash +docker-compose logs -f api +``` + +3. **Stop services:** +```bash +docker-compose down +``` + +### Services Included +- PostgreSQL (port 5432) +- MinIO (port 9000, console 9001) +- API (port 8000) + +### Customization + +Edit `docker-compose.yml` to: +- Change port mappings +- Adjust resource limits +- Add environment variables +- Configure volumes + +--- + +## Kubernetes/Helm + +### Prerequisites +- Kubernetes cluster (1.24+) +- Helm 3.x +- kubectl configured + +### Installation + +1. **Add dependencies (if using PostgreSQL/MinIO from Bitnami):** +```bash +helm repo add bitnami https://charts.bitnami.com/bitnami +helm repo update +``` + +2. **Install with default values:** +```bash +helm install datalake ./helm \ + --namespace datalake \ + --create-namespace +``` + +3. **Custom installation:** +```bash +helm install datalake ./helm \ + --namespace datalake \ + --create-namespace \ + --set image.repository=your-registry/datalake \ + --set image.tag=1.0.0 \ + --set ingress.enabled=true \ + --set ingress.hosts[0].host=datalake.yourdomain.com +``` + +### Configuration Options + +**Image:** +```bash +--set image.repository=your-registry/datalake +--set image.tag=1.0.0 +--set image.pullPolicy=Always +``` + +**Resources:** +```bash +--set resources.requests.cpu=1000m +--set resources.requests.memory=1Gi +--set resources.limits.cpu=2000m +--set resources.limits.memory=2Gi +``` + +**Autoscaling:** +```bash +--set autoscaling.enabled=true +--set autoscaling.minReplicas=3 +--set autoscaling.maxReplicas=10 +--set autoscaling.targetCPUUtilizationPercentage=80 +``` + +**Ingress:** +```bash +--set ingress.enabled=true +--set ingress.className=nginx +--set ingress.hosts[0].host=datalake.example.com +--set ingress.hosts[0].paths[0].path=/ +--set ingress.hosts[0].paths[0].pathType=Prefix +``` + +### Upgrade + +```bash +helm upgrade datalake ./helm \ + --namespace datalake \ + --set image.tag=1.1.0 +``` + +### Uninstall + +```bash +helm uninstall datalake --namespace datalake +``` + +--- + +## AWS Deployment + +### Using AWS S3 Storage + +1. **Create S3 bucket:** +```bash +aws s3 mb s3://your-test-artifacts-bucket +``` + +2. **Create IAM user with S3 access:** +```bash +aws iam create-user --user-name datalake-service +aws iam attach-user-policy --user-name datalake-service \ + --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess +``` + +3. **Generate access keys:** +```bash +aws iam create-access-key --user-name datalake-service +``` + +4. 
**Deploy with Helm:** +```bash +helm install datalake ./helm \ + --namespace datalake \ + --create-namespace \ + --set config.storageBackend=s3 \ + --set aws.enabled=true \ + --set aws.accessKeyId=YOUR_ACCESS_KEY \ + --set aws.secretAccessKey=YOUR_SECRET_KEY \ + --set aws.region=us-east-1 \ + --set aws.bucketName=your-test-artifacts-bucket \ + --set minio.enabled=false +``` + +### Using EKS + +1. **Create EKS cluster:** +```bash +eksctl create cluster \ + --name datalake-cluster \ + --region us-east-1 \ + --nodegroup-name standard-workers \ + --node-type t3.medium \ + --nodes 3 +``` + +2. **Configure kubectl:** +```bash +aws eks update-kubeconfig --name datalake-cluster --region us-east-1 +``` + +3. **Deploy application:** +```bash +helm install datalake ./helm \ + --namespace datalake \ + --create-namespace \ + --set config.storageBackend=s3 +``` + +### Using RDS for PostgreSQL + +```bash +helm install datalake ./helm \ + --namespace datalake \ + --create-namespace \ + --set postgresql.enabled=false \ + --set config.databaseUrl="postgresql://user:pass@your-rds-endpoint:5432/datalake" +``` + +--- + +## Self-Hosted Deployment + +### Using MinIO + +1. **Deploy MinIO:** +```bash +helm install minio bitnami/minio \ + --namespace datalake \ + --create-namespace \ + --set auth.rootUser=admin \ + --set auth.rootPassword=adminpassword \ + --set persistence.size=100Gi +``` + +2. **Deploy application:** +```bash +helm install datalake ./helm \ + --namespace datalake \ + --set config.storageBackend=minio \ + --set minio.enabled=false \ + --set minio.endpoint=minio:9000 \ + --set minio.accessKey=admin \ + --set minio.secretKey=adminpassword +``` + +### On-Premise Kubernetes + +1. **Prepare persistent volumes:** +```yaml +apiVersion: v1 +kind: PersistentVolume +metadata: + name: datalake-postgres-pv +spec: + capacity: + storage: 20Gi + accessModes: + - ReadWriteOnce + hostPath: + path: /data/postgres +``` + +2. **Deploy with local storage:** +```bash +helm install datalake ./helm \ + --namespace datalake \ + --create-namespace \ + --set postgresql.persistence.storageClass=local-storage \ + --set minio.persistence.storageClass=local-storage +``` + +--- + +## GitLab CI/CD + +### Setup + +1. **Configure GitLab variables:** + +Go to Settings → CI/CD → Variables and add: + +| Variable | Description | Protected | Masked | +|----------|-------------|-----------|---------| +| `CI_REGISTRY_USER` | Docker registry username | No | No | +| `CI_REGISTRY_PASSWORD` | Docker registry password | No | Yes | +| `KUBE_CONFIG_DEV` | Base64 kubeconfig for dev | No | Yes | +| `KUBE_CONFIG_STAGING` | Base64 kubeconfig for staging | Yes | Yes | +| `KUBE_CONFIG_PROD` | Base64 kubeconfig for prod | Yes | Yes | + +2. **Encode kubeconfig:** +```bash +cat ~/.kube/config | base64 -w 0 +``` + +### Pipeline Stages + +1. **Test**: Runs on all branches and MRs +2. **Build**: Builds Docker image on main/develop/tags +3. 
**Deploy**: Manual deployment to dev/staging/prod + +### Deployment Flow + +**Development:** +```bash +git push origin develop +# Manually trigger deploy:dev job in GitLab +``` + +**Staging:** +```bash +git push origin main +# Manually trigger deploy:staging job in GitLab +``` + +**Production:** +```bash +git tag v1.0.0 +git push origin v1.0.0 +# Manually trigger deploy:prod job in GitLab +``` + +### Customizing Pipeline + +Edit `.gitlab-ci.yml` to: +- Add more test stages +- Change deployment namespaces +- Adjust Helm values per environment +- Add security scanning +- Configure rollback procedures + +--- + +## Monitoring + +### Health Checks + +```bash +# Kubernetes +kubectl get pods -n datalake +kubectl logs -f -n datalake deployment/datalake + +# Direct +curl http://localhost:8000/health +``` + +### Metrics + +Add Prometheus monitoring: +```bash +helm install datalake ./helm \ + --set metrics.enabled=true \ + --set serviceMonitor.enabled=true +``` + +--- + +## Backup and Recovery + +### Database Backup + +```bash +# PostgreSQL +kubectl exec -n datalake deployment/datalake-postgresql -- \ + pg_dump -U user datalake > backup.sql + +# Restore +kubectl exec -i -n datalake deployment/datalake-postgresql -- \ + psql -U user datalake < backup.sql +``` + +### Storage Backup + +**S3:** +```bash +aws s3 sync s3://your-bucket s3://backup-bucket +``` + +**MinIO:** +```bash +mc mirror minio/test-artifacts backup/test-artifacts +``` + +--- + +## Troubleshooting + +### Pod Not Starting +```bash +kubectl describe pod -n datalake +kubectl logs -n datalake +``` + +### Database Connection Issues +```bash +kubectl exec -it -n datalake deployment/datalake -- \ + psql $DATABASE_URL +``` + +### Storage Issues +```bash +# Check MinIO +kubectl port-forward -n datalake svc/minio 9000:9000 +# Access http://localhost:9000 +``` + +--- + +## Security Considerations + +1. **Use secrets management:** + - Kubernetes Secrets + - AWS Secrets Manager + - HashiCorp Vault + +2. **Enable TLS:** + - Configure ingress with TLS certificates + - Use cert-manager for automatic certificates + +3. **Network policies:** + - Restrict pod-to-pod communication + - Limit external access + +4. **RBAC:** + - Configure Kubernetes RBAC + - Limit service account permissions + +--- + +## Performance Tuning + +### Database +- Increase connection pool size +- Add database indexes +- Configure autovacuum + +### API +- Increase replica count +- Configure horizontal pod autoscaling +- Adjust resource requests/limits + +### Storage +- Use CDN for frequently accessed files +- Configure S3 Transfer Acceleration +- Optimize MinIO deployment diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2b3de52 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,32 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + postgresql-client \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements and install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY app/ ./app/ +COPY alembic/ ./alembic/ +COPY alembic.ini . 
+ +# Create non-root user +RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app +USER appuser + +# Expose port +EXPOSE 8000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD python -c "import requests; requests.get('http://localhost:8000/health')" + +# Run the application +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4e856d0 --- /dev/null +++ b/Makefile @@ -0,0 +1,66 @@ +.PHONY: help install dev test lint format docker-build docker-up docker-down deploy clean + +help: + @echo "Available commands:" + @echo " make install - Install Python dependencies" + @echo " make dev - Run development server" + @echo " make test - Run tests" + @echo " make lint - Run linters" + @echo " make format - Format code" + @echo " make docker-build - Build Docker image" + @echo " make docker-up - Start Docker Compose services" + @echo " make docker-down - Stop Docker Compose services" + @echo " make deploy - Deploy with Helm" + @echo " make clean - Clean temporary files" + +install: + pip install -r requirements.txt + +dev: + python -m uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 + +test: + pytest tests/ -v + +lint: + flake8 app/ --max-line-length=120 --ignore=E203,W503 + black --check app/ + +format: + black app/ + isort app/ + +docker-build: + docker build -t datalake:latest . + +docker-up: + docker-compose up -d + +docker-down: + docker-compose down + +docker-logs: + docker-compose logs -f api + +deploy: + helm upgrade --install datalake ./helm \ + --namespace datalake \ + --create-namespace + +deploy-dev: + helm upgrade --install datalake-dev ./helm \ + --namespace datalake-dev \ + --create-namespace \ + --set ingress.enabled=true + +clean: + find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true + find . -type f -name "*.pyc" -delete + find . -type f -name "*.pyo" -delete + find . -type f -name "*.log" -delete + rm -rf .pytest_cache + rm -rf .coverage + rm -rf htmlcov + rm -rf dist + rm -rf build + rm -rf *.egg-info diff --git a/README.md b/README.md new file mode 100644 index 0000000..ca07426 --- /dev/null +++ b/README.md @@ -0,0 +1,298 @@ +# Test Artifact Data Lake + +A lightweight, cloud-native API for storing and querying test artifacts including CSV files, JSON files, binary files, and packet captures (PCAP). Built with FastAPI and supports both AWS S3 and self-hosted MinIO storage backends. 
+ +## Features + +- **Multi-format Support**: Store CSV, JSON, binary files, and PCAP files +- **Flexible Storage**: Switch between AWS S3 and self-hosted MinIO +- **Rich Metadata**: Track test configurations, results, and custom metadata +- **Powerful Querying**: Query artifacts by test name, suite, result, tags, date ranges, and more +- **RESTful API**: Clean REST API with automatic OpenAPI documentation +- **Cloud-Native**: Fully containerized with Docker and Kubernetes/Helm support +- **Production-Ready**: Includes GitLab CI/CD pipeline for automated deployments + +## Architecture + +``` +┌─────────────┐ +│ FastAPI │ ← REST API +│ Backend │ +└──────┬──────┘ + │ + ├─────────┐ + ↓ ↓ +┌──────────┐ ┌────────────┐ +│PostgreSQL│ │ S3/MinIO │ +│(Metadata)│ │ (Blobs) │ +└──────────┘ └────────────┘ +``` + +- **PostgreSQL**: Stores artifact metadata, test configs, and query indexes +- **S3/MinIO**: Stores actual file contents (blob storage) +- **FastAPI**: Async REST API for uploads, downloads, and queries + +## Quick Start + +### Using Docker Compose (Recommended) + +1. Clone the repository: +```bash +git clone +cd datalake +``` + +2. Copy environment configuration: +```bash +cp .env.example .env +``` + +3. Start all services: +```bash +docker-compose up -d +``` + +4. Access the API: +- API: http://localhost:8000 +- API Docs: http://localhost:8000/docs +- MinIO Console: http://localhost:9001 + +### Using Python Directly + +1. Install dependencies: +```bash +pip install -r requirements.txt +``` + +2. Set up PostgreSQL and MinIO/S3 + +3. Configure environment variables in `.env` + +4. Run the application: +```bash +python -m uvicorn app.main:app --reload +``` + +## API Usage + +### Upload an Artifact + +```bash +curl -X POST "http://localhost:8000/api/v1/artifacts/upload" \ + -F "file=@test_results.csv" \ + -F "test_name=auth_test" \ + -F "test_suite=integration" \ + -F "test_result=pass" \ + -F 'test_config={"browser":"chrome","timeout":30}' \ + -F 'tags=["regression","smoke"]' \ + -F "description=Authentication test results" +``` + +### Query Artifacts + +```bash +curl -X POST "http://localhost:8000/api/v1/artifacts/query" \ + -H "Content-Type: application/json" \ + -d '{ + "test_suite": "integration", + "test_result": "fail", + "start_date": "2024-01-01T00:00:00", + "limit": 50 + }' +``` + +### Download an Artifact + +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/123/download" \ + -o downloaded_file.csv +``` + +### Get Presigned URL + +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/123/url?expiration=3600" +``` + +### List All Artifacts + +```bash +curl -X GET "http://localhost:8000/api/v1/artifacts/?limit=100&offset=0" +``` + +### Delete an Artifact + +```bash +curl -X DELETE "http://localhost:8000/api/v1/artifacts/123" +``` + +## API Endpoints + +| Method | Endpoint | Description | +|--------|----------|-------------| +| POST | `/api/v1/artifacts/upload` | Upload a new artifact with metadata | +| GET | `/api/v1/artifacts/{id}` | Get artifact metadata by ID | +| GET | `/api/v1/artifacts/{id}/download` | Download artifact file | +| GET | `/api/v1/artifacts/{id}/url` | Get presigned download URL | +| DELETE | `/api/v1/artifacts/{id}` | Delete artifact and file | +| POST | `/api/v1/artifacts/query` | Query artifacts with filters | +| GET | `/api/v1/artifacts/` | List all artifacts (paginated) | +| GET | `/` | API information | +| GET | `/health` | Health check | +| GET | `/docs` | Interactive API documentation | + +## Configuration + +### Environment Variables + +| 
Variable | Description | Default | +|----------|-------------|---------| +| `DATABASE_URL` | PostgreSQL connection string | `postgresql://user:password@localhost:5432/datalake` | +| `STORAGE_BACKEND` | Storage backend (`s3` or `minio`) | `minio` | +| `AWS_ACCESS_KEY_ID` | AWS access key (for S3) | - | +| `AWS_SECRET_ACCESS_KEY` | AWS secret key (for S3) | - | +| `AWS_REGION` | AWS region (for S3) | `us-east-1` | +| `S3_BUCKET_NAME` | S3 bucket name | `test-artifacts` | +| `MINIO_ENDPOINT` | MinIO endpoint | `localhost:9000` | +| `MINIO_ACCESS_KEY` | MinIO access key | `minioadmin` | +| `MINIO_SECRET_KEY` | MinIO secret key | `minioadmin` | +| `MINIO_BUCKET_NAME` | MinIO bucket name | `test-artifacts` | +| `MINIO_SECURE` | Use HTTPS for MinIO | `false` | +| `API_HOST` | API host | `0.0.0.0` | +| `API_PORT` | API port | `8000` | +| `MAX_UPLOAD_SIZE` | Max upload size (bytes) | `524288000` (500MB) | + +### Switching Between S3 and MinIO + +To use AWS S3: +```bash +STORAGE_BACKEND=s3 +AWS_ACCESS_KEY_ID=your_key +AWS_SECRET_ACCESS_KEY=your_secret +AWS_REGION=us-east-1 +S3_BUCKET_NAME=your-bucket +``` + +To use self-hosted MinIO: +```bash +STORAGE_BACKEND=minio +MINIO_ENDPOINT=minio:9000 +MINIO_ACCESS_KEY=minioadmin +MINIO_SECRET_KEY=minioadmin +MINIO_BUCKET_NAME=test-artifacts +``` + +## Deployment + +### Kubernetes with Helm + +1. Build and push Docker image: +```bash +docker build -t your-registry/datalake:latest . +docker push your-registry/datalake:latest +``` + +2. Install with Helm: +```bash +helm install datalake ./helm \ + --set image.repository=your-registry/datalake \ + --set image.tag=latest \ + --namespace datalake \ + --create-namespace +``` + +3. Access the API: +```bash +kubectl port-forward -n datalake svc/datalake 8000:8000 +``` + +### Helm Configuration + +Edit `helm/values.yaml` to customize: +- Replica count +- Resource limits +- Storage backend (S3 vs MinIO) +- Ingress settings +- PostgreSQL settings +- Autoscaling + +### GitLab CI/CD + +The included `.gitlab-ci.yml` provides: +- Automated testing +- Linting +- Docker image builds +- Deployments to dev/staging/prod + +**Required GitLab CI/CD Variables:** +- `CI_REGISTRY_USER`: Docker registry username +- `CI_REGISTRY_PASSWORD`: Docker registry password +- `KUBE_CONFIG_DEV`: Base64-encoded kubeconfig for dev +- `KUBE_CONFIG_STAGING`: Base64-encoded kubeconfig for staging +- `KUBE_CONFIG_PROD`: Base64-encoded kubeconfig for prod + +## Database Schema + +The `artifacts` table stores: +- File metadata (name, type, size, storage path) +- Test information (name, suite, config, result) +- Custom metadata and tags +- Timestamps and versioning + +## Example Use Cases + +### Store Test Results +Upload CSV files containing test execution results with metadata about the test suite and configuration. + +### Archive Packet Captures +Store PCAP files from network tests with tags for easy filtering and retrieval. + +### Track Test Configurations +Upload JSON test configurations and query them by date, test suite, or custom tags. + +### Binary Artifact Storage +Store compiled binaries, test data files, or any binary artifacts with full metadata. 
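+
+The sketch below ties these use cases together: a helper you might drop into a pytest `conftest.py` to push a result file to the data lake after a test run. It is illustrative only — the helper name, suite name, and tags are assumptions, and it expects the API from this README at `http://localhost:8000`:
+
+```python
+# conftest.py (sketch) — upload a result file with minimal metadata.
+import json
+
+import requests
+
+DATALAKE_UPLOAD_URL = "http://localhost:8000/api/v1/artifacts/upload"
+
+
+def upload_artifact(path: str, test_name: str, result: str) -> int:
+    """Upload one file to the data lake and return its artifact ID."""
+    with open(path, "rb") as f:
+        response = requests.post(
+            DATALAKE_UPLOAD_URL,
+            files={"file": f},
+            data={
+                "test_name": test_name,
+                "test_suite": "integration",  # assumed suite name
+                "test_result": result,        # pass, fail, skip, error
+                "tags": json.dumps(["ci", "auto-upload"]),
+            },
+            timeout=30,
+        )
+    response.raise_for_status()
+    return response.json()["id"]
+```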
+ +## Development + +### Running Tests +```bash +pytest tests/ -v +``` + +### Code Formatting +```bash +black app/ +flake8 app/ +``` + +### Database Migrations +```bash +alembic revision --autogenerate -m "description" +alembic upgrade head +``` + +## Troubleshooting + +### Cannot Connect to Database +- Verify PostgreSQL is running +- Check `DATABASE_URL` is correct +- Ensure database exists + +### Cannot Upload Files +- Check storage backend is running (MinIO or S3 accessible) +- Verify credentials are correct +- Check file size is under `MAX_UPLOAD_SIZE` + +### MinIO Connection Failed +- Ensure MinIO service is running +- Verify `MINIO_ENDPOINT` is correct +- Check MinIO credentials + +## License + +[Your License Here] + +## Support + +For issues and questions, please open an issue in the repository. diff --git a/alembic.ini b/alembic.ini new file mode 100644 index 0000000..8b22847 --- /dev/null +++ b/alembic.ini @@ -0,0 +1,41 @@ +[alembic] +script_location = alembic +prepend_sys_path = . +version_path_separator = os + +[alembic:exclude] +tables = spatial_ref_sys + +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/__init__.py b/app/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/artifacts.py b/app/api/artifacts.py new file mode 100644 index 0000000..db49811 --- /dev/null +++ b/app/api/artifacts.py @@ -0,0 +1,242 @@ +from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException, Query +from fastapi.responses import StreamingResponse +from sqlalchemy.orm import Session +from typing import List, Optional +import uuid +import json +import io +from datetime import datetime + +from app.database import get_db +from app.models.artifact import Artifact +from app.schemas.artifact import ArtifactCreate, ArtifactResponse, ArtifactQuery +from app.storage import get_storage_backend + +router = APIRouter(prefix="/api/v1/artifacts", tags=["artifacts"]) + + +def get_file_type(filename: str) -> str: + """Determine file type from filename""" + extension = filename.lower().split('.')[-1] + type_mapping = { + 'csv': 'csv', + 'json': 'json', + 'pcap': 'pcap', + 'pcapng': 'pcap', + 'bin': 'binary', + 'dat': 'binary', + } + return type_mapping.get(extension, 'binary') + + +@router.post("/upload", response_model=ArtifactResponse, status_code=201) +async def upload_artifact( + file: UploadFile = File(...), + test_name: Optional[str] = Form(None), + test_suite: Optional[str] = Form(None), + test_config: Optional[str] = Form(None), + test_result: Optional[str] = Form(None), + metadata: Optional[str] = Form(None), + description: Optional[str] = Form(None), + tags: Optional[str] = Form(None), + version: Optional[str] = Form(None), + parent_id: Optional[int] = Form(None), + db: Session = Depends(get_db) +): + """ + Upload a new artifact file with metadata + + - **file**: The file to upload (CSV, JSON, binary, PCAP) + - **test_name**: Name of the test + - **test_suite**: Test suite identifier + - 
**test_config**: JSON string of test configuration
+    - **test_result**: Test result (pass, fail, skip, error)
+    - **metadata**: JSON string of additional metadata
+    - **description**: Text description of the artifact
+    - **tags**: JSON array of tags (as string)
+    - **version**: Version identifier
+    - **parent_id**: ID of parent artifact (for versioning)
+    """
+    try:
+        # Parse JSON fields
+        test_config_dict = json.loads(test_config) if test_config else None
+        metadata_dict = json.loads(metadata) if metadata else None
+        tags_list = json.loads(tags) if tags else None
+
+        # Generate unique storage path
+        file_extension = file.filename.split('.')[-1] if '.' in file.filename else ''
+        object_name = f"{uuid.uuid4()}.{file_extension}" if file_extension else str(uuid.uuid4())
+
+        # Upload to storage backend
+        storage = get_storage_backend()
+        file_content = await file.read()
+        file_size = len(file_content)
+
+        storage_path = await storage.upload_file(
+            io.BytesIO(file_content),
+            object_name
+        )
+
+        # Create database record
+        artifact = Artifact(
+            filename=file.filename,
+            file_type=get_file_type(file.filename),
+            file_size=file_size,
+            storage_path=storage_path,
+            content_type=file.content_type,
+            test_name=test_name,
+            test_suite=test_suite,
+            test_config=test_config_dict,
+            test_result=test_result,
+            # "metadata" is reserved by SQLAlchemy's declarative API, so the
+            # mapped attribute is artifact_metadata (DB column stays "metadata").
+            artifact_metadata=metadata_dict,
+            description=description,
+            tags=tags_list,
+            version=version,
+            parent_id=parent_id
+        )
+
+        db.add(artifact)
+        db.commit()
+        db.refresh(artifact)
+
+        return artifact
+
+    except json.JSONDecodeError as e:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON in metadata fields: {str(e)}")
+    except Exception as e:
+        db.rollback()
+        raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}")
+
+
+@router.get("/{artifact_id}", response_model=ArtifactResponse)
+async def get_artifact(artifact_id: int, db: Session = Depends(get_db)):
+    """Get artifact metadata by ID"""
+    artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first()
+    if not artifact:
+        raise HTTPException(status_code=404, detail="Artifact not found")
+    return artifact
+
+
+@router.get("/{artifact_id}/download")
+async def download_artifact(artifact_id: int, db: Session = Depends(get_db)):
+    """Download artifact file by ID"""
+    artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first()
+    if not artifact:
+        raise HTTPException(status_code=404, detail="Artifact not found")
+
+    try:
+        storage = get_storage_backend()
+        # Extract object name from storage path
+        object_name = artifact.storage_path.split('/')[-1]
+        file_data = await storage.download_file(object_name)
+
+        return StreamingResponse(
+            io.BytesIO(file_data),
+            media_type=artifact.content_type or "application/octet-stream",
+            headers={
+                "Content-Disposition": f'attachment; filename="{artifact.filename}"'
+            }
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Download failed: {str(e)}")
+
+
+@router.get("/{artifact_id}/url")
+async def get_artifact_url(
+    artifact_id: int,
+    expiration: int = Query(default=3600, ge=60, le=86400),
+    db: Session = Depends(get_db)
+):
+    """Get presigned URL for artifact download"""
+    artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first()
+    if not artifact:
+        raise HTTPException(status_code=404, detail="Artifact not found")
+
+    try:
+        storage = get_storage_backend()
+        object_name = artifact.storage_path.split('/')[-1]
+        url = await storage.get_file_url(object_name, expiration)
+        return {"url": url, "expires_in": expiration}
+    except 
Exception as e: + raise HTTPException(status_code=500, detail=f"Failed to generate URL: {str(e)}") + + +@router.delete("/{artifact_id}") +async def delete_artifact(artifact_id: int, db: Session = Depends(get_db)): + """Delete artifact and its file""" + artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first() + if not artifact: + raise HTTPException(status_code=404, detail="Artifact not found") + + try: + # Delete from storage + storage = get_storage_backend() + object_name = artifact.storage_path.split('/')[-1] + await storage.delete_file(object_name) + + # Delete from database + db.delete(artifact) + db.commit() + + return {"message": "Artifact deleted successfully"} + except Exception as e: + db.rollback() + raise HTTPException(status_code=500, detail=f"Delete failed: {str(e)}") + + +@router.post("/query", response_model=List[ArtifactResponse]) +async def query_artifacts(query: ArtifactQuery, db: Session = Depends(get_db)): + """ + Query artifacts with filters + + - **filename**: Filter by filename (partial match) + - **file_type**: Filter by file type + - **test_name**: Filter by test name + - **test_suite**: Filter by test suite + - **test_result**: Filter by test result + - **tags**: Filter by tags (must contain all specified tags) + - **start_date**: Filter by creation date (from) + - **end_date**: Filter by creation date (to) + - **limit**: Maximum number of results + - **offset**: Number of results to skip + """ + q = db.query(Artifact) + + if query.filename: + q = q.filter(Artifact.filename.ilike(f"%{query.filename}%")) + if query.file_type: + q = q.filter(Artifact.file_type == query.file_type) + if query.test_name: + q = q.filter(Artifact.test_name.ilike(f"%{query.test_name}%")) + if query.test_suite: + q = q.filter(Artifact.test_suite == query.test_suite) + if query.test_result: + q = q.filter(Artifact.test_result == query.test_result) + if query.tags: + for tag in query.tags: + q = q.filter(Artifact.tags.contains([tag])) + if query.start_date: + q = q.filter(Artifact.created_at >= query.start_date) + if query.end_date: + q = q.filter(Artifact.created_at <= query.end_date) + + # Order by creation date descending + q = q.order_by(Artifact.created_at.desc()) + + # Apply pagination + artifacts = q.offset(query.offset).limit(query.limit).all() + + return artifacts + + +@router.get("/", response_model=List[ArtifactResponse]) +async def list_artifacts( + limit: int = Query(default=100, le=1000), + offset: int = Query(default=0, ge=0), + db: Session = Depends(get_db) +): + """List all artifacts with pagination""" + artifacts = db.query(Artifact).order_by( + Artifact.created_at.desc() + ).offset(offset).limit(limit).all() + return artifacts diff --git a/app/config.py b/app/config.py new file mode 100644 index 0000000..243cb0f --- /dev/null +++ b/app/config.py @@ -0,0 +1,35 @@ +from pydantic_settings import BaseSettings +from typing import Literal + + +class Settings(BaseSettings): + # Database + database_url: str = "postgresql://user:password@localhost:5432/datalake" + + # Storage Backend + storage_backend: Literal["s3", "minio"] = "minio" + + # AWS S3 + aws_access_key_id: str = "" + aws_secret_access_key: str = "" + aws_region: str = "us-east-1" + s3_bucket_name: str = "test-artifacts" + + # MinIO + minio_endpoint: str = "localhost:9000" + minio_access_key: str = "minioadmin" + minio_secret_key: str = "minioadmin" + minio_bucket_name: str = "test-artifacts" + minio_secure: bool = False + + # Application + api_host: str = "0.0.0.0" + api_port: int = 8000 + 
max_upload_size: int = 524288000 # 500MB + + class Config: + env_file = ".env" + case_sensitive = False + + +settings = Settings() diff --git a/app/database.py b/app/database.py new file mode 100644 index 0000000..b53ae14 --- /dev/null +++ b/app/database.py @@ -0,0 +1,21 @@ +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from app.config import settings +from app.models.artifact import Base + +engine = create_engine(settings.database_url) +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + + +def init_db(): + """Initialize database tables""" + Base.metadata.create_all(bind=engine) + + +def get_db(): + """Dependency for getting database session""" + db = SessionLocal() + try: + yield db + finally: + db.close() diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000..e15a340 --- /dev/null +++ b/app/main.py @@ -0,0 +1,71 @@ +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from app.api.artifacts import router as artifacts_router +from app.database import init_db +from app.config import settings +import logging + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) + +logger = logging.getLogger(__name__) + +# Create FastAPI app +app = FastAPI( + title="Test Artifact Data Lake", + description="API for storing and querying test artifacts including CSV, JSON, binary files, and packet captures", + version="1.0.0", + docs_url="/docs", + redoc_url="/redoc" +) + +# Configure CORS +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Include routers +app.include_router(artifacts_router) + + +@app.on_event("startup") +async def startup_event(): + """Initialize database on startup""" + logger.info("Initializing database...") + init_db() + logger.info(f"Using storage backend: {settings.storage_backend}") + logger.info("Application started successfully") + + +@app.get("/") +async def root(): + """Root endpoint""" + return { + "message": "Test Artifact Data Lake API", + "version": "1.0.0", + "docs": "/docs", + "storage_backend": settings.storage_backend + } + + +@app.get("/health") +async def health_check(): + """Health check endpoint""" + return {"status": "healthy"} + + +if __name__ == "__main__": + import uvicorn + uvicorn.run( + "app.main:app", + host=settings.api_host, + port=settings.api_port, + reload=True + ) diff --git a/app/models/__init__.py b/app/models/__init__.py new file mode 100644 index 0000000..8e5cb1a --- /dev/null +++ b/app/models/__init__.py @@ -0,0 +1,3 @@ +from .artifact import Artifact + +__all__ = ["Artifact"] diff --git a/app/models/artifact.py b/app/models/artifact.py new file mode 100644 index 0000000..caa31c5 --- /dev/null +++ b/app/models/artifact.py @@ -0,0 +1,38 @@ +from sqlalchemy import Column, String, Integer, DateTime, JSON, BigInteger, Text +from sqlalchemy.ext.declarative import declarative_base +from datetime import datetime + +Base = declarative_base() + + +class Artifact(Base): + __tablename__ = "artifacts" + + id = Column(Integer, primary_key=True, index=True) + filename = Column(String(500), nullable=False, index=True) + file_type = Column(String(50), nullable=False, index=True) # csv, json, binary, pcap + file_size = Column(BigInteger, nullable=False) + storage_path = Column(String(1000), nullable=False) + content_type = Column(String(100)) + + # Test metadata + test_name = Column(String(500), 
index=True)
+    test_suite = Column(String(500), index=True)
+    test_config = Column(JSON)
+    test_result = Column(String(50), index=True)  # pass, fail, skip, error
+
+    # Additional metadata
+    artifact_metadata = Column("metadata", JSON)  # "metadata" is reserved by SQLAlchemy's Declarative API; attribute renamed, column name kept
+    description = Column(Text)
+    tags = Column(JSON)  # Array of tags for categorization
+
+    # Timestamps
+    created_at = Column(DateTime, default=datetime.utcnow, index=True)
+    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+
+    # Versioning
+    version = Column(String(50))
+    parent_id = Column(Integer, index=True)  # For file versioning
+
+    def __repr__(self):
+        return f"<Artifact(id={self.id}, filename='{self.filename}', file_type='{self.file_type}')>"
diff --git a/app/schemas/__init__.py b/app/schemas/__init__.py
new file mode 100644
index 0000000..f6e77b9
--- /dev/null
+++ b/app/schemas/__init__.py
@@ -0,0 +1,3 @@
+from .artifact import ArtifactCreate, ArtifactResponse, ArtifactQuery
+
+__all__ = ["ArtifactCreate", "ArtifactResponse", "ArtifactQuery"]
diff --git a/app/schemas/artifact.py b/app/schemas/artifact.py
new file mode 100644
index 0000000..fbce37b
--- /dev/null
+++ b/app/schemas/artifact.py
@@ -0,0 +1,55 @@
+from pydantic import AliasChoices, BaseModel, Field
+from typing import Optional, Dict, Any, List
+from datetime import datetime
+
+
+class ArtifactCreate(BaseModel):
+    test_name: Optional[str] = None
+    test_suite: Optional[str] = None
+    test_config: Optional[Dict[str, Any]] = None
+    test_result: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+    description: Optional[str] = None
+    tags: Optional[List[str]] = None
+    version: Optional[str] = None
+    parent_id: Optional[int] = None
+
+
+class ArtifactResponse(BaseModel):
+    id: int
+    filename: str
+    file_type: str
+    file_size: int
+    storage_path: str
+    content_type: Optional[str] = None
+    test_name: Optional[str] = None
+    test_suite: Optional[str] = None
+    test_config: Optional[Dict[str, Any]] = None
+    test_result: Optional[str] = None
+    # The ORM attribute is Artifact.artifact_metadata (see app/models/artifact.py);
+    # accept either name so validation works from ORM objects and plain dicts
+    metadata: Optional[Dict[str, Any]] = Field(
+        default=None, validation_alias=AliasChoices("artifact_metadata", "metadata")
+    )
+    description: Optional[str] = None
+    tags: Optional[List[str]] = None
+    created_at: datetime
+    updated_at: datetime
+    version: Optional[str] = None
+    parent_id: Optional[int] = None
+
+    class Config:
+        from_attributes = True
+
+
+class ArtifactQuery(BaseModel):
+    filename: Optional[str] = None
+    file_type: Optional[str] = None
+    test_name: Optional[str] = None
+    test_suite: Optional[str] = None
+    test_result: Optional[str] = None
+    tags: Optional[List[str]] = None
+    start_date: Optional[datetime] = None
+    end_date: Optional[datetime] = None
+    limit: int = Field(default=100, le=1000)
+    offset: int = Field(default=0, ge=0)
diff --git a/app/storage/__init__.py b/app/storage/__init__.py
new file mode 100644
index 0000000..6a50500
--- /dev/null
+++ b/app/storage/__init__.py
@@ -0,0 +1,6 @@
+from .base import StorageBackend
+from .s3_backend import S3Backend
+from .minio_backend import MinIOBackend
+from .factory import get_storage_backend
+
+__all__ = ["StorageBackend", "S3Backend", "MinIOBackend", "get_storage_backend"]
diff --git a/app/storage/base.py b/app/storage/base.py
new file mode 100644
index 0000000..f199760
--- /dev/null
+++ b/app/storage/base.py
@@ -0,0 +1,77 @@
+from abc import ABC, abstractmethod
+from typing import BinaryIO
+
+
+class StorageBackend(ABC):
+    """Abstract base class for storage backends"""
+
+    @abstractmethod
+    async def upload_file(self, file_data: BinaryIO, object_name: str) -> str:
+        """
+        Upload a file to storage
+
+        Args:
+            file_data: Binary file data
+            object_name: Name/path of the object in storage
+
+        Returns:
+            Storage path/URL of uploaded file
+        """
+        pass
+
+    
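# NOTE: every method on this interface is a coroutine so API handlers can
+    # await storage calls uniformly; the boto3-based implementations run
+    # blocking client calls inside these coroutines.
+
+    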
@abstractmethod + async def download_file(self, object_name: str) -> bytes: + """ + Download a file from storage + + Args: + object_name: Name/path of the object in storage + + Returns: + Binary file data + """ + pass + + @abstractmethod + async def delete_file(self, object_name: str) -> bool: + """ + Delete a file from storage + + Args: + object_name: Name/path of the object in storage + + Returns: + True if successful + """ + pass + + @abstractmethod + async def file_exists(self, object_name: str) -> bool: + """ + Check if a file exists in storage + + Args: + object_name: Name/path of the object in storage + + Returns: + True if file exists + """ + pass + + @abstractmethod + async def get_file_url(self, object_name: str, expiration: int = 3600) -> str: + """ + Get a presigned URL for downloading a file + + Args: + object_name: Name/path of the object in storage + expiration: URL expiration time in seconds + + Returns: + Presigned URL + """ + pass diff --git a/app/storage/factory.py b/app/storage/factory.py new file mode 100644 index 0000000..adb99a3 --- /dev/null +++ b/app/storage/factory.py @@ -0,0 +1,17 @@ +from app.storage.base import StorageBackend +from app.storage.s3_backend import S3Backend +from app.storage.minio_backend import MinIOBackend +from app.config import settings + + +def get_storage_backend() -> StorageBackend: + """ + Factory function to get the appropriate storage backend + based on configuration + """ + if settings.storage_backend == "s3": + return S3Backend() + elif settings.storage_backend == "minio": + return MinIOBackend() + else: + raise ValueError(f"Unsupported storage backend: {settings.storage_backend}") diff --git a/app/storage/minio_backend.py b/app/storage/minio_backend.py new file mode 100644 index 0000000..d1b2d29 --- /dev/null +++ b/app/storage/minio_backend.py @@ -0,0 +1,88 @@ +import boto3 +from botocore.exceptions import ClientError +from botocore.client import Config +from typing import BinaryIO +from app.storage.base import StorageBackend +from app.config import settings +import logging + +logger = logging.getLogger(__name__) + + +class MinIOBackend(StorageBackend): + """MinIO storage backend implementation (S3-compatible)""" + + def __init__(self): + # MinIO uses S3-compatible API + self.s3_client = boto3.client( + 's3', + endpoint_url=f"{'https' if settings.minio_secure else 'http'}://{settings.minio_endpoint}", + aws_access_key_id=settings.minio_access_key, + aws_secret_access_key=settings.minio_secret_key, + config=Config(signature_version='s3v4'), + region_name='us-east-1' + ) + self.bucket_name = settings.minio_bucket_name + self._ensure_bucket_exists() + + def _ensure_bucket_exists(self): + """Create bucket if it doesn't exist""" + try: + self.s3_client.head_bucket(Bucket=self.bucket_name) + except ClientError as e: + error_code = e.response['Error']['Code'] + if error_code == '404': + try: + self.s3_client.create_bucket(Bucket=self.bucket_name) + logger.info(f"Created MinIO bucket: {self.bucket_name}") + except ClientError as create_error: + logger.error(f"Failed to create bucket: {create_error}") + raise + + async def upload_file(self, file_data: BinaryIO, object_name: str) -> str: + """Upload file to MinIO""" + try: + self.s3_client.upload_fileobj(file_data, self.bucket_name, object_name) + return f"minio://{self.bucket_name}/{object_name}" + except ClientError as e: + logger.error(f"Failed to upload file to MinIO: {e}") + raise + + async def download_file(self, object_name: str) -> bytes: + """Download file from MinIO""" + try: + 
response = self.s3_client.get_object(Bucket=self.bucket_name, Key=object_name)
+            return response['Body'].read()
+        except ClientError as e:
+            logger.error(f"Failed to download file from MinIO: {e}")
+            raise
+
+    async def delete_file(self, object_name: str) -> bool:
+        """Delete file from MinIO"""
+        try:
+            self.s3_client.delete_object(Bucket=self.bucket_name, Key=object_name)
+            return True
+        except ClientError as e:
+            logger.error(f"Failed to delete file from MinIO: {e}")
+            return False
+
+    async def file_exists(self, object_name: str) -> bool:
+        """Check if file exists in MinIO"""
+        try:
+            self.s3_client.head_object(Bucket=self.bucket_name, Key=object_name)
+            return True
+        except ClientError:
+            return False
+
+    async def get_file_url(self, object_name: str, expiration: int = 3600) -> str:
+        """Generate presigned URL for MinIO object"""
+        try:
+            url = self.s3_client.generate_presigned_url(
+                'get_object',
+                Params={'Bucket': self.bucket_name, 'Key': object_name},
+                ExpiresIn=expiration
+            )
+            return url
+        except ClientError as e:
+            logger.error(f"Failed to generate presigned URL: {e}")
+            raise
diff --git a/app/storage/s3_backend.py b/app/storage/s3_backend.py
new file mode 100644
index 0000000..066954a
--- /dev/null
+++ b/app/storage/s3_backend.py
@@ -0,0 +1,92 @@
+import boto3
+from botocore.exceptions import ClientError
+from typing import BinaryIO
+from app.storage.base import StorageBackend
+from app.config import settings
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class S3Backend(StorageBackend):
+    """AWS S3 storage backend implementation"""
+
+    def __init__(self):
+        self.s3_client = boto3.client(
+            's3',
+            aws_access_key_id=settings.aws_access_key_id,
+            aws_secret_access_key=settings.aws_secret_access_key,
+            region_name=settings.aws_region
+        )
+        self.bucket_name = settings.s3_bucket_name
+        self._ensure_bucket_exists()
+
+    def _ensure_bucket_exists(self):
+        """Create bucket if it doesn't exist"""
+        try:
+            self.s3_client.head_bucket(Bucket=self.bucket_name)
+        except ClientError as e:
+            error_code = e.response['Error']['Code']
+            if error_code == '404':
+                try:
+                    # us-east-1 is the default region and rejects an explicit
+                    # LocationConstraint; only pass one for other regions
+                    if settings.aws_region == 'us-east-1':
+                        self.s3_client.create_bucket(Bucket=self.bucket_name)
+                    else:
+                        self.s3_client.create_bucket(
+                            Bucket=self.bucket_name,
+                            CreateBucketConfiguration={'LocationConstraint': settings.aws_region}
+                        )
+                    logger.info(f"Created S3 bucket: {self.bucket_name}")
+                except ClientError as create_error:
+                    logger.error(f"Failed to create bucket: {create_error}")
+                    raise
+
+    async def upload_file(self, file_data: BinaryIO, object_name: str) -> str:
+        """Upload file to S3"""
+        try:
+            self.s3_client.upload_fileobj(file_data, self.bucket_name, object_name)
+            return f"s3://{self.bucket_name}/{object_name}"
+        except ClientError as e:
+            logger.error(f"Failed to upload file to S3: {e}")
+            raise
+
+    async def download_file(self, object_name: str) -> bytes:
+        """Download file from S3"""
+        try:
+            response = self.s3_client.get_object(Bucket=self.bucket_name, Key=object_name)
+            return response['Body'].read()
+        except ClientError as e:
+            logger.error(f"Failed to download file from S3: {e}")
+            raise
+
+    async def delete_file(self, object_name: str) -> bool:
+        """Delete file from S3"""
+        try:
+            self.s3_client.delete_object(Bucket=self.bucket_name, Key=object_name)
+            return True
+        except ClientError as e:
+            logger.error(f"Failed to delete file from S3: {e}")
+            return False
+
+    async def file_exists(self, object_name: str) -> bool:
+        """Check if file exists in S3"""
+        try:
+            self.s3_client.head_object(Bucket=self.bucket_name, Key=object_name)
+            return True
+        except ClientError:
+            return False
+
+    async def 
get_file_url(self, object_name: str, expiration: int = 3600) -> str:
+        """Generate presigned URL for S3 object"""
+        try:
+            url = self.s3_client.generate_presigned_url(
+                'get_object',
+                Params={'Bucket': self.bucket_name, 'Key': object_name},
+                ExpiresIn=expiration
+            )
+            return url
+        except ClientError as e:
+            logger.error(f"Failed to generate presigned URL: {e}")
+            raise
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..4807e7d
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,63 @@
+version: '3.8'
+
+services:
+  postgres:
+    image: postgres:15
+    environment:
+      POSTGRES_USER: user
+      POSTGRES_PASSWORD: password
+      POSTGRES_DB: datalake
+    ports:
+      - "5432:5432"
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U user"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  minio:
+    image: minio/minio:latest
+    command: server /data --console-address ":9001"
+    environment:
+      MINIO_ROOT_USER: minioadmin
+      MINIO_ROOT_PASSWORD: minioadmin
+    ports:
+      - "9000:9000"
+      - "9001:9001"
+    volumes:
+      - minio_data:/data
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  api:
+    build: .
+    ports:
+      - "8000:8000"
+    environment:
+      DATABASE_URL: postgresql://user:password@postgres:5432/datalake
+      STORAGE_BACKEND: minio
+      MINIO_ENDPOINT: minio:9000
+      MINIO_ACCESS_KEY: minioadmin
+      MINIO_SECRET_KEY: minioadmin
+      MINIO_BUCKET_NAME: test-artifacts
+      MINIO_SECURE: "false"
+    depends_on:
+      postgres:
+        condition: service_healthy
+      minio:
+        condition: service_healthy
+    healthcheck:
+      # stdlib urllib is used here because requests is not in requirements.txt
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+volumes:
+  postgres_data:
+  minio_data:
diff --git a/helm/Chart.yaml b/helm/Chart.yaml
new file mode 100644
index 0000000..ce650b4
--- /dev/null
+++ b/helm/Chart.yaml
@@ -0,0 +1,13 @@
+apiVersion: v2
+name: datalake
+description: Test Artifact Data Lake - Store and query test artifacts
+type: application
+version: 1.0.0
+appVersion: "1.0.0"
+keywords:
+  - testing
+  - artifacts
+  - storage
+  - data-lake
+maintainers:
+  - name: Your Team
diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl
new file mode 100644
index 0000000..0ebb04f
--- /dev/null
+++ b/helm/templates/_helpers.tpl
@@ -0,0 +1,60 @@
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "datalake.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully qualified app name.
+*/}}
+{{- define "datalake.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "datalake.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "datalake.labels" -}}
+helm.sh/chart: {{ include "datalake.chart" . }}
+{{ include "datalake.selectorLabels" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "datalake.selectorLabels" -}} +app.kubernetes.io/name: {{ include "datalake.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "datalake.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "datalake.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/helm/templates/deployment.yaml b/helm/templates/deployment.yaml new file mode 100644 index 0000000..652d855 --- /dev/null +++ b/helm/templates/deployment.yaml @@ -0,0 +1,111 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "datalake.fullname" . }} + labels: + {{- include "datalake.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "datalake.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "datalake.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "datalake.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: http + containerPort: {{ .Values.service.targetPort }} + protocol: TCP + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + env: + - name: DATABASE_URL + valueFrom: + secretKeyRef: + name: {{ include "datalake.fullname" . }}-secrets + key: database-url + - name: STORAGE_BACKEND + value: {{ .Values.config.storageBackend | quote }} + - name: MAX_UPLOAD_SIZE + value: {{ .Values.config.maxUploadSize | quote }} + {{- if eq .Values.config.storageBackend "s3" }} + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {{ include "datalake.fullname" . }}-secrets + key: aws-access-key-id + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{ include "datalake.fullname" . }}-secrets + key: aws-secret-access-key + - name: AWS_REGION + value: {{ .Values.aws.region | quote }} + - name: S3_BUCKET_NAME + value: {{ .Values.aws.bucketName | quote }} + {{- else }} + - name: MINIO_ENDPOINT + value: "{{ include "datalake.fullname" . }}-minio:9000" + - name: MINIO_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{ include "datalake.fullname" . }}-secrets + key: minio-access-key + - name: MINIO_SECRET_KEY + valueFrom: + secretKeyRef: + name: {{ include "datalake.fullname" . }}-secrets + key: minio-secret-key + - name: MINIO_BUCKET_NAME + value: "test-artifacts" + - name: MINIO_SECURE + value: "false" + {{- end }} + {{- with .Values.env }} + {{- toYaml . 
| nindent 12 }}
+          {{- end }}
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
+      {{- with .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
diff --git a/helm/templates/ingress.yaml b/helm/templates/ingress.yaml
new file mode 100644
index 0000000..30c1764
--- /dev/null
+++ b/helm/templates/ingress.yaml
@@ -0,0 +1,41 @@
+{{- if .Values.ingress.enabled -}}
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: {{ include "datalake.fullname" . }}
+  labels:
+    {{- include "datalake.labels" . | nindent 4 }}
+  {{- with .Values.ingress.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  {{- if .Values.ingress.className }}
+  ingressClassName: {{ .Values.ingress.className }}
+  {{- end }}
+  {{- if .Values.ingress.tls }}
+  tls:
+    {{- range .Values.ingress.tls }}
+    - hosts:
+        {{- range .hosts }}
+        - {{ . | quote }}
+        {{- end }}
+      secretName: {{ .secretName }}
+    {{- end }}
+  {{- end }}
+  rules:
+    {{- range .Values.ingress.hosts }}
+    - host: {{ .host | quote }}
+      http:
+        paths:
+          {{- range .paths }}
+          - path: {{ .path }}
+            pathType: {{ .pathType }}
+            backend:
+              service:
+                name: {{ include "datalake.fullname" $ }}
+                port:
+                  number: {{ $.Values.service.port }}
+          {{- end }}
+    {{- end }}
+{{- end }}
diff --git a/helm/templates/secrets.yaml b/helm/templates/secrets.yaml
new file mode 100644
index 0000000..d132fc1
--- /dev/null
+++ b/helm/templates/secrets.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: {{ include "datalake.fullname" . }}-secrets
+  labels:
+    {{- include "datalake.labels" . | nindent 4 }}
+type: Opaque
+stringData:
+  database-url: "postgresql://{{ .Values.postgresql.auth.username }}:{{ .Values.postgresql.auth.password }}@{{ include "datalake.fullname" . }}-postgresql:5432/{{ .Values.postgresql.auth.database }}"
+  {{- if .Values.aws.enabled }}
+  aws-access-key-id: {{ .Values.aws.accessKeyId | quote }}
+  aws-secret-access-key: {{ .Values.aws.secretAccessKey | quote }}
+  {{- else }}
+  minio-access-key: {{ .Values.minio.rootUser | quote }}
+  minio-secret-key: {{ .Values.minio.rootPassword | quote }}
+  {{- end }}
diff --git a/helm/templates/service.yaml b/helm/templates/service.yaml
new file mode 100644
index 0000000..f23d030
--- /dev/null
+++ b/helm/templates/service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "datalake.fullname" . }}
+  labels:
+    {{- include "datalake.labels" . | nindent 4 }}
+spec:
+  type: {{ .Values.service.type }}
+  ports:
+    - port: {{ .Values.service.port }}
+      targetPort: http
+      protocol: TCP
+      name: http
+  selector:
+    {{- include "datalake.selectorLabels" . | nindent 4 }}
diff --git a/helm/templates/serviceaccount.yaml b/helm/templates/serviceaccount.yaml
new file mode 100644
index 0000000..2e7472b
--- /dev/null
+++ b/helm/templates/serviceaccount.yaml
@@ -0,0 +1,12 @@
+{{- if .Values.serviceAccount.create -}}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "datalake.serviceAccountName" . }}
+  labels:
+    {{- include "datalake.labels" . | nindent 4 }}
+  {{- with .Values.serviceAccount.annotations }}
+  annotations:
+    {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} diff --git a/helm/values.yaml b/helm/values.yaml new file mode 100644 index 0000000..c468fb2 --- /dev/null +++ b/helm/values.yaml @@ -0,0 +1,111 @@ +replicaCount: 1 + +image: + repository: datalake + pullPolicy: IfNotPresent + tag: "latest" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + create: true + annotations: {} + name: "" + +podAnnotations: {} + +podSecurityContext: + fsGroup: 1000 + +securityContext: + capabilities: + drop: + - ALL + readOnlyRootFilesystem: false + runAsNonRoot: true + runAsUser: 1000 + +service: + type: ClusterIP + port: 8000 + targetPort: 8000 + +ingress: + enabled: false + className: "" + annotations: {} + hosts: + - host: datalake.local + paths: + - path: / + pathType: Prefix + tls: [] + +resources: + limits: + cpu: 1000m + memory: 1Gi + requests: + cpu: 500m + memory: 512Mi + +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 10 + targetCPUUtilizationPercentage: 80 + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +# Application configuration +config: + storageBackend: minio # or "s3" + maxUploadSize: 524288000 # 500MB + +# PostgreSQL configuration +postgresql: + enabled: true + auth: + username: user + password: password + database: datalake + primary: + persistence: + enabled: true + size: 10Gi + +# MinIO configuration (for self-hosted storage) +minio: + enabled: true + mode: standalone + rootUser: minioadmin + rootPassword: minioadmin + persistence: + enabled: true + size: 50Gi + service: + type: ClusterIP + port: 9000 + consoleService: + port: 9001 + +# AWS S3 configuration (when using AWS) +aws: + enabled: false + accessKeyId: "" + secretAccessKey: "" + region: us-east-1 + bucketName: test-artifacts + +# Environment variables +env: + - name: API_HOST + value: "0.0.0.0" + - name: API_PORT + value: "8000" diff --git a/quickstart.sh b/quickstart.sh new file mode 100755 index 0000000..e963763 --- /dev/null +++ b/quickstart.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +set -e + +echo "=========================================" +echo "Test Artifact Data Lake - Quick Start" +echo "=========================================" +echo "" + +# Check if Docker is installed +if ! command -v docker &> /dev/null; then + echo "Error: Docker is not installed. Please install Docker first." + exit 1 +fi + +# Check if Docker Compose is installed +if ! command -v docker-compose &> /dev/null; then + echo "Error: Docker Compose is not installed. Please install Docker Compose first." + exit 1 +fi + +# Create .env file if it doesn't exist +if [ ! -f .env ]; then + echo "Creating .env file from .env.example..." + cp .env.example .env + echo "✓ .env file created" +else + echo "✓ .env file already exists" +fi + +echo "" +echo "Starting services with Docker Compose..." +docker-compose up -d + +echo "" +echo "Waiting for services to be ready..." +sleep 10 + +echo "" +echo "=========================================" +echo "Services are running!" +echo "=========================================" +echo "" +echo "API: http://localhost:8000" +echo "API Docs: http://localhost:8000/docs" +echo "MinIO Console: http://localhost:9001" +echo " Username: minioadmin" +echo " Password: minioadmin" +echo "" +echo "To view logs: docker-compose logs -f" +echo "To stop: docker-compose down" +echo "" +echo "=========================================" +echo "Testing the API..." 
+echo "=========================================" +echo "" + +# Wait a bit more for API to be fully ready +sleep 5 + +# Test health endpoint +if curl -s http://localhost:8000/health | grep -q "healthy"; then + echo "✓ API is healthy!" + echo "" + echo "Example: Upload a test file" + echo "----------------------------" + echo 'echo "test,data" > test.csv' + echo 'curl -X POST "http://localhost:8000/api/v1/artifacts/upload" \' + echo ' -F "file=@test.csv" \' + echo ' -F "test_name=sample_test" \' + echo ' -F "test_suite=demo" \' + echo ' -F "test_result=pass"' + echo "" +else + echo "⚠ API is not responding yet. Please wait a moment and check http://localhost:8000/health" +fi + +echo "=========================================" +echo "Setup complete! 🚀" +echo "=========================================" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e40ffcb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +fastapi==0.115.0 +uvicorn[standard]==0.31.0 +python-multipart==0.0.12 +sqlalchemy==2.0.35 +psycopg2-binary==2.9.9 +alembic==1.13.3 +boto3==1.35.36 +python-dotenv==1.0.1 +pydantic==2.9.2 +pydantic-settings==2.5.2 +aiofiles==24.1.0 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_api.py b/tests/test_api.py new file mode 100644 index 0000000..d3fd748 --- /dev/null +++ b/tests/test_api.py @@ -0,0 +1,38 @@ +import pytest +from fastapi.testclient import TestClient +from app.main import app + +client = TestClient(app) + + +def test_root(): + """Test root endpoint""" + response = client.get("/") + assert response.status_code == 200 + data = response.json() + assert "message" in data + assert "version" in data + + +def test_health(): + """Test health check endpoint""" + response = client.get("/health") + assert response.status_code == 200 + data = response.json() + assert data["status"] == "healthy" + + +# Add more tests as needed +# def test_upload_artifact(): +# """Test artifact upload""" +# files = {"file": ("test.csv", b"test,data\n1,2", "text/csv")} +# data = { +# "test_name": "sample_test", +# "test_suite": "unit", +# "test_result": "pass" +# } +# response = client.post("/api/v1/artifacts/upload", files=files, data=data) +# assert response.status_code == 201 +# artifact = response.json() +# assert artifact["filename"] == "test.csv" +# assert artifact["test_name"] == "sample_test"