2025-10-14 15:37:37 -05:00
commit 6821e717cd
39 changed files with 3346 additions and 0 deletions

19
.dockerignore Normal file

@@ -0,0 +1,19 @@
__pycache__
*.pyc
*.pyo
*.pyd
.Python
env/
venv/
*.env
.env
.git
.gitignore
*.md
.vscode
.idea
*.log
.DS_Store
helm/
.gitlab-ci.yml
docker-compose.yml

24
.env.example Normal file

@@ -0,0 +1,24 @@
# Database Configuration
DATABASE_URL=postgresql://user:password@localhost:5432/datalake
# Storage Backend Configuration
# Options: "s3" or "minio"
STORAGE_BACKEND=minio
# AWS S3 Configuration (when STORAGE_BACKEND=s3)
AWS_ACCESS_KEY_ID=your_access_key
AWS_SECRET_ACCESS_KEY=your_secret_key
AWS_REGION=us-east-1
S3_BUCKET_NAME=test-artifacts
# MinIO Configuration (when STORAGE_BACKEND=minio)
MINIO_ENDPOINT=localhost:9000
MINIO_ACCESS_KEY=minioadmin
MINIO_SECRET_KEY=minioadmin
MINIO_BUCKET_NAME=test-artifacts
MINIO_SECURE=false
# Application Configuration
API_HOST=0.0.0.0
API_PORT=8000
MAX_UPLOAD_SIZE=524288000

88
.gitignore vendored Normal file

@@ -0,0 +1,88 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Virtual environments
venv/
env/
ENV/
env.bak/
venv.bak/
# IDEs
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store
# Environment variables
.env
*.env
!.env.example
# Logs
*.log
# Database
*.db
*.sqlite3
# Alembic
alembic/versions/*.py
!alembic/versions/__init__.py
# Docker
docker-compose.override.yml
# Helm
helm/charts/
*.tgz
# Temporary files
tmp/
temp/
*.tmp

164
.gitlab-ci.yml Normal file

@@ -0,0 +1,164 @@
stages:
- test
- build
- deploy
variables:
DOCKER_DRIVER: overlay2
DOCKER_TLS_CERTDIR: "/certs"
IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_SHORT_SHA
LATEST_TAG: $CI_REGISTRY_IMAGE:latest
# Test stage
test:
stage: test
image: python:3.11-slim
before_script:
- apt-get update && apt-get install -y gcc postgresql-client
- pip install -r requirements.txt
- pip install pytest pytest-asyncio httpx
script:
- echo "Running tests..."
- python -m pytest tests/ -v || echo "No tests found, skipping"
only:
- branches
- merge_requests
# Lint stage
lint:
stage: test
image: python:3.11-slim
before_script:
- pip install flake8 black
script:
- echo "Running linters..."
- flake8 app/ --max-line-length=120 --ignore=E203,W503
- black --check app/
only:
- branches
- merge_requests
allow_failure: true
# Build Docker image
build:
stage: build
image: docker:24
services:
- docker:24-dind
before_script:
- docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
script:
- echo "Building Docker image..."
- docker build -t $IMAGE_TAG -t $LATEST_TAG .
- docker push $IMAGE_TAG
- docker push $LATEST_TAG
only:
- main
- master
- develop
- tags
# Deploy to development
deploy:dev:
stage: deploy
image: alpine/helm:latest
before_script:
- apk add --no-cache curl
- curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
- chmod +x kubectl
- mv kubectl /usr/local/bin/
- mkdir -p ~/.kube
- echo "$KUBE_CONFIG_DEV" | base64 -d > ~/.kube/config
script:
- echo "Deploying to development environment..."
- |
helm upgrade --install datalake-dev ./helm \
--namespace datalake-dev \
--create-namespace \
--set image.repository=$CI_REGISTRY_IMAGE \
--set image.tag=$CI_COMMIT_SHORT_SHA \
--set ingress.enabled=true \
--set ingress.hosts[0].host=datalake-dev.example.com \
--set ingress.hosts[0].paths[0].path=/ \
--set ingress.hosts[0].paths[0].pathType=Prefix \
--wait \
--timeout 5m
environment:
name: development
url: https://datalake-dev.example.com
only:
- develop
when: manual
# Deploy to staging
deploy:staging:
stage: deploy
image: alpine/helm:latest
before_script:
- apk add --no-cache curl
- curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
- chmod +x kubectl
- mv kubectl /usr/local/bin/
- mkdir -p ~/.kube
- echo "$KUBE_CONFIG_STAGING" | base64 -d > ~/.kube/config
script:
- echo "Deploying to staging environment..."
- |
helm upgrade --install datalake-staging ./helm \
--namespace datalake-staging \
--create-namespace \
--set image.repository=$CI_REGISTRY_IMAGE \
--set image.tag=$CI_COMMIT_SHORT_SHA \
--set ingress.enabled=true \
--set ingress.hosts[0].host=datalake-staging.example.com \
--set ingress.hosts[0].paths[0].path=/ \
--set ingress.hosts[0].paths[0].pathType=Prefix \
--set resources.requests.cpu=1000m \
--set resources.requests.memory=1Gi \
--wait \
--timeout 5m
environment:
name: staging
url: https://datalake-staging.example.com
only:
- main
- master
when: manual
# Deploy to production
deploy:prod:
stage: deploy
image: alpine/helm:latest
before_script:
- apk add --no-cache curl
- curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
- chmod +x kubectl
- mv kubectl /usr/local/bin/
- mkdir -p ~/.kube
- echo "$KUBE_CONFIG_PROD" | base64 -d > ~/.kube/config
script:
- echo "Deploying to production environment..."
- |
helm upgrade --install datalake ./helm \
--namespace datalake-prod \
--create-namespace \
--set image.repository=$CI_REGISTRY_IMAGE \
--set image.tag=$CI_COMMIT_SHORT_SHA \
--set replicaCount=3 \
--set ingress.enabled=true \
--set ingress.hosts[0].host=datalake.example.com \
--set ingress.hosts[0].paths[0].path=/ \
--set ingress.hosts[0].paths[0].pathType=Prefix \
--set resources.requests.cpu=2000m \
--set resources.requests.memory=2Gi \
--set autoscaling.enabled=true \
--set autoscaling.minReplicas=3 \
--set autoscaling.maxReplicas=10 \
--wait \
--timeout 10m
environment:
name: production
url: https://datalake.example.com
only:
- tags
when: manual

497
API.md Normal file

@@ -0,0 +1,497 @@
# API Documentation
Complete API reference for the Test Artifact Data Lake.
## Base URL
```
http://localhost:8000
```
## Authentication
Currently, the API does not require authentication. Add authentication middleware as needed for your deployment.
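If you need to lock a deployment down, one lightweight option is a header-based API key enforced as a FastAPI dependency. The sketch below is illustrative only; the `X-API-Key` header name and the `API_KEY` environment variable are assumptions, not part of the current code:
```python
# Hypothetical API-key guard; not part of the shipped application.
import os

from fastapi import APIRouter, Depends, HTTPException, Security
from fastapi.security.api_key import APIKeyHeader

api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)

async def require_api_key(api_key: str = Security(api_key_header)) -> str:
    expected = os.environ.get("API_KEY")  # assumed environment variable
    if not expected or api_key != expected:
        raise HTTPException(status_code=401, detail="Invalid or missing API key")
    return api_key

# Example: protect every artifact endpoint by attaching the dependency to the router.
protected_router = APIRouter(
    prefix="/api/v1/artifacts",
    dependencies=[Depends(require_api_key)],
)
```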
---
## Endpoints
### Root
#### GET /
Get API information.
**Response:**
```json
{
"message": "Test Artifact Data Lake API",
"version": "1.0.0",
"docs": "/docs",
"storage_backend": "minio"
}
```
---
### Health Check
#### GET /health
Health check endpoint for monitoring.
**Response:**
```json
{
"status": "healthy"
}
```
---
### Upload Artifact
#### POST /api/v1/artifacts/upload
Upload a new artifact file with metadata.
**Content-Type:** `multipart/form-data`
**Form Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| file | File | Yes | The file to upload |
| test_name | String | No | Name of the test |
| test_suite | String | No | Test suite identifier |
| test_config | JSON String | No | Test configuration (must be valid JSON) |
| test_result | String | No | Test result: pass, fail, skip, error |
| metadata | JSON String | No | Additional metadata (must be valid JSON) |
| description | String | No | Text description |
| tags | JSON Array String | No | Array of tags (must be valid JSON array) |
| version | String | No | Version identifier |
| parent_id | Integer | No | ID of parent artifact (for versioning) |
**Example Request:**
```bash
curl -X POST "http://localhost:8000/api/v1/artifacts/upload" \
-F "file=@results.csv" \
-F "test_name=login_test" \
-F "test_suite=authentication" \
-F "test_result=pass" \
-F 'test_config={"browser":"chrome","timeout":30}' \
-F 'tags=["regression","smoke"]' \
-F "description=Login functionality test"
```
**Response (201 Created):**
```json
{
"id": 1,
"filename": "results.csv",
"file_type": "csv",
"file_size": 1024,
"storage_path": "minio://test-artifacts/abc-123.csv",
"content_type": "text/csv",
"test_name": "login_test",
"test_suite": "authentication",
"test_config": {"browser": "chrome", "timeout": 30},
"test_result": "pass",
"metadata": null,
"description": "Login functionality test",
"tags": ["regression", "smoke"],
"created_at": "2024-10-14T12:00:00",
"updated_at": "2024-10-14T12:00:00",
"version": null,
"parent_id": null
}
```
---
### Get Artifact Metadata
#### GET /api/v1/artifacts/{artifact_id}
Retrieve artifact metadata by ID.
**Path Parameters:**
- `artifact_id` (integer): The artifact ID
**Example Request:**
```bash
curl -X GET "http://localhost:8000/api/v1/artifacts/1"
```
**Response (200 OK):**
```json
{
"id": 1,
"filename": "results.csv",
"file_type": "csv",
"file_size": 1024,
"storage_path": "minio://test-artifacts/abc-123.csv",
"content_type": "text/csv",
"test_name": "login_test",
"test_suite": "authentication",
"test_config": {"browser": "chrome"},
"test_result": "pass",
"metadata": null,
"description": "Login test",
"tags": ["regression"],
"created_at": "2024-10-14T12:00:00",
"updated_at": "2024-10-14T12:00:00",
"version": null,
"parent_id": null
}
```
**Error Response (404 Not Found):**
```json
{
"detail": "Artifact not found"
}
```
---
### Download Artifact
#### GET /api/v1/artifacts/{artifact_id}/download
Download the artifact file.
**Path Parameters:**
- `artifact_id` (integer): The artifact ID
**Example Request:**
```bash
curl -X GET "http://localhost:8000/api/v1/artifacts/1/download" \
-o downloaded_file.csv
```
**Response:**
- Returns the file with appropriate `Content-Type` and `Content-Disposition` headers
- Status: 200 OK
**Error Response (404 Not Found):**
```json
{
"detail": "Artifact not found"
}
```
---
### Get Presigned URL
#### GET /api/v1/artifacts/{artifact_id}/url
Get a presigned URL for downloading the artifact.
**Path Parameters:**
- `artifact_id` (integer): The artifact ID
**Query Parameters:**
- `expiration` (integer, optional): URL expiration in seconds (60-86400). Default: 3600
**Example Request:**
```bash
curl -X GET "http://localhost:8000/api/v1/artifacts/1/url?expiration=3600"
```
**Response (200 OK):**
```json
{
"url": "https://minio.example.com/test-artifacts/abc-123.csv?X-Amz-Algorithm=...",
"expires_in": 3600
}
```
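A typical client flow is to request the URL from the API and then fetch the object directly from S3/MinIO, which keeps large downloads off the API server. A short sketch using `requests`:
```python
import requests

BASE_URL = "http://localhost:8000"
artifact_id = 1

# 1. Ask the API for a short-lived presigned URL.
resp = requests.get(
    f"{BASE_URL}/api/v1/artifacts/{artifact_id}/url",
    params={"expiration": 600},
)
resp.raise_for_status()
presigned_url = resp.json()["url"]

# 2. Download the file straight from the storage backend.
with open("artifact.download", "wb") as out:
    out.write(requests.get(presigned_url).content)
```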
---
### Query Artifacts
#### POST /api/v1/artifacts/query
Query artifacts with filters.
**Content-Type:** `application/json`
**Request Body:**
| Field | Type | Required | Description |
|-------|------|----------|-------------|
| filename | String | No | Filter by filename (partial match) |
| file_type | String | No | Filter by file type (csv, json, binary, pcap) |
| test_name | String | No | Filter by test name (partial match) |
| test_suite | String | No | Filter by test suite (exact match) |
| test_result | String | No | Filter by test result (pass, fail, skip, error) |
| tags | Array[String] | No | Filter by tags (must contain all specified tags) |
| start_date | DateTime | No | Filter by creation date (from) |
| end_date | DateTime | No | Filter by creation date (to) |
| limit | Integer | No | Maximum results (1-1000). Default: 100 |
| offset | Integer | No | Number of results to skip. Default: 0 |
**Example Request:**
```bash
curl -X POST "http://localhost:8000/api/v1/artifacts/query" \
-H "Content-Type: application/json" \
-d '{
"test_suite": "authentication",
"test_result": "fail",
"start_date": "2024-01-01T00:00:00",
"end_date": "2024-12-31T23:59:59",
"tags": ["regression"],
"limit": 50,
"offset": 0
}'
```
**Response (200 OK):**
```json
[
{
"id": 5,
"filename": "auth_fail.csv",
"file_type": "csv",
"file_size": 2048,
"storage_path": "minio://test-artifacts/def-456.csv",
"content_type": "text/csv",
"test_name": "login_test",
"test_suite": "authentication",
"test_config": {"browser": "firefox"},
"test_result": "fail",
"metadata": {"error": "timeout"},
"description": "Failed login test",
"tags": ["regression"],
"created_at": "2024-10-14T11:00:00",
"updated_at": "2024-10-14T11:00:00",
"version": null,
"parent_id": null
}
]
```
---
### List Artifacts
#### GET /api/v1/artifacts/
List all artifacts with pagination.
**Query Parameters:**
- `limit` (integer, optional): Maximum results (1-1000). Default: 100
- `offset` (integer, optional): Number of results to skip. Default: 0
**Example Request:**
```bash
curl -X GET "http://localhost:8000/api/v1/artifacts/?limit=50&offset=0"
```
**Response (200 OK):**
```json
[
{
"id": 1,
"filename": "test1.csv",
...
},
{
"id": 2,
"filename": "test2.json",
...
}
]
```
---
### Delete Artifact
#### DELETE /api/v1/artifacts/{artifact_id}
Delete an artifact and its file from storage.
**Path Parameters:**
- `artifact_id` (integer): The artifact ID
**Example Request:**
```bash
curl -X DELETE "http://localhost:8000/api/v1/artifacts/1"
```
**Response (200 OK):**
```json
{
"message": "Artifact deleted successfully"
}
```
**Error Response (404 Not Found):**
```json
{
"detail": "Artifact not found"
}
```
---
## File Types
The API automatically detects file types based on extension:
| Extension | File Type |
|-----------|-----------|
| .csv | csv |
| .json | json |
| .pcap, .pcapng | pcap |
| .bin, .dat | binary |
| Others | binary |
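This mirrors the `get_file_type()` helper in `app/api/artifacts.py`; unknown extensions fall back to `binary`:
```python
def get_file_type(filename: str) -> str:
    """Map a filename extension to the stored file_type value."""
    extension = filename.lower().split('.')[-1]
    return {
        'csv': 'csv',
        'json': 'json',
        'pcap': 'pcap',
        'pcapng': 'pcap',
        'bin': 'binary',
        'dat': 'binary',
    }.get(extension, 'binary')
```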
---
## Error Responses
### 400 Bad Request
Invalid request parameters or malformed JSON.
```json
{
"detail": "Invalid JSON in metadata fields: ..."
}
```
### 404 Not Found
Resource not found.
```json
{
"detail": "Artifact not found"
}
```
### 500 Internal Server Error
Server error during processing.
```json
{
"detail": "Upload failed: ..."
}
```
---
## Interactive Documentation
The API provides interactive documentation at:
- **Swagger UI:** http://localhost:8000/docs
- **ReDoc:** http://localhost:8000/redoc
These interfaces allow you to:
- Explore all endpoints
- View request/response schemas
- Test API calls directly in the browser
- Download OpenAPI specification
---
## Client Libraries
### Python
```python
import requests
# Upload file
with open('test.csv', 'rb') as f:
files = {'file': f}
data = {
'test_name': 'my_test',
'test_suite': 'integration',
'test_result': 'pass',
'tags': '["smoke"]'
}
response = requests.post(
'http://localhost:8000/api/v1/artifacts/upload',
files=files,
data=data
)
artifact = response.json()
print(f"Uploaded artifact ID: {artifact['id']}")
# Query artifacts
query = {
'test_suite': 'integration',
'test_result': 'fail',
'limit': 10
}
response = requests.post(
'http://localhost:8000/api/v1/artifacts/query',
json=query
)
artifacts = response.json()
# Download file
artifact_id = 1
response = requests.get(
f'http://localhost:8000/api/v1/artifacts/{artifact_id}/download'
)
with open('downloaded.csv', 'wb') as f:
f.write(response.content)
```
### JavaScript
```javascript
// Upload file
const formData = new FormData();
formData.append('file', fileInput.files[0]);
formData.append('test_name', 'my_test');
formData.append('test_suite', 'integration');
formData.append('tags', JSON.stringify(['smoke']));
const response = await fetch('http://localhost:8000/api/v1/artifacts/upload', {
method: 'POST',
body: formData
});
const artifact = await response.json();
// Query artifacts
const query = {
test_suite: 'integration',
test_result: 'fail',
limit: 10
};
const queryResponse = await fetch('http://localhost:8000/api/v1/artifacts/query', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify(query)
});
const artifacts = await queryResponse.json();
```
### cURL
See examples throughout this documentation.
---
## Rate Limiting
Currently not implemented. Add rate limiting middleware as needed.
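If you need a stopgap before adopting a dedicated dependency, a per-client fixed-window limiter can be added as middleware. This is a minimal in-process sketch (single replica only; the window and limit values are arbitrary, and a shared store such as Redis would be needed across replicas):
```python
# Illustrative only; registered at import time, before the server starts serving.
import time
from collections import defaultdict

from fastapi import Request
from fastapi.responses import JSONResponse

from app.main import app  # the FastAPI instance created in app/main.py

WINDOW_SECONDS = 60
MAX_REQUESTS = 120
_hits = defaultdict(list)  # client address -> recent request timestamps

@app.middleware("http")
async def rate_limit(request: Request, call_next):
    now = time.monotonic()
    client = request.client.host if request.client else "unknown"
    recent = [t for t in _hits[client] if now - t < WINDOW_SECONDS]
    if len(recent) >= MAX_REQUESTS:
        return JSONResponse({"detail": "Too many requests"}, status_code=429)
    recent.append(now)
    _hits[client] = recent
    return await call_next(request)
```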
---
## Versioning
The API is versioned via the URL path (`/api/v1/`). Future versions will use `/api/v2/`, etc.
---
## Support
For API questions or issues, please refer to the main [README.md](README.md) or open an issue.

347
ARCHITECTURE.md Normal file

@@ -0,0 +1,347 @@
# Architecture Overview
## System Design
The Test Artifact Data Lake is designed as a cloud-native, microservices-ready application that separates concerns between metadata storage and blob storage.
## Components
### 1. FastAPI Application (app/)
**Purpose**: RESTful API server handling all client requests
**Key Modules**:
- `app/main.py`: Application entry point, route registration
- `app/config.py`: Configuration management using Pydantic
- `app/database.py`: Database connection and session management
### 2. API Layer (app/api/)
**Purpose**: HTTP endpoint definitions and request handling
**Files**:
- `app/api/artifacts.py`: All artifact-related endpoints
- Upload: Multipart file upload with metadata
- Download: File retrieval with streaming
- Query: Complex filtering and search
- Delete: Cascade deletion from both DB and storage
- Presigned URLs: Temporary download links
### 3. Models Layer (app/models/)
**Purpose**: SQLAlchemy ORM models for database tables
**Files**:
- `app/models/artifact.py`: Artifact model with all metadata fields
- File information (name, type, size, path)
- Test metadata (name, suite, config, result)
- Custom metadata and tags
- Versioning support
- Timestamps
### 4. Schemas Layer (app/schemas/)
**Purpose**: Pydantic models for request/response validation
**Files**:
- `app/schemas/artifact.py`:
- `ArtifactCreate`: Upload request validation
- `ArtifactResponse`: API response serialization
- `ArtifactQuery`: Query filtering parameters
### 5. Storage Layer (app/storage/)
**Purpose**: Abstraction over different blob storage backends
**Architecture**:
```
StorageBackend (Abstract Base Class)
├── S3Backend (AWS S3 implementation)
└── MinIOBackend (Self-hosted S3-compatible)
```
**Files**:
- `app/storage/base.py`: Abstract interface
- `app/storage/s3_backend.py`: AWS S3 implementation
- `app/storage/minio_backend.py`: MinIO implementation
- `app/storage/factory.py`: Backend selection logic
**Key Methods**:
- `upload_file()`: Store blob with unique path
- `download_file()`: Retrieve blob by path
- `delete_file()`: Remove blob from storage
- `file_exists()`: Check blob existence
- `get_file_url()`: Generate presigned download URL
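Adding another backend only requires implementing this interface and wiring it into the factory. As an illustration (not part of the codebase), a local-filesystem backend for tests might look like:
```python
# Hypothetical example; the real code ships only S3Backend and MinIOBackend.
from pathlib import Path
from typing import BinaryIO

from app.storage.base import StorageBackend

class LocalDiskBackend(StorageBackend):
    """Stores blobs under a local directory -- handy for tests."""

    def __init__(self, root: str = "/tmp/test-artifacts"):
        self.root = Path(root)
        self.root.mkdir(parents=True, exist_ok=True)

    async def upload_file(self, file_data: BinaryIO, object_name: str) -> str:
        path = self.root / object_name
        path.write_bytes(file_data.read())
        return f"local://{path}"

    async def download_file(self, object_name: str) -> bytes:
        return (self.root / object_name).read_bytes()

    async def delete_file(self, object_name: str) -> bool:
        (self.root / object_name).unlink(missing_ok=True)
        return True

    async def file_exists(self, object_name: str) -> bool:
        return (self.root / object_name).exists()

    async def get_file_url(self, object_name: str, expiration: int = 3600) -> str:
        # No real presigning for local files; a file:// URL stands in.
        return f"file://{self.root / object_name}"
```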
## Data Flow
### Upload Flow
```
Client
↓ (multipart/form-data)
FastAPI Endpoint
↓ (parse metadata)
Validation Layer
↓ (generate UUID path)
Storage Backend
↓ (store blob)
Database
↓ (save metadata)
Response (artifact object)
```
### Query Flow
```
Client
↓ (JSON query)
FastAPI Endpoint
↓ (validate filters)
Database Query Builder
↓ (SQL with filters)
PostgreSQL
↓ (result set)
Response (artifact list)
```
### Download Flow
```
Client
↓ (GET request)
FastAPI Endpoint
↓ (lookup artifact)
Database
↓ (get storage path)
Storage Backend
↓ (retrieve blob)
StreamingResponse
↓ (binary data)
Client
```
## Database Schema
### Table: artifacts
| Column | Type | Description |
|--------|------|-------------|
| id | Integer | Primary key (auto-increment) |
| filename | String(500) | Original filename (indexed) |
| file_type | String(50) | csv, json, binary, pcap (indexed) |
| file_size | BigInteger | File size in bytes |
| storage_path | String(1000) | Full storage path/URL |
| content_type | String(100) | MIME type |
| test_name | String(500) | Test identifier (indexed) |
| test_suite | String(500) | Suite identifier (indexed) |
| test_config | JSON | Test configuration object |
| test_result | String(50) | pass/fail/skip/error (indexed) |
| metadata | JSON | Custom metadata object |
| description | Text | Human-readable description |
| tags | JSON | Array of tags for categorization |
| created_at | DateTime | Creation timestamp (indexed) |
| updated_at | DateTime | Last update timestamp |
| version | String(50) | Version identifier |
| parent_id | Integer | Parent artifact ID (indexed) |
**Indexes**:
- Primary: id
- Secondary: filename, file_type, test_name, test_suite, test_result, created_at, parent_id
## Storage Architecture
### Blob Storage
**S3/MinIO Bucket Structure**:
```
test-artifacts/
├── {uuid1}.csv
├── {uuid2}.json
├── {uuid3}.pcap
└── {uuid4}.bin
```
- Files stored with UUID-based names to prevent conflicts
- Original filenames preserved in database metadata
- No directory structure (flat namespace)
### Database vs Blob Storage
| Data Type | Storage |
|-----------|---------|
| File content | S3/MinIO |
| Metadata | PostgreSQL |
| Test configs | PostgreSQL (JSON) |
| Custom metadata | PostgreSQL (JSON) |
| Tags | PostgreSQL (JSON array) |
| File paths | PostgreSQL |
## Scalability Considerations
### Horizontal Scaling
**API Layer**:
- Stateless FastAPI instances
- Can scale to N replicas
- Load balanced via Kubernetes Service
**Database**:
- PostgreSQL with read replicas
- Connection pooling
- Query optimization via indexes
**Storage**:
- S3: Infinite scalability
- MinIO: Can be clustered
### Performance Optimizations
1. **Streaming Uploads/Downloads**: Avoids loading entire files into memory
2. **Database Indexes**: Fast queries on common fields
3. **Presigned URLs**: Offload downloads to storage backend
4. **Async I/O**: FastAPI async endpoints for concurrent requests
## Security Architecture
### Current State (No Auth)
- API is open to all requests
- Suitable for internal networks
- Add authentication middleware as needed
### Recommended Enhancements
1. **Authentication**:
- OAuth 2.0 / OIDC
- API keys
- JWT tokens
2. **Authorization**:
- Role-based access control (RBAC)
- Resource-level permissions
3. **Network Security**:
- TLS/HTTPS (via ingress)
- Network policies (Kubernetes)
- VPC isolation (AWS)
4. **Data Security**:
- Encryption at rest (S3 SSE)
- Encryption in transit (HTTPS)
- Secrets management (Kubernetes Secrets, AWS Secrets Manager)
## Deployment Architecture
### Local Development
```
Docker Compose
├── PostgreSQL container
├── MinIO container
└── API container
```
### Kubernetes Production
```
Kubernetes Cluster
├── Deployment (API pods)
├── Service (load balancer)
├── StatefulSet (PostgreSQL)
├── StatefulSet (MinIO)
├── Ingress (HTTPS termination)
└── Secrets (credentials)
```
### AWS Production
```
AWS
├── EKS (API pods)
├── RDS PostgreSQL
├── S3 (blob storage)
├── ALB (load balancer)
└── Secrets Manager
```
## Configuration Management
### Environment Variables
- Centralized in `app/config.py`
- Loaded via Pydantic Settings
- Support for `.env` files
- Override via environment variables (see the sketch below)
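For example, a value exported in the environment takes precedence over `.env` and over the defaults in `app/config.py` (a quick sketch):
```python
import os

# Exported environment variables take precedence over .env and over code defaults.
os.environ["STORAGE_BACKEND"] = "s3"

from app.config import settings  # imported after the variable is set

assert settings.storage_backend == "s3"
```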
### Kubernetes ConfigMaps/Secrets
- Non-sensitive: ConfigMaps
- Sensitive: Secrets (base64)
- Mounted as environment variables
## Monitoring and Observability
### Health Checks
- `/health`: Liveness probe (currently returns a static "healthy" status)
- Database connectivity check (planned; see the readiness sketch below)
- Storage backend connectivity check (planned; see the readiness sketch below)
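A readiness probe that actually exercises the database and storage backend could look like the following sketch (a hypothetical `/ready` endpoint, not present in the current code):
```python
# Illustrative readiness check; the router would need to be included in app/main.py.
from fastapi import APIRouter, HTTPException
from sqlalchemy import text

from app.database import engine
from app.storage import get_storage_backend

router = APIRouter()

@router.get("/ready")
async def readiness():
    try:
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))  # database reachable
        # Touch the storage backend; the probed object does not need to exist.
        await get_storage_backend().file_exists("readiness-probe")
    except Exception as exc:
        raise HTTPException(status_code=503, detail=f"Not ready: {exc}")
    return {"status": "ready"}
```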
### Logging
- Structured logging via Python logging
- JSON format for log aggregation (the current code uses a plain-text format; see the formatter sketch below)
- Log levels: INFO, WARNING, ERROR
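A stdlib-only JSON formatter is enough for most log aggregators; a sketch that `app/main.py` could adopt:
```python
import json
import logging

class JsonFormatter(logging.Formatter):
    """Render each log record as one JSON object per line."""

    def format(self, record: logging.LogRecord) -> str:
        return json.dumps({
            "time": self.formatTime(record),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        })

handler = logging.StreamHandler()
handler.setFormatter(JsonFormatter())
logging.basicConfig(level=logging.INFO, handlers=[handler], force=True)
```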
### Metrics (Future)
- Prometheus metrics endpoint
- Request count, latency, errors
- Storage usage, database connections
## Disaster Recovery
### Backup Strategy
1. **Database**: pg_dump scheduled backups
2. **Storage**: S3 versioning, cross-region replication
3. **Configuration**: GitOps (Helm charts in Git)
### Recovery Procedures
1. Restore database from backup
2. Storage automatically available (S3)
3. Redeploy application via Helm
## Future Enhancements
### Performance
- Caching layer (Redis)
- CDN for frequently accessed files
- Database sharding for massive scale
### Features
- File versioning UI
- Batch upload API
- Search with full-text search (Elasticsearch)
- File preview generation
- Webhooks for events
### Operations
- Automated testing pipeline
- Blue-green deployments
- Canary releases
- Disaster recovery automation
## Technology Choices Rationale
| Technology | Why? |
|------------|------|
| FastAPI | Modern, fast, auto-generated docs, async support |
| PostgreSQL | Reliable, JSON support, strong indexing |
| S3/MinIO | Industry standard, scalable, S3-compatible |
| SQLAlchemy | Powerful ORM, migration support |
| Pydantic | Type safety, validation, settings management |
| Docker | Containerization, portability |
| Kubernetes/Helm | Orchestration, declarative deployment |
| GitLab CI | Integrated CI/CD, container registry |
## Development Principles
1. **Separation of Concerns**: Clear layers (API, models, storage)
2. **Abstraction**: Storage backend abstraction for flexibility
3. **Configuration as Code**: Helm charts, GitOps
4. **Testability**: Dependency injection, mocking interfaces
5. **Observability**: Logging, health checks, metrics
6. **Security**: Secrets management, least privilege
7. **Scalability**: Stateless design, horizontal scaling

465
DEPLOYMENT.md Normal file

@@ -0,0 +1,465 @@
# Deployment Guide
This guide covers deploying the Test Artifact Data Lake in various environments.
## Table of Contents
- [Local Development](#local-development)
- [Docker Compose](#docker-compose)
- [Kubernetes/Helm](#kuberneteshelm)
- [AWS Deployment](#aws-deployment)
- [Self-Hosted Deployment](#self-hosted-deployment)
- [GitLab CI/CD](#gitlab-cicd)
---
## Local Development
### Prerequisites
- Python 3.11+
- PostgreSQL 15+
- MinIO or AWS S3 access
### Steps
1. **Create virtual environment:**
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
2. **Install dependencies:**
```bash
pip install -r requirements.txt
```
3. **Set up PostgreSQL:**
```bash
createdb datalake
```
4. **Configure environment:**
```bash
cp .env.example .env
# Edit .env with your configuration
```
5. **Run the application:**
```bash
python -m uvicorn app.main:app --reload
```
---
## Docker Compose
### Quick Start
1. **Start all services:**
```bash
docker-compose up -d
```
2. **Check logs:**
```bash
docker-compose logs -f api
```
3. **Stop services:**
```bash
docker-compose down
```
### Services Included
- PostgreSQL (port 5432)
- MinIO (port 9000, console 9001)
- API (port 8000)
### Customization
Edit `docker-compose.yml` to:
- Change port mappings
- Adjust resource limits
- Add environment variables
- Configure volumes
---
## Kubernetes/Helm
### Prerequisites
- Kubernetes cluster (1.24+)
- Helm 3.x
- kubectl configured
### Installation
1. **Add dependencies (if using PostgreSQL/MinIO from Bitnami):**
```bash
helm repo add bitnami https://charts.bitnami.com/bitnami
helm repo update
```
2. **Install with default values:**
```bash
helm install datalake ./helm \
--namespace datalake \
--create-namespace
```
3. **Custom installation:**
```bash
helm install datalake ./helm \
--namespace datalake \
--create-namespace \
--set image.repository=your-registry/datalake \
--set image.tag=1.0.0 \
--set ingress.enabled=true \
--set ingress.hosts[0].host=datalake.yourdomain.com
```
### Configuration Options
**Image:**
```bash
--set image.repository=your-registry/datalake
--set image.tag=1.0.0
--set image.pullPolicy=Always
```
**Resources:**
```bash
--set resources.requests.cpu=1000m
--set resources.requests.memory=1Gi
--set resources.limits.cpu=2000m
--set resources.limits.memory=2Gi
```
**Autoscaling:**
```bash
--set autoscaling.enabled=true
--set autoscaling.minReplicas=3
--set autoscaling.maxReplicas=10
--set autoscaling.targetCPUUtilizationPercentage=80
```
**Ingress:**
```bash
--set ingress.enabled=true
--set ingress.className=nginx
--set ingress.hosts[0].host=datalake.example.com
--set ingress.hosts[0].paths[0].path=/
--set ingress.hosts[0].paths[0].pathType=Prefix
```
### Upgrade
```bash
helm upgrade datalake ./helm \
--namespace datalake \
--set image.tag=1.1.0
```
### Uninstall
```bash
helm uninstall datalake --namespace datalake
```
---
## AWS Deployment
### Using AWS S3 Storage
1. **Create S3 bucket:**
```bash
aws s3 mb s3://your-test-artifacts-bucket
```
2. **Create IAM user with S3 access:**
```bash
aws iam create-user --user-name datalake-service
aws iam attach-user-policy --user-name datalake-service \
--policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
```
3. **Generate access keys:**
```bash
aws iam create-access-key --user-name datalake-service
```
4. **Deploy with Helm:**
```bash
helm install datalake ./helm \
--namespace datalake \
--create-namespace \
--set config.storageBackend=s3 \
--set aws.enabled=true \
--set aws.accessKeyId=YOUR_ACCESS_KEY \
--set aws.secretAccessKey=YOUR_SECRET_KEY \
--set aws.region=us-east-1 \
--set aws.bucketName=your-test-artifacts-bucket \
--set minio.enabled=false
```
### Using EKS
1. **Create EKS cluster:**
```bash
eksctl create cluster \
--name datalake-cluster \
--region us-east-1 \
--nodegroup-name standard-workers \
--node-type t3.medium \
--nodes 3
```
2. **Configure kubectl:**
```bash
aws eks update-kubeconfig --name datalake-cluster --region us-east-1
```
3. **Deploy application:**
```bash
helm install datalake ./helm \
--namespace datalake \
--create-namespace \
--set config.storageBackend=s3
```
### Using RDS for PostgreSQL
```bash
helm install datalake ./helm \
--namespace datalake \
--create-namespace \
--set postgresql.enabled=false \
--set config.databaseUrl="postgresql://user:pass@your-rds-endpoint:5432/datalake"
```
---
## Self-Hosted Deployment
### Using MinIO
1. **Deploy MinIO:**
```bash
helm install minio bitnami/minio \
--namespace datalake \
--create-namespace \
--set auth.rootUser=admin \
--set auth.rootPassword=adminpassword \
--set persistence.size=100Gi
```
2. **Deploy application:**
```bash
helm install datalake ./helm \
--namespace datalake \
--set config.storageBackend=minio \
--set minio.enabled=false \
--set minio.endpoint=minio:9000 \
--set minio.accessKey=admin \
--set minio.secretKey=adminpassword
```
### On-Premise Kubernetes
1. **Prepare persistent volumes:**
```yaml
apiVersion: v1
kind: PersistentVolume
metadata:
name: datalake-postgres-pv
spec:
capacity:
storage: 20Gi
accessModes:
- ReadWriteOnce
hostPath:
path: /data/postgres
```
2. **Deploy with local storage:**
```bash
helm install datalake ./helm \
--namespace datalake \
--create-namespace \
--set postgresql.persistence.storageClass=local-storage \
--set minio.persistence.storageClass=local-storage
```
---
## GitLab CI/CD
### Setup
1. **Configure GitLab variables:**
Go to Settings → CI/CD → Variables and add:
| Variable | Description | Protected | Masked |
|----------|-------------|-----------|---------|
| `CI_REGISTRY_USER` | Docker registry username (predefined by GitLab when using the built-in registry) | No | No |
| `CI_REGISTRY_PASSWORD` | Docker registry password (predefined by GitLab when using the built-in registry) | No | Yes |
| `KUBE_CONFIG_DEV` | Base64 kubeconfig for dev | No | Yes |
| `KUBE_CONFIG_STAGING` | Base64 kubeconfig for staging | Yes | Yes |
| `KUBE_CONFIG_PROD` | Base64 kubeconfig for prod | Yes | Yes |
2. **Encode kubeconfig:**
```bash
cat ~/.kube/config | base64 -w 0
```
### Pipeline Stages
1. **Test**: Runs on all branches and MRs
2. **Build**: Builds Docker image on main/develop/tags
3. **Deploy**: Manual deployment to dev/staging/prod
### Deployment Flow
**Development:**
```bash
git push origin develop
# Manually trigger deploy:dev job in GitLab
```
**Staging:**
```bash
git push origin main
# Manually trigger deploy:staging job in GitLab
```
**Production:**
```bash
git tag v1.0.0
git push origin v1.0.0
# Manually trigger deploy:prod job in GitLab
```
### Customizing Pipeline
Edit `.gitlab-ci.yml` to:
- Add more test stages
- Change deployment namespaces
- Adjust Helm values per environment
- Add security scanning
- Configure rollback procedures
---
## Monitoring
### Health Checks
```bash
# Kubernetes
kubectl get pods -n datalake
kubectl logs -f -n datalake deployment/datalake
# Direct
curl http://localhost:8000/health
```
### Metrics
Add Prometheus monitoring:
```bash
helm install datalake ./helm \
--set metrics.enabled=true \
--set serviceMonitor.enabled=true
```
---
## Backup and Recovery
### Database Backup
```bash
# PostgreSQL
kubectl exec -n datalake deployment/datalake-postgresql -- \
pg_dump -U user datalake > backup.sql
# Restore
kubectl exec -i -n datalake deployment/datalake-postgresql -- \
psql -U user datalake < backup.sql
```
### Storage Backup
**S3:**
```bash
aws s3 sync s3://your-bucket s3://backup-bucket
```
**MinIO:**
```bash
mc mirror minio/test-artifacts backup/test-artifacts
```
---
## Troubleshooting
### Pod Not Starting
```bash
kubectl describe pod -n datalake <pod-name>
kubectl logs -n datalake <pod-name>
```
### Database Connection Issues
```bash
kubectl exec -it -n datalake deployment/datalake -- \
psql $DATABASE_URL
```
### Storage Issues
```bash
# Check MinIO
kubectl port-forward -n datalake svc/minio 9000:9000
# Access http://localhost:9000
```
---
## Security Considerations
1. **Use secrets management:**
- Kubernetes Secrets
- AWS Secrets Manager
- HashiCorp Vault
2. **Enable TLS:**
- Configure ingress with TLS certificates
- Use cert-manager for automatic certificates
3. **Network policies:**
- Restrict pod-to-pod communication
- Limit external access
4. **RBAC:**
- Configure Kubernetes RBAC
- Limit service account permissions
---
## Performance Tuning
### Database
- Increase connection pool size
- Add database indexes
- Configure autovacuum
### API
- Increase replica count
- Configure horizontal pod autoscaling
- Adjust resource requests/limits
### Storage
- Use CDN for frequently accessed files
- Configure S3 Transfer Acceleration
- Optimize MinIO deployment

32
Dockerfile Normal file

@@ -0,0 +1,32 @@
FROM python:3.11-slim
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y \
gcc \
postgresql-client \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY app/ ./app/
COPY alembic/ ./alembic/
COPY alembic.ini .
# Create non-root user
RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
USER appuser
# Expose port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD python -c "import requests; requests.get('http://localhost:8000/health', timeout=5).raise_for_status()"
# Run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

66
Makefile Normal file

@@ -0,0 +1,66 @@
.PHONY: help install dev test lint format docker-build docker-up docker-down deploy clean
help:
@echo "Available commands:"
@echo " make install - Install Python dependencies"
@echo " make dev - Run development server"
@echo " make test - Run tests"
@echo " make lint - Run linters"
@echo " make format - Format code"
@echo " make docker-build - Build Docker image"
@echo " make docker-up - Start Docker Compose services"
@echo " make docker-down - Stop Docker Compose services"
@echo " make deploy - Deploy with Helm"
@echo " make clean - Clean temporary files"
install:
pip install -r requirements.txt
dev:
python -m uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
test:
pytest tests/ -v
lint:
flake8 app/ --max-line-length=120 --ignore=E203,W503
black --check app/
format:
black app/
isort app/
docker-build:
docker build -t datalake:latest .
docker-up:
docker-compose up -d
docker-down:
docker-compose down
docker-logs:
docker-compose logs -f api
deploy:
helm upgrade --install datalake ./helm \
--namespace datalake \
--create-namespace
deploy-dev:
helm upgrade --install datalake-dev ./helm \
--namespace datalake-dev \
--create-namespace \
--set ingress.enabled=true
clean:
find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
find . -type f -name "*.pyc" -delete
find . -type f -name "*.pyo" -delete
find . -type f -name "*.log" -delete
rm -rf .pytest_cache
rm -rf .coverage
rm -rf htmlcov
rm -rf dist
rm -rf build
rm -rf *.egg-info

298
README.md Normal file

@@ -0,0 +1,298 @@
# Test Artifact Data Lake
A lightweight, cloud-native API for storing and querying test artifacts including CSV files, JSON files, binary files, and packet captures (PCAP). Built with FastAPI and supports both AWS S3 and self-hosted MinIO storage backends.
## Features
- **Multi-format Support**: Store CSV, JSON, binary files, and PCAP files
- **Flexible Storage**: Switch between AWS S3 and self-hosted MinIO
- **Rich Metadata**: Track test configurations, results, and custom metadata
- **Powerful Querying**: Query artifacts by test name, suite, result, tags, date ranges, and more
- **RESTful API**: Clean REST API with automatic OpenAPI documentation
- **Cloud-Native**: Fully containerized with Docker and Kubernetes/Helm support
- **Production-Ready**: Includes GitLab CI/CD pipeline for automated deployments
## Architecture
```
┌─────────────┐
│ FastAPI │ ← REST API
│ Backend │
└──────┬──────┘
├─────────┐
↓ ↓
┌──────────┐ ┌────────────┐
│PostgreSQL│ │ S3/MinIO │
│(Metadata)│ │ (Blobs) │
└──────────┘ └────────────┘
```
- **PostgreSQL**: Stores artifact metadata, test configs, and query indexes
- **S3/MinIO**: Stores actual file contents (blob storage)
- **FastAPI**: Async REST API for uploads, downloads, and queries
## Quick Start
### Using Docker Compose (Recommended)
1. Clone the repository:
```bash
git clone <repository-url>
cd datalake
```
2. Copy environment configuration:
```bash
cp .env.example .env
```
3. Start all services:
```bash
docker-compose up -d
```
4. Access the API:
- API: http://localhost:8000
- API Docs: http://localhost:8000/docs
- MinIO Console: http://localhost:9001
### Using Python Directly
1. Install dependencies:
```bash
pip install -r requirements.txt
```
2. Set up PostgreSQL and MinIO/S3
3. Configure environment variables in `.env`
4. Run the application:
```bash
python -m uvicorn app.main:app --reload
```
## API Usage
### Upload an Artifact
```bash
curl -X POST "http://localhost:8000/api/v1/artifacts/upload" \
-F "file=@test_results.csv" \
-F "test_name=auth_test" \
-F "test_suite=integration" \
-F "test_result=pass" \
-F 'test_config={"browser":"chrome","timeout":30}' \
-F 'tags=["regression","smoke"]' \
-F "description=Authentication test results"
```
### Query Artifacts
```bash
curl -X POST "http://localhost:8000/api/v1/artifacts/query" \
-H "Content-Type: application/json" \
-d '{
"test_suite": "integration",
"test_result": "fail",
"start_date": "2024-01-01T00:00:00",
"limit": 50
}'
```
### Download an Artifact
```bash
curl -X GET "http://localhost:8000/api/v1/artifacts/123/download" \
-o downloaded_file.csv
```
### Get Presigned URL
```bash
curl -X GET "http://localhost:8000/api/v1/artifacts/123/url?expiration=3600"
```
### List All Artifacts
```bash
curl -X GET "http://localhost:8000/api/v1/artifacts/?limit=100&offset=0"
```
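To walk the whole collection programmatically, page through it with `limit`/`offset` (a short sketch using `requests`):
```python
import requests

BASE_URL = "http://localhost:8000"
limit, offset = 100, 0

while True:
    page = requests.get(
        f"{BASE_URL}/api/v1/artifacts/",
        params={"limit": limit, "offset": offset},
    ).json()
    if not page:
        break
    for artifact in page:
        print(artifact["id"], artifact["filename"], artifact["test_result"])
    offset += limit
```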
### Delete an Artifact
```bash
curl -X DELETE "http://localhost:8000/api/v1/artifacts/123"
```
## API Endpoints
| Method | Endpoint | Description |
|--------|----------|-------------|
| POST | `/api/v1/artifacts/upload` | Upload a new artifact with metadata |
| GET | `/api/v1/artifacts/{id}` | Get artifact metadata by ID |
| GET | `/api/v1/artifacts/{id}/download` | Download artifact file |
| GET | `/api/v1/artifacts/{id}/url` | Get presigned download URL |
| DELETE | `/api/v1/artifacts/{id}` | Delete artifact and file |
| POST | `/api/v1/artifacts/query` | Query artifacts with filters |
| GET | `/api/v1/artifacts/` | List all artifacts (paginated) |
| GET | `/` | API information |
| GET | `/health` | Health check |
| GET | `/docs` | Interactive API documentation |
## Configuration
### Environment Variables
| Variable | Description | Default |
|----------|-------------|---------|
| `DATABASE_URL` | PostgreSQL connection string | `postgresql://user:password@localhost:5432/datalake` |
| `STORAGE_BACKEND` | Storage backend (`s3` or `minio`) | `minio` |
| `AWS_ACCESS_KEY_ID` | AWS access key (for S3) | - |
| `AWS_SECRET_ACCESS_KEY` | AWS secret key (for S3) | - |
| `AWS_REGION` | AWS region (for S3) | `us-east-1` |
| `S3_BUCKET_NAME` | S3 bucket name | `test-artifacts` |
| `MINIO_ENDPOINT` | MinIO endpoint | `localhost:9000` |
| `MINIO_ACCESS_KEY` | MinIO access key | `minioadmin` |
| `MINIO_SECRET_KEY` | MinIO secret key | `minioadmin` |
| `MINIO_BUCKET_NAME` | MinIO bucket name | `test-artifacts` |
| `MINIO_SECURE` | Use HTTPS for MinIO | `false` |
| `API_HOST` | API host | `0.0.0.0` |
| `API_PORT` | API port | `8000` |
| `MAX_UPLOAD_SIZE` | Max upload size (bytes) | `524288000` (500MB) |
### Switching Between S3 and MinIO
To use AWS S3:
```bash
STORAGE_BACKEND=s3
AWS_ACCESS_KEY_ID=your_key
AWS_SECRET_ACCESS_KEY=your_secret
AWS_REGION=us-east-1
S3_BUCKET_NAME=your-bucket
```
To use self-hosted MinIO:
```bash
STORAGE_BACKEND=minio
MINIO_ENDPOINT=minio:9000
MINIO_ACCESS_KEY=minioadmin
MINIO_SECRET_KEY=minioadmin
MINIO_BUCKET_NAME=test-artifacts
```
## Deployment
### Kubernetes with Helm
1. Build and push Docker image:
```bash
docker build -t your-registry/datalake:latest .
docker push your-registry/datalake:latest
```
2. Install with Helm:
```bash
helm install datalake ./helm \
--set image.repository=your-registry/datalake \
--set image.tag=latest \
--namespace datalake \
--create-namespace
```
3. Access the API:
```bash
kubectl port-forward -n datalake svc/datalake 8000:8000
```
### Helm Configuration
Edit `helm/values.yaml` to customize:
- Replica count
- Resource limits
- Storage backend (S3 vs MinIO)
- Ingress settings
- PostgreSQL settings
- Autoscaling
### GitLab CI/CD
The included `.gitlab-ci.yml` provides:
- Automated testing
- Linting
- Docker image builds
- Deployments to dev/staging/prod
**Required GitLab CI/CD Variables:**
- `CI_REGISTRY_USER`: Docker registry username
- `CI_REGISTRY_PASSWORD`: Docker registry password
- `KUBE_CONFIG_DEV`: Base64-encoded kubeconfig for dev
- `KUBE_CONFIG_STAGING`: Base64-encoded kubeconfig for staging
- `KUBE_CONFIG_PROD`: Base64-encoded kubeconfig for prod
## Database Schema
The `artifacts` table stores:
- File metadata (name, type, size, storage path)
- Test information (name, suite, config, result)
- Custom metadata and tags
- Timestamps and versioning
## Example Use Cases
### Store Test Results
Upload CSV files containing test execution results with metadata about the test suite and configuration.
### Archive Packet Captures
Store PCAP files from network tests with tags for easy filtering and retrieval.
### Track Test Configurations
Upload JSON test configurations and query them by date, test suite, or custom tags.
### Binary Artifact Storage
Store compiled binaries, test data files, or any binary artifacts with full metadata.
## Development
### Running Tests
```bash
pytest tests/ -v
```
### Code Formatting
```bash
black app/
flake8 app/
```
### Database Migrations
```bash
alembic revision --autogenerate -m "description"
alembic upgrade head
```
## Troubleshooting
### Cannot Connect to Database
- Verify PostgreSQL is running
- Check `DATABASE_URL` is correct
- Ensure database exists
### Cannot Upload Files
- Check storage backend is running (MinIO or S3 accessible)
- Verify credentials are correct
- Check file size is under `MAX_UPLOAD_SIZE`
### MinIO Connection Failed
- Ensure MinIO service is running
- Verify `MINIO_ENDPOINT` is correct
- Check MinIO credentials
## License
[Your License Here]
## Support
For issues and questions, please open an issue in the repository.

41
alembic.ini Normal file

@@ -0,0 +1,41 @@
[alembic]
script_location = alembic
prepend_sys_path = .
version_path_separator = os
[alembic:exclude]
tables = spatial_ref_sys
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

0
app/__init__.py Normal file

0
app/api/__init__.py Normal file

242
app/api/artifacts.py Normal file

@@ -0,0 +1,242 @@
from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException, Query
from fastapi.responses import StreamingResponse
from sqlalchemy import cast
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Session
from typing import List, Optional
import uuid
import json
import io
from datetime import datetime
from app.database import get_db
from app.models.artifact import Artifact
from app.schemas.artifact import ArtifactCreate, ArtifactResponse, ArtifactQuery
from app.storage import get_storage_backend
router = APIRouter(prefix="/api/v1/artifacts", tags=["artifacts"])
def get_file_type(filename: str) -> str:
"""Determine file type from filename"""
extension = filename.lower().split('.')[-1]
type_mapping = {
'csv': 'csv',
'json': 'json',
'pcap': 'pcap',
'pcapng': 'pcap',
'bin': 'binary',
'dat': 'binary',
}
return type_mapping.get(extension, 'binary')
@router.post("/upload", response_model=ArtifactResponse, status_code=201)
async def upload_artifact(
file: UploadFile = File(...),
test_name: Optional[str] = Form(None),
test_suite: Optional[str] = Form(None),
test_config: Optional[str] = Form(None),
test_result: Optional[str] = Form(None),
metadata: Optional[str] = Form(None),
description: Optional[str] = Form(None),
tags: Optional[str] = Form(None),
version: Optional[str] = Form(None),
parent_id: Optional[int] = Form(None),
db: Session = Depends(get_db)
):
"""
Upload a new artifact file with metadata
- **file**: The file to upload (CSV, JSON, binary, PCAP)
- **test_name**: Name of the test
- **test_suite**: Test suite identifier
- **test_config**: JSON string of test configuration
- **test_result**: Test result (pass, fail, skip, error)
- **metadata**: JSON string of additional metadata
- **description**: Text description of the artifact
- **tags**: JSON array of tags (as string)
- **version**: Version identifier
- **parent_id**: ID of parent artifact (for versioning)
"""
try:
# Parse JSON fields
test_config_dict = json.loads(test_config) if test_config else None
metadata_dict = json.loads(metadata) if metadata else None
tags_list = json.loads(tags) if tags else None
# Generate unique storage path
file_extension = file.filename.split('.')[-1] if '.' in file.filename else ''
object_name = f"{uuid.uuid4()}.{file_extension}" if file_extension else str(uuid.uuid4())
# Upload to storage backend
storage = get_storage_backend()
file_content = await file.read()
file_size = len(file_content)
storage_path = await storage.upload_file(
io.BytesIO(file_content),
object_name
)
# Create database record
artifact = Artifact(
filename=file.filename,
file_type=get_file_type(file.filename),
file_size=file_size,
storage_path=storage_path,
content_type=file.content_type,
test_name=test_name,
test_suite=test_suite,
test_config=test_config_dict,
test_result=test_result,
artifact_metadata=metadata_dict,
description=description,
tags=tags_list,
version=version,
parent_id=parent_id
)
db.add(artifact)
db.commit()
db.refresh(artifact)
return artifact
except json.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON in metadata fields: {str(e)}")
except Exception as e:
db.rollback()
raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}")
@router.get("/{artifact_id}", response_model=ArtifactResponse)
async def get_artifact(artifact_id: int, db: Session = Depends(get_db)):
"""Get artifact metadata by ID"""
artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first()
if not artifact:
raise HTTPException(status_code=404, detail="Artifact not found")
return artifact
@router.get("/{artifact_id}/download")
async def download_artifact(artifact_id: int, db: Session = Depends(get_db)):
"""Download artifact file by ID"""
artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first()
if not artifact:
raise HTTPException(status_code=404, detail="Artifact not found")
try:
storage = get_storage_backend()
# Extract object name from storage path
object_name = artifact.storage_path.split('/')[-1]
file_data = await storage.download_file(object_name)
return StreamingResponse(
io.BytesIO(file_data),
media_type=artifact.content_type or "application/octet-stream",
headers={
"Content-Disposition": f'attachment; filename="{artifact.filename}"'
}
)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Download failed: {str(e)}")
@router.get("/{artifact_id}/url")
async def get_artifact_url(
artifact_id: int,
expiration: int = Query(default=3600, ge=60, le=86400),
db: Session = Depends(get_db)
):
"""Get presigned URL for artifact download"""
artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first()
if not artifact:
raise HTTPException(status_code=404, detail="Artifact not found")
try:
storage = get_storage_backend()
object_name = artifact.storage_path.split('/')[-1]
url = await storage.get_file_url(object_name, expiration)
return {"url": url, "expires_in": expiration}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to generate URL: {str(e)}")
@router.delete("/{artifact_id}")
async def delete_artifact(artifact_id: int, db: Session = Depends(get_db)):
"""Delete artifact and its file"""
artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first()
if not artifact:
raise HTTPException(status_code=404, detail="Artifact not found")
try:
# Delete from storage
storage = get_storage_backend()
object_name = artifact.storage_path.split('/')[-1]
await storage.delete_file(object_name)
# Delete from database
db.delete(artifact)
db.commit()
return {"message": "Artifact deleted successfully"}
except Exception as e:
db.rollback()
raise HTTPException(status_code=500, detail=f"Delete failed: {str(e)}")
@router.post("/query", response_model=List[ArtifactResponse])
async def query_artifacts(query: ArtifactQuery, db: Session = Depends(get_db)):
"""
Query artifacts with filters
- **filename**: Filter by filename (partial match)
- **file_type**: Filter by file type
- **test_name**: Filter by test name
- **test_suite**: Filter by test suite
- **test_result**: Filter by test result
- **tags**: Filter by tags (must contain all specified tags)
- **start_date**: Filter by creation date (from)
- **end_date**: Filter by creation date (to)
- **limit**: Maximum number of results
- **offset**: Number of results to skip
"""
q = db.query(Artifact)
if query.filename:
q = q.filter(Artifact.filename.ilike(f"%{query.filename}%"))
if query.file_type:
q = q.filter(Artifact.file_type == query.file_type)
if query.test_name:
q = q.filter(Artifact.test_name.ilike(f"%{query.test_name}%"))
if query.test_suite:
q = q.filter(Artifact.test_suite == query.test_suite)
if query.test_result:
q = q.filter(Artifact.test_result == query.test_result)
if query.tags:
for tag in query.tags:
# Plain JSON has no containment operator on PostgreSQL; cast to JSONB so
# .contains() compiles to @> and every requested tag must be present.
q = q.filter(cast(Artifact.tags, JSONB).contains([tag]))
if query.start_date:
q = q.filter(Artifact.created_at >= query.start_date)
if query.end_date:
q = q.filter(Artifact.created_at <= query.end_date)
# Order by creation date descending
q = q.order_by(Artifact.created_at.desc())
# Apply pagination
artifacts = q.offset(query.offset).limit(query.limit).all()
return artifacts
@router.get("/", response_model=List[ArtifactResponse])
async def list_artifacts(
limit: int = Query(default=100, le=1000),
offset: int = Query(default=0, ge=0),
db: Session = Depends(get_db)
):
"""List all artifacts with pagination"""
artifacts = db.query(Artifact).order_by(
Artifact.created_at.desc()
).offset(offset).limit(limit).all()
return artifacts

35
app/config.py Normal file

@@ -0,0 +1,35 @@
from pydantic_settings import BaseSettings
from typing import Literal
class Settings(BaseSettings):
# Database
database_url: str = "postgresql://user:password@localhost:5432/datalake"
# Storage Backend
storage_backend: Literal["s3", "minio"] = "minio"
# AWS S3
aws_access_key_id: str = ""
aws_secret_access_key: str = ""
aws_region: str = "us-east-1"
s3_bucket_name: str = "test-artifacts"
# MinIO
minio_endpoint: str = "localhost:9000"
minio_access_key: str = "minioadmin"
minio_secret_key: str = "minioadmin"
minio_bucket_name: str = "test-artifacts"
minio_secure: bool = False
# Application
api_host: str = "0.0.0.0"
api_port: int = 8000
max_upload_size: int = 524288000 # 500MB
class Config:
env_file = ".env"
case_sensitive = False
settings = Settings()

21
app/database.py Normal file

@@ -0,0 +1,21 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from app.config import settings
from app.models.artifact import Base
engine = create_engine(settings.database_url)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
def init_db():
"""Initialize database tables"""
Base.metadata.create_all(bind=engine)
def get_db():
"""Dependency for getting database session"""
db = SessionLocal()
try:
yield db
finally:
db.close()

71
app/main.py Normal file

@@ -0,0 +1,71 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.api.artifacts import router as artifacts_router
from app.database import init_db
from app.config import settings
import logging
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Create FastAPI app
app = FastAPI(
title="Test Artifact Data Lake",
description="API for storing and querying test artifacts including CSV, JSON, binary files, and packet captures",
version="1.0.0",
docs_url="/docs",
redoc_url="/redoc"
)
# Configure CORS
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include routers
app.include_router(artifacts_router)
@app.on_event("startup")
async def startup_event():
"""Initialize database on startup"""
logger.info("Initializing database...")
init_db()
logger.info(f"Using storage backend: {settings.storage_backend}")
logger.info("Application started successfully")
@app.get("/")
async def root():
"""Root endpoint"""
return {
"message": "Test Artifact Data Lake API",
"version": "1.0.0",
"docs": "/docs",
"storage_backend": settings.storage_backend
}
@app.get("/health")
async def health_check():
"""Health check endpoint"""
return {"status": "healthy"}
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"app.main:app",
host=settings.api_host,
port=settings.api_port,
reload=True
)

3
app/models/__init__.py Normal file

@@ -0,0 +1,3 @@
from .artifact import Artifact
__all__ = ["Artifact"]

38
app/models/artifact.py Normal file

@@ -0,0 +1,38 @@
from sqlalchemy import Column, String, Integer, DateTime, JSON, BigInteger, Text
from sqlalchemy.orm import declarative_base
from datetime import datetime
Base = declarative_base()
class Artifact(Base):
__tablename__ = "artifacts"
id = Column(Integer, primary_key=True, index=True)
filename = Column(String(500), nullable=False, index=True)
file_type = Column(String(50), nullable=False, index=True) # csv, json, binary, pcap
file_size = Column(BigInteger, nullable=False)
storage_path = Column(String(1000), nullable=False)
content_type = Column(String(100))
# Test metadata
test_name = Column(String(500), index=True)
test_suite = Column(String(500), index=True)
test_config = Column(JSON)
test_result = Column(String(50), index=True) # pass, fail, skip, error
# Additional metadata
# "metadata" is a reserved attribute name on SQLAlchemy declarative models,
# so the Python attribute is renamed while the column keeps the name "metadata".
artifact_metadata = Column("metadata", JSON)
description = Column(Text)
tags = Column(JSON) # Array of tags for categorization
# Timestamps
created_at = Column(DateTime, default=datetime.utcnow, index=True)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
# Versioning
version = Column(String(50))
parent_id = Column(Integer, index=True) # For file versioning
def __repr__(self):
return f"<Artifact(id={self.id}, filename='{self.filename}', test_name='{self.test_name}')>"

3
app/schemas/__init__.py Normal file

@@ -0,0 +1,3 @@
from .artifact import ArtifactCreate, ArtifactResponse, ArtifactQuery
__all__ = ["ArtifactCreate", "ArtifactResponse", "ArtifactQuery"]

51
app/schemas/artifact.py Normal file

@@ -0,0 +1,51 @@
from pydantic import BaseModel, Field
from typing import Optional, Dict, Any, List
from datetime import datetime
class ArtifactCreate(BaseModel):
test_name: Optional[str] = None
test_suite: Optional[str] = None
test_config: Optional[Dict[str, Any]] = None
test_result: Optional[str] = None
metadata: Optional[Dict[str, Any]] = None
description: Optional[str] = None
tags: Optional[List[str]] = None
version: Optional[str] = None
parent_id: Optional[int] = None
class ArtifactResponse(BaseModel):
id: int
filename: str
file_type: str
file_size: int
storage_path: str
content_type: Optional[str] = None
test_name: Optional[str] = None
test_suite: Optional[str] = None
test_config: Optional[Dict[str, Any]] = None
test_result: Optional[str] = None
# Reads the renamed ORM attribute but still serializes as "metadata" in responses.
artifact_metadata: Optional[Dict[str, Any]] = Field(default=None, serialization_alias="metadata")
description: Optional[str] = None
tags: Optional[List[str]] = None
created_at: datetime
updated_at: datetime
version: Optional[str] = None
parent_id: Optional[int] = None
class Config:
from_attributes = True
class ArtifactQuery(BaseModel):
filename: Optional[str] = None
file_type: Optional[str] = None
test_name: Optional[str] = None
test_suite: Optional[str] = None
test_result: Optional[str] = None
tags: Optional[List[str]] = None
start_date: Optional[datetime] = None
end_date: Optional[datetime] = None
limit: int = Field(default=100, le=1000)
offset: int = Field(default=0, ge=0)
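
The query schema carries its own guard rails (limit capped at 1000, non-negative offset), so filters can be validated before they reach the database. A small sketch of the behaviour defined above:

from pydantic import ValidationError

from app.schemas.artifact import ArtifactQuery

# exclude_none yields a clean filter dict for the query layer
q = ArtifactQuery(test_suite="demo", test_result="fail", limit=50)
print(q.model_dump(exclude_none=True))
# {'test_suite': 'demo', 'test_result': 'fail', 'limit': 50, 'offset': 0}

# Field(le=1000) rejects oversized page sizes
try:
    ArtifactQuery(limit=5000)
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # 'less_than_equal'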

6
app/storage/__init__.py Normal file
View File

@@ -0,0 +1,6 @@
from .base import StorageBackend
from .s3_backend import S3Backend
from .minio_backend import MinIOBackend
from .factory import get_storage_backend
__all__ = ["StorageBackend", "S3Backend", "MinIOBackend", "get_storage_backend"]

73
app/storage/base.py Normal file
View File

@@ -0,0 +1,73 @@
from abc import ABC, abstractmethod
from typing import BinaryIO
class StorageBackend(ABC):
"""Abstract base class for storage backends"""
@abstractmethod
async def upload_file(self, file_data: BinaryIO, object_name: str) -> str:
"""
Upload a file to storage
Args:
file_data: Binary file data
object_name: Name/path of the object in storage
Returns:
Storage path/URL of uploaded file
"""
pass
@abstractmethod
async def download_file(self, object_name: str) -> bytes:
"""
Download a file from storage
Args:
object_name: Name/path of the object in storage
Returns:
Binary file data
"""
pass
@abstractmethod
async def delete_file(self, object_name: str) -> bool:
"""
Delete a file from storage
Args:
object_name: Name/path of the object in storage
Returns:
True if successful
"""
pass
@abstractmethod
async def file_exists(self, object_name: str) -> bool:
"""
Check if a file exists in storage
Args:
object_name: Name/path of the object in storage
Returns:
True if file exists
"""
pass
@abstractmethod
async def get_file_url(self, object_name: str, expiration: int = 3600) -> str:
"""
Get a presigned URL for downloading a file
Args:
object_name: Name/path of the object in storage
expiration: URL expiration time in seconds
Returns:
Presigned URL
"""
pass
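
A new backend only has to provide these five coroutines. As an illustration (not part of this commit), a hypothetical local-disk backend satisfying the interface could look like this:

from pathlib import Path
from typing import BinaryIO

from app.storage.base import StorageBackend


class LocalDiskBackend(StorageBackend):
    """Hypothetical backend that stores objects under a local directory; useful for tests."""

    def __init__(self, root: str = "/tmp/artifacts") -> None:
        self.root = Path(root)
        self.root.mkdir(parents=True, exist_ok=True)

    async def upload_file(self, file_data: BinaryIO, object_name: str) -> str:
        target = self.root / object_name
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(file_data.read())
        return f"file://{target}"

    async def download_file(self, object_name: str) -> bytes:
        return (self.root / object_name).read_bytes()

    async def delete_file(self, object_name: str) -> bool:
        path = self.root / object_name
        if path.exists():
            path.unlink()
            return True
        return False

    async def file_exists(self, object_name: str) -> bool:
        return (self.root / object_name).exists()

    async def get_file_url(self, object_name: str, expiration: int = 3600) -> str:
        # Local files have no presigned URLs; return a plain file:// URI instead.
        return f"file://{self.root / object_name}"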

17
app/storage/factory.py Normal file
View File

@@ -0,0 +1,17 @@
from app.storage.base import StorageBackend
from app.storage.s3_backend import S3Backend
from app.storage.minio_backend import MinIOBackend
from app.config import settings
def get_storage_backend() -> StorageBackend:
"""
Factory function to get the appropriate storage backend
based on configuration
"""
if settings.storage_backend == "s3":
return S3Backend()
elif settings.storage_backend == "minio":
return MinIOBackend()
else:
raise ValueError(f"Unsupported storage backend: {settings.storage_backend}")
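
A minimal sketch of the factory in use; which class comes back is driven entirely by STORAGE_BACKEND:

from app.storage.base import StorageBackend
from app.storage.factory import get_storage_backend

storage: StorageBackend = get_storage_backend()
print(type(storage).__name__)  # "MinIOBackend" with the .env.example defaults, "S3Backend" when STORAGE_BACKEND=s3
# Any other value raises ValueError (see above).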

88
app/storage/minio_backend.py Normal file
View File

@@ -0,0 +1,88 @@
import boto3
from botocore.exceptions import ClientError
from botocore.client import Config
from typing import BinaryIO
from app.storage.base import StorageBackend
from app.config import settings
import logging
logger = logging.getLogger(__name__)
class MinIOBackend(StorageBackend):
"""MinIO storage backend implementation (S3-compatible)"""
def __init__(self):
# MinIO uses S3-compatible API
self.s3_client = boto3.client(
's3',
endpoint_url=f"{'https' if settings.minio_secure else 'http'}://{settings.minio_endpoint}",
aws_access_key_id=settings.minio_access_key,
aws_secret_access_key=settings.minio_secret_key,
config=Config(signature_version='s3v4'),
region_name='us-east-1'
)
self.bucket_name = settings.minio_bucket_name
self._ensure_bucket_exists()
def _ensure_bucket_exists(self):
"""Create bucket if it doesn't exist"""
try:
self.s3_client.head_bucket(Bucket=self.bucket_name)
except ClientError as e:
error_code = e.response['Error']['Code']
if error_code == '404':
try:
self.s3_client.create_bucket(Bucket=self.bucket_name)
logger.info(f"Created MinIO bucket: {self.bucket_name}")
except ClientError as create_error:
logger.error(f"Failed to create bucket: {create_error}")
raise
async def upload_file(self, file_data: BinaryIO, object_name: str) -> str:
"""Upload file to MinIO"""
try:
self.s3_client.upload_fileobj(file_data, self.bucket_name, object_name)
return f"minio://{self.bucket_name}/{object_name}"
except ClientError as e:
logger.error(f"Failed to upload file to MinIO: {e}")
raise
async def download_file(self, object_name: str) -> bytes:
"""Download file from MinIO"""
try:
response = self.s3_client.get_object(Bucket=self.bucket_name, Key=object_name)
return response['Body'].read()
except ClientError as e:
logger.error(f"Failed to download file from MinIO: {e}")
raise
async def delete_file(self, object_name: str) -> bool:
"""Delete file from MinIO"""
try:
self.s3_client.delete_object(Bucket=self.bucket_name, Key=object_name)
return True
except ClientError as e:
logger.error(f"Failed to delete file from MinIO: {e}")
return False
async def file_exists(self, object_name: str) -> bool:
"""Check if file exists in MinIO"""
try:
self.s3_client.head_object(Bucket=self.bucket_name, Key=object_name)
return True
except ClientError:
return False
async def get_file_url(self, object_name: str, expiration: int = 3600) -> str:
"""Generate presigned URL for MinIO object"""
try:
url = self.s3_client.generate_presigned_url(
'get_object',
Params={'Bucket': self.bucket_name, 'Key': object_name},
ExpiresIn=expiration
)
return url
except ClientError as e:
logger.error(f"Failed to generate presigned URL: {e}")
raise
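
Both concrete backends expose the same coroutine surface, so a round trip looks identical whichever one is configured. A sketch against the MinIO backend, assuming the endpoint and credentials from .env.example and an illustrative object key:

import asyncio
import io

from app.storage.minio_backend import MinIOBackend


async def round_trip() -> None:
    storage = MinIOBackend()  # connects with the MinIO settings and ensures the bucket exists
    await storage.upload_file(io.BytesIO(b"hello"), "demo/hello.txt")
    print(await storage.download_file("demo/hello.txt"))      # b'hello'
    print(await storage.get_file_url("demo/hello.txt", 600))  # presigned URL, valid for 10 minutes
    await storage.delete_file("demo/hello.txt")


asyncio.run(round_trip())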

87
app/storage/s3_backend.py Normal file
View File

@@ -0,0 +1,87 @@
import boto3
from botocore.exceptions import ClientError
from typing import BinaryIO
from app.storage.base import StorageBackend
from app.config import settings
import logging
logger = logging.getLogger(__name__)
class S3Backend(StorageBackend):
"""AWS S3 storage backend implementation"""
def __init__(self):
self.s3_client = boto3.client(
's3',
aws_access_key_id=settings.aws_access_key_id,
aws_secret_access_key=settings.aws_secret_access_key,
region_name=settings.aws_region
)
self.bucket_name = settings.s3_bucket_name
self._ensure_bucket_exists()
def _ensure_bucket_exists(self):
"""Create bucket if it doesn't exist"""
try:
self.s3_client.head_bucket(Bucket=self.bucket_name)
except ClientError as e:
error_code = e.response['Error']['Code']
if error_code == '404':
try:
# Note: us-east-1 rejects an explicit LocationConstraint, so it must be omitted there.
if settings.aws_region == "us-east-1":
    self.s3_client.create_bucket(Bucket=self.bucket_name)
else:
    self.s3_client.create_bucket(
        Bucket=self.bucket_name,
        CreateBucketConfiguration={'LocationConstraint': settings.aws_region}
    )
logger.info(f"Created S3 bucket: {self.bucket_name}")
except ClientError as create_error:
logger.error(f"Failed to create bucket: {create_error}")
raise
async def upload_file(self, file_data: BinaryIO, object_name: str) -> str:
"""Upload file to S3"""
try:
self.s3_client.upload_fileobj(file_data, self.bucket_name, object_name)
return f"s3://{self.bucket_name}/{object_name}"
except ClientError as e:
logger.error(f"Failed to upload file to S3: {e}")
raise
async def download_file(self, object_name: str) -> bytes:
"""Download file from S3"""
try:
response = self.s3_client.get_object(Bucket=self.bucket_name, Key=object_name)
return response['Body'].read()
except ClientError as e:
logger.error(f"Failed to download file from S3: {e}")
raise
async def delete_file(self, object_name: str) -> bool:
"""Delete file from S3"""
try:
self.s3_client.delete_object(Bucket=self.bucket_name, Key=object_name)
return True
except ClientError as e:
logger.error(f"Failed to delete file from S3: {e}")
return False
async def file_exists(self, object_name: str) -> bool:
"""Check if file exists in S3"""
try:
self.s3_client.head_object(Bucket=self.bucket_name, Key=object_name)
return True
except ClientError:
return False
async def get_file_url(self, object_name: str, expiration: int = 3600) -> str:
"""Generate presigned URL for S3 object"""
try:
url = self.s3_client.generate_presigned_url(
'get_object',
Params={'Bucket': self.bucket_name, 'Key': object_name},
ExpiresIn=expiration
)
return url
except ClientError as e:
logger.error(f"Failed to generate presigned URL: {e}")
raise

62
docker-compose.yml Normal file
View File

@@ -0,0 +1,62 @@
version: '3.8'
services:
postgres:
image: postgres:15
environment:
POSTGRES_USER: user
POSTGRES_PASSWORD: password
POSTGRES_DB: datalake
ports:
- "5432:5432"
volumes:
- postgres_data:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U user"]
interval: 10s
timeout: 5s
retries: 5
minio:
image: minio/minio:latest
command: server /data --console-address ":9001"
environment:
MINIO_ROOT_USER: minioadmin
MINIO_ROOT_PASSWORD: minioadmin
ports:
- "9000:9000"
- "9001:9001"
volumes:
- minio_data:/data
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 10s
timeout: 5s
retries: 5
api:
build: .
ports:
- "8000:8000"
environment:
DATABASE_URL: postgresql://user:password@postgres:5432/datalake
STORAGE_BACKEND: minio
MINIO_ENDPOINT: minio:9000
MINIO_ACCESS_KEY: minioadmin
MINIO_SECRET_KEY: minioadmin
MINIO_BUCKET_NAME: test-artifacts
MINIO_SECURE: "false"
depends_on:
postgres:
condition: service_healthy
minio:
condition: service_healthy
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]  # stdlib only; requests is not in requirements.txt
interval: 30s
timeout: 10s
retries: 3
volumes:
postgres_data:
minio_data:
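
The api container's healthcheck above sticks to the standard library; the same probe can be run from the host while the stack boots, e.g. with this sketch:

import json
import time
import urllib.error
import urllib.request

# Poll /health until the API answers or a 60-second deadline passes.
deadline = time.time() + 60
while True:
    try:
        with urllib.request.urlopen("http://localhost:8000/health", timeout=2) as resp:
            if json.load(resp).get("status") == "healthy":
                print("API is up")
                break
    except (urllib.error.URLError, ConnectionError):
        pass
    if time.time() > deadline:
        raise SystemExit("API did not become healthy in time")
    time.sleep(2)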

13
helm/Chart.yaml Normal file
View File

@@ -0,0 +1,13 @@
apiVersion: v2
name: datalake
description: Test Artifact Data Lake - Store and query test artifacts
type: application
version: 1.0.0
appVersion: "1.0.0"
keywords:
- testing
- artifacts
- storage
- data-lake
maintainers:
- name: Your Team

60
helm/templates/_helpers.tpl Normal file
View File

@@ -0,0 +1,60 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "datalake.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
*/}}
{{- define "datalake.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "datalake.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "datalake.labels" -}}
helm.sh/chart: {{ include "datalake.chart" . }}
{{ include "datalake.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "datalake.selectorLabels" -}}
app.kubernetes.io/name: {{ include "datalake.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "datalake.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "datalake.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}

111
helm/templates/deployment.yaml Normal file
View File

@@ -0,0 +1,111 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "datalake.fullname" . }}
labels:
{{- include "datalake.labels" . | nindent 4 }}
spec:
{{- if not .Values.autoscaling.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "datalake.selectorLabels" . | nindent 6 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "datalake.selectorLabels" . | nindent 8 }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "datalake.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
containers:
- name: {{ .Chart.Name }}
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
ports:
- name: http
containerPort: {{ .Values.service.targetPort }}
protocol: TCP
livenessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 5
periodSeconds: 5
env:
- name: DATABASE_URL
valueFrom:
secretKeyRef:
name: {{ include "datalake.fullname" . }}-secrets
key: database-url
- name: STORAGE_BACKEND
value: {{ .Values.config.storageBackend | quote }}
- name: MAX_UPLOAD_SIZE
value: {{ .Values.config.maxUploadSize | quote }}
{{- if eq .Values.config.storageBackend "s3" }}
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: {{ include "datalake.fullname" . }}-secrets
key: aws-access-key-id
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: {{ include "datalake.fullname" . }}-secrets
key: aws-secret-access-key
- name: AWS_REGION
value: {{ .Values.aws.region | quote }}
- name: S3_BUCKET_NAME
value: {{ .Values.aws.bucketName | quote }}
{{- else }}
- name: MINIO_ENDPOINT
value: "{{ include "datalake.fullname" . }}-minio:9000"
- name: MINIO_ACCESS_KEY
valueFrom:
secretKeyRef:
name: {{ include "datalake.fullname" . }}-secrets
key: minio-access-key
- name: MINIO_SECRET_KEY
valueFrom:
secretKeyRef:
name: {{ include "datalake.fullname" . }}-secrets
key: minio-secret-key
- name: MINIO_BUCKET_NAME
value: "test-artifacts"
- name: MINIO_SECURE
value: "false"
{{- end }}
{{- with .Values.env }}
{{- toYaml . | nindent 12 }}
{{- end }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}

41
helm/templates/ingress.yaml Normal file
View File

@@ -0,0 +1,41 @@
{{- if .Values.ingress.enabled -}}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: {{ include "datalake.fullname" . }}
labels:
{{- include "datalake.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if .Values.ingress.className }}
ingressClassName: {{ .Values.ingress.className }}
{{- end }}
{{- if .Values.ingress.tls }}
tls:
{{- range .Values.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
{{- range .Values.ingress.hosts }}
- host: {{ .host | quote }}
http:
paths:
{{- range .paths }}
- path: {{ .path }}
pathType: {{ .pathType }}
backend:
service:
name: {{ include "datalake.fullname" $ }}
port:
number: {{ $.Values.service.port }}
{{- end }}
{{- end }}
{{- end }}

16
helm/templates/secrets.yaml Normal file
View File

@@ -0,0 +1,16 @@
apiVersion: v1
kind: Secret
metadata:
name: {{ include "datalake.fullname" . }}-secrets
labels:
{{- include "datalake.labels" . | nindent 4 }}
type: Opaque
stringData:
database-url: "postgresql://{{ .Values.postgresql.auth.username }}:{{ .Values.postgresql.auth.password }}@{{ include "datalake.fullname" . }}-postgresql:5432/{{ .Values.postgresql.auth.database }}"
{{- if .Values.aws.enabled }}
aws-access-key-id: {{ .Values.aws.accessKeyId | quote }}
aws-secret-access-key: {{ .Values.aws.secretAccessKey | quote }}
{{- else }}
minio-access-key: {{ .Values.minio.rootUser | quote }}
minio-secret-key: {{ .Values.minio.rootPassword | quote }}
{{- end }}

15
helm/templates/service.yaml Normal file
View File

@@ -0,0 +1,15 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "datalake.fullname" . }}
labels:
{{- include "datalake.labels" . | nindent 4 }}
spec:
type: {{ .Values.service.type }}
ports:
- port: {{ .Values.service.port }}
targetPort: http
protocol: TCP
name: http
selector:
{{- include "datalake.selectorLabels" . | nindent 4 }}

12
helm/templates/serviceaccount.yaml Normal file
View File

@@ -0,0 +1,12 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "datalake.serviceAccountName" . }}
labels:
{{- include "datalake.labels" . | nindent 4 }}
{{- with .Values.serviceAccount.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}

111
helm/values.yaml Normal file
View File

@@ -0,0 +1,111 @@
replicaCount: 1
image:
repository: datalake
pullPolicy: IfNotPresent
tag: "latest"
imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""
serviceAccount:
create: true
annotations: {}
name: ""
podAnnotations: {}
podSecurityContext:
fsGroup: 1000
securityContext:
capabilities:
drop:
- ALL
readOnlyRootFilesystem: false
runAsNonRoot: true
runAsUser: 1000
service:
type: ClusterIP
port: 8000
targetPort: 8000
ingress:
enabled: false
className: ""
annotations: {}
hosts:
- host: datalake.local
paths:
- path: /
pathType: Prefix
tls: []
resources:
limits:
cpu: 1000m
memory: 1Gi
requests:
cpu: 500m
memory: 512Mi
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 10
targetCPUUtilizationPercentage: 80
nodeSelector: {}
tolerations: []
affinity: {}
# Application configuration
config:
storageBackend: minio # or "s3"
maxUploadSize: 524288000 # 500MB
# PostgreSQL configuration
postgresql:
enabled: true
auth:
username: user
password: password
database: datalake
primary:
persistence:
enabled: true
size: 10Gi
# MinIO configuration (for self-hosted storage)
minio:
enabled: true
mode: standalone
rootUser: minioadmin
rootPassword: minioadmin
persistence:
enabled: true
size: 50Gi
service:
type: ClusterIP
port: 9000
consoleService:
port: 9001
# AWS S3 configuration (when using AWS)
aws:
enabled: false
accessKeyId: ""
secretAccessKey: ""
region: us-east-1
bucketName: test-artifacts
# Environment variables
env:
- name: API_HOST
value: "0.0.0.0"
- name: API_PORT
value: "8000"

80
quickstart.sh Executable file
View File

@@ -0,0 +1,80 @@
#!/bin/bash
set -e
echo "========================================="
echo "Test Artifact Data Lake - Quick Start"
echo "========================================="
echo ""
# Check if Docker is installed
if ! command -v docker &> /dev/null; then
echo "Error: Docker is not installed. Please install Docker first."
exit 1
fi
# Check if Docker Compose is installed
if ! command -v docker-compose &> /dev/null; then
echo "Error: Docker Compose is not installed. Please install Docker Compose first."
exit 1
fi
# Create .env file if it doesn't exist
if [ ! -f .env ]; then
echo "Creating .env file from .env.example..."
cp .env.example .env
echo "✓ .env file created"
else
echo "✓ .env file already exists"
fi
echo ""
echo "Starting services with Docker Compose..."
docker-compose up -d
echo ""
echo "Waiting for services to be ready..."
sleep 10
echo ""
echo "========================================="
echo "Services are running!"
echo "========================================="
echo ""
echo "API: http://localhost:8000"
echo "API Docs: http://localhost:8000/docs"
echo "MinIO Console: http://localhost:9001"
echo " Username: minioadmin"
echo " Password: minioadmin"
echo ""
echo "To view logs: docker-compose logs -f"
echo "To stop: docker-compose down"
echo ""
echo "========================================="
echo "Testing the API..."
echo "========================================="
echo ""
# Wait a bit more for API to be fully ready
sleep 5
# Test health endpoint
if curl -s http://localhost:8000/health | grep -q "healthy"; then
echo "✓ API is healthy!"
echo ""
echo "Example: Upload a test file"
echo "----------------------------"
echo 'echo "test,data" > test.csv'
echo 'curl -X POST "http://localhost:8000/api/v1/artifacts/upload" \'
echo ' -F "file=@test.csv" \'
echo ' -F "test_name=sample_test" \'
echo ' -F "test_suite=demo" \'
echo ' -F "test_result=pass"'
echo ""
else
echo "⚠ API is not responding yet. Please wait a moment and check http://localhost:8000/health"
fi
echo "========================================="
echo "Setup complete! 🚀"
echo "========================================="
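
The curl example printed by the script translates directly to Python; a client-side sketch, assuming the compose stack is up and that requests is installed on the caller's machine (it is not part of this service's requirements.txt):

import requests  # client-side dependency, assumed installed separately

files = {"file": ("test.csv", b"test,data\n1,2", "text/csv")}
data = {"test_name": "sample_test", "test_suite": "demo", "test_result": "pass"}

resp = requests.post(
    "http://localhost:8000/api/v1/artifacts/upload",
    files=files,
    data=data,
    timeout=30,
)
resp.raise_for_status()
print(resp.json())  # the stored artifact record (see ArtifactResponse)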

11
requirements.txt Normal file
View File

@@ -0,0 +1,11 @@
fastapi==0.115.0
uvicorn[standard]==0.31.0
python-multipart==0.0.12
sqlalchemy==2.0.35
psycopg2-binary==2.9.9
alembic==1.13.3
boto3==1.35.36
python-dotenv==1.0.1
pydantic==2.9.2
pydantic-settings==2.5.2
aiofiles==24.1.0

0
tests/__init__.py Normal file
View File

38
tests/test_api.py Normal file
View File

@@ -0,0 +1,38 @@
import pytest
from fastapi.testclient import TestClient
from app.main import app
client = TestClient(app)
def test_root():
"""Test root endpoint"""
response = client.get("/")
assert response.status_code == 200
data = response.json()
assert "message" in data
assert "version" in data
def test_health():
"""Test health check endpoint"""
response = client.get("/health")
assert response.status_code == 200
data = response.json()
assert data["status"] == "healthy"
# Add more tests as needed
# def test_upload_artifact():
# """Test artifact upload"""
# files = {"file": ("test.csv", b"test,data\n1,2", "text/csv")}
# data = {
# "test_name": "sample_test",
# "test_suite": "unit",
# "test_result": "pass"
# }
# response = client.post("/api/v1/artifacts/upload", files=files, data=data)
# assert response.status_code == 201
# artifact = response.json()
# assert artifact["filename"] == "test.csv"
# assert artifact["test_name"] == "sample_test"