diff --git a/Dockerfile b/Dockerfile
index 1741c6c..4a688cc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,15 @@
-FROM python:3.11-slim
+FROM python:3.11-alpine
 
 WORKDIR /app
 
-# Install system dependencies
-RUN apt-get update && apt-get install -y \
+# Install system dependencies for Alpine
+# Alpine uses apk instead of apt-get and is lighter/faster
+RUN apk add --no-cache \
     gcc \
+    musl-dev \
+    postgresql-dev \
     postgresql-client \
-    && rm -rf /var/lib/apt/lists/*
+    linux-headers
 
 # Copy requirements and install Python dependencies
 COPY requirements.txt .
@@ -18,8 +21,8 @@ COPY utils/ ./utils/
 COPY alembic/ ./alembic/
 COPY alembic.ini .
 
-# Create non-root user
-RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
+# Create non-root user (Alpine uses adduser instead of useradd)
+RUN adduser -D -u 1000 appuser && chown -R appuser:appuser /app
 USER appuser
 
 # Expose port
diff --git a/README.md b/README.md
index c2faf59..66ac471 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,6 @@
-# Test Artifact Data Lake
+# Obsidian
+
+**Enterprise Test Artifact Storage**
 
 A lightweight, cloud-native API for storing and querying test artifacts including CSV files, JSON files, binary files, and packet captures (PCAP). Built with FastAPI and supports both AWS S3 and self-hosted MinIO storage backends.
 
diff --git a/alembic/.gitkeep b/alembic/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/alembic/env.py b/alembic/env.py
new file mode 100644
index 0000000..4618cd6
--- /dev/null
+++ b/alembic/env.py
@@ -0,0 +1,84 @@
+from logging.config import fileConfig
+import os
+
+from sqlalchemy import engine_from_config
+from sqlalchemy import pool
+
+from alembic import context
+
+# Import your models Base
+from app.models.artifact import Base
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+# Override sqlalchemy.url from environment variable
+if os.getenv("DATABASE_URL"):
+    config.set_main_option("sqlalchemy.url", os.getenv("DATABASE_URL"))
+
+# Interpret the config file for Python logging.
+# This line sets up loggers basically.
+if config.config_file_name is not None:
+    fileConfig(config.config_file_name)
+
+# add your model's MetaData object here
+# for 'autogenerate' support
+target_metadata = Base.metadata
+
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+
+
+def run_migrations_offline() -> None:
+    """Run migrations in 'offline' mode.
+
+    This configures the context with just a URL
+    and not an Engine, though an Engine is acceptable
+    here as well. By skipping the Engine creation
+    we don't even need a DBAPI to be available.
+
+    Calls to context.execute() here emit the given string to the
+    script output.
+
+    """
+    url = config.get_main_option("sqlalchemy.url")
+    context.configure(
+        url=url,
+        target_metadata=target_metadata,
+        literal_binds=True,
+        dialect_opts={"paramstyle": "named"},
+    )
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def run_migrations_online() -> None:
+    """Run migrations in 'online' mode.
+
+    In this scenario we need to create an Engine
+    and associate a connection with the context.
+
+    """
+    connectable = engine_from_config(
+        config.get_section(config.config_ini_section, {}),
+        prefix="sqlalchemy.",
+        poolclass=pool.NullPool,
+    )
+
+    with connectable.connect() as connection:
+        context.configure(
+            connection=connection, target_metadata=target_metadata
+        )
+
+        with context.begin_transaction():
+            context.run_migrations()
+
+
+if context.is_offline_mode():
+    run_migrations_offline()
+else:
+    run_migrations_online()
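
The env.py above overrides sqlalchemy.url with DATABASE_URL whenever that variable is set, so the same migration scripts work in Docker Compose, CI, and local development without editing alembic.ini. A minimal sketch of how that override gets exercised, assuming migrations are driven from Python rather than the alembic CLI; the DSN below is a placeholder, not a value from this repo:

import os

from alembic import command
from alembic.config import Config

# Placeholder DSN for illustration only; in docker-compose this would point at
# the postgres service configured below.
os.environ.setdefault("DATABASE_URL", "postgresql://user:password@localhost:5432/artifacts")

alembic_cfg = Config("alembic.ini")   # the ini file copied into the image by the Dockerfile
command.upgrade(alembic_cfg, "head")  # env.py swaps in DATABASE_URL before connecting

This is equivalent to exporting DATABASE_URL and running `alembic upgrade head`.
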
+ + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/alembic/versions/.gitkeep b/alembic/versions/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/app/api/artifacts.py b/app/api/artifacts.py index 1d6b3d4..593413e 100644 --- a/app/api/artifacts.py +++ b/app/api/artifacts.py @@ -1,7 +1,7 @@ from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException, Query from fastapi.responses import StreamingResponse from sqlalchemy.orm import Session -from typing import List, Optional +from typing import List, Optional, Dict import uuid import json import io @@ -36,6 +36,7 @@ async def upload_artifact( test_suite: Optional[str] = Form(None), test_config: Optional[str] = Form(None), test_result: Optional[str] = Form(None), + sim_source_id: Optional[str] = Form(None), custom_metadata: Optional[str] = Form(None), description: Optional[str] = Form(None), tags: Optional[str] = Form(None), @@ -51,6 +52,7 @@ async def upload_artifact( - **test_suite**: Test suite identifier - **test_config**: JSON string of test configuration - **test_result**: Test result (pass, fail, skip, error) + - **sim_source_id**: SIM source ID to group multiple artifacts - **custom_metadata**: JSON string of additional metadata - **description**: Text description of the artifact - **tags**: JSON array of tags (as string) @@ -88,6 +90,7 @@ async def upload_artifact( test_suite=test_suite, test_config=test_config_dict, test_result=test_result, + sim_source_id=sim_source_id, custom_metadata=metadata_dict, description=description, tags=tags_list, @@ -194,6 +197,7 @@ async def query_artifacts(query: ArtifactQuery, db: Session = Depends(get_db)): - **test_name**: Filter by test name - **test_suite**: Filter by test suite - **test_result**: Filter by test result + - **sim_source_id**: Filter by SIM source ID - **tags**: Filter by tags (must contain all specified tags) - **start_date**: Filter by creation date (from) - **end_date**: Filter by creation date (to) @@ -212,6 +216,8 @@ async def query_artifacts(query: ArtifactQuery, db: Session = Depends(get_db)): q = q.filter(Artifact.test_suite == query.test_suite) if query.test_result: q = q.filter(Artifact.test_result == query.test_result) + if query.sim_source_id: + q = q.filter(Artifact.sim_source_id == query.sim_source_id) if query.tags: for tag in query.tags: q = q.filter(Artifact.tags.contains([tag])) @@ -240,3 +246,20 @@ async def list_artifacts( Artifact.created_at.desc() ).offset(offset).limit(limit).all() return artifacts + + +@router.get("/grouped-by-sim-source", response_model=Dict[str, List[ArtifactResponse]]) +async def get_artifacts_grouped_by_sim_source( + db: Session = Depends(get_db) +): + """Get all artifacts grouped by SIM source ID""" + from collections import defaultdict + + artifacts = db.query(Artifact).order_by(Artifact.created_at.desc()).all() + grouped = defaultdict(list) + + for artifact in artifacts: + sim_source = artifact.sim_source_id or "ungrouped" + grouped[sim_source].append(artifact) + + return dict(grouped) diff --git a/docker-compose.yml b/docker-compose.yml index cf82292..5b7fa2b 100644 --- a/docker-compose.yml +++ 
diff --git a/docker-compose.yml b/docker-compose.yml
index cf82292..5b7fa2b 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,7 +2,7 @@ version: '3.8'
 
 services:
   postgres:
-    image: postgres:15
+    image: postgres:15-alpine
     environment:
       POSTGRES_USER: user
       POSTGRES_PASSWORD: password
diff --git a/utils/seed_data.py b/utils/seed_data.py
index 07cf9a8..105bef5 100755
--- a/utils/seed_data.py
+++ b/utils/seed_data.py
@@ -129,7 +129,7 @@ def generate_pcap_content() -> bytes:
     return bytes(pcap_header)
 
 
-def create_artifact_data(index: int) -> Dict[str, Any]:
+def create_artifact_data(index: int, sim_source_id: str = None) -> Dict[str, Any]:
     """Generate metadata for an artifact"""
     test_name = random.choice(TEST_NAMES)
     test_suite = random.choice(TEST_SUITES)
@@ -164,6 +164,7 @@ def create_artifact_data(index: int) -> Dict[str, Any]:
         "test_name": test_name,
         "test_suite": test_suite,
         "test_result": test_result,
+        "sim_source_id": sim_source_id,
         "tags": artifact_tags,
         "test_config": test_config,
         "custom_metadata": custom_metadata,
@@ -265,6 +266,27 @@ async def generate_seed_data(num_artifacts: int = 50) -> List[int]:
     print(f"Deployment mode: {settings.deployment_mode}")
     print(f"Storage backend: {settings.storage_backend}")
 
+    # Generate SIM source IDs - each source will have 2-4 artifacts
+    num_sim_sources = max(num_artifacts // 3, 1)
+    sim_sources = [f"sim_run_{uuid.uuid4().hex[:8]}" for _ in range(num_sim_sources)]
+
+    # Pre-assign artifacts to SIM sources to ensure grouping
+    sim_source_assignments = []
+    for sim_source in sim_sources:
+        # Each SIM source gets 2-4 artifacts
+        num_artifacts_for_source = random.randint(2, 4)
+        sim_source_assignments.extend([sim_source] * num_artifacts_for_source)
+
+    # Pad remaining artifacts with None (ungrouped) or random sources
+    while len(sim_source_assignments) < num_artifacts:
+        if random.random() < 0.3:  # 30% ungrouped
+            sim_source_assignments.append(None)
+        else:
+            sim_source_assignments.append(random.choice(sim_sources))
+
+    # Shuffle to randomize order
+    random.shuffle(sim_source_assignments)
+
     for i in range(num_artifacts):
         # Randomly choose file type
         file_type_choice = random.choice(['csv', 'json', 'binary', 'pcap'])
@@ -289,8 +311,11 @@ async def generate_seed_data(num_artifacts: int = 50) -> List[int]:
         # Upload to storage
         storage_path = await upload_artifact_to_storage(content, filename)
 
+        # Get pre-assigned SIM source ID for this artifact
+        sim_source_id = sim_source_assignments[i]
+
         # Generate metadata
-        artifact_data = create_artifact_data(i)
+        artifact_data = create_artifact_data(i, sim_source_id)
 
         # Create database record
         artifact = Artifact(
@@ -303,6 +328,7 @@ async def generate_seed_data(num_artifacts: int = 50) -> List[int]:
            test_suite=artifact_data["test_suite"],
            test_config=artifact_data["test_config"],
            test_result=artifact_data["test_result"],
+           sim_source_id=artifact_data["sim_source_id"],
            custom_metadata=artifact_data["custom_metadata"],
            description=artifact_data["description"],
            tags=artifact_data["tags"],
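
The seed script and API both read and write Artifact.sim_source_id, but the model and migration that add the column are not part of this excerpt. For reference, a hypothetical Alembic revision for that column could look roughly like the following; the artifacts table name, the column type, and the index are assumptions inferred from how the field is used above, not code from this PR:

"""add sim_source_id to artifacts (hypothetical sketch, not from this PR)"""
import sqlalchemy as sa
from alembic import op

# Revision identifiers are normally generated by `alembic revision --autogenerate`.
revision = "0001_add_sim_source_id"  # placeholder
down_revision = None                 # placeholder
branch_labels = None
depends_on = None


def upgrade() -> None:
    # Nullable, since seeded artifacts may have no SIM source and the API falls back
    # to an "ungrouped" bucket; indexed because the query endpoint filters on equality.
    op.add_column("artifacts", sa.Column("sim_source_id", sa.String(), nullable=True))
    op.create_index("ix_artifacts_sim_source_id", "artifacts", ["sim_source_id"])


def downgrade() -> None:
    op.drop_index("ix_artifacts_sim_source_id", table_name="artifacts")
    op.drop_column("artifacts", "sim_source_id")
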