From a3a49ac9c3a9840a0f9a2ad9182d641d4365ed4d Mon Sep 17 00:00:00 2001 From: Mondo Diaz Date: Thu, 29 Jan 2026 11:28:59 -0600 Subject: [PATCH] Add upstream caching infrastructure and refactor CI pipeline Upstream Caching (Epic #68-#75, #105): - Add upstream_sources and cache_settings tables with migrations - Add cache management API endpoints (CRUD for sources, settings) - Add environment variable overrides for upstream sources and cache settings - Add encryption module for storing credentials securely - Add frontend Admin Cache Management page - Add is_system field to projects for system cache distinction - Add purge_seed_data for transitioning to production-like environments CI Pipeline Refactoring: - Remove reset jobs (reset_stage_pre, reset_stage) - Add ephemeral orchard-test deployment for main branch testing - Run integration tests on ephemeral deployment before promoting to stage - Stage is now long-running pre-prod (smoke tests only) - Disable prosper_setup for tag pipelines --- .gitlab-ci.yml | 188 +-- CHANGELOG.md | 70 + backend/app/cache.py | 316 ++++ backend/app/config.py | 119 ++ backend/app/database.py | 102 ++ backend/app/encryption.py | 160 ++ backend/app/models.py | 169 ++ backend/app/purge_seed_data.py | 211 +++ backend/app/routes.py | 1205 +++++++++++++- backend/app/schemas.py | 243 +++ backend/app/upstream.py | 586 +++++++ backend/requirements.txt | 2 +- backend/tests/test_upstream_caching.py | 2051 ++++++++++++++++++++++++ docs/epic-upstream-caching.md | 672 ++++++++ frontend/src/App.tsx | 2 + frontend/src/api.ts | 84 + frontend/src/components/Layout.tsx | 14 +- frontend/src/pages/AdminCachePage.css | 372 +++++ frontend/src/pages/AdminCachePage.tsx | 580 +++++++ frontend/src/pages/Home.css | 13 + frontend/src/pages/Home.tsx | 3 + frontend/src/pages/ProjectPage.tsx | 3 + frontend/src/types.ts | 72 + migrations/010_upstream_caching.sql | 137 ++ 24 files changed, 7271 insertions(+), 103 deletions(-) create mode 100644 backend/app/cache.py create mode 100644 backend/app/encryption.py create mode 100644 backend/app/purge_seed_data.py create mode 100644 backend/app/upstream.py create mode 100644 backend/tests/test_upstream_caching.py create mode 100644 docs/epic-upstream-caching.md create mode 100644 frontend/src/pages/AdminCachePage.css create mode 100644 frontend/src/pages/AdminCachePage.tsx create mode 100644 migrations/010_upstream_caching.sql diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b28100e..c523a9e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -11,13 +11,6 @@ variables: # Environment URLs (used by deploy and test jobs) STAGE_URL: https://orchard-stage.common.global.bsf.tools PROD_URL: https://orchard.common.global.bsf.tools - # Stage environment AWS resources (used by reset job) - STAGE_RDS_HOST: orchard-stage.cluster-cvw3jzjkozoc.us-gov-west-1.rds.amazonaws.com - STAGE_RDS_DBNAME: postgres - STAGE_SECRET_ARN: "arn:aws-us-gov:secretsmanager:us-gov-west-1:052673043337:secret:rds!cluster-a573672b-1a38-4665-a654-1b7df37b5297-IaeFQL" - STAGE_AUTH_SECRET_ARN: "arn:aws-us-gov:secretsmanager:us-gov-west-1:052673043337:secret:orchard-stage-creds-SMqvQx" - STAGE_S3_BUCKET: orchard-artifacts-stage - AWS_REGION: us-gov-west-1 # Shared pip cache directory PIP_CACHE_DIR: "$CI_PROJECT_DIR/.pip-cache" @@ -95,10 +88,18 @@ cve_sbom_analysis: when: never - when: on_success -# Override release job to wait for stage integration tests before creating tag +# Disable prosper_setup for tag pipelines since no build/analysis jobs run +# (image is already built when commit was on 
main, and deploy uses helm directly) +prosper_setup: + rules: + - if: '$CI_COMMIT_TAG' + when: never + - when: on_success + +# Override release job to wait for stage deployment and smoke tests before creating tag # This ensures the tag (which triggers prod deploy) is only created after stage passes release: - needs: [integration_test_stage, changelog] + needs: [smoke_test_stage, changelog] # Full integration test suite template (for feature/stage deployments) # Runs the complete pytest integration test suite against the deployed environment @@ -200,107 +201,91 @@ release: sys.exit(0) PYTEST_SCRIPT -# Reset stage template - runs from CI runner, uses CI variable for auth -# Calls the /api/v1/admin/factory-reset endpoint which handles DB and S3 cleanup -.reset_stage_template: &reset_stage_template - stage: deploy - image: deps.global.bsf.tools/docker/python:3.12-slim - timeout: 5m - retry: 1 +# Ephemeral test deployment in stage namespace (main branch only) +# Runs integration tests before promoting to long-running stage +deploy_test: + <<: *deploy_template + variables: + NAMESPACE: orch-stage-namespace + VALUES_FILE: helm/orchard/values-dev.yaml + BASE_URL: https://orchard-test.common.global.bsf.tools before_script: - - pip install --index-url "$PIP_INDEX_URL" httpx + - kubectl config use-context esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage + - *helm_setup script: + - echo "Deploying ephemeral test environment" + - cd $CI_PROJECT_DIR - | - python - <<'RESET_SCRIPT' - import httpx - import sys - import os - import time - - BASE_URL = os.environ.get("STAGE_URL", "") - ADMIN_USER = "admin" - ADMIN_PASS = os.environ.get("STAGE_ADMIN_PASSWORD", "") - MAX_RETRIES = 3 - RETRY_DELAY = 5 - - if not BASE_URL: - print("ERROR: STAGE_URL not set") - sys.exit(1) - - if not ADMIN_PASS: - print("ERROR: STAGE_ADMIN_PASSWORD not set") - sys.exit(1) - - print(f"=== Resetting stage environment at {BASE_URL} ===") - - def do_reset(): - with httpx.Client(base_url=BASE_URL, timeout=120.0) as client: - print("Logging in as admin...") - login_response = client.post( - "/api/v1/auth/login", - json={"username": ADMIN_USER, "password": ADMIN_PASS}, - ) - if login_response.status_code != 200: - raise Exception(f"Login failed: {login_response.status_code} - {login_response.text}") - print("Login successful") - - print("Calling factory reset endpoint...") - reset_response = client.post( - "/api/v1/admin/factory-reset", - headers={"X-Confirm-Reset": "yes-delete-all-data"}, - ) - - if reset_response.status_code == 200: - result = reset_response.json() - print("Factory reset successful!") - print(f" Database tables dropped: {result['results']['database_tables_dropped']}") - print(f" S3 objects deleted: {result['results']['s3_objects_deleted']}") - print(f" Database reinitialized: {result['results']['database_reinitialized']}") - print(f" Seeded: {result['results']['seeded']}") - return True - else: - raise Exception(f"Factory reset failed: {reset_response.status_code} - {reset_response.text}") - - for attempt in range(1, MAX_RETRIES + 1): - try: - print(f"Attempt {attempt}/{MAX_RETRIES}") - if do_reset(): - sys.exit(0) - except Exception as e: - print(f"Attempt {attempt} failed: {e}") - if attempt < MAX_RETRIES: - print(f"Retrying in {RETRY_DELAY} seconds...") - time.sleep(RETRY_DELAY) - else: - print("All retry attempts failed") - sys.exit(1) - RESET_SCRIPT + helm upgrade --install orchard-test ./helm/orchard \ + --namespace $NAMESPACE \ + -f $VALUES_FILE \ + --set image.tag=git.linux-amd64-$CI_COMMIT_SHA \ + 
--set orchard.auth.adminPassword=$STAGE_ADMIN_PASSWORD \ + --set ingress.hosts[0].host=orchard-test.common.global.bsf.tools \ + --set ingress.tls[0].hosts[0]=orchard-test.common.global.bsf.tools \ + --set ingress.tls[0].secretName=orchard-test-tls \ + --set minioIngress.host=minio-test.common.global.bsf.tools \ + --set minioIngress.tls.secretName=minio-test-tls \ + --wait \ + --atomic \ + --timeout 10m + - kubectl rollout status deployment/orchard-test-server -n $NAMESPACE --timeout=10m + - *verify_deployment + environment: + name: test + url: https://orchard-test.common.global.bsf.tools + on_stop: cleanup_test + kubernetes: + agent: esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage rules: - if: '$CI_COMMIT_BRANCH == "main"' when: on_success -# Reset stage BEFORE integration tests (ensure known state) -reset_stage_pre: - <<: *reset_stage_template - needs: [deploy_stage] - -# Integration tests for stage deployment -# Uses CI variable STAGE_ADMIN_PASSWORD (set in GitLab CI/CD settings) -integration_test_stage: - <<: *integration_test_template - needs: [reset_stage_pre] +# Cleanup ephemeral test deployment after integration tests +cleanup_test: + stage: deploy + needs: [integration_test_main] + image: deps.global.bsf.tools/registry-1.docker.io/alpine/k8s:1.29.12 + timeout: 5m variables: - ORCHARD_TEST_URL: $STAGE_URL + NAMESPACE: orch-stage-namespace + GIT_STRATEGY: none + before_script: + - kubectl config use-context esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage + script: + - echo "Cleaning up ephemeral test deployment orchard-test" + - helm uninstall orchard-test --namespace $NAMESPACE || true + environment: + name: test + action: stop + kubernetes: + agent: esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage + rules: + - if: '$CI_COMMIT_BRANCH == "main"' + when: on_success + allow_failure: true + +# Integration tests for ephemeral test deployment (main branch) +# Runs against orchard-test before promoting to long-running stage +integration_test_main: + <<: *integration_test_template + needs: [deploy_test] + variables: + ORCHARD_TEST_URL: https://orchard-test.common.global.bsf.tools ORCHARD_TEST_PASSWORD: $STAGE_ADMIN_PASSWORD rules: - if: '$CI_COMMIT_BRANCH == "main"' when: on_success -# Reset stage AFTER integration tests (clean slate for next run) -reset_stage: - <<: *reset_stage_template - needs: [integration_test_stage] - allow_failure: true # Don't fail pipeline if reset has issues +# Smoke test for long-running stage (after promotion) +smoke_test_stage: + <<: *smoke_test_template + needs: [deploy_stage] + variables: + ORCHARD_TEST_URL: $STAGE_URL + rules: + - if: '$CI_COMMIT_BRANCH == "main"' + when: on_success # Integration tests for feature deployment (full suite) # Uses DEV_ADMIN_PASSWORD CI variable (same as deploy_feature) @@ -412,9 +397,12 @@ frontend_tests: echo "Health check failed after 30 attempts" exit 1 -# Deploy to stage (main branch) +# Deploy to long-running stage (main branch, after ephemeral tests pass) deploy_stage: - <<: *deploy_template + stage: deploy + # Wait for ephemeral test to pass before promoting to long-running stage + needs: [cleanup_test] + image: deps.global.bsf.tools/registry-1.docker.io/alpine/k8s:1.29.12 variables: NAMESPACE: orch-stage-namespace VALUES_FILE: helm/orchard/values-stage.yaml @@ -423,7 +411,7 @@ deploy_stage: - kubectl config use-context esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage - *helm_setup script: - - echo "Deploying to stage environment" + - echo "Deploying to long-running stage 
environment" - cd $CI_PROJECT_DIR - | helm upgrade --install orchard-stage ./helm/orchard \ diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a1ce10..1fd9d65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,76 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Added frontend system projects visual distinction (#105) + - "Cache" badge for system projects in project list + - "System Cache" badge on project detail page + - Added `is_system` field to Project type +- Added frontend admin page for upstream sources and cache settings (#75) + - New `/admin/cache` page accessible from user menu (admin only) + - Upstream sources table with create/edit/delete/test connectivity + - Cache settings section with air-gap mode and auto-create system projects toggles + - Visual indicators for env-defined sources (locked, cannot be modified) + - Environment variable override badges when settings are overridden + - API client functions for all cache admin operations +- Added environment variable overrides for cache configuration (#74) + - `ORCHARD_CACHE_ALLOW_PUBLIC_INTERNET` - Override allow_public_internet (air-gap mode) + - `ORCHARD_CACHE_AUTO_CREATE_SYSTEM_PROJECTS` - Override auto_create_system_projects + - `ORCHARD_UPSTREAM__{NAME}__*` - Define upstream sources via env vars + - Env-defined sources appear in API with `source: "env"` marker + - Env-defined sources cannot be modified/deleted via API (400 error) + - Cache settings response includes `*_env_override` fields when overridden + - 7 unit tests for env var parsing and configuration +- Added Global Cache Settings Admin API (#73) + - `GET /api/v1/admin/cache-settings` - Retrieve current cache settings + - `PUT /api/v1/admin/cache-settings` - Update cache settings (partial updates) + - Admin-only access with audit logging + - Controls `allow_public_internet` (air-gap mode) and `auto_create_system_projects` + - 7 integration tests for settings management +- Added Upstream Sources Admin API for managing cache sources (#72) + - `GET /api/v1/admin/upstream-sources` - List sources with filtering + - `POST /api/v1/admin/upstream-sources` - Create source with auth configuration + - `GET /api/v1/admin/upstream-sources/{id}` - Get source details + - `PUT /api/v1/admin/upstream-sources/{id}` - Update source (partial updates) + - `DELETE /api/v1/admin/upstream-sources/{id}` - Delete source + - `POST /api/v1/admin/upstream-sources/{id}/test` - Test connectivity + - Admin-only access with audit logging + - Credentials never exposed (only has_password/has_headers flags) + - 13 integration tests for all CRUD operations +- Added system project restrictions and management (#71) + - System projects (`_npm`, `_pypi`, etc.) cannot be deleted (returns 403) + - System projects cannot be made private (must remain public) + - `GET /api/v1/system-projects` endpoint to list all system cache projects + - 5 integration tests for system project restrictions +- Added Cache API endpoint for fetching and storing artifacts from upstream URLs (#70) + - `POST /api/v1/cache` endpoint to cache artifacts from upstream registries + - URL parsing helpers to extract package name/version from npm, PyPI, Maven URLs + - Automatic system project creation (`_npm`, `_pypi`, `_maven`, etc.) 
+ - URL-to-artifact provenance tracking via `cached_urls` table + - Optional user project cross-referencing for custom organization + - Cache hit returns existing artifact without re-fetching + - Air-gap mode enforcement (blocks public URLs when disabled) + - Hash verification for downloaded artifacts + - 21 unit tests for URL parsing and cache endpoint +- Added HTTP client for fetching artifacts from upstream sources (#69) + - `UpstreamClient` class in `backend/app/upstream.py` with streaming downloads + - SHA256 hash computation while streaming (doesn't load large files into memory) + - Auth support: none, basic auth, bearer token, API key (custom headers) + - URL-to-source matching by URL prefix with priority ordering + - Configuration options: timeouts, retries with exponential backoff, redirect limits, max file size + - Air-gap mode enforcement via `allow_public_internet` setting + - Response header capture for provenance tracking + - Proper error handling with custom exception types + - Connection test method for upstream source validation + - 33 unit tests for client functionality +- Added upstream artifact caching schema for hermetic builds (#68) + - `upstream_sources` table for configuring upstream registries (npm, PyPI, Maven, etc.) + - `cache_settings` table for global settings including air-gap mode + - `cached_urls` table for URL-to-artifact provenance tracking + - `is_system` column on projects for system cache projects (_npm, _pypi, etc.) + - Support for multiple auth types: none, basic auth, bearer token, API key + - Fernet encryption for credentials using `ORCHARD_CACHE_ENCRYPTION_KEY` + - Default upstream sources seeded (npm-public, pypi-public, maven-central, docker-hub) - disabled by default + - Migration `010_upstream_caching.sql` - Added team-based multi-tenancy for organizing projects and collaboration (#88-#104) - Teams serve as organizational containers for projects - Users can belong to multiple teams with different roles (owner, admin, member) diff --git a/backend/app/cache.py b/backend/app/cache.py new file mode 100644 index 0000000..b0f56df --- /dev/null +++ b/backend/app/cache.py @@ -0,0 +1,316 @@ +""" +Cache service for upstream artifact caching. + +Provides URL parsing, system project management, and caching logic +for the upstream caching feature. +""" + +import logging +import re +from dataclasses import dataclass +from typing import Optional +from urllib.parse import urlparse, unquote + +logger = logging.getLogger(__name__) + + +# System project names for each source type +SYSTEM_PROJECT_NAMES = { + "npm": "_npm", + "pypi": "_pypi", + "maven": "_maven", + "docker": "_docker", + "helm": "_helm", + "nuget": "_nuget", + "deb": "_deb", + "rpm": "_rpm", + "generic": "_generic", +} + +# System project descriptions +SYSTEM_PROJECT_DESCRIPTIONS = { + "npm": "System cache for npm packages", + "pypi": "System cache for PyPI packages", + "maven": "System cache for Maven packages", + "docker": "System cache for Docker images", + "helm": "System cache for Helm charts", + "nuget": "System cache for NuGet packages", + "deb": "System cache for Debian packages", + "rpm": "System cache for RPM packages", + "generic": "System cache for generic artifacts", +} + + +@dataclass +class ParsedUrl: + """Parsed URL information for caching.""" + + package_name: str + version: Optional[str] = None + filename: Optional[str] = None + + +def parse_npm_url(url: str) -> Optional[ParsedUrl]: + """ + Parse npm registry URL to extract package name and version. 
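+
+    For illustration, the expected result for a typical URL (a doctest-style
+    sketch; the fields follow from the patterns matched below):
+
+        >>> parse_npm_url("https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz")
+        ParsedUrl(package_name='lodash', version='4.17.21', filename='lodash-4.17.21.tgz')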
+
+    Formats:
+    - https://registry.npmjs.org/{package}/-/{package}-{version}.tgz
+    - https://registry.npmjs.org/@{scope}/{package}/-/{package}-{version}.tgz
+
+    Examples:
+    - https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz
+    - https://registry.npmjs.org/@types/node/-/node-18.0.0.tgz
+    """
+    parsed = urlparse(url)
+    path = unquote(parsed.path)
+
+    # Pattern for scoped packages: /@scope/package/-/package-version.tgz
+    scoped_pattern = r"^/@([^/]+)/([^/]+)/-/\2-(.+)\.tgz$"
+    match = re.match(scoped_pattern, path)
+    if match:
+        scope, name, version = match.groups()
+        return ParsedUrl(
+            package_name=f"@{scope}/{name}",
+            version=version,
+            filename=f"{name}-{version}.tgz",
+        )
+
+    # Pattern for unscoped packages: /package/-/package-version.tgz
+    unscoped_pattern = r"^/([^/@]+)/-/\1-(.+)\.tgz$"
+    match = re.match(unscoped_pattern, path)
+    if match:
+        name, version = match.groups()
+        return ParsedUrl(
+            package_name=name,
+            version=version,
+            filename=f"{name}-{version}.tgz",
+        )
+
+    return None
+
+
+def parse_pypi_url(url: str) -> Optional[ParsedUrl]:
+    """
+    Parse PyPI URL to extract package name and version.
+
+    Formats:
+    - https://files.pythonhosted.org/packages/.../package-version.tar.gz
+    - https://files.pythonhosted.org/packages/.../package-version-py3-none-any.whl
+    - https://pypi.org/packages/.../package-version.tar.gz
+
+    Examples:
+    - https://files.pythonhosted.org/packages/ab/cd/requests-2.28.0.tar.gz
+    - https://files.pythonhosted.org/packages/ab/cd/requests-2.28.0-py3-none-any.whl
+    """
+    parsed = urlparse(url)
+    path = unquote(parsed.path)
+
+    # Get the filename from the path
+    filename = path.split("/")[-1]
+    if not filename:
+        return None
+
+    # Handle wheel files: package-version-py3-none-any.whl
+    wheel_pattern = r"^([a-zA-Z0-9_-]+)-(\d+[^-]*)-.*\.whl$"
+    match = re.match(wheel_pattern, filename)
+    if match:
+        name, version = match.groups()
+        # Wheel filenames use underscores in place of hyphens (PEP 427);
+        # normalize back to the hyphenated, lowercase form PyPI uses
+        name = name.replace("_", "-").lower()
+        return ParsedUrl(
+            package_name=name,
+            version=version,
+            filename=filename,
+        )
+
+    # Handle source distributions: package-version.tar.gz or package-version.zip
+    sdist_pattern = r"^([a-zA-Z0-9_-]+)-(\d+(?:\.\d+)*(?:[a-zA-Z0-9_.+-]*)?)(?:\.tar\.gz|\.zip|\.tar\.bz2)$"
+    match = re.match(sdist_pattern, filename)
+    if match:
+        name, version = match.groups()
+        name = name.replace("_", "-").lower()
+        return ParsedUrl(
+            package_name=name,
+            version=version,
+            filename=filename,
+        )
+
+    return None
+
+
+def parse_maven_url(url: str) -> Optional[ParsedUrl]:
+    """
+    Parse Maven repository URL to extract artifact info. 
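+
+    For illustration, a doctest-style sketch (the output follows from the
+    path-splitting logic below):
+
+        >>> parse_maven_url("https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.jar")
+        ParsedUrl(package_name='org.apache.commons:commons-lang3', version='3.12.0', filename='commons-lang3-3.12.0.jar')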
+ + Format: + - https://repo1.maven.org/maven2/{group}/{artifact}/{version}/{artifact}-{version}.jar + + Examples: + - https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.jar + - https://repo1.maven.org/maven2/com/google/guava/guava/31.1-jre/guava-31.1-jre.jar + """ + parsed = urlparse(url) + path = unquote(parsed.path) + + # Find /maven2/ or similar repository path + maven2_idx = path.find("/maven2/") + if maven2_idx >= 0: + path = path[maven2_idx + 8:] # Remove /maven2/ + elif path.startswith("/"): + path = path[1:] + + parts = path.split("/") + if len(parts) < 4: + return None + + # Last part is filename, before that is version, before that is artifact + filename = parts[-1] + version = parts[-2] + artifact = parts[-3] + group = ".".join(parts[:-3]) + + # Verify filename matches expected pattern + if not filename.startswith(f"{artifact}-{version}"): + return None + + return ParsedUrl( + package_name=f"{group}:{artifact}", + version=version, + filename=filename, + ) + + +def parse_docker_url(url: str) -> Optional[ParsedUrl]: + """ + Parse Docker registry URL to extract image info. + + Note: Docker registries are more complex (manifests, blobs, etc.) + This handles basic blob/manifest URLs. + + Examples: + - https://registry-1.docker.io/v2/library/nginx/blobs/sha256:abc123 + - https://registry-1.docker.io/v2/myuser/myimage/manifests/latest + """ + parsed = urlparse(url) + path = unquote(parsed.path) + + # Pattern: /v2/{namespace}/{image}/blobs/{digest} or /manifests/{tag} + pattern = r"^/v2/([^/]+(?:/[^/]+)?)/([^/]+)/(blobs|manifests)/(.+)$" + match = re.match(pattern, path) + if match: + namespace, image, artifact_type, reference = match.groups() + if namespace == "library": + package_name = image + else: + package_name = f"{namespace}/{image}" + + # For manifests, the reference is the tag + version = reference if artifact_type == "manifests" else None + + return ParsedUrl( + package_name=package_name, + version=version, + filename=f"{image}-{reference}" if version else reference, + ) + + return None + + +def parse_generic_url(url: str) -> ParsedUrl: + """ + Parse a generic URL to extract filename. + + Attempts to extract meaningful package name and version from filename. 
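+
+    For illustration, a doctest-style sketch (the split follows from the
+    extension and version patterns below):
+
+        >>> parse_generic_url("https://example.com/downloads/myapp-1.2.3.tar.gz")
+        ParsedUrl(package_name='myapp', version='1.2.3', filename='myapp-1.2.3.tar.gz')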
+ + Examples: + - https://example.com/downloads/myapp-1.2.3.tar.gz + - https://github.com/user/repo/releases/download/v1.0/release.zip + """ + parsed = urlparse(url) + path = unquote(parsed.path) + filename = path.split("/")[-1] or "artifact" + + # List of known compound and simple extensions + known_extensions = [ + ".tar.gz", ".tar.bz2", ".tar.xz", + ".zip", ".tgz", ".gz", ".jar", ".war", ".deb", ".rpm" + ] + + # Strip extension from filename first + base_name = filename + matched_ext = None + for ext in known_extensions: + if filename.endswith(ext): + base_name = filename[:-len(ext)] + matched_ext = ext + break + + if matched_ext is None: + # Unknown extension, return filename as package name + return ParsedUrl( + package_name=filename, + version=None, + filename=filename, + ) + + # Try to extract version from base_name + # Pattern: name-version or name_version + # Version starts with digit(s) and can include dots, dashes, and alphanumeric suffixes + version_pattern = r"^(.+?)[-_](v?\d+(?:\.\d+)*(?:[-_][a-zA-Z0-9]+)?)$" + match = re.match(version_pattern, base_name) + if match: + name, version = match.groups() + return ParsedUrl( + package_name=name, + version=version, + filename=filename, + ) + + # No version found, use base_name as package name + return ParsedUrl( + package_name=base_name, + version=None, + filename=filename, + ) + + +def parse_url(url: str, source_type: str) -> ParsedUrl: + """ + Parse URL to extract package name and version based on source type. + + Args: + url: The URL to parse. + source_type: The source type (npm, pypi, maven, docker, etc.) + + Returns: + ParsedUrl with extracted information. + """ + parsed = None + + if source_type == "npm": + parsed = parse_npm_url(url) + elif source_type == "pypi": + parsed = parse_pypi_url(url) + elif source_type == "maven": + parsed = parse_maven_url(url) + elif source_type == "docker": + parsed = parse_docker_url(url) + + # Fall back to generic parsing if type-specific parsing fails + if parsed is None: + parsed = parse_generic_url(url) + + return parsed + + +def get_system_project_name(source_type: str) -> str: + """Get the system project name for a source type.""" + return SYSTEM_PROJECT_NAMES.get(source_type, "_generic") + + +def get_system_project_description(source_type: str) -> str: + """Get the system project description for a source type.""" + return SYSTEM_PROJECT_DESCRIPTIONS.get( + source_type, "System cache for artifacts" + ) diff --git a/backend/app/config.py b/backend/app/config.py index a691767..e248b37 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -1,5 +1,8 @@ from pydantic_settings import BaseSettings from functools import lru_cache +from typing import Optional +import os +import re class Settings(BaseSettings): @@ -56,6 +59,12 @@ class Settings(BaseSettings): # Initial admin user settings admin_password: str = "" # Initial admin password (if empty, uses 'changeme123') + # Cache settings + cache_encryption_key: str = "" # Fernet key for encrypting upstream credentials (auto-generated if empty) + # Global cache settings overrides (None = use DB value, True/False = override DB) + cache_allow_public_internet: Optional[bool] = None # Override allow_public_internet (air-gap mode) + cache_auto_create_system_projects: Optional[bool] = None # Override auto_create_system_projects + # JWT Authentication settings (optional, for external identity providers) jwt_enabled: bool = False # Enable JWT token validation jwt_secret: str = "" # Secret key for HS256, or leave empty for RS256 with JWKS @@ -88,3 
+97,113 @@ class Settings(BaseSettings):
 @lru_cache()
 def get_settings() -> Settings:
     return Settings()
+
+
+class EnvUpstreamSource:
+    """Represents an upstream source defined via environment variables."""
+
+    def __init__(
+        self,
+        name: str,
+        url: str,
+        source_type: str = "generic",
+        enabled: bool = True,
+        is_public: bool = True,
+        auth_type: str = "none",
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+        priority: int = 100,
+    ):
+        self.name = name
+        self.url = url
+        self.source_type = source_type
+        self.enabled = enabled
+        self.is_public = is_public
+        self.auth_type = auth_type
+        self.username = username
+        self.password = password
+        self.priority = priority
+        self.source = "env"  # Mark as env-defined
+
+
+def _parse_bool(val: Optional[str], default: bool) -> bool:
+    """Parse a boolean environment variable value."""
+    if val is None:
+        return default
+    return val.lower() in ("true", "1", "yes", "on")
+
+
+def _parse_int(val: Optional[str], default: int) -> int:
+    """Parse an integer environment variable value."""
+    if val is None:
+        return default
+    try:
+        return int(val)
+    except ValueError:
+        return default
+
+
+def parse_upstream_sources_from_env() -> list[EnvUpstreamSource]:
+    """
+    Parse upstream sources from environment variables.
+
+    Uses double underscore (__) as the separator so source names may contain
+    single underscores. Pattern: ORCHARD_UPSTREAM__{NAME}__{FIELD}
+
+    Example:
+        ORCHARD_UPSTREAM__NPM_PRIVATE__URL=https://npm.corp.com
+        ORCHARD_UPSTREAM__NPM_PRIVATE__TYPE=npm
+        ORCHARD_UPSTREAM__NPM_PRIVATE__ENABLED=true
+        ORCHARD_UPSTREAM__NPM_PRIVATE__AUTH_TYPE=basic
+        ORCHARD_UPSTREAM__NPM_PRIVATE__USERNAME=reader
+        ORCHARD_UPSTREAM__NPM_PRIVATE__PASSWORD=secret
+
+    Returns:
+        List of EnvUpstreamSource objects parsed from environment variables.
+    """
+    # Pattern: ORCHARD_UPSTREAM__{NAME}__{FIELD}
+    pattern = re.compile(r"^ORCHARD_UPSTREAM__([A-Z0-9_]+)__([A-Z_]+)$", re.IGNORECASE)
+
+    # Collect all env vars matching the pattern, grouped by source name
+    sources_data: dict[str, dict[str, str]] = {}
+
+    for key, value in os.environ.items():
+        match = pattern.match(key)
+        if match:
+            source_name = match.group(1).lower()  # Normalize to lowercase
+            field = match.group(2).upper()
+            if source_name not in sources_data:
+                sources_data[source_name] = {}
+            sources_data[source_name][field] = value
+
+    # Build source objects from collected data
+    sources: list[EnvUpstreamSource] = []
+
+    for name, data in sources_data.items():
+        # URL is required
+        url = data.get("URL")
+        if not url:
+            continue  # Skip sources without URL
+
+        source = EnvUpstreamSource(
+            name=name.replace("_", "-"),  # Convert underscores to hyphens for readability
+            url=url,
+            source_type=data.get("TYPE", "generic").lower(),
+            enabled=_parse_bool(data.get("ENABLED"), True),
+            is_public=_parse_bool(data.get("IS_PUBLIC"), True),
+            auth_type=data.get("AUTH_TYPE", "none").lower(),
+            username=data.get("USERNAME"),
+            password=data.get("PASSWORD"),
+            priority=_parse_int(data.get("PRIORITY"), 100),
+        )
+        sources.append(source)
+
+    return sources
+
+
+@lru_cache()
+def get_env_upstream_sources() -> tuple[EnvUpstreamSource, ...]:
+    """
+    Get cached list of upstream sources from environment variables.
+
+    Returns a tuple so callers receive an immutable snapshot of the sources.
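+
+    Note (illustrative): because the result is memoized with lru_cache, tests
+    that mutate os.environ should call get_env_upstream_sources.cache_clear()
+    to pick up the new values.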
+ """ + return tuple(parse_upstream_sources_from_env()) diff --git a/backend/app/database.py b/backend/app/database.py index ef82a90..8f15b34 100644 --- a/backend/app/database.py +++ b/backend/app/database.py @@ -9,6 +9,7 @@ import hashlib from .config import get_settings from .models import Base +from .purge_seed_data import should_purge_seed_data, purge_seed_data settings = get_settings() logger = logging.getLogger(__name__) @@ -80,6 +81,14 @@ def init_db(): # Run migrations for schema updates _run_migrations() + # Purge seed data if requested (for transitioning to production-like environment) + if should_purge_seed_data(): + db = SessionLocal() + try: + purge_seed_data(db) + finally: + db.close() + def _ensure_migrations_table(conn) -> None: """Create the migrations tracking table if it doesn't exist.""" @@ -429,6 +438,99 @@ def _run_migrations(): END $$; """, ), + Migration( + name="016_add_is_system_to_projects", + sql=""" + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = 'projects' AND column_name = 'is_system' + ) THEN + ALTER TABLE projects ADD COLUMN is_system BOOLEAN NOT NULL DEFAULT FALSE; + CREATE INDEX IF NOT EXISTS idx_projects_is_system ON projects(is_system); + END IF; + END $$; + """, + ), + Migration( + name="017_create_upstream_sources", + sql=""" + CREATE TABLE IF NOT EXISTS upstream_sources ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + name VARCHAR(255) NOT NULL UNIQUE, + source_type VARCHAR(50) NOT NULL DEFAULT 'generic', + url VARCHAR(2048) NOT NULL, + enabled BOOLEAN NOT NULL DEFAULT FALSE, + is_public BOOLEAN NOT NULL DEFAULT TRUE, + auth_type VARCHAR(20) NOT NULL DEFAULT 'none', + username VARCHAR(255), + password_encrypted BYTEA, + headers_encrypted BYTEA, + priority INTEGER NOT NULL DEFAULT 100, + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + CONSTRAINT check_source_type CHECK ( + source_type IN ('npm', 'pypi', 'maven', 'docker', 'helm', 'nuget', 'deb', 'rpm', 'generic') + ), + CONSTRAINT check_auth_type CHECK ( + auth_type IN ('none', 'basic', 'bearer', 'api_key') + ), + CONSTRAINT check_priority_positive CHECK (priority > 0) + ); + CREATE INDEX IF NOT EXISTS idx_upstream_sources_enabled ON upstream_sources(enabled); + CREATE INDEX IF NOT EXISTS idx_upstream_sources_source_type ON upstream_sources(source_type); + CREATE INDEX IF NOT EXISTS idx_upstream_sources_is_public ON upstream_sources(is_public); + CREATE INDEX IF NOT EXISTS idx_upstream_sources_priority ON upstream_sources(priority); + """, + ), + Migration( + name="018_create_cache_settings", + sql=""" + CREATE TABLE IF NOT EXISTS cache_settings ( + id INTEGER PRIMARY KEY DEFAULT 1, + allow_public_internet BOOLEAN NOT NULL DEFAULT TRUE, + auto_create_system_projects BOOLEAN NOT NULL DEFAULT TRUE, + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + CONSTRAINT check_cache_settings_singleton CHECK (id = 1) + ); + INSERT INTO cache_settings (id, allow_public_internet, auto_create_system_projects) + VALUES (1, TRUE, TRUE) + ON CONFLICT (id) DO NOTHING; + """, + ), + Migration( + name="019_create_cached_urls", + sql=""" + CREATE TABLE IF NOT EXISTS cached_urls ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + url VARCHAR(4096) NOT NULL, + url_hash VARCHAR(64) NOT NULL UNIQUE, + artifact_id VARCHAR(64) NOT NULL REFERENCES artifacts(id), + source_id UUID REFERENCES upstream_sources(id) ON DELETE SET NULL, + fetched_at TIMESTAMP WITH TIME 
ZONE NOT NULL DEFAULT NOW(), + response_headers JSONB DEFAULT '{}', + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() + ); + CREATE INDEX IF NOT EXISTS idx_cached_urls_url_hash ON cached_urls(url_hash); + CREATE INDEX IF NOT EXISTS idx_cached_urls_artifact_id ON cached_urls(artifact_id); + CREATE INDEX IF NOT EXISTS idx_cached_urls_source_id ON cached_urls(source_id); + CREATE INDEX IF NOT EXISTS idx_cached_urls_fetched_at ON cached_urls(fetched_at); + """, + ), + Migration( + name="020_seed_default_upstream_sources", + sql=""" + INSERT INTO upstream_sources (id, name, source_type, url, enabled, is_public, auth_type, priority) + VALUES + (gen_random_uuid(), 'npm-public', 'npm', 'https://registry.npmjs.org', FALSE, TRUE, 'none', 100), + (gen_random_uuid(), 'pypi-public', 'pypi', 'https://pypi.org/simple', FALSE, TRUE, 'none', 100), + (gen_random_uuid(), 'maven-central', 'maven', 'https://repo1.maven.org/maven2', FALSE, TRUE, 'none', 100), + (gen_random_uuid(), 'docker-hub', 'docker', 'https://registry-1.docker.io', FALSE, TRUE, 'none', 100) + ON CONFLICT (name) DO NOTHING; + """, + ), ] with engine.connect() as conn: diff --git a/backend/app/encryption.py b/backend/app/encryption.py new file mode 100644 index 0000000..155b0ce --- /dev/null +++ b/backend/app/encryption.py @@ -0,0 +1,160 @@ +""" +Encryption utilities for sensitive data storage. + +Uses Fernet symmetric encryption for credentials like upstream passwords. +The encryption key is sourced from ORCHARD_CACHE_ENCRYPTION_KEY environment variable. +If not set, a random key is generated on startup (with a warning). +""" + +import base64 +import logging +import os +import secrets +from functools import lru_cache +from typing import Optional + +from cryptography.fernet import Fernet, InvalidToken + +logger = logging.getLogger(__name__) + +# Module-level storage for auto-generated key (only used if env var not set) +_generated_key: Optional[bytes] = None + + +def _get_key_from_env() -> Optional[bytes]: + """Get encryption key from environment variable.""" + key_str = os.environ.get("ORCHARD_CACHE_ENCRYPTION_KEY", "") + if not key_str: + return None + + # Support both raw base64 and url-safe base64 formats + try: + # Try to decode as-is (Fernet keys are url-safe base64) + key_bytes = key_str.encode("utf-8") + # Validate it's a valid Fernet key by trying to create a Fernet instance + Fernet(key_bytes) + return key_bytes + except Exception: + pass + + # Try base64 decoding if it's a raw 32-byte key encoded as base64 + try: + decoded = base64.urlsafe_b64decode(key_str) + if len(decoded) == 32: + # Re-encode as url-safe base64 for Fernet + key_bytes = base64.urlsafe_b64encode(decoded) + Fernet(key_bytes) + return key_bytes + except Exception: + pass + + logger.error( + "ORCHARD_CACHE_ENCRYPTION_KEY is set but invalid. " + "Must be a valid Fernet key (32 bytes, url-safe base64 encoded). " + "Generate one with: python -c \"from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())\"" + ) + return None + + +def get_encryption_key() -> bytes: + """ + Get the Fernet encryption key. + + Returns the key from ORCHARD_CACHE_ENCRYPTION_KEY if set and valid, + otherwise generates a random key (with a warning logged). + + The generated key is cached for the lifetime of the process. 
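+
+    For illustration, an operator would typically provision the key once and
+    export it before startup (a sketch; the generation command mirrors the
+    log messages below):
+
+        export ORCHARD_CACHE_ENCRYPTION_KEY="$(python -c 'from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())')"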
+ """ + global _generated_key + + # Try to get from environment + env_key = _get_key_from_env() + if env_key: + return env_key + + # Generate a new key if needed + if _generated_key is None: + _generated_key = Fernet.generate_key() + logger.warning( + "ORCHARD_CACHE_ENCRYPTION_KEY not set - using auto-generated key. " + "Encrypted credentials will be lost on restart! " + "Set ORCHARD_CACHE_ENCRYPTION_KEY for persistent encryption. " + "Generate a key with: python -c \"from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())\"" + ) + + return _generated_key + + +@lru_cache(maxsize=1) +def _get_fernet() -> Fernet: + """Get a cached Fernet instance.""" + return Fernet(get_encryption_key()) + + +def encrypt_value(plaintext: str) -> bytes: + """ + Encrypt a string value using Fernet. + + Args: + plaintext: The string to encrypt + + Returns: + Encrypted bytes (includes Fernet token with timestamp) + """ + if not plaintext: + raise ValueError("Cannot encrypt empty value") + + fernet = _get_fernet() + return fernet.encrypt(plaintext.encode("utf-8")) + + +def decrypt_value(ciphertext: bytes) -> str: + """ + Decrypt a Fernet-encrypted value. + + Args: + ciphertext: The encrypted bytes + + Returns: + Decrypted string + + Raises: + InvalidToken: If decryption fails (wrong key or corrupted data) + """ + if not ciphertext: + raise ValueError("Cannot decrypt empty value") + + fernet = _get_fernet() + return fernet.decrypt(ciphertext).decode("utf-8") + + +def can_decrypt(ciphertext: bytes) -> bool: + """ + Check if a value can be decrypted with the current key. + + Useful for checking if credentials are still valid after key rotation. + + Args: + ciphertext: The encrypted bytes + + Returns: + True if decryption succeeds, False otherwise + """ + if not ciphertext: + return False + + try: + decrypt_value(ciphertext) + return True + except (InvalidToken, ValueError): + return False + + +def generate_key() -> str: + """ + Generate a new Fernet encryption key. 
+ + Returns: + A valid Fernet key as a string (url-safe base64 encoded) + """ + return Fernet.generate_key().decode("utf-8") diff --git a/backend/app/models.py b/backend/app/models.py index 67c6a3c..aa049ad 100644 --- a/backend/app/models.py +++ b/backend/app/models.py @@ -12,6 +12,7 @@ from sqlalchemy import ( Index, JSON, ARRAY, + LargeBinary, ) from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.orm import relationship, declarative_base @@ -27,6 +28,7 @@ class Project(Base): name = Column(String(255), unique=True, nullable=False) description = Column(Text) is_public = Column(Boolean, default=True) + is_system = Column(Boolean, default=False, nullable=False) created_at = Column(DateTime(timezone=True), default=datetime.utcnow) updated_at = Column( DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow @@ -46,6 +48,7 @@ class Project(Base): Index("idx_projects_name", "name"), Index("idx_projects_created_by", "created_by"), Index("idx_projects_team_id", "team_id"), + Index("idx_projects_is_system", "is_system"), ) @@ -637,3 +640,169 @@ class TeamMembership(Base): name="check_team_role", ), ) + + +# ============================================================================= +# Upstream Caching Models +# ============================================================================= + +# Valid source types for upstream registries +SOURCE_TYPES = ["npm", "pypi", "maven", "docker", "helm", "nuget", "deb", "rpm", "generic"] + +# Valid authentication types +AUTH_TYPES = ["none", "basic", "bearer", "api_key"] + + +class UpstreamSource(Base): + """Configuration for an upstream artifact registry. + + Stores connection details and authentication for upstream registries + like npm, PyPI, Maven Central, or private Artifactory instances. 
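+
+    For illustration, a usage sketch (field values are hypothetical; the
+    credential helpers are defined below):
+
+        src = UpstreamSource(name="npm-corp", source_type="npm",
+                             url="https://npm.corp.example", auth_type="basic",
+                             username="reader")
+        src.set_password("s3cret")              # stored Fernet-encrypted
+        assert src.get_password() == "s3cret"   # decrypts with the same key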
+ """ + + __tablename__ = "upstream_sources" + + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + name = Column(String(255), unique=True, nullable=False) + source_type = Column(String(50), default="generic", nullable=False) + url = Column(String(2048), nullable=False) + enabled = Column(Boolean, default=False, nullable=False) + is_public = Column(Boolean, default=True, nullable=False) + auth_type = Column(String(20), default="none", nullable=False) + username = Column(String(255)) + password_encrypted = Column(LargeBinary) + headers_encrypted = Column(LargeBinary) + priority = Column(Integer, default=100, nullable=False) + created_at = Column(DateTime(timezone=True), default=datetime.utcnow) + updated_at = Column( + DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow + ) + + # Relationships + cached_urls = relationship("CachedUrl", back_populates="source") + + __table_args__ = ( + Index("idx_upstream_sources_enabled", "enabled"), + Index("idx_upstream_sources_source_type", "source_type"), + Index("idx_upstream_sources_is_public", "is_public"), + Index("idx_upstream_sources_priority", "priority"), + CheckConstraint( + "source_type IN ('npm', 'pypi', 'maven', 'docker', 'helm', 'nuget', 'deb', 'rpm', 'generic')", + name="check_source_type", + ), + CheckConstraint( + "auth_type IN ('none', 'basic', 'bearer', 'api_key')", + name="check_auth_type", + ), + CheckConstraint("priority > 0", name="check_priority_positive"), + ) + + def set_password(self, password: str) -> None: + """Encrypt and store a password/token.""" + from .encryption import encrypt_value + + if password: + self.password_encrypted = encrypt_value(password) + else: + self.password_encrypted = None + + def get_password(self) -> str | None: + """Decrypt and return the stored password/token.""" + from .encryption import decrypt_value + + if self.password_encrypted: + try: + return decrypt_value(self.password_encrypted) + except Exception: + return None + return None + + def has_password(self) -> bool: + """Check if a password/token is stored.""" + return self.password_encrypted is not None + + def set_headers(self, headers: dict) -> None: + """Encrypt and store custom headers as JSON.""" + from .encryption import encrypt_value + import json + + if headers: + self.headers_encrypted = encrypt_value(json.dumps(headers)) + else: + self.headers_encrypted = None + + def get_headers(self) -> dict | None: + """Decrypt and return custom headers.""" + from .encryption import decrypt_value + import json + + if self.headers_encrypted: + try: + return json.loads(decrypt_value(self.headers_encrypted)) + except Exception: + return None + return None + + +class CacheSettings(Base): + """Global cache settings (singleton table). + + Controls behavior of the upstream caching system including air-gap mode. + """ + + __tablename__ = "cache_settings" + + id = Column(Integer, primary_key=True, default=1) + allow_public_internet = Column(Boolean, default=True, nullable=False) + auto_create_system_projects = Column(Boolean, default=True, nullable=False) + created_at = Column(DateTime(timezone=True), default=datetime.utcnow) + updated_at = Column( + DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow + ) + + __table_args__ = ( + CheckConstraint("id = 1", name="check_cache_settings_singleton"), + ) + + +class CachedUrl(Base): + """Tracks URL to artifact mappings for provenance. + + Records which URLs have been cached and maps them to their stored artifacts. 
+ Enables "is this URL already cached?" lookups and audit trails. + """ + + __tablename__ = "cached_urls" + + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + url = Column(String(4096), nullable=False) + url_hash = Column(String(64), unique=True, nullable=False) + artifact_id = Column( + String(64), ForeignKey("artifacts.id"), nullable=False + ) + source_id = Column( + UUID(as_uuid=True), + ForeignKey("upstream_sources.id", ondelete="SET NULL"), + ) + fetched_at = Column(DateTime(timezone=True), default=datetime.utcnow, nullable=False) + response_headers = Column(JSON, default=dict) + created_at = Column(DateTime(timezone=True), default=datetime.utcnow) + + # Relationships + artifact = relationship("Artifact") + source = relationship("UpstreamSource", back_populates="cached_urls") + + __table_args__ = ( + Index("idx_cached_urls_url_hash", "url_hash"), + Index("idx_cached_urls_artifact_id", "artifact_id"), + Index("idx_cached_urls_source_id", "source_id"), + Index("idx_cached_urls_fetched_at", "fetched_at"), + ) + + @staticmethod + def compute_url_hash(url: str) -> str: + """Compute SHA256 hash of a URL for fast lookups.""" + import hashlib + return hashlib.sha256(url.encode("utf-8")).hexdigest() + + diff --git a/backend/app/purge_seed_data.py b/backend/app/purge_seed_data.py new file mode 100644 index 0000000..41e5c0c --- /dev/null +++ b/backend/app/purge_seed_data.py @@ -0,0 +1,211 @@ +""" +Purge seed/demo data from the database. + +This is used when transitioning an environment from dev/test to production-like. +Triggered by setting ORCHARD_PURGE_SEED_DATA=true environment variable. +""" +import logging +import os +from sqlalchemy.orm import Session + +from .models import ( + Project, + Package, + Artifact, + Tag, + Upload, + PackageVersion, + ArtifactDependency, + Team, + TeamMembership, + User, + AccessPermission, +) +from .storage import get_storage + +logger = logging.getLogger(__name__) + +# Seed data identifiers (from seed.py) +SEED_PROJECT_NAMES = [ + "frontend-libs", + "backend-services", + "mobile-apps", + "internal-tools", +] + +SEED_TEAM_SLUG = "demo-team" + +SEED_USERNAMES = [ + "alice", + "bob", + "charlie", + "diana", + "eve", + "frank", +] + + +def should_purge_seed_data() -> bool: + """Check if seed data should be purged based on environment variable.""" + return os.environ.get("ORCHARD_PURGE_SEED_DATA", "").lower() == "true" + + +def purge_seed_data(db: Session) -> dict: + """ + Purge all seed/demo data from the database. + + Returns a dict with counts of deleted items. 
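+
+    For illustration, a sketch of the returned shape (counts are hypothetical;
+    the full key set is initialized at the top of the function, and the seed
+    constants above imply 4 projects, 6 users, and 1 team):
+
+        {"projects_deleted": 4, "users_deleted": 6, "teams_deleted": 1, ...}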
+ """ + logger.warning("PURGING SEED DATA - This will delete demo projects, users, and teams") + + results = { + "dependencies_deleted": 0, + "tags_deleted": 0, + "versions_deleted": 0, + "uploads_deleted": 0, + "artifacts_deleted": 0, + "packages_deleted": 0, + "projects_deleted": 0, + "permissions_deleted": 0, + "team_memberships_deleted": 0, + "users_deleted": 0, + "teams_deleted": 0, + "s3_objects_deleted": 0, + } + + storage = get_storage() + + # Find seed projects + seed_projects = db.query(Project).filter(Project.name.in_(SEED_PROJECT_NAMES)).all() + seed_project_ids = [p.id for p in seed_projects] + + if not seed_projects: + logger.info("No seed projects found, nothing to purge") + return results + + logger.info(f"Found {len(seed_projects)} seed projects to purge") + + # Find packages in seed projects + seed_packages = db.query(Package).filter(Package.project_id.in_(seed_project_ids)).all() + seed_package_ids = [p.id for p in seed_packages] + + # Find artifacts in seed packages (via uploads) + seed_uploads = db.query(Upload).filter(Upload.package_id.in_(seed_package_ids)).all() + seed_artifact_ids = list(set(u.artifact_id for u in seed_uploads)) + + # Delete in order (respecting foreign keys) + + # 1. Delete artifact dependencies + if seed_artifact_ids: + count = db.query(ArtifactDependency).filter( + ArtifactDependency.artifact_id.in_(seed_artifact_ids) + ).delete(synchronize_session=False) + results["dependencies_deleted"] = count + logger.info(f"Deleted {count} artifact dependencies") + + # 2. Delete tags + if seed_package_ids: + count = db.query(Tag).filter(Tag.package_id.in_(seed_package_ids)).delete( + synchronize_session=False + ) + results["tags_deleted"] = count + logger.info(f"Deleted {count} tags") + + # 3. Delete package versions + if seed_package_ids: + count = db.query(PackageVersion).filter( + PackageVersion.package_id.in_(seed_package_ids) + ).delete(synchronize_session=False) + results["versions_deleted"] = count + logger.info(f"Deleted {count} package versions") + + # 4. Delete uploads + if seed_package_ids: + count = db.query(Upload).filter(Upload.package_id.in_(seed_package_ids)).delete( + synchronize_session=False + ) + results["uploads_deleted"] = count + logger.info(f"Deleted {count} uploads") + + # 5. Delete S3 objects for seed artifacts + if seed_artifact_ids: + seed_artifacts = db.query(Artifact).filter(Artifact.id.in_(seed_artifact_ids)).all() + for artifact in seed_artifacts: + if artifact.s3_key: + try: + storage.client.delete_object(Bucket=storage.bucket, Key=artifact.s3_key) + results["s3_objects_deleted"] += 1 + except Exception as e: + logger.warning(f"Failed to delete S3 object {artifact.s3_key}: {e}") + logger.info(f"Deleted {results['s3_objects_deleted']} S3 objects") + + # 6. Delete artifacts (only those with ref_count that would be 0 after our deletions) + # Since we deleted all tags/versions pointing to these artifacts, we can delete them + if seed_artifact_ids: + count = db.query(Artifact).filter(Artifact.id.in_(seed_artifact_ids)).delete( + synchronize_session=False + ) + results["artifacts_deleted"] = count + logger.info(f"Deleted {count} artifacts") + + # 7. Delete packages + if seed_package_ids: + count = db.query(Package).filter(Package.id.in_(seed_package_ids)).delete( + synchronize_session=False + ) + results["packages_deleted"] = count + logger.info(f"Deleted {count} packages") + + # 8. 
Delete access permissions for seed projects + if seed_project_ids: + count = db.query(AccessPermission).filter( + AccessPermission.project_id.in_(seed_project_ids) + ).delete(synchronize_session=False) + results["permissions_deleted"] = count + logger.info(f"Deleted {count} access permissions") + + # 9. Delete seed projects + count = db.query(Project).filter(Project.name.in_(SEED_PROJECT_NAMES)).delete( + synchronize_session=False + ) + results["projects_deleted"] = count + logger.info(f"Deleted {count} projects") + + # 10. Find and delete seed team + seed_team = db.query(Team).filter(Team.slug == SEED_TEAM_SLUG).first() + if seed_team: + # Delete team memberships first + count = db.query(TeamMembership).filter( + TeamMembership.team_id == seed_team.id + ).delete(synchronize_session=False) + results["team_memberships_deleted"] = count + logger.info(f"Deleted {count} team memberships") + + # Delete the team + db.delete(seed_team) + results["teams_deleted"] = 1 + logger.info(f"Deleted team: {SEED_TEAM_SLUG}") + + # 11. Delete seed users (but NOT admin) + seed_users = db.query(User).filter(User.username.in_(SEED_USERNAMES)).all() + for user in seed_users: + # Delete any remaining team memberships for this user + db.query(TeamMembership).filter(TeamMembership.user_id == user.id).delete( + synchronize_session=False + ) + # Delete any access permissions for this user + db.query(AccessPermission).filter(AccessPermission.user_id == user.id).delete( + synchronize_session=False + ) + db.delete(user) + results["users_deleted"] += 1 + + if results["users_deleted"] > 0: + logger.info(f"Deleted {results['users_deleted']} seed users") + + db.commit() + + logger.warning("SEED DATA PURGE COMPLETE") + logger.info(f"Purge results: {results}") + + return results diff --git a/backend/app/routes.py b/backend/app/routes.py index 337f5a5..4caba38 100644 --- a/backend/app/routes.py +++ b/backend/app/routes.py @@ -50,6 +50,9 @@ from .models import ( ArtifactDependency, Team, TeamMembership, + UpstreamSource, + CacheSettings, + CachedUrl, ) from .schemas import ( ProjectCreate, @@ -136,6 +139,8 @@ from .schemas import ( TeamMemberCreate, TeamMemberUpdate, TeamMemberResponse, + CacheRequest, + CacheResponse, ) from .metadata import extract_metadata from .dependencies import ( @@ -152,7 +157,7 @@ from .dependencies import ( DependencyNotFoundError, DependencyDepthExceededError, ) -from .config import get_settings +from .config import get_settings, get_env_upstream_sources from .checksum import ( ChecksumMismatchError, VerifyingStreamWrapper, @@ -1720,6 +1725,15 @@ def update_project( project = check_project_access(db, project_name, current_user, "admin") user_id = current_user.username if current_user else get_user_id(request) + # System project restrictions + if project.is_system: + # System projects must remain public + if project_update.is_public is not None and project_update.is_public is False: + raise HTTPException( + status_code=403, + detail="System projects cannot be made private. They must remain publicly accessible.", + ) + # Track changes for audit log changes = {} if ( @@ -1780,6 +1794,13 @@ def delete_project( if not project: raise HTTPException(status_code=404, detail="Project not found") + # System projects cannot be deleted + if project.is_system: + raise HTTPException( + status_code=403, + detail="System projects cannot be deleted. 
They are managed automatically by the cache system.", + ) + # Get counts for logging packages = db.query(Package).filter(Package.project_id == project.id).all() package_count = len(packages) @@ -1820,6 +1841,41 @@ def delete_project( return None +@router.get( + "/api/v1/system-projects", + response_model=List[ProjectResponse], + tags=["cache"], + summary="List system cache projects", +) +def list_system_projects( + db: Session = Depends(get_db), + current_user: Optional[User] = Depends(get_current_user_optional), +): + """ + List all system projects used for caching upstream artifacts. + + System projects are auto-created when artifacts are cached from upstream + registries. They use the naming convention `_npm`, `_pypi`, `_maven`, etc. + + System projects are always public and cannot be deleted or made private. + """ + # Any authenticated user can list system projects (they're public) + if not current_user: + raise HTTPException( + status_code=401, + detail="Authentication required", + ) + + projects = ( + db.query(Project) + .filter(Project.is_system == True) + .order_by(Project.name) + .all() + ) + + return projects + + # Access Permission routes @router.get( "/api/v1/project/{project_name}/permissions", @@ -7793,3 +7849,1150 @@ def resolve_artifact_dependencies( "max_depth": e.max_depth, } ) + + +# --- Upstream Caching Routes --- + +from .cache import ( + parse_url, + get_system_project_name, + get_system_project_description, +) +from .upstream import ( + UpstreamClient, + UpstreamClientConfig, + UpstreamError, + UpstreamConnectionError, + UpstreamTimeoutError, + UpstreamHTTPError, + UpstreamSSLError, + AirGapError, + FileSizeExceededError as UpstreamFileSizeExceededError, + SourceNotFoundError, + SourceDisabledError, +) + + +def _get_or_create_system_project( + db: Session, + source_type: str, +) -> Project: + """ + Get or create a system project for the given source type. + + System projects are auto-created on first cache request for a format type. + They have is_system=true, is_public=true, and cannot be deleted. + + Args: + db: Database session. + source_type: The source type (npm, pypi, maven, etc.) + + Returns: + The system project. + """ + project_name = get_system_project_name(source_type) + + # Check if project already exists + project = db.query(Project).filter(Project.name == project_name).first() + if project: + return project + + # Check if auto-create is enabled + cache_settings = db.query(CacheSettings).filter(CacheSettings.id == 1).first() + if cache_settings and not cache_settings.auto_create_system_projects: + raise HTTPException( + status_code=400, + detail=f"System project '{project_name}' does not exist and auto-creation is disabled", + ) + + # Create the system project + project = Project( + name=project_name, + description=get_system_project_description(source_type), + is_public=True, + is_system=True, + created_by="system", + ) + db.add(project) + db.flush() # Get the ID + + logger.info(f"Created system project: {project_name}") + return project + + +def _get_or_create_package( + db: Session, + project: Project, + package_name: str, + format_type: str = "generic", +) -> Package: + """ + Get or create a package within a project. + + Args: + db: Database session. + project: The project to create the package in. + package_name: The package name. + format_type: The package format type. + + Returns: + The package. 
+ """ + package = ( + db.query(Package) + .filter(Package.project_id == project.id, Package.name == package_name) + .first() + ) + if package: + return package + + # Create the package + package = Package( + project_id=project.id, + name=package_name, + description=f"Cached package: {package_name}", + format=format_type, + platform="any", + ) + db.add(package) + db.flush() + + logger.info(f"Created package: {project.name}/{package_name}") + return package + + +def _get_cache_settings(db: Session) -> CacheSettings: + """Get or create the cache settings singleton.""" + settings = db.query(CacheSettings).filter(CacheSettings.id == 1).first() + if not settings: + settings = CacheSettings(id=1) + db.add(settings) + db.flush() + return settings + + +def _get_enabled_upstream_sources(db: Session) -> list[UpstreamSource]: + """Get all enabled upstream sources, sorted by priority.""" + return ( + db.query(UpstreamSource) + .filter(UpstreamSource.enabled == True) + .order_by(UpstreamSource.priority) + .all() + ) + + +@router.post( + "/api/v1/cache", + response_model=CacheResponse, + tags=["cache"], + summary="Cache an artifact from an upstream URL", +) +def cache_artifact( + request: Request, + cache_request: CacheRequest, + db: Session = Depends(get_db), + storage: S3Storage = Depends(get_storage), + current_user: User = Depends(get_current_user), +): + """ + Cache an artifact from an upstream URL. + + Fetches an artifact from the specified URL, stores it in Orchard's + content-addressable storage, and creates appropriate tags in system + and optionally user projects. + + **Request Body:** + - `url` (required): URL to fetch the artifact from + - `source_type` (required): Type of source (npm, pypi, maven, docker, helm, nuget, deb, rpm, generic) + - `package_name` (optional): Package name in system project (auto-derived from URL if not provided) + - `tag` (optional): Tag name in system project (auto-derived from URL if not provided) + - `user_project` (optional): Also create reference in this user project + - `user_package` (optional): Package name in user project (required if user_project specified) + - `user_tag` (optional): Tag name in user project (defaults to system tag) + - `expected_hash` (optional): Verify downloaded content matches this SHA256 hash + + **Behavior:** + 1. Checks if URL is already cached (fast lookup by URL hash) + 2. If cached: Returns existing artifact info, optionally creates user tag + 3. If not cached: + - Fetches via configured upstream source (with auth if configured) + - Stores artifact in S3 (content-addressable) + - Creates system project/package/tag (e.g., _npm/lodash:4.17.21) + - Optionally creates tag in user project + - Records URL mapping for provenance + + **Air-Gap Mode:** + When `allow_public_internet` is false, only URLs matching private + (non-public) upstream sources are allowed. 
+ + **Example (curl):** + ```bash + curl -X POST "http://localhost:8080/api/v1/cache" \\ + -H "Authorization: Bearer <token>" \\ + -H "Content-Type: application/json" \\ + -d '{ + "url": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", + "source_type": "npm" + }' + ``` + """ + # Get cache settings and upstream sources + cache_settings = _get_cache_settings(db) + upstream_sources = _get_enabled_upstream_sources(db) + + # Parse URL to extract package info + parsed_url = parse_url(cache_request.url, cache_request.source_type) + package_name = cache_request.package_name or parsed_url.package_name + tag_name = cache_request.tag or parsed_url.version + + # Check if URL is already cached + url_hash = CachedUrl.compute_url_hash(cache_request.url) + cached_url = db.query(CachedUrl).filter(CachedUrl.url_hash == url_hash).first() + + if cached_url: + # URL already cached - return existing artifact + artifact = db.query(Artifact).filter(Artifact.id == cached_url.artifact_id).first() + if not artifact: + # Orphaned cached_url entry - this shouldn't happen + logger.error(f"Orphaned cached_url entry: {cached_url.id}") + db.delete(cached_url) + db.flush() + else: + # Get system project/package info + system_project_name = get_system_project_name(cache_request.source_type) + + # Optionally create user reference + user_reference = None + if cache_request.user_project and cache_request.user_package: + user_reference = _create_user_cache_reference( + db=db, + user_project_name=cache_request.user_project, + user_package_name=cache_request.user_package, + user_tag_name=cache_request.user_tag or tag_name, + artifact_id=artifact.id, + current_user=current_user, + ) + + # Audit log + _log_audit( + db, + action="cache.hit", + resource=f"cache/{cache_request.url[:100]}", + user_id=current_user.username, + source_ip=request.client.host if request.client else None, + details={ + "url": cache_request.url, + "artifact_id": artifact.id, + "already_cached": True, + }, + ) + + db.commit() + + return CacheResponse( + artifact_id=artifact.id, + sha256=artifact.id, + size=artifact.size, + content_type=artifact.content_type, + already_cached=True, + source_url=cache_request.url, + source_name=cached_url.source.name if cached_url.source else None, + system_project=system_project_name, + system_package=package_name, + system_tag=tag_name, + user_reference=user_reference, + ) + + # URL not cached - fetch from upstream + client_config = UpstreamClientConfig( + max_file_size=get_settings().max_file_size, + ) + client = UpstreamClient( + sources=upstream_sources, + cache_settings=cache_settings, + config=client_config, + ) + + try: + fetch_result = client.fetch( + cache_request.url, + expected_hash=cache_request.expected_hash, + ) + except AirGapError as e: + raise HTTPException(status_code=403, detail=str(e)) + except SourceDisabledError as e: + raise HTTPException(status_code=503, detail=str(e)) + except UpstreamHTTPError as e: + if e.status_code == 404: + raise HTTPException(status_code=404, detail=f"Upstream returned 404: {e}") + raise HTTPException(status_code=502, detail=f"Upstream error: {e}") + except UpstreamConnectionError as e: + raise HTTPException(status_code=502, detail=f"Failed to connect to upstream: {e}") + except UpstreamTimeoutError as e: + raise HTTPException(status_code=504, detail=f"Upstream request timed out: {e}") + except UpstreamSSLError as e: + raise HTTPException(status_code=502, detail=f"SSL error connecting to upstream: {e}") + except UpstreamFileSizeExceededError as e: + raise
HTTPException(status_code=413, detail=str(e)) + except UpstreamError as e: + # Check for hash mismatch + if "Hash mismatch" in str(e): + raise HTTPException(status_code=409, detail=str(e)) + raise HTTPException(status_code=502, detail=f"Failed to fetch from upstream: {e}") + + try: + # Store artifact in S3 + storage_result = storage.store(fetch_result.content, fetch_result.size) + + # Close the fetch result (cleans up temp file) + fetch_result.close() + + # Verify hash matches what we computed during download + if storage_result.sha256 != fetch_result.sha256: + logger.error( + f"Hash mismatch after storage: fetch={fetch_result.sha256}, " + f"storage={storage_result.sha256}" + ) + raise HTTPException( + status_code=500, + detail="Hash verification failed after storage", + ) + + # Create or get artifact record + artifact = ( + db.query(Artifact) + .filter(Artifact.id == storage_result.sha256) + .with_for_update() + .first() + ) + + if not artifact: + artifact = Artifact( + id=storage_result.sha256, + size=storage_result.size, + content_type=fetch_result.content_type, + original_name=parsed_url.filename, + checksum_md5=storage_result.md5, + checksum_sha1=storage_result.sha1, + s3_etag=storage_result.s3_etag, + created_by=current_user.username, + s3_key=storage_result.s3_key, + artifact_metadata={"source_url": cache_request.url}, + ref_count=0, + ) + db.add(artifact) + db.flush() + + # Create system project and package + system_project = _get_or_create_system_project(db, cache_request.source_type) + system_package = _get_or_create_package( + db, system_project, package_name, cache_request.source_type + ) + + # Create tag in system package + if tag_name: + _create_or_update_tag( + db, system_package.id, tag_name, artifact.id, "system" + ) + + # Find the matched source for provenance + matched_source = None + for source in upstream_sources: + if cache_request.url.startswith(source.url.rstrip("/")): + matched_source = source + break + + # Record in cached_urls table + cached_url_record = CachedUrl( + url=cache_request.url, + url_hash=url_hash, + artifact_id=artifact.id, + source_id=matched_source.id if matched_source else None, + response_headers=fetch_result.response_headers, + ) + db.add(cached_url_record) + + # Optionally create user reference + user_reference = None + if cache_request.user_project and cache_request.user_package: + user_reference = _create_user_cache_reference( + db=db, + user_project_name=cache_request.user_project, + user_package_name=cache_request.user_package, + user_tag_name=cache_request.user_tag or tag_name, + artifact_id=artifact.id, + current_user=current_user, + ) + + # Audit log + _log_audit( + db, + action="cache.miss", + resource=f"cache/{cache_request.url[:100]}", + user_id=current_user.username, + source_ip=request.client.host if request.client else None, + details={ + "url": cache_request.url, + "artifact_id": artifact.id, + "size": artifact.size, + "source_name": matched_source.name if matched_source else None, + "system_project": system_project.name, + "system_package": system_package.name, + "system_tag": tag_name, + }, + ) + + db.commit() + + return CacheResponse( + artifact_id=artifact.id, + sha256=artifact.id, + size=artifact.size, + content_type=artifact.content_type, + already_cached=False, + source_url=cache_request.url, + source_name=matched_source.name if matched_source else None, + system_project=system_project.name, + system_package=system_package.name, + system_tag=tag_name, + user_reference=user_reference, + ) + + except HTTPException: + raise + 
except Exception as e: + logger.error(f"Error caching artifact: {e}", exc_info=True) + db.rollback() + raise HTTPException(status_code=500, detail=f"Failed to cache artifact: {e}") + finally: + # Ensure fetch result is closed even on error + try: + fetch_result.close() + except Exception: + pass + + +def _create_user_cache_reference( + db: Session, + user_project_name: str, + user_package_name: str, + user_tag_name: str, + artifact_id: str, + current_user: User, +) -> str: + """ + Create a reference to a cached artifact in a user's project. + + Args: + db: Database session. + user_project_name: User's project name. + user_package_name: Package name in user's project. + user_tag_name: Tag name in user's project. + artifact_id: The artifact ID to reference. + current_user: The current user (for auth check). + + Returns: + Reference string like "my-app/npm-deps:lodash-4.17.21" + """ + # Check user has write access to the project + user_project = check_project_access(db, user_project_name, current_user, "write") + + # Get or create package in user project + user_package = _get_or_create_package( + db, user_project, user_package_name, "generic" + ) + + # Create tag + if user_tag_name: + _create_or_update_tag( + db, user_package.id, user_tag_name, artifact_id, current_user.username + ) + return f"{user_project_name}/{user_package_name}:{user_tag_name}" + + return f"{user_project_name}/{user_package_name}" + + +# --- Upstream Sources Admin API --- + +from .schemas import ( + UpstreamSourceCreate, + UpstreamSourceUpdate, + UpstreamSourceResponse, +) + + +def _env_source_to_response(env_source) -> UpstreamSourceResponse: + """Convert an EnvUpstreamSource to UpstreamSourceResponse.""" + import uuid + # Generate deterministic UUID based on source name + # Uses UUID5 with DNS namespace for consistency + source_id = uuid.uuid5(uuid.NAMESPACE_DNS, f"env-upstream-{env_source.name}") + return UpstreamSourceResponse( + id=source_id, + name=env_source.name, + source_type=env_source.source_type, + url=env_source.url, + enabled=env_source.enabled, + is_public=env_source.is_public, + auth_type=env_source.auth_type, + username=env_source.username, + has_password=bool(env_source.password), + has_headers=False, # Env sources don't support custom headers + priority=env_source.priority, + source="env", + created_at=None, + updated_at=None, + ) + + +@router.get( + "/api/v1/admin/upstream-sources", + response_model=List[UpstreamSourceResponse], + tags=["admin", "cache"], + summary="List all upstream sources", +) +def list_upstream_sources( + enabled: Optional[bool] = Query(None, description="Filter by enabled status"), + source_type: Optional[str] = Query(None, description="Filter by source type (comma-separated)"), + db: Session = Depends(get_db), + current_user: User = Depends(require_admin), +): + """ + List all configured upstream sources. + + Admin-only endpoint for managing upstream artifact sources. + Passwords and API keys are never returned - only `has_password` and `has_headers` flags. + + Sources can be defined in the database or via environment variables. + Env-defined sources have `source: "env"` and cannot be modified via API. + + **Filters:** + - `enabled`: Filter by enabled status (true/false) + - `source_type`: Filter by source type (npm, pypi, maven, etc.) 
- comma-separated for multiple + + **Example:** + ``` + GET /api/v1/admin/upstream-sources?enabled=true&source_type=npm,pypi + ``` + """ + # Get env-defined sources + env_sources = get_env_upstream_sources() + + # Filter env sources + filtered_env_sources = [] + for es in env_sources: + if enabled is not None and es.enabled != enabled: + continue + if source_type: + types = [t.strip() for t in source_type.split(",")] + if es.source_type not in types: + continue + filtered_env_sources.append(es) + + # Get DB sources + query = db.query(UpstreamSource) + + if enabled is not None: + query = query.filter(UpstreamSource.enabled == enabled) + + if source_type: + types = [t.strip() for t in source_type.split(",")] + query = query.filter(UpstreamSource.source_type.in_(types)) + + db_sources = query.order_by(UpstreamSource.priority, UpstreamSource.name).all() + + # Build response list - env sources first, then DB sources + result = [] + + # Add env sources + for es in filtered_env_sources: + result.append(_env_source_to_response(es)) + + # Add DB sources + for s in db_sources: + result.append( + UpstreamSourceResponse( + id=s.id, + name=s.name, + source_type=s.source_type, + url=s.url, + enabled=s.enabled, + is_public=s.is_public, + auth_type=s.auth_type, + username=s.username, + has_password=s.has_password(), + has_headers=bool(s.headers_encrypted), + priority=s.priority, + source="database", + created_at=s.created_at, + updated_at=s.updated_at, + ) + ) + + # Sort by priority, then name + result.sort(key=lambda x: (x.priority, x.name)) + + return result + + +@router.post( + "/api/v1/admin/upstream-sources", + response_model=UpstreamSourceResponse, + status_code=201, + tags=["admin", "cache"], + summary="Create upstream source", +) +def create_upstream_source( + source_create: UpstreamSourceCreate, + request: Request, + db: Session = Depends(get_db), + current_user: User = Depends(require_admin), +): + """ + Create a new upstream source. + + Admin-only endpoint for adding upstream artifact sources (npm, PyPI, Maven, etc.). 
+ + **Auth types:** + - `none`: Anonymous access + - `basic`: Username/password authentication + - `bearer`: Bearer token in Authorization header + - `api_key`: Custom headers (provide in `headers` field) + + **Example:** + ```json + { + "name": "npm-private", + "source_type": "npm", + "url": "https://npm.internal.corp", + "enabled": true, + "is_public": false, + "auth_type": "basic", + "username": "reader", + "password": "secret123", + "priority": 50 + } + ``` + """ + # Check for duplicate name + existing = db.query(UpstreamSource).filter(UpstreamSource.name == source_create.name).first() + if existing: + raise HTTPException( + status_code=409, + detail=f"Upstream source with name '{source_create.name}' already exists", + ) + + # Create the source + source = UpstreamSource( + name=source_create.name, + source_type=source_create.source_type, + url=source_create.url, + enabled=source_create.enabled, + is_public=source_create.is_public, + auth_type=source_create.auth_type, + username=source_create.username, + priority=source_create.priority, + ) + + # Set encrypted fields + if source_create.password: + source.set_password(source_create.password) + if source_create.headers: + source.set_headers(source_create.headers) + + db.add(source) + db.flush() + + # Audit log + _log_audit( + db, + action="upstream_source.create", + resource=f"upstream-source/{source.name}", + user_id=current_user.username, + source_ip=request.client.host if request.client else None, + details={ + "source_id": str(source.id), + "name": source.name, + "source_type": source.source_type, + "url": source.url, + "enabled": source.enabled, + }, + ) + + db.commit() + db.refresh(source) + + return UpstreamSourceResponse( + id=source.id, + name=source.name, + source_type=source.source_type, + url=source.url, + enabled=source.enabled, + is_public=source.is_public, + auth_type=source.auth_type, + username=source.username, + has_password=source.has_password(), + has_headers=bool(source.headers_encrypted), + priority=source.priority, + created_at=source.created_at, + updated_at=source.updated_at, + ) + + +@router.get( + "/api/v1/admin/upstream-sources/{source_id}", + response_model=UpstreamSourceResponse, + tags=["admin", "cache"], + summary="Get upstream source details", +) +def get_upstream_source( + source_id: str, + db: Session = Depends(get_db), + current_user: User = Depends(require_admin), +): + """ + Get details of a specific upstream source. + + Returns source configuration with `has_password` and `has_headers` flags + instead of actual credentials. 
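+
+ **Example response (illustrative; values are hypothetical and fields abridged):**
+ ```json
+ {
+ "name": "npm-private",
+ "source_type": "npm",
+ "url": "https://npm.internal.corp",
+ "enabled": true,
+ "is_public": false,
+ "auth_type": "basic",
+ "username": "reader",
+ "has_password": true,
+ "has_headers": false,
+ "priority": 50,
+ "source": "database"
+ }
+ ```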
+ """ + import uuid + + # Check env sources first + env_sources = get_env_upstream_sources() + for es in env_sources: + env_id = uuid.uuid5(uuid.NAMESPACE_DNS, f"env-upstream-{es.name}") + if str(env_id) == source_id: + return _env_source_to_response(es) + + # Check DB source + source = db.query(UpstreamSource).filter(UpstreamSource.id == source_id).first() + if not source: + raise HTTPException(status_code=404, detail="Upstream source not found") + + return UpstreamSourceResponse( + id=source.id, + name=source.name, + source_type=source.source_type, + url=source.url, + enabled=source.enabled, + is_public=source.is_public, + auth_type=source.auth_type, + username=source.username, + has_password=source.has_password(), + has_headers=bool(source.headers_encrypted), + priority=source.priority, + source="database", + created_at=source.created_at, + updated_at=source.updated_at, + ) + + +@router.put( + "/api/v1/admin/upstream-sources/{source_id}", + response_model=UpstreamSourceResponse, + tags=["admin", "cache"], + summary="Update upstream source", +) +def update_upstream_source( + source_id: str, + source_update: UpstreamSourceUpdate, + request: Request, + db: Session = Depends(get_db), + current_user: User = Depends(require_admin), +): + """ + Update an upstream source. + + Supports partial updates - only provided fields are updated. + Environment-defined sources cannot be modified via API. + + **Password handling:** + - Omit `password` field: Keep existing password + - Set `password` to empty string `""`: Clear password + - Set `password` to value: Update password + + **Headers handling:** + - Omit `headers` field: Keep existing headers + - Set `headers` to `null` or `{}`: Clear headers + - Set `headers` to value: Update headers + """ + import uuid + + # Check if this is an env-defined source + env_sources = get_env_upstream_sources() + for es in env_sources: + env_id = uuid.uuid5(uuid.NAMESPACE_DNS, f"env-upstream-{es.name}") + if str(env_id) == source_id: + raise HTTPException( + status_code=400, + detail="Cannot modify environment-defined upstream source. 
Update the environment variables instead.", + ) + + source = db.query(UpstreamSource).filter(UpstreamSource.id == source_id).first() + if not source: + raise HTTPException(status_code=404, detail="Upstream source not found") + + # Track changes for audit log + changes = {} + + # Update fields if provided + if source_update.name is not None and source_update.name != source.name: + # Check for duplicate name + existing = db.query(UpstreamSource).filter( + UpstreamSource.name == source_update.name, + UpstreamSource.id != source_id, + ).first() + if existing: + raise HTTPException( + status_code=409, + detail=f"Upstream source with name '{source_update.name}' already exists", + ) + changes["name"] = {"old": source.name, "new": source_update.name} + source.name = source_update.name + + if source_update.source_type is not None and source_update.source_type != source.source_type: + changes["source_type"] = {"old": source.source_type, "new": source_update.source_type} + source.source_type = source_update.source_type + + if source_update.url is not None and source_update.url != source.url: + changes["url"] = {"old": source.url, "new": source_update.url} + source.url = source_update.url + + if source_update.enabled is not None and source_update.enabled != source.enabled: + changes["enabled"] = {"old": source.enabled, "new": source_update.enabled} + source.enabled = source_update.enabled + + if source_update.is_public is not None and source_update.is_public != source.is_public: + changes["is_public"] = {"old": source.is_public, "new": source_update.is_public} + source.is_public = source_update.is_public + + if source_update.auth_type is not None and source_update.auth_type != source.auth_type: + changes["auth_type"] = {"old": source.auth_type, "new": source_update.auth_type} + source.auth_type = source_update.auth_type + + if source_update.username is not None and source_update.username != source.username: + changes["username"] = {"old": source.username, "new": source_update.username} + source.username = source_update.username + + if source_update.priority is not None and source_update.priority != source.priority: + changes["priority"] = {"old": source.priority, "new": source_update.priority} + source.priority = source_update.priority + + # Handle password - None means keep, empty string means clear, value means update + if source_update.password is not None: + if source_update.password == "": + if source.password_encrypted: + changes["password"] = {"action": "cleared"} + source.password_encrypted = None + else: + changes["password"] = {"action": "updated"} + source.set_password(source_update.password) + + # Handle headers + if source_update.headers is not None: + if not source_update.headers: # Empty dict or None-like + if source.headers_encrypted: + changes["headers"] = {"action": "cleared"} + source.headers_encrypted = None + else: + changes["headers"] = {"action": "updated"} + source.set_headers(source_update.headers) + + if changes: + # Audit log + _log_audit( + db, + action="upstream_source.update", + resource=f"upstream-source/{source.name}", + user_id=current_user.username, + source_ip=request.client.host if request.client else None, + details={"source_id": str(source.id), "changes": changes}, + ) + + db.commit() + db.refresh(source) + + return UpstreamSourceResponse( + id=source.id, + name=source.name, + source_type=source.source_type, + url=source.url, + enabled=source.enabled, + is_public=source.is_public, + auth_type=source.auth_type, + username=source.username, + 
has_password=source.has_password(), + has_headers=bool(source.headers_encrypted), + priority=source.priority, + created_at=source.created_at, + updated_at=source.updated_at, + ) + + +@router.delete( + "/api/v1/admin/upstream-sources/{source_id}", + status_code=204, + tags=["admin", "cache"], + summary="Delete upstream source", +) +def delete_upstream_source( + source_id: str, + request: Request, + db: Session = Depends(get_db), + current_user: User = Depends(require_admin), +): + """ + Delete an upstream source. + + Environment-defined sources cannot be deleted via API. + + **Warning:** Deleting a source that has been used to cache artifacts + will remove the provenance information. The cached artifacts themselves + will remain, but their `source_id` in `cached_urls` will become NULL. + """ + import uuid + + # Check if this is an env-defined source + env_sources = get_env_upstream_sources() + for es in env_sources: + env_id = uuid.uuid5(uuid.NAMESPACE_DNS, f"env-upstream-{es.name}") + if str(env_id) == source_id: + raise HTTPException( + status_code=400, + detail="Cannot delete environment-defined upstream source. Remove the environment variables instead.", + ) + + source = db.query(UpstreamSource).filter(UpstreamSource.id == source_id).first() + if not source: + raise HTTPException(status_code=404, detail="Upstream source not found") + + source_name = source.name + + # Nullify source_id in cached_urls (don't delete the cache entries) + db.query(CachedUrl).filter(CachedUrl.source_id == source_id).update( + {"source_id": None}, synchronize_session=False + ) + + db.delete(source) + + # Audit log + _log_audit( + db, + action="upstream_source.delete", + resource=f"upstream-source/{source_name}", + user_id=current_user.username, + source_ip=request.client.host if request.client else None, + details={"source_id": source_id, "name": source_name}, + ) + + db.commit() + return None + + +@router.post( + "/api/v1/admin/upstream-sources/{source_id}/test", + tags=["admin", "cache"], + summary="Test upstream source connectivity", +) +def test_upstream_source( + source_id: str, + db: Session = Depends(get_db), + current_user: User = Depends(require_admin), +): + """ + Test connectivity to an upstream source. + + Performs a HEAD request to the source URL to verify: + - Network connectivity + - DNS resolution + - SSL/TLS certificate validity + - Authentication credentials (if configured) + + Returns success/failure with status code and timing information. + Does not cache anything. 
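+
+ **Example response (illustrative; values are hypothetical and fields abridged):**
+ ```json
+ {
+ "success": true,
+ "source_name": "npm-private",
+ "url": "https://npm.internal.corp",
+ "status_code": 200,
+ "error": null,
+ "elapsed_ms": 142
+ }
+ ```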
+ """ + source = db.query(UpstreamSource).filter(UpstreamSource.id == source_id).first() + if not source: + raise HTTPException(status_code=404, detail="Upstream source not found") + + import time + start_time = time.time() + + # Use the UpstreamClient to test connection + client = UpstreamClient() + success, error_message, status_code = client.test_connection(source) + + elapsed_ms = int((time.time() - start_time) * 1000) + + return { + "success": success, + "source_id": str(source.id), + "source_name": source.name, + "url": source.url, + "status_code": status_code, + "error": error_message, + "elapsed_ms": elapsed_ms, + } + + +# --- Global Cache Settings Admin API --- + +from .schemas import ( + CacheSettingsResponse, + CacheSettingsUpdate, +) + + +@router.get( + "/api/v1/admin/cache-settings", + response_model=CacheSettingsResponse, + tags=["admin", "cache"], + summary="Get global cache settings", +) +def get_cache_settings( + db: Session = Depends(get_db), + current_user: User = Depends(require_admin), +): + """ + Get current global cache settings. + + Admin-only endpoint for viewing cache configuration. + + **Settings:** + - `allow_public_internet`: When false, blocks all requests to sources marked `is_public=true` (air-gap mode) + - `auto_create_system_projects`: When true, system projects (`_npm`, etc.) are created automatically on first cache + + **Environment variable overrides:** + Settings can be overridden via environment variables: + - `ORCHARD_CACHE_ALLOW_PUBLIC_INTERNET`: Overrides `allow_public_internet` + - `ORCHARD_CACHE_AUTO_CREATE_SYSTEM_PROJECTS`: Overrides `auto_create_system_projects` + + When an env var override is active, the `*_env_override` field will contain the override value. + """ + app_settings = get_settings() + db_settings = _get_cache_settings(db) + + # Apply env var overrides + allow_public_internet = db_settings.allow_public_internet + allow_public_internet_env_override = None + if app_settings.cache_allow_public_internet is not None: + allow_public_internet = app_settings.cache_allow_public_internet + allow_public_internet_env_override = app_settings.cache_allow_public_internet + + auto_create_system_projects = db_settings.auto_create_system_projects + auto_create_system_projects_env_override = None + if app_settings.cache_auto_create_system_projects is not None: + auto_create_system_projects = app_settings.cache_auto_create_system_projects + auto_create_system_projects_env_override = app_settings.cache_auto_create_system_projects + + return CacheSettingsResponse( + allow_public_internet=allow_public_internet, + auto_create_system_projects=auto_create_system_projects, + allow_public_internet_env_override=allow_public_internet_env_override, + auto_create_system_projects_env_override=auto_create_system_projects_env_override, + created_at=db_settings.created_at, + updated_at=db_settings.updated_at, + ) + + +@router.put( + "/api/v1/admin/cache-settings", + response_model=CacheSettingsResponse, + tags=["admin", "cache"], + summary="Update global cache settings", +) +def update_cache_settings( + settings_update: CacheSettingsUpdate, + request: Request, + db: Session = Depends(get_db), + current_user: User = Depends(require_admin), +): + """ + Update global cache settings. + + Admin-only endpoint for configuring cache behavior. + Supports partial updates - only provided fields are updated. 
+ + **Settings:** + - `allow_public_internet`: When false, enables air-gap mode (blocks public sources) + - `auto_create_system_projects`: When false, system projects must be created manually + + **Note:** Environment variables can override these settings. When overridden, + the `*_env_override` fields in the response indicate the effective value. + Updates to the database will be saved but won't take effect until the env var is removed. + + **Warning:** Changing `allow_public_internet` to false will immediately block + all cache requests to public sources. This is a security-sensitive setting + and is logged prominently. + """ + app_settings = get_settings() + settings = _get_cache_settings(db) + + # Track changes for audit log + changes = {} + + if settings_update.allow_public_internet is not None: + if settings_update.allow_public_internet != settings.allow_public_internet: + changes["allow_public_internet"] = { + "old": settings.allow_public_internet, + "new": settings_update.allow_public_internet, + } + settings.allow_public_internet = settings_update.allow_public_internet + + # Log prominently for security audit + if not settings_update.allow_public_internet: + logger.warning( + f"AIR-GAP MODE ENABLED by {current_user.username} - " + f"all public internet access is now blocked" + ) + else: + logger.warning( + f"AIR-GAP MODE DISABLED by {current_user.username} - " + f"public internet access is now allowed" + ) + + if settings_update.auto_create_system_projects is not None: + if settings_update.auto_create_system_projects != settings.auto_create_system_projects: + changes["auto_create_system_projects"] = { + "old": settings.auto_create_system_projects, + "new": settings_update.auto_create_system_projects, + } + settings.auto_create_system_projects = settings_update.auto_create_system_projects + + if changes: + # Audit log with security flag for air-gap changes + is_security_change = "allow_public_internet" in changes + _log_audit( + db, + action="cache_settings.update" if not is_security_change else "cache_settings.security_update", + resource="cache-settings", + user_id=current_user.username, + source_ip=request.client.host if request.client else None, + details={"changes": changes}, + ) + + db.commit() + db.refresh(settings) + + # Apply env var overrides for the response + allow_public_internet = settings.allow_public_internet + allow_public_internet_env_override = None + if app_settings.cache_allow_public_internet is not None: + allow_public_internet = app_settings.cache_allow_public_internet + allow_public_internet_env_override = app_settings.cache_allow_public_internet + + auto_create_system_projects = settings.auto_create_system_projects + auto_create_system_projects_env_override = None + if app_settings.cache_auto_create_system_projects is not None: + auto_create_system_projects = app_settings.cache_auto_create_system_projects + auto_create_system_projects_env_override = app_settings.cache_auto_create_system_projects + + return CacheSettingsResponse( + allow_public_internet=allow_public_internet, + auto_create_system_projects=auto_create_system_projects, + allow_public_internet_env_override=allow_public_internet_env_override, + auto_create_system_projects_env_override=auto_create_system_projects_env_override, + created_at=settings.created_at, + updated_at=settings.updated_at, + ) diff --git a/backend/app/schemas.py b/backend/app/schemas.py index d378a8c..522f074 100644 --- a/backend/app/schemas.py +++ b/backend/app/schemas.py @@ -1196,3 +1196,246 @@ class 
TeamMemberResponse(BaseModel): class Config: from_attributes = True + +# ============================================================================= +# Upstream Caching Schemas +# ============================================================================= + +# Valid source types +SOURCE_TYPES = ["npm", "pypi", "maven", "docker", "helm", "nuget", "deb", "rpm", "generic"] + +# Valid auth types +AUTH_TYPES = ["none", "basic", "bearer", "api_key"] + + +class UpstreamSourceCreate(BaseModel): + """Create a new upstream source""" + name: str + source_type: str = "generic" + url: str + enabled: bool = False + is_public: bool = True + auth_type: str = "none" + username: Optional[str] = None + password: Optional[str] = None # Write-only + headers: Optional[dict] = None # Write-only, custom headers + priority: int = 100 + + @field_validator('name') + @classmethod + def validate_name(cls, v: str) -> str: + v = v.strip() + if not v: + raise ValueError("name cannot be empty") + if len(v) > 255: + raise ValueError("name must be 255 characters or less") + return v + + @field_validator('source_type') + @classmethod + def validate_source_type(cls, v: str) -> str: + if v not in SOURCE_TYPES: + raise ValueError(f"source_type must be one of: {', '.join(SOURCE_TYPES)}") + return v + + @field_validator('url') + @classmethod + def validate_url(cls, v: str) -> str: + v = v.strip() + if not v: + raise ValueError("url cannot be empty") + if not (v.startswith('http://') or v.startswith('https://')): + raise ValueError("url must start with http:// or https://") + if len(v) > 2048: + raise ValueError("url must be 2048 characters or less") + return v + + @field_validator('auth_type') + @classmethod + def validate_auth_type(cls, v: str) -> str: + if v not in AUTH_TYPES: + raise ValueError(f"auth_type must be one of: {', '.join(AUTH_TYPES)}") + return v + + @field_validator('priority') + @classmethod + def validate_priority(cls, v: int) -> int: + if v <= 0: + raise ValueError("priority must be greater than 0") + return v + + +class UpstreamSourceUpdate(BaseModel): + """Update an upstream source (partial)""" + name: Optional[str] = None + source_type: Optional[str] = None + url: Optional[str] = None + enabled: Optional[bool] = None + is_public: Optional[bool] = None + auth_type: Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None # Write-only, None = keep existing, empty string = clear + headers: Optional[dict] = None # Write-only + priority: Optional[int] = None + + @field_validator('name') + @classmethod + def validate_name(cls, v: Optional[str]) -> Optional[str]: + if v is not None: + v = v.strip() + if not v: + raise ValueError("name cannot be empty") + if len(v) > 255: + raise ValueError("name must be 255 characters or less") + return v + + @field_validator('source_type') + @classmethod + def validate_source_type(cls, v: Optional[str]) -> Optional[str]: + if v is not None and v not in SOURCE_TYPES: + raise ValueError(f"source_type must be one of: {', '.join(SOURCE_TYPES)}") + return v + + @field_validator('url') + @classmethod + def validate_url(cls, v: Optional[str]) -> Optional[str]: + if v is not None: + v = v.strip() + if not v: + raise ValueError("url cannot be empty") + if not (v.startswith('http://') or v.startswith('https://')): + raise ValueError("url must start with http:// or https://") + if len(v) > 2048: + raise ValueError("url must be 2048 characters or less") + return v + + @field_validator('auth_type') + @classmethod + def validate_auth_type(cls, v: Optional[str]) 
-> Optional[str]: + if v is not None and v not in AUTH_TYPES: + raise ValueError(f"auth_type must be one of: {', '.join(AUTH_TYPES)}") + return v + + @field_validator('priority') + @classmethod + def validate_priority(cls, v: Optional[int]) -> Optional[int]: + if v is not None and v <= 0: + raise ValueError("priority must be greater than 0") + return v + + +class UpstreamSourceResponse(BaseModel): + """Upstream source response (credentials never included)""" + id: UUID + name: str + source_type: str + url: str + enabled: bool + is_public: bool + auth_type: str + username: Optional[str] + has_password: bool # True if password is set + has_headers: bool # True if custom headers are set + priority: int + source: str = "database" # "database" or "env" (env = defined via environment variables) + created_at: Optional[datetime] = None # May be None for legacy/env data + updated_at: Optional[datetime] = None # May be None for legacy/env data + + class Config: + from_attributes = True + + +class CacheSettingsResponse(BaseModel): + """Global cache settings response""" + allow_public_internet: bool + auto_create_system_projects: bool + allow_public_internet_env_override: Optional[bool] = None # Set if overridden by env var + auto_create_system_projects_env_override: Optional[bool] = None # Set if overridden by env var + created_at: Optional[datetime] = None # May be None for legacy data + updated_at: Optional[datetime] = None # May be None for legacy data + + class Config: + from_attributes = True + + +class CacheSettingsUpdate(BaseModel): + """Update cache settings (partial)""" + allow_public_internet: Optional[bool] = None + auto_create_system_projects: Optional[bool] = None + + +class CachedUrlResponse(BaseModel): + """Cached URL response""" + id: UUID + url: str + url_hash: str + artifact_id: str + source_id: Optional[UUID] + source_name: Optional[str] = None # Populated from join + fetched_at: datetime + created_at: datetime + + class Config: + from_attributes = True + + +class CacheRequest(BaseModel): + """Request to cache an artifact from an upstream URL""" + url: str + source_type: str + package_name: Optional[str] = None # Auto-derived from URL if not provided + tag: Optional[str] = None # Auto-derived from URL if not provided + user_project: Optional[str] = None # Cross-reference to user project + user_package: Optional[str] = None + user_tag: Optional[str] = None + expected_hash: Optional[str] = None # Verify downloaded content + + @field_validator('url') + @classmethod + def validate_url(cls, v: str) -> str: + v = v.strip() + if not v: + raise ValueError("url cannot be empty") + if not (v.startswith('http://') or v.startswith('https://')): + raise ValueError("url must start with http:// or https://") + if len(v) > 4096: + raise ValueError("url must be 4096 characters or less") + return v + + @field_validator('source_type') + @classmethod + def validate_source_type(cls, v: str) -> str: + if v not in SOURCE_TYPES: + raise ValueError(f"source_type must be one of: {', '.join(SOURCE_TYPES)}") + return v + + @field_validator('expected_hash') + @classmethod + def validate_expected_hash(cls, v: Optional[str]) -> Optional[str]: + if v is not None: + v = v.strip().lower() + # Remove sha256: prefix if present + if v.startswith('sha256:'): + v = v[7:] + # Validate hex format + if len(v) != 64 or not all(c in '0123456789abcdef' for c in v): + raise ValueError("expected_hash must be a 64-character hex string (SHA256)") + return v + + +class CacheResponse(BaseModel): + """Response from caching an 
artifact""" + artifact_id: str + sha256: str + size: int + content_type: Optional[str] + already_cached: bool + source_url: str + source_name: Optional[str] + system_project: str + system_package: str + system_tag: Optional[str] + user_reference: Optional[str] = None # e.g., "my-app/npm-deps:lodash-4.17.21" + + + diff --git a/backend/app/upstream.py b/backend/app/upstream.py new file mode 100644 index 0000000..46e8113 --- /dev/null +++ b/backend/app/upstream.py @@ -0,0 +1,586 @@ +""" +HTTP client for fetching artifacts from upstream sources. + +Provides streaming downloads with SHA256 computation, authentication support, +and automatic source matching based on URL prefixes. +""" + +from __future__ import annotations + +import hashlib +import logging +import tempfile +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import BinaryIO, Optional, TYPE_CHECKING +from urllib.parse import urlparse + +import httpx + +if TYPE_CHECKING: + from .models import CacheSettings, UpstreamSource + +logger = logging.getLogger(__name__) + + +class UpstreamError(Exception): + """Base exception for upstream client errors.""" + + pass + + +class UpstreamConnectionError(UpstreamError): + """Connection to upstream failed (network error, DNS, etc.).""" + + pass + + +class UpstreamTimeoutError(UpstreamError): + """Request to upstream timed out.""" + + pass + + +class UpstreamHTTPError(UpstreamError): + """Upstream returned an HTTP error response.""" + + def __init__(self, message: str, status_code: int, response_headers: dict = None): + super().__init__(message) + self.status_code = status_code + self.response_headers = response_headers or {} + + +class UpstreamSSLError(UpstreamError): + """SSL/TLS error when connecting to upstream.""" + + pass + + +class AirGapError(UpstreamError): + """Request blocked due to air-gap mode.""" + + pass + + +class FileSizeExceededError(UpstreamError): + """File size exceeds the maximum allowed.""" + + def __init__(self, message: str, content_length: int, max_size: int): + super().__init__(message) + self.content_length = content_length + self.max_size = max_size + + +class SourceNotFoundError(UpstreamError): + """No matching upstream source found for URL.""" + + pass + + +class SourceDisabledError(UpstreamError): + """The matching upstream source is disabled.""" + + pass + + +@dataclass +class FetchResult: + """Result of fetching an artifact from upstream.""" + + content: BinaryIO # File-like object with content + sha256: str # SHA256 hash of content + size: int # Size in bytes + content_type: Optional[str] # Content-Type header + response_headers: dict # All response headers for provenance + source_name: Optional[str] = None # Name of matched upstream source + temp_path: Optional[Path] = None # Path to temp file (for cleanup) + + def close(self): + """Close and clean up resources.""" + if self.content: + try: + self.content.close() + except Exception: + pass + if self.temp_path and self.temp_path.exists(): + try: + self.temp_path.unlink() + except Exception: + pass + + +@dataclass +class UpstreamClientConfig: + """Configuration for the upstream client.""" + + connect_timeout: float = 30.0 # Connection timeout in seconds + read_timeout: float = 300.0 # Read timeout in seconds (5 minutes for large files) + max_retries: int = 3 # Maximum number of retry attempts + retry_backoff_base: float = 1.0 # Base delay for exponential backoff + retry_backoff_max: float = 30.0 # Maximum delay between retries + follow_redirects: bool = True # Whether to 
follow redirects + max_redirects: int = 5 # Maximum number of redirects to follow + max_file_size: Optional[int] = None # Maximum file size (None = unlimited) + verify_ssl: bool = True # Verify SSL certificates + user_agent: str = "Orchard-UpstreamClient/1.0" + + +class UpstreamClient: + """ + HTTP client for fetching artifacts from upstream sources. + + Supports streaming downloads, multiple authentication methods, + automatic source matching, and air-gap mode enforcement. + """ + + def __init__( + self, + sources: list[UpstreamSource] = None, + cache_settings: CacheSettings = None, + config: UpstreamClientConfig = None, + ): + """ + Initialize the upstream client. + + Args: + sources: List of upstream sources for URL matching and auth. + Should be sorted by priority (lowest first). + cache_settings: Global cache settings including air-gap mode. + config: Client configuration options. + """ + self.sources = sources or [] + self.cache_settings = cache_settings + self.config = config or UpstreamClientConfig() + + # Sort sources by priority (lower = higher priority) + self.sources = sorted(self.sources, key=lambda s: s.priority) + + def _get_allow_public_internet(self) -> bool: + """Get the allow_public_internet setting.""" + if self.cache_settings is None: + return True # Default to allowing if no settings provided + return self.cache_settings.allow_public_internet + + def _match_source(self, url: str) -> Optional[UpstreamSource]: + """ + Find the upstream source that matches the given URL. + + Matches by URL prefix, returns the highest priority match. + + Args: + url: The URL to match. + + Returns: + The matching UpstreamSource or None if no match. + """ + for source in self.sources: + # Check if URL starts with source URL (prefix match) + if url.startswith(source.url.rstrip("/")): + return source + + return None + + def _build_auth_headers(self, source: UpstreamSource) -> dict: + """ + Build authentication headers for the given source. + + Args: + source: The upstream source with auth configuration. + + Returns: + Dictionary of headers to add to the request. + """ + headers = {} + + if source.auth_type == "none": + pass + elif source.auth_type == "basic": + # httpx handles basic auth via auth parameter, but we can also + # do it manually if needed. We'll use the auth parameter instead. + pass + elif source.auth_type == "bearer": + password = source.get_password() + if password: + headers["Authorization"] = f"Bearer {password}" + elif source.auth_type == "api_key": + # API key auth uses custom headers + custom_headers = source.get_headers() + if custom_headers: + headers.update(custom_headers) + + return headers + + def _get_basic_auth(self, source: UpstreamSource) -> Optional[tuple[str, str]]: + """ + Get basic auth credentials if applicable. + + Args: + source: The upstream source. + + Returns: + Tuple of (username, password) or None. + """ + if source.auth_type == "basic" and source.username: + password = source.get_password() or "" + return (source.username, password) + return None + + def _should_retry(self, error: Exception, attempt: int) -> bool: + """ + Determine if a request should be retried. + + Args: + error: The exception that occurred. + attempt: Current attempt number (0-indexed). + + Returns: + True if the request should be retried. 
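+
+ For example, an httpx.ConnectError on the first of three attempts is retried, while an httpx.HTTPStatusError with status 404 is never retried (only 502, 503, and 504 are).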
+ """ + if attempt >= self.config.max_retries - 1: + return False + + # Retry on connection errors and timeouts + if isinstance(error, (httpx.ConnectError, httpx.ConnectTimeout)): + return True + + # Retry on read timeouts + if isinstance(error, httpx.ReadTimeout): + return True + + # Retry on certain HTTP errors (502, 503, 504) + if isinstance(error, httpx.HTTPStatusError): + return error.response.status_code in (502, 503, 504) + + return False + + def _calculate_backoff(self, attempt: int) -> float: + """ + Calculate backoff delay for retry. + + Uses exponential backoff with jitter. + + Args: + attempt: Current attempt number (0-indexed). + + Returns: + Delay in seconds. + """ + import random + + delay = self.config.retry_backoff_base * (2**attempt) + # Add jitter (±25%) + delay *= 0.75 + random.random() * 0.5 + return min(delay, self.config.retry_backoff_max) + + def fetch(self, url: str, expected_hash: Optional[str] = None) -> FetchResult: + """ + Fetch an artifact from the given URL. + + Streams the response to a temp file while computing the SHA256 hash. + Handles authentication, retries, and error cases. + + Args: + url: The URL to fetch. + expected_hash: Optional expected SHA256 hash for verification. + + Returns: + FetchResult with content, hash, size, and headers. + + Raises: + AirGapError: If air-gap mode blocks the request. + SourceDisabledError: If the matching source is disabled. + UpstreamConnectionError: On connection failures. + UpstreamTimeoutError: On timeout. + UpstreamHTTPError: On HTTP error responses. + UpstreamSSLError: On SSL/TLS errors. + FileSizeExceededError: If Content-Length exceeds max_file_size. + """ + start_time = time.time() + + # Match URL to source + source = self._match_source(url) + + # Check air-gap mode + allow_public = self._get_allow_public_internet() + + if not allow_public: + if source is None: + raise AirGapError( + f"Air-gap mode enabled: URL does not match any configured upstream source: {url}" + ) + if source.is_public: + raise AirGapError( + f"Air-gap mode enabled: Cannot fetch from public source '{source.name}'" + ) + + # Check if source is enabled (if we have a match) + if source is not None and not source.enabled: + raise SourceDisabledError( + f"Upstream source '{source.name}' is disabled" + ) + + source_name = source.name if source else None + logger.info( + f"Fetching URL: {url} (source: {source_name or 'none'})" + ) + + # Build request parameters + headers = {"User-Agent": self.config.user_agent} + auth = None + + if source: + headers.update(self._build_auth_headers(source)) + auth = self._get_basic_auth(source) + + timeout = httpx.Timeout( + connect=self.config.connect_timeout, + read=self.config.read_timeout, + write=30.0, + pool=10.0, + ) + + # Attempt fetch with retries + last_error = None + for attempt in range(self.config.max_retries): + try: + return self._do_fetch( + url=url, + headers=headers, + auth=auth, + timeout=timeout, + source_name=source_name, + start_time=start_time, + expected_hash=expected_hash, + ) + except ( + httpx.ConnectError, + httpx.ConnectTimeout, + httpx.ReadTimeout, + httpx.HTTPStatusError, + ) as e: + last_error = e + if self._should_retry(e, attempt): + delay = self._calculate_backoff(attempt) + logger.warning( + f"Fetch failed (attempt {attempt + 1}/{self.config.max_retries}), " + f"retrying in {delay:.1f}s: {e}" + ) + time.sleep(delay) + else: + break + + # Convert final error to our exception types + self._raise_upstream_error(last_error, url) + + def _do_fetch( + self, + url: str, + headers: dict, 
+ auth: Optional[tuple[str, str]], + timeout: httpx.Timeout, + source_name: Optional[str], + start_time: float, + expected_hash: Optional[str] = None, + ) -> FetchResult: + """ + Perform the actual fetch operation. + + Args: + url: URL to fetch. + headers: Request headers. + auth: Basic auth credentials or None. + timeout: Request timeout configuration. + source_name: Name of matched source for logging. + start_time: Request start time for timing. + expected_hash: Optional expected hash for verification. + + Returns: + FetchResult with content and metadata. + """ + with httpx.Client( + timeout=timeout, + follow_redirects=self.config.follow_redirects, + max_redirects=self.config.max_redirects, + verify=self.config.verify_ssl, + ) as client: + with client.stream("GET", url, headers=headers, auth=auth) as response: + # Check for HTTP errors + response.raise_for_status() + + # Check Content-Length against max size + content_length = response.headers.get("content-length") + if content_length: + content_length = int(content_length) + if ( + self.config.max_file_size + and content_length > self.config.max_file_size + ): + raise FileSizeExceededError( + f"File size {content_length} exceeds maximum {self.config.max_file_size}", + content_length, + self.config.max_file_size, + ) + + # Stream to temp file while computing hash + hasher = hashlib.sha256() + size = 0 + + # Create temp file + temp_file = tempfile.NamedTemporaryFile( + delete=False, prefix="orchard_upstream_" + ) + temp_path = Path(temp_file.name) + + try: + for chunk in response.iter_bytes(chunk_size=65536): + temp_file.write(chunk) + hasher.update(chunk) + size += len(chunk) + + # Check size while streaming if max_file_size is set + if self.config.max_file_size and size > self.config.max_file_size: + temp_file.close() + temp_path.unlink() + raise FileSizeExceededError( + f"Downloaded size {size} exceeds maximum {self.config.max_file_size}", + size, + self.config.max_file_size, + ) + + temp_file.close() + + sha256 = hasher.hexdigest() + + # Verify hash if expected + if expected_hash and sha256 != expected_hash.lower(): + temp_path.unlink() + raise UpstreamError( + f"Hash mismatch: expected {expected_hash}, got {sha256}" + ) + + # Capture response headers + response_headers = dict(response.headers) + + # Get content type + content_type = response.headers.get("content-type") + + elapsed = time.time() - start_time + logger.info( + f"Fetched {url}: {size} bytes, sha256={sha256[:12]}..., " + f"source={source_name}, time={elapsed:.2f}s" + ) + + # Return file handle positioned at start + content = open(temp_path, "rb") + + return FetchResult( + content=content, + sha256=sha256, + size=size, + content_type=content_type, + response_headers=response_headers, + source_name=source_name, + temp_path=temp_path, + ) + + except Exception: + # Clean up on error + try: + temp_file.close() + except Exception: + pass + if temp_path.exists(): + temp_path.unlink() + raise + + def _raise_upstream_error(self, error: Exception, url: str): + """ + Convert httpx exception to appropriate UpstreamError. + + Args: + error: The httpx exception. + url: The URL that was being fetched. + + Raises: + Appropriate UpstreamError subclass. 
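+
+ For example, httpx.ConnectError becomes UpstreamConnectionError, and a 502 httpx.HTTPStatusError becomes UpstreamHTTPError with the status code and response headers preserved.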
+ """ + if error is None: + raise UpstreamError(f"Unknown error fetching {url}") + + if isinstance(error, httpx.ConnectError): + raise UpstreamConnectionError( + f"Failed to connect to upstream: {error}" + ) from error + + if isinstance(error, (httpx.ConnectTimeout, httpx.ReadTimeout)): + raise UpstreamTimeoutError( + f"Request timed out: {error}" + ) from error + + if isinstance(error, httpx.HTTPStatusError): + raise UpstreamHTTPError( + f"HTTP {error.response.status_code}: {error}", + error.response.status_code, + dict(error.response.headers), + ) from error + + # Check for SSL errors in the error chain + if "ssl" in str(error).lower() or "certificate" in str(error).lower(): + raise UpstreamSSLError(f"SSL/TLS error: {error}") from error + + raise UpstreamError(f"Error fetching {url}: {error}") from error + + def test_connection(self, source: UpstreamSource) -> tuple[bool, Optional[str], Optional[int]]: + """ + Test connectivity to an upstream source. + + Performs a HEAD request to the source URL to verify connectivity + and authentication. + + Args: + source: The upstream source to test. + + Returns: + Tuple of (success, error_message, status_code). + """ + headers = {"User-Agent": self.config.user_agent} + headers.update(self._build_auth_headers(source)) + auth = self._get_basic_auth(source) + + timeout = httpx.Timeout( + connect=self.config.connect_timeout, + read=30.0, + write=30.0, + pool=10.0, + ) + + try: + with httpx.Client( + timeout=timeout, + verify=self.config.verify_ssl, + ) as client: + response = client.head( + source.url, + headers=headers, + auth=auth, + follow_redirects=True, + ) + # Consider 2xx and 3xx as success, also 405 (Method Not Allowed) + # since some servers don't support HEAD + if response.status_code < 400 or response.status_code == 405: + return (True, None, response.status_code) + else: + return ( + False, + f"HTTP {response.status_code}", + response.status_code, + ) + except httpx.ConnectError as e: + return (False, f"Connection failed: {e}", None) + except httpx.ConnectTimeout as e: + return (False, f"Connection timed out: {e}", None) + except httpx.ReadTimeout as e: + return (False, f"Read timed out: {e}", None) + except Exception as e: + return (False, f"Error: {e}", None) diff --git a/backend/requirements.txt b/backend/requirements.txt index 604f19c..c1abed0 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -11,10 +11,10 @@ python-jose[cryptography]==3.3.0 passlib[bcrypt]==1.7.4 bcrypt==4.0.1 slowapi==0.1.9 +httpx>=0.25.0 # Test dependencies pytest>=7.4.0 pytest-asyncio>=0.21.0 pytest-cov>=4.1.0 -httpx>=0.25.0 moto[s3]>=4.2.0 diff --git a/backend/tests/test_upstream_caching.py b/backend/tests/test_upstream_caching.py new file mode 100644 index 0000000..84917bf --- /dev/null +++ b/backend/tests/test_upstream_caching.py @@ -0,0 +1,2051 @@ +""" +Tests for upstream artifact caching schema. + +Tests models, schemas, and encryption for the upstream caching feature. 
+""" + +import os +import pytest +from unittest.mock import patch +from pydantic import ValidationError + + +class TestEncryptionModule: + """Tests for the encryption module.""" + + def test_encrypt_decrypt_roundtrip(self): + """Test that encryption and decryption work correctly.""" + from app.encryption import encrypt_value, decrypt_value + + plaintext = "my-secret-password" + encrypted = encrypt_value(plaintext) + + assert isinstance(encrypted, bytes) + assert encrypted != plaintext.encode() + + decrypted = decrypt_value(encrypted) + assert decrypted == plaintext + + def test_encrypt_different_each_time(self): + """Test that encrypting the same value produces different ciphertext.""" + from app.encryption import encrypt_value + + plaintext = "test-password" + encrypted1 = encrypt_value(plaintext) + encrypted2 = encrypt_value(plaintext) + + # Fernet includes timestamp, so each encryption is unique + assert encrypted1 != encrypted2 + + def test_encrypt_empty_value_raises(self): + """Test that encrypting empty value raises ValueError.""" + from app.encryption import encrypt_value + + with pytest.raises(ValueError, match="Cannot encrypt empty"): + encrypt_value("") + + def test_decrypt_empty_value_raises(self): + """Test that decrypting empty value raises ValueError.""" + from app.encryption import decrypt_value + + with pytest.raises(ValueError, match="Cannot decrypt empty"): + decrypt_value(b"") + + def test_can_decrypt_valid(self): + """Test can_decrypt returns True for valid encrypted data.""" + from app.encryption import encrypt_value, can_decrypt + + encrypted = encrypt_value("test-password") + assert can_decrypt(encrypted) is True + + def test_can_decrypt_invalid(self): + """Test can_decrypt returns False for invalid data.""" + from app.encryption import can_decrypt + + assert can_decrypt(b"invalid-data") is False + assert can_decrypt(b"") is False + + def test_generate_key_format(self): + """Test that generated keys are valid Fernet keys.""" + from app.encryption import generate_key + from cryptography.fernet import Fernet + + key = generate_key() + assert isinstance(key, str) + + # Should be valid for creating a Fernet instance + fernet = Fernet(key.encode()) + assert fernet is not None + + +class TestUpstreamSourceModel: + """Tests for UpstreamSource SQLAlchemy model.""" + + def test_model_fields_exist(self): + """Test that model has all expected fields.""" + from app.models import UpstreamSource + + source = UpstreamSource() + assert hasattr(source, 'id') + assert hasattr(source, 'name') + assert hasattr(source, 'source_type') + assert hasattr(source, 'url') + assert hasattr(source, 'enabled') + assert hasattr(source, 'is_public') + assert hasattr(source, 'auth_type') + assert hasattr(source, 'username') + assert hasattr(source, 'password_encrypted') + assert hasattr(source, 'headers_encrypted') + assert hasattr(source, 'priority') + + def test_model_with_values(self): + """Test that model can be created with explicit values.""" + from app.models import UpstreamSource + + source = UpstreamSource( + name="npm-private", + source_type="npm", + url="https://npm.example.com", + enabled=True, + is_public=False, + auth_type="basic", + username="admin", + priority=50, + ) + assert source.name == "npm-private" + assert source.source_type == "npm" + assert source.url == "https://npm.example.com" + assert source.enabled is True + assert source.is_public is False + assert source.auth_type == "basic" + assert source.username == "admin" + assert source.priority == 50 + + def 
test_set_password_encrypts(self): + """Test that set_password encrypts the value.""" + from app.models import UpstreamSource + + source = UpstreamSource() + source.set_password("my-api-key") + + assert source.password_encrypted is not None + assert isinstance(source.password_encrypted, bytes) + assert b"my-api-key" not in source.password_encrypted + + def test_get_password_decrypts(self): + """Test that get_password decrypts the value.""" + from app.models import UpstreamSource + + source = UpstreamSource() + source.set_password("my-api-key") + + decrypted = source.get_password() + assert decrypted == "my-api-key" + + def test_set_password_none_clears(self): + """Test that set_password with empty string clears the password.""" + from app.models import UpstreamSource + + source = UpstreamSource() + source.set_password("my-api-key") + assert source.password_encrypted is not None + + source.set_password("") + assert source.password_encrypted is None + + def test_has_password(self): + """Test has_password helper method.""" + from app.models import UpstreamSource + + source = UpstreamSource() + assert source.has_password() is False + + source.set_password("secret") + assert source.has_password() is True + + def test_set_headers_encrypts(self): + """Test that set_headers encrypts custom headers.""" + from app.models import UpstreamSource + + source = UpstreamSource() + headers = {"X-API-Key": "secret123", "X-Custom": "value"} + source.set_headers(headers) + + assert source.headers_encrypted is not None + assert isinstance(source.headers_encrypted, bytes) + + def test_get_headers_decrypts(self): + """Test that get_headers decrypts custom headers.""" + from app.models import UpstreamSource + + source = UpstreamSource() + headers = {"X-API-Key": "secret123", "X-Custom": "value"} + source.set_headers(headers) + + decrypted = source.get_headers() + assert decrypted == headers + + +class TestCacheSettingsModel: + """Tests for CacheSettings SQLAlchemy model.""" + + def test_model_fields_exist(self): + """Test that model has all expected fields.""" + from app.models import CacheSettings + + settings = CacheSettings() + assert hasattr(settings, 'id') + assert hasattr(settings, 'allow_public_internet') + assert hasattr(settings, 'auto_create_system_projects') + + def test_model_with_values(self): + """Test that model can be created with explicit values.""" + from app.models import CacheSettings + + settings = CacheSettings( + id=1, + allow_public_internet=False, + auto_create_system_projects=True, + ) + assert settings.id == 1 + assert settings.allow_public_internet is False + assert settings.auto_create_system_projects is True + + +class TestCachedUrlModel: + """Tests for CachedUrl SQLAlchemy model.""" + + def test_model_fields_exist(self): + """Test that model has all expected fields.""" + from app.models import CachedUrl + + cached = CachedUrl() + assert hasattr(cached, 'id') + assert hasattr(cached, 'url') + assert hasattr(cached, 'url_hash') + assert hasattr(cached, 'artifact_id') + assert hasattr(cached, 'source_id') + assert hasattr(cached, 'fetched_at') + assert hasattr(cached, 'response_headers') + + def test_compute_url_hash(self): + """Test URL hash computation.""" + from app.models import CachedUrl + + url = "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz" + hash1 = CachedUrl.compute_url_hash(url) + + # Should be 64-character hex string (SHA256) + assert len(hash1) == 64 + assert all(c in '0123456789abcdef' for c in hash1) + + # Same URL should produce same hash + hash2 = 
CachedUrl.compute_url_hash(url) + assert hash1 == hash2 + + # Different URL should produce different hash + hash3 = CachedUrl.compute_url_hash("https://example.com/other") + assert hash1 != hash3 + + +class TestUpstreamSourceSchemas: + """Tests for Pydantic upstream source schemas.""" + + def test_create_schema_valid(self): + """Test UpstreamSourceCreate with valid values.""" + from app.schemas import UpstreamSourceCreate + + source = UpstreamSourceCreate( + name="npm-private", + source_type="npm", + url="https://npm.example.com", + enabled=True, + is_public=False, + auth_type="basic", + username="admin", + password="secret", + priority=50, + ) + assert source.name == "npm-private" + assert source.source_type == "npm" + assert source.url == "https://npm.example.com" + assert source.priority == 50 + + def test_create_schema_defaults(self): + """Test UpstreamSourceCreate default values.""" + from app.schemas import UpstreamSourceCreate + + source = UpstreamSourceCreate( + name="test", + url="https://example.com", + ) + assert source.source_type == "generic" + assert source.enabled is False + assert source.is_public is True + assert source.auth_type == "none" + assert source.priority == 100 + + def test_create_schema_invalid_source_type(self): + """Test UpstreamSourceCreate rejects invalid source_type.""" + from app.schemas import UpstreamSourceCreate + + with pytest.raises(ValidationError) as exc_info: + UpstreamSourceCreate( + name="test", + url="https://example.com", + source_type="invalid", + ) + assert "source_type must be one of" in str(exc_info.value) + + def test_create_schema_invalid_auth_type(self): + """Test UpstreamSourceCreate rejects invalid auth_type.""" + from app.schemas import UpstreamSourceCreate + + with pytest.raises(ValidationError) as exc_info: + UpstreamSourceCreate( + name="test", + url="https://example.com", + auth_type="invalid", + ) + assert "auth_type must be one of" in str(exc_info.value) + + def test_create_schema_invalid_url(self): + """Test UpstreamSourceCreate rejects invalid URL.""" + from app.schemas import UpstreamSourceCreate + + with pytest.raises(ValidationError) as exc_info: + UpstreamSourceCreate( + name="test", + url="not-a-url", + ) + assert "url must start with http" in str(exc_info.value) + + def test_create_schema_invalid_priority(self): + """Test UpstreamSourceCreate rejects invalid priority.""" + from app.schemas import UpstreamSourceCreate + + with pytest.raises(ValidationError) as exc_info: + UpstreamSourceCreate( + name="test", + url="https://example.com", + priority=0, + ) + assert "priority must be greater than 0" in str(exc_info.value) + + def test_update_schema_all_optional(self): + """Test UpstreamSourceUpdate allows all fields to be optional.""" + from app.schemas import UpstreamSourceUpdate + + update = UpstreamSourceUpdate() + assert update.name is None + assert update.url is None + + def test_update_schema_partial(self): + """Test UpstreamSourceUpdate with partial fields.""" + from app.schemas import UpstreamSourceUpdate + + update = UpstreamSourceUpdate(enabled=True, priority=50) + assert update.enabled is True + assert update.priority == 50 + assert update.name is None + + def test_response_schema_no_secrets(self): + """Test UpstreamSourceResponse doesn't have secret fields.""" + from app.schemas import UpstreamSourceResponse + + field_names = set(UpstreamSourceResponse.model_fields.keys()) + assert "password" not in field_names + assert "password_encrypted" not in field_names + assert "headers" not in field_names + assert 
"headers_encrypted" not in field_names + assert "has_password" in field_names + assert "has_headers" in field_names + + +class TestCacheSettingsSchemas: + """Tests for Pydantic cache settings schemas.""" + + def test_update_schema_all_optional(self): + """Test CacheSettingsUpdate allows all fields to be optional.""" + from app.schemas import CacheSettingsUpdate + + update = CacheSettingsUpdate() + assert update.allow_public_internet is None + assert update.auto_create_system_projects is None + + def test_update_schema_partial(self): + """Test CacheSettingsUpdate with partial fields.""" + from app.schemas import CacheSettingsUpdate + + update = CacheSettingsUpdate(allow_public_internet=False) + assert update.allow_public_internet is False + assert update.auto_create_system_projects is None + + +class TestCacheRequestSchemas: + """Tests for Pydantic cache request schemas.""" + + def test_request_valid(self): + """Test CacheRequest with valid values.""" + from app.schemas import CacheRequest + + request = CacheRequest( + url="https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", + source_type="npm", + package_name="lodash", + tag="4.17.21", + ) + assert request.url == "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz" + assert request.source_type == "npm" + + def test_request_invalid_url(self): + """Test CacheRequest rejects invalid URL.""" + from app.schemas import CacheRequest + + with pytest.raises(ValidationError) as exc_info: + CacheRequest( + url="not-a-url", + source_type="npm", + ) + assert "url must start with http" in str(exc_info.value) + + def test_request_invalid_source_type(self): + """Test CacheRequest rejects invalid source_type.""" + from app.schemas import CacheRequest + + with pytest.raises(ValidationError) as exc_info: + CacheRequest( + url="https://example.com/file.tgz", + source_type="invalid", + ) + assert "source_type must be one of" in str(exc_info.value) + + def test_request_expected_hash_normalized(self): + """Test CacheRequest normalizes expected_hash.""" + from app.schemas import CacheRequest + + # With sha256: prefix + request = CacheRequest( + url="https://example.com/file.tgz", + source_type="generic", + expected_hash="sha256:abc123def456abc123def456abc123def456abc123def456abc123def456abc1", + ) + assert request.expected_hash == "abc123def456abc123def456abc123def456abc123def456abc123def456abc1" + + # Without prefix + request2 = CacheRequest( + url="https://example.com/file.tgz", + source_type="generic", + expected_hash="ABC123DEF456ABC123DEF456ABC123DEF456ABC123DEF456ABC123DEF456ABC1", + ) + assert request2.expected_hash == "abc123def456abc123def456abc123def456abc123def456abc123def456abc1" + + def test_request_invalid_expected_hash(self): + """Test CacheRequest rejects invalid expected_hash.""" + from app.schemas import CacheRequest + + with pytest.raises(ValidationError) as exc_info: + CacheRequest( + url="https://example.com/file.tgz", + source_type="generic", + expected_hash="not-a-valid-hash", + ) + assert "64-character hex string" in str(exc_info.value) + + +class TestSourceTypesConstant: + """Tests for source type constants.""" + + def test_source_types_contains_expected(self): + """Test SOURCE_TYPES contains all expected values.""" + from app.schemas import SOURCE_TYPES + + assert "npm" in SOURCE_TYPES + assert "pypi" in SOURCE_TYPES + assert "maven" in SOURCE_TYPES + assert "docker" in SOURCE_TYPES + assert "helm" in SOURCE_TYPES + assert "nuget" in SOURCE_TYPES + assert "deb" in SOURCE_TYPES + assert "rpm" in SOURCE_TYPES + assert "generic" in 
SOURCE_TYPES + + def test_auth_types_contains_expected(self): + """Test AUTH_TYPES contains all expected values.""" + from app.schemas import AUTH_TYPES + + assert "none" in AUTH_TYPES + assert "basic" in AUTH_TYPES + assert "bearer" in AUTH_TYPES + assert "api_key" in AUTH_TYPES + + +# ============================================================================= +# UpstreamClient Tests +# ============================================================================= + + +class TestUpstreamClientConfig: + """Tests for UpstreamClientConfig dataclass.""" + + def test_default_config(self): + """Test default configuration values.""" + from app.upstream import UpstreamClientConfig + + config = UpstreamClientConfig() + assert config.connect_timeout == 30.0 + assert config.read_timeout == 300.0 + assert config.max_retries == 3 + assert config.follow_redirects is True + assert config.max_redirects == 5 + assert config.max_file_size is None + assert config.verify_ssl is True + + def test_custom_config(self): + """Test custom configuration values.""" + from app.upstream import UpstreamClientConfig + + config = UpstreamClientConfig( + connect_timeout=10.0, + read_timeout=60.0, + max_retries=5, + max_file_size=1024 * 1024, + ) + assert config.connect_timeout == 10.0 + assert config.read_timeout == 60.0 + assert config.max_retries == 5 + assert config.max_file_size == 1024 * 1024 + + +class TestFetchResult: + """Tests for FetchResult dataclass.""" + + def test_fetch_result_creation(self): + """Test creating a FetchResult.""" + from io import BytesIO + from app.upstream import FetchResult + + content = BytesIO(b"test content") + result = FetchResult( + content=content, + sha256="abc123", + size=12, + content_type="text/plain", + response_headers={"x-custom": "value"}, + source_name="test-source", + ) + + assert result.sha256 == "abc123" + assert result.size == 12 + assert result.content_type == "text/plain" + assert result.source_name == "test-source" + + def test_fetch_result_close(self): + """Test that close() cleans up resources.""" + import tempfile + from pathlib import Path + from app.upstream import FetchResult + + # Create a temp file + with tempfile.NamedTemporaryFile(delete=False) as f: + f.write(b"test") + temp_path = Path(f.name) + + content = open(temp_path, "rb") + result = FetchResult( + content=content, + sha256="abc", + size=4, + content_type=None, + response_headers={}, + temp_path=temp_path, + ) + + assert temp_path.exists() + result.close() + assert not temp_path.exists() + + +class TestUpstreamClientSourceMatching: + """Tests for URL-to-source matching.""" + + def test_match_source_by_url_prefix(self): + """Test that sources are matched by URL prefix.""" + from app.models import UpstreamSource + from app.upstream import UpstreamClient + + source = UpstreamSource( + name="npm-public", + url="https://registry.npmjs.org", + enabled=True, + is_public=True, + auth_type="none", + priority=100, + ) + + client = UpstreamClient(sources=[source]) + + # Should match + matched = client._match_source("https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz") + assert matched is not None + assert matched.name == "npm-public" + + # Should not match + matched = client._match_source("https://pypi.org/simple/requests/") + assert matched is None + + def test_match_source_priority_order(self): + """Test that sources are matched by priority (lowest first).""" + from app.models import UpstreamSource + from app.upstream import UpstreamClient + + source1 = UpstreamSource( + name="npm-private", + 
url="https://registry.npmjs.org", + enabled=True, + is_public=False, + auth_type="basic", + priority=50, + ) + source2 = UpstreamSource( + name="npm-public", + url="https://registry.npmjs.org", + enabled=True, + is_public=True, + auth_type="none", + priority=100, + ) + + # Provide in wrong order - should be sorted by priority + client = UpstreamClient(sources=[source2, source1]) + + matched = client._match_source("https://registry.npmjs.org/lodash") + assert matched is not None + assert matched.name == "npm-private" # Lower priority wins + + def test_no_match_returns_none(self): + """Test that no match returns None.""" + from app.upstream import UpstreamClient + + client = UpstreamClient(sources=[]) + matched = client._match_source("https://example.com/file.tgz") + assert matched is None + + +class TestUpstreamClientAuthHeaders: + """Tests for authentication header building.""" + + def test_auth_none(self): + """Test no authentication.""" + from app.models import UpstreamSource + from app.upstream import UpstreamClient + + source = UpstreamSource(auth_type="none") + client = UpstreamClient() + + headers = client._build_auth_headers(source) + assert headers == {} + + def test_auth_bearer(self): + """Test bearer token authentication.""" + from app.models import UpstreamSource + from app.upstream import UpstreamClient + + source = UpstreamSource(auth_type="bearer") + source.set_password("my-bearer-token") + + client = UpstreamClient() + headers = client._build_auth_headers(source) + + assert headers == {"Authorization": "Bearer my-bearer-token"} + + def test_auth_api_key(self): + """Test API key authentication with custom headers.""" + from app.models import UpstreamSource + from app.upstream import UpstreamClient + + source = UpstreamSource(auth_type="api_key") + source.set_headers({"X-API-Key": "secret-key-123", "X-Custom": "value"}) + + client = UpstreamClient() + headers = client._build_auth_headers(source) + + assert headers == {"X-API-Key": "secret-key-123", "X-Custom": "value"} + + def test_auth_basic_returns_empty_headers(self): + """Test that basic auth doesn't add headers (uses httpx auth param).""" + from app.models import UpstreamSource + from app.upstream import UpstreamClient + + source = UpstreamSource(auth_type="basic", username="user") + source.set_password("pass") + + client = UpstreamClient() + headers = client._build_auth_headers(source) + + # Basic auth is handled via httpx auth parameter, not headers + assert headers == {} + + def test_get_basic_auth(self): + """Test getting basic auth credentials.""" + from app.models import UpstreamSource + from app.upstream import UpstreamClient + + source = UpstreamSource(auth_type="basic", username="user") + source.set_password("pass") + + client = UpstreamClient() + auth = client._get_basic_auth(source) + + assert auth == ("user", "pass") + + def test_get_basic_auth_no_username(self): + """Test basic auth without username returns None.""" + from app.models import UpstreamSource + from app.upstream import UpstreamClient + + source = UpstreamSource(auth_type="basic") + client = UpstreamClient() + auth = client._get_basic_auth(source) + + assert auth is None + + +class TestUpstreamClientAirGapMode: + """Tests for air-gap mode enforcement.""" + + def test_airgap_blocks_public_source(self): + """Test that air-gap mode blocks public sources.""" + from app.models import UpstreamSource, CacheSettings + from app.upstream import UpstreamClient, AirGapError + + source = UpstreamSource( + name="npm-public", + url="https://registry.npmjs.org", 
+ enabled=True, + is_public=True, + auth_type="none", + priority=100, + ) + settings = CacheSettings(allow_public_internet=False) + + client = UpstreamClient(sources=[source], cache_settings=settings) + + with pytest.raises(AirGapError) as exc_info: + client.fetch("https://registry.npmjs.org/lodash") + + assert "Air-gap mode enabled" in str(exc_info.value) + assert "public source" in str(exc_info.value) + + def test_airgap_blocks_unmatched_url(self): + """Test that air-gap mode blocks URLs not matching any source.""" + from app.models import CacheSettings + from app.upstream import UpstreamClient, AirGapError + + settings = CacheSettings(allow_public_internet=False) + client = UpstreamClient(sources=[], cache_settings=settings) + + with pytest.raises(AirGapError) as exc_info: + client.fetch("https://example.com/file.tgz") + + assert "Air-gap mode enabled" in str(exc_info.value) + assert "does not match any configured" in str(exc_info.value) + + def test_airgap_allows_private_source(self): + """Test that air-gap mode allows private sources.""" + from app.models import UpstreamSource, CacheSettings + from app.upstream import UpstreamClient, SourceDisabledError + + source = UpstreamSource( + name="npm-private", + url="https://npm.internal.corp", + enabled=False, # Disabled, but would pass air-gap check + is_public=False, + auth_type="none", + priority=100, + ) + settings = CacheSettings(allow_public_internet=False) + + client = UpstreamClient(sources=[source], cache_settings=settings) + + # Should fail due to disabled source, not air-gap + with pytest.raises(SourceDisabledError): + client.fetch("https://npm.internal.corp/package.tgz") + + def test_allow_public_internet_true(self): + """Test that public internet is allowed when setting is true.""" + from app.models import UpstreamSource, CacheSettings + from app.upstream import UpstreamClient, SourceDisabledError + + source = UpstreamSource( + name="npm-public", + url="https://registry.npmjs.org", + enabled=False, # Disabled + is_public=True, + auth_type="none", + priority=100, + ) + settings = CacheSettings(allow_public_internet=True) + + client = UpstreamClient(sources=[source], cache_settings=settings) + + # Should fail due to disabled source, not air-gap + with pytest.raises(SourceDisabledError): + client.fetch("https://registry.npmjs.org/lodash") + + +class TestUpstreamClientSourceDisabled: + """Tests for disabled source handling.""" + + def test_disabled_source_raises_error(self): + """Test that fetching from disabled source raises error.""" + from app.models import UpstreamSource + from app.upstream import UpstreamClient, SourceDisabledError + + source = UpstreamSource( + name="npm-public", + url="https://registry.npmjs.org", + enabled=False, + is_public=True, + auth_type="none", + priority=100, + ) + + client = UpstreamClient(sources=[source]) + + with pytest.raises(SourceDisabledError) as exc_info: + client.fetch("https://registry.npmjs.org/lodash") + + assert "npm-public" in str(exc_info.value) + assert "disabled" in str(exc_info.value) + + +class TestUpstreamClientRetryLogic: + """Tests for retry and backoff logic.""" + + def test_should_retry_connection_error(self): + """Test that connection errors trigger retry.""" + import httpx + from app.upstream import UpstreamClient + + client = UpstreamClient() + + error = httpx.ConnectError("Connection refused") + assert client._should_retry(error, 0) is True + assert client._should_retry(error, 1) is True + assert client._should_retry(error, 2) is False # Max retries + + def 
test_should_retry_timeout(self): + """Test that timeouts trigger retry.""" + import httpx + from app.upstream import UpstreamClient + + client = UpstreamClient() + + error = httpx.ReadTimeout("Read timed out") + assert client._should_retry(error, 0) is True + + def test_should_not_retry_4xx(self): + """Test that 4xx errors don't trigger retry.""" + import httpx + from app.upstream import UpstreamClient + + client = UpstreamClient() + + response = httpx.Response(404, request=httpx.Request("GET", "http://test")) + error = httpx.HTTPStatusError("Not found", request=response.request, response=response) + assert client._should_retry(error, 0) is False + + def test_should_retry_502_503_504(self): + """Test that 502, 503, 504 errors trigger retry.""" + import httpx + from app.upstream import UpstreamClient + + client = UpstreamClient() + + for status in [502, 503, 504]: + response = httpx.Response(status, request=httpx.Request("GET", "http://test")) + error = httpx.HTTPStatusError("Server error", request=response.request, response=response) + assert client._should_retry(error, 0) is True + + def test_calculate_backoff(self): + """Test exponential backoff calculation.""" + from app.upstream import UpstreamClient, UpstreamClientConfig + + config = UpstreamClientConfig( + retry_backoff_base=1.0, + retry_backoff_max=30.0, + ) + client = UpstreamClient(config=config) + + # First attempt should be around 1s (with jitter) + delay0 = client._calculate_backoff(0) + assert 0.75 <= delay0 <= 1.25 + + # Second attempt should be around 2s (with jitter) + delay1 = client._calculate_backoff(1) + assert 1.5 <= delay1 <= 2.5 + + # Third attempt should be around 4s (with jitter) + delay2 = client._calculate_backoff(2) + assert 3.0 <= delay2 <= 5.0 + + def test_backoff_respects_max(self): + """Test that backoff respects maximum delay.""" + from app.upstream import UpstreamClient, UpstreamClientConfig + + config = UpstreamClientConfig( + retry_backoff_base=10.0, + retry_backoff_max=5.0, # Max is less than base * 2^attempt + ) + client = UpstreamClient(config=config) + + delay = client._calculate_backoff(5) # Would be 10 * 32 = 320 + assert delay <= 5.0 + + +class TestUpstreamClientExceptionConversion: + """Tests for exception conversion.""" + + def test_convert_connect_error(self): + """Test converting connect error.""" + import httpx + from app.upstream import UpstreamClient, UpstreamConnectionError + + client = UpstreamClient() + error = httpx.ConnectError("Connection refused") + + with pytest.raises(UpstreamConnectionError): + client._raise_upstream_error(error, "http://test") + + def test_convert_timeout_error(self): + """Test converting timeout error.""" + import httpx + from app.upstream import UpstreamClient, UpstreamTimeoutError + + client = UpstreamClient() + error = httpx.ReadTimeout("Read timed out") + + with pytest.raises(UpstreamTimeoutError): + client._raise_upstream_error(error, "http://test") + + def test_convert_http_error(self): + """Test converting HTTP status error.""" + import httpx + from app.upstream import UpstreamClient, UpstreamHTTPError + + client = UpstreamClient() + response = httpx.Response( + 404, + request=httpx.Request("GET", "http://test"), + headers={"x-custom": "value"}, + ) + error = httpx.HTTPStatusError("Not found", request=response.request, response=response) + + with pytest.raises(UpstreamHTTPError) as exc_info: + client._raise_upstream_error(error, "http://test") + + assert exc_info.value.status_code == 404 + + +class TestUpstreamClientFileSizeLimit: + """Tests for file size 
limit enforcement.""" + + def test_file_size_limit_dataclass(self): + """Test FileSizeExceededError contains expected data.""" + from app.upstream import FileSizeExceededError + + error = FileSizeExceededError("Too large", 1000, 500) + assert error.content_length == 1000 + assert error.max_size == 500 + assert "Too large" in str(error) + + +class TestUpstreamExceptions: + """Tests for upstream exception classes.""" + + def test_upstream_error_base(self): + """Test base UpstreamError.""" + from app.upstream import UpstreamError + + error = UpstreamError("Test error") + assert str(error) == "Test error" + + def test_upstream_http_error(self): + """Test UpstreamHTTPError with status code.""" + from app.upstream import UpstreamHTTPError + + error = UpstreamHTTPError("Not found", 404, {"x-custom": "value"}) + assert error.status_code == 404 + assert error.response_headers == {"x-custom": "value"} + + def test_airgap_error(self): + """Test AirGapError.""" + from app.upstream import AirGapError + + error = AirGapError("Blocked by air-gap") + assert "Blocked by air-gap" in str(error) + + def test_source_not_found_error(self): + """Test SourceNotFoundError.""" + from app.upstream import SourceNotFoundError + + error = SourceNotFoundError("No source for URL") + assert "No source for URL" in str(error) + + def test_source_disabled_error(self): + """Test SourceDisabledError.""" + from app.upstream import SourceDisabledError + + error = SourceDisabledError("Source is disabled") + assert "Source is disabled" in str(error) + + +# ============================================================================= +# URL Parsing Tests +# ============================================================================= + + +class TestNpmUrlParsing: + """Tests for npm URL parsing.""" + + def test_parse_unscoped_package(self): + """Test parsing unscoped npm package URL.""" + from app.cache import parse_npm_url + + result = parse_npm_url("https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz") + assert result is not None + assert result.package_name == "lodash" + assert result.version == "4.17.21" + assert result.filename == "lodash-4.17.21.tgz" + + def test_parse_scoped_package(self): + """Test parsing scoped npm package URL.""" + from app.cache import parse_npm_url + + result = parse_npm_url("https://registry.npmjs.org/@types/node/-/node-18.0.0.tgz") + assert result is not None + assert result.package_name == "@types/node" + assert result.version == "18.0.0" + assert result.filename == "node-18.0.0.tgz" + + def test_parse_invalid_url(self): + """Test parsing invalid npm URL returns None.""" + from app.cache import parse_npm_url + + result = parse_npm_url("https://example.com/random-file.tgz") + assert result is None + + +class TestPypiUrlParsing: + """Tests for PyPI URL parsing.""" + + def test_parse_sdist_tar_gz(self): + """Test parsing PyPI source distribution.""" + from app.cache import parse_pypi_url + + result = parse_pypi_url("https://files.pythonhosted.org/packages/ab/cd/requests-2.28.0.tar.gz") + assert result is not None + assert result.package_name == "requests" + assert result.version == "2.28.0" + assert result.filename == "requests-2.28.0.tar.gz" + + def test_parse_wheel(self): + """Test parsing PyPI wheel file.""" + from app.cache import parse_pypi_url + + result = parse_pypi_url("https://files.pythonhosted.org/packages/ab/cd/requests-2.28.0-py3-none-any.whl") + assert result is not None + assert result.package_name == "requests" + assert result.version == "2.28.0" + + def 
test_parse_underscore_package(self): + """Test parsing package name with underscore.""" + from app.cache import parse_pypi_url + + result = parse_pypi_url("https://files.pythonhosted.org/packages/ab/cd/some_package-1.0.0.tar.gz") + assert result is not None + assert result.package_name == "some-package" # Normalized + assert result.version == "1.0.0" + + +class TestMavenUrlParsing: + """Tests for Maven URL parsing.""" + + def test_parse_maven_jar(self): + """Test parsing Maven JAR URL.""" + from app.cache import parse_maven_url + + result = parse_maven_url( + "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.jar" + ) + assert result is not None + assert result.package_name == "org.apache.commons:commons-lang3" + assert result.version == "3.12.0" + + def test_parse_maven_with_classifier(self): + """Test parsing Maven URL with version containing classifier.""" + from app.cache import parse_maven_url + + result = parse_maven_url( + "https://repo1.maven.org/maven2/com/google/guava/guava/31.1-jre/guava-31.1-jre.jar" + ) + assert result is not None + assert result.package_name == "com.google.guava:guava" + assert result.version == "31.1-jre" + + +class TestGenericUrlParsing: + """Tests for generic URL parsing.""" + + def test_parse_with_version(self): + """Test parsing generic URL with version in filename.""" + from app.cache import parse_generic_url + + result = parse_generic_url("https://example.com/downloads/myapp-1.2.3.tar.gz") + assert result.package_name == "myapp" + assert result.version == "1.2.3" + + def test_parse_without_version(self): + """Test parsing generic URL without version.""" + from app.cache import parse_generic_url + + result = parse_generic_url("https://example.com/downloads/artifact.tar.gz") + assert result.package_name == "artifact" + assert result.version is None + + def test_parse_various_extensions(self): + """Test parsing various file extensions.""" + from app.cache import parse_generic_url + + for ext in ["tar.gz", "tar.bz2", "zip", "jar", "deb", "rpm"]: + result = parse_generic_url(f"https://example.com/pkg-1.0.{ext}") + assert result.package_name == "pkg" + assert result.version == "1.0" + + +class TestParseUrl: + """Tests for the unified parse_url function.""" + + def test_npm_source_type(self): + """Test parse_url with npm source type.""" + from app.cache import parse_url + + result = parse_url( + "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", + "npm" + ) + assert result.package_name == "lodash" + assert result.version == "4.17.21" + + def test_fallback_to_generic(self): + """Test parse_url falls back to generic parsing.""" + from app.cache import parse_url + + # npm parser can't parse this, should fall back to generic + result = parse_url("https://example.com/myfile-1.0.tar.gz", "npm") + assert result.package_name == "myfile" + assert result.version == "1.0" + + def test_pypi_source_type(self): + """Test parse_url with pypi source type.""" + from app.cache import parse_url + + result = parse_url( + "https://files.pythonhosted.org/packages/ab/cd/requests-2.28.0.tar.gz", + "pypi" + ) + assert result.package_name == "requests" + assert result.version == "2.28.0" + + +class TestSystemProjectHelpers: + """Tests for system project helper functions.""" + + def test_get_system_project_name(self): + """Test getting system project names.""" + from app.cache import get_system_project_name + + assert get_system_project_name("npm") == "_npm" + assert get_system_project_name("pypi") == "_pypi" + assert 
get_system_project_name("maven") == "_maven" + assert get_system_project_name("docker") == "_docker" + assert get_system_project_name("unknown") == "_generic" + + def test_get_system_project_description(self): + """Test getting system project descriptions.""" + from app.cache import get_system_project_description + + assert "npm" in get_system_project_description("npm").lower() + assert "pypi" in get_system_project_description("pypi").lower() + + +# ============================================================================= +# Cache Endpoint Integration Tests +# ============================================================================= + + +class TestCacheEndpointRequiresAuth: + """Tests for cache endpoint authentication.""" + + @pytest.mark.integration + def test_cache_requires_authentication(self): + """Test that cache endpoint requires authentication.""" + import httpx + + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + # Use fresh client WITHOUT authentication + with httpx.Client(base_url=base_url, timeout=30.0) as unauthenticated_client: + response = unauthenticated_client.post( + "/api/v1/cache", + json={ + "url": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", + "source_type": "npm", + }, + ) + assert response.status_code in (401, 403) + + +class TestCacheRequestValidation: + """Tests for cache request validation.""" + + def test_cache_request_validates_url(self): + """Test that CacheRequest validates URL format.""" + from pydantic import ValidationError + from app.schemas import CacheRequest + + with pytest.raises(ValidationError) as exc_info: + CacheRequest(url="not-a-url", source_type="npm") + assert "url must start with http" in str(exc_info.value) + + def test_cache_request_validates_source_type(self): + """Test that CacheRequest validates source_type.""" + from pydantic import ValidationError + from app.schemas import CacheRequest + + with pytest.raises(ValidationError) as exc_info: + CacheRequest(url="https://example.com/file.tgz", source_type="invalid") + assert "source_type must be one of" in str(exc_info.value) + + def test_cache_request_valid(self): + """Test valid CacheRequest.""" + from app.schemas import CacheRequest + + request = CacheRequest( + url="https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", + source_type="npm", + package_name="lodash", + tag="4.17.21", + ) + assert request.url == "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz" + assert request.source_type == "npm" + + +class TestCacheResponseSchema: + """Tests for CacheResponse schema.""" + + def test_cache_response_fields(self): + """Test CacheResponse has expected fields.""" + from app.schemas import CacheResponse + + field_names = set(CacheResponse.model_fields.keys()) + assert "artifact_id" in field_names + assert "sha256" in field_names + assert "size" in field_names + assert "already_cached" in field_names + assert "source_url" in field_names + assert "system_project" in field_names + assert "system_package" in field_names + + +# ============================================================================= +# System Projects Tests +# ============================================================================= + + +class TestSystemProjectRestrictions: + """Tests for system project restrictions.""" + + @pytest.mark.integration + def test_cannot_delete_system_project(self, integration_client): + """Test that system projects cannot be deleted.""" + # First, create a system project by checking if _npm exists + # or we need to trigger its creation via 
the cache endpoint + response = integration_client.get("/api/v1/system-projects") + assert response.status_code == 200 + system_projects = response.json() + + # If there are no system projects, skip this test + if not system_projects: + pytest.skip("No system projects exist to test deletion") + + # Try to delete a system project + project_name = system_projects[0]["name"] + response = integration_client.delete(f"/api/v1/projects/{project_name}") + assert response.status_code == 403 + assert "cannot be deleted" in response.json()["detail"].lower() + + @pytest.mark.integration + def test_cannot_make_system_project_private(self, integration_client): + """Test that system projects cannot be made private.""" + response = integration_client.get("/api/v1/system-projects") + assert response.status_code == 200 + system_projects = response.json() + + if not system_projects: + pytest.skip("No system projects exist to test update") + + # Try to make a system project private + project_name = system_projects[0]["name"] + response = integration_client.put( + f"/api/v1/projects/{project_name}", + json={"is_public": False}, + ) + assert response.status_code == 403 + assert "cannot be made private" in response.json()["detail"].lower() + + @pytest.mark.integration + def test_can_update_system_project_description(self, integration_client): + """Test that system project descriptions can be updated.""" + response = integration_client.get("/api/v1/system-projects") + assert response.status_code == 200 + system_projects = response.json() + + if not system_projects: + pytest.skip("No system projects exist to test update") + + # Update description should work + project_name = system_projects[0]["name"] + new_description = "Updated description for testing" + response = integration_client.put( + f"/api/v1/projects/{project_name}", + json={"description": new_description}, + ) + assert response.status_code == 200 + assert response.json()["description"] == new_description + + +class TestSystemProjectsEndpoint: + """Tests for the system projects listing endpoint.""" + + @pytest.mark.integration + def test_list_system_projects_requires_auth(self): + """Test that listing system projects requires authentication.""" + import httpx + + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with httpx.Client(base_url=base_url, timeout=30.0) as unauthenticated_client: + response = unauthenticated_client.get("/api/v1/system-projects") + assert response.status_code == 401 + + @pytest.mark.integration + def test_list_system_projects_success(self, integration_client): + """Test listing system projects returns valid response.""" + response = integration_client.get("/api/v1/system-projects") + assert response.status_code == 200 + + # Response should be a list + data = response.json() + assert isinstance(data, list) + + # If any system projects exist, they should all have is_system=true + for project in data: + assert project.get("is_system") is True or project.get("name", "").startswith("_") + + +# ============================================================================= +# Upstream Sources Admin API Tests +# ============================================================================= + + +class TestUpstreamSourcesAdminAPI: + """Tests for the upstream sources admin API.""" + + @pytest.mark.integration + def test_list_upstream_sources_requires_admin(self): + """Test that listing upstream sources requires admin access.""" + import httpx + + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") 
+
+        with httpx.Client(base_url=base_url, timeout=30.0) as unauthenticated_client:
+            response = unauthenticated_client.get("/api/v1/admin/upstream-sources")
+            assert response.status_code in (401, 403)
+
+    @pytest.mark.integration
+    def test_list_upstream_sources_success(self, integration_client):
+        """Test listing upstream sources as admin."""
+        response = integration_client.get("/api/v1/admin/upstream-sources")
+        assert response.status_code == 200
+
+        data = response.json()
+        assert isinstance(data, list)
+
+        # Check that seeded sources exist; seeding may be skipped in some
+        # environments, so skip rather than assert vacuously
+        names = [s["name"] for s in data]
+        if not names:
+            pytest.skip("No upstream sources configured")
+        assert any("npm" in name.lower() for name in names)
+
+    @pytest.mark.integration
+    def test_list_upstream_sources_filter_by_enabled(self, integration_client):
+        """Test filtering upstream sources by enabled status."""
+        response = integration_client.get("/api/v1/admin/upstream-sources?enabled=true")
+        assert response.status_code == 200
+
+        data = response.json()
+        for source in data:
+            assert source["enabled"] is True
+
+    @pytest.mark.integration
+    def test_list_upstream_sources_filter_by_type(self, integration_client):
+        """Test filtering upstream sources by source type."""
+        response = integration_client.get("/api/v1/admin/upstream-sources?source_type=npm")
+        assert response.status_code == 200
+
+        data = response.json()
+        for source in data:
+            assert source["source_type"] == "npm"
+
+    @pytest.mark.integration
+    def test_create_upstream_source(self, integration_client, unique_test_id):
+        """Test creating a new upstream source."""
+        source_name = f"test-source-{unique_test_id}"
+
+        response = integration_client.post(
+            "/api/v1/admin/upstream-sources",
+            json={
+                "name": source_name,
+                "source_type": "generic",
+                "url": "https://example.com/packages",
+                "enabled": False,
+                "is_public": False,
+                "auth_type": "none",
+                "priority": 200,
+            },
+        )
+        assert response.status_code == 201
+
+        data = response.json()
+        assert data["name"] == source_name
+        assert data["source_type"] == "generic"
+        assert data["url"] == "https://example.com/packages"
+        assert data["enabled"] is False
+        assert data["is_public"] is False
+        assert data["priority"] == 200
+        assert "id" in data
+
+        # Clean up
+        source_id = data["id"]
+        integration_client.delete(f"/api/v1/admin/upstream-sources/{source_id}")
+
+    @pytest.mark.integration
+    def test_create_upstream_source_with_auth(self, integration_client, unique_test_id):
+        """Test creating an upstream source with authentication."""
+        source_name = f"test-auth-source-{unique_test_id}"
+
+        response = integration_client.post(
+            "/api/v1/admin/upstream-sources",
+            json={
+                "name": source_name,
+                "source_type": "npm",
+                "url": "https://npm.internal.corp",
+                "enabled": False,
+                "is_public": False,
+                "auth_type": "basic",
+                "username": "reader",
+                "password": "secret123",
+                "priority": 50,
+            },
+        )
+        assert response.status_code == 201
+
+        data = response.json()
+        assert data["name"] == source_name
+        assert data["auth_type"] == "basic"
+        assert data["username"] == "reader"
+        assert data["has_password"] is True
+        # Password should NOT be in response
+        assert "password" not in data
+
+        # Clean up
+        source_id = data["id"]
+        integration_client.delete(f"/api/v1/admin/upstream-sources/{source_id}")
+
+    @pytest.mark.integration
+    def test_create_upstream_source_duplicate_name(self, integration_client, unique_test_id):
+        """Test that duplicate source names are rejected."""
+        source_name = f"test-dup-{unique_test_id}"
+
+        # Create first source
+        
response1 = integration_client.post( + "/api/v1/admin/upstream-sources", + json={ + "name": source_name, + "source_type": "generic", + "url": "https://example1.com", + }, + ) + assert response1.status_code == 201 + source_id = response1.json()["id"] + + # Try to create duplicate + response2 = integration_client.post( + "/api/v1/admin/upstream-sources", + json={ + "name": source_name, + "source_type": "generic", + "url": "https://example2.com", + }, + ) + assert response2.status_code == 409 + assert "already exists" in response2.json()["detail"] + + # Clean up + integration_client.delete(f"/api/v1/admin/upstream-sources/{source_id}") + + @pytest.mark.integration + def test_get_upstream_source(self, integration_client, unique_test_id): + """Test getting a specific upstream source.""" + source_name = f"test-get-{unique_test_id}" + + # Create source + create_response = integration_client.post( + "/api/v1/admin/upstream-sources", + json={ + "name": source_name, + "source_type": "pypi", + "url": "https://pypi.internal.corp", + }, + ) + assert create_response.status_code == 201 + source_id = create_response.json()["id"] + + # Get source + response = integration_client.get(f"/api/v1/admin/upstream-sources/{source_id}") + assert response.status_code == 200 + + data = response.json() + assert data["id"] == source_id + assert data["name"] == source_name + assert data["source_type"] == "pypi" + + # Clean up + integration_client.delete(f"/api/v1/admin/upstream-sources/{source_id}") + + @pytest.mark.integration + def test_get_upstream_source_not_found(self, integration_client): + """Test getting a non-existent upstream source.""" + fake_id = "00000000-0000-0000-0000-000000000000" + response = integration_client.get(f"/api/v1/admin/upstream-sources/{fake_id}") + assert response.status_code == 404 + + @pytest.mark.integration + def test_update_upstream_source(self, integration_client, unique_test_id): + """Test updating an upstream source.""" + source_name = f"test-update-{unique_test_id}" + + # Create source + create_response = integration_client.post( + "/api/v1/admin/upstream-sources", + json={ + "name": source_name, + "source_type": "generic", + "url": "https://example.com", + "enabled": False, + "priority": 100, + }, + ) + assert create_response.status_code == 201 + source_id = create_response.json()["id"] + + # Update source + response = integration_client.put( + f"/api/v1/admin/upstream-sources/{source_id}", + json={ + "enabled": True, + "priority": 50, + "url": "https://example-updated.com", + }, + ) + assert response.status_code == 200 + + data = response.json() + assert data["enabled"] is True + assert data["priority"] == 50 + assert data["url"] == "https://example-updated.com" + # Name should be unchanged + assert data["name"] == source_name + + # Clean up + integration_client.delete(f"/api/v1/admin/upstream-sources/{source_id}") + + @pytest.mark.integration + def test_update_upstream_source_password(self, integration_client, unique_test_id): + """Test updating an upstream source's password.""" + source_name = f"test-pwd-{unique_test_id}" + + # Create source without password + create_response = integration_client.post( + "/api/v1/admin/upstream-sources", + json={ + "name": source_name, + "source_type": "generic", + "url": "https://example.com", + "auth_type": "basic", + "username": "user", + }, + ) + assert create_response.status_code == 201 + source_id = create_response.json()["id"] + assert create_response.json()["has_password"] is False + + # Add password + response = integration_client.put( 
+ f"/api/v1/admin/upstream-sources/{source_id}", + json={"password": "newpassword"}, + ) + assert response.status_code == 200 + assert response.json()["has_password"] is True + + # Clear password with empty string + response = integration_client.put( + f"/api/v1/admin/upstream-sources/{source_id}", + json={"password": ""}, + ) + assert response.status_code == 200 + assert response.json()["has_password"] is False + + # Clean up + integration_client.delete(f"/api/v1/admin/upstream-sources/{source_id}") + + @pytest.mark.integration + def test_delete_upstream_source(self, integration_client, unique_test_id): + """Test deleting an upstream source.""" + source_name = f"test-delete-{unique_test_id}" + + # Create source + create_response = integration_client.post( + "/api/v1/admin/upstream-sources", + json={ + "name": source_name, + "source_type": "generic", + "url": "https://example.com", + }, + ) + assert create_response.status_code == 201 + source_id = create_response.json()["id"] + + # Delete source + response = integration_client.delete(f"/api/v1/admin/upstream-sources/{source_id}") + assert response.status_code == 204 + + # Verify it's gone + get_response = integration_client.get(f"/api/v1/admin/upstream-sources/{source_id}") + assert get_response.status_code == 404 + + @pytest.mark.integration + def test_test_upstream_source_connectivity(self, integration_client, unique_test_id): + """Test the connectivity test endpoint.""" + source_name = f"test-conn-{unique_test_id}" + + # Create source with a URL that should respond + create_response = integration_client.post( + "/api/v1/admin/upstream-sources", + json={ + "name": source_name, + "source_type": "generic", + "url": "https://httpbin.org/get", # Public test endpoint + "enabled": False, + }, + ) + assert create_response.status_code == 201 + source_id = create_response.json()["id"] + + # Test connectivity + response = integration_client.post( + f"/api/v1/admin/upstream-sources/{source_id}/test" + ) + assert response.status_code == 200 + + data = response.json() + assert "success" in data + assert "elapsed_ms" in data + assert data["source_id"] == source_id + assert data["source_name"] == source_name + + # Clean up + integration_client.delete(f"/api/v1/admin/upstream-sources/{source_id}") + + +# ============================================================================= +# Cache Settings Admin API Tests +# ============================================================================= + + +class TestCacheSettingsAdminAPI: + """Tests for the cache settings admin API.""" + + @pytest.mark.integration + def test_get_cache_settings_requires_admin(self): + """Test that getting cache settings requires admin access.""" + import httpx + + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with httpx.Client(base_url=base_url, timeout=30.0) as unauthenticated_client: + response = unauthenticated_client.get("/api/v1/admin/cache-settings") + assert response.status_code in (401, 403) + + @pytest.mark.integration + def test_get_cache_settings_success(self, integration_client): + """Test getting cache settings as admin.""" + response = integration_client.get("/api/v1/admin/cache-settings") + assert response.status_code == 200 + + data = response.json() + # Check expected fields exist + assert "allow_public_internet" in data + assert "auto_create_system_projects" in data + + # Check types + assert isinstance(data["allow_public_internet"], bool) + assert isinstance(data["auto_create_system_projects"], bool) + + @pytest.mark.integration + 
def test_update_cache_settings_requires_admin(self): + """Test that updating cache settings requires admin access.""" + import httpx + + base_url = os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080") + + with httpx.Client(base_url=base_url, timeout=30.0) as unauthenticated_client: + response = unauthenticated_client.put( + "/api/v1/admin/cache-settings", + json={"allow_public_internet": False}, + ) + assert response.status_code in (401, 403) + + @pytest.mark.integration + def test_update_cache_settings_success(self, integration_client): + """Test updating cache settings as admin.""" + # First get current settings to restore later + original = integration_client.get("/api/v1/admin/cache-settings").json() + + # Update settings + response = integration_client.put( + "/api/v1/admin/cache-settings", + json={ + "allow_public_internet": not original["allow_public_internet"], + "auto_create_system_projects": not original["auto_create_system_projects"], + }, + ) + assert response.status_code == 200 + + data = response.json() + assert data["allow_public_internet"] == (not original["allow_public_internet"]) + assert data["auto_create_system_projects"] == (not original["auto_create_system_projects"]) + + # Restore original settings + integration_client.put( + "/api/v1/admin/cache-settings", + json={ + "allow_public_internet": original["allow_public_internet"], + "auto_create_system_projects": original["auto_create_system_projects"], + }, + ) + + @pytest.mark.integration + def test_update_cache_settings_allow_public_internet(self, integration_client): + """Test enabling and disabling public internet access (air-gap mode).""" + # First get current settings to restore later + original = integration_client.get("/api/v1/admin/cache-settings").json() + + # Disable public internet (enable air-gap mode) + response = integration_client.put( + "/api/v1/admin/cache-settings", + json={"allow_public_internet": False}, + ) + assert response.status_code == 200 + assert response.json()["allow_public_internet"] is False + + # Enable public internet (disable air-gap mode) + response = integration_client.put( + "/api/v1/admin/cache-settings", + json={"allow_public_internet": True}, + ) + assert response.status_code == 200 + assert response.json()["allow_public_internet"] is True + + # Restore original settings + integration_client.put( + "/api/v1/admin/cache-settings", + json={"allow_public_internet": original["allow_public_internet"]}, + ) + + @pytest.mark.integration + def test_update_cache_settings_partial(self, integration_client): + """Test that partial updates only change specified fields.""" + # Get current settings + original = integration_client.get("/api/v1/admin/cache-settings").json() + + # Update only allow_public_internet + new_value = not original["allow_public_internet"] + response = integration_client.put( + "/api/v1/admin/cache-settings", + json={"allow_public_internet": new_value}, + ) + assert response.status_code == 200 + + data = response.json() + assert data["allow_public_internet"] == new_value + # Other field should be unchanged + assert data["auto_create_system_projects"] == original["auto_create_system_projects"] + + # Restore + integration_client.put( + "/api/v1/admin/cache-settings", + json={"allow_public_internet": original["allow_public_internet"]}, + ) + + @pytest.mark.integration + def test_update_cache_settings_auto_create_system_projects(self, integration_client): + """Test updating auto_create_system_projects setting.""" + # Get current settings + original = 
integration_client.get("/api/v1/admin/cache-settings").json() + + # Toggle auto_create_system_projects + new_value = not original["auto_create_system_projects"] + response = integration_client.put( + "/api/v1/admin/cache-settings", + json={"auto_create_system_projects": new_value}, + ) + assert response.status_code == 200 + assert response.json()["auto_create_system_projects"] == new_value + + # Restore + integration_client.put( + "/api/v1/admin/cache-settings", + json={"auto_create_system_projects": original["auto_create_system_projects"]}, + ) + + +# ============================================================================= +# Environment Variable Configuration Tests +# ============================================================================= + + +class TestEnvVarUpstreamSourcesParsing: + """Tests for parsing upstream sources from environment variables.""" + + def test_parse_upstream_sources_basic(self): + """Test parsing a basic upstream source from env vars.""" + from app.config import parse_upstream_sources_from_env + import os + + # Set env vars + test_env = { + "ORCHARD_UPSTREAM__TEST_SOURCE__URL": "https://example.com/packages", + "ORCHARD_UPSTREAM__TEST_SOURCE__TYPE": "generic", + "ORCHARD_UPSTREAM__TEST_SOURCE__ENABLED": "true", + } + + original_env = {} + for key in test_env: + original_env[key] = os.environ.get(key) + os.environ[key] = test_env[key] + + try: + sources = parse_upstream_sources_from_env() + assert len(sources) >= 1 + + # Find our test source + test_source = next((s for s in sources if "test" in s.name), None) + assert test_source is not None + assert test_source.url == "https://example.com/packages" + assert test_source.source_type == "generic" + assert test_source.enabled is True + assert test_source.source == "env" + finally: + # Restore original env + for key, value in original_env.items(): + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + + def test_parse_upstream_sources_with_auth(self): + """Test parsing an upstream source with authentication from env vars.""" + from app.config import parse_upstream_sources_from_env + import os + + test_env = { + "ORCHARD_UPSTREAM__AUTH_TEST__URL": "https://secure.example.com", + "ORCHARD_UPSTREAM__AUTH_TEST__TYPE": "npm", + "ORCHARD_UPSTREAM__AUTH_TEST__AUTH_TYPE": "basic", + "ORCHARD_UPSTREAM__AUTH_TEST__USERNAME": "myuser", + "ORCHARD_UPSTREAM__AUTH_TEST__PASSWORD": "secret123", + "ORCHARD_UPSTREAM__AUTH_TEST__PRIORITY": "50", + } + + original_env = {} + for key in test_env: + original_env[key] = os.environ.get(key) + os.environ[key] = test_env[key] + + try: + sources = parse_upstream_sources_from_env() + test_source = next((s for s in sources if "auth" in s.name), None) + assert test_source is not None + assert test_source.auth_type == "basic" + assert test_source.username == "myuser" + assert test_source.password == "secret123" + assert test_source.priority == 50 + finally: + for key, value in original_env.items(): + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + + def test_parse_upstream_sources_missing_url_skipped(self): + """Test that sources without URL are skipped.""" + from app.config import parse_upstream_sources_from_env + import os + + # Source without URL should be skipped + test_env = { + "ORCHARD_UPSTREAM__NO_URL__TYPE": "npm", + "ORCHARD_UPSTREAM__NO_URL__ENABLED": "true", + } + + original_env = {} + for key in test_env: + original_env[key] = os.environ.get(key) + os.environ[key] = test_env[key] + + try: + sources = 
parse_upstream_sources_from_env() + # Should not include the source without URL + no_url_source = next((s for s in sources if "no-url" in s.name), None) + assert no_url_source is None + finally: + for key, value in original_env.items(): + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + + def test_parse_upstream_sources_defaults(self): + """Test that defaults are applied for optional fields.""" + from app.config import parse_upstream_sources_from_env + import os + + test_env = { + "ORCHARD_UPSTREAM__DEFAULTS_TEST__URL": "https://example.com", + } + + original_env = {} + for key in test_env: + original_env[key] = os.environ.get(key) + os.environ[key] = test_env[key] + + try: + sources = parse_upstream_sources_from_env() + test_source = next((s for s in sources if "defaults" in s.name), None) + assert test_source is not None + # Check defaults + assert test_source.source_type == "generic" + assert test_source.enabled is True + assert test_source.is_public is True + assert test_source.auth_type == "none" + assert test_source.priority == 100 + finally: + for key, value in original_env.items(): + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + + +class TestEnvSourceToResponse: + """Tests for converting env sources to API response format.""" + + def test_env_source_to_response_format(self): + """Test that env source response has correct format.""" + from app.config import EnvUpstreamSource + + source = EnvUpstreamSource( + name="test-source", + url="https://example.com", + source_type="npm", + enabled=True, + is_public=False, + auth_type="basic", + username="user", + password="pass", + priority=50, + ) + + assert source.name == "test-source" + assert source.url == "https://example.com" + assert source.source_type == "npm" + assert source.enabled is True + assert source.is_public is False + assert source.auth_type == "basic" + assert source.username == "user" + assert source.password == "pass" + assert source.priority == 50 + assert source.source == "env" + + +class TestUpstreamSourceResponseSource: + """Tests for the source field in upstream source responses.""" + + @pytest.mark.integration + def test_db_sources_have_database_source_field(self, integration_client, unique_test_id): + """Test that database-defined sources have source='database'.""" + source_name = f"test-db-source-{unique_test_id}" + + # Create source via API (stored in DB) + response = integration_client.post( + "/api/v1/admin/upstream-sources", + json={ + "name": source_name, + "source_type": "generic", + "url": "https://example.com", + }, + ) + assert response.status_code == 201 + source_id = response.json()["id"] + + # Get source - should have source="database" + response = integration_client.get(f"/api/v1/admin/upstream-sources/{source_id}") + assert response.status_code == 200 + assert response.json()["source"] == "database" + + # List sources - should have source field + response = integration_client.get("/api/v1/admin/upstream-sources") + assert response.status_code == 200 + db_source = next((s for s in response.json() if s["id"] == source_id), None) + assert db_source is not None + assert db_source["source"] == "database" + + # Clean up + integration_client.delete(f"/api/v1/admin/upstream-sources/{source_id}") + + +class TestCacheSettingsEnvOverride: + """Tests for cache settings environment variable override fields.""" + + @pytest.mark.integration + def test_cache_settings_has_env_override_fields(self, integration_client): + """Test that cache settings 
response includes env override fields.""" + response = integration_client.get("/api/v1/admin/cache-settings") + assert response.status_code == 200 + + data = response.json() + # These fields should exist (may be null if no env override) + assert "allow_public_internet_env_override" in data + assert "auto_create_system_projects_env_override" in data diff --git a/docs/epic-upstream-caching.md b/docs/epic-upstream-caching.md new file mode 100644 index 0000000..b382cc3 --- /dev/null +++ b/docs/epic-upstream-caching.md @@ -0,0 +1,672 @@ +# Epic: Upstream Artifact Caching for Hermetic Builds + +## Overview + +Orchard will act as a permanent, content-addressable cache for upstream artifacts (npm, PyPI, Maven, Docker, etc.). Once an artifact is cached, it is stored forever by SHA256 hash - enabling reproducible builds years later regardless of whether the upstream source still exists. + +## Problem Statement + +Build reproducibility is critical for enterprise environments: +- Packages get deleted, yanked, or modified upstream +- Registries go down or change URLs +- Version constraints resolve differently over time +- Air-gapped environments cannot access public internet + +Teams need to guarantee that a build from 5 years ago produces the exact same output today. + +## Solution + +Orchard becomes "the cache that never forgets": + +1. **Fetch once, store forever** - When a build needs `lodash@4.17.21`, Orchard fetches it from npm, stores it by SHA256 hash, and never deletes it +2. **Content-addressable** - Same hash = same bytes, guaranteed +3. **Format-agnostic** - Orchard doesn't need to understand npm/PyPI/Maven protocols; the client provides the URL, Orchard fetches and stores +4. **Air-gap support** - Disable public internet entirely, only allow configured private upstreams + +## User Workflow + +``` +1. Build tool resolves dependencies npm install / pip install / mvn resolve + ↓ +2. Generate lockfile with URLs package-lock.json / requirements.txt + ↓ +3. Cache all URLs in Orchard orchard cache --file urls.txt + ↓ +4. Pin by SHA256 hash lodash = "sha256:abc123..." + ↓ +5. Future builds fetch by hash Always get exact same bytes +``` + +## Key Features + +- **Multiple upstream sources** - Configure npm, PyPI, Maven Central, private Artifactory, etc. +- **Per-source authentication** - Basic auth, bearer tokens, API keys +- **System cache projects** - `_npm`, `_pypi`, `_maven` organize cached packages by format +- **Cross-referencing** - Link cached artifacts to user projects for visibility +- **URL tracking** - Know which URLs map to which hashes, audit provenance +- **Air-gap mode** - Global kill switch for all public internet access +- **Environment variable config** - 12-factor friendly for containerized deployments + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Orchard Server │ +├─────────────────────────────────────────────────────────────────┤ +│ POST /api/v1/cache │ +│ ├── Check if URL already cached (url_hash lookup) │ +│ ├── Match URL to upstream source (get auth) │ +│ ├── Fetch via UpstreamClient (stream + compute SHA256) │ +│ ├── Store artifact in S3 (content-addressable) │ +│ ├── Create tag in system project (_npm/lodash:4.17.21) │ +│ ├── Optionally create tag in user project │ +│ └── Record in cached_urls table (provenance) │ +├─────────────────────────────────────────────────────────────────┤ +│ Tables │ +│ ├── upstream_sources (npm-public, pypi-public, artifactory) │ +│ ├── cache_settings (allow_public_internet, etc.) 
│ +│ ├── cached_urls (url → artifact_id mapping) │ +│ └── projects.is_system (for _npm, _pypi, etc.) │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Issues Summary + +| Issue | Title | Status | Dependencies | +|-------|-------|--------|--------------| +| #68 | Schema: Upstream Sources & Cache Tracking | ✅ Complete | None | +| #69 | HTTP Client: Generic URL Fetcher | Pending | None | +| #70 | Cache API Endpoint | Pending | #68, #69 | +| #71 | System Projects (Cache Namespaces) | Pending | #68, #70 | +| #72 | Upstream Sources Admin API | Pending | #68 | +| #73 | Global Cache Settings API | Pending | #68 | +| #74 | Environment Variable Overrides | Pending | #68, #72, #73 | +| #75 | Frontend: Upstream Sources Management | Pending | #72, #73 | +| #105 | Frontend: System Projects Integration | Pending | #71 | +| #77 | CLI: Cache Command | Pending | #70 | + +## Implementation Phases + +**Phase 1 - Core (MVP):** +- #68 Schema ✅ +- #69 HTTP Client +- #70 Cache API +- #71 System Projects + +**Phase 2 - Admin:** +- #72 Upstream Sources API +- #73 Cache Settings API +- #74 Environment Variables + +**Phase 3 - Frontend:** +- #75 Upstream Sources UI +- #105 System Projects UI + +**Phase 4 - CLI:** +- #77 Cache Command + +--- + +# Issue #68: Schema - Upstream Sources & Cache Tracking + +**Status: ✅ Complete** + +## Description + +Create database schema for flexible multi-source upstream configuration and URL-to-artifact tracking. This replaces the previous singleton proxy_config design with a more flexible model supporting multiple upstream sources, air-gap mode, and provenance tracking. + +## Acceptance Criteria + +- [x] `upstream_sources` table: + - id (UUID, primary key) + - name (VARCHAR(255), unique, e.g., "npm-public", "artifactory-private") + - source_type (VARCHAR(50), enum: npm, pypi, maven, docker, helm, nuget, deb, rpm, generic) + - url (VARCHAR(2048), base URL of upstream) + - enabled (BOOLEAN, default false) + - is_public (BOOLEAN, true if this is a public internet source) + - auth_type (VARCHAR(20), enum: none, basic, bearer, api_key) + - username (VARCHAR(255), nullable) + - password_encrypted (BYTEA, nullable, Fernet encrypted) + - headers_encrypted (BYTEA, nullable, for custom headers like API keys) + - priority (INTEGER, default 100, lower = checked first) + - created_at, updated_at timestamps +- [x] `cache_settings` table (singleton, id always 1): + - id (INTEGER, primary key, check id = 1) + - allow_public_internet (BOOLEAN, default true, air-gap kill switch) + - auto_create_system_projects (BOOLEAN, default true) + - created_at, updated_at timestamps +- [x] `cached_urls` table: + - id (UUID, primary key) + - url (VARCHAR(4096), original URL fetched) + - url_hash (VARCHAR(64), SHA256 of URL for fast lookup, indexed) + - artifact_id (VARCHAR(64), FK to artifacts) + - source_id (UUID, FK to upstream_sources, nullable for manual imports) + - fetched_at (TIMESTAMP WITH TIME ZONE) + - response_headers (JSONB, original upstream headers for provenance) + - created_at timestamp +- [x] Add `is_system` BOOLEAN column to projects table (default false) +- [x] Migration SQL file in migrations/ +- [x] Runtime migration in database.py +- [x] SQLAlchemy models for all new tables +- [x] Pydantic schemas for API input/output (passwords write-only) +- [x] Encryption helpers for password/headers fields +- [x] Seed default upstream sources (disabled by default): + - npm-public: https://registry.npmjs.org + - pypi-public: https://pypi.org/simple + - maven-central: 
https://repo1.maven.org/maven2 + - docker-hub: https://registry-1.docker.io +- [x] Unit tests for models and schemas + +## Files Modified + +- `migrations/010_upstream_caching.sql` +- `backend/app/database.py` (migrations 016-020) +- `backend/app/models.py` (UpstreamSource, CacheSettings, CachedUrl, Project.is_system) +- `backend/app/schemas.py` (all caching schemas) +- `backend/app/encryption.py` (renamed env var) +- `backend/app/config.py` (renamed setting) +- `backend/tests/test_upstream_caching.py` (37 tests) +- `frontend/src/components/Layout.tsx` (footer tagline) +- `CHANGELOG.md` + +--- + +# Issue #69: HTTP Client - Generic URL Fetcher + +**Status: Pending** + +## Description + +Create a reusable HTTP client for fetching artifacts from upstream sources. Supports multiple auth methods, streaming for large files, and computes SHA256 while downloading. + +## Acceptance Criteria + +- [ ] `UpstreamClient` class in `backend/app/upstream.py` +- [ ] `fetch(url)` method that: + - Streams response body (doesn't load large files into memory) + - Computes SHA256 hash while streaming + - Returns file content, hash, size, and response headers +- [ ] Auth support based on upstream source configuration: + - None (anonymous) + - Basic auth (username/password) + - Bearer token (Authorization: Bearer {token}) + - API key (custom header name/value) +- [ ] URL-to-source matching: + - Match URL to configured upstream source by URL prefix + - Apply auth from matched source + - Respect source priority for multiple matches +- [ ] Configuration options: + - Timeout (connect and read, default 30s/300s) + - Max retries (default 3) + - Follow redirects (default true, max 5) + - Max file size (reject if Content-Length exceeds limit) +- [ ] Respect `allow_public_internet` setting: + - If false, reject URLs matching `is_public=true` sources + - If false, reject URLs not matching any configured source +- [ ] Capture response headers for provenance tracking +- [ ] Proper error handling: + - Connection errors (retry with backoff) + - HTTP errors (4xx, 5xx) + - Timeout errors + - SSL/TLS errors +- [ ] Logging for debugging (URL, source matched, status, timing) +- [ ] Unit tests with mocked HTTP responses +- [ ] Integration tests against httpbin.org or similar (optional, marked) + +## Technical Notes + +- Use `httpx` for async HTTP support (already in requirements) +- Stream to temp file to avoid memory issues with large artifacts +- Consider checksum verification if upstream provides it (e.g., npm provides shasum) + +--- + +# Issue #70: Cache API Endpoint + +**Status: Pending** + +## Description + +API endpoint to cache an artifact from an upstream URL. This is the core endpoint that fetches from upstream, stores in Orchard, and creates appropriate tags. + +## Acceptance Criteria + +- [ ] `POST /api/v1/cache` endpoint +- [ ] Request body: + ```json + { + "url": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", + "source_type": "npm", + "package_name": "lodash", + "tag": "4.17.21", + "user_project": "my-app", + "user_package": "npm-deps", + "user_tag": "lodash-4.17.21", + "expected_hash": "sha256:abc123..." + } + ``` + - `url` (required): URL to fetch + - `source_type` (required): Determines system project (_npm, _pypi, etc.) 
+ - `package_name` (optional): Package name in system project, derived from URL if not provided + - `tag` (optional): Tag name in system project, derived from URL if not provided + - `user_project`, `user_package`, `user_tag` (optional): Cross-reference in user's project + - `expected_hash` (optional): Verify downloaded content matches +- [ ] Response: + ```json + { + "artifact_id": "abc123...", + "sha256": "abc123...", + "size": 12345, + "content_type": "application/gzip", + "already_cached": false, + "source_url": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", + "source_name": "npm-public", + "system_project": "_npm", + "system_package": "lodash", + "system_tag": "4.17.21", + "user_reference": "my-app/npm-deps:lodash-4.17.21" + } + ``` +- [ ] Behavior: + - Check if URL already cached (by url_hash in cached_urls) + - If cached: return existing artifact, optionally create user tag + - If not cached: fetch via UpstreamClient, store artifact, create tags + - Create/get system project if needed (e.g., `_npm`) + - Create package in system project (e.g., `_npm/lodash`) + - Create tag in system project (e.g., `_npm/lodash:4.17.21`) + - If user reference provided, create tag in user's project + - Record in cached_urls table with provenance +- [ ] Error handling: + - 400: Invalid request (bad URL format, missing required fields) + - 403: Air-gap mode enabled and URL is from public source + - 404: Upstream returned 404 + - 409: Hash mismatch (if expected_hash provided) + - 502: Upstream fetch failed (connection error, timeout) + - 503: Upstream source disabled +- [ ] Authentication required (any authenticated user can cache) +- [ ] Audit logging for cache operations +- [ ] Integration tests covering success and error cases + +## Technical Notes + +- URL parsing for package_name/tag derivation is format-specific: + - npm: `/{package}/-/{package}-{version}.tgz` → package=lodash, tag=4.17.21 + - pypi: `/packages/.../requests-2.28.0.tar.gz` → package=requests, tag=2.28.0 + - maven: `/{group}/{artifact}/{version}/{artifact}-{version}.jar` +- Deduplication: if same SHA256 already exists, just create new tag pointing to it + +--- + +# Issue #71: System Projects (Cache Namespaces) + +**Status: Pending** + +## Description + +Implement auto-created system projects for organizing cached artifacts by format type. These are special projects that provide a browsable namespace for all cached upstream packages. 
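+
+A minimal sketch of the `get_or_create_system_project` helper called out in the
+acceptance criteria below (assuming the SQLAlchemy `Project` model from Issue #68;
+session handling and exact field names are illustrative, not the final implementation):
+
+```python
+# Sketch: system cache namespace helper. `Project` is the SQLAlchemy model
+# described in Issue #68; `session` is an open SQLAlchemy session.
+SYSTEM_SOURCE_TYPES = {"npm", "pypi", "maven", "docker", "helm", "nuget", "deb", "rpm", "generic"}
+
+def get_or_create_system_project(session, source_type: str) -> "Project":
+    if source_type not in SYSTEM_SOURCE_TYPES:
+        raise ValueError(f"unsupported source type: {source_type}")
+    name = f"_{source_type}"
+    project = session.query(Project).filter_by(name=name, is_system=True).first()
+    if project is None:
+        project = Project(
+            name=name,
+            is_system=True,
+            is_public=True,  # readable by all authenticated users
+            description=f"System cache for {source_type} packages",
+            created_by="system",
+        )
+        session.add(project)
+        session.commit()
+    return project
+```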
+ +## Acceptance Criteria + +- [ ] System project names: `_npm`, `_pypi`, `_maven`, `_docker`, `_helm`, `_nuget`, `_deb`, `_rpm`, `_generic` +- [ ] Auto-creation: + - Created automatically on first cache request for that format + - Created by cache endpoint, not at startup + - Uses system user as creator (`created_by = "system"`) +- [ ] System project properties: + - `is_system = true` + - `is_public = true` (readable by all authenticated users) + - `description` = "System cache for {format} packages" +- [ ] Restrictions: + - Cannot be deleted (return 403 with message) + - Cannot be renamed + - Cannot change `is_public` to false + - Only admins can modify description +- [ ] Helper function: `get_or_create_system_project(source_type)` in routes.py or new cache.py module +- [ ] Update project deletion endpoint to check `is_system` flag +- [ ] Update project update endpoint to enforce restrictions +- [ ] Query helper: list all system projects for UI dropdown +- [ ] Unit tests for restrictions +- [ ] Integration tests for auto-creation and restrictions + +## Technical Notes + +- System projects are identified by `is_system=true`, not just naming convention +- The `_` prefix is a convention for display purposes +- Packages within system projects follow upstream naming (e.g., `_npm/lodash`, `_npm/@types/node`) + +--- + +# Issue #72: Upstream Sources Admin API + +**Status: Pending** + +## Description + +CRUD API endpoints for managing upstream sources configuration. Admin-only access. + +## Acceptance Criteria + +- [ ] `GET /api/v1/admin/upstream-sources` - List all upstream sources + - Returns array of sources with id, name, source_type, url, enabled, is_public, auth_type, priority, has_credentials, created_at, updated_at + - Supports `?enabled=true/false` filter + - Supports `?source_type=npm,pypi` filter + - Passwords/tokens never returned +- [ ] `POST /api/v1/admin/upstream-sources` - Create upstream source + - Request: name, source_type, url, enabled, is_public, auth_type, username, password, headers, priority + - Validates unique name + - Validates URL format + - Encrypts password/headers before storage + - Returns created source (without secrets) +- [ ] `GET /api/v1/admin/upstream-sources/{id}` - Get source details + - Returns source with `has_credentials` boolean, not actual credentials +- [ ] `PUT /api/v1/admin/upstream-sources/{id}` - Update source + - Partial update supported + - If password provided, re-encrypt; if omitted, keep existing + - Special value `password: null` clears credentials +- [ ] `DELETE /api/v1/admin/upstream-sources/{id}` - Delete source + - Returns 400 if source has cached_urls referencing it (optional: cascade or reassign) +- [ ] `POST /api/v1/admin/upstream-sources/{id}/test` - Test connectivity + - Attempts HEAD request to source URL + - Returns success/failure with status code and timing + - Does not cache anything +- [ ] All endpoints require admin role +- [ ] Audit logging for all mutations +- [ ] Pydantic schemas: UpstreamSourceCreate, UpstreamSourceUpdate, UpstreamSourceResponse +- [ ] Integration tests for all endpoints + +## Technical Notes + +- Test endpoint should respect auth configuration to verify credentials work +- Consider adding `last_used_at` and `last_error` fields for observability (future enhancement) + +--- + +# Issue #73: Global Cache Settings API + +**Status: Pending** + +## Description + +API endpoints for managing global cache settings including air-gap mode. 
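+
+A hedged sketch of the enforcement these settings enable on the fetch path
+(field names follow the Issue #68 schema; the function itself is illustrative):
+
+```python
+# Sketch: the check UpstreamClient would run before any fetch (see Issue #69).
+def check_url_allowed(url: str, settings, sources) -> None:
+    """Raise if air-gap mode forbids fetching this URL."""
+    matched = next(
+        (s for s in sorted(sources, key=lambda s: s.priority)
+         if s.enabled and url.startswith(s.url)),
+        None,
+    )
+    if settings.allow_public_internet:
+        return  # open mode: unmatched URLs are fetched anonymously
+    # Air-gap mode: only non-public, explicitly configured sources are allowed.
+    if matched is None or matched.is_public:
+        raise PermissionError(f"air-gap mode: refusing to fetch {url}")
+```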
+ +## Acceptance Criteria + +- [ ] `GET /api/v1/admin/cache-settings` - Get current settings + - Returns: allow_public_internet, auto_create_system_projects, created_at, updated_at +- [ ] `PUT /api/v1/admin/cache-settings` - Update settings + - Partial update supported + - Returns updated settings +- [ ] Settings fields: + - `allow_public_internet` (boolean): When false, blocks all requests to sources marked `is_public=true` + - `auto_create_system_projects` (boolean): When false, system projects must be created manually +- [ ] Admin-only access +- [ ] Audit logging for changes (especially air-gap mode changes) +- [ ] Pydantic schemas: CacheSettingsResponse, CacheSettingsUpdate +- [ ] Initialize singleton row on first access if not exists +- [ ] Integration tests + +## Technical Notes + +- Air-gap mode change should be logged prominently (security-relevant) +- Consider requiring confirmation header for disabling air-gap mode (similar to factory reset) + +--- + +# Issue #74: Environment Variable Overrides + +**Status: Pending** + +## Description + +Allow cache and upstream configuration via environment variables for containerized deployments. Environment variables override database settings following 12-factor app principles. + +## Acceptance Criteria + +- [ ] Global settings overrides: + - `ORCHARD_CACHE_ALLOW_PUBLIC_INTERNET=true/false` + - `ORCHARD_CACHE_AUTO_CREATE_SYSTEM_PROJECTS=true/false` + - `ORCHARD_CACHE_ENCRYPTION_KEY` (Fernet key for credential encryption) +- [ ] Upstream source definition via env vars: + - `ORCHARD_UPSTREAM__{NAME}__URL` (double underscore as separator) + - `ORCHARD_UPSTREAM__{NAME}__TYPE` (npm, pypi, maven, etc.) + - `ORCHARD_UPSTREAM__{NAME}__ENABLED` (true/false) + - `ORCHARD_UPSTREAM__{NAME}__IS_PUBLIC` (true/false) + - `ORCHARD_UPSTREAM__{NAME}__AUTH_TYPE` (none, basic, bearer, api_key) + - `ORCHARD_UPSTREAM__{NAME}__USERNAME` + - `ORCHARD_UPSTREAM__{NAME}__PASSWORD` + - `ORCHARD_UPSTREAM__{NAME}__PRIORITY` + - Example: `ORCHARD_UPSTREAM__NPM_PRIVATE__URL=https://npm.corp.com` +- [ ] Env var sources: + - Loaded at startup + - Merged with database sources + - Env var sources have `source = "env"` marker + - Cannot be modified via API (return 400) + - Cannot be deleted via API (return 400) +- [ ] Update Settings class in config.py +- [ ] Update get/list endpoints to include env-defined sources +- [ ] Document all env vars in CLAUDE.md +- [ ] Unit tests for env var parsing +- [ ] Integration tests with env vars set + +## Technical Notes + +- Double underscore (`__`) separator allows source names with single underscores +- Env-defined sources should appear in API responses but marked as read-only +- Consider startup validation that warns about invalid env var combinations + +--- + +# Issue #75: Frontend - Upstream Sources Management + +**Status: Pending** + +## Description + +Admin UI for managing upstream sources and cache settings. 
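+
+For reference, the UI wraps the admin endpoints from Issues #72 and #73; the same
+operations scripted from Python (a sketch only - host, token, and payload values
+are placeholders):
+
+```python
+import requests
+
+BASE = "https://orchard.example.com/api/v1"
+s = requests.Session()
+s.headers["Authorization"] = "Bearer <admin-token>"  # admin role required
+
+# Register a private npm mirror (Issue #72)...
+src = s.post(f"{BASE}/admin/upstream-sources", json={
+    "name": "npm-private",
+    "source_type": "npm",
+    "url": "https://npm.corp.example.com",
+    "enabled": True,
+    "is_public": False,
+    "auth_type": "basic",
+    "username": "ci-bot",
+    "password": "s3cret",  # encrypted at rest; never returned by the API
+}).json()
+
+# ...verify connectivity, then enable air-gap mode (Issue #73).
+print(s.post(f"{BASE}/admin/upstream-sources/{src['id']}/test").json())
+resp = s.put(f"{BASE}/admin/cache-settings", json={"allow_public_internet": False})
+print(resp.json()["allow_public_internet"])  # -> False
+```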
+
+## Acceptance Criteria
+
+- [ ] New admin page: `/admin/cache` or `/admin/upstream-sources`
+- [ ] Upstream sources section:
+  - Table listing all sources with: name, type, URL, enabled toggle, public badge, priority, actions
+  - Visual distinction for env-defined sources (locked icon, no edit/delete)
+  - Create button opens modal/form
+  - Edit button for DB-defined sources
+  - Delete with confirmation modal
+  - Test connection button with status indicator
+- [ ] Create/edit form fields:
+  - Name (text, required)
+  - Source type (dropdown)
+  - URL (text, required)
+  - Priority (number)
+  - Is public (checkbox)
+  - Enabled (checkbox)
+  - Auth type (dropdown: none, basic, bearer, api_key)
+  - Conditional auth fields based on type:
+    - Basic: username, password
+    - Bearer: token
+    - API key: header name, header value
+  - Password fields masked, "unchanged" placeholder on edit
+- [ ] Cache settings section:
+  - Air-gap mode toggle with warning
+  - Auto-create system projects toggle
+  - "Air-gap mode" shows prominent warning banner when enabled
+- [ ] Link from main admin navigation
+- [ ] Loading and error states
+- [ ] Success/error toast notifications
+
+## Technical Notes
+
+- Use existing admin page patterns from user management
+- Air-gap toggle should require confirmation (modal with warning text)
+
+---
+
+# Issue #105: Frontend - System Projects Integration
+
+**Status: Pending**
+
+## Description
+
+Integrate system projects into the frontend UI with appropriate visual treatment and navigation.
+
+## Acceptance Criteria
+
+- [ ] Home page project dropdown:
+  - System projects shown in separate "Cached Packages" section
+  - Visual distinction (icon, different background, or badge)
+  - Format icon for each type (npm, pypi, maven, etc.)
+- [ ] Project list/grid:
+  - System projects can be filtered: "Show system projects" toggle
+  - Or separate tab: "Projects" | "Package Cache"
+- [ ] System project page:
+  - "System Cache" badge in header
+  - Description explains this is auto-managed cache
+  - Settings/delete buttons hidden or disabled
+  - Shows format type prominently
+- [ ] Package page within system project:
+  - Shows "Cached from" with source URL (linked)
+  - Shows "First cached" timestamp
+  - Shows which upstream source provided it
+- [ ] Artifact page:
+  - If artifact came from cache, show provenance:
+    - Original URL
+    - Upstream source name
+    - Fetch timestamp
+- [ ] Search includes system projects (with filter option)
+
+## Technical Notes
+
+- Use React context or query params for system project filtering
+- Consider dedicated route: `/cache/npm/lodash` as alias for `/_npm/lodash`
+
+---
+
+# Issue #77: CLI - Cache Command
+
+**Status: Pending**
+
+## Description
+
+Add a new `orchard cache` command to the existing CLI for caching artifacts from upstream URLs. This integrates with the new cache API endpoint and can optionally update `orchard.ensure` with cached artifacts.
+
+## Acceptance Criteria
+
+- [ ] New command: `orchard cache <URL>` in `orchard/commands/cache.py`
+- [ ] Basic usage:
+  ```bash
+  # Cache a URL, print artifact info
+  orchard cache https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz
+
+  # Output:
+  # Caching https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz...
+  # Source type: npm
+  # Package: lodash
+  # Version: 4.17.21
+  #
+  # Successfully cached artifact
+  # Artifact ID: abc123...
+ # Size: 1.2 MB + # System project: _npm + # System package: lodash + # System tag: 4.17.21 + ``` +- [ ] Options: + | Option | Description | + |--------|-------------| + | `--type, -t TYPE` | Source type: npm, pypi, maven, docker, helm, generic (auto-detected from URL if not provided) | + | `--package, -p NAME` | Package name in system project (auto-derived from URL if not provided) | + | `--tag TAG` | Tag name in system project (auto-derived from URL if not provided) | + | `--project PROJECT` | Also create tag in this user project | + | `--user-package PKG` | Package name in user project (required if --project specified) | + | `--user-tag TAG` | Tag name in user project (default: same as system tag) | + | `--expected-hash HASH` | Verify downloaded content matches this SHA256 | + | `--add` | Add to orchard.ensure after caching | + | `--add-path PATH` | Extraction path for --add (default: `/`) | + | `--file, -f FILE` | Path to orchard.ensure file | + | `--verbose, -v` | Show detailed output | +- [ ] URL type auto-detection: + - `registry.npmjs.org` → npm + - `pypi.org` or `files.pythonhosted.org` → pypi + - `repo1.maven.org` or contains `/maven2/` → maven + - `registry-1.docker.io` or `docker.io` → docker + - Otherwise → generic +- [ ] Package/version extraction from URL patterns: + - npm: `/{package}/-/{package}-{version}.tgz` + - pypi: `/packages/.../requests-{version}.tar.gz` + - maven: `/{group}/{artifact}/{version}/{artifact}-{version}.jar` +- [ ] Add `cache_artifact()` function to `orchard/api.py` +- [ ] Integration with `--add` flag: + - Parse existing orchard.ensure + - Add new dependency entry pointing to cached artifact + - Use artifact_id (SHA256) for hermetic pinning +- [ ] Batch mode: `orchard cache --file urls.txt` + - One URL per line + - Lines starting with `#` are comments + - Report success/failure for each +- [ ] Exit codes: + - 0: Success (or already cached) + - 1: Fetch failed + - 2: Hash mismatch + - 3: Air-gap mode blocked request +- [ ] Error handling consistent with existing CLI patterns +- [ ] Unit tests in `test/test_cache.py` +- [ ] Update README.md with cache command documentation + +## Technical Notes + +- Follow existing Click patterns from other commands +- Use `get_auth_headers()` from `orchard/auth.py` +- URL parsing can use `urllib.parse` +- Consider adding URL pattern registry for extensibility +- The `--add` flag should integrate with existing ensure file parsing in `orchard/ensure.py` + +## Example Workflows + +```bash +# Simple: cache a single URL +orchard cache https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz + +# Cache and add to orchard.ensure for current project +orchard cache https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz \ + --add --add-path libs/lodash/ + +# Cache with explicit metadata +orchard cache https://internal.corp/files/custom-lib.tar.gz \ + --type generic \ + --package custom-lib \ + --tag v1.0.0 + +# Cache and cross-reference to user project +orchard cache https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz \ + --project my-app \ + --user-package npm-deps \ + --user-tag lodash-4.17.21 + +# Batch cache from file +orchard cache --file deps-urls.txt + +# Verify hash while caching +orchard cache https://example.com/file.tar.gz \ + --expected-hash sha256:abc123... 
+  ```
+
+---
+
+## Out of Scope (Future Enhancements)
+
+- Automatic transitive dependency resolution (client's responsibility)
+- Lockfile parsing (`package-lock.json`, `requirements.txt`) - stretch goal for CLI
+- Cache eviction policies (we cache forever by design)
+- Mirroring/sync between Orchard instances
+- Format-specific metadata extraction (npm package.json parsing, etc.)
+
+## Success Criteria
+
+- [ ] Can cache any URL and retrieve by SHA256 hash
+- [ ] Cached artifacts persist indefinitely
+- [ ] Air-gap mode blocks all public internet access
+- [ ] Multiple upstream sources with different auth
+- [ ] System projects organize cached packages by format
+- [ ] CLI can cache URLs and update orchard.ensure
+- [ ] Admin UI for upstream source management
diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx
index 0b66fdf..7a76ba3 100644
--- a/frontend/src/App.tsx
+++ b/frontend/src/App.tsx
@@ -11,6 +11,7 @@ import ChangePasswordPage from './pages/ChangePasswordPage';
 import APIKeysPage from './pages/APIKeysPage';
 import AdminUsersPage from './pages/AdminUsersPage';
 import AdminOIDCPage from './pages/AdminOIDCPage';
+import AdminCachePage from './pages/AdminCachePage';
 import ProjectSettingsPage from './pages/ProjectSettingsPage';
 import TeamsPage from './pages/TeamsPage';
 import TeamDashboardPage from './pages/TeamDashboardPage';
@@ -50,6 +51,7 @@ function AppRoutes() {
       } />
       } />
       } />
+      <Route path="/admin/cache" element={<AdminCachePage />} />
       } />
       } />
       } />
diff --git a/frontend/src/api.ts b/frontend/src/api.ts
index d8a0141..844a9f7 100644
--- a/frontend/src/api.ts
+++ b/frontend/src/api.ts
@@ -42,6 +42,12 @@ import {
   TeamUpdate,
   TeamMemberCreate,
   TeamMemberUpdate,
+  UpstreamSource,
+  UpstreamSourceCreate,
+  UpstreamSourceUpdate,
+  UpstreamSourceTestResult,
+  CacheSettings,
+  CacheSettingsUpdate,
 } from './types';
 
 const API_BASE = '/api/v1';
@@ -682,3 +688,81 @@ export async function searchUsers(query: string, limit: number = 10): Promise
   return handleResponse(response);
 }
+
+// Upstream Sources Admin API
+export interface UpstreamSourceListParams {
+  enabled?: boolean;
+  source_type?: string;
+}
+
+export async function listUpstreamSources(params: UpstreamSourceListParams = {}): Promise<UpstreamSource[]> {
+  const query = buildQueryString(params as Record<string, unknown>);
+  const response = await fetch(`${API_BASE}/admin/upstream-sources${query}`, {
+    credentials: 'include',
+  });
+  return handleResponse(response);
+}
+
+export async function createUpstreamSource(data: UpstreamSourceCreate): Promise<UpstreamSource> {
+  const response = await fetch(`${API_BASE}/admin/upstream-sources`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(data),
+    credentials: 'include',
+  });
+  return handleResponse(response);
+}
+
+export async function getUpstreamSource(id: string): Promise<UpstreamSource> {
+  const response = await fetch(`${API_BASE}/admin/upstream-sources/${id}`, {
+    credentials: 'include',
+  });
+  return handleResponse(response);
+}
+
+export async function updateUpstreamSource(id: string, data: UpstreamSourceUpdate): Promise<UpstreamSource> {
+  const response = await fetch(`${API_BASE}/admin/upstream-sources/${id}`, {
+    method: 'PUT',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(data),
+    credentials: 'include',
+  });
+  return handleResponse(response);
+}
+
+export async function deleteUpstreamSource(id: string): Promise<void> {
+  const response = await fetch(`${API_BASE}/admin/upstream-sources/${id}`, {
+    method: 'DELETE',
+    credentials: 'include',
+  });
+  if (!response.ok) {
+    const error = await response.json().catch(() => ({ detail: 'Unknown error' }));
+    throw new ApiError(error.detail || `HTTP ${response.status}`, response.status);
+  }
+}
+
+export async function testUpstreamSource(id: string): Promise<UpstreamSourceTestResult> {
+  const response = await fetch(`${API_BASE}/admin/upstream-sources/${id}/test`, {
+    method: 'POST',
+    credentials: 'include',
+  });
+  return handleResponse(response);
+}
+
+// Cache Settings Admin API
+export async function getCacheSettings(): Promise<CacheSettings> {
+  const response = await fetch(`${API_BASE}/admin/cache-settings`, {
+    credentials: 'include',
+  });
+  return handleResponse(response);
+}
+
+export async function updateCacheSettings(data: CacheSettingsUpdate): Promise<CacheSettings> {
+  const response = await fetch(`${API_BASE}/admin/cache-settings`, {
+    method: 'PUT',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(data),
+    credentials: 'include',
+  });
+  return handleResponse(response);
+}
diff --git a/frontend/src/components/Layout.tsx b/frontend/src/components/Layout.tsx
index 9e2e559..f7c55a3 100644
--- a/frontend/src/components/Layout.tsx
+++ b/frontend/src/components/Layout.tsx
@@ -183,6 +183,18 @@ function Layout({ children }: LayoutProps) {
               SSO Configuration
+            <Link to="/admin/cache" onClick={() => setShowUserMenu(false)}>
+              Cache Management
+            </Link>
           )}
@@ -229,7 +241,7 @@
           Orchard
           ·
-          Content-Addressable Storage
+          The cache that never forgets
Documentation diff --git a/frontend/src/pages/AdminCachePage.css b/frontend/src/pages/AdminCachePage.css new file mode 100644 index 0000000..15d5113 --- /dev/null +++ b/frontend/src/pages/AdminCachePage.css @@ -0,0 +1,372 @@ +.admin-cache-page { + padding: 2rem; + max-width: 1400px; + margin: 0 auto; +} + +.admin-cache-page h1 { + margin-bottom: 2rem; + color: var(--text-primary); +} + +.admin-cache-page h2 { + margin-bottom: 1rem; + color: var(--text-primary); + font-size: 1.25rem; +} + +/* Success/Error Messages */ +.success-message { + padding: 0.75rem 1rem; + background-color: #d4edda; + border: 1px solid #c3e6cb; + border-radius: 4px; + color: #155724; + margin-bottom: 1rem; +} + +.error-message { + padding: 0.75rem 1rem; + background-color: #f8d7da; + border: 1px solid #f5c6cb; + border-radius: 4px; + color: #721c24; + margin-bottom: 1rem; +} + +/* Settings Section */ +.settings-section { + background: var(--bg-secondary); + border: 1px solid var(--border-color); + border-radius: 8px; + padding: 1.5rem; + margin-bottom: 2rem; +} + +.settings-grid { + display: flex; + flex-direction: column; + gap: 1rem; +} + +.setting-item { + display: flex; + justify-content: space-between; + align-items: center; + padding: 1rem; + background: var(--bg-primary); + border: 1px solid var(--border-color); + border-radius: 4px; +} + +.toggle-label { + display: flex; + flex-direction: column; + gap: 0.25rem; +} + +.setting-name { + font-weight: 500; + color: var(--text-primary); + display: flex; + align-items: center; + gap: 0.5rem; +} + +.setting-description { + font-size: 0.85rem; + color: var(--text-secondary); +} + +.toggle-button { + padding: 0.5rem 1rem; + border: none; + border-radius: 4px; + cursor: pointer; + font-weight: 500; + min-width: 100px; +} + +.toggle-button.on { + background-color: #28a745; + color: white; +} + +.toggle-button.off { + background-color: #dc3545; + color: white; +} + +.toggle-button:disabled { + opacity: 0.6; + cursor: not-allowed; +} + +/* Sources Section */ +.sources-section { + background: var(--bg-secondary); + border: 1px solid var(--border-color); + border-radius: 8px; + padding: 1.5rem; +} + +.section-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 1rem; +} + +.section-header h2 { + margin: 0; +} + +/* Sources Table */ +.sources-table { + width: 100%; + border-collapse: collapse; + background: var(--bg-primary); + border-radius: 4px; + overflow: hidden; +} + +.sources-table th, +.sources-table td { + padding: 0.75rem 1rem; + text-align: left; + border-bottom: 1px solid var(--border-color); +} + +.sources-table th { + background: var(--bg-tertiary); + font-weight: 600; + color: var(--text-secondary); + font-size: 0.85rem; + text-transform: uppercase; +} + +.sources-table tr:last-child td { + border-bottom: none; +} + +.sources-table tr.disabled-row { + opacity: 0.6; +} + +.source-name { + font-weight: 500; + color: var(--text-primary); +} + +.url-cell { + font-family: monospace; + font-size: 0.9rem; + max-width: 300px; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +/* Badges */ +.public-badge, +.env-badge, +.status-badge { + display: inline-block; + padding: 0.2rem 0.5rem; + border-radius: 4px; + font-size: 0.75rem; + font-weight: 500; + margin-left: 0.5rem; +} + +.public-badge { + background-color: #e3f2fd; + color: #1976d2; +} + +.env-badge { + background-color: #fff3e0; + color: #e65100; +} + +.status-badge.enabled { + background-color: #e8f5e9; + color: #2e7d32; +} + 
+.status-badge.disabled { + background-color: #ffebee; + color: #c62828; +} + +/* Actions */ +.actions-cell { + white-space: nowrap; +} + +.actions-cell .btn { + margin-right: 0.5rem; +} + +.actions-cell .btn:last-child { + margin-right: 0; +} + +.test-result { + display: inline-block; + margin-left: 0.5rem; + font-size: 0.85rem; +} + +.test-result.success { + color: #2e7d32; +} + +.test-result.failure { + color: #c62828; +} + +/* Buttons */ +.btn { + padding: 0.5rem 1rem; + border: 1px solid var(--border-color); + border-radius: 4px; + background: var(--bg-primary); + color: var(--text-primary); + cursor: pointer; + font-size: 0.875rem; +} + +.btn:hover { + background: var(--bg-tertiary); +} + +.btn:disabled { + opacity: 0.6; + cursor: not-allowed; +} + +.btn-primary { + background-color: var(--color-primary); + border-color: var(--color-primary); + color: white; +} + +.btn-primary:hover { + background-color: var(--color-primary-hover); +} + +.btn-danger { + background-color: #dc3545; + border-color: #dc3545; + color: white; +} + +.btn-danger:hover { + background-color: #c82333; +} + +.btn-sm { + padding: 0.25rem 0.5rem; + font-size: 0.8rem; +} + +.empty-message { + color: var(--text-secondary); + font-style: italic; + padding: 2rem; + text-align: center; +} + +/* Modal */ +.modal-overlay { + position: fixed; + top: 0; + left: 0; + right: 0; + bottom: 0; + background: rgba(0, 0, 0, 0.5); + display: flex; + align-items: center; + justify-content: center; + z-index: 1000; +} + +.modal-content { + background: var(--bg-primary); + border-radius: 8px; + padding: 2rem; + width: 100%; + max-width: 600px; + max-height: 90vh; + overflow-y: auto; +} + +.modal-content h2 { + margin-top: 0; +} + +/* Form */ +.form-group { + margin-bottom: 1rem; +} + +.form-group label { + display: block; + margin-bottom: 0.5rem; + font-weight: 500; + color: var(--text-primary); +} + +.form-group input, +.form-group select { + width: 100%; + padding: 0.5rem; + border: 1px solid var(--border-color); + border-radius: 4px; + background: var(--bg-primary); + color: var(--text-primary); + font-size: 1rem; +} + +.form-group input:focus, +.form-group select:focus { + outline: none; + border-color: var(--color-primary); +} + +.form-row { + display: flex; + gap: 1rem; +} + +.form-row .form-group { + flex: 1; +} + +.checkbox-group label { + display: flex; + align-items: center; + gap: 0.5rem; + cursor: pointer; +} + +.checkbox-group input[type="checkbox"] { + width: auto; +} + +.help-text { + display: block; + font-size: 0.8rem; + color: var(--text-secondary); + margin-top: 0.25rem; +} + +.form-actions { + display: flex; + justify-content: flex-end; + gap: 0.5rem; + margin-top: 1.5rem; + padding-top: 1rem; + border-top: 1px solid var(--border-color); +} diff --git a/frontend/src/pages/AdminCachePage.tsx b/frontend/src/pages/AdminCachePage.tsx new file mode 100644 index 0000000..5b1b474 --- /dev/null +++ b/frontend/src/pages/AdminCachePage.tsx @@ -0,0 +1,580 @@ +import { useState, useEffect } from 'react'; +import { useNavigate } from 'react-router-dom'; +import { useAuth } from '../contexts/AuthContext'; +import { + listUpstreamSources, + createUpstreamSource, + updateUpstreamSource, + deleteUpstreamSource, + testUpstreamSource, + getCacheSettings, + updateCacheSettings, +} from '../api'; +import { UpstreamSource, CacheSettings, SourceType, AuthType } from '../types'; +import './AdminCachePage.css'; + +const SOURCE_TYPES: SourceType[] = ['npm', 'pypi', 'maven', 'docker', 'helm', 'nuget', 'deb', 'rpm', 'generic']; +const 
AUTH_TYPES: AuthType[] = ['none', 'basic', 'bearer', 'api_key'];
+
+function AdminCachePage() {
+  const { user, loading: authLoading } = useAuth();
+  const navigate = useNavigate();
+
+  // Upstream sources state
+  const [sources, setSources] = useState<UpstreamSource[]>([]);
+  const [loadingSources, setLoadingSources] = useState(true);
+  const [sourcesError, setSourcesError] = useState<string | null>(null);
+
+  // Cache settings state
+  const [settings, setSettings] = useState<CacheSettings | null>(null);
+  const [loadingSettings, setLoadingSettings] = useState(true);
+  const [settingsError, setSettingsError] = useState<string | null>(null);
+
+  // Create/Edit form state
+  const [showForm, setShowForm] = useState(false);
+  const [editingSource, setEditingSource] = useState<UpstreamSource | null>(null);
+  const [formData, setFormData] = useState({
+    name: '',
+    source_type: 'generic' as SourceType,
+    url: '',
+    enabled: true,
+    is_public: true,
+    auth_type: 'none' as AuthType,
+    username: '',
+    password: '',
+    priority: 100,
+  });
+  const [formError, setFormError] = useState<string | null>(null);
+  const [isSaving, setIsSaving] = useState(false);
+
+  // Test result state
+  const [testingId, setTestingId] = useState<string | null>(null);
+  const [testResults, setTestResults] = useState<Record<string, { success: boolean; message: string }>>({});
+
+  // Delete confirmation state
+  const [deletingId, setDeletingId] = useState<string | null>(null);
+
+  // Settings update state
+  const [updatingSettings, setUpdatingSettings] = useState(false);
+
+  // Success message
+  const [successMessage, setSuccessMessage] = useState<string | null>(null);
+
+  useEffect(() => {
+    if (!authLoading && !user) {
+      navigate('/login', { state: { from: '/admin/cache' } });
+    }
+  }, [user, authLoading, navigate]);
+
+  useEffect(() => {
+    if (user && user.is_admin) {
+      loadSources();
+      loadSettings();
+    }
+  }, [user]);
+
+  useEffect(() => {
+    if (successMessage) {
+      const timer = setTimeout(() => setSuccessMessage(null), 3000);
+      return () => clearTimeout(timer);
+    }
+  }, [successMessage]);
+
+  async function loadSources() {
+    setLoadingSources(true);
+    setSourcesError(null);
+    try {
+      const data = await listUpstreamSources();
+      setSources(data);
+    } catch (err) {
+      setSourcesError(err instanceof Error ? err.message : 'Failed to load sources');
+    } finally {
+      setLoadingSources(false);
+    }
+  }
+
+  async function loadSettings() {
+    setLoadingSettings(true);
+    setSettingsError(null);
+    try {
+      const data = await getCacheSettings();
+      setSettings(data);
+    } catch (err) {
+      setSettingsError(err instanceof Error ?
err.message : 'Failed to load settings'); + } finally { + setLoadingSettings(false); + } + } + + function openCreateForm() { + setEditingSource(null); + setFormData({ + name: '', + source_type: 'generic', + url: '', + enabled: true, + is_public: true, + auth_type: 'none', + username: '', + password: '', + priority: 100, + }); + setFormError(null); + setShowForm(true); + } + + function openEditForm(source: UpstreamSource) { + setEditingSource(source); + setFormData({ + name: source.name, + source_type: source.source_type, + url: source.url, + enabled: source.enabled, + is_public: source.is_public, + auth_type: source.auth_type, + username: source.username || '', + password: '', + priority: source.priority, + }); + setFormError(null); + setShowForm(true); + } + + async function handleFormSubmit(e: React.FormEvent) { + e.preventDefault(); + if (!formData.name.trim()) { + setFormError('Name is required'); + return; + } + if (!formData.url.trim()) { + setFormError('URL is required'); + return; + } + + setIsSaving(true); + setFormError(null); + + try { + if (editingSource) { + // Update existing source + await updateUpstreamSource(editingSource.id, { + name: formData.name.trim(), + source_type: formData.source_type, + url: formData.url.trim(), + enabled: formData.enabled, + is_public: formData.is_public, + auth_type: formData.auth_type, + username: formData.username.trim() || undefined, + password: formData.password || undefined, + priority: formData.priority, + }); + setSuccessMessage('Source updated successfully'); + } else { + // Create new source + await createUpstreamSource({ + name: formData.name.trim(), + source_type: formData.source_type, + url: formData.url.trim(), + enabled: formData.enabled, + is_public: formData.is_public, + auth_type: formData.auth_type, + username: formData.username.trim() || undefined, + password: formData.password || undefined, + priority: formData.priority, + }); + setSuccessMessage('Source created successfully'); + } + setShowForm(false); + await loadSources(); + } catch (err) { + setFormError(err instanceof Error ? err.message : 'Failed to save source'); + } finally { + setIsSaving(false); + } + } + + async function handleDelete(source: UpstreamSource) { + if (!window.confirm(`Delete upstream source "${source.name}"? This cannot be undone.`)) { + return; + } + + setDeletingId(source.id); + try { + await deleteUpstreamSource(source.id); + setSuccessMessage(`Source "${source.name}" deleted`); + await loadSources(); + } catch (err) { + setSourcesError(err instanceof Error ? err.message : 'Failed to delete source'); + } finally { + setDeletingId(null); + } + } + + async function handleTest(source: UpstreamSource) { + setTestingId(source.id); + setTestResults((prev) => ({ ...prev, [source.id]: { success: true, message: 'Testing...' } })); + + try { + const result = await testUpstreamSource(source.id); + setTestResults((prev) => ({ + ...prev, + [source.id]: { + success: result.success, + message: result.success + ? `Connected (${result.elapsed_ms}ms)` + : result.error || `HTTP ${result.status_code}`, + }, + })); + } catch (err) { + setTestResults((prev) => ({ + ...prev, + [source.id]: { + success: false, + message: err instanceof Error ? 
err.message : 'Test failed',
+        },
+      }));
+    } finally {
+      setTestingId(null);
+    }
+  }
+
+  async function handleSettingsToggle(field: 'allow_public_internet' | 'auto_create_system_projects') {
+    if (!settings) return;
+
+    // Check if env override is active
+    const isOverridden =
+      (field === 'allow_public_internet' && settings.allow_public_internet_env_override !== null) ||
+      (field === 'auto_create_system_projects' && settings.auto_create_system_projects_env_override !== null);
+
+    if (isOverridden) {
+      alert('This setting is overridden by an environment variable and cannot be changed via UI.');
+      return;
+    }
+
+    setUpdatingSettings(true);
+    try {
+      const update = { [field]: !settings[field] };
+      const newSettings = await updateCacheSettings(update);
+      setSettings(newSettings);
+      setSuccessMessage(`Setting "${field}" updated`);
+    } catch (err) {
+      setSettingsError(err instanceof Error ? err.message : 'Failed to update settings');
+    } finally {
+      setUpdatingSettings(false);
+    }
+  }
+
+  if (authLoading) {
+    return <div className="admin-cache-page">Loading...</div>;
+  }
+
+  if (!user?.is_admin) {
+    return (
+      <div className="admin-cache-page">
+        <div className="error-message">Access denied. Admin privileges required.</div>
+      </div>
+    );
+  }
+
+  return (
+    <div className="admin-cache-page">
+      <h1>Cache Management</h1>
+
+      {successMessage && <div className="success-message">{successMessage}</div>}
+
+      {/* Cache Settings Section */}
+      <section className="settings-section">
+        <h2>Global Settings</h2>
+        {loadingSettings ? (
+          <div>Loading settings...</div>
+        ) : settingsError ? (
+          <div className="error-message">{settingsError}</div>
+        ) : settings ? (
+          <div className="settings-grid">
+            <div className="setting-item">
+              <div className="toggle-label">
+                <span className="setting-name">
+                  Allow Public Internet
+                  {settings.allow_public_internet_env_override !== null && (
+                    <span className="env-badge">ENV</span>
+                  )}
+                </span>
+                <span className="setting-description">
+                  Air-gap kill switch: when off, all public upstream sources are blocked
+                </span>
+              </div>
+              <button
+                className={`toggle-button ${settings.allow_public_internet ? 'on' : 'off'}`}
+                onClick={() => handleSettingsToggle('allow_public_internet')}
+                disabled={updatingSettings}
+              >
+                {settings.allow_public_internet ? 'On' : 'Off'}
+              </button>
+            </div>
+            <div className="setting-item">
+              <div className="toggle-label">
+                <span className="setting-name">
+                  Auto-Create System Projects
+                  {settings.auto_create_system_projects_env_override !== null && (
+                    <span className="env-badge">ENV</span>
+                  )}
+                </span>
+                <span className="setting-description">
+                  Create _npm, _pypi, etc. automatically on first cache request
+                </span>
+              </div>
+              <button
+                className={`toggle-button ${settings.auto_create_system_projects ? 'on' : 'off'}`}
+                onClick={() => handleSettingsToggle('auto_create_system_projects')}
+                disabled={updatingSettings}
+              >
+                {settings.auto_create_system_projects ? 'On' : 'Off'}
+              </button>
+            </div>
+          </div>
+        ) : null}
+      </section>
+
+      {/* Upstream Sources Section */}
+      <section className="sources-section">
+        <div className="section-header">
+          <h2>Upstream Sources</h2>
+          <button className="btn btn-primary" onClick={openCreateForm}>Add Source</button>
+        </div>
+
+        {loadingSources ? (
+          <div>Loading sources...</div>
+        ) : sourcesError ? (
+          <div className="error-message">{sourcesError}</div>
+        ) : sources.length === 0 ? (
+          <div className="empty-message">No upstream sources configured.</div>
+        ) : (
+          <table className="sources-table">
+            <thead>
+              <tr>
+                <th>Name</th><th>Type</th><th>URL</th><th>Priority</th><th>Status</th><th>Source</th><th>Actions</th>
+              </tr>
+            </thead>
+            <tbody>
+              {sources.map((source) => (
+                <tr key={source.id} className={source.enabled ? '' : 'disabled-row'}>
+                  <td>
+                    <span className="source-name">{source.name}</span>
+                    {source.is_public && <span className="public-badge">Public</span>}
+                  </td>
+                  <td>{source.source_type}</td>
+                  <td className="url-cell">{source.url}</td>
+                  <td>{source.priority}</td>
+                  <td>
+                    <span className={`status-badge ${source.enabled ? 'enabled' : 'disabled'}`}>
+                      {source.enabled ? 'Enabled' : 'Disabled'}
+                    </span>
+                  </td>
+                  <td>
+                    {source.source === 'env' ? (
+                      <span className="env-badge">ENV</span>
+                    ) : (
+                      'Database'
+                    )}
+                  </td>
+                  <td className="actions-cell">
+                    <button className="btn btn-sm" onClick={() => handleTest(source)} disabled={testingId === source.id}>
+                      {testingId === source.id ? 'Testing...' : 'Test'}
+                    </button>
+                    {source.source !== 'env' && (
+                      <>
+                        <button className="btn btn-sm" onClick={() => openEditForm(source)}>Edit</button>
+                        <button
+                          className="btn btn-sm btn-danger"
+                          onClick={() => handleDelete(source)}
+                          disabled={deletingId === source.id}
+                        >
+                          Delete
+                        </button>
+                      </>
+                    )}
+                    {testResults[source.id] && (
+                      <span className={`test-result ${testResults[source.id].success ? 'success' : 'failure'}`}>
+                        {testResults[source.id].message}
+                      </span>
+                    )}
+                  </td>
+                </tr>
+              ))}
+            </tbody>
+          </table>
+        )}
+      </section>
+
+      {/* Create/Edit Modal */}
+      {showForm && (
+        <div className="modal-overlay" onClick={() => setShowForm(false)}>
+          <div className="modal-content" onClick={(e) => e.stopPropagation()}>
+            <h2>{editingSource ? 'Edit Upstream Source' : 'Add Upstream Source'}</h2>
+            <form onSubmit={handleFormSubmit}>
+              {formError && <div className="error-message">{formError}</div>}
+
+              <div className="form-group">
+                <label>Name</label>
+                <input
+                  type="text"
+                  value={formData.name}
+                  onChange={(e) => setFormData({ ...formData, name: e.target.value })}
+                  placeholder="e.g., npm-private"
+                  required
+                />
+              </div>
+
+              <div className="form-row">
+                <div className="form-group">
+                  <label>Source Type</label>
+                  <select
+                    value={formData.source_type}
+                    onChange={(e) => setFormData({ ...formData, source_type: e.target.value as SourceType })}
+                  >
+                    {SOURCE_TYPES.map((t) => (
+                      <option key={t} value={t}>{t}</option>
+                    ))}
+                  </select>
+                </div>
+                <div className="form-group">
+                  <label>Priority</label>
+                  <input
+                    type="number"
+                    value={formData.priority}
+                    onChange={(e) => setFormData({ ...formData, priority: parseInt(e.target.value) || 100 })}
+                    min="1"
+                  />
+                  <span className="help-text">Lower = higher priority</span>
+                </div>
+              </div>
+
+              <div className="form-group">
+                <label>URL</label>
+                <input
+                  type="text"
+                  value={formData.url}
+                  onChange={(e) => setFormData({ ...formData, url: e.target.value })}
+                  placeholder="https://registry.example.com"
+                  required
+                />
+              </div>
+
+              <div className="form-row">
+                <div className="form-group checkbox-group">
+                  <label>
+                    <input
+                      type="checkbox"
+                      checked={formData.enabled}
+                      onChange={(e) => setFormData({ ...formData, enabled: e.target.checked })}
+                    />
+                    Enabled
+                  </label>
+                </div>
+                <div className="form-group checkbox-group">
+                  <label>
+                    <input
+                      type="checkbox"
+                      checked={formData.is_public}
+                      onChange={(e) => setFormData({ ...formData, is_public: e.target.checked })}
+                    />
+                    Public internet source
+                  </label>
+                </div>
+              </div>
+
+              <div className="form-group">
+                <label>Auth Type</label>
+                <select
+                  value={formData.auth_type}
+                  onChange={(e) => setFormData({ ...formData, auth_type: e.target.value as AuthType })}
+                >
+                  {AUTH_TYPES.map((t) => (
+                    <option key={t} value={t}>{t}</option>
+                  ))}
+                </select>
+              </div>
+
+              {formData.auth_type !== 'none' && (
+                <div className="form-row">
+                  {(formData.auth_type === 'basic' || formData.auth_type === 'api_key') && (
+                    <div className="form-group">
+                      <label>{formData.auth_type === 'api_key' ? 'Header Name' : 'Username'}</label>
+                      <input
+                        type="text"
+                        value={formData.username}
+                        onChange={(e) => setFormData({ ...formData, username: e.target.value })}
+                        placeholder={formData.auth_type === 'api_key' ? 'X-API-Key' : 'username'}
+                      />
+                    </div>
+                  )}
+                  <div className="form-group">
+                    <label>{formData.auth_type === 'bearer' ? 'Token' : formData.auth_type === 'api_key' ? 'Header Value' : 'Password'}</label>
+                    <input
+                      type="password"
+                      value={formData.password}
+                      onChange={(e) => setFormData({ ...formData, password: e.target.value })}
+                      placeholder={editingSource ? '(unchanged)' : ''}
+                    />
+                    {editingSource && (
+                      <span className="help-text">
+                        Leave empty to keep existing {formData.auth_type === 'bearer' ? 'token' : 'credentials'}
+                      </span>
+                    )}
+                  </div>
+                </div>
+              )}
+
+              <div className="form-actions">
+                <button type="button" className="btn" onClick={() => setShowForm(false)}>Cancel</button>
+                <button type="submit" className="btn btn-primary" disabled={isSaving}>
+                  {isSaving ? 'Saving...' : 'Save'}
+                </button>
+              </div>
+            </form>
+          </div>
+        </div>
+      )}
+    </div>
+ ); +} + +export default AdminCachePage; diff --git a/frontend/src/pages/Home.css b/frontend/src/pages/Home.css index 9c55517..fdd214c 100644 --- a/frontend/src/pages/Home.css +++ b/frontend/src/pages/Home.css @@ -493,3 +493,16 @@ gap: 6px; flex-wrap: wrap; } + +/* Cell name styles */ +.cell-name { + display: flex; + align-items: center; + gap: 8px; +} + +/* System project badge */ +.system-badge { + font-size: 0.7rem; + padding: 2px 6px; +} diff --git a/frontend/src/pages/Home.tsx b/frontend/src/pages/Home.tsx index 7b3792b..b6f0e41 100644 --- a/frontend/src/pages/Home.tsx +++ b/frontend/src/pages/Home.tsx @@ -224,6 +224,9 @@ function Home() { {!project.is_public && } {project.name} + {project.is_system && ( + Cache + )} ), }, diff --git a/frontend/src/pages/ProjectPage.tsx b/frontend/src/pages/ProjectPage.tsx index f287ed1..d28b4d9 100644 --- a/frontend/src/pages/ProjectPage.tsx +++ b/frontend/src/pages/ProjectPage.tsx @@ -195,6 +195,9 @@ function ProjectPage() { {project.is_public ? 'Public' : 'Private'} + {project.is_system && ( + System Cache + )} {accessLevel && ( {isOwner ? 'Owner' : accessLevel.charAt(0).toUpperCase() + accessLevel.slice(1)} diff --git a/frontend/src/types.ts b/frontend/src/types.ts index eb9306c..d85b9de 100644 --- a/frontend/src/types.ts +++ b/frontend/src/types.ts @@ -6,6 +6,7 @@ export interface Project { name: string; description: string | null; is_public: boolean; + is_system?: boolean; // True for system cache projects (_npm, _pypi, etc.) created_at: string; updated_at: string; created_by: string; @@ -503,3 +504,74 @@ export interface TeamMemberCreate { export interface TeamMemberUpdate { role: TeamRole; } + +// Upstream Source types +export type SourceType = 'npm' | 'pypi' | 'maven' | 'docker' | 'helm' | 'nuget' | 'deb' | 'rpm' | 'generic'; +export type AuthType = 'none' | 'basic' | 'bearer' | 'api_key'; + +export interface UpstreamSource { + id: string; + name: string; + source_type: SourceType; + url: string; + enabled: boolean; + is_public: boolean; + auth_type: AuthType; + username: string | null; + has_password: boolean; + has_headers: boolean; + priority: number; + source: 'database' | 'env'; + created_at: string | null; + updated_at: string | null; +} + +export interface UpstreamSourceCreate { + name: string; + source_type: SourceType; + url: string; + enabled?: boolean; + is_public?: boolean; + auth_type?: AuthType; + username?: string; + password?: string; + headers?: Record; + priority?: number; +} + +export interface UpstreamSourceUpdate { + name?: string; + source_type?: SourceType; + url?: string; + enabled?: boolean; + is_public?: boolean; + auth_type?: AuthType; + username?: string; + password?: string; + headers?: Record | null; + priority?: number; +} + +export interface UpstreamSourceTestResult { + success: boolean; + status_code: number | null; + elapsed_ms: number; + error: string | null; + source_id: string; + source_name: string; +} + +// Cache Settings types +export interface CacheSettings { + allow_public_internet: boolean; + auto_create_system_projects: boolean; + allow_public_internet_env_override: boolean | null; + auto_create_system_projects_env_override: boolean | null; + created_at: string | null; + updated_at: string | null; +} + +export interface CacheSettingsUpdate { + allow_public_internet?: boolean; + auto_create_system_projects?: boolean; +} diff --git a/migrations/010_upstream_caching.sql b/migrations/010_upstream_caching.sql new file mode 100644 index 0000000..2c0d58a --- /dev/null +++ 
b/migrations/010_upstream_caching.sql @@ -0,0 +1,137 @@ +-- Migration 010: Upstream Artifact Caching +-- Adds support for caching artifacts from upstream registries (npm, PyPI, Maven, etc.) +-- Part of "The cache that never forgets" epic for hermetic builds + +-- ============================================================================= +-- upstream_sources: Configure upstream registries for artifact caching +-- ============================================================================= +CREATE TABLE IF NOT EXISTS upstream_sources ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + name VARCHAR(255) NOT NULL UNIQUE, + source_type VARCHAR(50) NOT NULL DEFAULT 'generic', + url VARCHAR(2048) NOT NULL, + enabled BOOLEAN NOT NULL DEFAULT FALSE, + is_public BOOLEAN NOT NULL DEFAULT TRUE, + auth_type VARCHAR(20) NOT NULL DEFAULT 'none', + username VARCHAR(255), + password_encrypted BYTEA, + headers_encrypted BYTEA, + priority INTEGER NOT NULL DEFAULT 100, + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + + -- Source type must be one of the supported types + CONSTRAINT check_source_type CHECK ( + source_type IN ('npm', 'pypi', 'maven', 'docker', 'helm', 'nuget', 'deb', 'rpm', 'generic') + ), + + -- Auth type must be valid + CONSTRAINT check_auth_type CHECK ( + auth_type IN ('none', 'basic', 'bearer', 'api_key') + ), + + -- Priority must be positive + CONSTRAINT check_priority_positive CHECK (priority > 0) +); + +-- Indexes for upstream_sources +CREATE INDEX IF NOT EXISTS idx_upstream_sources_enabled ON upstream_sources(enabled); +CREATE INDEX IF NOT EXISTS idx_upstream_sources_source_type ON upstream_sources(source_type); +CREATE INDEX IF NOT EXISTS idx_upstream_sources_is_public ON upstream_sources(is_public); +CREATE INDEX IF NOT EXISTS idx_upstream_sources_priority ON upstream_sources(priority); + +-- Comments for upstream_sources +COMMENT ON TABLE upstream_sources IS 'Configuration for upstream artifact registries (npm, PyPI, Maven, etc.)'; +COMMENT ON COLUMN upstream_sources.name IS 'Unique human-readable name (e.g., npm-public, artifactory-private)'; +COMMENT ON COLUMN upstream_sources.source_type IS 'Type of registry: npm, pypi, maven, docker, helm, nuget, deb, rpm, generic'; +COMMENT ON COLUMN upstream_sources.url IS 'Base URL of the upstream registry'; +COMMENT ON COLUMN upstream_sources.enabled IS 'Whether this source is active for caching'; +COMMENT ON COLUMN upstream_sources.is_public IS 'True if this is a public internet source (for air-gap mode)'; +COMMENT ON COLUMN upstream_sources.auth_type IS 'Authentication type: none, basic, bearer, api_key'; +COMMENT ON COLUMN upstream_sources.username IS 'Username for basic auth'; +COMMENT ON COLUMN upstream_sources.password_encrypted IS 'Fernet-encrypted password/token'; +COMMENT ON COLUMN upstream_sources.headers_encrypted IS 'Fernet-encrypted custom headers (JSON)'; +COMMENT ON COLUMN upstream_sources.priority IS 'Priority for source selection (lower = higher priority)'; + +-- ============================================================================= +-- cache_settings: Global cache configuration (singleton table) +-- ============================================================================= +CREATE TABLE IF NOT EXISTS cache_settings ( + id INTEGER PRIMARY KEY DEFAULT 1, + allow_public_internet BOOLEAN NOT NULL DEFAULT TRUE, + auto_create_system_projects BOOLEAN NOT NULL DEFAULT TRUE, + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, 
+ updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + + -- Singleton constraint + CONSTRAINT check_cache_settings_singleton CHECK (id = 1) +); + +-- Insert default row +INSERT INTO cache_settings (id, allow_public_internet, auto_create_system_projects) +VALUES (1, TRUE, TRUE) +ON CONFLICT (id) DO NOTHING; + +-- Comments for cache_settings +COMMENT ON TABLE cache_settings IS 'Global cache settings (singleton table)'; +COMMENT ON COLUMN cache_settings.allow_public_internet IS 'Air-gap mode: when false, blocks all public internet sources'; +COMMENT ON COLUMN cache_settings.auto_create_system_projects IS 'Auto-create system projects (_npm, _pypi, etc.) on first cache'; + +-- ============================================================================= +-- cached_urls: Track URL to artifact mappings for provenance +-- ============================================================================= +CREATE TABLE IF NOT EXISTS cached_urls ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + url VARCHAR(4096) NOT NULL, + url_hash VARCHAR(64) NOT NULL, + artifact_id VARCHAR(64) NOT NULL REFERENCES artifacts(id), + source_id UUID REFERENCES upstream_sources(id) ON DELETE SET NULL, + fetched_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, + response_headers JSONB DEFAULT '{}', + created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + + -- URL hash must be unique (same URL = same cached artifact) + CONSTRAINT unique_url_hash UNIQUE (url_hash) +); + +-- Indexes for cached_urls +CREATE INDEX IF NOT EXISTS idx_cached_urls_url_hash ON cached_urls(url_hash); +CREATE INDEX IF NOT EXISTS idx_cached_urls_artifact_id ON cached_urls(artifact_id); +CREATE INDEX IF NOT EXISTS idx_cached_urls_source_id ON cached_urls(source_id); +CREATE INDEX IF NOT EXISTS idx_cached_urls_fetched_at ON cached_urls(fetched_at); + +-- Comments for cached_urls +COMMENT ON TABLE cached_urls IS 'Tracks which URLs have been cached and maps to artifacts'; +COMMENT ON COLUMN cached_urls.url IS 'Original URL that was fetched'; +COMMENT ON COLUMN cached_urls.url_hash IS 'SHA256 hash of URL for fast lookup'; +COMMENT ON COLUMN cached_urls.artifact_id IS 'The cached artifact (by SHA256 content hash)'; +COMMENT ON COLUMN cached_urls.source_id IS 'Which upstream source provided this (null if manual)'; +COMMENT ON COLUMN cached_urls.fetched_at IS 'When the URL was fetched from upstream'; +COMMENT ON COLUMN cached_urls.response_headers IS 'Original response headers from upstream (for debugging)'; + +-- ============================================================================= +-- Add is_system column to projects table for system cache projects +-- ============================================================================= +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = 'projects' AND column_name = 'is_system' + ) THEN + ALTER TABLE projects ADD COLUMN is_system BOOLEAN NOT NULL DEFAULT FALSE; + CREATE INDEX IF NOT EXISTS idx_projects_is_system ON projects(is_system); + END IF; +END $$; + +COMMENT ON COLUMN projects.is_system IS 'True for system cache projects (_npm, _pypi, etc.)'; + +-- ============================================================================= +-- Seed default upstream sources (disabled by default for safety) +-- ============================================================================= +INSERT INTO upstream_sources (id, name, source_type, url, enabled, is_public, auth_type, priority) +VALUES + (gen_random_uuid(), 'npm-public', 'npm', 
'https://registry.npmjs.org', FALSE, TRUE, 'none', 100), + (gen_random_uuid(), 'pypi-public', 'pypi', 'https://pypi.org/simple', FALSE, TRUE, 'none', 100), + (gen_random_uuid(), 'maven-central', 'maven', 'https://repo1.maven.org/maven2', FALSE, TRUE, 'none', 100), + (gen_random_uuid(), 'docker-hub', 'docker', 'https://registry-1.docker.io', FALSE, TRUE, 'none', 100) +ON CONFLICT (name) DO NOTHING;
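
The `unique_url_hash` constraint above is what makes repeated cache requests idempotent: the endpoint looks a URL up by hash before fetching. A sketch of that lookup, assuming `url_hash` is the hex SHA256 of the raw URL (consistent with the VARCHAR(64) column and its comment; the SQL shown is illustrative):

```python
import hashlib

def url_hash(url: str) -> str:
    # Matches cached_urls.url_hash: SHA256 of the URL, hex-encoded (64 chars).
    return hashlib.sha256(url.encode("utf-8")).hexdigest()

# Cache-hit check before any upstream fetch (illustrative SQL):
#   SELECT artifact_id FROM cached_urls WHERE url_hash = %(h)s
h = url_hash("https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz")
print(h, len(h))  # 64 hex chars fits VARCHAR(64)
```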