Add upstream caching infrastructure and refactor CI pipeline

Upstream Caching (Epic #68-#75, #105):
- Add upstream_sources and cache_settings tables with migrations
- Add cache management API endpoints (CRUD for sources, settings)
- Add environment variable overrides for upstream sources and cache settings
- Add encryption module for storing credentials securely
- Add frontend Admin Cache Management page
- Add is_system field to projects to distinguish system cache projects
- Add purge_seed_data for transitioning to production-like environments

CI Pipeline Refactoring:
- Remove reset jobs (reset_stage_pre, reset_stage)
- Add ephemeral orchard-test deployment for main branch testing
- Run integration tests on ephemeral deployment before promoting to stage
- Stage is now a long-running pre-prod environment (smoke tests only)
- Disable prosper_setup for tag pipelines
This commit is contained in:
Mondo Diaz
2026-01-29 11:28:59 -06:00
parent c92895ffe9
commit a3a49ac9c3
24 changed files with 7271 additions and 103 deletions

View File

@@ -11,13 +11,6 @@ variables:
# Environment URLs (used by deploy and test jobs)
STAGE_URL: https://orchard-stage.common.global.bsf.tools
PROD_URL: https://orchard.common.global.bsf.tools
# Stage environment AWS resources (used by reset job)
STAGE_RDS_HOST: orchard-stage.cluster-cvw3jzjkozoc.us-gov-west-1.rds.amazonaws.com
STAGE_RDS_DBNAME: postgres
STAGE_SECRET_ARN: "arn:aws-us-gov:secretsmanager:us-gov-west-1:052673043337:secret:rds!cluster-a573672b-1a38-4665-a654-1b7df37b5297-IaeFQL"
STAGE_AUTH_SECRET_ARN: "arn:aws-us-gov:secretsmanager:us-gov-west-1:052673043337:secret:orchard-stage-creds-SMqvQx"
STAGE_S3_BUCKET: orchard-artifacts-stage
AWS_REGION: us-gov-west-1
# Shared pip cache directory
PIP_CACHE_DIR: "$CI_PROJECT_DIR/.pip-cache"
@@ -95,10 +88,18 @@ cve_sbom_analysis:
when: never
- when: on_success
# Override release job to wait for stage integration tests before creating tag
# Disable prosper_setup for tag pipelines because no build/analysis jobs run there
# (the image was already built when the commit was on main, and deploy uses helm directly)
prosper_setup:
rules:
- if: '$CI_COMMIT_TAG'
when: never
- when: on_success
# Override release job to wait for stage deployment and smoke tests before creating tag
# This ensures the tag (which triggers prod deploy) is only created after stage passes
release:
needs: [integration_test_stage, changelog]
needs: [smoke_test_stage, changelog]
# Full integration test suite template (for feature/stage deployments)
# Runs the complete pytest integration test suite against the deployed environment
@@ -200,107 +201,91 @@ release:
sys.exit(0)
PYTEST_SCRIPT
# Reset stage template - runs from CI runner, uses CI variable for auth
# Calls the /api/v1/admin/factory-reset endpoint which handles DB and S3 cleanup
.reset_stage_template: &reset_stage_template
stage: deploy
image: deps.global.bsf.tools/docker/python:3.12-slim
timeout: 5m
retry: 1
# Ephemeral test deployment in stage namespace (main branch only)
# Runs integration tests before promoting to long-running stage
deploy_test:
<<: *deploy_template
variables:
NAMESPACE: orch-stage-namespace
VALUES_FILE: helm/orchard/values-dev.yaml
BASE_URL: https://orchard-test.common.global.bsf.tools
before_script:
- pip install --index-url "$PIP_INDEX_URL" httpx
- kubectl config use-context esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage
- *helm_setup
script:
- echo "Deploying ephemeral test environment"
- cd $CI_PROJECT_DIR
- |
python - <<'RESET_SCRIPT'
import httpx
import sys
import os
import time
BASE_URL = os.environ.get("STAGE_URL", "")
ADMIN_USER = "admin"
ADMIN_PASS = os.environ.get("STAGE_ADMIN_PASSWORD", "")
MAX_RETRIES = 3
RETRY_DELAY = 5
if not BASE_URL:
print("ERROR: STAGE_URL not set")
sys.exit(1)
if not ADMIN_PASS:
print("ERROR: STAGE_ADMIN_PASSWORD not set")
sys.exit(1)
print(f"=== Resetting stage environment at {BASE_URL} ===")
def do_reset():
with httpx.Client(base_url=BASE_URL, timeout=120.0) as client:
print("Logging in as admin...")
login_response = client.post(
"/api/v1/auth/login",
json={"username": ADMIN_USER, "password": ADMIN_PASS},
)
if login_response.status_code != 200:
raise Exception(f"Login failed: {login_response.status_code} - {login_response.text}")
print("Login successful")
print("Calling factory reset endpoint...")
reset_response = client.post(
"/api/v1/admin/factory-reset",
headers={"X-Confirm-Reset": "yes-delete-all-data"},
)
if reset_response.status_code == 200:
result = reset_response.json()
print("Factory reset successful!")
print(f" Database tables dropped: {result['results']['database_tables_dropped']}")
print(f" S3 objects deleted: {result['results']['s3_objects_deleted']}")
print(f" Database reinitialized: {result['results']['database_reinitialized']}")
print(f" Seeded: {result['results']['seeded']}")
return True
else:
raise Exception(f"Factory reset failed: {reset_response.status_code} - {reset_response.text}")
for attempt in range(1, MAX_RETRIES + 1):
try:
print(f"Attempt {attempt}/{MAX_RETRIES}")
if do_reset():
sys.exit(0)
except Exception as e:
print(f"Attempt {attempt} failed: {e}")
if attempt < MAX_RETRIES:
print(f"Retrying in {RETRY_DELAY} seconds...")
time.sleep(RETRY_DELAY)
else:
print("All retry attempts failed")
sys.exit(1)
RESET_SCRIPT
helm upgrade --install orchard-test ./helm/orchard \
--namespace $NAMESPACE \
-f $VALUES_FILE \
--set image.tag=git.linux-amd64-$CI_COMMIT_SHA \
--set orchard.auth.adminPassword=$STAGE_ADMIN_PASSWORD \
--set ingress.hosts[0].host=orchard-test.common.global.bsf.tools \
--set ingress.tls[0].hosts[0]=orchard-test.common.global.bsf.tools \
--set ingress.tls[0].secretName=orchard-test-tls \
--set minioIngress.host=minio-test.common.global.bsf.tools \
--set minioIngress.tls.secretName=minio-test-tls \
--wait \
--atomic \
--timeout 10m
- kubectl rollout status deployment/orchard-test-server -n $NAMESPACE --timeout=10m
- *verify_deployment
environment:
name: test
url: https://orchard-test.common.global.bsf.tools
on_stop: cleanup_test
kubernetes:
agent: esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage
rules:
- if: '$CI_COMMIT_BRANCH == "main"'
when: on_success
# Reset stage BEFORE integration tests (ensure known state)
reset_stage_pre:
<<: *reset_stage_template
needs: [deploy_stage]
# Integration tests for stage deployment
# Uses CI variable STAGE_ADMIN_PASSWORD (set in GitLab CI/CD settings)
integration_test_stage:
<<: *integration_test_template
needs: [reset_stage_pre]
# Clean up the ephemeral test deployment after integration tests
cleanup_test:
stage: deploy
needs: [integration_test_main]
image: deps.global.bsf.tools/registry-1.docker.io/alpine/k8s:1.29.12
timeout: 5m
variables:
ORCHARD_TEST_URL: $STAGE_URL
NAMESPACE: orch-stage-namespace
GIT_STRATEGY: none
before_script:
- kubectl config use-context esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage
script:
- echo "Cleaning up ephemeral test deployment orchard-test"
- helm uninstall orchard-test --namespace $NAMESPACE || true
environment:
name: test
action: stop
kubernetes:
agent: esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage
rules:
- if: '$CI_COMMIT_BRANCH == "main"'
when: on_success
allow_failure: true
# Integration tests for ephemeral test deployment (main branch)
# Runs against orchard-test before promoting to long-running stage
integration_test_main:
<<: *integration_test_template
needs: [deploy_test]
variables:
ORCHARD_TEST_URL: https://orchard-test.common.global.bsf.tools
ORCHARD_TEST_PASSWORD: $STAGE_ADMIN_PASSWORD
rules:
- if: '$CI_COMMIT_BRANCH == "main"'
when: on_success
# Reset stage AFTER integration tests (clean slate for next run)
reset_stage:
<<: *reset_stage_template
needs: [integration_test_stage]
allow_failure: true # Don't fail pipeline if reset has issues
# Smoke test for long-running stage (after promotion)
smoke_test_stage:
<<: *smoke_test_template
needs: [deploy_stage]
variables:
ORCHARD_TEST_URL: $STAGE_URL
rules:
- if: '$CI_COMMIT_BRANCH == "main"'
when: on_success
# Integration tests for feature deployment (full suite)
# Uses DEV_ADMIN_PASSWORD CI variable (same as deploy_feature)
@@ -412,9 +397,12 @@ frontend_tests:
echo "Health check failed after 30 attempts"
exit 1
# Deploy to stage (main branch)
# Deploy to long-running stage (main branch, after ephemeral tests pass)
deploy_stage:
<<: *deploy_template
stage: deploy
# Wait for ephemeral test to pass before promoting to long-running stage
needs: [cleanup_test]
image: deps.global.bsf.tools/registry-1.docker.io/alpine/k8s:1.29.12
variables:
NAMESPACE: orch-stage-namespace
VALUES_FILE: helm/orchard/values-stage.yaml
@@ -423,7 +411,7 @@ deploy_stage:
- kubectl config use-context esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage
- *helm_setup
script:
- echo "Deploying to stage environment"
- echo "Deploying to long-running stage environment"
- cd $CI_PROJECT_DIR
- |
helm upgrade --install orchard-stage ./helm/orchard \