Simplify stage CI jobs to use CI variable for admin password

- Replace in-cluster k8s jobs with standard CI runner execution
- Use STAGE_ADMIN_PASSWORD CI variable instead of Secrets Manager
- Simplify reset_stage_template (no longer needs kubectl/IRSA)
- integration_test_stage now uses the standard integration_test_template

Requires setting the STAGE_ADMIN_PASSWORD CI/CD variable in the GitLab project settings (it should be marked as masked and protected).
This commit is contained in:
Mondo Diaz
2026-01-27 18:39:20 +00:00
parent 1f3e19d3a5
commit 0cf349ddb3

View File

@@ -197,140 +197,81 @@ release:
sys.exit(0)
PYTEST_SCRIPT
# Reset stage template - runs in-cluster with IRSA for Secrets Manager access
# Reset stage template - runs from CI runner, uses CI variable for auth
# Calls the /api/v1/admin/factory-reset endpoint which handles DB and S3 cleanup
.reset_stage_template: &reset_stage_template
stage: deploy
image: deps.global.bsf.tools/registry-1.docker.io/alpine/k8s:1.29.12
timeout: 10m
image: deps.global.bsf.tools/docker/python:3.12-slim
timeout: 5m
retry: 1
variables:
NAMESPACE: orch-stage-namespace
before_script:
- kubectl config use-context esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage
- pip install --index-url "$PIP_INDEX_URL" httpx
script:
- |
# Create a Job to run the reset in the cluster
cat <<EOF | kubectl apply -f -
apiVersion: batch/v1
kind: Job
metadata:
name: reset-stage-${CI_PIPELINE_ID}-${CI_JOB_ID}
namespace: ${NAMESPACE}
spec:
ttlSecondsAfterFinished: 300
backoffLimit: 2
template:
spec:
serviceAccountName: orchard
restartPolicy: Never
containers:
- name: reset-runner
image: deps.global.bsf.tools/docker/python:3.12-slim
env:
- name: STAGE_URL
value: "${STAGE_URL}"
- name: AWS_REGION
value: "${AWS_REGION}"
- name: STAGE_AUTH_SECRET_ARN
value: "${STAGE_AUTH_SECRET_ARN}"
- name: PIP_INDEX_URL
value: "${PIP_INDEX_URL}"
command:
- /bin/bash
- -c
- |
set -e
pip install --index-url "\$PIP_INDEX_URL" httpx boto3
python - <<'RESET_SCRIPT'
import httpx
import sys
import os
import time
python - <<'RESET_SCRIPT'
import httpx
import sys
import os
import time
import json
import boto3
BASE_URL = os.environ.get("STAGE_URL", "")
ADMIN_USER = "admin"
ADMIN_PASS = os.environ.get("STAGE_ADMIN_PASSWORD", "")
MAX_RETRIES = 3
RETRY_DELAY = 5
BASE_URL = os.environ.get("STAGE_URL", "")
ADMIN_USER = "admin"
MAX_RETRIES = 3
RETRY_DELAY = 5
if not BASE_URL:
print("ERROR: STAGE_URL not set")
sys.exit(1)
# Fetch admin password from AWS Secrets Manager using IRSA
secret_arn = os.environ.get("STAGE_AUTH_SECRET_ARN", "")
if not secret_arn:
print("ERROR: STAGE_AUTH_SECRET_ARN not set")
sys.exit(1)
if not ADMIN_PASS:
print("ERROR: STAGE_ADMIN_PASSWORD not set")
sys.exit(1)
try:
client = boto3.client('secretsmanager', region_name=os.environ.get("AWS_REGION"))
secret = client.get_secret_value(SecretId=secret_arn)
data = json.loads(secret['SecretString'])
ADMIN_PASS = data['admin_password']
print("Successfully fetched admin password from Secrets Manager")
except Exception as e:
print(f"ERROR: Failed to fetch secret: {e}")
sys.exit(1)
print(f"=== Resetting stage environment at {BASE_URL} ===")
if not BASE_URL:
print("ERROR: STAGE_URL not set")
sys.exit(1)
def do_reset():
with httpx.Client(base_url=BASE_URL, timeout=120.0) as client:
print("Logging in as admin...")
login_response = client.post(
"/api/v1/auth/login",
json={"username": ADMIN_USER, "password": ADMIN_PASS},
)
if login_response.status_code != 200:
raise Exception(f"Login failed: {login_response.status_code} - {login_response.text}")
print("Login successful")
print(f"=== Resetting stage environment at {BASE_URL} ===")
print("Calling factory reset endpoint...")
reset_response = client.post(
"/api/v1/admin/factory-reset",
headers={"X-Confirm-Reset": "yes-delete-all-data"},
)
def do_reset():
with httpx.Client(base_url=BASE_URL, timeout=120.0) as client:
print("Logging in as admin...")
login_response = client.post(
"/api/v1/auth/login",
json={"username": ADMIN_USER, "password": ADMIN_PASS},
)
if login_response.status_code != 200:
raise Exception(f"Login failed: {login_response.status_code} - {login_response.text}")
print("Login successful")
if reset_response.status_code == 200:
result = reset_response.json()
print("Factory reset successful!")
print(f" Database tables dropped: {result['results']['database_tables_dropped']}")
print(f" S3 objects deleted: {result['results']['s3_objects_deleted']}")
print(f" Database reinitialized: {result['results']['database_reinitialized']}")
print(f" Seeded: {result['results']['seeded']}")
return True
else:
raise Exception(f"Factory reset failed: {reset_response.status_code} - {reset_response.text}")
print("Calling factory reset endpoint...")
reset_response = client.post(
"/api/v1/admin/factory-reset",
headers={"X-Confirm-Reset": "yes-delete-all-data"},
)
if reset_response.status_code == 200:
result = reset_response.json()
print("Factory reset successful!")
print(f" Database tables dropped: {result['results']['database_tables_dropped']}")
print(f" S3 objects deleted: {result['results']['s3_objects_deleted']}")
print(f" Database reinitialized: {result['results']['database_reinitialized']}")
print(f" Seeded: {result['results']['seeded']}")
return True
else:
raise Exception(f"Factory reset failed: {reset_response.status_code} - {reset_response.text}")
for attempt in range(1, MAX_RETRIES + 1):
try:
print(f"Attempt {attempt}/{MAX_RETRIES}")
if do_reset():
sys.exit(0)
except Exception as e:
print(f"Attempt {attempt} failed: {e}")
if attempt < MAX_RETRIES:
print(f"Retrying in {RETRY_DELAY} seconds...")
time.sleep(RETRY_DELAY)
else:
print("All retry attempts failed")
sys.exit(1)
RESET_SCRIPT
EOF
- |
echo "Waiting for reset job to complete..."
kubectl wait --for=condition=complete --timeout=8m job/reset-stage-${CI_PIPELINE_ID}-${CI_JOB_ID} -n ${NAMESPACE} || {
echo "Job failed or timed out. Fetching logs..."
kubectl logs job/reset-stage-${CI_PIPELINE_ID}-${CI_JOB_ID} -n ${NAMESPACE} || true
kubectl delete job reset-stage-${CI_PIPELINE_ID}-${CI_JOB_ID} -n ${NAMESPACE} || true
exit 1
}
- kubectl logs job/reset-stage-${CI_PIPELINE_ID}-${CI_JOB_ID} -n ${NAMESPACE}
- kubectl delete job reset-stage-${CI_PIPELINE_ID}-${CI_JOB_ID} -n ${NAMESPACE} || true
for attempt in range(1, MAX_RETRIES + 1):
try:
print(f"Attempt {attempt}/{MAX_RETRIES}")
if do_reset():
sys.exit(0)
except Exception as e:
print(f"Attempt {attempt} failed: {e}")
if attempt < MAX_RETRIES:
print(f"Retrying in {RETRY_DELAY} seconds...")
time.sleep(RETRY_DELAY)
else:
print("All retry attempts failed")
sys.exit(1)
RESET_SCRIPT
rules:
- if: '$CI_COMMIT_BRANCH == "main"'
when: on_success
@@ -340,98 +281,14 @@ reset_stage_pre:
<<: *reset_stage_template
needs: [deploy_stage]
# Integration tests for stage deployment (runs in-cluster with IRSA for Secrets Manager access)
# Integration tests for stage deployment
# Uses CI variable STAGE_ADMIN_PASSWORD (set in GitLab CI/CD settings)
integration_test_stage:
stage: deploy
<<: *integration_test_template
needs: [reset_stage_pre]
image: deps.global.bsf.tools/registry-1.docker.io/alpine/k8s:1.29.12
timeout: 20m
variables:
NAMESPACE: orch-stage-namespace
before_script:
- kubectl config use-context esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage
script:
- |
# Create a Job to run integration tests in the cluster
cat <<EOF | kubectl apply -f -
apiVersion: batch/v1
kind: Job
metadata:
name: integration-test-${CI_PIPELINE_ID}
namespace: ${NAMESPACE}
spec:
ttlSecondsAfterFinished: 300
backoffLimit: 1
template:
spec:
serviceAccountName: orchard
restartPolicy: Never
containers:
- name: test-runner
image: deps.global.bsf.tools/docker/python:3.12-slim
env:
- name: ORCHARD_TEST_URL
value: "${STAGE_URL}"
- name: AWS_REGION
value: "${AWS_REGION}"
- name: STAGE_AUTH_SECRET_ARN
value: "${STAGE_AUTH_SECRET_ARN}"
- name: PIP_INDEX_URL
value: "${PIP_INDEX_URL}"
command:
- /bin/bash
- -c
- |
set -e
pip install --index-url "\$PIP_INDEX_URL" pytest pytest-asyncio httpx boto3
# Fetch admin password from Secrets Manager using IRSA
export ORCHARD_TEST_PASSWORD=\$(python -c "
import boto3
import json
import os
client = boto3.client('secretsmanager', region_name=os.environ['AWS_REGION'])
secret = client.get_secret_value(SecretId=os.environ['STAGE_AUTH_SECRET_ARN'])
data = json.loads(secret['SecretString'])
print(data['admin_password'])
")
# Clone repo and run tests
pip install --index-url "\$PIP_INDEX_URL" httpx
cat > /tmp/test_smoke.py << 'TESTEOF'
import os
import httpx
def test_health():
url = os.environ["ORCHARD_TEST_URL"]
r = httpx.get(f"{url}/health", timeout=30)
assert r.status_code == 200
def test_login():
url = os.environ["ORCHARD_TEST_URL"]
password = os.environ["ORCHARD_TEST_PASSWORD"]
with httpx.Client(base_url=url, timeout=30) as client:
r = client.post("/api/v1/auth/login", json={"username": "admin", "password": password})
assert r.status_code == 200, f"Login failed: {r.status_code} {r.text}"
def test_api():
url = os.environ["ORCHARD_TEST_URL"]
r = httpx.get(f"{url}/api/v1/projects", timeout=30)
assert r.status_code == 200
TESTEOF
python -m pytest /tmp/test_smoke.py -v
EOF
- |
echo "Waiting for test job to complete..."
kubectl wait --for=condition=complete --timeout=15m job/integration-test-${CI_PIPELINE_ID} -n ${NAMESPACE} || {
echo "Job failed or timed out. Fetching logs..."
kubectl logs job/integration-test-${CI_PIPELINE_ID} -n ${NAMESPACE} || true
kubectl delete job integration-test-${CI_PIPELINE_ID} -n ${NAMESPACE} || true
exit 1
}
- kubectl logs job/integration-test-${CI_PIPELINE_ID} -n ${NAMESPACE}
- kubectl delete job integration-test-${CI_PIPELINE_ID} -n ${NAMESPACE} || true
ORCHARD_TEST_URL: $STAGE_URL
ORCHARD_TEST_PASSWORD: $STAGE_ADMIN_PASSWORD
rules:
- if: '$CI_COMMIT_BRANCH == "main"'
when: on_success