Simplify stage CI jobs to use CI variable for admin password

- Replace in-cluster k8s jobs with standard CI runner execution
- Use STAGE_ADMIN_PASSWORD CI variable instead of Secrets Manager
- Simplify reset_stage_template (no longer needs kubectl/IRSA)
- integration_test_stage now uses the standard integration_test_template

Requires setting the STAGE_ADMIN_PASSWORD CI/CD variable in the GitLab project settings (it should be marked as masked and protected).
This commit is contained in:
Mondo Diaz
2026-01-27 18:39:20 +00:00
parent 1f3e19d3a5
commit 0cf349ddb3

View File

@@ -197,140 +197,81 @@ release:
sys.exit(0)
PYTEST_SCRIPT
# Reset stage template - runs in-cluster with IRSA for Secrets Manager access
# Reset stage template - runs from CI runner, uses CI variable for auth
# Calls the /api/v1/admin/factory-reset endpoint which handles DB and S3 cleanup
.reset_stage_template: &reset_stage_template
stage: deploy
image: deps.global.bsf.tools/registry-1.docker.io/alpine/k8s:1.29.12
timeout: 10m
image: deps.global.bsf.tools/docker/python:3.12-slim
timeout: 5m
retry: 1
variables:
NAMESPACE: orch-stage-namespace
before_script:
- kubectl config use-context esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage
- pip install --index-url "$PIP_INDEX_URL" httpx
script:
- |
# Create a Job to run the reset in the cluster
cat <<EOF | kubectl apply -f -
apiVersion: batch/v1
kind: Job
metadata:
name: reset-stage-${CI_PIPELINE_ID}-${CI_JOB_ID}
namespace: ${NAMESPACE}
spec:
ttlSecondsAfterFinished: 300
backoffLimit: 2
template:
spec:
serviceAccountName: orchard
restartPolicy: Never
containers:
- name: reset-runner
image: deps.global.bsf.tools/docker/python:3.12-slim
env:
- name: STAGE_URL
value: "${STAGE_URL}"
- name: AWS_REGION
value: "${AWS_REGION}"
- name: STAGE_AUTH_SECRET_ARN
value: "${STAGE_AUTH_SECRET_ARN}"
- name: PIP_INDEX_URL
value: "${PIP_INDEX_URL}"
command:
- /bin/bash
- -c
- |
set -e
pip install --index-url "\$PIP_INDEX_URL" httpx boto3
python - <<'RESET_SCRIPT'
import httpx
import sys
import os
import time
python - <<'RESET_SCRIPT'
import httpx
import sys
import os
import time
import json
import boto3
BASE_URL = os.environ.get("STAGE_URL", "")
ADMIN_USER = "admin"
ADMIN_PASS = os.environ.get("STAGE_ADMIN_PASSWORD", "")
MAX_RETRIES = 3
RETRY_DELAY = 5
BASE_URL = os.environ.get("STAGE_URL", "")
ADMIN_USER = "admin"
MAX_RETRIES = 3
RETRY_DELAY = 5
if not BASE_URL:
print("ERROR: STAGE_URL not set")
sys.exit(1)
# Fetch admin password from AWS Secrets Manager using IRSA
secret_arn = os.environ.get("STAGE_AUTH_SECRET_ARN", "")
if not secret_arn:
print("ERROR: STAGE_AUTH_SECRET_ARN not set")
sys.exit(1)
if not ADMIN_PASS:
print("ERROR: STAGE_ADMIN_PASSWORD not set")
sys.exit(1)
try:
client = boto3.client('secretsmanager', region_name=os.environ.get("AWS_REGION"))
secret = client.get_secret_value(SecretId=secret_arn)
data = json.loads(secret['SecretString'])
ADMIN_PASS = data['admin_password']
print("Successfully fetched admin password from Secrets Manager")
except Exception as e:
print(f"ERROR: Failed to fetch secret: {e}")
sys.exit(1)
print(f"=== Resetting stage environment at {BASE_URL} ===")
if not BASE_URL:
print("ERROR: STAGE_URL not set")
sys.exit(1)
def do_reset():
with httpx.Client(base_url=BASE_URL, timeout=120.0) as client:
print("Logging in as admin...")
login_response = client.post(
"/api/v1/auth/login",
json={"username": ADMIN_USER, "password": ADMIN_PASS},
)
if login_response.status_code != 200:
raise Exception(f"Login failed: {login_response.status_code} - {login_response.text}")
print("Login successful")
print(f"=== Resetting stage environment at {BASE_URL} ===")
print("Calling factory reset endpoint...")
reset_response = client.post(
"/api/v1/admin/factory-reset",
headers={"X-Confirm-Reset": "yes-delete-all-data"},
)
def do_reset():
with httpx.Client(base_url=BASE_URL, timeout=120.0) as client:
print("Logging in as admin...")
login_response = client.post(
"/api/v1/auth/login",
json={"username": ADMIN_USER, "password": ADMIN_PASS},
)
if login_response.status_code != 200:
raise Exception(f"Login failed: {login_response.status_code} - {login_response.text}")
print("Login successful")
if reset_response.status_code == 200:
result = reset_response.json()
print("Factory reset successful!")
print(f" Database tables dropped: {result['results']['database_tables_dropped']}")
print(f" S3 objects deleted: {result['results']['s3_objects_deleted']}")
print(f" Database reinitialized: {result['results']['database_reinitialized']}")
print(f" Seeded: {result['results']['seeded']}")
return True
else:
raise Exception(f"Factory reset failed: {reset_response.status_code} - {reset_response.text}")
print("Calling factory reset endpoint...")
reset_response = client.post(
"/api/v1/admin/factory-reset",
headers={"X-Confirm-Reset": "yes-delete-all-data"},
)
if reset_response.status_code == 200:
result = reset_response.json()
print("Factory reset successful!")
print(f" Database tables dropped: {result['results']['database_tables_dropped']}")
print(f" S3 objects deleted: {result['results']['s3_objects_deleted']}")
print(f" Database reinitialized: {result['results']['database_reinitialized']}")
print(f" Seeded: {result['results']['seeded']}")
return True
else:
raise Exception(f"Factory reset failed: {reset_response.status_code} - {reset_response.text}")
for attempt in range(1, MAX_RETRIES + 1):
try:
print(f"Attempt {attempt}/{MAX_RETRIES}")
if do_reset():
sys.exit(0)
except Exception as e:
print(f"Attempt {attempt} failed: {e}")
if attempt < MAX_RETRIES:
print(f"Retrying in {RETRY_DELAY} seconds...")
time.sleep(RETRY_DELAY)
else:
print("All retry attempts failed")
sys.exit(1)
RESET_SCRIPT
EOF
- |
echo "Waiting for reset job to complete..."
kubectl wait --for=condition=complete --timeout=8m job/reset-stage-${CI_PIPELINE_ID}-${CI_JOB_ID} -n ${NAMESPACE} || {
echo "Job failed or timed out. Fetching logs..."
kubectl logs job/reset-stage-${CI_PIPELINE_ID}-${CI_JOB_ID} -n ${NAMESPACE} || true
kubectl delete job reset-stage-${CI_PIPELINE_ID}-${CI_JOB_ID} -n ${NAMESPACE} || true
exit 1
}
- kubectl logs job/reset-stage-${CI_PIPELINE_ID}-${CI_JOB_ID} -n ${NAMESPACE}
- kubectl delete job reset-stage-${CI_PIPELINE_ID}-${CI_JOB_ID} -n ${NAMESPACE} || true
for attempt in range(1, MAX_RETRIES + 1):
try:
print(f"Attempt {attempt}/{MAX_RETRIES}")
if do_reset():
sys.exit(0)
except Exception as e:
print(f"Attempt {attempt} failed: {e}")
if attempt < MAX_RETRIES:
print(f"Retrying in {RETRY_DELAY} seconds...")
time.sleep(RETRY_DELAY)
else:
print("All retry attempts failed")
sys.exit(1)
RESET_SCRIPT
rules:
- if: '$CI_COMMIT_BRANCH == "main"'
when: on_success
@@ -340,98 +281,14 @@ reset_stage_pre:
<<: *reset_stage_template
needs: [deploy_stage]
# Integration tests for stage deployment (runs in-cluster with IRSA for Secrets Manager access)
# Integration tests for stage deployment
# Uses CI variable STAGE_ADMIN_PASSWORD (set in GitLab CI/CD settings)
integration_test_stage:
stage: deploy
<<: *integration_test_template
needs: [reset_stage_pre]
image: deps.global.bsf.tools/registry-1.docker.io/alpine/k8s:1.29.12
timeout: 20m
variables:
NAMESPACE: orch-stage-namespace
before_script:
- kubectl config use-context esv/bsf/bsf-integration/orchard/orchard-mvp:orchard-stage
script:
- |
# Create a Job to run integration tests in the cluster
cat <<EOF | kubectl apply -f -
apiVersion: batch/v1
kind: Job
metadata:
name: integration-test-${CI_PIPELINE_ID}
namespace: ${NAMESPACE}
spec:
ttlSecondsAfterFinished: 300
backoffLimit: 1
template:
spec:
serviceAccountName: orchard
restartPolicy: Never
containers:
- name: test-runner
image: deps.global.bsf.tools/docker/python:3.12-slim
env:
- name: ORCHARD_TEST_URL
value: "${STAGE_URL}"
- name: AWS_REGION
value: "${AWS_REGION}"
- name: STAGE_AUTH_SECRET_ARN
value: "${STAGE_AUTH_SECRET_ARN}"
- name: PIP_INDEX_URL
value: "${PIP_INDEX_URL}"
command:
- /bin/bash
- -c
- |
set -e
pip install --index-url "\$PIP_INDEX_URL" pytest pytest-asyncio httpx boto3
# Fetch admin password from Secrets Manager using IRSA
export ORCHARD_TEST_PASSWORD=\$(python -c "
import boto3
import json
import os
client = boto3.client('secretsmanager', region_name=os.environ['AWS_REGION'])
secret = client.get_secret_value(SecretId=os.environ['STAGE_AUTH_SECRET_ARN'])
data = json.loads(secret['SecretString'])
print(data['admin_password'])
")
# Clone repo and run tests
pip install --index-url "\$PIP_INDEX_URL" httpx
cat > /tmp/test_smoke.py << 'TESTEOF'
import os
import httpx
def test_health():
url = os.environ["ORCHARD_TEST_URL"]
r = httpx.get(f"{url}/health", timeout=30)
assert r.status_code == 200
def test_login():
url = os.environ["ORCHARD_TEST_URL"]
password = os.environ["ORCHARD_TEST_PASSWORD"]
with httpx.Client(base_url=url, timeout=30) as client:
r = client.post("/api/v1/auth/login", json={"username": "admin", "password": password})
assert r.status_code == 200, f"Login failed: {r.status_code} {r.text}"
def test_api():
url = os.environ["ORCHARD_TEST_URL"]
r = httpx.get(f"{url}/api/v1/projects", timeout=30)
assert r.status_code == 200
TESTEOF
python -m pytest /tmp/test_smoke.py -v
EOF
- |
echo "Waiting for test job to complete..."
kubectl wait --for=condition=complete --timeout=15m job/integration-test-${CI_PIPELINE_ID} -n ${NAMESPACE} || {
echo "Job failed or timed out. Fetching logs..."
kubectl logs job/integration-test-${CI_PIPELINE_ID} -n ${NAMESPACE} || true
kubectl delete job integration-test-${CI_PIPELINE_ID} -n ${NAMESPACE} || true
exit 1
}
- kubectl logs job/integration-test-${CI_PIPELINE_ID} -n ${NAMESPACE}
- kubectl delete job integration-test-${CI_PIPELINE_ID} -n ${NAMESPACE} || true
ORCHARD_TEST_URL: $STAGE_URL
ORCHARD_TEST_PASSWORD: $STAGE_ADMIN_PASSWORD
rules:
- if: '$CI_COMMIT_BRANCH == "main"'
when: on_success