19 Commits

Author SHA1 Message Date
Mondo Diaz
b2a860bafd Add pre-test stage reset to ensure known environment state
- Add reset_stage_pre job that runs after deploy_stage but before integration tests
- Extract reset script into reusable .reset_stage_template
- Ensures stage environment is in known state even if manually modified
- Pipeline flow: deploy_stage → reset_stage_pre → integration_test_stage → reset_stage
2026-01-23 23:06:47 +00:00
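The ordering described above is enforced through each job's `needs:` entry; a minimal sketch of the resulting chain, distilled from the .gitlab-ci.yml diff below:

reset_stage_pre:
  needs: [deploy_stage]
integration_test_stage:
  needs: [reset_stage_pre]
reset_stage:
  needs: [integration_test_stage]
  allow_failure: true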
Dane Moss
b5579f1643 Merge branch 'release_0.5.1' into 'main'
add CL entry to bump version

See merge request esv/bsf/bsf-integration/orchard/orchard-mvp!43
2026-01-23 15:37:09 -07:00
Dane Moss
fafa03e4ce add CL entry to bump version 2026-01-23 15:37:09 -07:00
Mondo Diaz
d4b2da3232 Merge branch 'fix/release-wait-for-stage-tests' into 'main'
Add gitleaks fingerprint for test file false positive

See merge request esv/bsf/bsf-integration/orchard/orchard-mvp!42
2026-01-23 16:16:03 -06:00
Mondo Diaz
7b04bbdf05 Add gitleaks fingerprint for test file false positive 2026-01-23 16:16:02 -06:00
Mondo Diaz
3a807870a3 Merge branch 'fix/ci-prod-namespace' into 'main'
Fix production CI deployment and simplify tag pipeline

See merge request esv/bsf/bsf-integration/orchard/orchard-mvp!41
2026-01-23 15:50:24 -06:00
Mondo Diaz
f966fde7df Fix production CI deployment and simplify tag pipeline 2026-01-23 15:50:24 -06:00
Mondo Diaz
133d9cbfd6 Merge branch 'bump_version' into 'main'
add changelog entry to cut a new release

See merge request esv/bsf/bsf-integration/orchard/orchard-mvp!40
2026-01-23 13:00:51 -06:00
Dane Moss
276b4f2743 add changelog entry to cut a new release 2026-01-23 10:46:20 -07:00
Mondo Diaz
67ac6bb3f8 Merge branch 'fix/factory-reset-admin-user' into 'main'
Update CHANGELOG with factory reset fixes (#60)

See merge request esv/bsf/bsf-integration/orchard/orchard-mvp!39
2026-01-23 09:33:01 -06:00
Mondo Diaz
b0bb3ed569 Update CHANGELOG with factory reset fixes (#60) 2026-01-21 23:44:45 +00:00
Mondo Diaz
1ac75e1017 Fix factory reset and improve reset_stage CI job
- Add create_default_admin() call to factory reset (admin user wasn't being
  created after reset, only on server restart)
- Add retry logic to reset_stage CI job (3 attempts with 5s delay)
- Use proper context manager for httpx client
- Increase timeout to 120s for reset operation
- Add retry: 1 at job level for transient failures
2026-01-21 23:20:48 +00:00
Mondo Diaz
693613f111 Fix factory reset - capture username before dropping tables 2026-01-21 23:18:29 +00:00
Mondo Diaz
9da4ae8c0d Add gitleaks fingerprint for test file false positive 2026-01-21 22:59:08 +00:00
Mondo Diaz
7ffdc64364 Fix seed_database call in factory reset - pass fresh db session 2026-01-21 22:51:03 +00:00
Mondo Diaz
6abc0c88b0 Merge branch 'feature/stage-reset-job' into 'main'
Fix reset_stage job to read STAGE_URL from environment

See merge request esv/bsf/bsf-integration/orchard/orchard-mvp!38
2026-01-21 16:39:39 -06:00
Mondo Diaz
e96dc5cde8 Fix reset_stage job to read STAGE_URL from environment 2026-01-21 22:25:04 +00:00
Mondo Diaz
cba5bac383 Merge branch 'feature/stage-reset-job' into 'main'
Add factory reset endpoint for stage environment cleanup (#54)

See merge request esv/bsf/bsf-integration/orchard/orchard-mvp!37
2026-01-21 16:00:02 -06:00
Mondo Diaz
535280a783 Add factory reset endpoint for stage environment cleanup (#54) 2026-01-21 16:00:02 -06:00
6 changed files with 358 additions and 19 deletions

.gitlab-ci.yml

@@ -11,6 +11,12 @@ variables:
# Environment URLs (used by deploy and test jobs)
STAGE_URL: https://orchard-stage.common.global.bsf.tools
PROD_URL: https://orchard.common.global.bsf.tools
# Stage environment AWS resources (used by reset job)
STAGE_RDS_HOST: orchard-stage.cluster-cvw3jzjkozoc.us-gov-west-1.rds.amazonaws.com
STAGE_RDS_DBNAME: postgres
STAGE_SECRET_ARN: "arn:aws-us-gov:secretsmanager:us-gov-west-1:052673043337:secret:rds!cluster-a573672b-1a38-4665-a654-1b7df37b5297-IaeFQL"
STAGE_S3_BUCKET: orchard-artifacts-stage
AWS_REGION: us-gov-west-1
# Shared pip cache directory
PIP_CACHE_DIR: "$CI_PROJECT_DIR/.pip-cache"
@@ -30,9 +36,68 @@ stages:
- analyze
- deploy
# Override Prosper template jobs to exclude tag pipelines
# Tags only run deploy_prod and smoke_test_prod (image already built on main)
build_image:
rules:
- if: '$CI_COMMIT_TAG'
when: never
- when: on_success
test_image:
rules:
- if: '$CI_COMMIT_TAG'
when: never
- when: on_success
hadolint:
rules:
- if: '$CI_COMMIT_TAG'
when: never
- when: on_success
kics:
variables:
KICS_CONFIG: kics.config
rules:
- if: '$CI_COMMIT_TAG'
when: never
- when: on_success
secrets:
rules:
- if: '$CI_COMMIT_TAG'
when: never
- when: on_success
app_deps_scan:
rules:
- if: '$CI_COMMIT_TAG'
when: never
- when: on_success
cve_scan:
rules:
- if: '$CI_COMMIT_TAG'
when: never
- when: on_success
app_sbom_analysis:
rules:
- if: '$CI_COMMIT_TAG'
when: never
- when: on_success
cve_sbom_analysis:
rules:
- if: '$CI_COMMIT_TAG'
when: never
- when: on_success
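# A hypothetical refactor, not part of this change: the identical tag-exclusion
# rules repeated above could be defined once and shared via a YAML anchor:
#
#   .skip_on_tag: &skip_on_tag
#     rules:
#       - if: '$CI_COMMIT_TAG'
#         when: never
#       - when: on_success
#
#   build_image:
#     <<: *skip_on_tag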
# Override release job to wait for stage integration tests before creating tag
# This ensures the tag (which triggers prod deploy) is only created after stage passes
release:
needs: [integration_test_stage, changelog]
# Full integration test suite template (for feature/stage deployments)
# Runs the complete pytest integration test suite against the deployed environment
@@ -131,16 +196,106 @@ kics:
sys.exit(0)
PYTEST_SCRIPT
-# Integration tests for stage deployment (full suite)
# Reset stage template - shared by pre and post test reset jobs
# Calls the /api/v1/admin/factory-reset endpoint which handles DB and S3 cleanup
.reset_stage_template: &reset_stage_template
stage: deploy
image: deps.global.bsf.tools/docker/python:3.12-slim
timeout: 5m
retry: 1 # Retry once on transient failures
before_script:
- pip install --index-url "$PIP_INDEX_URL" httpx
script:
- |
python - <<'RESET_SCRIPT'
import httpx
import sys
import os
import time
BASE_URL = os.environ.get("STAGE_URL", "")
ADMIN_USER = "admin"
ADMIN_PASS = "changeme123" # Default admin password
MAX_RETRIES = 3
RETRY_DELAY = 5 # seconds
if not BASE_URL:
print("ERROR: STAGE_URL environment variable not set")
sys.exit(1)
print(f"=== Resetting stage environment at {BASE_URL} ===")
def do_reset():
with httpx.Client(base_url=BASE_URL, timeout=120.0) as client:
# Login as admin
print("Logging in as admin...")
login_response = client.post(
"/api/v1/auth/login",
json={"username": ADMIN_USER, "password": ADMIN_PASS},
)
if login_response.status_code != 200:
raise Exception(f"Login failed: {login_response.status_code} - {login_response.text}")
print("Login successful")
# Call factory reset endpoint
print("Calling factory reset endpoint...")
reset_response = client.post(
"/api/v1/admin/factory-reset",
headers={"X-Confirm-Reset": "yes-delete-all-data"},
)
if reset_response.status_code == 200:
result = reset_response.json()
print("Factory reset successful!")
print(f" Database tables dropped: {result['results']['database_tables_dropped']}")
print(f" S3 objects deleted: {result['results']['s3_objects_deleted']}")
print(f" Database reinitialized: {result['results']['database_reinitialized']}")
print(f" Seeded: {result['results']['seeded']}")
return True
else:
raise Exception(f"Factory reset failed: {reset_response.status_code} - {reset_response.text}")
# Retry loop
for attempt in range(1, MAX_RETRIES + 1):
try:
print(f"Attempt {attempt}/{MAX_RETRIES}")
if do_reset():
sys.exit(0)
except Exception as e:
print(f"Attempt {attempt} failed: {e}")
if attempt < MAX_RETRIES:
print(f"Retrying in {RETRY_DELAY} seconds...")
time.sleep(RETRY_DELAY)
else:
print("All retry attempts failed")
sys.exit(1)
RESET_SCRIPT
rules:
- if: '$CI_COMMIT_BRANCH == "main"'
when: on_success
# Reset stage BEFORE integration tests (ensure known state)
reset_stage_pre:
<<: *reset_stage_template
needs: [deploy_stage]
# Integration tests for stage deployment (full suite)
integration_test_stage:
<<: *integration_test_template
-needs: [deploy_stage]
+needs: [reset_stage_pre]
variables:
ORCHARD_TEST_URL: $STAGE_URL
rules:
- if: '$CI_COMMIT_BRANCH == "main"'
when: on_success
# Reset stage AFTER integration tests (clean slate for next run)
reset_stage:
<<: *reset_stage_template
needs: [integration_test_stage]
allow_failure: true # Don't fail pipeline if reset has issues
# Integration tests for feature deployment (full suite)
integration_test_feature:
<<: *integration_test_template
@@ -183,6 +338,10 @@ python_unit_tests:
coverage_format: cobertura
path: backend/coverage.xml
coverage: '/TOTAL.*\s+(\d+%)/'
rules:
- if: '$CI_COMMIT_TAG'
when: never
- when: on_success
# Run frontend tests
frontend_tests:
@@ -212,6 +371,10 @@ frontend_tests:
coverage_format: cobertura
path: frontend/coverage/cobertura-coverage.xml
coverage: '/All files[^|]*\|[^|]*\s+([\d\.]+)/'
rules:
- if: '$CI_COMMIT_TAG'
when: never
- when: on_success
# Shared deploy configuration
.deploy_template: &deploy_template
@@ -339,12 +502,11 @@ cleanup_feature:
# Deploy to production (version tags only)
deploy_prod:
stage: deploy
-# For tag pipelines, most jobs don't run (trusting main was tested)
-# We only need build_image to have the image available
-needs: [build_image]
+# For tag pipelines, no other jobs run - image was already built when commit was on main
+needs: []
image: deps.global.bsf.tools/registry-1.docker.io/alpine/k8s:1.29.12
variables:
-NAMESPACE: orch-prod-namespace
+NAMESPACE: orch-namespace
VALUES_FILE: helm/orchard/values-prod.yaml
BASE_URL: $PROD_URL
before_script:

.gitleaks.toml Normal file

@@ -0,0 +1,8 @@
# Gitleaks configuration
# https://github.com/gitleaks/gitleaks#configuration
[allowlist]
# Test files that contain variable names matching secret patterns (e.g., s3_key)
paths = [
'''backend/tests/.*\.py''',
]
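
To check the allowlist locally, gitleaks can be run against the working tree with this config (flag names per the gitleaks CLI):

gitleaks detect --source . --config .gitleaks.toml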

.gitleaksignore

@@ -4,6 +4,7 @@
# False positive: s3_key is an attribute name in test assertions, not a secret
# These are historical commits - files have since been deleted or updated with inline comments
7e68baed0886a3c928644cd01aa3b39f92d4f976:backend/tests/test_duplicate_detection.py:generic-api-key:154
81458b3bcb5ace97109ba4c16f4afa6e55b1b8bd:backend/tests/test_duplicate_detection.py:generic-api-key:154
2f1891cf0126ec0e7d4c789d872a2cb2dd3a1745:backend/tests/unit/test_storage.py:generic-api-key:381
10d36947948de796f0bacea3827f4531529c405d:backend/tests/unit/test_storage.py:generic-api-key:381
bccbc71c13570d14b8b26a11335c45f102fe3072:backend/tests/unit/test_storage.py:generic-api-key:381
@@ -15,3 +16,4 @@ bccbc71c13570d14b8b26a11335c45f102fe3072:backend/tests/unit/test_storage.py:gene
08dce6cbb836b687002751fed4159bfc2da61f8b:backend/tests/unit/test_storage.py:generic-api-key:381
617bcbe89cff9a009d77e4f1f1864efed1820e63:backend/tests/unit/test_storage.py:generic-api-key:381
1cbd33544388e0fe6db752fa8886fab33cf9ce7c:backend/tests/unit/test_storage.py:generic-api-key:381
7cfad28f678f5a5b8b927d694a17b9ba446b7138:backend/tests/unit/test_storage.py:generic-api-key:381
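
The "inline comments" mentioned above refer to gitleaks' line-level allowlist marker. A hypothetical sketch of how such a test line would be annotated (the assertion itself is illustrative, not from the repo):

# s3_key here is a storage path attribute, not a credential
assert artifact.s3_key == "ab/cd/abcdef0123456789"  # gitleaks:allow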

CHANGELOG.md

@@ -6,24 +6,27 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Changed
- Added pre-test stage reset to ensure known environment state before integration tests (#54)
## [0.5.1] - 2026-01-23
### Changed
- Simplified tag pipeline to only run deploy and smoke tests (image already built on main) (#54)
### Fixed
- Fixed production CI deployment namespace to use correct `orch-namespace` (#54)
- Added gitleaks config to allowlist test files from secret scanning (#54)
## [0.5.0] - 2026-01-23
### Added
- Added factory reset endpoint `POST /api/v1/admin/factory-reset` for test environment cleanup (#54)
- Requires admin authentication and `X-Confirm-Reset: yes-delete-all-data` header
- Drops all database tables, clears S3 bucket, reinitializes schema, re-seeds default data
- CI pipeline automatically calls this after integration tests on stage
- Added `delete_all()` method to storage backend for bulk S3 object deletion (#54)
- Added AWS Secrets Manager CSI driver support for database credentials (#54)
- Added SecretProviderClass template for Secrets Manager integration (#54)
- Added IRSA service account annotations for prod and stage environments (#54)
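For context, a SecretProviderClass for the AWS provider generally follows the shape below (a sketch with hypothetical names; the repo's actual template is not shown in this view, and the ARN would presumably be the STAGE_SECRET_ARN defined in .gitlab-ci.yml):

apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
  name: orchard-db-credentials  # hypothetical name
spec:
  provider: aws
  parameters:
    objects: |
      - objectName: "arn:aws-us-gov:secretsmanager:us-gov-west-1:...:secret:..."
        objectType: "secretsmanager"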
### Changed
- Configured stage and prod to use AWS RDS instead of PostgreSQL subchart (#54)
- Configured stage and prod to use AWS S3 instead of MinIO subchart (#54)
- Changed prod deployment from manual to automatic on version tags (#54)
- Updated S3 client to support IRSA credentials when no explicit keys provided (#54)
- Changed prod image pullPolicy to Always (#54)
- Added proxy-body-size annotation to prod ingress for large uploads (#54)
### Removed
- Disabled PostgreSQL subchart for stage and prod environments (#54)
- Disabled MinIO subchart for stage and prod environments (#54)
### Added
- Added comprehensive upload/download tests for size boundaries (1B to 1GB) (#38)
- Added concurrent upload/download tests (2, 5, 10 parallel operations) (#38)
- Added data integrity tests (binary, text, unicode, compressed content) (#38)
@@ -78,6 +81,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added internal proxy configuration for npm, pip, helm, and apt (#51)
### Changed
- Configured stage and prod to use AWS RDS instead of PostgreSQL subchart (#54)
- Configured stage and prod to use AWS S3 instead of MinIO subchart (#54)
- Changed prod deployment from manual to automatic on version tags (#54)
- Updated S3 client to support IRSA credentials when no explicit keys provided (#54)
- Changed prod image pullPolicy to Always (#54)
- Added proxy-body-size annotation to prod ingress for large uploads (#54)
- CI integration tests now run full pytest suite (~350 tests) against deployed environment instead of 3 smoke tests
- CI production deployment uses lightweight smoke tests only (no test data creation in prod)
- CI pipeline improvements: shared pip cache, `interruptible` flag on test jobs, retry on integration tests
@@ -98,6 +107,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Improved pod naming: Orchard pods now named `orchard-{env}-server-*` for clarity (#51)
### Fixed
- Fixed factory reset not creating default admin user after reset (#60)
- Admin user was only created at server startup, not after factory reset
- CI reset job would fail to login because admin user didn't exist
- Improved reset_stage CI job reliability (#60)
- Added application-level retry logic (3 attempts with 5s delay)
- Added job-level retry for transient failures
- Fixed httpx client to use proper context manager
- Increased timeout to 120s for reset operations
- Fixed CI integration test rate limiting: added configurable `ORCHARD_LOGIN_RATE_LIMIT` env var, relaxed to 1000/minute for dev/stage
- Fixed duplicate `TestSecurityEdgeCases` class definition in test_auth_api.py
- Fixed integration tests auth: session-scoped client, configurable credentials via env vars, fail-fast on auth errors
@@ -118,6 +135,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Removed
- Removed unused `store_streaming()` method from storage.py (#51)
- Disabled PostgreSQL subchart for stage and prod environments (#54)
- Disabled MinIO subchart for stage and prod environments (#54)
## [0.4.0] - 2026-01-12
### Added

backend API routes (file path not shown in this view)

@@ -6390,3 +6390,121 @@ def get_artifact_provenance(
tags=tag_list,
uploads=upload_history,
)
# =============================================================================
# Factory Reset Endpoint (Admin Only)
# =============================================================================
@router.post("/api/v1/admin/factory-reset", tags=["admin"])
def factory_reset(
request: Request,
db: Session = Depends(get_db),
storage: S3Storage = Depends(get_storage),
current_user: User = Depends(require_admin),
):
"""
Factory reset - delete all data and restore to initial state.
This endpoint:
1. Drops all database tables
2. Deletes all objects from S3 storage
3. Recreates the database schema
4. Re-seeds with default admin user
Requires:
- Admin authentication
- X-Confirm-Reset header set to "yes-delete-all-data"
WARNING: This is a destructive operation that cannot be undone.
"""
# Require explicit confirmation header
confirm_header = request.headers.get("X-Confirm-Reset")
if confirm_header != "yes-delete-all-data":
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Factory reset requires X-Confirm-Reset header set to 'yes-delete-all-data'",
)
# Capture username before we drop tables (user object will become invalid)
admin_username = current_user.username
logger.warning(f"Factory reset initiated by admin user: {admin_username}")
results = {
"database_tables_dropped": 0,
"s3_objects_deleted": 0,
"database_reinitialized": False,
"seeded": False,
}
try:
# Step 1: Drop all tables in public schema
logger.info("Dropping all database tables...")
drop_result = db.execute(
text("""
DO $$
DECLARE
r RECORD;
table_count INT := 0;
BEGIN
SET session_replication_role = 'replica';
FOR r IN (SELECT tablename FROM pg_tables WHERE schemaname = 'public') LOOP
EXECUTE 'DROP TABLE IF EXISTS public.' || quote_ident(r.tablename) || ' CASCADE';
table_count := table_count + 1;
END LOOP;
SET session_replication_role = 'origin';
RAISE NOTICE 'Dropped % tables', table_count;
END $$;
""")
)
db.commit()
# Count tables that were dropped
count_result = db.execute(
text("SELECT COUNT(*) FROM pg_tables WHERE schemaname = 'public'")
)
remaining_tables = count_result.scalar()
results["database_tables_dropped"] = "all"
logger.info(f"Database tables dropped, remaining: {remaining_tables}")
# Step 2: Delete all S3 objects
logger.info("Deleting all S3 objects...")
results["s3_objects_deleted"] = storage.delete_all()
# Step 3: Reinitialize database schema
logger.info("Reinitializing database schema...")
from .database import init_db, SessionLocal
init_db()
results["database_reinitialized"] = True
# Step 4: Re-seed with default data (need fresh session after schema recreate)
logger.info("Seeding database with defaults...")
from .seed import seed_database
from .auth import create_default_admin
fresh_db = SessionLocal()
try:
# Create default admin user first (normally done at startup)
create_default_admin(fresh_db)
# Then seed other test data
seed_database(fresh_db)
fresh_db.commit()
finally:
fresh_db.close()
results["seeded"] = True
logger.warning(f"Factory reset completed by {admin_username}")
return {
"status": "success",
"message": "Factory reset completed successfully",
"results": results,
}
except Exception as e:
logger.error(f"Factory reset failed: {e}")
db.rollback()
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Factory reset failed: {str(e)}",
)
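
For manual cleanup of a disposable environment, the endpoint can also be exercised by hand; a sketch with curl, assuming cookie-based sessions as implied by the CI reset script (log in first, then reuse the session cookie):

curl -c /tmp/orchard.cookies -X POST "$STAGE_URL/api/v1/auth/login" \
  -H 'Content-Type: application/json' \
  -d '{"username": "admin", "password": "<admin-password>"}'
curl -b /tmp/orchard.cookies -X POST "$STAGE_URL/api/v1/admin/factory-reset" \
  -H 'X-Confirm-Reset: yes-delete-all-data'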

storage.py

@@ -835,6 +835,36 @@ class S3Storage:
except ClientError:
return False
def delete_all(self) -> int:
"""
Delete all objects in the bucket.
Returns:
Number of objects deleted
"""
deleted_count = 0
try:
paginator = self.client.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=self.bucket):
objects = page.get("Contents", [])
if not objects:
continue
# Delete objects in batches of 1000 (S3 limit)
delete_keys = [{"Key": obj["Key"]} for obj in objects]
if delete_keys:
self.client.delete_objects(
Bucket=self.bucket, Delete={"Objects": delete_keys}
)
deleted_count += len(delete_keys)
logger.info(f"Deleted {len(delete_keys)} objects from S3")
logger.info(f"Total objects deleted from S3: {deleted_count}")
return deleted_count
except ClientError as e:
logger.error(f"Failed to delete all S3 objects: {e}")
raise
def generate_presigned_url(
self,
s3_key: str,