#!/bin/bash
# Fellowship EC2 Instance Setup Script
# This script is downloaded from S3 and executed by user_data.sh
# Contains all setup logic: Docker, Docker Compose, DevOps Escape Room, and Fellowship SUT

set -e

# Logging setup - redirect all output to log file
LOG_FILE="/var/log/user-data.log"
exec > >(tee -a "$LOG_FILE") 2>&1

# Function to log with timestamp
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
}

IMDS_BASE_URL="http://169.254.169.254/latest"
IMDS_TOKEN=""

get_imds_token() {
    if [ -n "$IMDS_TOKEN" ]; then
        echo "$IMDS_TOKEN"
        return 0
    fi
    IMDS_TOKEN=$(curl -s --max-time 5 --connect-timeout 2 -X PUT "${IMDS_BASE_URL}/api/token" \
        -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null || echo "")
    if [ -n "$IMDS_TOKEN" ]; then
        echo "$IMDS_TOKEN"
        return 0
    fi
    return 1
}

get_instance_metadata() {
    local path="$1"
    local token
    token=$(get_imds_token 2>/dev/null || echo "")
    if [ -n "$token" ]; then
        curl -s --max-time 5 --connect-timeout 2 -H "X-aws-ec2-metadata-token: ${token}" \
            "${IMDS_BASE_URL}/meta-data/${path}" 2>/dev/null || echo ""
    else
        curl -s --max-time 5 --connect-timeout 2 "${IMDS_BASE_URL}/meta-data/${path}" 2>/dev/null || echo ""
    fi
}

log "=========================================="
log "Fellowship Setup Script Started"
log "=========================================="

# Get AWS region with retries and fallback
AWS_REGION=""
for i in {1..5}; do
    AWS_REGION=$(get_instance_metadata "placement/region")
    [ -n "$AWS_REGION" ] && break
    [ $i -lt 5 ] && sleep 2
done
[ -z "$AWS_REGION" ] && AWS_REGION="eu-west-1" && log "Using default region: $AWS_REGION" || log "Region: $AWS_REGION"

# Function to wait for yum lock
wait_for_yum() {
    while sudo fuser /var/run/yum.pid >/dev/null 2>&1; do
        log "Waiting for yum lock to be released..."
        sleep 5
    done
}

# Helper function to run docker commands as ec2-user with proper group membership
run_as_ec2user_docker() {
    local cmd="$1"
    # Use sg (switch group) to ensure docker group is active in the subshell
    # This is more reliable than su - which may not pick up new group membership immediately
    sg docker -c "su - ec2-user -c '$cmd'"
}

ensure_swap_for_small_instances() {
    if [ -f /swapfile ]; then
        return 0
    fi
    local mem_mb
    mem_mb=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo 2>/dev/null || echo "0")
    if [ "$mem_mb" -ge 3500 ]; then
        return 0
    fi
    log "Low-memory instance detected (${mem_mb}MB). Creating 2GB swap to reduce bootstrap OOM risk..."
    if command -v fallocate >/dev/null 2>&1; then
        fallocate -l 2G /swapfile || dd if=/dev/zero of=/swapfile bs=1M count=2048
    else
        dd if=/dev/zero of=/swapfile bs=1M count=2048
    fi
    chmod 600 /swapfile
    mkswap /swapfile >/dev/null 2>&1 || true
    swapon /swapfile >/dev/null 2>&1 || true
    grep -q '^/swapfile' /etc/fstab || echo '/swapfile none swap sw 0 0' >> /etc/fstab
    log "✓ Swap configured"
}

# Wait for any existing yum processes to complete
log "Checking for yum locks..."
wait_for_yum

# Update and install dependencies
log "Installing Docker and Git..."
yum update -y
wait_for_yum
yum install -y docker git
systemctl start docker
systemctl enable docker
usermod -aG docker ec2-user
ensure_swap_for_small_instances
log "✓ Docker installed and started"

# Install Docker Compose plugin
log "Installing Docker Compose plugin..."
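# NOTE: the Compose plugin below is pinned to v2.27.0 and to the linux-x86_64
# release asset. If this script were ever run on an arm64 (Graviton) instance,
# the asset name would need to follow the machine architecture; a minimal,
# currently unused sketch:
#   COMPOSE_ARCH=$(uname -m)   # "x86_64" on the instances this script targets, "aarch64" on Graviton
#   COMPOSE_URL="https://github.com/docker/compose/releases/download/v2.27.0/docker-compose-linux-${COMPOSE_ARCH}"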
mkdir -p /home/ec2-user/.docker/cli-plugins/
if curl -fSL https://github.com/docker/compose/releases/download/v2.27.0/docker-compose-linux-x86_64 -o /home/ec2-user/.docker/cli-plugins/docker-compose; then
    chmod +x /home/ec2-user/.docker/cli-plugins/docker-compose
    chown -R ec2-user:ec2-user /home/ec2-user/.docker
    log "✓ Docker Compose plugin installed"
else
    log "ERROR: Failed to download Docker Compose plugin"
    exit 1
fi

dump_runtime_diagnostics() {
    log "Runtime diagnostics (docker compose ps):"
    run_as_ec2user_docker "cd ~ && docker compose ps" 2>&1 | sed 's/^/    /' || true
    log "Runtime diagnostics (caddy logs tail):"
    run_as_ec2user_docker "cd ~ && docker compose logs caddy --tail 120" 2>&1 | sed 's/^/    /' || true
    log "Runtime diagnostics (frontend logs tail):"
    run_as_ec2user_docker "cd ~ && docker compose logs frontend --tail 120" 2>&1 | sed 's/^/    /' || true
    log "Runtime diagnostics (backend logs tail):"
    run_as_ec2user_docker "cd ~ && docker compose logs backend --tail 120" 2>&1 | sed 's/^/    /' || true
}

# Function to verify network connectivity to external services (esp. github.com)
verify_network_connectivity() {
    log "Verifying network connectivity to external services..."
    local connectivity_ok=true

    # Test DNS resolution for github.com
    log "  Testing DNS resolution for github.com..."
    if ! getent hosts github.com >/dev/null 2>&1; then
        log "  WARNING: DNS resolution for github.com failed initially, retrying..."
        sleep 2
        if ! getent hosts github.com >/dev/null 2>&1; then
            log "  ERROR: DNS resolution for github.com failed"
            connectivity_ok=false
        else
            log "  OK: DNS resolution for github.com OK after retry"
        fi
    else
        log "  OK: DNS resolution for github.com OK"
    fi

    # Test HTTP connectivity to github.com
    log "  Testing HTTP connectivity to github.com..."
    if ! timeout 10 curl -s -o /dev/null --head https://github.com 2>/dev/null; then
        log "  WARNING: Cannot reach github.com, retrying..."
        sleep 2
        if ! timeout 10 curl -s -o /dev/null --head https://github.com 2>/dev/null; then
            log "  ERROR: Cannot reach github.com (required for docker builds)"
            connectivity_ok=false
        else
            log "  OK: Connectivity to github.com OK after retry"
        fi
    else
        log "  OK: Connectivity to github.com OK"
    fi

    if [ "$connectivity_ok" = "true" ]; then
        return 0
    else
        return 1
    fi
}

# DevOps Escape Room stack (Jenkins + Gitea + code-server + MailHog)
log "Setting up DevOps Escape Room stack..."

# Derive DevOps HTTPS subdomain names early if CADDY_DOMAIN is already available
# from the user_data environment variable (the common case for classroom instances).
# If CADDY_DOMAIN is not set yet (EC2-tag fallback), these will be re-derived later
# once the domain is confirmed.
if [ -n "${CADDY_DOMAIN:-}" ]; then
    JENKINS_DOMAIN="${JENKINS_DOMAIN:-jenkins-${CADDY_DOMAIN}}"
    IDE_DOMAIN="${IDE_DOMAIN:-ide-${CADDY_DOMAIN}}"
    log "DevOps HTTPS subdomains derived from CADDY_DOMAIN:"
    log "  Jenkins: ${JENKINS_DOMAIN}"
    log "  IDE:     ${IDE_DOMAIN}"
else
    JENKINS_DOMAIN=""
    IDE_DOMAIN=""
fi

# The devops-escape-room directory ships with the SUT tarball, but the SUT
# extraction happens later. Write an inline compose file here so the stack can
# start in parallel. After the SUT is extracted the files will be replaced by
# the version from the tarball, which is identical.
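# Directory layout written inline below (mirrors the tarball layout):
#   jenkins/               Dockerfile, plugins.txt, casc/jenkins.yaml
#   gitea/                 init.sh (org/repo bootstrap + first push)
#   devops-escape-room/    docker-compose.yml, code-server/ (Dockerfile, entrypoint.sh)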
mkdir -p /home/ec2-user/devops-escape-room
mkdir -p /home/ec2-user/jenkins/casc
mkdir -p /home/ec2-user/gitea

# ── Jenkins Dockerfile & plugins ────────────────────────────────────────────
cat > /home/ec2-user/jenkins/Dockerfile << 'JENKINSEOF'
FROM jenkins/jenkins:lts-jdk17

ENV JAVA_OPTS="-Djenkins.install.runSetupWizard=false"
ENV CASC_JENKINS_CONFIG=/var/jenkins_home/casc_configs

USER root
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip nodejs npm curl docker.io \
    && rm -rf /var/lib/apt/lists/* \
    && usermod -aG docker jenkins || true
USER jenkins

COPY plugins.txt /usr/share/jenkins/plugins.txt
RUN jenkins-plugin-cli --plugin-file /usr/share/jenkins/plugins.txt --latest false

COPY casc/ /var/jenkins_home/casc_configs/
JENKINSEOF

cat > /home/ec2-user/jenkins/plugins.txt << 'PLUGINSEOF'
workflow-aggregator
pipeline-stage-view
blueocean
git
gitea
configuration-as-code
job-dsl
junit
htmlpublisher
build-timeout
credentials
credentials-binding
plain-credentials
ssh-credentials
mailer
email-ext
dashboard-view
build-monitor-plugin
docker-workflow
timestamper
ws-cleanup
antisamy-markup-formatter
PLUGINSEOF

cat > /home/ec2-user/jenkins/casc/jenkins.yaml << 'CASCEOF'
jenkins:
  systemMessage: |
    🧙 Welcome to the Fellowship's Jenkins CI!
    One does not simply skip the pipeline...
  numExecutors: 2
  securityRealm:
    local:
      allowsSignup: false
      users:
        - id: "fellowship"
          name: "Gandalf the Grey"
          password: "${JENKINS_ADMIN_PASSWORD:-fellowship123}"
  authorizationStrategy:
    loggedInUsersCanDoAnything:
      allowAnonymousRead: true
  globalNodeProperties:
    - envVars:
        env:
          - key: "GITEA_URL"
            value: "http://gitea:3000"
          - key: "SUT_REPO"
            value: "http://gitea:3000/fellowship/lotr-sut.git"

unclassified:
  location:
    url: "http://localhost:8080/"
    adminAddress: "gandalf@fellowship.local"
  mailer:
    smtpHost: "mailhog"
    smtpPort: "1025"
    useSsl: false
    charset: "UTF-8"

jobs:
  - script: |
      pipelineJob('fellowship-sut-pipeline') {
        displayName('Fellowship SUT — CI Pipeline')
        description('One pipeline to build them all, and in the darkness test them.')
        definition {
          cpsScm {
            scm {
              git {
                remote {
                  url('http://gitea:3000/fellowship/lotr-sut.git')
                }
                branch('*/main')
              }
            }
            scriptPath('Jenkinsfile')
          }
        }
        triggers {
          scm('H/5 * * * *')
        }
        logRotator {
          numToKeep(10)
        }
      }
CASCEOF

# ── Gitea init script ────────────────────────────────────────────────────────
cat > /home/ec2-user/gitea/init.sh << 'GITEAINITEOF'
#!/bin/sh
set -e

GITEA_URL="${GITEA_URL:-http://gitea:3000}"
ADMIN_USER="${GITEA_ADMIN_USER:-fellowship}"
ADMIN_PASS="${GITEA_ADMIN_PASSWORD:-fellowship123}"
ADMIN_EMAIL="${GITEA_ADMIN_EMAIL:-gandalf@fellowship.local}"
ORG_NAME="fellowship"
REPO_NAME="lotr-sut"

log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] [gitea-init] $*"; }

wait_for_gitea() {
    log "Waiting for Gitea at ${GITEA_URL}..."
    i=0
    while [ $i -lt 30 ]; do
        curl -sf "${GITEA_URL}/api/v1/version" > /dev/null 2>&1 && log "✓ Gitea ready" && return 0
        log "  attempt $((i+1))/30 — waiting 5s..."
        sleep 5
        i=$((i+1))
    done
    log "ERROR: Gitea not ready"
    return 1
}

wait_for_admin() {
    log "Waiting for Gitea admin user '${ADMIN_USER}' to be ready..."
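    # Poll the authenticated /api/v1/user endpoint: Basic Auth only starts
    # succeeding once Gitea has created the admin account from its
    # GITEA__admin__* environment variables, so this doubles as a readiness check.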
    i=0
    while [ $i -lt 30 ]; do
        curl -sf -u "${ADMIN_USER}:${ADMIN_PASS}" "${GITEA_URL}/api/v1/user" > /dev/null 2>&1 \
            && log "✓ Admin user ready" && return 0
        log "  attempt $((i+1))/30 — waiting 3s..."
        sleep 3
        i=$((i+1))
    done
    log "WARNING: Admin user not ready — proceeding anyway (some operations may fail)"
    return 0
}

api() { curl -sf -u "${ADMIN_USER}:${ADMIN_PASS}" "$@"; }

wait_for_gitea
wait_for_admin

# Create org (ignore if exists)
api -X POST "${GITEA_URL}/api/v1/orgs" \
    -H "Content-Type: application/json" \
    -d "{\"username\":\"${ORG_NAME}\",\"full_name\":\"The Fellowship of the Ring\",\"visibility\":\"public\"}" \
    > /dev/null 2>&1 || true
log "✓ Organization '${ORG_NAME}' ready"

# Create repo (ignore if exists)
api -X POST "${GITEA_URL}/api/v1/orgs/${ORG_NAME}/repos" \
    -H "Content-Type: application/json" \
    -d "{\"name\":\"${REPO_NAME}\",\"description\":\"LOTR SUT\",\"private\":false,\"auto_init\":false,\"default_branch\":\"main\"}" \
    > /dev/null 2>&1 || true
log "✓ Repository '${ORG_NAME}/${REPO_NAME}' ready"

# Push code if repo is empty
COMMITS=$(api "${GITEA_URL}/api/v1/repos/${ORG_NAME}/${REPO_NAME}/commits?limit=1" 2>/dev/null | grep -c '"sha"' || true)
if [ "${COMMITS:-0}" -gt 0 ]; then
    log "✓ Repository already has commits — skipping push"
    exit 0
fi

SRC=""
for d in /sut-source /home/ec2-user; do
    [ -f "${d}/docker-compose.yml" ] && [ -d "${d}/sut" ] && SRC="$d" && break
done
if [ -z "$SRC" ]; then
    log "WARNING: SUT source not found — skipping code push"
    exit 0
fi

log "Pushing code from ${SRC} to Gitea..."
AUTH_URL=$(echo "${GITEA_URL}/${ORG_NAME}/${REPO_NAME}.git" | sed "s|http://|http://${ADMIN_USER}:${ADMIN_PASS}@|")
TMP=$(mktemp -d)
cp -a "${SRC}/." "${TMP}/"
cd "${TMP}"
rm -rf .git
git init -b main
git config user.email "${ADMIN_EMAIL}"
git config user.name "Gandalf the Grey"
git add -A
git commit -m "🧙 Initial commit: The Fellowship's Quest List SUT"
git remote add gitea "${AUTH_URL}"
git push gitea main
cd /
rm -rf "${TMP}"
log "✓ SUT code pushed to Gitea"
GITEAINITEOF

chmod +x /home/ec2-user/gitea/init.sh

# ── code-server custom Dockerfile + entrypoint ──────────────────────────────
# Extends codercom/code-server with Docker CLI, Compose v2, and pre-installed
# VS Code extensions (Python, Playwright, Copilot, Jupyter, Prettier).
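# NOTE: the IDE container gets /var/run/docker.sock bind-mounted (see the compose
# file below), which is effectively root-equivalent access to the host Docker
# daemon. That is intentional for this classroom escape-room setup, but worth
# keeping in mind before reusing this image elsewhere.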
mkdir -p /home/ec2-user/devops-escape-room/code-server

cat > /home/ec2-user/devops-escape-room/code-server/Dockerfile << 'CSRVDOCKEREOF'
FROM codercom/code-server:latest

USER root

# Docker CLI + gosu (for clean privilege drop) + utilities
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        docker.io \
        gosu \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*

# Docker Compose v2 plugin + legacy symlink
RUN mkdir -p /usr/local/lib/docker/cli-plugins && \
    curl -fsSL \
        "https://github.com/docker/compose/releases/download/v2.27.0/docker-compose-linux-x86_64" \
        -o /usr/local/lib/docker/cli-plugins/docker-compose && \
    chmod +x /usr/local/lib/docker/cli-plugins/docker-compose && \
    ln -sf /usr/local/lib/docker/cli-plugins/docker-compose /usr/local/bin/docker-compose

# Coder user docker group (GID re-aligned at runtime by entrypoint)
RUN groupadd -g 999 docker 2>/dev/null || groupmod -g 999 docker 2>/dev/null || true && \
    usermod -aG docker coder

COPY entrypoint.sh /usr/bin/fellowship-docker-init.sh
RUN chmod +x /usr/bin/fellowship-docker-init.sh

USER root
ENTRYPOINT ["/usr/bin/fellowship-docker-init.sh"]
CSRVDOCKEREOF

cat > /home/ec2-user/devops-escape-room/code-server/entrypoint.sh << 'CSRVENTRYEOF'
#!/bin/bash
# Fellowship code-server entrypoint: fixes Docker GID, installs extensions, starts IDE
set -e

log() { echo "[$(date '+%H:%M:%S')] [fellowship-init] $*"; }

# Fix docker group GID to match host socket
if [ -S /var/run/docker.sock ]; then
    DOCK_GID=$(stat -c '%g' /var/run/docker.sock)
    if getent group docker > /dev/null 2>&1; then
        groupmod -g "${DOCK_GID}" docker 2>/dev/null || true
    else
        groupadd -g "${DOCK_GID}" docker 2>/dev/null || true
    fi
    usermod -aG docker coder 2>/dev/null || true
    chmod 666 /var/run/docker.sock 2>/dev/null || true
    log "Docker group GID aligned to ${DOCK_GID}"
else
    log "WARNING: docker.sock not mounted — docker unavailable in IDE terminal"
fi

# Install VS Code extensions as coder user
log "Installing VS Code extensions..."
for ext in ms-python.python github.copilot ms-playwright.playwright esbenp.prettier-vscode ms-toolsai.jupyter redhat.vscode-yaml ms-azuretools.vscode-docker; do
    gosu coder code-server --install-extension "${ext}" --force > /dev/null 2>&1 && \
        log "  OK   ${ext}" || log "  SKIP ${ext}"
done

# Default settings
SETTINGS_DIR="/home/coder/.local/share/code-server/User"
if [ ! -f "${SETTINGS_DIR}/settings.json" ]; then
    gosu coder mkdir -p "${SETTINGS_DIR}"
    cat > "${SETTINGS_DIR}/settings.json" << 'SETTINGSEOF'
{
    "python.defaultInterpreterPath": "/usr/bin/python3",
    "editor.formatOnSave": true,
    "terminal.integrated.defaultProfile.linux": "bash",
    "git.autofetch": true,
    "docker.host": "unix:///var/run/docker.sock"
}
SETTINGSEOF
    chown coder:coder "${SETTINGS_DIR}/settings.json"
    log "Default settings.json written"
fi

log "Starting code-server..."
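# Hand off to the stock code-server entrypoint as the unprivileged "coder" user.
# "$@" forwards the container CMD from the compose file (auth mode, bind address,
# and the /home/coder/fellowship workspace path).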
exec gosu coder /usr/bin/entrypoint.sh "$@"
CSRVENTRYEOF

chmod +x /home/ec2-user/devops-escape-room/code-server/entrypoint.sh
chown -R ec2-user:ec2-user /home/ec2-user/devops-escape-room/code-server
log "✓ code-server Dockerfile and entrypoint written"

# ── devops-escape-room docker-compose ────────────────────────────────────────
cat > /home/ec2-user/devops-escape-room/docker-compose.yml << 'COMPOSEEOF'
# Fellowship DevOps Escape Room Stack
# Jenkins CI | Gitea Git | code-server IDE | MailHog mail
#
# Jenkins:     http://HOST:8080 (fellowship / fellowship123)
# Gitea:       http://HOST:3030 (fellowship / fellowship123)
# code-server: http://HOST:8443 (password: fellowship)
# MailHog:     http://HOST:8025

services:
  jenkins:
    build:
      context: ../jenkins
      dockerfile: Dockerfile
    image: fellowship-jenkins:latest
    container_name: fellowship-jenkins
    restart: unless-stopped
    ports:
      - "8080:8080"
      - "50000:50000"
    volumes:
      - jenkins_home:/var/jenkins_home
      - /var/run/docker.sock:/var/run/docker.sock
    environment:
      JENKINS_ADMIN_PASSWORD: ${JENKINS_ADMIN_PASSWORD:-fellowship123}
      CASC_JENKINS_CONFIG: /var/jenkins_home/casc_configs
      # JENKINS_URL is written to devops-escape-room/.env by setup_fellowship.sh
      # once CADDY_DOMAIN is known, so Jenkins knows its canonical HTTPS URL.
      JENKINS_URL: ${JENKINS_URL:-http://localhost:8080/}
    depends_on:
      gitea:
        condition: service_healthy
    healthcheck:
      test: ["CMD-SHELL", "curl -sf http://localhost:8080/login || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 90s

  gitea:
    image: gitea/gitea:1.22
    container_name: fellowship-gitea
    restart: unless-stopped
    ports:
      - "3030:3000"
      - "2222:22"
    volumes:
      - gitea_data:/data
    environment:
      USER_UID: "1000"
      USER_GID: "1000"
      GITEA__database__DB_TYPE: sqlite3
      GITEA__server__DOMAIN: ${GITEA_DOMAIN:-localhost}
      GITEA__server__HTTP_PORT: "3000"
      GITEA__server__ROOT_URL: ${GITEA_ROOT_URL:-http://localhost:3030/}
      GITEA__server__SSH_DOMAIN: localhost
      GITEA__server__SSH_PORT: "2222"
      GITEA__service__DISABLE_REGISTRATION: "false"
      GITEA__service__REQUIRE_SIGNIN_VIEW: "false"
      GITEA__security__INSTALL_LOCK: "true"
      GITEA__mailer__ENABLED: "false"
      # Pre-create the admin user on first boot via Gitea environment variables.
      # Without these the gitea-init container cannot authenticate against the API.
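      # (GITEA__SECTION__KEY variables map onto the corresponding app.ini section,
      # e.g. GITEA__admin__ADMIN_USER ends up under [admin] as ADMIN_USER.)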
      GITEA__admin__ADMIN_USER: ${GITEA_ADMIN_USER:-fellowship}
      GITEA__admin__ADMIN_PASSWD: ${GITEA_ADMIN_PASSWORD:-fellowship123}
      GITEA__admin__ADMIN_EMAIL: ${GITEA_ADMIN_EMAIL:-gandalf@fellowship.local}
      GITEA__admin__SEND_NOTIFY: "false"
    healthcheck:
      test: ["CMD-SHELL", "curl -sf http://localhost:3000/api/v1/version || exit 1"]
      interval: 15s
      timeout: 5s
      retries: 6
      start_period: 30s

  gitea-init:
    image: alpine/git:latest
    container_name: fellowship-gitea-init
    restart: on-failure:5
    environment:
      GITEA_URL: "http://gitea:3000"
      GITEA_ADMIN_USER: ${GITEA_ADMIN_USER:-fellowship}
      GITEA_ADMIN_PASSWORD: ${GITEA_ADMIN_PASSWORD:-fellowship123}
      GITEA_ADMIN_EMAIL: ${GITEA_ADMIN_EMAIL:-gandalf@fellowship.local}
      GITEA_DOMAIN: ${GITEA_DOMAIN:-}
      SUT_SOURCE_DIR: /sut-source
    volumes:
      - ../gitea/init.sh:/init.sh:ro
      - /home/ec2-user/sut:/sut-source/sut:ro
      - /home/ec2-user/docker-compose.yml:/sut-source/docker-compose.yml:ro
      - /home/ec2-user/caddy:/sut-source/caddy:ro
      - /home/ec2-user/nginx:/sut-source/nginx:ro
      - /home/ec2-user/Jenkinsfile:/sut-source/Jenkinsfile:ro
    entrypoint: ["/bin/sh", "/init.sh"]
    depends_on:
      gitea:
        condition: service_healthy

  code-server:
    build:
      context: ./code-server
      dockerfile: Dockerfile
    image: fellowship-code-server:latest
    container_name: fellowship-code-server
    restart: unless-stopped
    ports:
      - "8443:8080"
    volumes:
      - /home/ec2-user:/home/coder/fellowship:rw
      - codeserver_config:/home/coder/.config
      # Mount Docker socket so students can run docker compose from the IDE terminal
      - /var/run/docker.sock:/var/run/docker.sock
    environment:
      PASSWORD: ${CODESERVER_PASSWORD:-fellowship}
    command:
      - --auth=password
      - --bind-addr=0.0.0.0:8080
      - /home/coder/fellowship
    healthcheck:
      test: ["CMD-SHELL", "curl -sf http://localhost:8080/ || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s

  mailhog:
    image: mailhog/mailhog:v1.0.1
    container_name: fellowship-mailhog
    restart: unless-stopped
    ports:
      - "1025:1025"
      - "8025:8025"

volumes:
  jenkins_home:
    driver: local
  gitea_data:
    driver: local
  codeserver_config:
    driver: local
COMPOSEEOF

# Pre-create code-server config directory with proper permissions
# The coder user inside the container runs as 1000:1000, so the host directory
# must be owned by ec2-user (uid 1000) to avoid "permission denied" errors.
mkdir -p /home/ec2-user/.codeserver-config
chown 1000:1000 /home/ec2-user/.codeserver-config
chmod 700 /home/ec2-user/.codeserver-config

chown -R ec2-user:ec2-user /home/ec2-user/devops-escape-room
chown -R ec2-user:ec2-user /home/ec2-user/jenkins
chown -R ec2-user:ec2-user /home/ec2-user/gitea

# Write the devops-escape-room .env so Docker Compose can inject the correct
# JENKINS_URL into the Jenkins container before it starts.
# If JENKINS_DOMAIN is not yet known (EC2-tag path), Jenkins defaults to localhost
# and will be updated automatically after the SUT domain is resolved.
DEVOPS_JENKINS_URL="${JENKINS_DOMAIN:+https://${JENKINS_DOMAIN}/}"
DEVOPS_JENKINS_URL="${DEVOPS_JENKINS_URL:-http://localhost:8080/}"

# Copy devops-escape-room environment template to .env
log "Setting up devops-escape-room environment..."
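# The two DEVOPS_JENKINS_URL expansions above work together:
#   ${JENKINS_DOMAIN:+https://${JENKINS_DOMAIN}/}   -> "https://<domain>/" when JENKINS_DOMAIN is set, "" otherwise
#   ${DEVOPS_JENKINS_URL:-http://localhost:8080/}   -> keeps the HTTPS URL, or falls back to localhost when empty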
if [ -f /home/ec2-user/devops-escape-room/.env.prod ]; then
    cp /home/ec2-user/devops-escape-room/.env.prod /home/ec2-user/devops-escape-room/.env
    chown ec2-user:ec2-user /home/ec2-user/devops-escape-room/.env
    chmod 644 /home/ec2-user/devops-escape-room/.env
    log "✓ Copied devops-escape-room/.env.prod to .env"
    # Update JENKINS_URL if domain is available
    if [ -n "$JENKINS_DOMAIN" ]; then
        sed -i "s|^JENKINS_URL=.*|JENKINS_URL=${DEVOPS_JENKINS_URL}|" /home/ec2-user/devops-escape-room/.env
    fi
else
    # Fallback: create .env inline if template not found
    cat > /home/ec2-user/devops-escape-room/.env << EOF
JENKINS_ADMIN_PASSWORD=fellowship123
GITEA_ADMIN_USER=fellowship
GITEA_ADMIN_PASSWORD=fellowship123
GITEA_ADMIN_EMAIL=gandalf@fellowship.local
CODESERVER_PASSWORD=fellowship
JENKINS_URL=${DEVOPS_JENKINS_URL}
GITEA_DOMAIN=
EOF
    chown ec2-user:ec2-user /home/ec2-user/devops-escape-room/.env
    log "✓ Wrote devops-escape-room/.env (fallback mode, JENKINS_URL=${DEVOPS_JENKINS_URL})"
fi

log "Building and starting DevOps Escape Room stack..."
if run_as_ec2user_docker "cd ~/devops-escape-room && docker compose up -d --build" 2>&1 | \
    tee -a "$LOG_FILE"; then
    log "✓ DevOps Escape Room stack started (Jenkins, Gitea, code-server, MailHog)"
else
    log "WARNING: DevOps Escape Room stack may not have started cleanly — check logs"
fi

# Fellowship SUT Setup
log "Setting up Fellowship SUT..."

# Get SUT bucket from SSM
log "Retrieving SUT bucket from SSM: /classroom/fellowship/sut-bucket"
if ! SUT_BUCKET=$(aws ssm get-parameter --name "/classroom/fellowship/sut-bucket" --query "Parameter.Value" --output text --region "${AWS_REGION}" 2>&1) \
    || [ -z "$SUT_BUCKET" ] || [ "$SUT_BUCKET" = "None" ]; then
    log "ERROR: Failed to get SUT bucket from SSM"
    log "Error: $SUT_BUCKET"
    exit 1
fi
log "SUT bucket: $SUT_BUCKET"

# Download SUT from S3
log "Finding latest SUT artifact in S3..."
LATEST_TAR=$(aws s3 ls "s3://${SUT_BUCKET}/" --region "${AWS_REGION}" | \
    awk '/fellowship-sut-.*\.tar\.gz$/ {print $1" "$2" "$4}' | \
    sort | tail -n 1 | awk '{print $3}')
if [ -z "$LATEST_TAR" ]; then
    log "ERROR: No fellowship-sut-*.tar.gz artifact found in S3 bucket"
    exit 1
fi

log "Downloading latest SUT artifact: $LATEST_TAR"
if ! aws s3 cp "s3://${SUT_BUCKET}/${LATEST_TAR}" /tmp/fellowship-sut.tar.gz --region "${AWS_REGION}" >/dev/null 2>&1 || [ ! -f "/tmp/fellowship-sut.tar.gz" ]; then
    log "ERROR: Failed to download SUT from S3"
    log "Expected location: s3://${SUT_BUCKET}/${LATEST_TAR}"
    exit 1
fi
log "✓ SUT downloaded"

# Extract SUT
log "Extracting SUT..."
if ! tar -xzf /tmp/fellowship-sut.tar.gz -C /home/ec2-user/ 2>/dev/null; then
    log "ERROR: Failed to extract SUT"
    exit 1
fi
rm -f /tmp/fellowship-sut.tar.gz

# Tarball extracts to sut/ and docker-compose.yml at home root - chown both
chown -R ec2-user:ec2-user /home/ec2-user/sut 2>/dev/null || true
chown ec2-user:ec2-user /home/ec2-user/docker-compose.yml 2>/dev/null || true
chown -R ec2-user:ec2-user /home/ec2-user/caddy 2>/dev/null || true
chown -R ec2-user:ec2-user /home/ec2-user/nginx 2>/dev/null || true
log "✓ SUT extracted"

# Copy environment template to .env for docker-compose
# This ensures COMPOSE_PROJECT_NAME=fellowship (production) is used
log "Setting up production environment (.env.prod → .env)..."
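# docker compose automatically loads the .env file that sits next to
# docker-compose.yml in the project directory, so every variable written below
# is available for ${VAR:-default} substitution in the compose file itself.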
if [ -f /home/ec2-user/.env.prod ]; then
    cp /home/ec2-user/.env.prod /home/ec2-user/.env
    chown ec2-user:ec2-user /home/ec2-user/.env
    chmod 644 /home/ec2-user/.env
    log "✓ Copied .env.prod to .env"
else
    log "WARNING: .env.prod not found — .env will be created from scratch"
fi

# Get instance domain for Caddy
# PRIORITY 1: Check if domain was passed via user_data environment variable
# This is the most reliable method - domain is known before instance creation
log "Getting instance domain for Caddy..."

if [ -n "$CADDY_DOMAIN" ] && [ "$CADDY_DOMAIN" != "" ]; then
    log "✓ Found Caddy domain from user_data environment: $CADDY_DOMAIN"
    # Domain is already set, no need to query EC2 tags
else
    # PRIORITY 2: Fallback to EC2 tags (requires instance ID from metadata service)
    log "Domain not in environment, attempting to get from EC2 tags..."
    INSTANCE_ID=""
    CADDY_DOMAIN=""

    # Retry getting instance ID (metadata service may not be ready immediately)
    for i in {1..10}; do
        INSTANCE_ID=$(get_instance_metadata "instance-id")
        if [ -n "$INSTANCE_ID" ]; then
            log "✓ Got instance ID: $INSTANCE_ID"
            break
        fi
        if [ $i -lt 10 ]; then
            log "  Attempt $i/10: Instance ID not available yet, waiting 2s..."
            sleep 2
        fi
    done

    if [ -n "$INSTANCE_ID" ]; then
        # Get domain from instance tags (set by Lambda BEFORE instance creation)
        # With predictable domain names, this should be available immediately
        log "Retrieving HttpsDomain tag from instance tags..."
        for i in {1..6}; do
            CADDY_DOMAIN=$(aws ec2 describe-tags --region "${AWS_REGION}" --filters "Name=resource-id,Values=${INSTANCE_ID}" "Name=key,Values=HttpsDomain" --query "Tags[0].Value" --output text 2>/dev/null || echo "")
            if [ -n "$CADDY_DOMAIN" ] && [ "$CADDY_DOMAIN" != "None" ] && [ "$CADDY_DOMAIN" != "" ]; then
                log "✓ Found Caddy domain from tags: $CADDY_DOMAIN"
                break
            fi
            if [ $i -lt 6 ]; then
                log "  Attempt $i/6: HttpsDomain tag not found yet, waiting 2s..."
                sleep 2
            fi
        done
    else
        log "WARNING: Could not get instance ID after retries"
    fi

    # Final check
    if [ -z "$CADDY_DOMAIN" ] || [ "$CADDY_DOMAIN" = "None" ] || [ "$CADDY_DOMAIN" = "" ]; then
        log "ERROR: Caddy domain not found - cannot deploy AWS Fellowship SUT without a valid domain"
        log "  Ensure HttpsDomain tag is set before instance bootstrap"
        exit 1
    fi
fi

# Normalize to lowercase to avoid mixed-case DNS/tag drift
CADDY_DOMAIN=$(echo "$CADDY_DOMAIN" | tr '[:upper:]' '[:lower:]')

# Enforce domain presence for AWS deployment
if [ -z "$CADDY_DOMAIN" ] || [ "$CADDY_DOMAIN" = "None" ] || [ "$CADDY_DOMAIN" = "" ]; then
    log "ERROR: Caddy domain is required for AWS deployment"
    exit 1
fi

# Wait for DNS propagation before starting containers (required for Caddy automatic HTTPS)
PUBLIC_IP_FOR_DNS=$(get_instance_metadata "public-ipv4")
if [ -z "$PUBLIC_IP_FOR_DNS" ]; then
    log "ERROR: Could not retrieve instance public IP for DNS verification"
    exit 1
fi

resolve_domain_ipv4() {
    local domain="$1"
    local resolved_ip
    resolved_ip=$(getent ahostsv4 "$domain" 2>/dev/null | awk '{print $1}' | grep -E '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' | head -1 || true)
    if [ -z "$resolved_ip" ]; then
        resolved_ip=$(nslookup "$domain" 2>/dev/null | awk '/^Address: / {print $2}' | grep -E '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' | tail -1 || true)
    fi
    echo "$resolved_ip"
}

log "Waiting for DNS propagation: ${CADDY_DOMAIN} -> ${PUBLIC_IP_FOR_DNS}"
DNS_MATCHED="false"
for i in {1..30}; do
    RESOLVED_IP=$(resolve_domain_ipv4 "$CADDY_DOMAIN")
    if [ "$RESOLVED_IP" = "$PUBLIC_IP_FOR_DNS" ]; then
        DNS_MATCHED="true"
        log "✓ DNS propagation complete (${CADDY_DOMAIN} resolves to ${RESOLVED_IP})"
        break
    fi
    log "  Attempt $i/30: ${CADDY_DOMAIN} resolves to '${RESOLVED_IP:-unresolved}' (expected ${PUBLIC_IP_FOR_DNS}), waiting 10s..."
    sleep 10
done

if [ "$DNS_MATCHED" != "true" ]; then
    log "ERROR: DNS propagation timeout after 5 minutes"
    log "  ${CADDY_DOMAIN} did not resolve to instance public IP ${PUBLIC_IP_FOR_DNS}"
    log "  Caddy automatic HTTPS cannot succeed until DNS is correct"
    exit 1
fi

# ── Derive DevOps HTTPS subdomain names ──────────────────────────────────────
# Pattern: jenkins-{CADDY_DOMAIN} and ide-{CADDY_DOMAIN}
# These may already be set from the early derivation above; re-assign to ensure
# they reflect the confirmed (possibly tag-derived) CADDY_DOMAIN.
JENKINS_DOMAIN="jenkins-${CADDY_DOMAIN}"
IDE_DOMAIN="ide-${CADDY_DOMAIN}"
GITEA_DOMAIN="gitea-${CADDY_DOMAIN}"
log "DevOps HTTPS subdomains confirmed:"
log "  Jenkins: ${JENKINS_DOMAIN}"
log "  IDE:     ${IDE_DOMAIN}"
log "  Gitea:   ${GITEA_DOMAIN}"

# ── Create Route53 A records for jenkins and ide subdomains ──────────────────
# Both subdomains must point to the same instance IP as CADDY_DOMAIN.
# Try to find the hosted-zone ID from (in order):
#   1. EC2 instance tag Route53ZoneId
#   2. SSM parameter /classroom/fellowship/route53-zone-id
#   3. Route53 lookup by parent domain
ROUTE53_ZONE_ID="${ROUTE53_ZONE_ID:-}"
if [ -n "$ROUTE53_ZONE_ID" ] && [ "$ROUTE53_ZONE_ID" != "None" ]; then
    log "Using Route53 zone ID from environment: ${ROUTE53_ZONE_ID}"
else
    ROUTE53_ZONE_ID=""
fi

# Source 1: EC2 instance tag (set by the provisioning Lambda)
if [ -n "${INSTANCE_ID:-}" ]; then
    ROUTE53_ZONE_ID=$(aws ec2 describe-tags --region "${AWS_REGION}" \
        --filters "Name=resource-id,Values=${INSTANCE_ID}" "Name=key,Values=Route53ZoneId" \
        --query "Tags[0].Value" --output text 2>/dev/null || echo "")
    [ "$ROUTE53_ZONE_ID" = "None" ] && ROUTE53_ZONE_ID=""
fi

# Source 2: SSM parameter
if [ -z "$ROUTE53_ZONE_ID" ]; then
    ROUTE53_ZONE_ID=$(aws ssm get-parameter \
        --name "/classroom/fellowship/route53-zone-id" \
        --query "Parameter.Value" --output text --region "${AWS_REGION}" 2>/dev/null || echo "")
    [ "$ROUTE53_ZONE_ID" = "None" ] && ROUTE53_ZONE_ID=""
fi

# Source 3: Walk up the DNS tree stripping one label at a time until a
# matching hosted zone is found. A single sed strip is not enough when
# CADDY_DOMAIN has multiple subdomain levels (e.g.
# fellowship-.fellowship.testingfantasy.com — stripping one label gives
# fellowship.testingfantasy.com which is NOT a hosted zone; the actual zone
# is testingfantasy.com two levels up).
if [ -z "$ROUTE53_ZONE_ID" ]; then
    ZONE_SUFFIX=$(echo "$CADDY_DOMAIN" | sed 's/^[^.]*\.//')
    while [ -n "$ZONE_SUFFIX" ] && [ "$ZONE_SUFFIX" != "${ZONE_SUFFIX#*.}" ]; do
        CANDIDATE=$(aws route53 list-hosted-zones-by-name \
            --dns-name "${ZONE_SUFFIX}" \
            --query "HostedZones[?Name==\`${ZONE_SUFFIX}.\`].Id" \
            --output text 2>/dev/null \
            | sed 's|/hostedzone/||' || echo "")
        if [ -n "$CANDIDATE" ] && [ "$CANDIDATE" != "None" ] && [ "$CANDIDATE" != "\t" ]; then
            ROUTE53_ZONE_ID="$CANDIDATE"
            log "  Route53 zone found via DNS tree walk: ${ZONE_SUFFIX} → ${ROUTE53_ZONE_ID}"
            break
        fi
        ZONE_SUFFIX=$(echo "$ZONE_SUFFIX" | sed 's/^[^.]*\.//')
    done
    [ "${ROUTE53_ZONE_ID:-}" = "None" ] && ROUTE53_ZONE_ID=""
fi

if [ -n "$ROUTE53_ZONE_ID" ]; then
    log "Creating/updating Route53 A records for DevOps subdomains (zone ${ROUTE53_ZONE_ID})..."
    aws route53 change-resource-record-sets \
        --hosted-zone-id "$ROUTE53_ZONE_ID" \
        --change-batch "{
            \"Comment\": \"Fellowship DevOps Escape Room HTTPS subdomains\",
            \"Changes\": [
                {
                    \"Action\": \"UPSERT\",
                    \"ResourceRecordSet\": {
                        \"Name\": \"${JENKINS_DOMAIN}\",
                        \"Type\": \"A\",
                        \"TTL\": 60,
                        \"ResourceRecords\": [{\"Value\": \"${PUBLIC_IP_FOR_DNS}\"}]
                    }
                },
                {
                    \"Action\": \"UPSERT\",
                    \"ResourceRecordSet\": {
                        \"Name\": \"${IDE_DOMAIN}\",
                        \"Type\": \"A\",
                        \"TTL\": 60,
                        \"ResourceRecords\": [{\"Value\": \"${PUBLIC_IP_FOR_DNS}\"}]
                    }
                },
                {
                    \"Action\": \"UPSERT\",
                    \"ResourceRecordSet\": {
                        \"Name\": \"${GITEA_DOMAIN}\",
                        \"Type\": \"A\",
                        \"TTL\": 60,
                        \"ResourceRecords\": [{\"Value\": \"${PUBLIC_IP_FOR_DNS}\"}]
                    }
                }
            ]
        }" \
        --region "$AWS_REGION" 2>&1 | tee -a "$LOG_FILE" \
        && log "✓ Route53 A records upserted for ${JENKINS_DOMAIN}, ${IDE_DOMAIN} and ${GITEA_DOMAIN}" \
        || log "WARNING: Route53 record update failed — manual DNS setup may be required"
else
    log "WARNING: Route53 zone ID not found — DevOps subdomains need manual DNS setup:"
    log "  A record: ${JENKINS_DOMAIN} → ${PUBLIC_IP_FOR_DNS}"
    log "  A record: ${IDE_DOMAIN} → ${PUBLIC_IP_FOR_DNS}"
    log "  A record: ${GITEA_DOMAIN} → ${PUBLIC_IP_FOR_DNS}"
fi

# Update the devops-escape-room .env so Jenkins knows its canonical HTTPS URL.
# The devops stack may already be running; the container will pick up the new URL
# on the next restart or if Jenkins JCasC is reloaded.
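# (This full rewrite intentionally supersedes the .env written earlier from the
# template/fallback path; at this point the real JENKINS_URL, GITEA_DOMAIN and
# GITEA_ROOT_URL values are known.)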
cat > /home/ec2-user/devops-escape-room/.env << EOF
JENKINS_ADMIN_PASSWORD=fellowship123
GITEA_ADMIN_USER=fellowship
GITEA_ADMIN_PASSWORD=fellowship123
GITEA_ADMIN_EMAIL=gandalf@fellowship.local
CODESERVER_PASSWORD=fellowship
JENKINS_URL=https://${JENKINS_DOMAIN}/
GITEA_DOMAIN=${GITEA_DOMAIN}
GITEA_ROOT_URL=https://${GITEA_DOMAIN}/
EOF
chown ec2-user:ec2-user /home/ec2-user/devops-escape-room/.env
log "✓ Updated devops-escape-room/.env with JENKINS_URL=https://${JENKINS_DOMAIN}/ GITEA_DOMAIN=${GITEA_DOMAIN}"

# Recreate Gitea so it picks up the correct GITEA_ROOT_URL and GITEA_DOMAIN.
# (The escape room stack was started earlier with an empty GITEA_DOMAIN; now that
# the domain is known, we recreate just the gitea service so ROOT_URL is correct.
# Note: "docker compose restart" would NOT re-read the .env file, so "up -d" is
# used to force the container to be recreated with the new environment.)
run_as_ec2user_docker "cd ~/devops-escape-room && docker compose up -d gitea" 2>&1 | tee -a "$LOG_FILE" \
    && log "✓ Gitea recreated with GITEA_ROOT_URL=https://${GITEA_DOMAIN}/" \
    || log "WARNING: Gitea recreate failed — ROOT_URL may still point to localhost"

# Deploy SUT
log "Deploying SUT..."
if [ ! -f "/home/ec2-user/docker-compose.yml" ]; then
    log "ERROR: SUT docker-compose.yml not found"
    exit 1
fi
if [ ! -x "/home/ec2-user/.docker/cli-plugins/docker-compose" ]; then
    log "ERROR: Docker Compose plugin not executable"
    exit 1
fi

# Get Azure OpenAI credentials from Secrets Manager
log "Retrieving Azure OpenAI credentials from Secrets Manager..."
AZURE_SECRET=""
AZURE_ENDPOINT=""
AZURE_API_KEY=""
AZURE_DEPLOYMENT=""
AZURE_API_VERSION=""

# Primary secret path (new schema)
SECRET_NAME="azure/llm/configs"
# Legacy fallback path (old schema)
ENVIRONMENT="${ENVIRONMENT:-dev}"
LEGACY_SECRET_NAME="classroom/shared/${ENVIRONMENT}/azure-openai"

normalize_azure_endpoint() {
    local raw_endpoint="$1"
    # Remove query string if present
    raw_endpoint="${raw_endpoint%%\?*}"
    # Convert full Azure OpenAI operation URL to resource base endpoint
    # Example:
    #   https://resource.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=...
    #   -> https://resource.openai.azure.com
    if [[ "$raw_endpoint" == *"/openai/"* ]]; then
        raw_endpoint="${raw_endpoint%%/openai/*}"
    fi
    # Remove trailing slash for consistency
    raw_endpoint="${raw_endpoint%/}"
    echo "$raw_endpoint"
}

AZURE_SECRET=$(aws secretsmanager get-secret-value \
    --secret-id "$SECRET_NAME" \
    --region "${AWS_REGION}" \
    --query SecretString \
    --output text 2>/dev/null || echo "")

if [ -z "$AZURE_SECRET" ] || [ "$AZURE_SECRET" = "None" ]; then
    log "Primary secret not found, trying legacy Azure OpenAI secret..."
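    # Legacy schema: a single JSON object (endpoint / api_key / deployment_name / api_version)
    # rather than the array of configs stored under azure/llm/configs.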
    AZURE_SECRET=$(aws secretsmanager get-secret-value \
        --secret-id "$LEGACY_SECRET_NAME" \
        --region "${AWS_REGION}" \
        --query SecretString \
        --output text 2>/dev/null || echo "")
    if [ -n "$AZURE_SECRET" ] && [ "$AZURE_SECRET" != "None" ]; then
        SECRET_NAME="$LEGACY_SECRET_NAME"
    fi
fi

if [ -z "$AZURE_SECRET" ] || [ "$AZURE_SECRET" = "None" ]; then
    log "WARNING: Failed to retrieve Azure OpenAI secret from Secrets Manager"
    log "  Secret name: $SECRET_NAME"
    log "  Region: $AWS_REGION"
    log "  Fellowship SUT will work with fallback responses only (no Azure AI)"
else
    # Parse secret JSON and extract values from either:
    #   - New schema: array under azure/llm/configs
    #   - Legacy schema: single object under classroom/shared/<env>/azure-openai
    if command -v jq &> /dev/null; then
        JQ_PICK_FILTER='
            def pick:
                if type == "array" then
                    (map(select((.config_name // "" | ascii_downcase) | test("gpt[ -]?4o"))) | .[0])
                    // (map(select((.endpoint // "" | ascii_downcase) | contains("/chat/completions"))) | .[0])
                    // (map(select((.config_name // "" | ascii_downcase) | test("gpt"))) | .[0])
                    // .[0]
                else
                    .
                end;
            pick
        '
        AZURE_ENDPOINT=$(echo "$AZURE_SECRET" | jq -r "$JQ_PICK_FILTER | .endpoint // empty" 2>/dev/null || echo "")
        AZURE_API_KEY=$(echo "$AZURE_SECRET" | jq -r "$JQ_PICK_FILTER | .api_key // empty" 2>/dev/null || echo "")
        AZURE_DEPLOYMENT=$(echo "$AZURE_SECRET" | jq -r "$JQ_PICK_FILTER | .deployment_name // .deployment // empty" 2>/dev/null || echo "")
        AZURE_API_VERSION=$(echo "$AZURE_SECRET" | jq -r "$JQ_PICK_FILTER | .api_version // empty" 2>/dev/null || echo "")
    elif command -v python3 &> /dev/null; then
        PARSED_AZURE=$(AZURE_SECRET="$AZURE_SECRET" python3 << 'PY'
import json
import os

raw = os.environ.get("AZURE_SECRET", "")
try:
    data = json.loads(raw)
except Exception:
    data = {}

selected = {}
if isinstance(data, list):
    def config_name(item):
        return str(item.get("config_name", "")).lower()

    def endpoint(item):
        return str(item.get("endpoint", "")).lower()

    selected = (
        next((x for x in data if "gpt-4o" in config_name(x) or "gpt 4o" in config_name(x)), None)
        or next((x for x in data if "/chat/completions" in endpoint(x)), None)
        or next((x for x in data if "gpt" in config_name(x)), None)
        or (data[0] if data else {})
    )
elif isinstance(data, dict):
    selected = data

endpoint_val = selected.get("endpoint", "")
api_key_val = selected.get("api_key", "")
deployment_val = selected.get("deployment_name") or selected.get("deployment") or ""
api_version_val = selected.get("api_version", "")

print(f"endpoint={endpoint_val}")
print(f"api_key={api_key_val}")
print(f"deployment={deployment_val}")
print(f"api_version={api_version_val}")
PY
)
        while IFS='=' read -r key value; do
            case "$key" in
                endpoint) AZURE_ENDPOINT="$value" ;;
                api_key) AZURE_API_KEY="$value" ;;
                deployment) AZURE_DEPLOYMENT="$value" ;;
                api_version) AZURE_API_VERSION="$value" ;;
            esac
        done <<< "$PARSED_AZURE"
    else
        log "WARNING: Neither jq nor python3 available; using best-effort grep parsing"
        AZURE_ENDPOINT=$(echo "$AZURE_SECRET" | grep -m1 -o '"endpoint"[[:space:]]*:[[:space:]]*"[^"]*"' | sed -E 's/^.*"endpoint"[[:space:]]*:[[:space:]]*"([^"]*)"$/\1/' || echo "")
        AZURE_API_KEY=$(echo "$AZURE_SECRET" | grep -m1 -o '"api_key"[[:space:]]*:[[:space:]]*"[^"]*"' | sed -E 's/^.*"api_key"[[:space:]]*:[[:space:]]*"([^"]*)"$/\1/' || echo "")
        AZURE_DEPLOYMENT=$(echo "$AZURE_SECRET" | grep -m1 -o '"deployment_name"[[:space:]]*:[[:space:]]*"[^"]*"' | sed -E 's/^.*"deployment_name"[[:space:]]*:[[:space:]]*"([^"]*)"$/\1/' || echo "")
        if [ -z "$AZURE_DEPLOYMENT" ]; then
            AZURE_DEPLOYMENT=$(echo "$AZURE_SECRET" | grep -m1 -o '"deployment"[[:space:]]*:[[:space:]]*"[^"]*"' | sed -E 's/^.*"deployment"[[:space:]]*:[[:space:]]*"([^"]*)"$/\1/' || echo "")
        fi
        AZURE_API_VERSION=$(echo "$AZURE_SECRET" | grep -m1 -o '"api_version"[[:space:]]*:[[:space:]]*"[^"]*"' | sed -E 's/^.*"api_version"[[:space:]]*:[[:space:]]*"([^"]*)"$/\1/' || echo "")
    fi

    # Ensure endpoint is in Azure resource base URL format expected by AzureOpenAI client
    AZURE_ENDPOINT=$(normalize_azure_endpoint "$AZURE_ENDPOINT")

    if [ -n "$AZURE_ENDPOINT" ] && [ -n "$AZURE_API_KEY" ] && [ -n "$AZURE_DEPLOYMENT" ]; then
        log "✓ Azure OpenAI credentials retrieved successfully"
        log "  Secret source: ${SECRET_NAME}"
        log "  Deployment: ${AZURE_DEPLOYMENT:-not set}"
        log "  API Version: ${AZURE_API_VERSION:-not set}"
        log "  Endpoint: ${AZURE_ENDPOINT}"
    else
        log "WARNING: Failed to parse Azure credentials from secret"
        log "  Secret source: ${SECRET_NAME}"
        log "  Endpoint: ${AZURE_ENDPOINT:-empty}"
        log "  API Key: ${AZURE_API_KEY:0:10}***"
        log "  Deployment: ${AZURE_DEPLOYMENT:-empty}"
    fi
fi

# Create .env file for docker-compose BEFORE deployment
# This ensures all environment variables are persistently available
log "Updating .env file with deployment-specific configuration..."

# Function to update or add an env variable in .env file
update_env_var() {
    local key="$1"
    local value="$2"
    local env_file="/home/ec2-user/.env"

    if grep -q "^${key}=" "$env_file" 2>/dev/null; then
        # Variable exists, update it
        sed -i "s|^${key}=.*|${key}=${value}|" "$env_file"
    else
        # Variable doesn't exist, append it
        echo "${key}=${value}" >> "$env_file"
    fi
}

# Update domain configuration from resolved values
update_env_var "CADDY_DOMAIN" "${CADDY_DOMAIN:-localhost}"
update_env_var "JENKINS_DOMAIN" "${JENKINS_DOMAIN:-}"
update_env_var "IDE_DOMAIN" "${IDE_DOMAIN:-}"
update_env_var "GITEA_DOMAIN" "${GITEA_DOMAIN:-}"
update_env_var "CADDYFILE_PATH" "./caddy/Caddyfile"
update_env_var "FRONTEND_MODE" "prod"
update_env_var "FLASK_ENV" "production"
update_env_var "NODE_ENV" "production"

# Add optional Azure OpenAI Configuration if available
if [ -n "$AZURE_ENDPOINT" ]; then
    update_env_var "AZURE_OPENAI_ENDPOINT" "${AZURE_ENDPOINT}"
    update_env_var "AZURE_OPENAI_API_KEY" "${AZURE_API_KEY}"
    update_env_var "AZURE_OPENAI_DEPLOYMENT" "${AZURE_DEPLOYMENT}"
    update_env_var "AZURE_OPENAI_API_VERSION" "${AZURE_API_VERSION}"
fi

# Add GitHub Token if available
if [ -n "$GITHUB_TOKEN" ]; then
    update_env_var "GITHUB_TOKEN" "${GITHUB_TOKEN}"
fi

chown ec2-user:ec2-user /home/ec2-user/.env
chmod 644 /home/ec2-user/.env
log "✓ Updated /home/ec2-user/.env with deployment configuration"
if [ -n "$AZURE_ENDPOINT" ]; then
    log "  ✓ Azure OpenAI configured (deployment: ${AZURE_DEPLOYMENT})"
else
    log "  ⚠ Azure OpenAI not configured - using fallback responses"
fi

# Verify .env file contents (mask sensitive values)
log "Verifying .env file contents:"
grep -E "^CADDY_DOMAIN=|^AZURE_OPENAI" /home/ec2-user/.env 2>/dev/null | sed 's/AZURE_OPENAI_API_KEY=.*/AZURE_OPENAI_API_KEY=***MASKED***/g' | sed 's/^/  /' || true

# Additional safety check: ensure CADDY_DOMAIN is not empty
if [ -z "$CADDY_DOMAIN" ]; then
    log "ERROR: CADDY_DOMAIN is empty - docker-compose will not start properly"
    exit 1
fi

# Verify network connectivity BEFORE docker-compose build (critical for github access)
log "Performing pre-deployment network connectivity checks..."
if ! verify_network_connectivity; then
    log "ERROR: Network connectivity checks failed"
    log "  Cannot proceed with docker-compose build (needs github.com access)"
    exit 1
fi
log "OK: Network connectivity verified"

# Retrieve GitHub token from AWS Secrets Manager for private repo access during docker builds
log "Retrieving GitHub credentials from AWS Secrets Manager..."
GITHUB_TOKEN=""
GITHUB_SECRET=""

# Try to get GitHub token from Secrets Manager
GITHUB_SECRET=$(aws secretsmanager get-secret-value \
    --secret-id "classroom/shared/github-token" \
    --region "${AWS_REGION}" \
    --query SecretString \
    --output text 2>/dev/null || echo "")

if [ -n "$GITHUB_SECRET" ] && [ "$GITHUB_SECRET" != "None" ]; then
    # Try to extract token (could be plain string or JSON)
    if echo "$GITHUB_SECRET" | grep -q '{'; then
        # JSON format
        if command -v jq &> /dev/null; then
            GITHUB_TOKEN=$(echo "$GITHUB_SECRET" | jq -r '.token // .github_token // .pat // empty' 2>/dev/null || echo "")
        else
            GITHUB_TOKEN=$(echo "$GITHUB_SECRET" | grep -o '"token":"[^"]*' | cut -d'"' -f4 || echo "")
        fi
    else
        # Plain token string
        GITHUB_TOKEN="$GITHUB_SECRET"
    fi
fi

if [ -n "$GITHUB_TOKEN" ] && [ "$GITHUB_TOKEN" != "None" ]; then
    log "OK: GitHub credentials found - configuring git"
    # Configure git to use token for HTTPS cloning
    # This enables docker builds that clone from private GitHub repositories
    git config --global credential.helper store
    echo "https://${GITHUB_TOKEN}@github.com" > /home/ec2-user/.git-credentials
    chmod 600 /home/ec2-user/.git-credentials
    export GIT_ASKPASS=/bin/true
    export GITHUB_TOKEN
else
    log "WARNING: GitHub token not found in Secrets Manager (public repos only)"
fi

# Deploy SUT containers using docker-compose with retry logic on network failures
# Note: Pass environment variables both via .env file AND explicit exports for maximum compatibility
log "Starting SUT containers (with automatic retry on network issues)..."
log "  CADDY_DOMAIN: ${CADDY_DOMAIN}"

# ── Select the fellowship Caddyfile for the tutorial stack ───────────────────
# Tutorial instances serve three HTTPS sites via the single Caddy container:
#   • CADDY_DOMAIN   → SUT (reverse_proxy to backend:5000 / frontend:3000)
#   • JENKINS_DOMAIN → Jenkins CI (reverse_proxy to host.docker.internal:8080)
#   • IDE_DOMAIN     → code-server (reverse_proxy to host.docker.internal:8443)
# Caddyfile.fellowship contains all three site blocks.
# Caddyfile (staging) and Caddyfile.prod only contain the SUT block and must
# NOT be used here — they would cause Caddy to fail if JENKINS_DOMAIN / IDE_DOMAIN
# are empty, and would not expose the DevOps Escape Room tools over HTTPS at all.
FELLOWSHIP_CADDYFILE="/home/ec2-user/caddy/Caddyfile.fellowship"
ACTIVE_CADDYFILE="/home/ec2-user/caddy/Caddyfile"
if [ -f "$FELLOWSHIP_CADDYFILE" ]; then
    cp "$FELLOWSHIP_CADDYFILE" "$ACTIVE_CADDYFILE"
    chown ec2-user:ec2-user "$ACTIVE_CADDYFILE"
    log "✓ Copied Caddyfile.fellowship → caddy/Caddyfile (SUT + Jenkins + IDE HTTPS)"
else
    log "WARNING: Caddyfile.fellowship not found at ${FELLOWSHIP_CADDYFILE}"
    log "  Jenkins and IDE will NOT be served via HTTPS."
    log "  Ensure caddy/Caddyfile.fellowship is present in the SUT tarball (see caddy/ directory)."
fi

# Function to deploy SUT with retry logic for network failures
deploy_sut_with_retry() {
    local max_attempts=3
    local attempt=1
    local wait_time=10

    while [ $attempt -le $max_attempts ]; do
        log "  Deployment attempt $attempt/$max_attempts..."
        # Use cd to set working directory, then docker compose will auto-load .env
        DEPLOY_OUTPUT=$(run_as_ec2user_docker "cd ~ && docker compose up -d 2>&1" 2>&1)
        DEPLOY_EXIT_CODE=$?

        if [ $DEPLOY_EXIT_CODE -eq 0 ]; then
            log "OK: Docker Compose started successfully"
            return 0
        fi

        # Check if error is network-related (github, DNS, connectivity, etc)
        if echo "$DEPLOY_OUTPUT" | grep -iE "network|dns|resolve|github|credential|authentication|connection refused|timeout|no such device|temporary failure" >/dev/null 2>&1; then
            log "  WARNING: Network-related error detected, will retry..."
            log "  Error: $(echo "$DEPLOY_OUTPUT" | head -2 | tail -1)"
            if [ $attempt -lt $max_attempts ]; then
                log "  Waiting ${wait_time}s before retry (attempt $((attempt + 1))/$max_attempts)..."
                sleep $wait_time
                wait_time=$((wait_time * 2))  # Exponential backoff: 10s, 20s, 40s
                attempt=$((attempt + 1))
                continue
            fi
        else
            # Non-network error, fail immediately
            log "ERROR: Failed to start SUT containers (non-recoverable error)"
            log "Docker Compose output:"
            echo "$DEPLOY_OUTPUT" | sed 's/^/  /'
            return 1
        fi

        attempt=$((attempt + 1))
    done

    # All retries exhausted
    log "ERROR: Failed to start SUT containers after $max_attempts attempts"
    log "Docker Compose output:"
    echo "$DEPLOY_OUTPUT" | sed 's/^/  /'
    log "Checking Docker logs for more information..."
    run_as_ec2user_docker "cd ~ && docker compose logs" 2>&1 | tail -50 | sed 's/^/  /'
    return 1
}

# Execute deployment with retry
if ! deploy_sut_with_retry; then
    exit 1
fi

log "Waiting for containers to be in running state..."
# Wait for containers to be running (up to 60 seconds)
CONTAINER_WAIT_COUNT=0
while [ $CONTAINER_WAIT_COUNT -lt 12 ]; do
    RUNNING_CONTAINERS=$(run_as_ec2user_docker "cd ~ && docker compose ps -q --status running 2>/dev/null | wc -l" 2>/dev/null || echo "0")
    EXPECTED_CONTAINERS=3
    if [ "$RUNNING_CONTAINERS" -ge "$EXPECTED_CONTAINERS" ]; then
        log "✓ All required containers running ($RUNNING_CONTAINERS/$EXPECTED_CONTAINERS)"
        break
    fi
    log "  Waiting for containers... ($RUNNING_CONTAINERS/$EXPECTED_CONTAINERS running, attempt $((CONTAINER_WAIT_COUNT + 1))/12)"
    sleep 5
    CONTAINER_WAIT_COUNT=$((CONTAINER_WAIT_COUNT + 1))
done

# Wait for backend health check to pass (up to 20 attempts * 3 seconds = 60 seconds)
log "Waiting for backend service to be healthy..."
BACKEND_HEALTH_COUNT=0
BACKEND_READY=false
while [ $BACKEND_HEALTH_COUNT -lt 20 ]; do
    BACKEND_STATUS=$(run_as_ec2user_docker "cd ~ && docker compose ps backend --format json 2>/dev/null" | grep -o '"State":"running"' || echo "")
    if [ -n "$BACKEND_STATUS" ]; then
        log "✓ Backend container is running"
        BACKEND_READY=true
        break
    fi
    log "  Waiting for backend to be ready... (attempt $((BACKEND_HEALTH_COUNT + 1))/20)"
    sleep 3
    BACKEND_HEALTH_COUNT=$((BACKEND_HEALTH_COUNT + 1))
done

# Wait for frontend to compile and start (React dev server, up to 60 seconds)
log "Waiting for frontend to compile and start..."
FRONTEND_WAIT_COUNT=0
FRONTEND_READY=false
while [ $FRONTEND_WAIT_COUNT -lt 20 ]; do
    FRONTEND_LOGS=$(run_as_ec2user_docker "cd ~ && docker compose logs frontend 2>&1" | grep -iE "compiled successfully|webpack compiled|app is running on" || echo "")
    if [ -n "$FRONTEND_LOGS" ]; then
        log "✓ Frontend is ready"
        FRONTEND_READY=true
        break
    fi
    log "  Waiting for frontend compilation... (attempt $((FRONTEND_WAIT_COUNT + 1))/20)"
    sleep 3
    FRONTEND_WAIT_COUNT=$((FRONTEND_WAIT_COUNT + 1))
done

if [ "$FRONTEND_READY" != "true" ]; then
    log "ERROR: Frontend did not become ready within timeout"
    dump_runtime_diagnostics
    exit 1
fi

log "Running post-deploy health gates..."
HTTP_OK=false
for i in {1..20}; do
    if curl -sSf --max-time 5 "http://localhost/" >/dev/null 2>&1; then
        HTTP_OK=true
        log "✓ Local HTTP health gate passed"
        break
    fi
    log "  Waiting for local HTTP health gate... (attempt $i/20)"
    sleep 3
done

HTTPS_OK=false
for i in {1..30}; do
    if curl -k -sSf --max-time 6 "https://localhost/" >/dev/null 2>&1; then
        HTTPS_OK=true
        log "✓ Local HTTPS health gate passed"
        break
    fi
    log "  Waiting for local HTTPS health gate... (attempt $i/30)"
    sleep 3
done

if [ "$HTTP_OK" != "true" ] || [ "$HTTPS_OK" != "true" ]; then
    log "ERROR: Post-deploy health gates failed (HTTP_OK=${HTTP_OK}, HTTPS_OK=${HTTPS_OK})"
    dump_runtime_diagnostics
    exit 1
fi

# Verify environment variables made it to Caddy container
log "Verifying environment variables in Caddy container..."
CADDY_ENV=$(run_as_ec2user_docker "cd ~ && docker inspect fellowship-caddy --format='{{.Config.Env}}' 2>/dev/null | grep -o 'CADDY_DOMAIN=[^[:space:]]*' || echo 'NOT FOUND'" 2>/dev/null)
if [ -n "$CADDY_ENV" ] && [ "$CADDY_ENV" != "NOT FOUND" ]; then
    log "✓ CADDY_DOMAIN verified in container: $CADDY_ENV"
else
    log "WARNING: CADDY_DOMAIN not found in container environment"
    log "  This may cause connection issues"
    log "  Container environment (first 20 vars):"
    run_as_ec2user_docker "cd ~ && docker exec fellowship-caddy env 2>/dev/null | head -20" 2>/dev/null | sed 's/^/  /' || true
fi

# Final container status check
log "Final container status:"
run_as_ec2user_docker "cd ~ && docker compose ps" 2>&1 | sed 's/^/  /'

# Final status
PUBLIC_IP=$(get_instance_metadata "public-ipv4")
[ -z "$PUBLIC_IP" ] && PUBLIC_IP="N/A"

log "=========================================="
log "Setup Complete"
log "=========================================="
log "Public IP: $PUBLIC_IP"
log ""
log "─── Fellowship SUT ────────────────────────"
log "  HTTPS: https://${CADDY_DOMAIN}/"
log ""
log "─── DevOps Escape Room ────────────────────"
log "  Jenkins CI (HTTPS):         https://${JENKINS_DOMAIN}/"
log "  Jenkins CI (direct):        http://${PUBLIC_IP}:8080 (fellowship / fellowship123)"
log "  IDE / code-server (HTTPS):  https://${IDE_DOMAIN}/"
log "  IDE / code-server (direct): http://${PUBLIC_IP}:8443 (password: fellowship)"
log "  Gitea Git:                  http://${PUBLIC_IP}:3030 (fellowship / fellowship123)"
log "  MailHog UI:                 http://${PUBLIC_IP}:8025"
log "=========================================="