lotr-sut/setup_fellowship.sh
Fellowship Scholar f6a5823439 init commit
2026-03-29 20:07:56 +00:00

1454 lines
55 KiB
Bash
Executable File

#!/bin/bash
# Fellowship EC2 Instance Setup Script
# This script is downloaded from S3 and executed by user_data.sh
# Contains all setup logic: Docker, Docker Compose, DevOps Escape Room, and Fellowship SUT
#
# Must run as root (yum/systemctl/chown/swap). Needs instance-profile AWS
# credentials (SSM, S3, EC2 tags, Route53, Secrets Manager) and outbound
# internet access. Optional env inputs honoured below: CADDY_DOMAIN,
# JENKINS_DOMAIN, IDE_DOMAIN, ROUTE53_ZONE_ID, ENVIRONMENT.
#
# Abort on the first unhandled command failure.
# NOTE(review): `set -o pipefail` is NOT enabled — several `cmd | tee`
# pipelines below take the status of the last stage only.
set -e
# Logging setup - redirect all output to log file. tee mirrors everything to
# the original stdout as well, so the EC2 console log and $LOG_FILE both
# capture the full run (stderr included via 2>&1).
LOG_FILE="/var/log/user-data.log"
exec > >(tee -a "$LOG_FILE") 2>&1
# Function to log with timestamp
log() {
  # Emit all arguments as one line prefixed with a "[YYYY-MM-DD HH:MM:SS]" stamp.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# IMDSv2 endpoint and the cached session token (empty until first fetch).
IMDS_BASE_URL="http://169.254.169.254/latest"
IMDS_TOKEN=""
# Fetch an IMDSv2 session token, print it on stdout, and cache it in
# IMDS_TOKEN. Returns 1 when the token service is unreachable.
# NOTE(review): every caller invokes this via $(...), i.e. in a subshell, so
# the IMDS_TOKEN cache assignment never persists to the parent shell — each
# metadata call re-fetches a token. Harmless, but the cache is effectively inert.
get_imds_token() {
if [ -n "$IMDS_TOKEN" ]; then
echo "$IMDS_TOKEN"
return 0
fi
# PUT with a 6-hour TTL per the IMDSv2 protocol; short timeouts keep boot
# fast when IMDS is unreachable (e.g. running outside EC2).
IMDS_TOKEN=$(curl -s --max-time 5 --connect-timeout 2 -X PUT "${IMDS_BASE_URL}/api/token" \
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null || echo "")
if [ -n "$IMDS_TOKEN" ]; then
echo "$IMDS_TOKEN"
return 0
fi
return 1
}
get_instance_metadata() {
  # Print the IMDS value at meta-data/<path>; prints "" on any failure.
  # Prefers the IMDSv2 token header and falls back to unauthenticated IMDSv1.
  local path="$1"
  local token
  local -a auth=()
  token=$(get_imds_token 2>/dev/null || echo "")
  [ -n "$token" ] && auth=(-H "X-aws-ec2-metadata-token: ${token}")
  curl -s --max-time 5 --connect-timeout 2 "${auth[@]}" \
    "${IMDS_BASE_URL}/meta-data/${path}" 2>/dev/null || echo ""
}
log "=========================================="
log "Fellowship Setup Script Started"
log "=========================================="
# Get AWS region with retries and fallback.
# IMDS may not be ready in the first seconds of boot; retry up to 5 times.
AWS_REGION=""
for i in {1..5}; do
  AWS_REGION=$(get_instance_metadata "placement/region")
  [ -n "$AWS_REGION" ] && break
  [ "$i" -lt 5 ] && sleep 2
done
# FIX: replaced the `test && assign && log || log` one-liner with an explicit
# if/else — in the old chain any failure on the left of `||` fell through to
# the "Region:" branch, and the two outcomes were hard to reason about.
if [ -z "$AWS_REGION" ]; then
  AWS_REGION="eu-west-1"
  log "Using default region: $AWS_REGION"
else
  log "Region: $AWS_REGION"
fi
# Function to wait for yum lock
wait_for_yum() {
  # Block until no other process holds /var/run/yum.pid (e.g. cloud-init's
  # own package run); polls every 5 seconds with fuser.
  while :; do
    sudo fuser /var/run/yum.pid >/dev/null 2>&1 || break
    log "Waiting for yum lock to be released..."
    sleep 5
  done
}
# Helper function to run docker commands as ec2-user with proper group membership
run_as_ec2user_docker() {
  local cmd="$1"
  # Use sg (switch group) to ensure docker group is active in the subshell
  # This is more reliable than su - which may not pick up new group membership immediately
  # NOTE(review): $cmd is spliced into a single-quoted `su -c` string, so a
  # command containing single quotes would break the quoting. All current
  # callers pass quote-free compose invocations — keep it that way.
  sg docker -c "su - ec2-user -c '$cmd'"
}
ensure_swap_for_small_instances() {
  # Create a one-time 2GB swap file on instances with < 3500MB RAM so the
  # docker builds below are less likely to OOM during bootstrap.
  # Idempotent (an existing /swapfile short-circuits); must run as root.
  if [ -f /swapfile ]; then
    return 0
  fi
  local mem_mb
  # Total RAM in MB; falls back to "0" (which creates swap) if /proc/meminfo
  # cannot be read.
  mem_mb=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo 2>/dev/null || echo "0")
  if [ "$mem_mb" -ge 3500 ]; then
    return 0
  fi
  log "Low-memory instance detected (${mem_mb}MB). Creating 2GB swap to reduce bootstrap OOM risk..."
  # fallocate is near-instant; dd is the fallback when fallocate is missing
  # or fails (e.g. filesystems without extent support).
  if command -v fallocate >/dev/null 2>&1; then
    fallocate -l 2G /swapfile || dd if=/dev/zero of=/swapfile bs=1M count=2048
  else
    dd if=/dev/zero of=/swapfile bs=1M count=2048
  fi
  chmod 600 /swapfile
  # Best-effort: swap is an optimization, never a reason to abort bootstrap.
  mkswap /swapfile >/dev/null 2>&1 || true
  swapon /swapfile >/dev/null 2>&1 || true
  # Persist across reboots, adding the fstab entry at most once.
  grep -q '^/swapfile' /etc/fstab || echo '/swapfile none swap sw 0 0' >> /etc/fstab
  log "✓ Swap configured"
}
# Wait for any existing yum processes to complete
log "Checking for yum locks..."
wait_for_yum
# Update and install dependencies
log "Installing Docker and Git..."
yum update -y
wait_for_yum
yum install -y docker git
systemctl start docker
systemctl enable docker
# Allow ec2-user to talk to the docker socket (effective for new logins,
# which run_as_ec2user_docker creates via `su -`).
usermod -aG docker ec2-user
ensure_swap_for_small_instances
log "✓ Docker installed and started"
# Install Docker Compose plugin
log "Installing Docker Compose plugin..."
mkdir -p /home/ec2-user/.docker/cli-plugins/
# FIX: added -f so curl fails on HTTP errors — previously a 404/5xx response
# body was saved as the "plugin binary" and the download reported success.
# --retry covers transient network/GitHub hiccups during boot.
if curl -fSL --retry 3 --retry-delay 2 \
  https://github.com/docker/compose/releases/download/v2.27.0/docker-compose-linux-x86_64 \
  -o /home/ec2-user/.docker/cli-plugins/docker-compose; then
  chmod +x /home/ec2-user/.docker/cli-plugins/docker-compose
  chown -R ec2-user:ec2-user /home/ec2-user/.docker
  log "✓ Docker Compose plugin installed"
else
  log "ERROR: Failed to download Docker Compose plugin"
  exit 1
fi
dump_runtime_diagnostics() {
  # Print compose status plus the last 120 log lines of each SUT service,
  # every line indented by one space; all steps are best-effort.
  local svc
  log "Runtime diagnostics (docker compose ps):"
  run_as_ec2user_docker "cd ~ && docker compose ps" 2>&1 | sed 's/^/ /' || true
  for svc in caddy frontend backend; do
    log "Runtime diagnostics (${svc} logs tail):"
    run_as_ec2user_docker "cd ~ && docker compose logs ${svc} --tail 120" 2>&1 | sed 's/^/ /' || true
  done
}
# Function to verify network connectivity to external services (esp. github.com)
verify_network_connectivity() {
  # Probe DNS resolution and HTTPS reachability of github.com (needed for the
  # docker builds). Each probe gets one retry after a 2s pause. Returns 0
  # only when both probes eventually pass.
  local connectivity_ok=true
  log "Verifying network connectivity to external services..."
  # Test DNS resolution for github.com
  log " Testing DNS resolution for github.com..."
  if getent hosts github.com >/dev/null 2>&1; then
    log " OK: DNS resolution for github.com OK"
  else
    log " WARNING: DNS resolution for github.com failed initially, retrying..."
    sleep 2
    if getent hosts github.com >/dev/null 2>&1; then
      log " OK: DNS resolution for github.com OK after retry"
    else
      log " ERROR: DNS resolution for github.com failed"
      connectivity_ok=false
    fi
  fi
  # Test HTTP connectivity to github.com
  log " Testing HTTP connectivity to github.com..."
  if timeout 10 curl -s -o /dev/null --head https://github.com 2>/dev/null; then
    log " OK: Connectivity to github.com OK"
  else
    log " WARNING: Cannot reach github.com, retrying..."
    sleep 2
    if timeout 10 curl -s -o /dev/null --head https://github.com 2>/dev/null; then
      log " OK: Connectivity to github.com OK after retry"
    else
      log " ERROR: Cannot reach github.com (required for docker builds)"
      connectivity_ok=false
    fi
  fi
  [ "$connectivity_ok" = "true" ]
}
# DevOps Escape Room stack (Jenkins + Gitea + code-server + MailHog)
log "Setting up DevOps Escape Room stack..."
# Derive DevOps HTTPS subdomain names early when CADDY_DOMAIN already arrived
# via the user_data environment (the common classroom case). On the EC2-tag
# fallback path they stay empty here and are re-derived once the domain is
# confirmed later in the script.
if [ -n "${CADDY_DOMAIN:-}" ]; then
  # `:=` assigns the derived default only when the variable is unset/empty,
  # preserving any explicit override from the environment.
  : "${JENKINS_DOMAIN:=jenkins-${CADDY_DOMAIN}}"
  : "${IDE_DOMAIN:=ide-${CADDY_DOMAIN}}"
  log "DevOps HTTPS subdomains derived from CADDY_DOMAIN:"
  log " Jenkins: ${JENKINS_DOMAIN}"
  log " IDE: ${IDE_DOMAIN}"
else
  JENKINS_DOMAIN=""
  IDE_DOMAIN=""
fi
# The devops-escape-room directory also ships inside the SUT tarball, but that
# is extracted later; inline copies are written here so this stack can start
# in parallel. The tarball's identical versions later replace them.
mkdir -p \
  /home/ec2-user/devops-escape-room \
  /home/ec2-user/jenkins/casc \
  /home/ec2-user/gitea
# ── Jenkins Dockerfile & plugins ────────────────────────────────────────────
# LTS Jenkins + JDK17 with the setup wizard disabled (JCasC configures it).
# Python/Node/Docker CLI are installed for pipeline stages; the jenkins user
# joins the docker group so it can use the host socket mounted by compose.
cat > /home/ec2-user/jenkins/Dockerfile << 'JENKINSEOF'
FROM jenkins/jenkins:lts-jdk17
ENV JAVA_OPTS="-Djenkins.install.runSetupWizard=false"
ENV CASC_JENKINS_CONFIG=/var/jenkins_home/casc_configs
USER root
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip nodejs npm curl docker.io \
    && rm -rf /var/lib/apt/lists/* \
    && usermod -aG docker jenkins || true
USER jenkins
COPY plugins.txt /usr/share/jenkins/plugins.txt
RUN jenkins-plugin-cli --plugin-file /usr/share/jenkins/plugins.txt --latest false
COPY casc/ /var/jenkins_home/casc_configs/
JENKINSEOF
# Plugin set installed by jenkins-plugin-cli (pinned via --latest false above):
# pipeline/Blue Ocean, Git + Gitea SCM, JCasC, job-dsl, reporting, mail,
# dashboards, docker workflow, and misc build hygiene plugins.
cat > /home/ec2-user/jenkins/plugins.txt << 'PLUGINSEOF'
workflow-aggregator
pipeline-stage-view
blueocean
git
gitea
configuration-as-code
job-dsl
junit
htmlpublisher
build-timeout
credentials
credentials-binding
plain-credentials
ssh-credentials
mailer
email-ext
dashboard-view
build-monitor-plugin
docker-workflow
timestamper
ws-cleanup
antisamy-markup-formatter
PLUGINSEOF
# Jenkins Configuration-as-Code. The heredoc delimiter is quoted on purpose:
# "${JENKINS_ADMIN_PASSWORD:-fellowship123}" must reach the file literally so
# JCasC (inside the container), not this shell, performs the expansion.
# Also seeds the fellowship-sut-pipeline job polling Gitea every 5 minutes.
cat > /home/ec2-user/jenkins/casc/jenkins.yaml << 'CASCEOF'
jenkins:
  systemMessage: |
    🧙 Welcome to the Fellowship's Jenkins CI!
    One does not simply skip the pipeline...
  numExecutors: 2
  securityRealm:
    local:
      allowsSignup: false
      users:
        - id: "fellowship"
          name: "Gandalf the Grey"
          password: "${JENKINS_ADMIN_PASSWORD:-fellowship123}"
  authorizationStrategy:
    loggedInUsersCanDoAnything:
      allowAnonymousRead: true
  globalNodeProperties:
    - envVars:
        env:
          - key: "GITEA_URL"
            value: "http://gitea:3000"
          - key: "SUT_REPO"
            value: "http://gitea:3000/fellowship/lotr-sut.git"
unclassified:
  location:
    url: "http://localhost:8080/"
    adminAddress: "gandalf@fellowship.local"
  mailer:
    smtpHost: "mailhog"
    smtpPort: "1025"
    useSsl: false
    charset: "UTF-8"
jobs:
  - script: |
      pipelineJob('fellowship-sut-pipeline') {
        displayName('Fellowship SUT — CI Pipeline')
        description('One pipeline to build them all, and in the darkness test them.')
        definition {
          cpsScm {
            scm {
              git {
                remote {
                  url('http://gitea:3000/fellowship/lotr-sut.git')
                }
                branch('*/main')
              }
            }
            scriptPath('Jenkinsfile')
          }
        }
        triggers {
          scm('H/5 * * * *')
        }
        logRotator {
          numToKeep(10)
        }
      }
CASCEOF
# ── Gitea init script ────────────────────────────────────────────────────────
# Runs inside the alpine/git "gitea-init" container (POSIX sh): waits for the
# Gitea API and admin user, idempotently creates the org/repo, and pushes the
# SUT source only when the repository has no commits yet. Quoted heredoc:
# everything below is literal; the container expands its own env vars.
cat > /home/ec2-user/gitea/init.sh << 'GITEAINITEOF'
#!/bin/sh
set -e
GITEA_URL="${GITEA_URL:-http://gitea:3000}"
ADMIN_USER="${GITEA_ADMIN_USER:-fellowship}"
ADMIN_PASS="${GITEA_ADMIN_PASSWORD:-fellowship123}"
ADMIN_EMAIL="${GITEA_ADMIN_EMAIL:-gandalf@fellowship.local}"
ORG_NAME="fellowship"
REPO_NAME="lotr-sut"
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] [gitea-init] $*"; }
wait_for_gitea() {
  log "Waiting for Gitea at ${GITEA_URL}..."
  i=0; while [ $i -lt 30 ]; do
    curl -sf "${GITEA_URL}/api/v1/version" > /dev/null 2>&1 && log "✓ Gitea ready" && return 0
    log " attempt $((i+1))/30 — waiting 5s..."; sleep 5; i=$((i+1))
  done
  log "ERROR: Gitea not ready"; return 1
}
wait_for_admin() {
  log "Waiting for Gitea admin user '${ADMIN_USER}' to be ready..."
  i=0; while [ $i -lt 30 ]; do
    curl -sf -u "${ADMIN_USER}:${ADMIN_PASS}" "${GITEA_URL}/api/v1/user" > /dev/null 2>&1 \
      && log "✓ Admin user ready" && return 0
    log " attempt $((i+1))/30 — waiting 3s..."; sleep 3; i=$((i+1))
  done
  log "WARNING: Admin user not ready — proceeding anyway (some operations may fail)"
  return 0
}
api() { curl -sf -u "${ADMIN_USER}:${ADMIN_PASS}" "$@"; }
wait_for_gitea
wait_for_admin
# Create org (ignore if exists)
api -X POST "${GITEA_URL}/api/v1/orgs" \
  -H "Content-Type: application/json" \
  -d "{\"username\":\"${ORG_NAME}\",\"full_name\":\"The Fellowship of the Ring\",\"visibility\":\"public\"}" \
  > /dev/null 2>&1 || true
log "✓ Organization '${ORG_NAME}' ready"
# Create repo (ignore if exists)
api -X POST "${GITEA_URL}/api/v1/orgs/${ORG_NAME}/repos" \
  -H "Content-Type: application/json" \
  -d "{\"name\":\"${REPO_NAME}\",\"description\":\"LOTR SUT\",\"private\":false,\"auto_init\":false,\"default_branch\":\"main\"}" \
  > /dev/null 2>&1 || true
log "✓ Repository '${ORG_NAME}/${REPO_NAME}' ready"
# Push code if repo is empty.
# FIX: grep -c prints "0" AND exits 1 on zero matches, so the previous
# `|| echo "0"` produced the two-line value "0\n0" and broke the -gt test.
# `|| true` keeps grep's own count while still protecting `set -e` from the
# pipeline's non-zero status.
COMMITS=$(api "${GITEA_URL}/api/v1/repos/${ORG_NAME}/${REPO_NAME}/commits?limit=1" 2>/dev/null | grep -c '"sha"' || true)
if [ "${COMMITS:-0}" -gt 0 ]; then
  log "✓ Repository already has commits — skipping push"; exit 0
fi
SRC=""
for d in /sut-source /home/ec2-user; do
  [ -f "${d}/docker-compose.yml" ] && [ -d "${d}/sut" ] && SRC="$d" && break
done
if [ -z "$SRC" ]; then
  log "WARNING: SUT source not found — skipping code push"; exit 0
fi
log "Pushing code from ${SRC} to Gitea..."
# NOTE: embeds the admin credentials in the push URL; assumes they contain no
# characters that need URL-encoding (true for the classroom defaults).
AUTH_URL=$(echo "${GITEA_URL}/${ORG_NAME}/${REPO_NAME}.git" | sed "s|http://|http://${ADMIN_USER}:${ADMIN_PASS}@|")
TMP=$(mktemp -d)
cp -a "${SRC}/." "${TMP}/"
cd "${TMP}"
rm -rf .git
git init -b main
git config user.email "${ADMIN_EMAIL}"
git config user.name "Gandalf the Grey"
git add -A
git commit -m "🧙 Initial commit: The Fellowship's Quest List SUT"
git remote add gitea "${AUTH_URL}"
git push gitea main
cd /; rm -rf "${TMP}"
log "✓ SUT code pushed to Gitea"
GITEAINITEOF
chmod +x /home/ec2-user/gitea/init.sh
# ── code-server custom Dockerfile + entrypoint ──────────────────────────────
# Extends codercom/code-server with Docker CLI, Compose v2, and pre-installed
# VS Code extensions (Python, Playwright, Copilot, Jupyter, Prettier).
mkdir -p /home/ec2-user/devops-escape-room/code-server
# NOTE(review): the second `USER root` near the bottom of this Dockerfile is
# redundant (the build never switches away from root after the first one).
cat > /home/ec2-user/devops-escape-room/code-server/Dockerfile << 'CSRVDOCKEREOF'
FROM codercom/code-server:latest
USER root
# Docker CLI + gosu (for clean privilege drop) + utilities
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    docker.io \
    gosu \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*
# Docker Compose v2 plugin + legacy symlink
RUN mkdir -p /usr/local/lib/docker/cli-plugins && \
    curl -fsSL \
    "https://github.com/docker/compose/releases/download/v2.27.0/docker-compose-linux-x86_64" \
    -o /usr/local/lib/docker/cli-plugins/docker-compose && \
    chmod +x /usr/local/lib/docker/cli-plugins/docker-compose && \
    ln -sf /usr/local/lib/docker/cli-plugins/docker-compose /usr/local/bin/docker-compose
# Coder user docker group (GID re-aligned at runtime by entrypoint)
RUN groupadd -g 999 docker 2>/dev/null || groupmod -g 999 docker 2>/dev/null || true && \
    usermod -aG docker coder
COPY entrypoint.sh /usr/bin/fellowship-docker-init.sh
RUN chmod +x /usr/bin/fellowship-docker-init.sh
USER root
ENTRYPOINT ["/usr/bin/fellowship-docker-init.sh"]
CSRVDOCKEREOF
# Entrypoint: aligns the container's docker group GID with the mounted host
# socket, best-effort installs VS Code extensions, seeds settings.json once,
# then drops to the coder user and execs the stock code-server entrypoint.
cat > /home/ec2-user/devops-escape-room/code-server/entrypoint.sh << 'CSRVENTRYEOF'
#!/bin/bash
# Fellowship code-server entrypoint: fixes Docker GID, installs extensions, starts IDE
set -e
log() { echo "[$(date '+%H:%M:%S')] [fellowship-init] $*"; }
# Fix docker group GID to match host socket
if [ -S /var/run/docker.sock ]; then
  DOCK_GID=$(stat -c '%g' /var/run/docker.sock)
  if getent group docker > /dev/null 2>&1; then
    groupmod -g "${DOCK_GID}" docker 2>/dev/null || true
  else
    groupadd -g "${DOCK_GID}" docker 2>/dev/null || true
  fi
  usermod -aG docker coder 2>/dev/null || true
  chmod 666 /var/run/docker.sock 2>/dev/null || true
  log "Docker group GID aligned to ${DOCK_GID}"
else
  log "WARNING: docker.sock not mounted — docker unavailable in IDE terminal"
fi
# Install VS Code extensions as coder user
log "Installing VS Code extensions..."
for ext in ms-python.python github.copilot ms-playwright.playwright esbenp.prettier-vscode ms-toolsai.jupyter redhat.vscode-yaml ms-azuretools.vscode-docker; do
  gosu coder code-server --install-extension "${ext}" --force > /dev/null 2>&1 && \
    log " OK ${ext}" || log " SKIP ${ext}"
done
# Default settings
SETTINGS_DIR="/home/coder/.local/share/code-server/User"
if [ ! -f "${SETTINGS_DIR}/settings.json" ]; then
  gosu coder mkdir -p "${SETTINGS_DIR}"
  cat > "${SETTINGS_DIR}/settings.json" << 'SETTINGSEOF'
{
"python.defaultInterpreterPath": "/usr/bin/python3",
"editor.formatOnSave": true,
"terminal.integrated.defaultProfile.linux": "bash",
"git.autofetch": true,
"docker.host": "unix:///var/run/docker.sock"
}
SETTINGSEOF
  chown coder:coder "${SETTINGS_DIR}/settings.json"
  log "Default settings.json written"
fi
log "Starting code-server..."
exec gosu coder /usr/bin/entrypoint.sh "$@"
CSRVENTRYEOF
chmod +x /home/ec2-user/devops-escape-room/code-server/entrypoint.sh
chown -R ec2-user:ec2-user /home/ec2-user/devops-escape-room/code-server
log "✓ code-server Dockerfile and entrypoint written"
# ── devops-escape-room docker-compose ────────────────────────────────────────
# Quoted heredoc: the ${VAR:-default} placeholders are interpolated by
# docker compose (from devops-escape-room/.env), not by this shell.
cat > /home/ec2-user/devops-escape-room/docker-compose.yml << 'COMPOSEEOF'
# Fellowship DevOps Escape Room Stack
# Jenkins CI | Gitea Git | code-server IDE | MailHog mail
#
# Jenkins: http://HOST:8080 (fellowship / fellowship123)
# Gitea: http://HOST:3030 (fellowship / fellowship123)
# code-server: http://HOST:8443 (password: fellowship)
# MailHog: http://HOST:8025
services:
  jenkins:
    build:
      context: ../jenkins
      dockerfile: Dockerfile
    image: fellowship-jenkins:latest
    container_name: fellowship-jenkins
    restart: unless-stopped
    ports:
      - "8080:8080"
      - "50000:50000"
    volumes:
      - jenkins_home:/var/jenkins_home
      - /var/run/docker.sock:/var/run/docker.sock
    environment:
      JENKINS_ADMIN_PASSWORD: ${JENKINS_ADMIN_PASSWORD:-fellowship123}
      CASC_JENKINS_CONFIG: /var/jenkins_home/casc_configs
      # JENKINS_URL is written to devops-escape-room/.env by setup_fellowship.sh
      # once CADDY_DOMAIN is known, so Jenkins knows its canonical HTTPS URL.
      JENKINS_URL: ${JENKINS_URL:-http://localhost:8080/}
    depends_on:
      gitea:
        condition: service_healthy
    healthcheck:
      test: ["CMD-SHELL", "curl -sf http://localhost:8080/login || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 90s
  gitea:
    image: gitea/gitea:1.22
    container_name: fellowship-gitea
    restart: unless-stopped
    ports:
      - "3030:3000"
      - "2222:22"
    volumes:
      - gitea_data:/data
    environment:
      USER_UID: "1000"
      USER_GID: "1000"
      GITEA__database__DB_TYPE: sqlite3
      GITEA__server__DOMAIN: ${GITEA_DOMAIN:-localhost}
      GITEA__server__HTTP_PORT: "3000"
      GITEA__server__ROOT_URL: ${GITEA_ROOT_URL:-http://localhost:3030/}
      GITEA__server__SSH_DOMAIN: localhost
      GITEA__server__SSH_PORT: "2222"
      GITEA__service__DISABLE_REGISTRATION: "false"
      GITEA__service__REQUIRE_SIGNIN_VIEW: "false"
      GITEA__security__INSTALL_LOCK: "true"
      GITEA__mailer__ENABLED: "false"
      # Pre-create the admin user on first boot via Gitea environment variables.
      # Without these the gitea-init container cannot authenticate against the API.
      GITEA__admin__ADMIN_USER: ${GITEA_ADMIN_USER:-fellowship}
      GITEA__admin__ADMIN_PASSWD: ${GITEA_ADMIN_PASSWORD:-fellowship123}
      GITEA__admin__ADMIN_EMAIL: ${GITEA_ADMIN_EMAIL:-gandalf@fellowship.local}
      GITEA__admin__SEND_NOTIFY: "false"
    healthcheck:
      test: ["CMD-SHELL", "curl -sf http://localhost:3000/api/v1/version || exit 1"]
      interval: 15s
      timeout: 5s
      retries: 6
      start_period: 30s
  gitea-init:
    image: alpine/git:latest
    container_name: fellowship-gitea-init
    restart: on-failure:5
    environment:
      GITEA_URL: "http://gitea:3000"
      GITEA_ADMIN_USER: ${GITEA_ADMIN_USER:-fellowship}
      GITEA_ADMIN_PASSWORD: ${GITEA_ADMIN_PASSWORD:-fellowship123}
      GITEA_ADMIN_EMAIL: ${GITEA_ADMIN_EMAIL:-gandalf@fellowship.local}
      GITEA_DOMAIN: ${GITEA_DOMAIN:-}
      SUT_SOURCE_DIR: /sut-source
    volumes:
      - ../gitea/init.sh:/init.sh:ro
      - /home/ec2-user/sut:/sut-source/sut:ro
      - /home/ec2-user/docker-compose.yml:/sut-source/docker-compose.yml:ro
      - /home/ec2-user/caddy:/sut-source/caddy:ro
      - /home/ec2-user/nginx:/sut-source/nginx:ro
      - /home/ec2-user/Jenkinsfile:/sut-source/Jenkinsfile:ro
    entrypoint: ["/bin/sh", "/init.sh"]
    depends_on:
      gitea:
        condition: service_healthy
  code-server:
    build:
      context: ./code-server
      dockerfile: Dockerfile
    image: fellowship-code-server:latest
    container_name: fellowship-code-server
    restart: unless-stopped
    ports:
      - "8443:8080"
    volumes:
      - /home/ec2-user:/home/coder/fellowship:rw
      - codeserver_config:/home/coder/.config
      # Mount Docker socket so students can run docker compose from the IDE terminal
      - /var/run/docker.sock:/var/run/docker.sock
    environment:
      PASSWORD: ${CODESERVER_PASSWORD:-fellowship}
    command:
      - --auth=password
      - --bind-addr=0.0.0.0:8080
      - /home/coder/fellowship
    healthcheck:
      test: ["CMD-SHELL", "curl -sf http://localhost:8080/ || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
  mailhog:
    image: mailhog/mailhog:v1.0.1
    container_name: fellowship-mailhog
    restart: unless-stopped
    ports:
      - "1025:1025"
      - "8025:8025"
volumes:
  jenkins_home:
    driver: local
  gitea_data:
    driver: local
  # FIX: code-server mounts this named volume above; without a top-level
  # declaration `docker compose up` rejects the whole project with
  # "refers to undefined volume codeserver_config".
  codeserver_config:
    driver: local
COMPOSEEOF
# Pre-create code-server config directory with proper permissions (bind mount)
# The coder user inside the container runs as 1000:1000, so the host directory
# must be owned by ec2-user (uid 1000) to avoid "permission denied" errors.
# NOTE(review): the compose file mounts the *named volume* codeserver_config,
# not this directory — confirm whether this bind-mount dir is still needed.
mkdir -p /home/ec2-user/.codeserver-config
chown 1000:1000 /home/ec2-user/.codeserver-config
chmod 700 /home/ec2-user/.codeserver-config
chown -R ec2-user:ec2-user /home/ec2-user/devops-escape-room
chown -R ec2-user:ec2-user /home/ec2-user/jenkins
chown -R ec2-user:ec2-user /home/ec2-user/gitea
# Write the devops-escape-room .env so Docker Compose can inject the correct
# JENKINS_URL into the Jenkins container before it starts.
# If JENKINS_DOMAIN is not yet known (EC2-tag path), Jenkins defaults to localhost
# and will be updated automatically after the SUT domain is resolved.
DEVOPS_JENKINS_URL="${JENKINS_DOMAIN:+https://${JENKINS_DOMAIN}/}"
DEVOPS_JENKINS_URL="${DEVOPS_JENKINS_URL:-http://localhost:8080/}"
# Copy devops-escape-room environment template to .env
log "Setting up devops-escape-room environment..."
if [ -f /home/ec2-user/devops-escape-room/.env.prod ]; then
  cp /home/ec2-user/devops-escape-room/.env.prod /home/ec2-user/devops-escape-room/.env
  chown ec2-user:ec2-user /home/ec2-user/devops-escape-room/.env
  chmod 644 /home/ec2-user/devops-escape-room/.env
  log "✓ Copied devops-escape-room/.env.prod to .env"
  # Update JENKINS_URL if domain is available.
  # FIX: this runs on GNU sed (Amazon Linux). The previous BSD-style
  # `sed -i "" ...` made GNU sed treat "" as the script and the s-command as
  # a filename, failing the whole bootstrap under `set -e`.
  if [ -n "$JENKINS_DOMAIN" ]; then
    sed -i "s|^JENKINS_URL=.*|JENKINS_URL=${DEVOPS_JENKINS_URL}|" /home/ec2-user/devops-escape-room/.env
  fi
else
  # Fallback: create .env inline if template not found
  cat > /home/ec2-user/devops-escape-room/.env << EOF
JENKINS_ADMIN_PASSWORD=fellowship123
GITEA_ADMIN_USER=fellowship
GITEA_ADMIN_PASSWORD=fellowship123
GITEA_ADMIN_EMAIL=gandalf@fellowship.local
CODESERVER_PASSWORD=fellowship
JENKINS_URL=${DEVOPS_JENKINS_URL}
GITEA_DOMAIN=
EOF
  chown ec2-user:ec2-user /home/ec2-user/devops-escape-room/.env
  log "✓ Wrote devops-escape-room/.env (fallback mode, JENKINS_URL=${DEVOPS_JENKINS_URL})"
fi
log "Building and starting DevOps Escape Room stack..."
# Build images and start the whole stack as ec2-user (docker group member).
run_as_ec2user_docker "cd ~/devops-escape-room && docker compose up -d --build" 2>&1 | \
  tee -a "$LOG_FILE"
# FIX: `if pipeline | tee` tested tee's exit status (always 0), so the success
# message was printed unconditionally; check the compose stage via PIPESTATUS.
if [ "${PIPESTATUS[0]}" -eq 0 ]; then
  log "✓ DevOps Escape Room stack started (Jenkins, Gitea, code-server, MailHog)"
else
  log "WARNING: DevOps Escape Room stack may not have started cleanly — check logs"
fi
# Fellowship SUT Setup
log "Setting up Fellowship SUT..."
# Get SUT bucket from SSM
log "Retrieving SUT bucket from SSM: /classroom/fellowship/sut-bucket"
# FIX: the assignment must sit inside the `if` condition — as a standalone
# statement a non-zero aws exit tripped `set -e` before the old `[ $? -ne 0 ]`
# check could run, so the error branch was dead code.
if ! SUT_BUCKET=$(aws ssm get-parameter --name "/classroom/fellowship/sut-bucket" \
    --query "Parameter.Value" --output text --region "${AWS_REGION}" 2>&1) \
  || [ -z "$SUT_BUCKET" ] || [ "$SUT_BUCKET" = "None" ]; then
  log "ERROR: Failed to get SUT bucket from SSM"
  log "Error: $SUT_BUCKET"
  exit 1
fi
log "SUT bucket: $SUT_BUCKET"
# Download SUT from S3
log "Finding latest SUT artifact in S3..."
# Sort by the listing's "date time name" columns and keep the newest artifact.
LATEST_TAR=$(aws s3 ls "s3://${SUT_BUCKET}/" --region "${AWS_REGION}" | \
  awk '/fellowship-sut-.*\.tar\.gz$/ {print $1" "$2" "$4}' | \
  sort | tail -n 1 | awk '{print $3}')
if [ -z "$LATEST_TAR" ]; then
  log "ERROR: No fellowship-sut-*.tar.gz artifact found in S3 bucket"
  exit 1
fi
log "Downloading latest SUT artifact: $LATEST_TAR"
if ! aws s3 cp "s3://${SUT_BUCKET}/${LATEST_TAR}" /tmp/fellowship-sut.tar.gz --region "${AWS_REGION}" >/dev/null 2>&1 \
  || [ ! -f /tmp/fellowship-sut.tar.gz ]; then
  log "ERROR: Failed to download SUT from S3"
  log "Expected location: s3://${SUT_BUCKET}/${LATEST_TAR}"
  exit 1
fi
log "✓ SUT downloaded"
# Extract SUT
log "Extracting SUT..."
if ! tar -xzf /tmp/fellowship-sut.tar.gz -C /home/ec2-user/ 2>/dev/null; then
  log "ERROR: Failed to extract SUT"
  exit 1
fi
rm -f /tmp/fellowship-sut.tar.gz
# Tarball extracts to sut/ and docker-compose.yml at home root - chown both
chown -R ec2-user:ec2-user /home/ec2-user/sut 2>/dev/null || true
chown ec2-user:ec2-user /home/ec2-user/docker-compose.yml 2>/dev/null || true
chown -R ec2-user:ec2-user /home/ec2-user/caddy 2>/dev/null || true
chown -R ec2-user:ec2-user /home/ec2-user/nginx 2>/dev/null || true
log "✓ SUT extracted"
# Copy environment template to .env for docker-compose
# This ensures COMPOSE_PROJECT_NAME=fellowship (production) is used
log "Setting up production environment (.env.prod → .env)..."
if [ ! -f /home/ec2-user/.env.prod ]; then
  log "WARNING: .env.prod not found — .env will be created from scratch"
else
  cp /home/ec2-user/.env.prod /home/ec2-user/.env
  chown ec2-user:ec2-user /home/ec2-user/.env
  chmod 644 /home/ec2-user/.env
  log "✓ Copied .env.prod to .env"
fi
# Get instance domain for Caddy
# PRIORITY 1: Check if domain was passed via user_data environment variable
# This is the most reliable method - domain is known before instance creation
log "Getting instance domain for Caddy..."
if [ -n "$CADDY_DOMAIN" ] && [ "$CADDY_DOMAIN" != "" ]; then
  log "✓ Found Caddy domain from user_data environment: $CADDY_DOMAIN"
  # Domain is already set, no need to query EC2 tags
else
  # PRIORITY 2: Fallback to EC2 tags (requires instance ID from metadata service)
  log "Domain not in environment, attempting to get from EC2 tags..."
  # NOTE: INSTANCE_ID is only assigned on this fallback path; later readers
  # (Route53 zone lookup) guard with ${INSTANCE_ID:-}.
  INSTANCE_ID=""
  CADDY_DOMAIN=""
  # Retry getting instance ID (metadata service may not be ready immediately)
  for i in {1..10}; do
    INSTANCE_ID=$(get_instance_metadata "instance-id")
    if [ -n "$INSTANCE_ID" ]; then
      log "✓ Got instance ID: $INSTANCE_ID"
      break
    fi
    if [ $i -lt 10 ]; then
      log " Attempt $i/10: Instance ID not available yet, waiting 2s..."
      sleep 2
    fi
  done
  if [ -n "$INSTANCE_ID" ]; then
    # Get domain from instance tags (set by Lambda BEFORE instance creation)
    # With predictable domain names, this should be available immediately
    log "Retrieving HttpsDomain tag from instance tags..."
    # 6 attempts x 2s covers tag-propagation lag after instance creation.
    for i in {1..6}; do
      CADDY_DOMAIN=$(aws ec2 describe-tags --region "${AWS_REGION}" --filters "Name=resource-id,Values=${INSTANCE_ID}" "Name=key,Values=HttpsDomain" --query "Tags[0].Value" --output text 2>/dev/null || echo "")
      if [ -n "$CADDY_DOMAIN" ] && [ "$CADDY_DOMAIN" != "None" ] && [ "$CADDY_DOMAIN" != "" ]; then
        log "✓ Found Caddy domain from tags: $CADDY_DOMAIN"
        break
      fi
      if [ $i -lt 6 ]; then
        log " Attempt $i/6: HttpsDomain tag not found yet, waiting 2s..."
        sleep 2
      fi
    done
  else
    log "WARNING: Could not get instance ID after retries"
  fi
  # Final check
  if [ -z "$CADDY_DOMAIN" ] || [ "$CADDY_DOMAIN" = "None" ] || [ "$CADDY_DOMAIN" = "" ]; then
    log "ERROR: Caddy domain not found - cannot deploy AWS Fellowship SUT without a valid domain"
    log " Ensure HttpsDomain tag is set before instance bootstrap"
    exit 1
  fi
fi
# Normalize to lowercase to avoid mixed-case DNS/tag drift
CADDY_DOMAIN=$(echo "$CADDY_DOMAIN" | tr '[:upper:]' '[:lower:]')
# Enforce domain presence for AWS deployment
if [ -z "$CADDY_DOMAIN" ] || [ "$CADDY_DOMAIN" = "None" ] || [ "$CADDY_DOMAIN" = "" ]; then
  log "ERROR: Caddy domain is required for AWS deployment"
  exit 1
fi
# Wait for DNS propagation before starting containers (required for Caddy automatic HTTPS)
# Guard clause: without the instance's public IP we cannot verify DNS at all.
PUBLIC_IP_FOR_DNS=$(get_instance_metadata "public-ipv4")
[ -n "$PUBLIC_IP_FOR_DNS" ] || {
  log "ERROR: Could not retrieve instance public IP for DNS verification"
  exit 1
}
resolve_domain_ipv4() {
local domain="$1"
local resolved_ip
resolved_ip=$(getent ahostsv4 "$domain" 2>/dev/null | awk '{print $1}' | grep -E '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' | head -1 || true)
if [ -z "$resolved_ip" ]; then
resolved_ip=$(nslookup "$domain" 2>/dev/null | awk '/^Address: / {print $2}' | grep -E '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' | tail -1 || true)
fi
echo "$resolved_ip"
}
log "Waiting for DNS propagation: ${CADDY_DOMAIN} -> ${PUBLIC_IP_FOR_DNS}"
DNS_MATCHED="false"
# Poll up to 30 times x 10s (the "5 minutes" in the error below) until the
# domain's A record matches this instance's public IP.
for i in {1..30}; do
  RESOLVED_IP=$(resolve_domain_ipv4 "$CADDY_DOMAIN")
  if [ "$RESOLVED_IP" = "$PUBLIC_IP_FOR_DNS" ]; then
    DNS_MATCHED="true"
    log "✓ DNS propagation complete (${CADDY_DOMAIN} resolves to ${RESOLVED_IP})"
    break
  fi
  log " Attempt $i/30: ${CADDY_DOMAIN} resolves to '${RESOLVED_IP:-unresolved}' (expected ${PUBLIC_IP_FOR_DNS}), waiting 10s..."
  sleep 10
done
# Hard failure: per the message below, Caddy's automatic HTTPS cannot obtain
# a certificate until the domain points at this instance.
if [ "$DNS_MATCHED" != "true" ]; then
  log "ERROR: DNS propagation timeout after 5 minutes"
  log " ${CADDY_DOMAIN} did not resolve to instance public IP ${PUBLIC_IP_FOR_DNS}"
  log " Caddy automatic HTTPS cannot succeed until DNS is correct"
  exit 1
fi
# ── Derive DevOps HTTPS subdomain names ──────────────────────────────────────
# Pattern: jenkins-{CADDY_DOMAIN} and ide-{CADDY_DOMAIN}
# These may already be set from the early derivation above; re-assign to ensure
# they reflect the confirmed (possibly tag-derived) CADDY_DOMAIN.
# CADDY_DOMAIN was lowercased above, so these inherit the normalized form.
JENKINS_DOMAIN="jenkins-${CADDY_DOMAIN}"
IDE_DOMAIN="ide-${CADDY_DOMAIN}"
GITEA_DOMAIN="gitea-${CADDY_DOMAIN}"
log "DevOps HTTPS subdomains confirmed:"
log " Jenkins: ${JENKINS_DOMAIN}"
log " IDE: ${IDE_DOMAIN}"
log " Gitea: ${GITEA_DOMAIN}"
# ── Create Route53 A records for jenkins and ide subdomains ──────────────────
# Both subdomains must point to the same instance IP as CADDY_DOMAIN.
# Try to find the hosted-zone ID from (in order):
# 1. ROUTE53_ZONE_ID environment variable
# 2. EC2 instance tag Route53ZoneId
# 3. SSM parameter /classroom/fellowship/route53-zone-id
# 4. Route53 lookup by parent domain (DNS tree walk)
ROUTE53_ZONE_ID="${ROUTE53_ZONE_ID:-}"
if [ -n "$ROUTE53_ZONE_ID" ] && [ "$ROUTE53_ZONE_ID" != "None" ]; then
  log "Using Route53 zone ID from environment: ${ROUTE53_ZONE_ID}"
else
  ROUTE53_ZONE_ID=""
fi
# Source 1: EC2 instance tag (set by the provisioning Lambda).
# FIX: only consult the tag when nothing came from the environment —
# previously this overwrote an env-supplied zone ID unconditionally.
if [ -z "$ROUTE53_ZONE_ID" ] && [ -n "${INSTANCE_ID:-}" ]; then
  ROUTE53_ZONE_ID=$(aws ec2 describe-tags --region "${AWS_REGION}" \
    --filters "Name=resource-id,Values=${INSTANCE_ID}" "Name=key,Values=Route53ZoneId" \
    --query "Tags[0].Value" --output text 2>/dev/null || echo "")
  [ "$ROUTE53_ZONE_ID" = "None" ] && ROUTE53_ZONE_ID=""
fi
# Source 2: SSM parameter
if [ -z "$ROUTE53_ZONE_ID" ]; then
  ROUTE53_ZONE_ID=$(aws ssm get-parameter \
    --name "/classroom/fellowship/route53-zone-id" \
    --query "Parameter.Value" --output text --region "${AWS_REGION}" 2>/dev/null || echo "")
  [ "$ROUTE53_ZONE_ID" = "None" ] && ROUTE53_ZONE_ID=""
fi
# Source 3: Walk up the DNS tree stripping one label at a time until a
# matching hosted zone is found. A single sed strip is not enough when
# CADDY_DOMAIN has multiple subdomain levels (e.g.
# fellowship-<id>.fellowship.testingfantasy.com — stripping one label gives
# fellowship.testingfantasy.com which is NOT a hosted zone; the actual zone
# is testingfantasy.com two levels up).
if [ -z "$ROUTE53_ZONE_ID" ]; then
  ZONE_SUFFIX=$(echo "$CADDY_DOMAIN" | sed 's/^[^.]*\.//')
  # Loop while the candidate suffix still contains at least one dot.
  while [ -n "$ZONE_SUFFIX" ] && [ "$ZONE_SUFFIX" != "${ZONE_SUFFIX#*.}" ]; do
    # FIX: keep only the first matching zone ID — with multiple matches the
    # text output is tab-joined, which the old `!= "\t"` check never caught.
    CANDIDATE=$(aws route53 list-hosted-zones-by-name \
      --dns-name "${ZONE_SUFFIX}" \
      --query "HostedZones[?Name==\`${ZONE_SUFFIX}.\`].Id" \
      --output text 2>/dev/null \
      | sed 's|/hostedzone/||' | awk '{print $1}' || echo "")
    if [ -n "$CANDIDATE" ] && [ "$CANDIDATE" != "None" ]; then
      ROUTE53_ZONE_ID="$CANDIDATE"
      # FIX: added a separator between the suffix and the zone ID in the log.
      log " Route53 zone found via DNS tree walk: ${ZONE_SUFFIX} (zone ID: ${ROUTE53_ZONE_ID})"
      break
    fi
    ZONE_SUFFIX=$(echo "$ZONE_SUFFIX" | sed 's/^[^.]*\.//')
  done
  [ "${ROUTE53_ZONE_ID:-}" = "None" ] && ROUTE53_ZONE_ID=""
fi
if [ -n "$ROUTE53_ZONE_ID" ]; then
  log "Creating/updating Route53 A records for DevOps subdomains (zone ${ROUTE53_ZONE_ID})..."
  # UPSERT one A record per DevOps subdomain, all pointing at this instance.
  aws route53 change-resource-record-sets \
    --hosted-zone-id "$ROUTE53_ZONE_ID" \
    --change-batch "{
      \"Comment\": \"Fellowship DevOps Escape Room HTTPS subdomains\",
      \"Changes\": [
        {
          \"Action\": \"UPSERT\",
          \"ResourceRecordSet\": {
            \"Name\": \"${JENKINS_DOMAIN}\",
            \"Type\": \"A\",
            \"TTL\": 60,
            \"ResourceRecords\": [{\"Value\": \"${PUBLIC_IP_FOR_DNS}\"}]
          }
        },
        {
          \"Action\": \"UPSERT\",
          \"ResourceRecordSet\": {
            \"Name\": \"${IDE_DOMAIN}\",
            \"Type\": \"A\",
            \"TTL\": 60,
            \"ResourceRecords\": [{\"Value\": \"${PUBLIC_IP_FOR_DNS}\"}]
          }
        },
        {
          \"Action\": \"UPSERT\",
          \"ResourceRecordSet\": {
            \"Name\": \"${GITEA_DOMAIN}\",
            \"Type\": \"A\",
            \"TTL\": 60,
            \"ResourceRecords\": [{\"Value\": \"${PUBLIC_IP_FOR_DNS}\"}]
          }
        }
      ]
    }" \
    --region "$AWS_REGION" 2>&1 | tee -a "$LOG_FILE"
  # FIX: the old `| tee && log ✓ || log WARNING` tested tee's exit status, so
  # the success branch always ran; check the aws stage via PIPESTATUS instead.
  if [ "${PIPESTATUS[0]}" -eq 0 ]; then
    log "✓ Route53 A records upserted for ${JENKINS_DOMAIN}, ${IDE_DOMAIN} and ${GITEA_DOMAIN}"
  else
    log "WARNING: Route53 record update failed — manual DNS setup may be required"
  fi
else
  # FIX: added the " -> " separator the old messages were missing between
  # the record name and the target IP.
  log "WARNING: Route53 zone ID not found — DevOps subdomains need manual DNS setup:"
  log " A record: ${JENKINS_DOMAIN} -> ${PUBLIC_IP_FOR_DNS}"
  log " A record: ${IDE_DOMAIN} -> ${PUBLIC_IP_FOR_DNS}"
  log " A record: ${GITEA_DOMAIN} -> ${PUBLIC_IP_FOR_DNS}"
fi
# Update the devops-escape-room .env so Jenkins knows its canonical HTTPS URL.
# The devops stack may already be running; the container will pick up the new URL
# on the next restart or if Jenkins JCasC is reloaded.
# NOTE(review): these are fixed classroom defaults, also baked into the compose
# file and JCasC config — rotate all three together if they ever change.
cat > /home/ec2-user/devops-escape-room/.env << EOF
JENKINS_ADMIN_PASSWORD=fellowship123
GITEA_ADMIN_USER=fellowship
GITEA_ADMIN_PASSWORD=fellowship123
GITEA_ADMIN_EMAIL=gandalf@fellowship.local
CODESERVER_PASSWORD=fellowship
JENKINS_URL=https://${JENKINS_DOMAIN}/
GITEA_DOMAIN=${GITEA_DOMAIN}
GITEA_ROOT_URL=https://${GITEA_DOMAIN}/
EOF
chown ec2-user:ec2-user /home/ec2-user/devops-escape-room/.env
log "✓ Updated devops-escape-room/.env with JENKINS_URL=https://${JENKINS_DOMAIN}/ GITEA_DOMAIN=${GITEA_DOMAIN}"
# Recreate Gitea so it picks up the correct GITEA_ROOT_URL and GITEA_DOMAIN.
# FIX: `docker compose restart` keeps the container's original environment and
# does NOT re-read .env — only `up -d` recreates the service with new config.
run_as_ec2user_docker "cd ~/devops-escape-room && docker compose up -d gitea" 2>&1 | tee -a "$LOG_FILE"
# FIX: tee's exit status used to decide success (the ✓ branch always ran);
# inspect the compose command's status via PIPESTATUS instead.
if [ "${PIPESTATUS[0]}" -eq 0 ]; then
  log "✓ Gitea restarted with GITEA_ROOT_URL=https://${GITEA_DOMAIN}/"
else
  log "WARNING: Gitea restart failed — ROOT_URL may still point to localhost"
fi
# Deploy SUT
log "Deploying SUT..."
# Preconditions: the compose file extracted from the tarball and the Docker
# Compose CLI plugin installed earlier must both be present.
[ -f /home/ec2-user/docker-compose.yml ] || {
  log "ERROR: SUT docker-compose.yml not found"
  exit 1
}
[ -x /home/ec2-user/.docker/cli-plugins/docker-compose ] || {
  log "ERROR: Docker Compose plugin not executable"
  exit 1
}
# Get Azure OpenAI credentials from Secrets Manager
log "Retrieving Azure OpenAI credentials from Secrets Manager..."
AZURE_SECRET=""
AZURE_ENDPOINT=""
AZURE_API_KEY=""
AZURE_DEPLOYMENT=""
AZURE_API_VERSION=""
# Primary secret path (new schema)
SECRET_NAME="azure/llm/configs"
# Legacy fallback path (old schema)
ENVIRONMENT="${ENVIRONMENT:-dev}"
LEGACY_SECRET_NAME="classroom/shared/${ENVIRONMENT}/azure-openai"
normalize_azure_endpoint() {
  # Reduce any Azure OpenAI operation URL to its resource base endpoint, e.g.
  #   https://res.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=...
  #   -> https://res.openai.azure.com
  local ep="$1"
  # Drop any query string.
  ep="${ep%%\?*}"
  # Cut everything from the first "/openai/" path segment onwards.
  case "$ep" in
    *"/openai/"*) ep="${ep%%/openai/*}" ;;
  esac
  # Print without a trailing slash for consistency.
  echo "${ep%/}"
}
# Fetch a SecretString from Secrets Manager; prints "" when missing/unreadable.
_get_secret_string() {
  aws secretsmanager get-secret-value \
    --secret-id "$1" \
    --region "${AWS_REGION}" \
    --query SecretString \
    --output text 2>/dev/null || echo ""
}
AZURE_SECRET=$(_get_secret_string "$SECRET_NAME")
if [ -z "$AZURE_SECRET" ] || [ "$AZURE_SECRET" = "None" ]; then
  log "Primary secret not found, trying legacy Azure OpenAI secret..."
  AZURE_SECRET=$(_get_secret_string "$LEGACY_SECRET_NAME")
  # Record which secret actually supplied credentials (reported in logs below).
  if [ -n "$AZURE_SECRET" ] && [ "$AZURE_SECRET" != "None" ]; then
    SECRET_NAME="$LEGACY_SECRET_NAME"
  fi
fi
if [ -z "$AZURE_SECRET" ] || [ "$AZURE_SECRET" = "None" ]; then
    log "WARNING: Failed to retrieve Azure OpenAI secret from Secrets Manager"
    log " Secret name: $SECRET_NAME"
    log " Region: $AWS_REGION"
    log " Fellowship SUT will work with fallback responses only (no Azure AI)"
else
    # Parse secret JSON and extract values from either:
    # - New schema: array under azure/llm/configs
    # - Legacy schema: single object under classroom/shared/<env>/azure-openai
    if command -v jq &> /dev/null; then
        # jq path (preferred). For the array schema pick the best entry:
        # a gpt-4o config, else one whose endpoint mentions /chat/completions,
        # else any gpt config, else the first element. Objects pass through.
        JQ_PICK_FILTER='
        def pick:
        if type == "array" then
        (map(select((.config_name // "" | ascii_downcase) | test("gpt[ -]?4o"))) | .[0])
        // (map(select((.endpoint // "" | ascii_downcase) | contains("/chat/completions"))) | .[0])
        // (map(select((.config_name // "" | ascii_downcase) | test("gpt"))) | .[0])
        // .[0]
        else . end;
        pick
        '
        AZURE_ENDPOINT=$(echo "$AZURE_SECRET" | jq -r "$JQ_PICK_FILTER | .endpoint // empty" 2>/dev/null || echo "")
        AZURE_API_KEY=$(echo "$AZURE_SECRET" | jq -r "$JQ_PICK_FILTER | .api_key // empty" 2>/dev/null || echo "")
        AZURE_DEPLOYMENT=$(echo "$AZURE_SECRET" | jq -r "$JQ_PICK_FILTER | .deployment_name // .deployment // empty" 2>/dev/null || echo "")
        AZURE_API_VERSION=$(echo "$AZURE_SECRET" | jq -r "$JQ_PICK_FILTER | .api_version // empty" 2>/dev/null || echo "")
    elif command -v python3 &> /dev/null; then
        # python3 fallback: same selection rules as the jq filter above.
        # The secret is passed via the environment (not argv) so it never
        # shows up in `ps` output; the quoted 'PY' delimiter prevents shell
        # expansion inside the heredoc.
        PARSED_AZURE=$(AZURE_SECRET="$AZURE_SECRET" python3 << 'PY'
import json
import os
raw = os.environ.get("AZURE_SECRET", "")
try:
    data = json.loads(raw)
except Exception:
    data = {}
selected = {}
if isinstance(data, list):
    def config_name(item):
        return str(item.get("config_name", "")).lower()
    def endpoint(item):
        return str(item.get("endpoint", "")).lower()
    selected = (
        next((x for x in data if "gpt-4o" in config_name(x) or "gpt 4-o" in config_name(x)), None)
        or next((x for x in data if "/chat/completions" in endpoint(x)), None)
        or next((x for x in data if "gpt" in config_name(x)), None)
        or (data[0] if data else {})
    )
elif isinstance(data, dict):
    selected = data
endpoint_val = selected.get("endpoint", "")
api_key_val = selected.get("api_key", "")
deployment_val = selected.get("deployment_name") or selected.get("deployment") or ""
api_version_val = selected.get("api_version", "")
print(f"endpoint={endpoint_val}")
print(f"api_key={api_key_val}")
print(f"deployment={deployment_val}")
print(f"api_version={api_version_val}")
PY
)
        # Consume the key=value lines emitted by the Python helper.
        while IFS='=' read -r key value; do
            case "$key" in
                endpoint) AZURE_ENDPOINT="$value" ;;
                api_key) AZURE_API_KEY="$value" ;;
                deployment) AZURE_DEPLOYMENT="$value" ;;
                api_version) AZURE_API_VERSION="$value" ;;
            esac
        done <<< "$PARSED_AZURE"
    else
        # Last-resort parser: only handles flat `"key": "value"` pairs and
        # takes the first match, so the array schema degrades to element 0.
        log "WARNING: Neither jq nor python3 available; using best-effort grep parsing"
        AZURE_ENDPOINT=$(echo "$AZURE_SECRET" | grep -m1 -o '"endpoint"[[:space:]]*:[[:space:]]*"[^"]*"' | sed -E 's/^.*"endpoint"[[:space:]]*:[[:space:]]*"([^"]*)"$/\1/' || echo "")
        AZURE_API_KEY=$(echo "$AZURE_SECRET" | grep -m1 -o '"api_key"[[:space:]]*:[[:space:]]*"[^"]*"' | sed -E 's/^.*"api_key"[[:space:]]*:[[:space:]]*"([^"]*)"$/\1/' || echo "")
        AZURE_DEPLOYMENT=$(echo "$AZURE_SECRET" | grep -m1 -o '"deployment_name"[[:space:]]*:[[:space:]]*"[^"]*"' | sed -E 's/^.*"deployment_name"[[:space:]]*:[[:space:]]*"([^"]*)"$/\1/' || echo "")
        if [ -z "$AZURE_DEPLOYMENT" ]; then
            AZURE_DEPLOYMENT=$(echo "$AZURE_SECRET" | grep -m1 -o '"deployment"[[:space:]]*:[[:space:]]*"[^"]*"' | sed -E 's/^.*"deployment"[[:space:]]*:[[:space:]]*"([^"]*)"$/\1/' || echo "")
        fi
        AZURE_API_VERSION=$(echo "$AZURE_SECRET" | grep -m1 -o '"api_version"[[:space:]]*:[[:space:]]*"[^"]*"' | sed -E 's/^.*"api_version"[[:space:]]*:[[:space:]]*"([^"]*)"$/\1/' || echo "")
    fi
    # Ensure endpoint is in Azure resource base URL format expected by AzureOpenAI client
    AZURE_ENDPOINT=$(normalize_azure_endpoint "$AZURE_ENDPOINT")
    if [ -n "$AZURE_ENDPOINT" ] && [ -n "$AZURE_API_KEY" ] && [ -n "$AZURE_DEPLOYMENT" ]; then
        log "✓ Azure OpenAI credentials retrieved successfully"
        log " Secret source: ${SECRET_NAME}"
        log " Deployment: ${AZURE_DEPLOYMENT:-not set}"
        log " API Version: ${AZURE_API_VERSION:-not set}"
        log " Endpoint: ${AZURE_ENDPOINT}"
    else
        log "WARNING: Failed to parse Azure credentials from secret"
        log " Secret source: ${SECRET_NAME}"
        log " Endpoint: ${AZURE_ENDPOINT:-empty}"
        # Security fix: never write any part of the API key to the log file
        # (previously the first 10 characters of the key were printed).
        if [ -n "$AZURE_API_KEY" ]; then
            log " API Key: present (masked)"
        else
            log " API Key: empty"
        fi
        log " Deployment: ${AZURE_DEPLOYMENT:-empty}"
    fi
fi
# Create .env file for docker-compose BEFORE deployment
# This ensures all environment variables are persistently available
# (docker compose auto-loads ./.env when run from the home directory).
log "Updating .env file with deployment-specific configuration..."
# Update or append a KEY=VALUE line in an env file (idempotent).
# Arguments:
#   $1 - variable name
#   $2 - value; must not contain '|' or newlines (it is spliced into a sed
#        replacement), which holds for the domain/credential values used here
#   $3 - target env file (optional; defaults to /home/ec2-user/.env)
update_env_var() {
    local key="$1"
    local value="$2"
    local env_file="${3:-/home/ec2-user/.env}"
    if grep -q "^${key}=" "$env_file" 2>/dev/null; then
        # Variable exists, update it in place.
        # Bug fix: GNU sed (Amazon Linux) takes no separate suffix argument
        # after -i; the previous BSD-style `sed -i "" ...` made GNU sed treat
        # the expression as a filename and fail, so updates never happened.
        sed -i "s|^${key}=.*|${key}=${value}|" "$env_file"
    else
        # Variable doesn't exist yet, append it.
        echo "${key}=${value}" >> "$env_file"
    fi
}
# Update domain configuration from resolved values
update_env_var "CADDY_DOMAIN" "${CADDY_DOMAIN:-localhost}"
update_env_var "JENKINS_DOMAIN" "${JENKINS_DOMAIN:-}"
update_env_var "IDE_DOMAIN" "${IDE_DOMAIN:-}"
update_env_var "GITEA_DOMAIN" "${GITEA_DOMAIN:-}"
update_env_var "CADDYFILE_PATH" "./caddy/Caddyfile"
update_env_var "FRONTEND_MODE" "prod"
update_env_var "FLASK_ENV" "production"
update_env_var "NODE_ENV" "production"
# Add optional Azure OpenAI Configuration if available
if [ -n "$AZURE_ENDPOINT" ]; then
    update_env_var "AZURE_OPENAI_ENDPOINT" "${AZURE_ENDPOINT}"
    update_env_var "AZURE_OPENAI_API_KEY" "${AZURE_API_KEY}"
    update_env_var "AZURE_OPENAI_DEPLOYMENT" "${AZURE_DEPLOYMENT}"
    update_env_var "AZURE_OPENAI_API_VERSION" "${AZURE_API_VERSION}"
fi
# Add GitHub Token if available
# NOTE(review): GITHUB_TOKEN is only fetched from Secrets Manager further
# below in this script, so it is normally empty at this point unless exported
# by an earlier stage — confirm the intended ordering, otherwise this .env
# entry is never written.
if [ -n "$GITHUB_TOKEN" ]; then
    update_env_var "GITHUB_TOKEN" "${GITHUB_TOKEN}"
fi
chown ec2-user:ec2-user /home/ec2-user/.env
# Security fix: this file can hold the Azure API key and GitHub token, so it
# must not be world-readable (was 644). ec2-user (who runs docker compose)
# and root can still read it.
chmod 600 /home/ec2-user/.env
log "✓ Updated /home/ec2-user/.env with deployment configuration"
if [ -n "$AZURE_ENDPOINT" ]; then
    log " ✓ Azure OpenAI configured (deployment: ${AZURE_DEPLOYMENT})"
else
    log " ⚠ Azure OpenAI not configured - using fallback responses"
fi
# Verify .env file contents (mask sensitive values)
log "Verifying .env file contents:"
grep -E "^CADDY_DOMAIN=|^AZURE_OPENAI" /home/ec2-user/.env 2>/dev/null | sed 's/AZURE_OPENAI_API_KEY=.*/AZURE_OPENAI_API_KEY=***MASKED***/g' | sed 's/^/ /' || true
# Additional safety check: ensure CADDY_DOMAIN is not empty
if [ -z "$CADDY_DOMAIN" ]; then
    log "ERROR: CADDY_DOMAIN is empty - docker-compose will not start properly"
    exit 1
fi
# Verify network connectivity BEFORE docker-compose build (critical for github access)
log "Performing pre-deployment network connectivity checks..."
if verify_network_connectivity; then
    log "OK: Network connectivity verified"
else
    log "ERROR: Network connectivity checks failed"
    log " Cannot proceed with docker-compose build (needs github.com access)"
    exit 1
fi
# Retrieve GitHub token from AWS Secrets Manager for private repo access during docker builds
log "Retrieving GitHub credentials from AWS Secrets Manager..."
GITHUB_TOKEN=""
GITHUB_SECRET=""
# Try to get GitHub token from Secrets Manager; failures yield "" so the
# script degrades to public-repo-only access instead of aborting.
GITHUB_SECRET=$(aws secretsmanager get-secret-value \
    --secret-id "classroom/shared/github-token" \
    --region "${AWS_REGION}" \
    --query SecretString \
    --output text 2>/dev/null || echo "")
if [ -n "$GITHUB_SECRET" ] && [ "$GITHUB_SECRET" != "None" ]; then
    # Try to extract token (could be plain string or JSON); the presence of
    # '{' anywhere in the payload is used as the JSON heuristic.
    if echo "$GITHUB_SECRET" | grep -q '{'; then
        # JSON format: accept any of the common key names
        if command -v jq &> /dev/null; then
            GITHUB_TOKEN=$(echo "$GITHUB_SECRET" | jq -r '.token // .github_token // .pat // empty' 2>/dev/null || echo "")
        else
            # Best-effort fallback; only matches compact `"token":"..."` with
            # no whitespace around the colon.
            GITHUB_TOKEN=$(echo "$GITHUB_SECRET" | grep -o '"token":"[^"]*' | cut -d'"' -f4 || echo "")
        fi
    else
        # Plain token string
        GITHUB_TOKEN="$GITHUB_SECRET"
    fi
fi
if [ -n "$GITHUB_TOKEN" ] && [ "$GITHUB_TOKEN" != "None" ]; then
    log "OK: GitHub credentials found - configuring git"
    # Configure git to use token for HTTPS cloning
    # This enables docker builds that clone from private GitHub repositories
    # NOTE(review): `git config --global` writes the invoking user's (likely
    # root's) ~/.gitconfig, while the credential file below is created under
    # /home/ec2-user and stays root-owned after this chmod — confirm the git
    # processes performing the clones actually see both files.
    git config --global credential.helper store
    echo "https://${GITHUB_TOKEN}@github.com" > /home/ec2-user/.git-credentials
    chmod 600 /home/ec2-user/.git-credentials
    # GIT_ASKPASS=/bin/true suppresses interactive credential prompts.
    export GIT_ASKPASS=/bin/true
    export GITHUB_TOKEN
else
    log "WARNING: GitHub token not found in Secrets Manager (public repos only)"
fi
# Deploy SUT containers using docker-compose with retry logic on network failures
# Note: Pass environment variables both via .env file AND explicit exports for maximum compatibility
log "Starting SUT containers (with automatic retry on network issues)..."
log " CADDY_DOMAIN: ${CADDY_DOMAIN}"
# ── Select the fellowship Caddyfile for the tutorial stack ───────────────────
# The single Caddy container serves three HTTPS sites on tutorial instances:
#   • CADDY_DOMAIN   → SUT (reverse_proxy to backend:5000 / frontend:3000)
#   • JENKINS_DOMAIN → Jenkins CI (reverse_proxy to host.docker.internal:8080)
#   • IDE_DOMAIN     → code-server (reverse_proxy to host.docker.internal:8443)
# Only Caddyfile.fellowship carries all three site blocks. The staging/prod
# Caddyfiles carry just the SUT block: they would make Caddy fail when
# JENKINS_DOMAIN / IDE_DOMAIN are empty and would never expose the DevOps
# Escape Room tools over HTTPS, so they must NOT be used here.
FELLOWSHIP_CADDYFILE="/home/ec2-user/caddy/Caddyfile.fellowship"
ACTIVE_CADDYFILE="/home/ec2-user/caddy/Caddyfile"
if [ ! -f "$FELLOWSHIP_CADDYFILE" ]; then
    log "WARNING: Caddyfile.fellowship not found at ${FELLOWSHIP_CADDYFILE}"
    log " Jenkins and IDE will NOT be served via HTTPS."
    log " Ensure caddy/Caddyfile.fellowship is present in the SUT tarball (see caddy/ directory)."
else
    cp "$FELLOWSHIP_CADDYFILE" "$ACTIVE_CADDYFILE"
    chown ec2-user:ec2-user "$ACTIVE_CADDYFILE"
    log "✓ Copied Caddyfile.fellowship → caddy/Caddyfile (SUT + Jenkins + IDE HTTPS)"
fi
# Deploy the SUT via docker compose, retrying only on errors that look
# network-related (DNS, GitHub, timeouts, ...) with exponential backoff.
# Returns: 0 on success; 1 on a non-recoverable error or exhausted retries.
deploy_sut_with_retry() {
    local max_attempts=3
    local attempt=1
    local wait_time=10
    local deploy_output
    local deploy_exit_code
    while [ $attempt -le $max_attempts ]; do
        log " Deployment attempt $attempt/$max_attempts..."
        # Use cd to set working directory, then docker compose will auto-load .env
        deploy_output=$(run_as_ec2user_docker "cd ~ && docker compose up -d 2>&1" 2>&1)
        deploy_exit_code=$?
        if [ $deploy_exit_code -eq 0 ]; then
            log "OK: Docker Compose started successfully"
            return 0
        fi
        # Check if error is network-related (github, DNS, connectivity, etc)
        if echo "$deploy_output" | grep -iE "network|dns|resolve|github|credential|authentication|connection refused|timeout|no such device|temporary failure" >/dev/null 2>&1; then
            log " WARNING: Network-related error detected, will retry..."
            # Bug fix: the inner quotes were previously escaped (\"), which
            # made echo print literal quote characters into the logged line.
            log " Error: $(echo "$deploy_output" | head -2 | tail -1)"
            if [ $attempt -lt $max_attempts ]; then
                log " Waiting ${wait_time}s before retry (attempt $((attempt + 1))/$max_attempts)..."
                sleep $wait_time
                wait_time=$((wait_time * 2)) # Exponential backoff: 10s, 20s, 40s
                attempt=$((attempt + 1))
                continue
            fi
        else
            # Non-network error, fail immediately
            log "ERROR: Failed to start SUT containers (non-recoverable error)"
            log "Docker Compose output:"
            echo "$deploy_output" | sed 's/^/ /'
            return 1
        fi
        attempt=$((attempt + 1))
    done
    # All retries exhausted
    log "ERROR: Failed to start SUT containers after $max_attempts attempts"
    log "Docker Compose output:"
    echo "$deploy_output" | sed 's/^/ /'
    log "Checking Docker logs for more information..."
    run_as_ec2user_docker "cd ~ && docker compose logs" 2>&1 | tail -50 | sed 's/^/ /'
    return 1
}
# Execute deployment with retry; abort provisioning if it ultimately fails.
deploy_sut_with_retry || exit 1
log "Waiting for containers to be in running state..."
# Wait for containers to be running (up to 12 * 5s = 60 seconds)
CONTAINER_WAIT_COUNT=0
while [ $CONTAINER_WAIT_COUNT -lt 12 ]; do
    RUNNING_CONTAINERS=$(run_as_ec2user_docker "cd ~ && docker compose ps -q --status running 2>/dev/null | wc -l" 2>/dev/null || echo "0")
    # Robustness fix: keep only digits so the numeric -ge test below cannot
    # abort the script if the wrapper emits whitespace or extra noise; fall
    # back to 0 when nothing numeric remains.
    RUNNING_CONTAINERS=$(printf '%s' "$RUNNING_CONTAINERS" | tr -cd '0-9')
    RUNNING_CONTAINERS=${RUNNING_CONTAINERS:-0}
    EXPECTED_CONTAINERS=3
    if [ "$RUNNING_CONTAINERS" -ge "$EXPECTED_CONTAINERS" ]; then
        log "✓ All required containers running ($RUNNING_CONTAINERS/$EXPECTED_CONTAINERS)"
        break
    fi
    log " Waiting for containers... ($RUNNING_CONTAINERS/$EXPECTED_CONTAINERS running, attempt $((CONTAINER_WAIT_COUNT + 1))/12)"
    sleep 5
    CONTAINER_WAIT_COUNT=$((CONTAINER_WAIT_COUNT + 1))
done
# Wait for backend health check to pass (up to 20 attempts * 3 seconds = 60 seconds)
log "Waiting for backend service to be healthy..."
BACKEND_HEALTH_COUNT=0
BACKEND_READY=false
while [ $BACKEND_HEALTH_COUNT -lt 20 ]; do
    # Only checks the container's State field, not an application-level probe.
    BACKEND_STATUS=$(run_as_ec2user_docker "cd ~ && docker compose ps backend --format json 2>/dev/null" | grep -o '"State":"running"' || echo "")
    if [ -n "$BACKEND_STATUS" ]; then
        log "✓ Backend container is running"
        BACKEND_READY=true
        break
    fi
    log " Waiting for backend to be ready... (attempt $((BACKEND_HEALTH_COUNT + 1))/20)"
    sleep 3
    BACKEND_HEALTH_COUNT=$((BACKEND_HEALTH_COUNT + 1))
done
# NOTE(review): BACKEND_READY is never checked after this loop, so a backend
# that never starts is only caught later by the HTTP/HTTPS health gates —
# confirm that is intended.
# Wait for frontend to compile and start (React dev server, up to 60 seconds)
# NOTE(review): the .env written earlier sets FRONTEND_MODE=prod, yet two of
# the patterns below match dev-server output ("compiled successfully",
# "webpack compiled"); presumably "app is running on" covers the prod
# server — confirm against the frontend container's startup logs.
log "Waiting for frontend to compile and start..."
FRONTEND_WAIT_COUNT=0
FRONTEND_READY=false
while [ $FRONTEND_WAIT_COUNT -lt 20 ]; do
    # Scan container logs for any known "ready" banner (case-insensitive).
    FRONTEND_LOGS=$(run_as_ec2user_docker "cd ~ && docker compose logs frontend 2>&1" | grep -iE "compiled successfully|webpack compiled|app is running on" || echo "")
    if [ -n "$FRONTEND_LOGS" ]; then
        log "✓ Frontend is ready"
        FRONTEND_READY=true
        break
    fi
    log " Waiting for frontend compilation... (attempt $((FRONTEND_WAIT_COUNT + 1))/20)"
    sleep 3
    FRONTEND_WAIT_COUNT=$((FRONTEND_WAIT_COUNT + 1))
done
# Frontend readiness is a hard gate: dump diagnostics and abort on timeout.
if [ "$FRONTEND_READY" != "true" ]; then
    log "ERROR: Frontend did not become ready within timeout"
    dump_runtime_diagnostics
    exit 1
fi
# Post-deploy smoke tests against the local listener: plain HTTP first, then
# HTTPS (-k skips certificate verification, which cannot match "localhost").
log "Running post-deploy health gates..."
HTTP_OK=false
gate_try=1
while [ $gate_try -le 20 ]; do
    if curl -sSf --max-time 5 "http://localhost/" >/dev/null 2>&1; then
        HTTP_OK=true
        log "✓ Local HTTP health gate passed"
        break
    fi
    log " Waiting for local HTTP health gate... (attempt $gate_try/20)"
    sleep 3
    gate_try=$((gate_try + 1))
done
HTTPS_OK=false
gate_try=1
while [ $gate_try -le 30 ]; do
    if curl -k -sSf --max-time 6 "https://localhost/" >/dev/null 2>&1; then
        HTTPS_OK=true
        log "✓ Local HTTPS health gate passed"
        break
    fi
    log " Waiting for local HTTPS health gate... (attempt $gate_try/30)"
    sleep 3
    gate_try=$((gate_try + 1))
done
# Both gates must pass; otherwise dump diagnostics and abort provisioning.
if [ "$HTTP_OK" != "true" ] || [ "$HTTPS_OK" != "true" ]; then
    log "ERROR: Post-deploy health gates failed (HTTP_OK=${HTTP_OK}, HTTPS_OK=${HTTPS_OK})"
    dump_runtime_diagnostics
    exit 1
fi
# Verify environment variables made it to Caddy container
log "Verifying environment variables in Caddy container..."
# Greps the inspected Env list for CADDY_DOMAIN; emits the 'NOT FOUND'
# sentinel when absent so the test below can branch on it.
CADDY_ENV=$(run_as_ec2user_docker "cd ~ && docker inspect fellowship-caddy --format='{{.Config.Env}}' 2>/dev/null | grep -o 'CADDY_DOMAIN=[^[:space:]]*' || echo 'NOT FOUND'" 2>/dev/null)
if [ -n "$CADDY_ENV" ] && [ "$CADDY_ENV" != "NOT FOUND" ]; then
    log "✓ CADDY_DOMAIN verified in container: $CADDY_ENV"
else
    # Non-fatal: setup continues, but virtual-host routing may misbehave.
    log "WARNING: CADDY_DOMAIN not found in container environment"
    log " This may cause connection issues"
    log " Container environment (first 20 vars):"
    run_as_ec2user_docker "cd ~ && docker exec fellowship-caddy env 2>/dev/null | head -20" 2>/dev/null | sed 's/^/ /' || true
fi
# Final container status check
log "Final container status:"
run_as_ec2user_docker "cd ~ && docker compose ps" 2>&1 | sed 's/^/ /'
# Final status banner: print every service endpoint for the operator.
PUBLIC_IP=$(get_instance_metadata "public-ipv4")
# Empty metadata response (e.g. no public IP) collapses to the N/A sentinel.
PUBLIC_IP="${PUBLIC_IP:-N/A}"
log "=========================================="
log "Setup Complete"
log "=========================================="
log "Public IP: $PUBLIC_IP"
log ""
log "─── Fellowship SUT ────────────────────────"
log " HTTPS: https://${CADDY_DOMAIN}/"
log ""
log "─── DevOps Escape Room ────────────────────"
log " Jenkins CI (HTTPS): https://${JENKINS_DOMAIN}/"
log " Jenkins CI (direct): http://${PUBLIC_IP}:8080 (fellowship / fellowship123)"
log " IDE / code-server (HTTPS): https://${IDE_DOMAIN}/"
log " IDE / code-server (direct): http://${PUBLIC_IP}:8443 (password: fellowship)"
log " Gitea Git: http://${PUBLIC_IP}:3030 (fellowship / fellowship123)"
log " MailHog UI: http://${PUBLIC_IP}:8025"
log "=========================================="