#!/usr/bin/env bash
# scripts/phase-03-core-verify.sh [MODEL]
#
# Read-only Step 3.1 gate for phase-03 (cloud settled + haproxy backends healthy).
# Packages two checks:
# 3.1a acceptance walk -- only the expected post-deploy exceptions may be non-active/idle
# (octavia awaiting configure-resources; glance-simplestreams-sync image-sync state).
# Delegates the classify/gate to scripts/phase03_accept_walk.py (identity, not count).
# 3.1b haproxy backend-health sweep (D-045 / DOCFIX-031) -- juju status is BLIND to a
# charm-rendered haproxy backend that is silently DOWN (it hid a dead nova-api ~3
# days behind a green status), so probe every principal unit's admin socket directly.
#
# Mutates NOTHING. A DOWN backend's remediation (haproxy -c validate + systemctl reload)
# stays a gated per-unit human step -- this script only DETECTS and reports it.
#
# Usage: scripts/phase-03-core-verify.sh [MODEL] (default MODEL=openstack)
# Exit: 0 PROCEED | 1 HOLD (a gate failed / juju error) | 2 precondition
# (jq / python3 / helper missing, or MODEL not present)
#
# Values resolve dynamically from live status; nothing host/IP/ID hardcoded.
# Read-only. Safe to re-run. ASCII + LF.
# Strict mode: abort on an unhandled error, on an unset variable, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
# Best effort: a failure to enable inherit_errexit is silenced and ignored.
shopt -s inherit_errexit 2>/dev/null || true
# Word-split on newline and tab only, never on spaces.
IFS=$'\n\t'

# Directory holding this script; the sourced library and the walk helper are
# resolved against it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=scripts/lib-net.sh
source "$SCRIPT_DIR/lib-net.sh"

# Juju model to verify: first positional argument, defaulting to "openstack".
MODEL="${1:-openstack}"
# Helper that performs the 3.1a acceptance walk.
WALK="$SCRIPT_DIR/phase03_accept_walk.py"
# Number of failed gates so far; incremented by fail().
FATAL=0
# fail MSG... -- report a failed gate on stderr ("FAIL: MSG") and add one to the
# global FATAL tally that the final verdict reads.
fail() {
  printf 'FAIL: %s\n' "$*" >&2
  FATAL=$((FATAL + 1))
}
# pass MSG... -- report a cleared gate on stdout ("PASS: MSG"); changes no state.
pass() {
  printf 'PASS: %s\n' "$*"
}
# Preconditions: any missing tool or helper ends the run with exit 2 before juju
# is touched.
# need_jq is not defined in this file -- presumably provided by the sourced
# lib-net.sh; verify there.
if ! need_jq; then
  exit 2
fi
if ! command -v python3 >/dev/null 2>&1; then
  echo "FAIL: python3 required" >&2
  exit 2
fi
if [ ! -f "$WALK" ]; then
  echo "FAIL: helper not found: $WALK" >&2
  exit 2
fi
# --- A. auth: whoami DIRECTLY so a stale-macaroon prompt reaches the tty first ----
# whoami runs un-captured (stdout/stderr untouched) so anything it prints or
# prompts for is visible to the operator. A failure here is a HOLD (exit 1).
echo "=== A. juju identity (a stale macaroon prompts/EOFs here -> run 'juju login') ==="
juju whoami || { fail "juju whoami failed (auth/macaroon)"; echo "Summary: HOLD (auth)"; exit 1; }
echo

# Model presence. The model list is fetched on its own so that a juju failure is
# reported as a HOLD (exit 1, per the exit contract in the header) instead of
# being mistaken for "model absent" and answered with 'juju add-model' advice.
if ! models_json="$(juju models --format json 2>/dev/null)"; then
  fail "juju models failed (juju error?)"
  echo "Summary: HOLD (no model list)"
  exit 1
fi
# Exact string comparison done inside jq: MODEL is never interpreted as a regex,
# and there is no early-exiting 'grep -q' stage for pipefail to trip over.
# Names are compared on their last '/'-separated segment.
if ! printf '%s' "$models_json" \
  | jq -e --arg model "$MODEL" \
    'any(.models[]?; ((.name // "") | split("/") | last) == $model)' \
    >/dev/null 2>&1; then
  echo "NOTE: model '$MODEL' not present -- run 'juju add-model $MODEL' first"
  echo "Summary: precondition (model absent)"
  exit 2
fi

# Live status JSON consumed by both gates below. A failing juju call is a HOLD
# even if it wrote partial output; so is an empty answer.
if ! J="$(juju status -m "$MODEL" --format json 2>/dev/null)" || [ -z "$J" ]; then
  fail "juju status -m $MODEL returned nothing (juju error?)"
  echo "Summary: HOLD (no status)"
  exit 1
fi
# --- 3.1a acceptance walk (identity-gated; helper exits 1 on any UNEXPECTED) -------
# The status JSON in $J is piped to the helper on stdin; a non-zero status from
# that pipeline fails the gate, zero clears it.
echo "=== 3.1a acceptance walk (only octavia + glance-simplestreams-sync may be non-active/idle) ==="
walk_rc=0
printf '%s' "$J" | python3 "$WALK" || walk_rc=$?
if [ "$walk_rc" -ne 0 ]; then
  fail "unexpected non-active/idle unit(s) above (marked XX)"
else
  pass "settled -- only the expected post-deploy exceptions are non-active/idle"
fi
echo
# --- 3.1b haproxy backend-health sweep (D-045) -------------------------------------
# For every principal unit listed in $J, ask the unit's haproxy admin socket for
# 'show stat' and flag each per-server row whose status is DOWN. Every probe ends
# in exactly one of three states, decided from marker lines the remote command
# prints:
#   nosock -- the unit has no admin socket; skipped
#   ok     -- the stats were read to the end; rows are checked for DOWN
#   failed -- neither marker came back (juju ssh / sudo / socket error)
# A failed probe means backend health is UNKNOWN, so it holds the gate instead of
# being read as "zero DOWN". A sweep that probed no socket at all holds it too,
# because it verified nothing.
# Reads: J, MODEL. Calls: fail, pass. Runs only read commands on the units.

# haproxy_probe_state RAW -- print nosock | ok | failed for one probe's raw output.
haproxy_probe_state() {
  case "$1" in
    *PROBE:NOSOCK*) echo nosock ;;
    *PROBE:OK*) echo ok ;;
    *) echo failed ;;
  esac
}

# haproxy_down_rows -- filter 'show stat' CSV on stdin down to the DOWN rows,
# dropping every row that mentions FRONTEND or BACKEND. Matches plain "DOWN" and
# the transitional "DOWN n/m" form. Prints nothing and returns 0 when no row
# matches.
haproxy_down_rows() {
  grep -vE 'FRONTEND|BACKEND' | grep -E ',DOWN( [0-9]+/[0-9]+)?,' || true
}

echo "=== 3.1b haproxy backend-health sweep (juju status is BLIND to a DOWN backend) ==="
down_units=0   # units with at least one DOWN row
checked=0      # principal units enumerated from status
probed=0       # units whose admin socket answered
probe_failed=0 # units whose probe returned neither marker

# Remote command, built once. Inside these double quotes \" becomes a plain quote
# and \n stays a literal backslash-n for Python to turn into the newline that
# ends the 'show stat' command.
probe_cmd="test -S /var/run/haproxy/admin.sock || { echo PROBE:NOSOCK; exit 0; }; sudo python3 -c 'import socket;s=socket.socket(socket.AF_UNIX);s.connect(\"/var/run/haproxy/admin.sock\");s.sendall(b\"show stat\n\");print(s.makefile().read());print(\"PROBE:OK\")'"

# principal units only (haproxy runs on the API principals, not subordinates)
while IFS= read -r unit; do
  [ -n "$unit" ] || continue
  checked=$((checked + 1))
  # The exit status is deliberately ignored: the markers in the output, not the
  # status, decide what happened. stdin is /dev/null so ssh cannot eat the
  # loop's unit list.
  raw="$(juju ssh -m "$MODEL" "$unit" -- "$probe_cmd" </dev/null 2>/dev/null || true)"
  raw="${raw//$'\r'/}" # drop carriage returns in case the session used a pty
  case "$(haproxy_probe_state "$raw")" in
    nosock)
      continue
      ;;
    failed)
      echo " [$unit] PROBE FAILED: no answer from the haproxy admin socket probe (juju ssh / sudo / socket error)"
      probe_failed=$((probe_failed + 1))
      continue
      ;;
  esac
  probed=$((probed + 1))
  out="$(printf '%s\n' "$raw" | haproxy_down_rows)"
  if [ -n "$out" ]; then
    printf '%s\n' "$out" | sed "s|^| [$unit] DOWN: |"
    down_units=$((down_units + 1))
  fi
done < <(printf '%s' "$J" | jq -r '.applications[]?.units // {} | keys[]?')

# Exactly one pass/fail per gate, so FATAL keeps counting gates.
if [ "$down_units" -gt 0 ]; then
  if [ "$probe_failed" -gt 0 ]; then
    echo "NOTE: $probe_failed further unit(s) could not be probed -- their backend health is UNKNOWN"
  fi
  fail "$down_units unit(s) with a DOWN backend -- gated remediation: 'sudo haproxy -c -f /etc/haproxy/haproxy.cfg' then 'sudo systemctl reload haproxy' on each, re-run"
elif [ "$probe_failed" -gt 0 ]; then
  fail "$probe_failed unit(s) could not be probed -- backend health UNKNOWN; fix juju ssh/sudo access and re-run"
elif [ "$probed" -eq 0 ]; then
  fail "no haproxy admin socket answered on any of $checked principal unit(s) -- sweep verified nothing"
else
  pass "all haproxy backends UP across $probed haproxy unit(s) of $checked principal unit(s) (zero DOWN)"
fi
echo
# --- verdict -----------------------------------------------------------------------
# Any gate recorded in FATAL holds the phase (exit 1); otherwise Step 3.1 is clear
# (exit 0).
if [ "$FATAL" -ne 0 ]; then
  echo "Summary: HOLD -- $FATAL gate(s) failed. Resolve before Step 3.2."
  exit 1
fi
echo "Summary: PROCEED -- cloud settled and all haproxy backends healthy (Step 3.1 clear)."
echo " Next: phase-03 Step 3.2 (build admin-openrc) -- gated, secret-handling."
exit 0