#!/usr/bin/env bash
# scripts/cloud-assert.sh [--capture] [MODEL]
#
# Behavioral cloud verifier (DOCFIX-075). Runs EVERY "the service's own verdict"
# gate learned from the D-045/D-046/D-051/D-042 incident family -- the checks
# juju status is BLIND to -- as one idempotent, read-only sweep. Run it:
# post-deploy (phase-08 acceptance), post-restart (ops-restart-procedure Stage 7),
# pre-change baseline, and post-incident. Absorbs the jumphost-local
# post-maintenance-health-check.sh (which was never version-controlled).
#
# Sections (each independent; worst exit wins):
# A0 juju reachable; no units in error/blocked (carried gss 'unknown' tolerated)
# A1 vault: Initialized=true, Sealed=false (sealed-after-reboot is BY DESIGN --
# the FIX is manual unseal per ops-restart-procedure, not a redeploy)
# A2 mysql: 3 units 'Cluster is ONLINE', exactly one R/W (D-062 world)
# A3 OVN central: Cluster ID uniform across the 3 units, both NB + SB DBs
# A4 OVN chassis: ovn-controller 'connected' on every nova-compute + octavia
# A5 compute plane: hypervisors up, compute services enabled+up [needs OS_*]
# A6 octavia: every LB provisioning=ACTIVE + operating=ONLINE (skip if none) [OS_*]
# A7 identity/magnum behavior: keystone shows 'PO:' (D-051/DOCFIX-071);
# trustee domain 'magnum' + magnum_domain_admin exist (D-046);
# 'openstack coe service list' returns the conductor row (no 403) [OS_*]
# A8 conductor graft: magnum-conductor LIVE process args carry --config-dir
# (D-037: verify the launched cmdline, never the config text)
#
# --capture: additionally writes a deploy BOM (bill of materials) to
# asbuilt/<UTC timestamp>/ : exported bundle, juju status yaml, driver pip
# version, image list. Commit the directory -- it is the drift baseline for
# Roosevelt multi-DC comparison. Capture is the ONLY write this script does,
# and only under asbuilt/.
#
# Requirements: jumphost with juju and jq; sections A5-A7 also need the
# openstack CLI and an admin scope (source ~/admin-openrc first) -- absent
# scope is a HOLD (exit 2), not a skip, so a half-run is never mistaken for a pass.
# Exit: 0 all pass | 1 any FAIL | 2 warnings/holds only. ASCII + LF.
# Shell options: unset variables are errors and a pipeline reports the failure
# of any stage. errexit is deliberately NOT enabled -- every section must run
# so the worst result can be reported at the end.
set -u
set -o pipefail
# inherit_errexit does not exist on older bash; ignore the failure there.
shopt -s inherit_errexit 2>/dev/null || true

# Defaults, then command-line overrides:
#   --capture   also write the as-built BOM
#   -<other>    rejected immediately with exit 1
#   <word>      juju model name (the last one given wins)
MODEL="openstack"
CAPTURE=0
for arg; do
  case "$arg" in
    --capture)
      CAPTURE=1
      ;;
    -*)
      echo "FAIL: unknown flag $arg"
      exit 1
      ;;
    *)
      MODEL="$arg"
      ;;
  esac
done
# J ARGS... -- run `juju ARGS...` with stdin detached and stderr folded into
# stdout, so callers can capture the complete output in a variable and test it
# afterwards (capture-then-test everywhere; SIGPIPE rule). Detaching stdin also
# keeps juju from consuming the input of an enclosing `while read` loop.
# Returns juju's own exit status.
J() {
  juju "$@" 2>&1 </dev/null
}
# Result tracking. WORST is the script's final exit code:
#   0 = everything passed, 2 = warnings/holds only, 1 = at least one FAIL.
# A FAIL is sticky: once WORST is 1 a later warning never downgrades it.
WORST=0

# fail MESSAGE... -- print a [FAIL] line and force the final exit code to 1.
fail() {
  echo " [FAIL] $*"
  WORST=1
}

# warn MESSAGE... -- print a [WARN] line and raise the exit code to 2 unless a
# FAIL has already been recorded. Always returns 0: the previous
# `[ ... ] && WORST=2` form returned 1 whenever WORST was already 1, which
# would misfire in any `x && ok || warn` chain or under an ERR trap.
warn() {
  echo " [WARN] $*"
  if [ "$WORST" -ne 1 ]; then
    WORST=2
  fi
  return 0
}

# ok MESSAGE... -- print an [ok] line; never changes WORST.
ok() {
  echo " [ok] $*"
}
# Hard prerequisites: without juju or jq no section can run, so stop at once
# with exit 1 instead of producing a misleading partial report.
if ! command -v juju >/dev/null 2>&1; then
  echo "FAIL: juju not found"
  exit 1
fi
if ! command -v jq >/dev/null 2>&1; then
  echo "FAIL: jq required"
  exit 1
fi
# ---- A0: model reachable; no unit (principal OR subordinate) in error/blocked
echo "================ A0: juju model + unit states ================"
# One status snapshot; later sections and --capture reuse $ST.
ST=$(J status -m "$MODEL" --format=json || true)
if ! jq -e .applications >/dev/null 2>&1 <<<"$ST"; then
  # Without a parseable status nothing below is meaningful -- stop here.
  fail "cannot read juju status for model '$MODEL'"
  head -3 <<<"$ST"
  echo
  echo "CLOUD-ASSERT: FAIL (model unreachable)"
  exit 1
fi

# jq stream of {key: <unit name>, value: <unit status>} for EVERY unit.
# juju's JSON nests subordinate units (mysql-router, hacluster, ovn-chassis,
# ...) under their principal unit's "subordinates" map; a subordinate
# application has no "units" of its own. Walking only .applications[].units
# therefore never sees a blocked or errored subordinate.
ALL_UNITS_JQ='.applications | to_entries[] | (.value.units // {}) | to_entries[]
  | ., ((.value.subordinates // {}) | to_entries[])'

# Units whose workload status is error/blocked: "<unit> <status>: <message>".
BAD=$(jq -r "${ALL_UNITS_JQ}"'
  | select(.value["workload-status"].current | IN("error","blocked"))
  | "\(.key) \(.value["workload-status"].current): \(.value["workload-status"].message // "")"' <<<"$ST" || true)
# Units whose workload status is "unknown" (names only).
UNK=$(jq -r "${ALL_UNITS_JQ}"'
  | select(.value["workload-status"].current == "unknown") | .key' <<<"$ST" || true)

if [ -n "$BAD" ]; then
  # Here-string (not a pipe) so fail() updates WORST in this shell.
  while IFS= read -r l; do
    fail "unit $l"
  done <<<"$BAD"
else
  ok "no units in error/blocked"
fi

if [ -n "$UNK" ]; then
  # glance-simplestreams-sync is the one tolerated 'unknown'; any other unit
  # name in the list raises a warning.
  if grep -qv 'glance-simplestreams-sync' <<<"$UNK"; then
    warn "units 'unknown' beyond parked gss: $(tr '\n' ' ' <<<"$UNK")"
  else
    ok "only parked gss reports 'unknown' (documented carry)"
  fi
fi
# ---- A1: vault initialized and unsealed (asked of vault itself on vault/0) ----
echo "================ A1: vault unsealed ================"
# `vault status` exits non-zero when sealed; `|| true` keeps the captured text.
V=$(J ssh -m "$MODEL" vault/0 -- 'VAULT_ADDR=http://127.0.0.1:8200 vault status 2>&1' || true)
if ! grep -Eq 'Sealed +(true|false)' <<<"$V"; then
  # No seal line at all: ssh failed or vault did not answer. Reporting that as
  # "SEALED -- unseal manually" would send the operator down the wrong path.
  fail "vault status unreadable on vault/0 (unit unreachable or vault not answering): $(head -1 <<<"$V")"
elif ! grep -q 'Initialized *true' <<<"$V"; then
  fail "vault not initialized"
elif grep -q 'Sealed *false' <<<"$V"; then
  ok "vault Initialized=true Sealed=false"
else
  fail "vault SEALED -- manual 3-of-5 unseal (ops-restart-procedure Stage 3); sealed-after-restart is by design"
fi
# ---- A2: mysql-innodb-cluster -- 3 units ONLINE, exactly one R/W -------------
echo "================ A2: mysql innodb cluster ================"
# One line per unit: "<unit> <workload-status message>", from the A0 snapshot.
M=$(jq -r '.applications["mysql-innodb-cluster"].units // {} | to_entries[]
  | "\(.key) \(.value["workload-status"].message // "")"' <<<"$ST" || true)
# grep -c exits 1 on a zero count; `|| true` keeps the printed "0".
N=$(grep -c 'Cluster is ONLINE' <<<"$M" || true)
RW=$(grep -c 'Mode: R/W' <<<"$M" || true)
if [ "${N:-0}" -ne 3 ]; then
  # Join the per-unit lines with " | ". The earlier `tr '\n' ' | '` could not
  # do that: tr maps single characters, so every newline became one space.
  NL=$'\n'
  fail "mysql ONLINE units=$N (want 3): ${M//$NL/ | }"
fi
if [ "${RW:-0}" -ne 1 ]; then
  fail "mysql R/W count=$RW (want exactly 1)"
fi
if [ "${N:-0}" -eq 3 ] && [ "${RW:-0}" -eq 1 ]; then
  ok "mysql 3x ONLINE, exactly one R/W"
fi
# ---- A3: every ovn-central unit must report the same Cluster ID, per DB ------
echo "================ A3: OVN central cluster unity ================"
UNITS=$(jq -r '.applications["ovn-central"].units // {} | keys[]' <<<"$ST" || true)
if [ -n "$UNITS" ]; then
  # Each spec is "<database name>:<control socket file>".
  for spec in 'OVN_Northbound:ovnnb_db.ctl' 'OVN_Southbound:ovnsb_db.ctl'; do
    db=${spec%%:*}
    sock=${spec##*:}
    IDS=""
    # Ask every unit for its cluster/status and collect the reported IDs.
    while IFS= read -r unit; do
      O=$(J ssh -m "$MODEL" "$unit" -- "sudo ovs-appctl -t /var/run/ovn/${sock} cluster/status ${db} 2>&1" || true)
      # Third field of the first "Cluster ID..." line is the short id.
      CID=$(awk '/^Cluster ID/{print $3; exit}' <<<"$O")
      if [ -n "$CID" ]; then
        IDS="$IDS $CID"
      else
        fail "$unit: no Cluster ID for $db"
      fi
    done <<<"$UNITS"
    # Number of distinct IDs seen; exactly one means the cluster is whole.
    U=$(tr ' ' '\n' <<<"$IDS" | sed '/^$/d' | sort -u | wc -l)
    if [ "$U" -eq 1 ]; then
      ok "$db Cluster ID uniform across units"
    else
      fail "$db Cluster IDs differ:$IDS"
    fi
  done
else
  warn "ovn-central units not found -- skipped"
fi
# ---- A4: ovn-controller must be connected on each nova-compute/octavia unit --
echo "================ A4: OVN chassis connectivity ================"
# Unit names of the two chassis-hosting applications, from the A0 snapshot.
CH=$(jq -r '.applications | to_entries[] | select(.key=="nova-compute" or .key=="octavia")
| .value.units // {} | keys[]' <<<"$ST" || true)
if [ -n "$CH" ]; then
  while IFS= read -r unit; do
    # Drop carriage returns so the ^connected anchor sees a clean line.
    C=$(J ssh -m "$MODEL" "$unit" -- 'sudo ovn-appctl -t ovn-controller connection-status 2>&1' | tr -d '\r' || true)
    if grep -q '^connected' <<<"$C"; then
      ok "$unit chassis connected"
    else
      fail "$unit chassis NOT connected ($C) -- post-vault TLS sweep (appendix-A / restart Stage 4)"
    fi
  done <<<"$CH"
else
  warn "no chassis principals found -- skipped"
fi
# ---- A5-A7: checks that go through the OpenStack API -------------------------
# They need credentials in the environment. Without OS_AUTH_URL the sections
# are HELD (warn => exit 2) rather than silently skipped.
if [ -z "${OS_AUTH_URL:-}" ]; then
  warn "A5-A7 HELD: no admin scope in env (source ~/admin-openrc, then re-run)"
else
  echo "================ A5: compute plane ================"
  H=$(openstack hypervisor list -f json </dev/null 2>&1 || true)
  if jq -e . >/dev/null 2>&1 <<<"$H"; then
    # Both key spellings are accepted (State/state, "Hypervisor Hostname"/hypervisor_hostname).
    DOWN=$(jq -r '.[] | select((.State // ."state") != "up") | (."Hypervisor Hostname" // .hypervisor_hostname)' <<<"$H" || true)
    if [ -z "$DOWN" ]; then
      ok "all hypervisors up"
    else
      fail "hypervisors down: $(tr '\n' ' ' <<<"$DOWN") (restart nova-compute; see appendix-A)"
    fi
  else
    fail "hypervisor list unreadable: $(head -1 <<<"$H")"
  fi

  S=$(openstack compute service list -f json </dev/null 2>&1 || true)
  # Validate the JSON first, exactly as for the hypervisor list. Without this,
  # a CLI error made jq fail silently, BADS came back empty, and the section
  # reported "all enabled compute services up" on output it could not read.
  if jq -e . >/dev/null 2>&1 <<<"$S"; then
    BADS=$(jq -r '.[] | select(.Status=="enabled" and .State!="up") | "\(.Binary)@\(.Host)"' <<<"$S" 2>/dev/null || true)
    if [ -z "$BADS" ]; then
      ok "all enabled compute services up"
    else
      fail "compute services down: $(tr '\n' ' ' <<<"$BADS")"
    fi
  else
    fail "compute service list unreadable: $(head -1 <<<"$S")"
  fi

  echo "================ A6: octavia load balancers ================"
  L=$(openstack loadbalancer list -f json </dev/null 2>&1 || true)
  if jq -e . >/dev/null 2>&1 <<<"$L"; then
    CNT=$(jq 'length' <<<"$L")
    if [ "$CNT" -eq 0 ]; then
      ok "no LBs present (nothing to assert)"
    else
      BADL=$(jq -r '.[] | select(.provisioning_status!="ACTIVE" or .operating_status!="ONLINE")
        | "\(.name) prov=\(.provisioning_status) op=\(.operating_status)"' <<<"$L" || true)
      if [ -z "$BADL" ]; then
        ok "$CNT LB(s) ACTIVE/ONLINE"
      else
        # Join with " | "; `tr '\n' ' | '` only ever produced single spaces
        # because tr translates character-for-character.
        NL=$'\n'
        fail "LBs unhealthy: ${BADL//$NL/ | } (failover pattern: ops-restart-procedure Stage 6)"
      fi
    fi
  else
    # Unparseable output is treated as "octavia may not be deployed": warn only.
    warn "octavia not answering (absent?): $(head -1 <<<"$L")"
  fi

  echo "================ A7: identity + magnum behavior ================"
  # keystone's status line carries the policy-override marker.
  KS=$(J status -m "$MODEL" keystone --format=line || true)
  if grep -q 'PO (broken)' <<<"$KS"; then
    fail "keystone PO (broken) -- policy zip attached but unparsed"
  elif grep -q 'PO:' <<<"$KS"; then
    ok "keystone policy override loaded (PO:) -- run G3 (appendix-C C.4) for the behavioral gate"
  else
    fail "keystone shows NO policy override (DOCFIX-071: redeploy shipped without domain-manager RBAC?)"
  fi

  # Trustee domain must exist and be enabled (the CLI prints "True").
  D=$(openstack domain show magnum -f value -c enabled </dev/null 2>&1 || true)
  if grep -q '^True$' <<<"$D"; then
    ok "trustee domain 'magnum' exists (D-046)"
  else
    fail "trustee domain 'magnum' missing -- run 'juju run magnum/leader domain-setup' (D-046)"
  fi

  U=$(openstack user show magnum_domain_admin --domain magnum -f value -c name </dev/null 2>&1 || true)
  if grep -q '^magnum_domain_admin$' <<<"$U"; then
    ok "magnum_domain_admin exists"
  else
    fail "magnum_domain_admin missing (D-046)"
  fi

  # A conductor row proves the magnum API accepted the request.
  C=$(openstack coe service list -f value -c binary </dev/null 2>&1 || true)
  if grep -q 'magnum-conductor' <<<"$C"; then
    ok "coe service list returns conductor (no 403)"
  else
    fail "coe service list has no conductor row: $(head -1 <<<"$C")"
  fi
fi
# ---- A8: the RUNNING magnum-conductor must have been launched with --config-dir
# (D-037: inspect the live command line, not the configuration files).
echo "================ A8: conductor graft (launched args, D-037) ================"
P=$(J ssh -m "$MODEL" magnum/0 -- 'ps -ww -C magnum-conductor -o args= 2>&1' || true)
# First output line, quoted in the diagnostic messages below.
P_HEAD=$(head -1 <<<"$P")
if grep -q -- '--config-dir /etc/magnum/magnum.conf.d' <<<"$P"; then
  ok "conductor runs with --config-dir (graft live)"
elif grep -qi 'no such\|not found\|cannot' <<<"$P"; then
  # Output looks like an ssh/ps error rather than a process listing: hold.
  warn "magnum/0 unreachable for A8: $P_HEAD"
else
  fail "conductor LIVE args lack --config-dir (graft not in effect; config-file presence proves nothing): $P_HEAD"
fi
# ---- --capture: write the as-built BOM under asbuilt/<UTC timestamp>/ --------
# This is the script's only write. Paths are relative to the current directory.
if [ "$CAPTURE" -eq 1 ]; then
  echo "================ CAPTURE: as-built BOM ================"
  DIR="asbuilt/$(date -u +%Y%m%d-%H%M%S)"
  if ! mkdir -p "$DIR"; then
    # Previously unchecked: every write below failed, yet "BOM written" was
    # still printed. Hold (exit 2) instead of claiming a baseline exists.
    warn "cannot create $DIR -- BOM not captured"
  else
    # juju is called directly (not via J) for the two files below: J folds
    # stderr into stdout, which would put juju/ssh diagnostics inside files
    # that get committed and diffed as the drift baseline.
    juju export-bundle -m "$MODEL" </dev/null > "$DIR/bundle-exported.yaml" || warn "export-bundle failed"
    # $ST already passed jq validation in A0, so it is clean JSON.
    printf '%s\n' "$ST" > "$DIR/juju-status.json"
    # Best effort: an absent driver leaves an empty file. grep -E replaces the
    # obsolescent egrep, which prints a warning on current GNU grep.
    juju ssh -m "$MODEL" magnum/0 -- 'pip show magnum-capi-helm 2>/dev/null | grep -E "Version|Location"' </dev/null > "$DIR/driver-version.txt" 2>/dev/null || true
    if [ -n "${OS_AUTH_URL:-}" ]; then
      openstack image list --long -f json </dev/null > "$DIR/images.json" 2>/dev/null || true
    fi
    ok "BOM written to $DIR -- commit it (Roosevelt drift baseline)"
  fi
fi
# Final verdict: map WORST to the summary line, print it after a blank line,
# and exit with WORST (0 pass, 2 warnings/holds only, anything else is a fail).
case "$WORST" in
  0) VERDICT="CLOUD-ASSERT: PASS" ;;
  2) VERDICT="CLOUD-ASSERT: WARN/HELD -- review before trusting" ;;
  *) VERDICT="CLOUD-ASSERT: FAIL" ;;
esac
echo
echo "$VERDICT"
exit "$WORST"