#!/usr/bin/env bash
# scripts/phase-02-vault-preflight.sh [MODEL]
#
# Read-only verify-before-mutate GATE for phase-02 Step 2.1 (vault init).
# Packages the manual pre-flight audit into ONE re-runnable check so the DC-DC
# rehearsal has a single command to clear before the IRREVERSIBLE vault init.
#
# Mutates NOTHING. This scripts the read-only CHECKS only; the vault
# init/unseal/authorize MUTATIONS stay gated human steps (secret custody) --
# run those verbatim from runbooks/phase-02-vault-bringup.md, never from here.
#
# Asserts (all must hold to print PROCEED):
# A. juju controller auth reachable (no stale macaroon).
# B. every machine in MODEL is 'started'.
# C. mysql-innodb-cluster (vault's storage backend): 3 units, all active+ONLINE,
# exactly one R/W and the rest R/O. [GATE]
# D. vault is FRESH: exactly one vault unit, workload blocked with
# "Vault needs to be initialized". [GATE]
# The irreversibility guard: if vault is NOT fresh it may already hold
# keys -- DO NOT re-init; escalate.
# E. zero workload-error and zero agent-error (hook-failed) anywhere,
# subordinates included. [GATE]
#
# Usage: scripts/phase-02-vault-preflight.sh [MODEL] (default MODEL=openstack)
# Exit: 0 PROCEED | 1 HOLD (a gate failed / juju error) | 2 precondition
# (jq missing, or MODEL not present yet)
#
# Values resolve dynamically from live status; nothing host/IP/ID is hardcoded.
# Read-only. Safe to re-run. ASCII + LF.
# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
# Let command substitutions inherit errexit; tolerate shells where the option
# is unavailable.
shopt -s inherit_errexit 2>/dev/null || true
# Word-split on newline and tab only (never on spaces).
IFS=$'\n\t'
# Directory holding this script, used to locate the helper library beside it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=scripts/lib-net.sh
source "$SCRIPT_DIR/lib-net.sh"
# Model under test: first positional argument, defaulting to 'openstack'.
MODEL="${1:-openstack}"
# Application names looked up in the juju status snapshot.
DB_APP="mysql-innodb-cluster"
VAULT_APP="vault"
# Number of failed gates; incremented by fail() and read by the final verdict.
FATAL=0
# Report a failed gate on stderr and count it in the global FATAL tally.
# Arguments: message words (joined via "$*").
fail() {
  printf 'FAIL: %s\n' "$*" >&2
  FATAL=$((FATAL + 1))
}
# Report a satisfied check on stdout.
# Arguments: message words (joined via "$*").
pass() {
  printf 'PASS: %s\n' "$*"
}
# Print an informational remark on stdout (does not affect the verdict).
# Arguments: message words (joined via "$*").
note() {
  printf 'NOTE: %s\n' "$*"
}
# Precondition: every JSON check below needs jq. need_jq is not defined in
# this file (presumably provided by the sourced lib-net.sh -- confirm); a
# non-zero return maps to exit 2 (precondition).
need_jq || exit 2
# --- A. auth: whoami DIRECTLY (not in a substitution) so a stale-macaroon
# password prompt reaches the tty BEFORE any captured juju call
# (appendix-A: juju-macaroon -> 'juju login' then retry). ----------------
echo "=== A. juju identity (a stale macaroon prompts/EOFs here -> run 'juju login') ==="
juju whoami || { fail "juju whoami failed (auth/macaroon)"; echo "Summary: HOLD (auth)"; exit 1; }
echo
# MODEL present? Spaces and units are per-model; a model typo would otherwise
# read as a healthy empty model. Strip any owner/ prefix; match the bare name.
#
# The model list is captured first so that a juju/jq failure is reported as a
# juju error (exit 1, HOLD) instead of being mistaken for "model absent"
# (exit 2) with misleading 'juju add-model' advice.
if ! model_names="$(juju models --format json 2>/dev/null | jq -r '.models[]?.name' 2>/dev/null)"; then
  fail "juju models failed (juju/jq error) -- cannot confirm model '$MODEL' exists"
  echo "Summary: HOLD (no model list)"
  exit 1
fi
# Compare as literal strings in-shell: no regex interpretation of MODEL, and
# no 'grep -q' closing the pipe early under pipefail. An owner/ prefix on the
# requested MODEL is stripped too, so 'admin/openstack' matches 'openstack'.
model_want="${MODEL##*/}"
model_found=0
while IFS= read -r model_name; do
  if [ "${model_name##*/}" = "$model_want" ]; then
    model_found=1
    break
  fi
done <<<"$model_names"
if [ "$model_found" -ne 1 ]; then
  note "model '$MODEL' not present -- run 'juju add-model $MODEL' first"
  echo "Summary: precondition (model absent)"
  exit 2
fi
# --- one consistent status snapshot (avoids repeated calls / settle races) -----
# Keep whatever juju wrote to stdout; a juju failure is tolerated at this point
# and surfaces through the emptiness check right below.
J="$(juju status -m "$MODEL" --format json 2>/dev/null)" || true
[ -n "$J" ] || {
  fail "juju status -m $MODEL returned nothing (juju error?)"
  echo "Summary: HOLD (no status)"
  exit 1
}
# --- single metrics pass: every count emitted as key=value (always all keys) ---
# One jq program walks the status snapshot in $J and prints 15 'key=value'
# lines (machine, DB, vault and census counters). Notes on the program:
#   * allunits collects every object under any application's units that has a
#     "workload-status" key, which also reaches nested subordinate units.
#   * agentstate reads the unit agent state from "juju-status" (the key this
#     script already uses for machines) and falls back to "agent-status".
#     Reading only "agent-status" made the agent-error count permanently 0
#     whenever the snapshot carries the state under "juju-status".
#     NOTE(review): only the literal state "error" is counted; confirm whether
#     other agent states (e.g. lost/failed) should also hold the gate.
# Any jq failure yields an empty KV, which is treated as HOLD (fail closed).
KV="$(printf '%s' "$J" | jq -r --arg dbapp "$DB_APP" --arg vaultapp "$VAULT_APP" '
  def allunits: [ .applications[] | (.units? // {}) | .. | objects | select(has("workload-status")) ];
  def appunits($a): [ (.applications[$a].units // {}) | .[] ];
  def agentstate: (."juju-status" // ."agent-status" // {}) | .current;
  (allunits) as $u
  | (appunits($dbapp)) as $db
  | (appunits($vaultapp)) as $vt
  | "mach_total=" + ((.machines // {}) | length | tostring)
  + "\nmach_started=" + ([ (.machines // {}) | to_entries[] | select(.value."juju-status".current=="started") ] | length | tostring)
  + "\ndb_units=" + ($db | length | tostring)
  + "\ndb_online=" + ([ $db[] | select((."workload-status".message // "") | test("ONLINE")) ] | length | tostring)
  + "\ndb_rw=" + ([ $db[] | select((."workload-status".message // "") | test("Mode: R/W")) ] | length | tostring)
  + "\ndb_active=" + ([ $db[] | select(."workload-status".current=="active") ] | length | tostring)
  + "\nv_units=" + ($vt | length | tostring)
  + "\nv_fresh=" + ([ $vt[] | select(."workload-status".current=="blocked" and ((."workload-status".message // "") | test("needs to be initialized";"i"))) ] | length | tostring)
  + "\nwe=" + ([ $u[] | select(."workload-status".current=="error") ] | length | tostring)
  + "\nae=" + ([ $u[] | select(agentstate == "error") ] | length | tostring)
  + "\nc_blocked=" + ([ $u[] | select(."workload-status".current=="blocked") ] | length | tostring)
  + "\nc_waiting=" + ([ $u[] | select(."workload-status".current=="waiting") ] | length | tostring)
  + "\nc_active=" + ([ $u[] | select(."workload-status".current=="active") ] | length | tostring)
  + "\nc_unknown=" + ([ $u[] | select(."workload-status".current=="unknown") ] | length | tostring)
  + "\nc_total=" + ($u | length | tostring)
' 2>/dev/null)" || KV=""
if [ -z "$KV" ]; then
  fail "metrics extraction failed (jq) -- cannot evaluate gates"
  echo "Summary: HOLD (jq)"
  exit 1
fi
# Load the counters without eval: accept only the known keys with unsigned
# integer values, assign each via printf -v, and require all 15 to be present.
# Anything else is a HOLD rather than something the shell gets to interpret.
metric_count=0
while IFS='=' read -r metric_key metric_val; do
  case "$metric_key" in
    mach_total | mach_started | db_units | db_online | db_rw | db_active | \
      v_units | v_fresh | we | ae | c_blocked | c_waiting | c_active | \
      c_unknown | c_total) ;;
    *) metric_key="" ;;
  esac
  case "$metric_val" in
    '' | *[!0-9]*) metric_key="" ;;
  esac
  if [ -z "$metric_key" ]; then
    fail "metrics extraction produced an unexpected line -- cannot evaluate gates"
    echo "Summary: HOLD (jq)"
    exit 1
  fi
  printf -v "$metric_key" '%s' "$metric_val"
  metric_count=$((metric_count + 1))
done <<<"$KV"
if [ "$metric_count" -ne 15 ]; then
  fail "metrics extraction returned $metric_count of 15 counters -- cannot evaluate gates"
  echo "Summary: HOLD (jq)"
  exit 1
fi
# --- B. machines all started --------------------------------------------------
# Gate: the model must contain at least one machine, and the number of machines
# whose juju-status is 'started' must equal the total (counters come from the
# metrics pass).
echo "=== B. machines (every machine must be 'started') ==="
printf ' %s started / %s total\n' "$mach_started" "$mach_total"
# Informational per-machine listing only; a jq hiccup here must not abort.
jq -r '(.machines // {}) | to_entries[]
  | .value as $m
  | " machine " + .key + " [" + ($m."juju-status".current) + "] "
    + (($m.hostname // $m."instance-id" // "") | tostring)' <<<"$J" 2>/dev/null || true
mach_pending=$((mach_total - mach_started))
if [ "$mach_total" -eq 0 ]; then
  fail "no machines in model '$MODEL'"
elif [ "$mach_pending" -ne 0 ]; then
  fail "$mach_pending machine(s) not 'started'"
else
  pass "all $mach_total machines started"
fi
echo
# --- C. mysql-innodb-cluster = vault's backend --------------------------------
# Gate: exactly 3 DB units, each active with an ONLINE status message, and
# exactly one unit reporting "Mode: R/W" (counters come from the metrics pass).
echo "=== C. $DB_APP (vault backend): 3 units, all active+ONLINE, 1x R/W + rest R/O ==="
# Informational per-unit listing only; a jq hiccup here must not abort.
jq -r --arg a "$DB_APP" '(.applications[$a].units // {}) | to_entries[]
  | .value."workload-status" as $ws
  | " " + .key + " [" + ($ws.current) + "] "
    + ($ws.message // "")' <<<"$J" 2>/dev/null || true
# Unit count.
if [ "$db_units" -ne 3 ]; then
  fail "$DB_APP unit count = $db_units (expect 3)"
else
  pass "$DB_APP unit count = 3"
fi
# Health: any shortfall in active/ONLINE units (or zero units) fails.
if [ "$db_units" -le 0 ] || [ "$db_active" -ne "$db_units" ] || [ "$db_online" -ne "$db_units" ]; then
  fail "$DB_APP: active=$db_active online=$db_online of $db_units"
else
  pass "$DB_APP all $db_units units active+ONLINE"
fi
# Single writer.
if [ "$db_rw" -ne 1 ]; then
  fail "$DB_APP R/W count = $db_rw (expect exactly 1)"
else
  pass "$DB_APP exactly 1 unit R/W"
fi
echo
# --- D. vault must be FRESH (the irreversibility guard) -----------------------
# Gate: exactly one vault unit, and that unit counted as fresh by the metrics
# pass (workload blocked with a "needs to be initialized" message).
echo "=== D. $VAULT_APP must be FRESH: 1 unit, [blocked] \"Vault needs to be initialized\" ==="
# Informational per-unit listing only; a jq hiccup here must not abort.
jq -r --arg a "$VAULT_APP" '(.applications[$a].units // {}) | to_entries[]
  | .value."workload-status" as $ws
  | " " + .key + " [" + ($ws.current) + "] "
    + ($ws.message // "")' <<<"$J" 2>/dev/null || true
if [ "$v_units" -ne 1 ] || [ "$v_fresh" -ne 1 ]; then
  fail "$VAULT_APP NOT in the fresh blocked-needs-init state (units=$v_units fresh=$v_fresh)"
  echo " -> DO NOT run vault init; it may already hold keys. Escalate (phase-02 do-doc / appendix-A)." >&2
else
  pass "$VAULT_APP is fresh/uninitialized -- Step 2.1 (vault init) is safe"
fi
echo
# --- E. census: zero workload-error and zero agent-error (hook) ---------------
# Gate: both error counters from the metrics pass must be zero; the remaining
# state counters are printed for context only.
echo "=== E. census (workload-error AND agent-error(hook) MUST be 0; subordinates included) ==="
printf ' units=%s | workload-error=%s agent-error(hook)=%s | blocked=%s waiting=%s active=%s unknown=%s\n' \
  "$c_total" "$we" "$ae" "$c_blocked" "$c_waiting" "$c_active" "$c_unknown"
if [ "$we" -ne 0 ]; then
  fail "$we unit(s) in workload-error"
else
  pass "no workload-error units"
fi
if [ "$ae" -ne 0 ]; then
  fail "$ae unit(s) with agent-error (hook failed)"
else
  pass "no agent-error (hook-failed) units"
fi
echo
# --- verdict ------------------------------------------------------------------
# Any recorded gate failure means HOLD (exit 1); otherwise PROCEED (exit 0).
if [ "$FATAL" -ne 0 ]; then
  echo "Summary: HOLD -- $FATAL gate(s) failed. Do NOT start vault init; resolve first."
  exit 1
fi
echo "Summary: PROCEED -- vault is fresh and the backend is healthy."
echo " Next: runbooks/phase-02-vault-bringup.md Step 2.1 (vault init -- IRREVERSIBLE one-shot)."
exit 0