Newer
Older
openstack-caracal-ipv4 / scripts / phase-02-vault-preflight.sh
#!/usr/bin/env bash
# scripts/phase-02-vault-preflight.sh [MODEL]
#
# Read-only verify-before-mutate GATE for phase-02 Step 2.1 (vault init).
# Packages the manual pre-flight audit into ONE re-runnable check so the DC-DC
# rehearsal has a single command to clear before the IRREVERSIBLE vault init.
#
# Mutates NOTHING. This scripts the read-only CHECKS only; the vault
# init/unseal/authorize MUTATIONS stay gated human steps (secret custody) --
# run those verbatim from runbooks/phase-02-vault-bringup.md, never from here.
#
# Asserts (all must hold to print PROCEED):
#   A. juju controller auth reachable (no stale macaroon).
#   B. every machine in MODEL is 'started'.
#   C. mysql-innodb-cluster (vault's storage backend): 3 units, all active+ONLINE,
#      exactly one R/W and the rest R/O.                                  [GATE]
#   D. vault is FRESH: exactly one vault unit, workload blocked with
#      "Vault needs to be initialized".                                   [GATE]
#      The irreversibility guard: if vault is NOT fresh it may already hold
#      keys -- DO NOT re-init; escalate.
#   E. zero workload-error and zero agent-error (hook-failed) anywhere,
#      subordinates included.  (Unit agent state is juju-status.current in
#      `juju status` JSON -- there is no agent-status key on units.)        [GATE]
#
# Usage:  scripts/phase-02-vault-preflight.sh [MODEL]      (default MODEL=openstack)
# Exit:   0 PROCEED | 1 HOLD (a gate failed / juju error) | 2 precondition
#         (jq missing, or MODEL not present yet)
#
# Values resolve dynamically from live status; nothing host/IP/ID is hardcoded.
# Read-only. Safe to re-run. ASCII + LF.

# Strict mode: stop on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
# Best effort: a failure to enable inherit_errexit is silenced and ignored.
shopt -s inherit_errexit 2>/dev/null || true
# Word-split on newline and tab only (never on spaces).
IFS=$'\n\t'

# Absolute directory of this script; the helper library is loaded from beside it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=scripts/lib-net.sh
source "$SCRIPT_DIR/lib-net.sh"

# The two applications the gates inspect, and the target model
# (first positional argument, default "openstack").
VAULT_APP="vault"
DB_APP="mysql-innodb-cluster"
MODEL="${1:-openstack}"

# Number of failed gates so far; incremented only by fail().
FATAL=0

# fail MSG...: print "FAIL: MSG" on stderr and count one more failed gate.
fail() {
  printf 'FAIL: %s\n' "$*" >&2
  FATAL=$((FATAL + 1))
}

# pass MSG...: print "PASS: MSG" on stdout.
pass() {
  printf 'PASS: %s\n' "$*"
}

# note MSG...: print "NOTE: MSG" on stdout (informational, not counted).
note() {
  printf 'NOTE: %s\n' "$*"
}

# need_jq is defined outside this section (presumably in lib-net.sh); a
# non-zero return ends the run with the precondition exit code 2.
if ! need_jq; then
  exit 2
fi

# --- A. auth: whoami runs DIRECTLY (never inside a command substitution) so
#        that a stale-macaroon password prompt reaches the tty BEFORE any
#        captured juju call (appendix-A: juju-macaroon -> 'juju login', retry).
printf '%s\n' "=== A. juju identity (a stale macaroon prompts/EOFs here -> run 'juju login') ==="
if ! juju whoami; then
  fail "juju whoami failed (auth/macaroon)"
  printf '%s\n' "Summary: HOLD (auth)"
  exit 1
fi
printf '\n'

# MODEL present? Spaces and units are per-model; a model typo would otherwise
# read as a healthy empty model. Strip any owner/ prefix; match the bare name.
#
# The listing is captured first so that a juju failure is reported as HOLD
# (exit 1, "juju error") instead of being mistaken for "model absent" (exit 2)
# with a misleading 'juju add-model' hint. The name is compared as an exact
# string inside jq, not as a grep regular expression.
if ! models_json="$(juju models --format json 2>/dev/null)"; then
  fail "juju models failed (juju error?) -- cannot tell whether model '$MODEL' exists"
  echo "Summary: HOLD (no model list)"
  exit 1
fi

# jq -e exit status: 0 = a model with that bare name exists, 1 = none does,
# anything else = jq could not process the listing.
model_rc=0
printf '%s' "$models_json" \
  | jq -e --arg m "$MODEL" \
      'any(.models[]?; ((.name // "") | split("/") | last) == $m)' \
      >/dev/null 2>&1 || model_rc=$?

case "$model_rc" in
  0) ;;
  1)
    note "model '$MODEL' not present -- run 'juju add-model $MODEL' first"
    echo "Summary: precondition (model absent)"
    exit 2
    ;;
  *)
    fail "could not parse 'juju models' output (jq exit $model_rc)"
    echo "Summary: HOLD (no model list)"
    exit 1
    ;;
esac

# --- one consistent status snapshot (avoids repeated calls / settle races) -----
# J holds the raw `juju status` JSON that every later check reads.
# Both a non-zero juju exit and an empty result count as "no snapshot": output
# written by a juju call that then failed is never evaluated by the gates.
if ! J="$(juju status -m "$MODEL" --format json 2>/dev/null)" || [ -z "$J" ]; then
  fail "juju status -m $MODEL failed or returned nothing (juju error?)"
  echo "Summary: HOLD (no status)"
  exit 1
fi

# --- single metrics pass: every count emitted as key=value (always all keys) ---
# One jq run over the snapshot J produces every counter the gates below read:
#   mach_total / mach_started            machines, and those whose juju-status is "started"
#   db_units / db_online / db_rw / db_active   counts over the DB_APP units
#   v_units / v_fresh                    VAULT_APP units, and those blocked on "needs to be initialized"
#   we / ae                              units in workload-error / agent-error
#   c_blocked c_waiting c_active c_unknown c_total   workload-status census
# `allunits` walks each application's units recursively and keeps every object
# that carries a "workload-status" key, so nested (subordinate) units count too.
# NOTE(review): mach_* counts only the top-level entries of .machines; if juju
# nests containers under a machine's own entry they are not counted here --
# confirm that is intended.
KV="$(printf '%s' "$J" | jq -r --arg dbapp "$DB_APP" --arg vaultapp "$VAULT_APP" '
  def allunits: [ .applications[] | (.units? // {}) | .. | objects | select(has("workload-status")) ];
  def appunits($a): [ (.applications[$a].units // {}) | .[] ];
  (allunits)            as $u
  | (appunits($dbapp))    as $db
  | (appunits($vaultapp)) as $vt
  | "mach_total="    + ((.machines // {}) | length | tostring)
  + "\nmach_started=" + ([ (.machines // {}) | to_entries[] | select(.value."juju-status".current=="started") ] | length | tostring)
  + "\ndb_units="    + ($db | length | tostring)
  + "\ndb_online="   + ([ $db[] | select((."workload-status".message // "") | test("ONLINE")) ] | length | tostring)
  + "\ndb_rw="       + ([ $db[] | select((."workload-status".message // "") | test("Mode: R/W")) ] | length | tostring)
  + "\ndb_active="   + ([ $db[] | select(."workload-status".current=="active") ] | length | tostring)
  + "\nv_units="     + ($vt | length | tostring)
  + "\nv_fresh="     + ([ $vt[] | select(."workload-status".current=="blocked" and ((."workload-status".message // "") | test("needs to be initialized";"i"))) ] | length | tostring)
  + "\nwe="          + ([ $u[] | select(."workload-status".current=="error") ] | length | tostring)
  + "\nae="          + ([ $u[] | select(."juju-status".current=="error") ] | length | tostring)
  + "\nc_blocked="   + ([ $u[] | select(."workload-status".current=="blocked") ] | length | tostring)
  + "\nc_waiting="   + ([ $u[] | select(."workload-status".current=="waiting") ] | length | tostring)
  + "\nc_active="    + ([ $u[] | select(."workload-status".current=="active") ] | length | tostring)
  + "\nc_unknown="   + ([ $u[] | select(."workload-status".current=="unknown") ] | length | tostring)
  + "\nc_total="     + ($u | length | tostring)
' 2>/dev/null || echo "")"
if [ -z "$KV" ]; then
  fail "metrics extraction failed (jq) -- cannot evaluate gates"
  echo "Summary: HOLD (jq)"
  exit 1
fi
# Import the counters without eval: each line must be <lowercase_name>=<digits>.
# The lowercase-only key pattern also means this loop can never overwrite the
# script's uppercase globals (J, MODEL, FATAL, ...). Anything else is a HOLD.
while IFS='=' read -r kv_key kv_val; do
  if ! [[ "$kv_key" =~ ^[a-z_]+$ && "$kv_val" =~ ^[0-9]+$ ]]; then
    fail "metrics extraction produced a malformed line ('$kv_key=$kv_val')"
    echo "Summary: HOLD (jq)"
    exit 1
  fi
  printf -v "$kv_key" '%s' "$kv_val"
done <<<"$KV"

# --- B. machines all started --------------------------------------------------
# Prints the started/total tally and a per-machine listing, then gates on the
# two counters: an empty model fails, and so does any machine not 'started'.
printf '%s\n' "=== B. machines (every machine must be 'started') ==="
printf '  %s started / %s total\n' "$mach_started" "$mach_total"
# Listing is informational only: a jq failure here is deliberately ignored.
jq -r '.machines // {} | to_entries[]
  | "    machine " + .key + "  [" + (.value."juju-status".current) + "]  "
    + ((.value.hostname // .value."instance-id" // "") | tostring)' 2>/dev/null <<<"$J" || true
if [ "$mach_total" -eq 0 ]; then
  fail "no machines in model '$MODEL'"
elif ! [ "$mach_started" -ne "$mach_total" ]; then
  pass "all $mach_total machines started"
else
  fail "$((mach_total - mach_started)) machine(s) not 'started'"
fi
printf '\n'

# --- C. mysql-innodb-cluster = vault's backend --------------------------------
# Lists every DB_APP unit with its workload status and message, then applies
# four checks: unit count is 3, every unit is active+ONLINE, exactly one unit
# reports "Mode: R/W", and every other unit reports "Mode: R/O".
echo "=== C. $DB_APP (vault backend): 3 units, all active+ONLINE, 1x R/W + rest R/O ==="
# Listing is informational only: a jq failure here is deliberately ignored.
printf '%s' "$J" | jq -r --arg a "$DB_APP" '(.applications[$a].units // {}) | to_entries[]
  | "  " + .key + "  [" + (.value."workload-status".current) + "]  "
    + (.value."workload-status".message // "")' 2>/dev/null || true
if [ "$db_units" -eq 3 ]; then pass "$DB_APP unit count = 3"
  else fail "$DB_APP unit count = $db_units (expect 3)"; fi
if [ "$db_active" -eq "$db_units" ] && [ "$db_online" -eq "$db_units" ] && [ "$db_units" -gt 0 ]; then
  pass "$DB_APP all $db_units units active+ONLINE"
  else fail "$DB_APP: active=$db_active online=$db_online of $db_units"; fi
if [ "$db_rw" -eq 1 ]; then pass "$DB_APP exactly 1 unit R/W"
  else fail "$DB_APP R/W count = $db_rw (expect exactly 1)"; fi
# "rest R/O": counted here from the same snapshot, because the R/W check alone
# does not show that the remaining units are in read-only mode. An empty or
# non-numeric count (jq failure) is treated as a failed gate.
db_ro="$(printf '%s' "$J" | jq -r --arg a "$DB_APP" '
  [ (.applications[$a].units // {}) | .[]
    | select((."workload-status".message // "") | test("Mode: R/O")) ] | length
' 2>/dev/null || echo "")"
if [[ "$db_ro" =~ ^[0-9]+$ ]] && [ "$db_ro" -eq "$((db_units - 1))" ]; then
  pass "$DB_APP remaining $db_ro unit(s) R/O"
else
  fail "$DB_APP R/O count = ${db_ro:-unknown} of $db_units (expect all but the single R/W)"
fi
echo

# --- D. vault must be FRESH (the irreversibility guard) -----------------------
# Lists the VAULT_APP units, then passes only when there is exactly one unit
# and exactly one unit counted as fresh (v_units == 1 and v_fresh == 1).
printf '=== D. %s must be FRESH: 1 unit, [blocked] "Vault needs to be initialized" ===\n' "$VAULT_APP"
# Listing is informational only: a jq failure here is deliberately ignored.
jq -r --arg a "$VAULT_APP" '(.applications[$a].units // {}) | to_entries[]
  | "  " + .key + "  [" + (.value."workload-status".current) + "]  "
    + (.value."workload-status".message // "")' 2>/dev/null <<<"$J" || true
if ! { [ "$v_units" -eq 1 ] && [ "$v_fresh" -eq 1 ]; }; then
  fail "$VAULT_APP NOT in the fresh blocked-needs-init state (units=$v_units fresh=$v_fresh)"
  # Operator guidance goes to stderr, next to the FAIL line.
  printf '%s\n' "      -> DO NOT run vault init; it may already hold keys. Escalate (phase-02 do-doc / appendix-A)." >&2
else
  pass "$VAULT_APP is fresh/uninitialized -- Step 2.1 (vault init) is safe"
fi
printf '\n'

# --- E. census: zero workload-error and zero agent-error (hook) ---------------
# Prints the one-line unit census, then gates on the two error counters;
# the blocked/waiting/active/unknown figures are shown but not gated on.
printf '%s\n' "=== E. census (workload-error AND agent-error(hook) MUST be 0; subordinates included) ==="
printf '  units=%s  |  workload-error=%s  agent-error(hook)=%s  |  blocked=%s  waiting=%s  active=%s  unknown=%s\n' \
  "$c_total" "$we" "$ae" "$c_blocked" "$c_waiting" "$c_active" "$c_unknown"
if ! [ "$we" -eq 0 ]; then
  fail "$we unit(s) in workload-error"
else
  pass "no workload-error units"
fi
if ! [ "$ae" -eq 0 ]; then
  fail "$ae unit(s) with agent-error (hook failed)"
else
  pass "no agent-error (hook-failed) units"
fi
printf '\n'

# --- verdict ------------------------------------------------------------------
# Any counted failure (or a FATAL value that does not compare equal to 0)
# ends the run as HOLD with exit 1; only a clean run reaches PROCEED / exit 0.
if ! [ "$FATAL" -eq 0 ]; then
  printf '%s\n' "Summary: HOLD -- $FATAL gate(s) failed. Do NOT start vault init; resolve first."
  exit 1
fi
printf '%s\n' "Summary: PROCEED -- vault is fresh and the backend is healthy."
printf '%s\n' "  Next: runbooks/phase-02-vault-bringup.md Step 2.1 (vault init -- IRREVERSIBLE one-shot)."
exit 0