#!/usr/bin/env bash
# scripts/phase-02-vault-preflight.sh [MODEL]
#
# Read-only verify-before-mutate GATE for phase-02 Step 2.1 (vault init).
# Packages the manual pre-flight audit into ONE re-runnable check so the DC-DC
# rehearsal has a single command to clear before the IRREVERSIBLE vault init.
#
# Mutates NOTHING. This scripts the read-only CHECKS only; the vault
# init/unseal/authorize MUTATIONS stay gated human steps (secret custody) --
# run those verbatim from runbooks/phase-02-vault-bringup.md, never from here.
#
# Asserts (all must hold to print PROCEED):
#   A. juju controller auth reachable (no stale macaroon).
#   B. every machine in MODEL is 'started'.
#   C. mysql-innodb-cluster (vault's storage backend): 3 units, all
#      active+ONLINE, exactly one R/W and the rest R/O.              [GATE]
#   D. vault is FRESH: exactly one vault unit, workload blocked with
#      "Vault needs to be initialized".                              [GATE]
#      The irreversibility guard: if vault is NOT fresh it may already hold
#      keys -- DO NOT re-init; escalate.
#   E. zero workload-error and zero agent-error (hook-failed) anywhere,
#      subordinates included.                                        [GATE]
#
# Usage: scripts/phase-02-vault-preflight.sh [MODEL]  (default MODEL=openstack)
# Exit:  0 PROCEED | 1 HOLD (a gate failed / juju error) | 2 precondition
#        (jq missing, or MODEL not present yet)
#
# Values resolve dynamically from live status; nothing host/IP/ID is hardcoded.
# Read-only. Safe to re-run. ASCII + LF.

set -euo pipefail
shopt -s inherit_errexit 2>/dev/null || true   # bash >= 4.4 only; harmless otherwise
IFS=$'\n\t'

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=scripts/lib-net.sh
. "$SCRIPT_DIR/lib-net.sh"

MODEL="${1:-openstack}"
DB_APP="mysql-innodb-cluster"
VAULT_APP="vault"

# FATAL counts failed gates; the verdict at the bottom is PROCEED only at 0.
FATAL=0
fail() { echo "FAIL: $*" >&2; FATAL=$((FATAL + 1)); }
pass() { echo "PASS: $*"; }
note() { echo "NOTE: $*"; }

# need_jq comes from lib-net.sh; a missing jq is a precondition, not a gate.
need_jq || exit 2

# --- A. auth: whoami DIRECTLY (not in a substitution) so a stale-macaroon
#        password prompt reaches the tty BEFORE any captured juju call
#        (appendix-A: juju-macaroon -> 'juju login' then retry). ----------------
echo "=== A. juju identity (a stale macaroon prompts/EOFs here -> run 'juju login') ==="
juju whoami || { fail "juju whoami failed (auth/macaroon)"; echo "Summary: HOLD (auth)"; exit 1; }
echo

# MODEL present? Spaces and units are per-model; a model typo would otherwise
# read as a healthy empty model. The list is captured first so that a juju
# failure is reported as HOLD (exit 1) instead of being mistaken for an absent
# model (exit 2).
MODELS_JSON="$(juju models --format json 2>/dev/null || echo "")"
if [ -z "$MODELS_JSON" ]; then
  fail "juju models returned nothing (juju error?)"
  echo "Summary: HOLD (no model list)"
  exit 1
fi
# Strip any owner/ prefix and compare the bare name EXACTLY. Doing the match
# inside jq keeps MODEL from being interpreted as a regex and avoids an
# early-exiting 'grep -q' tripping pipefail via SIGPIPE.
if ! printf '%s' "$MODELS_JSON" \
  | jq -e --arg m "$MODEL" \
    'any(.models[]?; ((.name // "") | tostring | sub("^.*/"; "")) == $m)' \
    >/dev/null 2>&1; then
  note "model '$MODEL' not present -- run 'juju add-model $MODEL' first"
  echo "Summary: precondition (model absent)"
  exit 2
fi

# --- one consistent status snapshot (avoids repeated calls / settle races) -----
J="$(juju status -m "$MODEL" --format json 2>/dev/null || echo "")"
if [ -z "$J" ]; then
  fail "juju status -m $MODEL returned nothing (juju error?)"
  echo "Summary: HOLD (no status)"
  exit 1
fi

# --- single metrics pass: every count emitted as key=value (always all keys) ---
# allunits walks principals AND nested subordinates (anything carrying a
# workload-status). agent_cur reads the unit agent state from 'juju-status'
# (the key 'juju status --format json' uses, same as for machines) and falls
# back to 'agent-status' so either spelling is honoured.
KV="$(printf '%s' "$J" | jq -r --arg dbapp "$DB_APP" --arg vaultapp "$VAULT_APP" '
  def allunits: [ .applications[] | (.units? // {}) | .. | objects | select(has("workload-status")) ];
  def appunits($a): [ (.applications[$a].units // {}) | .[] ];
  def wl_cur: ."workload-status".current;
  def wl_msg: (."workload-status".message // "");
  def agent_cur: ((."juju-status" // ."agent-status" // {}) | .current);
  def n_of(g): [g] | length;
  (allunits) as $u
  | (appunits($dbapp)) as $db
  | (appunits($vaultapp)) as $vt
  | {
      mach_total:   ((.machines // {}) | length),
      mach_started: n_of((.machines // {}) | to_entries[] | select(.value."juju-status".current == "started")),
      db_units:     ($db | length),
      db_online:    n_of($db[] | select(wl_msg | test("ONLINE"))),
      db_rw:        n_of($db[] | select(wl_msg | test("Mode: R/W"))),
      db_ro:        n_of($db[] | select(wl_msg | test("Mode: R/O"))),
      db_active:    n_of($db[] | select(wl_cur == "active")),
      v_units:      ($vt | length),
      v_fresh:      n_of($vt[] | select(wl_cur == "blocked" and (wl_msg | test("needs to be initialized";"i")))),
      we:           n_of($u[] | select(wl_cur == "error")),
      ae:           n_of($u[] | select(agent_cur == "error")),
      c_blocked:    n_of($u[] | select(wl_cur == "blocked")),
      c_waiting:    n_of($u[] | select(wl_cur == "waiting")),
      c_active:     n_of($u[] | select(wl_cur == "active")),
      c_unknown:    n_of($u[] | select(wl_cur == "unknown")),
      c_total:      ($u | length)
    }
  | to_entries[]
  | "\(.key)=\(.value)"
' 2>/dev/null || echo "")"
if [ -z "$KV" ]; then
  fail "metrics extraction failed (jq) -- cannot evaluate gates"
  echo "Summary: HOLD (jq)"
  exit 1
fi

# Load the metrics WITHOUT eval: accept only whitelisted keys with unsigned
# integer values, and require every key to be present. Anything else means the
# gates cannot be trusted, so HOLD rather than evaluate on partial data.
METRIC_KEYS=(mach_total mach_started db_units db_online db_rw db_ro db_active
  v_units v_fresh we ae c_blocked c_waiting c_active c_unknown c_total)
mach_total="" mach_started=""
db_units="" db_online="" db_rw="" db_ro="" db_active=""
v_units="" v_fresh=""
we="" ae=""
c_blocked="" c_waiting="" c_active="" c_unknown="" c_total=""

while IFS='=' read -r metric_key metric_val; do
  metric_known=0
  for k in "${METRIC_KEYS[@]}"; do
    if [ "$k" = "$metric_key" ]; then metric_known=1; break; fi
  done
  if [ "$metric_known" -ne 1 ] || ! [[ "$metric_val" =~ ^[0-9]+$ ]]; then
    fail "unexpected metrics line '$metric_key=$metric_val' -- cannot evaluate gates"
    echo "Summary: HOLD (jq)"
    exit 1
  fi
  printf -v "$metric_key" '%s' "$metric_val"
done <<<"$KV"

for k in "${METRIC_KEYS[@]}"; do
  if [ -z "${!k}" ]; then
    fail "metric '$k' missing from jq output -- cannot evaluate gates"
    echo "Summary: HOLD (jq)"
    exit 1
  fi
done

# --- B. machines all started --------------------------------------------------
echo "=== B. machines (every machine must be 'started') ==="
printf ' %s started / %s total\n' "$mach_started" "$mach_total"
# Per-machine listing is informational only; a display failure must not abort.
printf '%s' "$J" | jq -r '.machines // {} | to_entries[]
  | " machine " + .key + " [" + (.value."juju-status".current) + "] "
    + ((.value.hostname // .value."instance-id" // "") | tostring)' 2>/dev/null || true
if [ "$mach_total" -eq 0 ]; then
  fail "no machines in model '$MODEL'"
elif [ "$mach_started" -ne "$mach_total" ]; then
  fail "$((mach_total - mach_started)) machine(s) not 'started'"
else
  pass "all $mach_total machines started"
fi
echo

# --- C. mysql-innodb-cluster = vault's backend --------------------------------
echo "=== C. $DB_APP (vault backend): 3 units, all active+ONLINE, 1x R/W + rest R/O ==="
printf '%s' "$J" | jq -r --arg a "$DB_APP" '(.applications[$a].units // {}) | to_entries[]
  | " " + .key + " [" + (.value."workload-status".current) + "] "
    + (.value."workload-status".message // "")' 2>/dev/null || true
if [ "$db_units" -eq 3 ]; then
  pass "$DB_APP unit count = 3"
else
  fail "$DB_APP unit count = $db_units (expect 3)"
fi
if [ "$db_active" -eq "$db_units" ] && [ "$db_online" -eq "$db_units" ] && [ "$db_units" -gt 0 ]; then
  pass "$DB_APP all $db_units units active+ONLINE"
else
  fail "$DB_APP: active=$db_active online=$db_online of $db_units"
fi
if [ "$db_rw" -eq 1 ]; then
  pass "$DB_APP exactly 1 unit R/W"
else
  fail "$DB_APP R/W count = $db_rw (expect exactly 1)"
fi
# "the rest R/O": every unit other than the single writer must report R/O.
if [ "$db_units" -gt 0 ] && [ "$db_ro" -eq $((db_units - 1)) ]; then
  pass "$DB_APP remaining $db_ro unit(s) R/O"
else
  fail "$DB_APP R/O count = $db_ro of $db_units (expect every unit except the single R/W)"
fi
echo

# --- D. vault must be FRESH (the irreversibility guard) -----------------------
echo "=== D. $VAULT_APP must be FRESH: 1 unit, [blocked] \"Vault needs to be initialized\" ==="
printf '%s' "$J" | jq -r --arg a "$VAULT_APP" '(.applications[$a].units // {}) | to_entries[]
  | " " + .key + " [" + (.value."workload-status".current) + "] "
    + (.value."workload-status".message // "")' 2>/dev/null || true
if [ "$v_units" -eq 1 ] && [ "$v_fresh" -eq 1 ]; then
  pass "$VAULT_APP is fresh/uninitialized -- Step 2.1 (vault init) is safe"
else
  fail "$VAULT_APP NOT in the fresh blocked-needs-init state (units=$v_units fresh=$v_fresh)"
  echo " -> DO NOT run vault init; it may already hold keys. Escalate (phase-02 do-doc / appendix-A)." >&2
fi
echo

# --- E. census: zero workload-error and zero agent-error (hook) ---------------
echo "=== E. census (workload-error AND agent-error(hook) MUST be 0; subordinates included) ==="
printf ' units=%s | workload-error=%s agent-error(hook)=%s | blocked=%s waiting=%s active=%s unknown=%s\n' \
  "$c_total" "$we" "$ae" "$c_blocked" "$c_waiting" "$c_active" "$c_unknown"
if [ "$we" -eq 0 ]; then
  pass "no workload-error units"
else
  fail "$we unit(s) in workload-error"
fi
if [ "$ae" -eq 0 ]; then
  pass "no agent-error (hook-failed) units"
else
  fail "$ae unit(s) with agent-error (hook failed)"
fi
echo

# --- verdict ------------------------------------------------------------------
if [ "$FATAL" -eq 0 ]; then
  echo "Summary: PROCEED -- vault is fresh and the backend is healthy."
  echo " Next: runbooks/phase-02-vault-bringup.md Step 2.1 (vault init -- IRREVERSIBLE one-shot)."
  exit 0
else
  echo "Summary: HOLD -- $FATAL gate(s) failed. Do NOT start vault init; resolve first."
  exit 1
fi