Newer
Older
openstack-caracal-ipv4 / scripts / cloud-assert.sh
@JANeumatrix JANeumatrix 8 hours ago 10 KB Patches
#!/usr/bin/env bash
# scripts/cloud-assert.sh [--capture] [MODEL]
#
# Behavioral cloud verifier (DOCFIX-075). Runs EVERY "the service's own verdict"
# gate learned from the D-045/D-046/D-051/D-042 incident family -- the checks
# juju status is BLIND to -- as one idempotent, read-only sweep. Run it:
# post-deploy (phase-08 acceptance), post-restart (ops-restart-procedure Stage 7),
# pre-change baseline, and post-incident. Absorbs the jumphost-local
# post-maintenance-health-check.sh (which was never version-controlled).
#
# Sections (each independent; worst exit wins):
#   A0 juju reachable; no units in error/blocked (carried gss 'unknown' tolerated)
#   A1 vault: Initialized=true, Sealed=false (sealed-after-reboot is BY DESIGN --
#      the FIX is manual unseal per ops-restart-procedure, not a redeploy)
#   A2 mysql: 3 units 'Cluster is ONLINE', exactly one R/W (D-062 world)
#   A3 OVN central: Cluster ID uniform across the 3 units, both NB + SB DBs
#   A4 OVN chassis: ovn-controller 'connected' on every nova-compute + octavia
#   A5 compute plane: hypervisors up, compute services enabled+up   [needs OS_*]
#   A6 octavia: every LB provisioning=ACTIVE + operating=ONLINE (skip if none) [OS_*]
#   A7 identity/magnum behavior: keystone shows 'PO:' (D-051/DOCFIX-071);
#      trustee domain 'magnum' + magnum_domain_admin exist (D-046);
#      'openstack coe service list' returns the conductor row (no 403)  [OS_*]
#   A8 conductor graft: magnum-conductor LIVE process args carry --config-dir
#      (D-037: verify the launched cmdline, never the config text)
#
# --capture: additionally writes a deploy BOM (bill of materials) to
#   asbuilt/<UTC timestamp>/ : exported bundle, juju status JSON, driver pip
#   version, image list. Commit the directory -- it is the drift baseline for
#   Roosevelt multi-DC comparison. Capture is the ONLY write this script does,
#   and only under asbuilt/.
#
# Requirements: jumphost with juju; sections A5-A7 need an admin scope
# (source ~/admin-openrc first) -- absent scope is a HOLD (exit 2), not a skip,
# so a half-run is never mistaken for a pass.
# Exit: 0 all pass | 1 any FAIL | 2 warnings/holds only.  ASCII + LF.

# Strict-ish mode: unset variables are errors and pipelines report the first
# failing stage. errexit is deliberately NOT enabled -- every section must run
# so the worst result can be reported at the end.
set -u -o pipefail
shopt -s inherit_errexit 2>/dev/null || true

# Command line: optional --capture flag and an optional model name
# (the last non-flag argument wins; default model is "openstack").
MODEL="openstack"
CAPTURE=0
for a; do
  if [[ "$a" == "--capture" ]]; then
    CAPTURE=1
  elif [[ "$a" == -* ]]; then
    echo "FAIL: unknown flag $a"
    exit 1
  else
    MODEL="$a"
  fi
done
# J ARGS... -- run `juju ARGS...` detached from our stdin and with stderr folded
# into stdout, so callers can capture the whole reply in a variable first and
# test it afterwards (capture-then-test everywhere; the SIGPIPE rule).
J() {
  juju "$@" 2>&1 </dev/null
}
# Worst result seen so far; becomes the script's exit status.
#   0 = all pass, 2 = warnings/holds only, 1 = at least one FAIL.
WORST=0

# fail MSG... -- print a [FAIL] line and latch the overall result to 1.
fail() {
  echo "  [FAIL] $*"
  WORST=1
}

# warn MSG... -- print a [WARN] line and raise the overall result to 2,
# unless a FAIL (1) is already recorded. Always returns 0: the previous
# one-liner ended in `[ ... ] && WORST=2`, so it returned non-zero once a
# FAIL had been seen, which would misfire inside `cond && warn ... || other`
# chains (or under errexit).
warn() {
  echo "  [WARN] $*"
  if [ "$WORST" -ne 1 ]; then
    WORST=2
  fi
  return 0
}

# ok MSG... -- print an [ok] line; never changes the overall result.
ok() {
  echo "  [ok]   $*"
}

# Preflight: both CLIs are hard requirements; without either one nothing
# below can run, so stop immediately with exit 1.
if ! command -v juju >/dev/null 2>&1; then
  echo "FAIL: juju not found"
  exit 1
fi
if ! command -v jq >/dev/null 2>&1; then
  echo "FAIL: jq required"
  exit 1
fi

echo "================ A0: juju model + unit states ================"
# One JSON status snapshot; later sections and the --capture BOM reuse $ST.
ST=$(J status -m "$MODEL" --format=json || true)
if ! jq -e .applications >/dev/null 2>&1 <<<"$ST"; then
  # No parseable status means nothing else can be asserted: show the first
  # lines of whatever juju said and stop with exit 1.
  fail "cannot read juju status for model '$MODEL'"; echo "$ST" | head -3
  echo; echo "CLOUD-ASSERT: FAIL (model unreachable)"; exit 1
fi
# Units whose workload status is error/blocked. Subordinate units do not
# appear under their own application's "units"; they are nested under each
# principal unit's "subordinates" map, so both levels are scanned here --
# otherwise a blocked subordinate would never be reported.
BAD=$(jq -r '.applications | to_entries[] as $app | ($app.value.units // {}) | to_entries[]
      | (., ((.value.subordinates // {}) | to_entries[]))
      | select(.value["workload-status"].current | IN("error","blocked"))
      | "\(.key) \(.value["workload-status"].current): \(.value["workload-status"].message // "")"' <<<"$ST" || true)
# Principal units reporting 'unknown' (scope intentionally unchanged: only
# application-level units are considered for the gss carry check).
UNK=$(jq -r '.applications | to_entries[] as $app | ($app.value.units // {}) | to_entries[]
      | select(.value["workload-status"].current == "unknown") | .key' <<<"$ST" || true)
if [ -n "$BAD" ]; then
  while IFS= read -r l; do
    fail "unit $l"
  done <<<"$BAD"
else
  ok "no units in error/blocked"
fi
if [ -n "$UNK" ]; then
  # grep -v: is there any 'unknown' unit that is NOT glance-simplestreams-sync?
  if grep -qv 'glance-simplestreams-sync' <<<"$UNK"; then
    warn "units 'unknown' beyond parked gss: $(tr '\n' ' ' <<<"$UNK")"
  else
    ok "only parked gss reports 'unknown' (documented carry)"
  fi
fi

echo "================ A1: vault unsealed ================"
# Ask vault itself for its verdict; `|| true` keeps whatever text came back
# regardless of the exit status so it can be inspected below.
V=$(J ssh -m "$MODEL" vault/0 -- 'VAULT_ADDR=http://127.0.0.1:8200 vault status 2>&1' || true)
if ! grep -Eq 'Sealed *(true|false)' <<<"$V"; then
  # No seal state in the reply at all: the unit or the vault service did not
  # answer. Report that instead of misdiagnosing it as "sealed", which would
  # send the operator to the unseal procedure for the wrong reason.
  fail "vault status unreadable on vault/0 (unit unreachable or vault not running?): $(head -1 <<<"$V")"
elif ! grep -q 'Initialized *true' <<<"$V"; then
  fail "vault not initialized"
elif grep -q 'Sealed *false' <<<"$V"; then
  # Only reached when the Initialized check above has passed, so the message
  # is accurate on both counts.
  ok "vault Initialized=true Sealed=false"
else
  fail "vault SEALED -- manual 3-of-5 unseal (ops-restart-procedure Stage 3); sealed-after-restart is by design"
fi

echo "================ A2: mysql innodb cluster ================"
# One line per mysql-innodb-cluster unit: "<unit> <workload status message>",
# taken from the status snapshot in $ST.
M=$(jq -r '.applications["mysql-innodb-cluster"].units // {} | to_entries[]
     | "\(.key) \(.value["workload-status"].message // "")"' <<<"$ST" || true)
# grep -c prints the count even when it is 0 (and then exits 1, hence || true).
N=$(grep -c 'Cluster is ONLINE' <<<"$M" || true)
RW=$(grep -c 'Mode: R/W' <<<"$M" || true)
# Join the per-unit lines with " | " via parameter expansion. The previous
# `tr '\n' ' | '` could not do this: tr maps single characters, so it only
# turned newlines into spaces and the separator never appeared.
[ "${N:-0}" -eq 3 ] || fail "mysql ONLINE units=$N (want 3): ${M//$'\n'/ | }"
[ "${RW:-0}" -eq 1 ] || fail "mysql R/W count=$RW (want exactly 1)"
if [ "${N:-0}" -eq 3 ] && [ "${RW:-0}" -eq 1 ]; then
  ok "mysql 3x ONLINE, exactly one R/W"
fi

echo "================ A3: OVN central cluster unity ================"
# Unit names of ovn-central, read from the status snapshot in $ST.
UNITS=$(jq -r '.applications["ovn-central"].units // {} | keys[]' <<<"$ST" || true)
if [ -n "$UNITS" ]; then
  # Each spec is "<database name>:<control socket file>"; both DBs are checked.
  for spec in 'OVN_Northbound:ovnnb_db.ctl' 'OVN_Southbound:ovnsb_db.ctl'; do
    db=${spec%%:*}
    sock=${spec##*:}
    IDS=""
    while IFS= read -r u; do
      O=$(J ssh -m "$MODEL" "$u" -- "sudo ovs-appctl -t /var/run/ovn/${sock} cluster/status ${db} 2>&1" || true)
      # Third field of the first "Cluster ID" line is the short cluster id.
      CID=$(awk '/^Cluster ID/{print $3; exit}' <<<"$O")
      if [ -n "$CID" ]; then
        IDS="$IDS $CID"
      else
        fail "$u: no Cluster ID for $db"
      fi
    done <<<"$UNITS"
    # Number of distinct ids collected across the units; exactly 1 == uniform.
    U=$(tr ' ' '\n' <<<"$IDS" | sed '/^$/d' | sort -u | wc -l)
    if [ "$U" -eq 1 ]; then
      ok "$db Cluster ID uniform across units"
    else
      fail "$db Cluster IDs differ:$IDS"
    fi
  done
else
  warn "ovn-central units not found -- skipped"
fi

echo "================ A4: OVN chassis connectivity ================"
# Every nova-compute and octavia unit in the status snapshot is a chassis
# principal whose ovn-controller must report "connected".
CH=$(jq -r '.applications | to_entries[] | select(.key=="nova-compute" or .key=="octavia")
     | .value.units // {} | keys[]' <<<"$ST" || true)
if [ -n "$CH" ]; then
  while IFS= read -r u; do
    # Strip carriage returns so the reply can be matched at line start.
    C=$(J ssh -m "$MODEL" "$u" -- 'sudo ovn-appctl -t ovn-controller connection-status 2>&1' | tr -d '\r' || true)
    if grep -q '^connected' <<<"$C"; then
      ok "$u chassis connected"
    else
      fail "$u chassis NOT connected ($C) -- post-vault TLS sweep (appendix-A / restart Stage 4)"
    fi
  done <<<"$CH"
else
  warn "no chassis principals found -- skipped"
fi

if [ -z "${OS_AUTH_URL:-}" ]; then
  warn "A5-A7 HELD: no admin scope in env (source ~/admin-openrc, then re-run)"
else
  echo "================ A5: compute plane ================"
  # Hypervisors: any entry whose state is not "up" is a failure. The reply is
  # validated as JSON first so an error message is never read as "none down".
  H=$(openstack hypervisor list -f json </dev/null 2>&1 || true)
  if jq -e . >/dev/null 2>&1 <<<"$H"; then
    DOWN=$(jq -r '.[] | select((.State // ."state") != "up") | (."Hypervisor Hostname" // .hypervisor_hostname)' <<<"$H" || true)
    if [ -z "$DOWN" ]; then
      ok "all hypervisors up"
    else
      fail "hypervisors down: $(tr '\n' ' ' <<<"$DOWN") (restart nova-compute; see appendix-A)"
    fi
  else
    fail "hypervisor list unreadable: $(head -1 <<<"$H")"
  fi
  # Compute services: enabled but not "up" is a failure. Same JSON validation
  # as above -- previously an unreadable reply (CLI error, auth failure) made
  # jq fail silently, left BADS empty and was reported as "all enabled compute
  # services up", i.e. a false pass.
  S=$(openstack compute service list -f json </dev/null 2>&1 || true)
  if jq -e . >/dev/null 2>&1 <<<"$S"; then
    BADS=$(jq -r '.[] | select(.Status=="enabled" and .State!="up") | "\(.Binary)@\(.Host)"' <<<"$S" 2>/dev/null || true)
    if [ -z "$BADS" ]; then
      ok "all enabled compute services up"
    else
      fail "compute services down: $(tr '\n' ' ' <<<"$BADS")"
    fi
  else
    fail "compute service list unreadable: $(head -1 <<<"$S")"
  fi

  echo "================ A6: octavia load balancers ================"
  # Every load balancer must be provisioning=ACTIVE and operating=ONLINE.
  # A reply that is not JSON is only a warning (octavia may be absent).
  L=$(openstack loadbalancer list -f json </dev/null 2>&1 || true)
  if jq -e . >/dev/null 2>&1 <<<"$L"; then
    CNT=$(jq 'length' <<<"$L")
    if [ "$CNT" -eq 0 ]; then
      ok "no LBs present (nothing to assert)"
    else
      BADL=$(jq -r '.[] | select(.provisioning_status!="ACTIVE" or .operating_status!="ONLINE")
             | "\(.name) prov=\(.provisioning_status) op=\(.operating_status)"' <<<"$L" || true)
      if [ -z "$BADL" ]; then
        ok "$CNT LB(s) ACTIVE/ONLINE"
      else
        # Join entries with " | " via parameter expansion; `tr '\n' ' | '`
        # maps single characters and only ever produced spaces.
        fail "LBs unhealthy: ${BADL//$'\n'/ | } (failover pattern: ops-restart-procedure Stage 6)"
      fi
    fi
  else
    warn "octavia not answering (absent?): $(head -1 <<<"$L")"
  fi

  echo "================ A7: identity + magnum behavior ================"
  # Keystone's one-line status must carry the 'PO:' marker; 'PO (broken)' is
  # checked first because it is the more specific failure.
  KS=$(J status -m "$MODEL" keystone --format=line || true)
  if grep -q 'PO (broken)' <<<"$KS"; then
    fail "keystone PO (broken) -- policy zip attached but unparsed"
  elif grep -q 'PO:' <<<"$KS"; then
    ok "keystone policy override loaded (PO:) -- run G3 (appendix-C C.4) for the behavioral gate"
  else
    fail "keystone shows NO policy override (DOCFIX-071: redeploy shipped without domain-manager RBAC?)"
  fi

  # Trustee domain: the 'enabled' column must print exactly "True".
  D=$(openstack domain show magnum -f value -c enabled </dev/null 2>&1 || true)
  if grep -q '^True$' <<<"$D"; then
    ok "trustee domain 'magnum' exists (D-046)"
  else
    fail "trustee domain 'magnum' missing -- run 'juju run magnum/leader domain-setup' (D-046)"
  fi

  # Trustee admin user inside that domain: the name must echo back verbatim.
  U=$(openstack user show magnum_domain_admin --domain magnum -f value -c name </dev/null 2>&1 || true)
  if grep -q '^magnum_domain_admin$' <<<"$U"; then
    ok "magnum_domain_admin exists"
  else
    fail "magnum_domain_admin missing (D-046)"
  fi

  # The COE service list must come back with a magnum-conductor row.
  C=$(openstack coe service list -f value -c binary </dev/null 2>&1 || true)
  if grep -q 'magnum-conductor' <<<"$C"; then
    ok "coe service list returns conductor (no 403)"
  else
    fail "coe service list has no conductor row: $(head -1 <<<"$C")"
  fi
fi

echo "================ A8: conductor graft (launched args, D-037) ================"
# Inspect the arguments of the running magnum-conductor process on magnum/0
# (the launched command line, not the config text).
P=$(J ssh -m "$MODEL" magnum/0 -- 'ps -ww -C magnum-conductor -o args= 2>&1' || true)
if grep -q -- '--config-dir /etc/magnum/magnum.conf.d' <<<"$P"; then
  ok "conductor runs with --config-dir (graft live)"
elif grep -qi 'no such\|not found\|cannot' <<<"$P"; then
  # The reply looks like an error message rather than a process listing:
  # hold with a warning instead of failing.
  warn "magnum/0 unreachable for A8: $(head -1 <<<"$P")"
else
  fail "conductor LIVE args lack --config-dir (graft not in effect; config-file presence proves nothing): $(head -1 <<<"$P")"
fi

if [ "$CAPTURE" -eq 1 ]; then
  echo "================ CAPTURE: as-built BOM ================"
  # All capture output goes under asbuilt/<UTC timestamp>/ relative to the
  # current directory.
  DIR="asbuilt/$(date -u +%Y%m%d-%H%M%S)"
  if ! mkdir -p "$DIR"; then
    # Without the directory every write below would fail; say so instead of
    # announcing a BOM that was never written.
    warn "cannot create $DIR -- BOM not captured"
  else
    J export-bundle -m "$MODEL" > "$DIR/bundle-exported.yaml" || warn "export-bundle failed"
    # Reuse the status snapshot taken in A0 (JSON).
    printf '%s\n' "$ST" > "$DIR/juju-status.json"
    # grep -E rather than egrep: egrep is obsolescent and recent GNU grep
    # prints a warning for it, which J's 2>&1 would fold into the file.
    J ssh -m "$MODEL" magnum/0 -- 'pip show magnum-capi-helm 2>/dev/null | grep -E "Version|Location"' > "$DIR/driver-version.txt" || true
    # The image list needs OpenStack credentials; skipped silently without them.
    if [ -n "${OS_AUTH_URL:-}" ]; then
      openstack image list --long -f json </dev/null > "$DIR/images.json" 2>/dev/null || true
    fi
    ok "BOM written to $DIR -- commit it (Roosevelt drift baseline)"
  fi
fi

# Final verdict: a blank line, a one-line summary, then exit with the worst
# result recorded (0 pass, 2 warnings/holds only, anything else is a FAIL).
echo
if [[ "$WORST" == "0" ]]; then
  echo "CLOUD-ASSERT: PASS"
elif [[ "$WORST" == "2" ]]; then
  echo "CLOUD-ASSERT: WARN/HELD -- review before trusting"
else
  echo "CLOUD-ASSERT: FAIL"
fi
exit "$WORST"