diff --git a/scripts/phase-02-vault-preflight.sh b/scripts/phase-02-vault-preflight.sh index 51dce0b..8789d85 100644 --- a/scripts/phase-02-vault-preflight.sh +++ b/scripts/phase-02-vault-preflight.sh @@ -19,7 +19,8 @@ # The irreversibility guard: if vault is NOT fresh it may already hold # keys -- DO NOT re-init; escalate. # E. zero workload-error and zero agent-error (hook-failed) anywhere, -# subordinates included. [GATE] +# subordinates included. (Unit agent state is juju-status.current in +# `juju status` JSON -- there is no agent-status key on units.) [GATE] # # Usage: scripts/phase-02-vault-preflight.sh [MODEL] (default MODEL=openstack) # Exit: 0 PROCEED | 1 HOLD (a gate failed / juju error) | 2 precondition @@ -87,7 +88,7 @@ + "\nv_units=" + ($vt | length | tostring) + "\nv_fresh=" + ([ $vt[] | select(."workload-status".current=="blocked" and ((."workload-status".message // "") | test("needs to be initialized";"i"))) ] | length | tostring) + "\nwe=" + ([ $u[] | select(."workload-status".current=="error") ] | length | tostring) - + "\nae=" + ([ $u[] | select(."agent-status".current=="error") ] | length | tostring) + + "\nae=" + ([ $u[] | select(."juju-status".current=="error") ] | length | tostring) + "\nc_blocked=" + ([ $u[] | select(."workload-status".current=="blocked") ] | length | tostring) + "\nc_waiting=" + ([ $u[] | select(."workload-status".current=="waiting") ] | length | tostring) + "\nc_active=" + ([ $u[] | select(."workload-status".current=="active") ] | length | tostring) diff --git a/scripts/phase-03-core-verify.sh b/scripts/phase-03-core-verify.sh new file mode 100644 index 0000000..19b1723 --- /dev/null +++ b/scripts/phase-03-core-verify.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +# scripts/phase-03-core-verify.sh [MODEL] +# +# Read-only Step 3.1 gate for phase-03 (cloud settled + haproxy backends healthy). 
# Packages two checks:
#   3.1a acceptance walk -- only the expected post-deploy exceptions may be non-active/idle
#        (octavia awaiting configure-resources; glance-simplestreams-sync image-sync state).
#        Delegates the classify/gate to scripts/phase03_accept_walk.py (identity, not count).
#   3.1b haproxy backend-health sweep (D-045 / DOCFIX-031) -- juju status is BLIND to a
#        charm-rendered haproxy backend that is silently DOWN (it hid a dead nova-api ~3
#        days behind a green status), so probe every principal unit's admin socket directly.
#
# Mutates NOTHING. A DOWN backend's remediation (haproxy -c validate + systemctl reload)
# stays a gated per-unit human step -- this script only DETECTS and reports it.
#
# Usage: scripts/phase-03-core-verify.sh [MODEL]   (default MODEL=openstack)
# Exit:  0 PROCEED | 1 HOLD (a gate failed / juju error) | 2 precondition
#        (jq / python3 / helper missing, or MODEL not present)
#
# Values resolve dynamically from live status; nothing host/IP/ID hardcoded.
# Read-only. Safe to re-run. ASCII + LF.

set -euo pipefail
shopt -s inherit_errexit 2>/dev/null || true
IFS=$'\n\t'

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=scripts/lib-net.sh
. "$SCRIPT_DIR/lib-net.sh"

MODEL="${1:-openstack}"
WALK="$SCRIPT_DIR/phase03_accept_walk.py"

# Remote one-liner run on each unit: a unit without the haproxy admin socket exits 0
# silently; otherwise dump 'show stat' and keep only per-server rows that are DOWN.
# The remote exit status is therefore grep's: 0 = DOWN row(s) printed, 1 = none.
readonly HAPROXY_PROBE="test -S /var/run/haproxy/admin.sock || exit 0; sudo python3 -c 'import socket;s=socket.socket(socket.AF_UNIX);s.connect(\"/var/run/haproxy/admin.sock\");s.sendall(b\"show stat\n\");print(s.makefile().read())' | grep -vE 'FRONTEND|BACKEND' | grep ',DOWN,'"

FATAL=0
fail() { echo "FAIL: $*" >&2; FATAL=$((FATAL + 1)); }
pass() { echo "PASS: $*"; }

need_jq || exit 2
command -v python3 >/dev/null 2>&1 || { echo "FAIL: python3 required" >&2; exit 2; }
[ -f "$WALK" ] || { echo "FAIL: helper not found: $WALK" >&2; exit 2; }

# --- A. auth: whoami DIRECTLY so a stale-macaroon prompt reaches the tty first ----
echo "=== A. juju identity (a stale macaroon prompts/EOFs here -> run 'juju login') ==="
juju whoami || { fail "juju whoami failed (auth/macaroon)"; echo "Summary: HOLD (auth)"; exit 1; }
echo

# Capture the model list first, then match it as a fixed string. Matching inside the
# pipeline would (a) treat MODEL as a regex and (b) let 'grep -q' exit early, which under
# pipefail can turn an upstream SIGPIPE into a bogus "model absent".
models="$(juju models --format json 2>/dev/null \
  | jq -r '.models[]?.name' 2>/dev/null \
  | sed 's#.*/##' || true)"
if ! grep -qxF -- "$MODEL" <<<"$models"; then
  echo "NOTE: model '$MODEL' not present -- run 'juju add-model $MODEL' first"
  echo "Summary: precondition (model absent)"
  exit 2
fi

J="$(juju status -m "$MODEL" --format json 2>/dev/null || echo "")"
if [ -z "$J" ]; then
  fail "juju status -m $MODEL returned nothing (juju error?)"
  echo "Summary: HOLD (no status)"
  exit 1
fi

# --- 3.1a acceptance walk (identity-gated; helper exits 1 on any UNEXPECTED) -------
echo "=== 3.1a acceptance walk (only octavia + glance-simplestreams-sync may be non-active/idle) ==="
walk_rc=0
printf '%s' "$J" | python3 "$WALK" || walk_rc=$?
case "$walk_rc" in
  0) pass "settled -- only the expected post-deploy exceptions are non-active/idle" ;;
  1) fail "unexpected non-active/idle unit(s) above (marked XX)" ;;
  # Any other status means the helper could not evaluate the JSON at all; say so rather
  # than pointing the operator at XX markers that were never printed.
  *) fail "acceptance-walk helper could not evaluate juju status (exit $walk_rc)" ;;
esac
echo

# --- 3.1b haproxy backend-health sweep (D-045) -------------------------------------
echo "=== 3.1b haproxy backend-health sweep (juju status is BLIND to a DOWN backend) ==="
down_units=0
probe_errors=0
checked=0
# principal units only (haproxy runs on the API principals, not subordinates)
while IFS= read -r unit; do
  [ -n "$unit" ] || continue
  checked=$((checked + 1))
  rc=0
  # stdin MUST come from /dev/null: ssh otherwise drains the unit list feeding this loop.
  # It must be a redirection, not an argument -- juju ssh appends extra arguments to the
  # remote command, which would make the final grep read /dev/null instead of the pipe.
  out="$(juju ssh -m "$MODEL" "$unit" -- "$HAPROXY_PROBE" </dev/null)" || rc=$?
  if [ "$rc" -gt 1 ]; then
    # 0/1 are the remote grep's verdict; anything else means the probe itself did not run
    # cleanly, so the unit's health is unknown. Fail closed instead of counting it as UP.
    echo "  [$unit] probe failed (juju ssh exit $rc)" >&2
    probe_errors=$((probe_errors + 1))
    continue
  fi
  if [ -n "$out" ]; then
    printf '%s\n' "$out" | sed "s|^|  [$unit] DOWN: |"
    down_units=$((down_units + 1))
  fi
done < <(printf '%s' "$J" | jq -r '.applications[]?.units // {} | keys[]?')
if [ "$down_units" -gt 0 ]; then
  fail "$down_units unit(s) with a DOWN backend -- gated remediation: 'sudo haproxy -c -f /etc/haproxy/haproxy.cfg' then 'sudo systemctl reload haproxy' on each, re-run"
fi
if [ "$probe_errors" -gt 0 ]; then
  fail "$probe_errors unit(s) could not be probed (juju ssh error) -- backend health unknown, re-run"
fi
if [ "$down_units" -eq 0 ] && [ "$probe_errors" -eq 0 ]; then
  pass "all haproxy backends UP across $checked principal unit(s) (zero DOWN)"
fi
echo

# --- verdict -----------------------------------------------------------------------
if [ "$FATAL" -eq 0 ]; then
  echo "Summary: PROCEED -- cloud settled and all haproxy backends healthy (Step 3.1 clear)."
  echo "  Next: phase-03 Step 3.2 (build admin-openrc) -- gated, secret-handling."
  exit 0
else
  echo "Summary: HOLD -- $FATAL gate(s) failed. Resolve before Step 3.2."
  exit 1
fi
#!/usr/bin/env python3
# scripts/phase03_accept_walk.py
#
# Read `juju status --format json` on stdin; report every unit (subordinates included)
# that is NOT workload=active / juju=idle, and classify each as EXPECTED or UNEXPECTED.
#
# This is the phase-03 Step 3.1 acceptance gate: an IDENTITY check, not a bare count.
# The only post-deploy exceptions allowed are:
#   - octavia/*                   : workload=blocked, msg mentions 'configure-resources'
#   - glance-simplestreams-sync/* : workload in {unknown, waiting} (image-sync state)
# and only while the unit's agent is not itself in a fault state.
#
# Exit 0 if every non-active/idle unit is EXPECTED (gate clear); 1 if any UNEXPECTED;
# 2 if stdin is not usable juju-status JSON (unparseable, not an object, or it carries
# no applications -- an empty walk must never read as "settled"). Read-only. ASCII + LF.
import json
import sys

# Agent (juju-status) states that disqualify a unit from the exception list. A unit in
# one of these states can still show an allow-listed workload status, so the workload
# status alone is not trusted.
# NOTE(review): state names taken from Juju's unit-agent status set -- confirm against
# the controller version in use.
AGENT_FAULTS = ("error", "failed", "lost")


def expected(name, ws, msg, js=""):
    """Return True when a non-active/idle unit is an allowed post-deploy exception.

    name -- unit name, e.g. "octavia/0"
    ws   -- workload-status.current
    msg  -- workload-status.message
    js   -- juju-status.current; optional so existing three-argument callers keep
            working. A faulted agent is never an expected exception.
    """
    if js in AGENT_FAULTS:
        return False
    msg = msg or ""
    if name.startswith("octavia/"):
        return ws == "blocked" and "configure-resources" in msg
    if name.startswith("glance-simplestreams-sync/"):
        return ws in ("unknown", "waiting")
    return False


def walk(units, out):
    """Append (name, workload, agent, message) to `out` for every unit in `units`
    (recursing into subordinates) that is not workload=active and juju=idle."""
    if not isinstance(units, dict):
        return
    for name, u in units.items():
        if not isinstance(u, dict):
            # Malformed entry: report it with blank states (it will classify as
            # UNEXPECTED) rather than crash or skip it silently.
            u = {}
        workload = u.get("workload-status") or {}
        agent = u.get("juju-status") or {}
        ws = workload.get("current") or ""
        js = agent.get("current") or ""
        msg = workload.get("message") or ""
        if ws != "active" or js != "idle":
            out.append((name, ws, js, msg))
        walk(u.get("subordinates"), out)


def main():
    """Classify the juju status on stdin; return the process exit code (0, 1 or 2)."""
    try:
        d = json.load(sys.stdin)
    except Exception as e:  # noqa: BLE001 - any parse failure is a precondition fail
        print("FATAL: cannot parse juju status JSON: %s" % e, file=sys.stderr)
        return 2
    apps = d.get("applications") if isinstance(d, dict) else None
    if not isinstance(apps, dict) or not apps:
        # Fail closed: with nothing to walk the gate would otherwise report "clear".
        print("FATAL: juju status JSON carries no applications", file=sys.stderr)
        return 2
    bad = []
    for app in apps.values():
        if isinstance(app, dict):
            walk(app.get("units"), bad)
    unexpected = [b for b in bad if not expected(b[0], b[1], b[3], b[2])]
    print("Non-active/idle units: %d (expected: %d, UNEXPECTED: %d)"
          % (len(bad), len(bad) - len(unexpected), len(unexpected)))
    for name, ws, js, msg in bad:
        tag = "ok" if expected(name, ws, msg, js) else "XX"
        print("  [%s] %s: workload=%s juju=%s msg=%s" % (tag, name, ws, js, msg))
    return 1 if unexpected else 0


if __name__ == "__main__":
    sys.exit(main())
+import sys, json +argv = sys.argv[1:] +args = {}; prog = None; i = 0 +while i < len(argv): + a = argv[i] + if a in ("-r", "--raw-output", "-e", "--exit-status"): i += 1; continue + if a == "--arg": args[argv[i+1]] = argv[i+2]; i += 3; continue + if prog is None and not a.startswith("-"): prog = a; i += 1; continue + i += 1 +try: + data = json.load(sys.stdin) +except Exception: + sys.exit(0) + +def walk(x, out): + if isinstance(x, dict): + if "workload-status" in x: out.append(x) + for v in x.values(): walk(v, out) + elif isinstance(x, list): + for v in x: walk(v, out) + +def all_units(root): + out = [] + for app in (root.get("applications") or {}).values(): + walk(app.get("units") or {}, out) + return out + +def app_units(root, a): + app = (root.get("applications") or {}).get(a) or {} + return list((app.get("units") or {}).values()) + +def ws(u): return (u.get("workload-status") or {}) +def ags(u): return (u.get("juju-status") or {}) + +if prog and ".models[]?.name" in prog: + for m in (data.get("models") or []): + if m.get("name") is not None: print(m["name"]) + sys.exit(0) + +if prog and "mach_total=" in prog: + machines = data.get("machines") or {} + db = app_units(data, args.get("dbapp")) + vt = app_units(data, args.get("vaultapp")) + u = all_units(data) + c = lambda lst, p: sum(1 for x in lst if p(x)) + kv = { + "mach_total": len(machines), + "mach_started": c(list(machines.values()), lambda m: (m.get("juju-status") or {}).get("current") == "started"), + "db_units": len(db), + "db_online": c(db, lambda x: "ONLINE" in (ws(x).get("message") or "")), + "db_rw": c(db, lambda x: "Mode: R/W" in (ws(x).get("message") or "")), + "db_active": c(db, lambda x: ws(x).get("current") == "active"), + "v_units": len(vt), + "v_fresh": c(vt, lambda x: ws(x).get("current") == "blocked" and "needs to be initialized" in (ws(x).get("message") or "").lower()), + "we": c(u, lambda x: ws(x).get("current") == "error"), + "ae": c(u, lambda x: ags(x).get("current") == "error"), + 
"c_blocked": c(u, lambda x: ws(x).get("current") == "blocked"), + "c_waiting": c(u, lambda x: ws(x).get("current") == "waiting"), + "c_active": c(u, lambda x: ws(x).get("current") == "active"), + "c_unknown": c(u, lambda x: ws(x).get("current") == "unknown"), + "c_total": len(u), + } + print("\n".join(f"{k}={v}" for k, v in kv.items())) + sys.exit(0) + +# cosmetic display programs -> no output (does not affect the verdict) +sys.exit(0) diff --git a/tests/phase-02/fakebin/juju b/tests/phase-02/fakebin/juju new file mode 100644 index 0000000..16350d1 --- /dev/null +++ b/tests/phase-02/fakebin/juju @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# fake juju for behavior-testing phase-02-vault-preflight.sh +sub="${1:-}" +case "$sub" in + whoami) + echo "Controller: juju-controller" + echo "Model: ${FAKE_MODEL:-openstack}" + echo "User: jessea123" ;; + models) + printf '{"models":[{"name":"admin/%s"}]}\n' "${FAKE_MODEL:-openstack}" ;; + status) + cat "${FIXTURE:?FIXTURE env not set}" ;; + *) + echo "fake juju: unhandled subcommand: $sub" >&2; exit 1 ;; +esac diff --git a/tests/phase-02/make_fixtures.py b/tests/phase-02/make_fixtures.py index ecb22e4..06707d0 100644 --- a/tests/phase-02/make_fixtures.py +++ b/tests/phase-02/make_fixtures.py @@ -9,7 +9,7 @@ def unit(cur, msg, agent="idle", subs=None): u = {"workload-status": {"current": cur, "message": msg}, - "agent-status": {"current": agent}} + "juju-status": {"current": agent}} if subs: u["subordinates"] = subs return u @@ -68,7 +68,7 @@ # FAIL E: a hook failure (agent-status error) on an otherwise-maintenance unit f = copy.deepcopy(base) f["applications"]["nova-compute"]["units"]["nova-compute/0"]["workload-status"] = {"current": "maintenance", "message": "installing"} -f["applications"]["nova-compute"]["units"]["nova-compute/0"]["agent-status"] = {"current": "error", "message": 'hook failed: "install"'} +f["applications"]["nova-compute"]["units"]["nova-compute/0"]["juju-status"] = {"current": "error", "message": 'hook 
failed: "install"'} dump("fail-hook-error.json", f) # FAIL B: a machine not started diff --git a/tests/phase-03/fakebin/jq b/tests/phase-03/fakebin/jq new file mode 100644 index 0000000..25edbbf --- /dev/null +++ b/tests/phase-03/fakebin/jq @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# fake jq: implements only the two programs phase-03-core-verify.sh uses. +import sys, json +argv = sys.argv[1:]; prog = None; i = 0 +while i < len(argv): + a = argv[i] + if a in ("-r", "--raw-output"): i += 1; continue + if a == "--arg": i += 3; continue + if prog is None and not a.startswith("-"): prog = a; i += 1; continue + i += 1 +try: data = json.load(sys.stdin) +except Exception: sys.exit(0) +if prog and ".models[]?.name" in prog: + for m in (data.get("models") or []): + if m.get("name") is not None: print(m["name"]) + sys.exit(0) +if prog and "keys[]?" in prog and "units" in prog: + for app in (data.get("applications") or {}).values(): + for k in (app.get("units") or {}).keys(): print(k) + sys.exit(0) +sys.exit(0) diff --git a/tests/phase-03/fakebin/juju b/tests/phase-03/fakebin/juju new file mode 100644 index 0000000..2dc2972 --- /dev/null +++ b/tests/phase-03/fakebin/juju @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# fake juju: status emits a fixture; ssh emits a ,DOWN, line iff the target unit == $HAPROXY_DOWN. +sub="${1:-}" +case "$sub" in + whoami) echo "Controller: juju-controller"; echo "Model: ${FAKE_MODEL:-openstack}"; echo "User: jessea123" ;; + models) printf '{"models":[{"name":"admin/%s"}]}\n' "${FAKE_MODEL:-openstack}" ;; + status) cat "${FIXTURE:?FIXTURE env not set}" ;; + ssh) + shift; unit="" + while [ $# -gt 0 ]; do + case "$1" in + -m) shift 2; continue ;; + --) shift; break ;; + -*) shift; continue ;; + *) unit="$1"; shift ;; + esac + done + if [ -n "${HAPROXY_DOWN:-}" ] && [ "$unit" = "$HAPROXY_DOWN" ]; then + echo "nova-api,nova-api-2-10-12-12-109,0,0,,0,,,,2,DOWN,1/1,..." 
+ fi + ;; + *) echo "fake juju: unhandled subcommand: $sub" >&2; exit 1 ;; +esac diff --git a/tests/phase-03/make_fixtures.py b/tests/phase-03/make_fixtures.py new file mode 100644 index 0000000..10f3fcb --- /dev/null +++ b/tests/phase-03/make_fixtures.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +# tests/phase-03/make_fixtures.py [OUTDIR] -- juju status JSON fixtures for phase-03 3.1. +import json, copy, os, sys +OUTDIR = sys.argv[1] if len(sys.argv) > 1 else "." +def unit(cur, msg, agent="idle", subs=None): + u = {"workload-status": {"current": cur, "message": msg}, "juju-status": {"current": agent}} + if subs: u["subordinates"] = subs + return u +base = {"applications": { + "keystone": {"units": {"keystone/0": unit("active", "PO (broken): Unit is ready", + subs={"keystone-hacluster/0": unit("active", "Unit is ready and clustered"), + "keystone-mysql-router/0": unit("active", "Unit is ready")})}}, + "glance": {"units": {"glance/0": unit("active", "Unit is ready")}}, + "nova-cloud-controller": {"units": {"nova-cloud-controller/0": unit("active", "Unit is ready")}}, + "neutron-api": {"units": {"neutron-api/0": unit("active", "Unit is ready")}}, + "octavia": {"units": {"octavia/0": unit("blocked", + "Awaiting end-user execution of `configure-resources` action to create required resources")}}, + "glance-simplestreams-sync": {"units": {"glance-simplestreams-sync/0": unit("unknown", "")}}, + "vault": {"units": {"vault/0": unit("active", "Unit is ready (active: true, mlock: disabled)")}}, +}} +def dump(n, o): + p = os.path.join(OUTDIR, n); json.dump(o, open(p, "w"), indent=2); print(" wrote", p) +dump("pass.json", base) +f = copy.deepcopy(base) # an UNEXPECTED blocked unit must fail the identity gate +f["applications"]["neutron-api"]["units"]["neutron-api/0"]["workload-status"] = {"current": "blocked", "message": "some real problem"} +dump("fail-accept.json", f) diff --git a/tests/phase-03/run-tests.sh b/tests/phase-03/run-tests.sh new file mode 100644 index 
#!/usr/bin/env bash
# tests/phase-03/run-tests.sh -- offline regression for phase-03 Step 3.1 verify.
# Unit-tests phase03_accept_walk.py against fixtures; behavior-tests phase-03-core-verify.sh
# with juju+jq shims (incl an injected haproxy-DOWN case and the model-absent precondition).
# No live infra. python3 + bash only.
#
# Exit: 0 when every case matches its expected exit status, 1 otherwise.
set -euo pipefail
IFS=$'\n\t'
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPTS="$(cd "$HERE/../../scripts" && pwd)"
TARGET="$SCRIPTS/phase-03-core-verify.sh"
WALK="$SCRIPTS/phase03_accept_walk.py"
BIN="$HERE/fakebin"
command -v python3 >/dev/null 2>&1 || { echo "FAIL: python3 required" >&2; exit 1; }
if [ ! -f "$TARGET" ] || [ ! -f "$WALK" ]; then
  echo "FAIL: target/helper missing" >&2
  exit 1
fi
# Best effort: the shims may already be executable, or the checkout may be read-only.
chmod +x "$BIN/juju" "$BIN/jq" 2>/dev/null || true
WORK="$(mktemp -d)"
trap 'rm -rf "$WORK"' EXIT
python3 "$HERE/make_fixtures.py" "$WORK" >/dev/null
rc_all=0

echo "=== unit: phase03_accept_walk.py ==="
# walk_case FIXTURE WANT_EXIT LABEL -- feed one fixture to the helper, compare exit status.
walk_case() {
  local fix="$1" want="$2" label="$3"
  local rc=0
  python3 "$WALK" <"$WORK/$fix" >/dev/null 2>&1 || rc=$?
  if [ "$rc" -eq "$want" ]; then
    printf '  [OK] %-42s exit %s\n' "$label" "$rc"
  else
    printf '  [XX] %-42s exit %s (WANT %s)\n' "$label" "$rc" "$want"
    rc_all=1
  fi
}
walk_case pass.json        0 "accept-walk pass (octavia+glance-ss only)"
walk_case fail-accept.json 1 "accept-walk fail (unexpected blocked)"

echo "=== integration: phase-03-core-verify.sh (shims) ==="
# run FIXTURE WANT_EXIT LABEL [ENV=VALUE...] -- run the gate script against the shims with
# the fixture as juju status; any extra ENV=VALUE words are handed to env(1) for the shims.
run() {
  local fix="$1" want="$2" label="$3"
  shift 3
  local rc=0 v
  PATH="$BIN:$PATH" FIXTURE="$WORK/$fix" env "$@" bash "$TARGET" openstack >"$WORK/out" 2>&1 || rc=$?
  v="$(grep -E '^Summary:' "$WORK/out" | head -1 || true)"
  if [ "$rc" -eq "$want" ]; then
    printf '  [OK] %-42s exit %s | %s\n' "$label" "$rc" "$v"
  else
    printf '  [XX] %-42s exit %s (WANT %s)\n' "$label" "$rc" "$want"
    sed 's/^/        /' "$WORK/out"
    rc_all=1
  fi
}
run pass.json        0 "settled + haproxy clean"
run fail-accept.json 1 "unexpected non-active/idle unit"
run pass.json        1 "haproxy backend DOWN"   HAPROXY_DOWN=nova-cloud-controller/0
# The shim lists only FAKE_MODEL, so asking the gate for 'openstack' must hit exit 2.
run pass.json        2 "model absent (precondition)" FAKE_MODEL=other

if [ "$rc_all" -eq 0 ]; then
  echo "ALL PASS"
else
  echo "FAILURES ABOVE"
fi
exit "$rc_all"