diff --git a/docs/v1-redeploy-changelog.md b/docs/v1-redeploy-changelog.md index f6a346c..b05e535 100644 --- a/docs/v1-redeploy-changelog.md +++ b/docs/v1-redeploy-changelog.md @@ -1454,3 +1454,55 @@ (mock suite exists from the 2026-07-02 session). NOTE (tooling gate): scripts/run-tests-all.sh (committed 2026-07-03) now runs every tests/*/run-tests.sh; both harnesses in this package were verified under it pre-commit. + +### 2026-07-03 (addendum 3) -- DOCFIX-082/083 hardening; acme offboarded live; handoff reconciliation + +ACME OFFBOARDED (first live run of tenant-offboard.sh, pre-hardening build): audit -> typed-gate +apply, exit 0. CREATE_FAILED cluster 76aea330 deleted by magnum in ~30s (the delete path through +the now-healthy barbican); Phase B printed no residual-trust delete, i.e. magnum's cluster delete +cleaned trust 8592ee35 itself; L3 + both users + project + domain removed; four legacy loose cred +files flagged for manual handling. Post-apply verification (domain/trust/orphan-trustee/beta +untouched) recorded separately when run. + +DOCFIX-082 -- tenant-offboard.sh error-regime hardening (handoff H-1). + WHAT: set -uo pipefail; all mutations via run() (capture-then-test, loud FAILED lines, + per-phase counters); sweep failures now exit 22 (continue-and-report regime, stated in + header); identity failures exit 23 with count; is_id tightened to ^[0-9a-f]{32}$; + capture-then-grep converted to here-strings (SIGPIPE discipline under pipefail). + WHY: the pre-082 build lost mutation exit codes in display pipelines -- concretely, Phase E's + `... | sed ... || FAILED=1` bound the || to sed, making the failure flag DEAD CODE. The acme + run above was protected only by the domain-gone assertion (sufficient there: disabled-domain + delete cascades users/projects in keystone; but sweep-phase failures could strand resources + silently). Found on re-inspection prompted by handoff H-1; the register was right. + ACCEPT: tests/tenant-offboard 7/7 incl. 
new sweep-failure-injection case (exit 22 + FAILED + line asserted). REVERT: git checkout of scripts/tenant-offboard.sh + tests/tenant-offboard/run-tests.sh at the parent of this commit. + +DOCFIX-083 -- vault-kv-inner-probe.sh: credentials off argv (handoff H-2). + WHAT: AppRole login body now fed to curl via stdin (--data @-); nothing secret-bearing on + curl argv. WHY: `-d "$BODY"` exposed role_id+secret_id in /proc/*/cmdline (ps) on the unit + for the request duration; 60s token TTL bounded it but the house rule is absolute. + ACCEPT: tests/vault-kv-health 8/8 incl. new source-level argv-hygiene assertion on the login + invocation. REVERT: git checkout of scripts/vault-kv-inner-probe.sh + + tests/vault-kv-health/run-tests.sh at the parent of this commit. + +HANDOFF H-3 (identifiers): the 2026-07-02/03 script deliveries are recorded in addenda 2-3 of +this file; vault-kv-health implements D-068 item 3 (cross-linked there by the D-068 text); +DOCFIX-082/083 above are their follow-up identifiers with reverts. Next-free after this +commit: D-071, DOCFIX-084, BUNDLEFIX-009. + +HANDOFF D-3 (D-042 status): CONFIRMED RESOLVED BY PIN. The magnum-capi-helm 1.4.0 pin (D-042) +was validated live 2026-07-03: tenant-created beta-cluster reports health_status=HEALTHY (the +1.3.x false-negative would have shown UNHEALTHY/None persistently). The seven-stage contract-ref +graft runbook staged for 1.3.x is OBSOLETE for v1 -- recommend closing D-042 with a +resolved-by-upgrade note rather than executing the graft. + +OFFBOARD v2 GAP (logged, not built): if magnum's cluster-delete cleanup is ever incomplete, an +orphaned PER-CLUSTER TRUSTEE USER (`<cluster_uuid>_<project_id>`) survives in the MAGNUM domain -- +outside tenant-offboard.sh's user sweep, which covers only the client domain. Post-apply verify +blocks check for this orphan; a --sweep-magnum-orphans mode is the v2 item. 
+ +HARNESS LESSON (this session): assertion selectors need the same rigor as production greps -- +two consecutive mis-anchored source assertions (first-line-with-curl; regex that could not +cross || at line end) were caught by running the harness before delivery. Parameter expansion +beats regex for line-splitting assertions. diff --git a/scripts/tenant-offboard.sh b/scripts/tenant-offboard.sh index 1674151..a12e407 100644 --- a/scripts/tenant-offboard.sh +++ b/scripts/tenant-offboard.sh @@ -8,7 +8,23 @@ # users (app creds die with user) -> projects -> domain disable+delete -> local cred dir retired. # Exit: 0 ok | 20 precondition/blocklist | 21 cluster delete failed | 22 resource sweep failed # | 23 identity teardown failed | 24 confirmation refused/unavailable -set -u +# ERROR REGIME (DOCFIX-082): set -uo pipefail; every mutation goes through run() which +# CAPTURES output then TESTS exit status (no pipeline on the mutating command itself). +# Sweep phases are best-effort continue-and-report: failures are printed, counted, and the +# script exits nonzero at the end (22 sweep / 23 identity) rather than stranding silently. 
+set -uo pipefail
+SWEEP_FAIL=0; ID_FAIL=0
+run(){ # run <counter-var> <cmd...>: capture-then-test; print output indented; count failures loudly; always returns 0
+  local _c="$1"; shift; local out   # _c holds the NAME of the global failure counter to bump
+  if out=$("$@" </dev/null 2>&1); then   # stdin detached: a wrapped command must never eat a caller's `while read` input
+    [ -n "$out" ] && printf '%s\n' "$out" | sed 's/^/ /'
+  else
+    printf '%s\n' "$out" | sed 's/^/ /'
+    echo " ^ FAILED: $*"
+    printf -v "$_c" '%d' "$(( ${!_c:-0} + 1 ))"   # indirect increment of the named counter, no eval
+  fi
+  return 0
+}
 CLIENT="${1:-}"; MODE="${2:---audit}"
 BLOCKLIST="default admin_domain service_domain magnum capi heat"
 [ -n "$CLIENT" ] || { echo "usage: tenant-offboard.sh [--audit|--apply]"; exit 20; }
@@ -17,7 +33,7 @@
 for b in $BLOCKLIST; do [ "$CL" = "$b" ] && { echo "REFUSED: '$CLIENT' is a protected domain"; exit 20; }; done
 admin_env(){ for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done; source "$HOME/admin-openrc"; }
-is_id(){ case "$1" in *[!0-9a-f]*|'') return 1;; *) return 0;; esac; }
+is_id(){ [[ "${1:-}" =~ ^[0-9a-f]{32}$ ]]; }   # true only for a bare 32-char lowercase-hex id
 admin_env
 DOM=$(openstack domain show "$CLIENT" -f value -c id </dev/null 2>&1) || true
@@ -97,7 +113,7 @@
   LEFT=$( (tenant_env; openstack coe cluster list -f value -c name -c status </dev/null 2>&1) || true )
   printf '%s [%02d] remaining: %s\n' "$(date +%T)" "$i" "${LEFT:-}"
   [ -z "$LEFT" ] && break
-  printf '%s\n' "$LEFT" | grep -q 'DELETE_FAILED' && { echo "FATAL: DELETE_FAILED -- see appendix-A stuck-delete"; exit 21; }
+  grep -q 'DELETE_FAILED' <<<"$LEFT" && { echo "FATAL: DELETE_FAILED -- see appendix-A stuck-delete"; exit 21; }
   sleep 20
 done
 LEFT=$( (tenant_env; openstack coe cluster list -f value -c name </dev/null 2>&1) || true )
@@ -109,45 +125,44 @@
-printf '%s\n' "${TRUSTS:-}" | while read -r TID REST; do
+while read -r TID REST; do   # fed by here-string below: loop runs in the MAIN shell so run() can bump SWEEP_FAIL
   [ -n "$TID" ] && [ "$TID" != "" ] || continue
   T2=$(openstack trust show "$TID" -f value -c id </dev/null 2>&1) || true
-  [ "$T2" = "$TID" ] && { echo " deleting residual trust $TID"; openstack trust delete "$TID" </dev/null 2>&1 | sed 's/^/ /'; }
+  [ "$T2" = "$TID" ] && { echo " deleting residual trust $TID"; run SWEEP_FAIL openstack trust delete "$TID"; }
-done
+done <<<"${TRUSTS:-}"
 # ---- Phase C+D: per-project resource sweep + L3 unwind ----
 for P in $(printf '%s\n' "$PROJS" | awk 
'{print $1}'); do for LB in $(openstack loadbalancer list --project "$P" -f value -c id &1); do is_id "${LB//-/}" 2>/dev/null || true - echo " LB delete --cascade $LB"; openstack loadbalancer delete --cascade "$LB" &1 | sed 's/^/ /' + echo " LB delete --cascade $LB"; run SWEEP_FAIL openstack loadbalancer delete --cascade "$LB" done for i in $(seq 1 20); do N=$(openstack loadbalancer list --project "$P" -f value -c id &1 | grep -c . || true) [ "$N" = 0 ] && break; echo " waiting LBs gone ($N left)"; sleep 15 done for F in $(openstack floating ip list --project "$P" -f value -c ID &1); do - echo " FIP delete $F"; openstack floating ip delete "$F" &1 | sed 's/^/ /'; done + echo " FIP delete $F"; run SWEEP_FAIL openstack floating ip delete "$F"; done for S in $(openstack server list --project "$P" -f value -c ID &1); do - echo " server delete $S"; openstack server delete "$S" &1 | sed 's/^/ /'; done + echo " server delete $S"; run SWEEP_FAIL openstack server delete "$S"; done for R in $(openstack router list --project "$P" -f value -c ID &1); do for SN in $(openstack subnet list --project "$P" -f value -c ID &1); do openstack router remove subnet "$R" "$SN" &1 || true; done openstack router unset --external-gateway "$R" &1 || true - echo " router delete $R"; openstack router delete "$R" &1 | sed 's/^/ /'; done + echo " router delete $R"; run SWEEP_FAIL openstack router delete "$R"; done for SN in $(openstack subnet list --project "$P" -f value -c ID &1); do - echo " subnet delete $SN"; openstack subnet delete "$SN" &1 | sed 's/^/ /'; done + echo " subnet delete $SN"; run SWEEP_FAIL openstack subnet delete "$SN"; done for NW in $(openstack network list --project "$P" -f value -c ID &1); do - echo " network delete $NW"; openstack network delete "$NW" &1 | sed 's/^/ /'; done + echo " network delete $NW"; run SWEEP_FAIL openstack network delete "$NW"; done done # ---- Phase E: identities ---- -FAILED=0 for U in $(printf '%s\n' "$USERS" | awk '{print $1}'); do - echo " user 
delete $U"; openstack user delete "$U" &1 | sed 's/^/ /' || FAILED=1; done + echo " user delete $U"; run ID_FAIL openstack user delete "$U"; done for P in $(printf '%s\n' "$PROJS" | awk '{print $1}'); do - echo " project delete $P"; openstack project delete "$P" &1 | sed 's/^/ /' || FAILED=1; done -openstack domain set --disable "$DOM" &1 | sed 's/^/ /' -echo " domain delete $DOM"; openstack domain delete "$DOM" &1 | sed 's/^/ /' || FAILED=1 + echo " project delete $P"; run ID_FAIL openstack project delete "$P"; done +run ID_FAIL openstack domain set --disable "$DOM" +echo " domain delete $DOM"; run ID_FAIL openstack domain delete "$DOM" D2=$(openstack domain show "$CLIENT" -f value -c id &1) || true is_id "$D2" && { echo "FATAL: domain still present"; exit 23; } -[ "$FAILED" = 0 ] || { echo "FATAL: identity teardown had failures (domain gone; review output)"; exit 23; } +[ "$ID_FAIL" = 0 ] || { echo "FATAL: identity teardown had $ID_FAIL failures (domain gone; review output)"; exit 23; } # ---- Phase F: retire local creds ---- if [ -d "$HOME/tenant-${CLIENT}" ]; then @@ -155,5 +170,9 @@ mv "$HOME/tenant-${CLIENT}" "$DST" && chmod 700 "$DST" && echo " local creds retired -> $DST (shred manually when record no longer needed)" fi ls "$HOME/${CLIENT}"-*.txt "$HOME/${CLIENT}"-*.pem 2>/dev/null | sed 's/^/ LEGACY loose file (handle manually): /' +if [ "$SWEEP_FAIL" -gt 0 ]; then + echo "OFFBOARD FINISHED WITH $SWEEP_FAIL SWEEP FAILURES: $CLIENT (identities gone; orphaned resources above)" + exit 22 +fi echo "OFFBOARD COMPLETE: $CLIENT" exit 0 diff --git a/scripts/vault-kv-inner-probe.sh b/scripts/vault-kv-inner-probe.sh index b71b1c4..9e297e7 100644 --- a/scripts/vault-kv-inner-probe.sh +++ b/scripts/vault-kv-inner-probe.sh @@ -16,9 +16,10 @@ HOST=${URL#*//}; HOST=${HOST%%:*} echo "route: $(ip -o route get "$HOST" 2>&1 | head -1)" command -v curl >/dev/null || { echo "PROBE-FAIL: curl absent"; exit 1; } +# DOCFIX-083: credentials NEVER transit argv (visible in ps); body 
fed via stdin. BODY=$(printf '{"role_id":"%s","secret_id":"%s"}' "$RID" "$SID") RESP=/root/.vaultkv-login-resp.json -HTTP=$(curl -s -o "$RESP" -w '%{http_code}' --max-time 10 -X POST -d "$BODY" "$URL/v1/auth/approle/login" 2>&1 || true) +HTTP=$(printf '%s' "$BODY" | curl -s -o "$RESP" -w '%{http_code}' --max-time 10 -X POST --data @- "$URL/v1/auth/approle/login" 2>&1 || true) echo "login_http=$HTTP" RC=1 if [ "$HTTP" = "200" ]; then echo "PROBE-PASS"; RC=0 diff --git a/tests/tenant-offboard/run-tests.sh b/tests/tenant-offboard/run-tests.sh index cff2102..f777993 100644 --- a/tests/tenant-offboard/run-tests.sh +++ b/tests/tenant-offboard/run-tests.sh @@ -43,7 +43,9 @@ "subnet list"*) [ -f "$ST/sndel" ] || echo "sn1 acme-subnet";; "subnet delete"*) mut "$*"; touch "$ST/sndel";; "network list"*) [ -f "$ST/nwdel" ] || echo "nw1 acme-net";; - "network delete"*) mut "$*"; touch "$ST/nwdel";; + "network delete"*) + if [ "${MOCK_SCEN:-}" = sweepfail ]; then echo "Conflict: one or more ports still in use (HTTP 409)"; exit 1; fi + mut "$*"; touch "$ST/nwdel";; "user delete"*|"project delete"*) mut "$*";; "domain set --disable"*) mut "$*";; "domain delete"*) mut "$*"; touch "$ST/domgone";; @@ -81,4 +83,10 @@ rm -rf "$W/state"; HOME="$W" PATH="$W/bin:$PATH" python3 "$W/ptyrun.py" bash "$SCRIPT" acme --apply WRONG >/dev/null 2>&1; chk tty-mismatch $? 24 rm -rf "$W/state"; HOME="$W" PATH="$W/bin:$PATH" python3 "$W/ptyrun.py" bash "$SCRIPT" acme --apply acme >/dev/null 2>&1; chk apply-happy $? 0 rm -rf "$W/state"; HOME="$W" MOCK_SCEN=delfail PATH="$W/bin:$PATH" python3 "$W/ptyrun.py" bash "$SCRIPT" acme --apply acme >/dev/null 2>&1; chk delete-failed $? 21 -echo; [ "$F" = 0 ] && { echo "ALL PASS ($P/6)"; exit 0; } || { echo "FAILURES: $F"; exit 1; } +# DOCFIX-082 failure injection: a mid-sweep resource-delete failure must surface as exit 22, +# never as silent success (the pre-082 script lost mutation exit codes in display pipelines). 
+rm -rf "$W/state"; HOME="$W" MOCK_SCEN=sweepfail PATH="$W/bin:$PATH" python3 "$W/ptyrun.py" bash "$SCRIPT" acme --apply acme > "$W/sweepfail.out" 2>&1; RC=$? +if [ "$RC" = 22 ] && grep -q '\^ FAILED: openstack network delete' "$W/sweepfail.out"; then + echo "PASS: sweep-failure-injection (exit 22, failure line printed)"; P=$((P+1)) +else echo "FAIL: sweep-failure-injection (exit $RC, want 22 + FAILED line)"; F=$((F+1)); fi +echo; [ "$F" = 0 ] && { echo "ALL PASS ($P/7)"; exit 0; } || { echo "FAILURES: $F"; exit 1; } diff --git a/tests/vault-kv-health/run-tests.sh b/tests/vault-kv-health/run-tests.sh index 25e2580..963325d 100644 --- a/tests/vault-kv-health/run-tests.sh +++ b/tests/vault-kv-health/run-tests.sh @@ -35,4 +35,12 @@ MOCK_SCEN="$s" PATH="$W/bin:$PATH" bash "$W/scripts/vault-kv-health.sh" >/dev/null 2>&1; RC=$? if [ "$RC" = "$e" ]; then echo "PASS: $s (exit $RC)"; P=$((P+1)); else echo "FAIL: $s (exit $RC, want $e)"; F=$((F+1)); fi done -echo; [ "$F" = 0 ] && { echo "ALL PASS ($P/7)"; exit 0; } || { echo "FAILURES: $F"; exit 1; } +# DOCFIX-083 argv hygiene: the probe's curl invocation must read the body from stdin +# (--data @-) and must not carry the secret-bearing var or -d on its argv. +LOGIN_LINE=$(grep 'HTTP=' "$W/scripts/vault-kv-inner-probe.sh" | head -1) +CURLPART="${LOGIN_LINE#*curl }" +if [ "$CURLPART" != "$LOGIN_LINE" ] && grep -q -- '--data @-' <<<"$CURLPART" \ + && ! grep -qE 'BODY|[[:space:]]-d[[:space:]]' <<<"$CURLPART"; then + echo "PASS: argv-hygiene (curl reads body from stdin, no secret var on argv)"; P=$((P+1)) +else echo "FAIL: argv-hygiene (curl part: $CURLPART)"; F=$((F+1)); fi +echo; [ "$F" = 0 ] && { echo "ALL PASS ($P/8)"; exit 0; } || { echo "FAILURES: $F"; exit 1; }