diff --git a/docs/v1-redeploy-changelog.md b/docs/v1-redeploy-changelog.md index 9ed6028..16fc7bc 100644 --- a/docs/v1-redeploy-changelog.md +++ b/docs/v1-redeploy-changelog.md @@ -907,5 +907,27 @@ create+poll+allocate; idempotent FIP reuse; VM-ERROR abort; never-ACTIVE abort; tenant-unresolvable abort; missing-auth exit 2) + net.env both-keys-0600 assertion. bash -n + shellcheck clean; ASCII+0CR. +### Phase-06 6.2 fix -- re-scope to capi-mgmt (DOCFIX-055) + ERROR-VM handling (2026-06-27) +First live 6.2 run FAILED: capi-mgmt-v2 -> scheduler ERROR, fault "Security group not found". +Root cause (measured): admin-openrc scopes the token to the ADMIN project (a11cf69...); `server +create` has no --project flag, so the VM + port landed in admin, where capi-mgmt-sg / capi-mgmt-net +(owned by project capi-mgmt 7072d66e...) are not visible. The do-doc 6.2 block never re-scopes. + +DOCFIX-055 -- do-doc Step 6.2 must re-scope to the capi-mgmt project (the resources are owned there; + 6.0/6.1 create them with --project capi-mgmt). Fix the do-doc 6.2 block before Roosevelt. + +scripts/phase-06-mgmt-vm.sh revised: + - DOCFIX-055: export OS_PROJECT_NAME=capi-mgmt OS_PROJECT_DOMAIN_NAME=capi before the server/FIP + calls (admin USER retains the keypair; image/flavor are --public; FIP allocates via the + capi-mgmt-router gateway). Re-scope validated up front (exit 2 if the D-039 member grant absent). + - Pre-create VISIBILITY preconditions: keypair + sg + network must resolve in the capi-mgmt scope + BEFORE create -> turns the old post-hoc VM ERROR into an upfront exit 2. + - Existing ERROR-state VM: clear abort + delete instruction by default; RECREATE_ON_ERROR=1 to + delete+recreate in-scope. NOTE the failed instance is in the ADMIN project (different scope), so + the fixed script will NOT see it -- it must be deleted manually under admin-openrc first. + tests/phase-06-mgmt-vm/ expanded to 12 integration + 5 resolver-unit cases ALL PASS (adds: re-scope + denied, keypair/sg/network not-visible, existing-ERROR abort, existing-ERROR+RECREATE). + shellcheck clean; ASCII+0CR. + ### Next-free numbers -Design decision: D-057. Doc fix: DOCFIX-055. +Design decision: D-057. Doc fix: DOCFIX-056. diff --git a/scripts/phase-06-mgmt-vm.sh b/scripts/phase-06-mgmt-vm.sh index 1420bab..bcfc71e 100644 --- a/scripts/phase-06-mgmt-vm.sh +++ b/scripts/phase-06-mgmt-vm.sh @@ -7,11 +7,19 @@ # (the single source for 6.3-6.6 + phase-07; NEITHER value is deterministic per rebuild -- # DOCFIX-038, never hardcode). D-056 flagged-mutation script; human-gated by invocation. # -# DOCFIX-054: FIP attach is IDEMPOTENT here -- if the VM already has a floating IP (looked up -# via its neutron port), it is REUSED; a new FIP is allocated only when none is attached. The -# do-doc block allocated unconditionally (a re-run would leak a second FIP). +# DOCFIX-055: the VM + its FIP must be created IN the capi-mgmt project. admin-openrc scopes the +# token to the ADMIN project, and `server create` has no --project flag (it lands the instance in +# the token's project), so without re-scoping the create cannot see capi-mgmt-sg / capi-mgmt-net +# ("Security group ... not found" -> scheduler ERROR). This script re-scopes to capi-mgmt (the +# D-039 member grant lets admin do so) and PRE-VERIFIES keypair/sg/net are visible in that scope, +# turning the old post-hoc VM ERROR into an upfront precondition. The do-doc 6.2 block omits the +# re-scope -- fix it before Roosevelt. # -# Tunables via env: VM PROJ EXT NET SG KEYPAIR FLAVOR IMAGE ENVFILE POLL_TRIES POLL_SLEEP +# DOCFIX-054: FIP attach is IDEMPOTENT -- reuse the VM's attached FIP (via its neutron port); +# allocate only when none is present (the do-doc allocated unconditionally -> a re-run would leak). +# +# Tunables via env: VM PROJ PROJ_DOMAIN EXT NET SG KEYPAIR FLAVOR IMAGE ENVFILE POLL_TRIES POLL_SLEEP +# RECREATE_ON_ERROR (1 = delete + recreate an existing ERROR-state VM; default 0) # Requires: jumphost; admin-openrc; openstack; jq; python3; scripts/resolve_tenant_ip.py. # Usage: source ~/admin-openrc && bash scripts/phase-06-mgmt-vm.sh # Exit: 0 VM ACTIVE + FIP attached + env persisted | 1 gate/resolve fail | 2 precondition @@ -24,6 +32,7 @@ VM="${VM:-capi-mgmt-v2}" PROJ="${PROJ:-capi-mgmt}" +PROJ_DOMAIN="${PROJ_DOMAIN:-capi}" EXT="${EXT:-provider-ext}" NET="${NET:-capi-mgmt-net}" SG="${SG:-capi-mgmt-sg}" @@ -40,11 +49,38 @@ . "$HOME/admin-openrc" fi [ -n "${OS_AUTH_URL:-}" ] || { echo "FAIL: OS_AUTH_URL unset and no ~/admin-openrc" >&2; exit 2; } -openstack token issue >/dev/null 2>&1 || { echo "FAIL: no scoped token (admin-openrc)" >&2; exit 2; } -# 1. VM verify-or-create +# DOCFIX-055: re-scope to the capi-mgmt project (same admin USER; keypair stays user-visible, +# image/flavor are --public, the FIP allocates through capi-mgmt-router's gateway on provider-ext). +export OS_PROJECT_NAME="$PROJ" OS_PROJECT_DOMAIN_NAME="$PROJ_DOMAIN" +unset OS_PROJECT_ID 2>/dev/null || true +openstack token issue >/dev/null 2>&1 || { echo "FAIL: cannot scope a token to project $PROJ (domain $PROJ_DOMAIN) -- is the D-039 member grant present?" >&2; exit 2; } +echo "[OK] scoped to project $PROJ (domain $PROJ_DOMAIN)" + +# Pre-create visibility (turns 'not found at create' into an upfront precondition -- DOCFIX-055) +openstack keypair show "$KEYPAIR" >/dev/null 2>&1 || { echo "FAIL: keypair $KEYPAIR not visible in scope $PROJ" >&2; exit 2; } +openstack security group show "$SG" >/dev/null 2>&1 || { echo "FAIL: security group $SG not visible in scope $PROJ" >&2; exit 2; } +openstack network show "$NET" -f value -c id >/dev/null 2>&1 || { echo "FAIL: network $NET not visible in scope $PROJ" >&2; exit 2; } +echo "[OK] keypair + security group + network visible in $PROJ" + +# 1. VM verify-or-create (handle a stale ERROR-state instance explicitly) if openstack server show "$VM" -f value -c id >/dev/null 2>&1; then - echo "[SKIP] server $VM exists" + CUR=$(openstack server show "$VM" -f value -c status 2>/dev/null || echo '?') + if [ "$CUR" = ERROR ]; then + if [ "${RECREATE_ON_ERROR:-0}" = 1 ]; then + echo "[..] $VM is in ERROR; deleting + recreating (RECREATE_ON_ERROR=1)" + openstack server delete "$VM" --wait + openstack server create --image "$IMAGE" --flavor "$FLAVOR" \ + --network "$NET" --security-group "$SG" --key-name "$KEYPAIR" "$VM" >/dev/null + echo "[OK] $VM recreated" + else + echo "GATE FAIL: $VM exists in ERROR state. Delete it, then re-run (or set RECREATE_ON_ERROR=1):" >&2 + echo " openstack server delete $VM" >&2 + exit 1 + fi + else + echo "[SKIP] server $VM exists (status=$CUR)" + fi else echo "[..] creating $VM ($FLAVOR / $IMAGE on $NET)" openstack server create --image "$IMAGE" --flavor "$FLAVOR" \ diff --git a/tests/phase-06-mgmt-vm/fakebin/openstack b/tests/phase-06-mgmt-vm/fakebin/openstack index b164614..9fd6661 100644 --- a/tests/phase-06-mgmt-vm/fakebin/openstack +++ b/tests/phase-06-mgmt-vm/fakebin/openstack @@ -1,27 +1,27 @@ #!/usr/bin/env bash a1="${1:-}"; a2="${2:-}"; a3="${3:-}"; rest=" $* " -[ "$a1 $a2" = "token issue" ] && exit 0 FIPVAL="${FIPVAL:-10.12.5.103}"; TENANT="${TENANT:-10.20.0.107}" -present_vm() { [ "${VM_PRESENT:-0}" = 1 ] || [ -f "${MK_VM:-/x}" ]; } case "$a1 $a2" in + "token issue") [ "${SCOPE_FAIL:-0}" = 1 ] && exit 1; exit 0 ;; + "keypair show") [ "${KP_VIS:-1}" = 1 ] && exit 0 || exit 1 ;; + "security group") [ "${SG_VIS:-1}" = 1 ] && { echo sg-id; exit 0; } || exit 1 ;; # 'security group show' + "network show") [ "${NET_VIS:-1}" = 1 ] && { echo net-id; exit 0; } || exit 1 ;; "server show") - present_vm || exit 1 + { [ "${VM_PRESENT:-0}" = 1 ] || [ -f "${MK_VM:-/x}" ]; } || exit 1 if printf '%s' "$rest" | grep -q -- '-f json'; then if [ "${ADDR_ONLY_FIP:-0}" = 1 ]; then echo "{\"addresses\":{\"capi-mgmt-net\":[\"$FIPVAL\"]}}" else echo "{\"addresses\":{\"capi-mgmt-net\":[\"$TENANT\",\"$FIPVAL\"]}}"; fi elif printf '%s' "$rest" | grep -q -- '-c status'; then - echo "${VM_STATUS:-ACTIVE}" # also covers '-c status -c addresses' confirm (status word present) + [ -f "${MK_RECREATED:-/x}" ] && echo ACTIVE || echo "${VM_STATUS:-ACTIVE}" else - echo vm-id # -c id existence probe + echo vm-id fi exit 0 ;; - "server create") : > "${MK_VM:?}"; exit 0 ;; - "server add") exit 0 ;; # server add floating ip + "server create") [ -f "${MK_DELETED:-/x}" ] && : > "${MK_RECREATED:?}"; : > "${MK_VM:?}"; exit 0 ;; + "server delete") : > "${MK_DELETED:?}"; rm -f "${MK_VM:-/x}"; exit 0 ;; + "server add") exit 0 ;; "port list") echo port-id; exit 0 ;; "floating ip") - case "$a3" in - list) [ "${FIP_PRESENT:-0}" = 1 ] && echo "$FIPVAL"; exit 0 ;; - create) echo "$FIPVAL"; exit 0 ;; - esac ;; + case "$a3" in list) [ "${FIP_PRESENT:-0}" = 1 ] && echo "$FIPVAL"; exit 0 ;; create) echo "$FIPVAL"; exit 0 ;; esac ;; esac exit 0 diff --git a/tests/phase-06-mgmt-vm/run-tests.sh b/tests/phase-06-mgmt-vm/run-tests.sh index 2a9b27a..0824985 100644 --- a/tests/phase-06-mgmt-vm/run-tests.sh +++ b/tests/phase-06-mgmt-vm/run-tests.sh @@ -19,45 +19,53 @@ if [ "$got" = "$want" ]; then printf ' [OK] %-30s -> %s\n' "$label" "${got:-}" else printf ' [XX] %-30s -> %s (want %s)\n' "$label" "${got:-}" "$want"; rc_all=1; fi } -u 10.20.0.107 10.12.5.103 '{"addresses":{"n":["10.20.0.107","10.12.5.103"]}}' "list shape" +u 10.20.0.107 10.12.5.103 '{"addresses":{"n":["10.20.0.107","10.12.5.103"]}}' "list shape" u 10.20.0.107 10.12.5.103 '{"addresses":{"n":[{"addr":"10.20.0.107"},{"addr":"10.12.5.103"}]}}' "dict {addr} shape" -u 10.20.0.107 10.12.5.103 '{"addresses":{"n":"10.20.0.107, 10.12.5.103"}}' "comma-string shape" -u "" 10.12.5.103 '{"addresses":{"n":["10.12.5.103"]}}' "only FIP -> empty" -u 10.20.0.107 "" '{"addresses":{"n":["10.20.0.107"]}}' "no FIP env -> first" +u 10.20.0.107 10.12.5.103 '{"addresses":{"n":"10.20.0.107, 10.12.5.103"}}' "comma-string shape" +u "" 10.12.5.103 '{"addresses":{"n":["10.12.5.103"]}}' "only FIP -> empty" +u 10.20.0.107 "" '{"addresses":{"n":["10.20.0.107"]}}' "no FIP env -> first" echo "=== integration: phase-06-mgmt-vm.sh ===" run() { local want="$1" re="$2" label="$3"; shift 3 - rm -f "$WORK/vm.marker" "$WORK/net.env" + rm -f "$WORK/vm.marker" "$WORK/deleted.marker" "$WORK/recreated.marker" "$WORK/net.env" local rc set +e PATH="$BIN:$PATH" HOME="$WORK" OS_AUTH_URL=x ENVFILE="$WORK/net.env" \ - MK_VM="$WORK/vm.marker" POLL_TRIES=2 POLL_SLEEP=0 FIPVAL=10.12.5.103 TENANT=10.20.0.107 \ + MK_VM="$WORK/vm.marker" MK_DELETED="$WORK/deleted.marker" MK_RECREATED="$WORK/recreated.marker" \ + POLL_TRIES=2 POLL_SLEEP=0 FIPVAL=10.12.5.103 TENANT=10.20.0.107 \ env "$@" bash "$TARGET" >"$WORK/out" 2>&1 rc=$?; set -e if [ "$rc" -eq "$want" ] && grep -qE "$re" "$WORK/out"; then - printf ' [OK] %-40s exit %s\n' "$label" "$rc" + printf ' [OK] %-44s exit %s\n' "$label" "$rc" else - printf ' [XX] %-40s exit %s (want %s; /%s/)\n' "$label" "$rc" "$want" "$re" + printf ' [XX] %-44s exit %s (want %s; /%s/)\n' "$label" "$rc" "$want" "$re" sed 's/^/ /' "$WORK/out"; rc_all=1 fi } -run 0 'allocated . associated FIP 10.12.5.103' "fresh: create+poll+allocate FIP" +run 0 'allocated . associated FIP 10.12.5.103' "fresh: re-scope+create+allocate FIP" run 0 'already has floating IP 10.12.5.103 .reusing.' "idempotent: reuse existing FIP" VM_PRESENT=1 FIP_PRESENT=1 -run 1 'entered ERROR' "VM ERROR -> abort" VM_PRESENT=1 VM_STATUS=ERROR -run 1 'not ACTIVE after' "VM never ACTIVE -> abort" VM_PRESENT=1 VM_STATUS=BUILD -run 1 'could not resolve tenant IP' "tenant unresolvable -> abort" VM_PRESENT=1 FIP_PRESENT=1 ADDR_ONLY_FIP=1 -run 2 'OS_AUTH_URL unset' "precondition: no auth -> exit 2" OS_AUTH_URL= +run 1 'entered ERROR' "fresh VM -> ERROR abort" VM_STATUS=ERROR +run 1 'not ACTIVE after' "VM never ACTIVE -> abort" VM_STATUS=BUILD +run 1 'could not resolve tenant IP' "tenant unresolvable -> abort" VM_PRESENT=1 FIP_PRESENT=1 ADDR_ONLY_FIP=1 +run 2 'OS_AUTH_URL unset' "precondition: no auth -> exit 2" OS_AUTH_URL= +run 2 'cannot scope a token to project' "re-scope denied -> exit 2" SCOPE_FAIL=1 +run 2 'keypair capi-mgmt-key not visible' "keypair not visible -> exit 2" KP_VIS=0 +run 2 'security group capi-mgmt-sg not visible' "sg not visible -> exit 2" SG_VIS=0 +run 2 'network capi-mgmt-net not visible' "network not visible -> exit 2" NET_VIS=0 +run 1 'exists in ERROR state' "existing ERROR, default -> abort" VM_PRESENT=1 VM_STATUS=ERROR +run 0 'recreated' "existing ERROR + RECREATE=1 -> ok" VM_PRESENT=1 VM_STATUS=ERROR RECREATE_ON_ERROR=1 echo "=== assert: net.env persisted (both keys) + mode 600 ===" -rm -f "$WORK/vm.marker" "$WORK/net.env" +rm -f "$WORK/vm.marker" "$WORK/deleted.marker" "$WORK/recreated.marker" "$WORK/net.env" PATH="$BIN:$PATH" HOME="$WORK" OS_AUTH_URL=x ENVFILE="$WORK/net.env" MK_VM="$WORK/vm.marker" \ + MK_DELETED="$WORK/deleted.marker" MK_RECREATED="$WORK/recreated.marker" \ POLL_TRIES=2 POLL_SLEEP=0 FIPVAL=10.12.5.103 TENANT=10.20.0.107 bash "$TARGET" >/dev/null 2>&1 || true if grep -q '^MGMT_FIP=10.12.5.103$' "$WORK/net.env" && grep -q '^MGMT_TENANT_IP=10.20.0.107$' "$WORK/net.env"; then perm=$(stat -c '%a' "$WORK/net.env" 2>/dev/null || echo '?') [ "$perm" = 600 ] && echo " [OK] net.env has both keys, mode 600" || { echo " [XX] net.env mode=$perm (want 600)"; rc_all=1; } else - echo " [XX] net.env missing keys"; sed 's/^/ /' "$WORK/net.env" 2>/dev/null; rc_all=1 + echo " [XX] net.env missing keys"; rc_all=1 fi echo [ "$rc_all" -eq 0 ] && echo "ALL PASS" || echo "SOME FAILED"