diff --git a/docs/v1-redeploy-changelog.md b/docs/v1-redeploy-changelog.md index 470f003..25a32ee 100644 --- a/docs/v1-redeploy-changelog.md +++ b/docs/v1-redeploy-changelog.md @@ -272,3 +272,43 @@ metal-admin/provider/internal VIP+mgmt reserve band (.2-.100). A reserved range blocks AUTO assignment, not explicit STATIC, so it did not break the carve -- but host octets arguably belong outside the VIP band. Log for the reserve-layout review. + +### DC-DC script audit (post-carve hardening batch) + +Reviewed all MAAS scripts against what this session actually hit, so the DC-DC build +replays cleanly instead of re-deriving the metal-IP archaeology. + +- **carve gate rewrite (the big one).** `release_self_discovered` keyed on + `node_summary.system_id`, which is EMPTY on a fresh discovered record -> it silently + no-op'd and the metal static (.8.41/.42/.43) had to be released by hand on three + hosts. Replaced with `release_self_indexed`: the target is this host's + architecturally-indexed metal IP (10.12.8.N, N from HOST_OCTET), so a DISCOVERED + observation on it is this host's own commissioning ghost. SAFETY: refuses if the + record's system_id (when present) OR the discoveries-table MAC (when present) + identifies a DIFFERENT host; releases otherwise. Removed the (unneeded) release call + from carve_raw -- the no-DHCP planes never produce discovered records. Tested: 5 + branches (foreign-sysid refuse, foreign-MAC refuse, indexed-basis release, MAC-basis + release, no-record no-op). + +- **missing step added: openstack tag.** `reenroll-hosts.sh` now ensures the + `openstack` tag exists and applies it to all four hosts after the Ready/boot-NIC + gate (idempotent; --check-aware). Without it the bundle cannot place units + (constraint tags=openstack). Was a manual step every rebuild. + +- **DOCFIX-040 COMPLETE.** `pre-flight-checks.sh` and `osd-blank-check.sh` both looped + over the dead system_ids (4na83t...) 
via lib-net's SYSID maps -- broken for any + rebuilt/DC-DC cluster. Migrated both to hostname-keyed (lib-hosts HOSTS / HOST_OCTET + / host_sysid). Retired the SYSID/SYSID_HOST/SYSID_OCTET maps from lib-net.sh and + added its sourced-library shellcheck directive. osd-blank verified via mock + (iterates the four hostnames, RC=0). + +- **validate.sh**: em-dashes -> ASCII (the silent-UnicodeDecodeError class; ASCII-only + rule for all scripts). Still a placeholder body otherwise. + +REMAINING DC-DC scope (done MANUALLY this session; scripting them would make the +bring-up fully hands-off -- NOT yet built): +1. A multi-host carve-verify wrapper (assert all four hosts show the six expected + static links on the right fabrics) -- currently an ad-hoc jq loop. +2. A redeploy-prep wrapper: set model-defaults default-space=metal-admin, add-model, + verify the MODEL's effective default-space (the value that poisoned the last + deploy), reload-spaces, run juju-spaces-check. Currently manual steps R1-R3. diff --git a/scripts/carve-host-interfaces.sh b/scripts/carve-host-interfaces.sh index ed99192..4d3907e 100644 --- a/scripts/carve-host-interfaces.sh +++ b/scripts/carve-host-interfaces.sh @@ -105,23 +105,35 @@ fi } -# release_self_discovered : if MAAS holds as a DISCOVERED (alloc_type 6) -# address observed from THIS host (node==SID), release it so a STATIC can take it. -# Gated: only releases when the discovered record belongs to this host -- never -# touches an address discovered on another node (that would be a real conflict). -# (Re-enrolled hosts PXE-lease their own metal IP at commission; that self-lease -# otherwise blocks the br-metal static. See troubleshooting appendix.) 
-release_self_discovered() { - local ip="$1" subid="$2" owner - owner="$(maas_q subnet ip-addresses "$subid" 2>/dev/null \ - | jq -r --arg ip "$ip" '.[]|select(.ip==$ip and .alloc_type==6)|.node_summary.system_id // empty' | head -1)" - [ -z "$owner" ] && return 0 # not discovered -> nothing to do - if [ "$owner" != "$SID" ]; then - fail "$ip is DISCOVERED by a DIFFERENT node ($owner), not $HN -- refusing to release (possible real conflict)" - return 1 +# release_self_indexed IP SUBID: release a DISCOVERED (alloc_type 6) record on IP +# so a STATIC can take it. IP is THIS host's architecturally-indexed metal +# address (10.12.8.$OCTET from HOST_OCTET), so a discovered observation on it is this +# host's own commissioning ghost. SAFETY: refuse if ANY source positively identifies a +# DIFFERENT owner -- the StaticIPAddress node_summary.system_id (when present) must equal +# this host's SID, and the discoveries-table MAC (when present) must equal this host's +# boot MAC. Absent positive foreign identification, releasing a DISCOVERED (advisory) +# observation on this host's own indexed IP is safe. +# (node_summary.system_id is often EMPTY on a fresh discovered record -- that is why the +# earlier system_id-only gate silently no-op'd and the metal static had to be released by +# hand. See troubleshooting appendix / changelog.) 
+release_self_indexed() { + local ip="$1" subid="$2" rec sid_owner disc_mac basis + rec="$(maas_q subnet ip-addresses "$subid" 2>/dev/null \ + | jq -c --arg ip "$ip" '.[]|select(.ip==$ip and .alloc_type==6)' | head -1)" + [ -z "$rec" ] && return 0 # no discovered record -> nothing to do + sid_owner="$(printf '%s' "$rec" | jq -r '.node_summary.system_id // empty')" + if [ -n "$sid_owner" ] && [ "$sid_owner" != "$SID" ]; then + fail "$ip is DISCOVERED by a different node ($sid_owner), not $HN -- refusing (real conflict)"; return 1 fi - emit "release self-discovered $ip (alloc_type 6, node=$HN)" \ - ipaddresses release ip="$ip" force=true discovered=true + disc_mac="$(maas_q discoveries read 2>/dev/null | jq -r --arg ip "$ip" '.[]?|select(.ip==$ip)|.mac_address' | head -1)" + if [ -n "$disc_mac" ] && [ "$disc_mac" != "${HOST_BOOT_MAC[$HN]}" ]; then + fail "$ip observed from MAC $disc_mac, not $HN boot MAC ${HOST_BOOT_MAC[$HN]} -- refusing (real conflict)"; return 1 + fi + basis="this host's indexed metal IP" + [ "$sid_owner" = "$SID" ] && basis="system_id match ($SID)" + [ -n "$disc_mac" ] && [ "$disc_mac" = "${HOST_BOOT_MAC[$HN]}" ] && basis="boot-MAC match ($disc_mac)" + note "releasing DISCOVERED $ip (basis: $basis)" + emit "release DISCOVERED $ip" ipaddresses release ip="$ip" force=true discovered=true } hdr "$HN ($SID) octet=.$OCTET mode=$MODE" @@ -142,7 +154,6 @@ if linked_to "$nic" "$cidr"; then note "$nic already STATIC on $cidr -- SKIP"; return 0; fi vlan="$(vlanid_of "$cidr")"; sub="$(subid_of "$cidr")" emit "$nic(id=$id) -> VLAN $vlan ($cidr)" interface update "$SID" "$id" vlan="$vlan" - release_self_discovered "$ip" "$sub" || return 1 emit "$nic(id=$id) -> STATIC $ip on subnet $sub" interface link-subnet "$SID" "$id" mode=STATIC subnet="$sub" ip_address="$ip" } @@ -163,7 +174,7 @@ else note "br-metal exists -- SKIP create"; fi [ "$MODE" = apply ] && BMID="$(ifid_of br-metal)" || BMID="" if ! 
linked_to br-metal "$C_METAL"; then - release_self_discovered "10.12.8.$OCTET" "$(subid_of "$C_METAL")" || true + release_self_indexed "10.12.8.$OCTET" "$(subid_of "$C_METAL")" || true emit "br-metal(id=$BMID) -> STATIC 10.12.8.$OCTET on subnet $(subid_of "$C_METAL")" \ interface link-subnet "$SID" "$BMID" mode=STATIC subnet="$(subid_of "$C_METAL")" ip_address="10.12.8.$OCTET" else note "br-metal already on $C_METAL -- SKIP"; fi diff --git a/scripts/lib-net.sh b/scripts/lib-net.sh index 2448bab..885fefb 100644 --- a/scripts/lib-net.sh +++ b/scripts/lib-net.sh @@ -9,6 +9,9 @@ # (the D-052 cutover moved metal-internal to id=10, not the old id=6), so every # lookup resolves BY CIDR, never by a hardcoded subnet ID. (PATTERN-1.) +# shellcheck shell=bash +# shellcheck disable=SC2034 # constants consumed by sourcing scripts + # Guard: refuse to run directly (it is a library). if [ "${BASH_SOURCE[0]:-}" = "${0}" ]; then echo "lib-net.sh is a sourced library; do not run it directly." >&2 @@ -41,10 +44,10 @@ METAL_INTERNAL_VID="103" METAL_INTERNAL_IFACE="br-internal" -# The four KVM hosts: system_id -> hostname -> last host octet (.40-.43). -SYSIDS=( 4na83t qdbqd6 h8frng tmsafc ) -declare -A SYSID_HOST=( [4na83t]=openstack0 [qdbqd6]=openstack1 [h8frng]=openstack2 [tmsafc]=openstack3 ) -declare -A SYSID_OCTET=( [4na83t]=40 [qdbqd6]=41 [h8frng]=42 [tmsafc]=43 ) +# Host identity (hostnames, octets, boot MACs, system_id resolution) now lives in +# scripts/lib-hosts.sh, keyed by HOSTNAME -- system_ids are re-minted on every +# (re-)enrollment, so the old SYSID-keyed maps here were a landmine and were retired +# (DOCFIX-040). Source lib-hosts.sh for HOSTS / HOST_OCTET / host_sysid(). # Triple HA VIPs (D-020 + D-052): each API charm carries provider/admin/internal columns, # matching last octet, in the .50-.60 band. 11 clustered API charms. 
diff --git a/scripts/osd-blank-check.sh b/scripts/osd-blank-check.sh index 74db03e..5379ab4 100644 --- a/scripts/osd-blank-check.sh +++ b/scripts/osd-blank-check.sh @@ -5,7 +5,7 @@ # Confirms each host VM's OSD disk is wiped blank before redeploy so Ceph re-bootstraps clean. # Override the image dir with IMGDIR=... if needed. # -# Pinned host list comes from scripts/lib-net.sh. +# Pinned host list comes from scripts/lib-hosts.sh (hostname-keyed). # Exit codes: 0 all blank | 1 a disk looks non-blank | 2 a disk image missing set -euo pipefail @@ -15,11 +15,12 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # shellcheck source=scripts/lib-net.sh . "$SCRIPT_DIR/lib-net.sh" +# shellcheck source=scripts/lib-hosts.sh +. "$SCRIPT_DIR/lib-hosts.sh" IMGDIR="${IMGDIR:-/var/lib/libvirt/images}" RC=0 -for sid in "${SYSIDS[@]}"; do - h="${SYSID_HOST[$sid]}" +for h in "${HOSTS[@]}"; do img="$IMGDIR/${h}-1.qcow2" echo "== $h ($img) ==" if [ ! -f "$img" ]; then echo " FAIL: image not found"; RC=2; continue; fi diff --git a/scripts/pre-flight-checks.sh b/scripts/pre-flight-checks.sh index 3a80942..3c3f52b 100644 --- a/scripts/pre-flight-checks.sh +++ b/scripts/pre-flight-checks.sh @@ -28,6 +28,8 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # shellcheck source=scripts/lib-net.sh . "$SCRIPT_DIR/lib-net.sh" +# shellcheck source=scripts/lib-hosts.sh +. 
"$SCRIPT_DIR/lib-hosts.sh" FATAL=0 WARN=0 @@ -128,31 +130,33 @@ # --------------------------------------------------------------------------- hdr "CHECK 2: data/storage NIC links BY CIDR (per host; expect octet .40-.43)" -for sid in "${SYSIDS[@]}"; do - echo " == $sid (${SYSID_HOST[$sid]}, octet .${SYSID_OCTET[$sid]}) ==" - IFJSON="$(maas "$MAAS_PROFILE" interfaces read "$sid" 2>/dev/null)" || { fail "cannot read interfaces for $sid"; continue; } +for h in "${HOSTS[@]}"; do + sid="$(host_sysid "$h" || true)" + if [ -z "$sid" ]; then fail "$h not enrolled in MAAS"; continue; fi + echo " == $h ($sid, octet .${HOST_OCTET[$h]}) ==" + IFJSON="$(maas "$MAAS_PROFILE" interfaces read "$sid" 2>/dev/null)" || { fail "cannot read interfaces for $h"; continue; } for c in "${DATA_PLANE_CIDRS[@]}"; do line="$(printf '%s' "$IFJSON" | jq -r --arg c "$c" '.[] as $if | $if.links[]? | select(.subnet.cidr==$c) | "\($if.name) \(.ip_address // "(no-ip)")"' | head -1)" - if [ -z "$line" ]; then fail "$sid missing link on ${PLANE_NAME[$c]} ($c)"; continue; fi + if [ -z "$line" ]; then fail "$h missing link on ${PLANE_NAME[$c]} ($c)"; continue; fi ifname="${line%% *}"; ip="${line##* }" printf " %-14s -> %-16s %s\n" "$ifname" "$c" "$ip" if [ "$c" = "$METAL_INTERNAL_CIDR" ] && [ "$ifname" != "$METAL_INTERNAL_IFACE" ]; then - fail "$sid metal-internal on '$ifname' (want $METAL_INTERNAL_IFACE)" + fail "$h metal-internal on '$ifname' (want $METAL_INTERNAL_IFACE)" fi o="$(fourth_octet "$ip")" - if [ "$o" != "${SYSID_OCTET[$sid]}" ]; then warn "$sid ${PLANE_NAME[$c]} ip=$ip (want last octet .${SYSID_OCTET[$sid]})"; fi + if [ "$o" != "${HOST_OCTET[$h]}" ]; then warn "$h ${PLANE_NAME[$c]} ip=$ip (want last octet .${HOST_OCTET[$h]})"; fi done done # --------------------------------------------------------------------------- hdr "CHECK 4: the four KVM hosts -- status / power" MJSON="$(maas "$MAAS_PROFILE" machines read 2>/dev/null)" || { fail "cannot read machines"; finish; } -for sid in 
"${SYSIDS[@]}"; do - row="$(printf '%s' "$MJSON" | jq -r --arg s "$sid" '.[]|select(.system_id==$s)|"\(.hostname) \(.status_name) power=\(.power_state)"')" - if [ -z "$row" ]; then fail "host $sid not found in MAAS"; continue; fi +for h in "${HOSTS[@]}"; do + row="$(printf '%s' "$MJSON" | jq -r --arg n "$h" '.[]|select(.hostname==$n)|"\(.hostname) \(.status_name) power=\(.power_state)"')" + if [ -z "$row" ]; then fail "host $h not found in MAAS"; continue; fi echo " $row" - st="$(printf '%s' "$MJSON" | jq -r --arg s "$sid" '.[]|select(.system_id==$s)|.status_name')" - if [ "$st" = "Ready" ]; then pass "$sid Ready"; else fail "$sid status=$st (want Ready)"; fi + st="$(printf '%s' "$MJSON" | jq -r --arg n "$h" '.[]|select(.hostname==$n)|.status_name')" + if [ "$st" = "Ready" ]; then pass "$h Ready"; else fail "$h status=$st (want Ready)"; fi done finish diff --git a/scripts/reenroll-hosts.sh b/scripts/reenroll-hosts.sh index dfefe2a..23ed459 100644 --- a/scripts/reenroll-hosts.sh +++ b/scripts/reenroll-hosts.sh @@ -145,5 +145,33 @@ if [ "$fab" = "2_metal" ]; then pass "$hn boot NIC on 2_metal"; else fail "$hn boot NIC on '${fab:-?}' (want 2_metal)"; fi done -note "next: re-tag '${HOST_TAG}' on all four, then the Strategy-B interface carve" +# Apply the deploy placement tag (bundle places units via constraint tags=). +# Idempotent: ensures the tag exists, then adds each host's live system_id. Without this +# the bundle cannot bind any unit. (Was a manual step; now part of enrollment.) +hdr "Apply MAAS tag '${HOST_TAG}' (deploy placement prereq)" +if ! 
maas "$MAAS_PROFILE" tags read 2>/dev/null | jq -e --arg t "$HOST_TAG" '.[]|select(.name==$t)' >/dev/null 2>&1; then + if [ "$MODE" != "check" ]; then + echo " creating tag '${HOST_TAG}'" + maas "$MAAS_PROFILE" tags create name="$HOST_TAG" >/dev/null 2>&1 || warn "could not create tag '${HOST_TAG}'" + else + note "tag '${HOST_TAG}' absent (check mode: not creating)" + fi +fi +for hn in "${HOSTS[@]}"; do + sid="$(host_sysid "$hn" || true)"; [ -n "$sid" ] || { fail "$hn not enrolled (cannot tag)"; continue; } + if maas "$MAAS_PROFILE" tag machines "$HOST_TAG" 2>/dev/null | jq -e --arg s "$sid" '.[]|select(.system_id==$s)' >/dev/null 2>&1; then + note "$hn already tagged '${HOST_TAG}' -- SKIP" + elif [ "$MODE" = "check" ]; then + note "$hn would be tagged '${HOST_TAG}' (check mode)" + else + echo " tagging $hn ($sid) '${HOST_TAG}'" + if maas "$MAAS_PROFILE" tag update-nodes "$HOST_TAG" add="$sid" >/dev/null 2>&1; then + pass "$hn tagged '${HOST_TAG}'" + else + fail "tag-apply failed for $hn" + fi + fi +done + +note "next: the Strategy-B interface carve (scripts/carve-host-interfaces.sh)" finish diff --git a/scripts/validate.sh b/scripts/validate.sh index 91bfb95..646b6e6 100644 --- a/scripts/validate.sh +++ b/scripts/validate.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # scripts/validate.sh # -# STATUS: PLACEHOLDER — drafted post-deploy. +# STATUS: PLACEHOLDER -- drafted post-deploy. # # Roosevelt-rehearsal validation runner per D-011. Executes the validation # criteria sequentially and produces a structured report. @@ -28,7 +28,7 @@ # - Designate resolves API hostnames from tenant VM # - Snapshot 1 + Snapshot 2 existence verified -echo "Placeholder validate.sh — not yet implemented." +echo "Placeholder validate.sh -- not yet implemented." echo echo "Summary: ${PASS} pass, ${FAIL} fail, ${SKIP} skip"