diff --git a/scripts/maas-fabric-prune.sh b/scripts/maas-fabric-prune.sh new file mode 100644 index 0000000..74e42e7 --- /dev/null +++ b/scripts/maas-fabric-prune.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# maas-fabric-prune.sh -- safely delete orphaned MAAS auto-fabrics (fabric-NN). +# +# WHY THIS EXISTS: auto-fabrics matching ^fabric-[0-9]+$ are minted at COMMISSIONING +# (one per non-boot NIC MAAS cannot map to a known fabric) and are NOT reclaimed when +# a machine is decomposed -- so they accumulate on every teardown/rebuild cycle. This +# is recurring maintenance, not a one-off. +# +# WHAT IT TOUCHES: ONLY auto-fabrics with ZERO subnets AND ZERO interfaces. It never +# deletes a named/renamed fabric, the default, or an auto-fabric that still carries a +# subnet (e.g. an LXD/substrate bridge: 10.37.x.0/24 + fd42::/64) or an interface. +# +# WHEN TO RUN: AFTER the interface carve (scripts/carve-host-interfaces.sh --apply on +# all hosts) has relocated host NICs onto the named fabrics. Run before that and this +# cycle's fabric-NN still hold a NIC each and are correctly reported WAIT (skipped). +# +# Dry-run by DEFAULT. Pass --apply to delete. Idempotent; safe to re-run. 
# Body of maas-fabric-prune.sh (diff lines collapsed onto one physical line; the
# code below is left exactly as found). Flow, in order:
#   1. Strict mode; resolve this script's directory and the sibling classifier
#      maas_fabric_classify.py. The MAAS CLI profile is $MAAS_PROFILE, default "admin".
#   2. Mode selection from the first argument: --apply deletes; no argument,
#      --dry-run or --dryrun is a dry run; anything else prints usage and exits 2.
#   3. Preflight: maas, jq and python3 must be on PATH and the classifier file
#      must exist, otherwise the script exits 1.
#   4. Snapshot "fabrics read", "subnets read" and "machines read" into a
#      mktemp -d directory (removed by the EXIT trap), then run the classifier
#      on the three files and keep its JSON in RESULT.
#   5. Print one audit row per fabric, sorted numerically by fabric id, and load
#      .delete_ids into the DEL array.
#   6. Exit 0 when DEL is empty or when not in apply mode; otherwise start the
#      per-id delete loop, whose body continues past the end of this line.
# NOTE(review): the usage and FATAL messages are written to stdout, not stderr
#   -- confirm that is intended.
# NOTE(review): DEL is filled from a process substitution, whose exit status bash
#   does not propagate to set -e/pipefail; a jq failure there would be reported
#   as "No orphaned auto-fabrics to delete." -- consider checking it explicitly.
+set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CLASSIFY="$HERE/maas_fabric_classify.py" +PROFILE="${MAAS_PROFILE:-admin}" + +MODE="dryrun" +case "${1:-}" in + --apply) MODE="apply" ;; + ""|--dry-run|--dryrun) MODE="dryrun" ;; + *) echo "usage: $0 [--apply]"; exit 2 ;; +esac + +command -v maas >/dev/null 2>&1 || { echo "FATAL: maas CLI not found"; exit 1; } +command -v jq >/dev/null 2>&1 || { echo "FATAL: jq not found"; exit 1; } +command -v python3 >/dev/null 2>&1 || { echo "FATAL: python3 not found"; exit 1; } +[ -f "$CLASSIFY" ] || { echo "FATAL: classifier missing: $CLASSIFY"; exit 1; } + +TMP="$(mktemp -d)"; trap 'rm -rf "$TMP"' EXIT + +# read-only snapshot of the three sources the classifier needs +maas "$PROFILE" fabrics read > "$TMP/fabrics.json" +maas "$PROFILE" subnets read > "$TMP/subnets.json" +maas "$PROFILE" machines read > "$TMP/machines.json" + +RESULT="$(python3 "$CLASSIFY" "$TMP/fabrics.json" "$TMP/subnets.json" "$TMP/machines.json")" + +echo "=== fabric audit (mode=$MODE) ===" +echo "$RESULT" | jq -r '.audit[] + | " id=\(.id)\tsubnets=\(.subnets)\tifaces=\(.ifaces)\t\(.name)\t\(.verdict)"' | sort -t= -k2 -n + +mapfile -t DEL < <(echo "$RESULT" | jq -r '.delete_ids[]') + +echo +if [ "${#DEL[@]}" -eq 0 ]; then + echo "No orphaned auto-fabrics to delete." + exit 0 +fi +echo "Orphans (auto-fabric, 0 subnets, 0 ifaces) -> ${#DEL[@]}: ${DEL[*]}" + +if [ "$MODE" != apply ]; then + echo + echo "DRY-RUN -- nothing deleted. Re-run with --apply to delete the orphans above." + exit 0 +fi + +echo +fail=0 +for id in "${DEL[@]}"; do + echo " deleting fabric id=$id ..." + # MAAS itself refuses to delete a fabric with attached subnets/VLANs/interfaces; + # we do NOT suppress that error -- a non-empty fabric is left intact and flagged. 
# NOTE(review): in the collapsed source this line began with the tail of the
# prune script's delete loop -- `if maas "$PROFILE" fabric delete "$id"` -- and
# the rest of that loop was lost. The fragment is recorded here only so it is
# not silently dropped; it is not part of this module.
"""Classify MAAS fabrics and list the auto-fabrics that are safe to delete.

Inputs: three JSON files, each the saved output of a read-only MAAS CLI call:
    maas PROFILE fabrics read  -> fabrics.json
    maas PROFILE subnets read  -> subnets.json
    maas PROFILE machines read -> machines.json

SAFETY PREDICATE (delete iff ALL hold):
  * fabric name matches ^fabric-[0-9]+$ (an auto-minted commissioning fabric), AND
  * ZERO subnets are attached to the fabric, AND
  * ZERO machine interfaces are attached to the fabric.

Everything else is kept:
  * named / renamed fabrics (1_provider, 2_metal, f_oob, ...) -> never deleted;
  * an auto-fabric that still carries a subnet (e.g. an LXD/substrate bridge:
    10.37.x.0/24 + fd42::/64) -> never deleted (substrate, not a deploy artifact);
  * an auto-fabric still holding an interface -> WAIT (a host NIC the carve has not
    yet relocated); deletable only after the carve vacates it.

Output (stdout): deterministic JSON {"audit":[...], "delete_ids":[...]} sorted by
fabric id. delete_ids is the safe set.

Field shape (verified against MAAS 3.7 live output):
  fabrics[].id, fabrics[].name
  subnets[].vlan.fabric_id
  machines[].interface_set[].vlan.fabric_id (present even for link_up-only NICs)

NOTE(review): only interfaces listed under machines[] are counted. Interfaces on
other node types are not part of the input -- confirm that is acceptable.
"""
import json
import re
import sys
from collections import Counter

AUTO = re.compile(r'^fabric-[0-9]+$')


def _fabric_id(obj):
    """Return obj["vlan"]["fabric_id"], or None when "vlan" is missing or null."""
    return (obj.get("vlan") or {}).get("fabric_id")


def _count_by_fabric_subnets(subnets):
    """Map fabric id -> number of subnets whose VLAN belongs to that fabric.

    Subnets without a usable vlan.fabric_id are ignored.
    """
    ids = (_fabric_id(subnet) for subnet in subnets)
    return dict(Counter(fid for fid in ids if fid is not None))


def _count_by_fabric_ifaces(machines):
    """Map fabric id -> number of machine interfaces attached to that fabric.

    A machine whose "interface_set" is missing or null contributes nothing;
    interfaces without a usable vlan.fabric_id are ignored.
    """
    ids = (
        _fabric_id(iface)
        for machine in machines
        for iface in (machine.get("interface_set") or [])
    )
    return dict(Counter(fid for fid in ids if fid is not None))


def classify(fabrics, subnets, machines):
    """Audit every fabric and collect the ids that satisfy the safety predicate.

    Args:
        fabrics: list of fabric dicts; each must have "id", "name" is optional.
        subnets: list of subnet dicts (read via vlan.fabric_id).
        machines: list of machine dicts (read via interface_set[].vlan.fabric_id).

    Returns:
        {"audit": [...], "delete_ids": [...]}, both ordered by fabric id. Each
        audit entry has id, name, subnets, ifaces, auto and verdict.

    Raises:
        KeyError: if a fabric entry has no "id".
    """
    subnet_counts = _count_by_fabric_subnets(subnets)
    iface_counts = _count_by_fabric_ifaces(machines)
    audit, delete_ids = [], []
    for fabric in sorted(fabrics, key=lambda x: x["id"]):
        fid = fabric["id"]
        # A null name is treated like an empty one: not an auto-fabric, so kept.
        name = fabric.get("name") or ""
        ns = subnet_counts.get(fid, 0)
        ni = iface_counts.get(fid, 0)
        # fullmatch, not match: "$" alone would also accept "fabric-7\n", and the
        # delete predicate must only ever fire on an exact auto-fabric name.
        auto = AUTO.fullmatch(name) is not None
        if not auto:
            verdict = "KEEP (named/default)"
        elif ns > 0:
            # auto-name but carries a subnet: substrate (e.g. LXD bridge) -- never delete
            verdict = "KEEP auto-fabric HAS SUBNET(S) -- substrate/in-use, never delete"
        elif ni > 0:
            verdict = "WAIT auto-fabric in use by interface(s) -- vacate (carve) before prune"
        else:
            verdict = "ORPHAN -- delete"
            delete_ids.append(fid)
        audit.append({
            "id": fid, "name": name, "subnets": ns, "ifaces": ni,
            "auto": auto, "verdict": verdict,
        })
    return {"audit": audit, "delete_ids": delete_ids}


def _load(path):
    """Parse the JSON document stored at path (read as UTF-8, not the locale encoding)."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def main(argv):
    """CLI entry point: argv is [prog, fabrics.json, subnets.json, machines.json].

    Prints the classification as indented, key-sorted JSON on stdout and
    returns 0; with any other argument count it prints usage on stderr and
    returns 2. File and JSON errors propagate as exceptions.
    """
    if len(argv) != 4:
        sys.stderr.write(
            "usage: maas_fabric_classify.py "
            "FABRICS_JSON SUBNETS_JSON MACHINES_JSON\n")
        return 2
    out = classify(_load(argv[1]), _load(argv[2]), _load(argv[3]))
    print(json.dumps(out, indent=2, sort_keys=True))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))