#!/usr/bin/env python3
"""
.claude/hooks/guard-destructive.py -- PreToolUse belt-and-suspenders for the
jumphost (2026-07-03). Settings deny/ask rules are the first line; this hook
exists because (a) a hook exit-2 blocks BEFORE permission evaluation in every
permission mode, and (b) Bash settings-rule enforcement has a documented
reliability history upstream. Blocks the NEVER class and secret-file shell
reads that Read() rules cannot see (arbitrary subprocess reads).

stdin: PreToolUse JSON. exit 0 = no opinion (permission rules proceed);
exit 2 = hard block (stderr shown to Claude). ASCII + LF.
Offline test: tests/claude-guard/run-tests.sh.

Matching is re.search() over the raw command string, so a rule fires wherever
its pattern occurs in a compound command (after sudo, inside a pipeline, in a
quoted argument). The rules deliberately err toward blocking: a false
positive costs one rephrased command.
"""
import json
import re
import sys

# One reason string shared by both secret-file rules below.
_SECRET_WHY = ("secret-adjacent file: never read key/cred material into "
               "context (whitelist-print rule)")

# (regex, reason) pairs, checked in order; the first match wins.
NEVER = [
    (r"vault\s+operator\s+(init|rekey|generate-root)",
     "one-shot vault operation: operator-only, from the runbook, VERBATIM (DOCFIX-006/D-069)"),
    # kill-controller is the forced variant of destroy-controller and falls
    # under the same "controller destruction" reason.
    (r"juju\s+(destroy|kill)-controller",
     "controller destruction is out of scope for any session on this host"),
    (r"\bmaas\s+list\b",
     "prints the MAAS API key (DOCFIX-016); use 'maas admin ...' directly"),
    # The force flag may appear anywhere after "push" within the same simple
    # command (no | ; & in between): long form, a short-flag cluster that
    # contains f (-f, -uf), or a "+refspec" argument.
    (r"git\s+push\b[^|;&]*?\s(--force\b|-[A-Za-z]*f[A-Za-z]*\b|\+\S)",
     "force-push is banned on this repo"),
    (r"(cat|less|more|head|tail|cp|scp|base64|xxd|od|strings)\b[^|;&]*"
     r"(vault-init/|as-executed/|-cred\.txt|appcred)",
     _SECRET_WHY),
    # Readers missing from the list above; grep and jq are auto-allowed by
    # the settings allow list. Anchored with \b on both sides so "git add"
    # does not match "dd". The bare word "appcred" is left out of this rule
    # so that searching the repo for the term (grep -rn appcred scripts/)
    # keeps working.
    (r"\b(grep|egrep|fgrep|rg|awk|sed|jq|sort|cut|tac|nl|dd|rsync)\b[^|;&]*"
     r"(vault-init/|as-executed/|-cred\.txt)",
     _SECRET_WHY),
    # rm with at least one option whose target list contains the filesystem
    # root or the home directory itself: /, /*, ~, ~/, $HOME, ${HOME},
    # optionally quoted. The lookahead requires the target to end there, so
    # /tmp/x and ~/scratch are not matched.
    (r"rm\s+(-\S+\s+)+([^|;&\n]*?\s)?"
     r"[\"']?(/|~|\$HOME|\$\{HOME\})/?\*?[\"']?(?=\s|[;&|]|$)",
     "catastrophic rm"),
]


def block_reason(cmd):
    """Return the reason of the first NEVER rule that matches, else None.

    cmd: the shell command line as a single string.
    """
    for rx, why in NEVER:
        if re.search(rx, cmd):
            return why
    return None


def _command_of(data):
    """Return tool_input.command from the hook payload, or "" if absent.

    Any payload that is valid JSON but not the expected shape (top level not
    an object, tool_input not an object, command not a string) yields "",
    which no rule matches.
    """
    if not isinstance(data, dict):
        return ""
    tool_input = data.get("tool_input")
    if not isinstance(tool_input, dict):
        return ""
    cmd = tool_input.get("command")
    return cmd if isinstance(cmd, str) else ""


def main():
    """Read the PreToolUse JSON from stdin and return the hook exit code.

    Returns 2 (after writing the reason to stderr) when the command matches
    a NEVER rule, otherwise 0.
    """
    try:
        data = json.load(sys.stdin)
    except Exception:
        # Deliberately broad: this is the script's top-level boundary and an
        # unreadable payload must mean "no opinion", never a traceback.
        return 0  # malformed input: no opinion; permission rules still apply
    why = block_reason(_command_of(data))
    if why is None:
        return 0
    sys.stderr.write(
        "BLOCKED by .claude/hooks/guard-destructive.py: %s\n" % why)
    return 2


if __name__ == "__main__":
    sys.exit(main())
"Bash(maas admin machine delete *)", "Bash(maas admin * update*)", + "Bash(maas admin * create*)", "Bash(maas admin * release*)", + "Bash(openstack * create*)", "Bash(openstack * delete*)", + "Bash(openstack * set*)", "Bash(openstack * unset*)", + "Bash(* --apply*)", + "Bash(git commit*)", "Bash(git push*)", + "Bash(sudo *)", "Bash(virsh *)", "Bash(rm *)" + ], + "deny": [ + "Bash(vault operator init*)", "Bash(vault operator rekey*)", + "Bash(vault operator generate-root*)", + "Bash(juju destroy-controller *)", + "Bash(maas list*)", + "Bash(git push --force*)", "Bash(git push -f*)", + "Read(~/vault-init/**)", "Read(~/as-executed/**)", + "Read(~/tenant-*/**)", "Read(**/*-cred.txt)", "Read(**/*appcred*)", + "Edit(~/vault-init/**)" + ] + }, + "hooks": { + "PreToolUse": [ + { + "matcher": "Bash", + "hooks": [ + { "type": "command", "command": "python3 \"$CLAUDE_PROJECT_DIR\"/.claude/hooks/guard-destructive.py" } + ] + } + ] + } +} diff --git a/.claude/skills/openstack-cloud-ops/SKILL.md b/.claude/skills/openstack-cloud-ops/SKILL.md new file mode 100644 index 0000000..c7c78ec --- /dev/null +++ b/.claude/skills/openstack-cloud-ops/SKILL.md @@ -0,0 +1,123 @@ +--- +name: openstack-cloud-ops +description: "Operate, install, extend, and troubleshoot the Omega Cloud - a commercial multi-tenant Charmed OpenStack (Caracal 2024.1) deployment managed with Juju and MAAS, with Vault TLS, OVN, Ceph, Octavia, and Magnum/CAPI tenant Kubernetes. Use this skill for ANY work touching OpenStack, Juju, MAAS, Magnum, CAPI, Ceph, OVN, Octavia, Keystone, Vault-for-OpenStack, tenant onboarding, or the openstack-caracal-ipv4 repository - including writing or reviewing bash/python operational scripts, debugging failed deploys or cluster creates, runbook work, design-decision (D-NNN) discussion, and incident triage. Use it even for seemingly simple OpenStack questions: this deployment has strict operating discipline and known charm traps that make generic answers wrong." 
+--- + +# openstack-cloud-ops + +Operating skill for the Omega Cloud: a commercial, multi-tenant, tenant +self-administered OpenStack cloud. Current phase: single-DC virtual rehearsal +("testcloud", VR0 DC0) on four KVM hosts, rehearsing a future bare-metal +multi-datacenter deployment ("Roosevelt"). The governing design constraint is +MINIMIZE DELTA TO ROOSEVELT: the runbooks and scripts are primary deliverables +alongside the running cloud, so transferable answers beat quick fixes. + +## Step 0 - locate the source of truth + +The repository `openstack-caracal-ipv4` (GitBucket, git.baldurkeep.com) is +authoritative for everything: bundle, runbooks, scripts, design decisions, +as-built values. This skill is a discipline-and-routing layer OVER that repo, +not a substitute for it. + +1. Look for a local clone (common paths: `~/openstack-caracal-ipv4`, a repo + dir in the working tree, `/home/claude/repo`). If found, `git log -1` to + note HEAD and work from it. +2. No clone and you have shell + network: ask before cloning + (`https://git.baldurkeep.com/git/OpenStack/openstack-caracal-ipv4.git`). + The repo may be private; if the clone fails, ask the operator to provide + access or the relevant files. +3. No clone obtainable (e.g. chat without sandbox network): say so, ask the + operator to paste the relevant runbook/script, and proceed only on what is + actually in front of you. + +**Divergence rule:** if this skill and repo HEAD disagree, the repo wins - +but FLAG the divergence to the operator rather than silently following either. +The repo is a living draft; this skill's invariants (discipline, hardening) +change slowly, its facts (IPs, versions, phase status) go stale fast. + +## Step 1 - detect the environment + +- **Live shell to the jumphost / infra** (Claude Code on `vopenstack-jesse` or + similar): you may RUN read-only audits directly. Every mutation remains + individually human-gated - present the command, state what it changes, wait + for approval. 
A live shell relaxes the transport, never the discipline. +- **Chat / no infra shell**: operate the gated copy-paste model - prepare + labeled blocks, the operator runs them and pastes output back. Never assume + a block ran or succeeded; wait for the pasted evidence. + +Read `references/operating-discipline.md` before doing either. + +## The three hard operating rules (non-negotiable) + +1. **Execute only the current runbook step, exactly as written.** No added + scope, no adjacent improvements, no live re-architecture mid-step. Findings + and improvement ideas are LOGGED (changelog / D-NNN proposal), never + executed live mid-step. +2. **Never use an inferred value.** No IP, ID, name, or scope goes into a + command unless it was measured this session or carried from confirmed + as-built. If a value would be inferred: stop and measure it. Never run a + destructive or session-altering command from memory without confirming it + is the minimal correct action for the current live state. +3. **Prefer dynamic lookups over hardcoded literals.** Discover VIPs, project + names, IDs, and version sets at runtime. Where a literal is unavoidable it + is tagged and centralized (`scripts/lib-net.sh`, `lib-hosts.sh`), keyed by + stable identity (CIDR, hostname) - never by drifting IDs. + +Corollary that governs everything: **verify before mutate**. A read-only audit +precedes every mutation; destructive and secret-handling steps are gated +individually, never batched. 
+ +## Routing - where to go for what + +| Task | Read first | +|---|---| +| Any command block, script, or paste block you are about to write | `references/script-authoring.md` | +| Deploy / redeploy / teardown | repo `runbooks/README.md`, then the phase-NN runbook; conventions in `references/operating-discipline.md` | +| Something is broken (triage, incidents) | `references/troubleshooting.md`, then repo `runbooks/appendix-A-troubleshooting.md` | +| CAPI / Magnum / mgmt-VM recovery | repo `runbooks/ops-capi-recovery.md` | +| Deliver ANY repo change (script, runbook, doc) | run `bash scripts/repo-lint.sh` + the touched script's `tests//run-tests.sh` BEFORE handing it over | +| Pre-deploy gate (before add-model) | `bash scripts/preflight.sh` -- THE single entry; do not run the sub-gates piecemeal | +| Is the cloud actually healthy? (post-deploy, post-restart, pre-change baseline, incident) | `bash scripts/cloud-assert.sh` (add `--capture` at deploy completion for the committed BOM) | +| Full-cloud restart after outage/maintenance | repo `runbooks/ops-restart-procedure.md` | +| Starting any consequential live session | `bash scripts/run-logged.sh