diff --git a/.claude/skills/openstack-cloud-ops/openstack-cloud-ops.skill b/.claude/skills/openstack-cloud-ops/openstack-cloud-ops.skill
new file mode 100644
index 0000000..0bde5e3
--- /dev/null
+++ b/.claude/skills/openstack-cloud-ops/openstack-cloud-ops.skill
Binary files differ
diff --git a/.claude/skills/openstack-cloud-ops/references/operating-discipline.md b/.claude/skills/openstack-cloud-ops/references/operating-discipline.md
index 3c7a2f2..f103762 100644
--- a/.claude/skills/openstack-cloud-ops/references/operating-discipline.md
+++ b/.claude/skills/openstack-cloud-ops/references/operating-discipline.md
@@ -62,6 +62,8 @@ identifier it consumes in its commit; the other re-greps AFTER the parallel push
   lands and resumes above it. `scripts/ledger-scan.sh` derives next-free from decision
   HEADERS (not prose mentions -- a "next-free D-072" pointer must not inflate the
   count).
+  Keep "Next-free:" pointer lines on ONE line (a word-wrapped continuation escapes the
+  line-based exclusion); or just rely on the scan, which is the next-free authority.
 - Mid-task findings are logged as proposals, not acted on (hard rule 1). A finding that
   changes a runbook becomes a DOCFIX; one that changes architecture becomes a D-NNN
   with status PROPOSED until the operator rules.
diff --git a/scripts/checks/d011-02-vip-jumphost.sh b/scripts/checks/d011-02-vip-jumphost.sh
new file mode 100644
index 0000000..8a35895
--- /dev/null
+++ b/scripts/checks/d011-02-vip-jumphost.sh
@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+# scripts/checks/d011-02-vip-jumphost.sh -- D-011 item 2: public API VIPs respond,
+# on hostname, from the jumphost.
+#
+# Enumerates PUBLIC endpoints from the keystone catalog DYNAMICALLY (never a hardcoded
+# VIP list), reduces to unique scheme://host:port, and probes each root over TLS from
+# the jumphost. "Respond on hostname" (D-019: public endpoints are FQDNs resolving via
+# corporate DNS) means the probe tests DNS + TLS (vault CA) + the VIP answering.
+#
+# Healthy-response policy (documented, debatable): an HTTP status in
+# {200,201,300,301,302,401,403,404} = the VIP RESPONDS (version-discovery 300 and
+# unauthenticated 401/403 are healthy). 5xx = reachable but erroring -> FAIL (an
+# acceptance bar should not pass a 5xx-ing API). Any other HTTP status is outside the
+# healthy set -> FAIL, reported with its code (it is NOT a transport error). No HTTP
+# code (curl transport error: DNS/conn/TLS) -> FAIL, with the failure class surfaced
+# (a TLS-verify failure is retried with -k ONLY to classify reachable-but-cert-broken
+# vs unreachable; it still FAILs -- we never pass an unverifiable endpoint).
+#
+# Exit: 0 PASS (all public VIPs healthy) | 1 FAIL (any unhealthy/unreachable)
+#       | 2 HOLD (no admin scope, no endpoints, unreadable CA, or tooling missing).
+set -uo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=scripts/lib-validate.sh
+. "$HERE/../lib-validate.sh"
+ID=d011-02-vip-jumphost; vr_begin "$ID"
+
+# jq is listed too: it validates the catalog JSON below, so its absence is a tooling HOLD.
+vr_need openstack curl python3 jq || { emit "$ID" "$VR_HOLD" "missing tool"; exit "$VR_HOLD"; }
+vr_admin_env || { emit "$ID" "$VR_HOLD" "no admin scope (source ~/admin-openrc)"; exit "$VR_HOLD"; }
+
+if ! vr_json EPJSON openstack endpoint list --interface public -f json; then
+  vr_err_tail; emit "$ID" "$VR_HOLD" "endpoint list failed"; exit "$VR_HOLD"
+fi
+jq -e . >/dev/null 2>&1 <<<"$EPJSON" || { emit "$ID" "$VR_HOLD" "endpoint list not JSON"; exit "$VR_HOLD"; }
+
+# unique scheme://host:port -> comma-joined list of service names on it
+mapfile -t ORIGINS < <(python3 -c '
+import sys, json
+from urllib.parse import urlparse
+try: rows=json.load(sys.stdin)
+except Exception: sys.exit(0)
+seen={}
+for r in rows:
+    url=r.get("URL") or r.get("url") or ""
+    svc=r.get("Service Name") or r.get("Service Type") or r.get("service_name") or "?"
+    p=urlparse(url)
+    if not p.scheme or not p.hostname: continue
+    port=p.port or (443 if p.scheme=="https" else 80)
+    origin="%s://%s:%d" % (p.scheme, p.hostname, port)
+    seen.setdefault(origin, set()).add(svc)
+for o in sorted(seen): print(o, ",".join(sorted(seen[o])))
+' <<<"$EPJSON")
+
+[ "${#ORIGINS[@]}" -gt 0 ] || { emit "$ID" "$VR_HOLD" "no public endpoints found in catalog"; exit "$VR_HOLD"; }
+
+CA="${OS_CACERT:-$HOME/vault-init/vault-ca-root.pem}"
+# An unreadable CA bundle would make every probe fail verification and be misreported
+# as a per-endpoint certificate problem; treat it as a tooling HOLD instead.
+[ -r "$CA" ] || { emit "$ID" "$VR_HOLD" "CA bundle not readable: $CA"; exit "$VR_HOLD"; }
+HEALTHY_RE='^(200|201|300|301|302|401|403|404)$'
+BAD=0; N=0
+for line in "${ORIGINS[@]}"; do
+  origin="${line%% *}"; svcs="${line#* }"; N=$((N+1))
+  url="$origin/"
+  # Deliberately no "|| true" inside $(...): it would force $? to 0 and hide curl's
+  # exit status (errexit is not set, so a failing curl cannot abort the script).
+  CODE="$(curl -sS -o /dev/null -w '%{http_code}' --cacert "$CA" \
+    --connect-timeout 6 --max-time 12 "$url" 2>/dev/null)"
+  RC=$?
+  if [ "$RC" -eq 0 ] && [[ "$CODE" =~ $HEALTHY_RE ]]; then
+    echo " OK $origin http=$CODE [$svcs]"
+  elif [ "$RC" -eq 0 ] && [[ "$CODE" =~ ^5 ]]; then
+    echo " BAD $origin http=$CODE (reachable but 5xx) [$svcs]"; BAD=$((BAD+1))
+  elif [ "$RC" -eq 0 ] && [[ "$CODE" =~ ^[0-9]{3}$ ]] && [ "$CODE" != 000 ]; then
+    # TLS verified and HTTP answered, but the status is outside the healthy set (e.g. 400/405).
+    echo " BAD $origin http=$CODE (reachable but status outside healthy set) [$svcs]"; BAD=$((BAD+1))
+  else
+    # transport error -- classify; retry -k ONLY to distinguish cert-broken from unreachable
+    KCODE="$(curl -sS -k -o /dev/null -w '%{http_code}' --connect-timeout 6 --max-time 12 "$url" 2>/dev/null || true)"
+    if [ -n "$KCODE" ] && [[ "$KCODE" =~ ^[0-9]{3}$ ]] && [ "$KCODE" != 000 ]; then
+      echo " BAD $origin TLS-verify-FAILED (reachable with -k http=$KCODE; cert not valid for hostname) [$svcs]"
+    else
+      echo " BAD $origin UNREACHABLE (curl rc=$RC, http=${CODE:-none}) [$svcs]"
+    fi
+    BAD=$((BAD+1))
+  fi
+done
+
+if [ "$BAD" -gt 0 ]; then
+  emit "$ID" "$VR_FAIL" "$BAD/$N public VIP origins unhealthy"; exit "$VR_FAIL"
+fi
+emit "$ID" "$VR_PASS" "all $N public VIP origins respond healthy over TLS on hostname"; exit "$VR_PASS"
diff --git a/scripts/checks/d011-03-vip-tenant.sh b/scripts/checks/d011-03-vip-tenant.sh
new file mode 100644
index 0000000..60d47a0
--- /dev/null
+++ b/scripts/checks/d011-03-vip-tenant.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+# scripts/checks/d011-03-vip-tenant.sh -- D-011 item 3: API reachability from a tenant
+# VM (Option B), via the D-035 pod-egress proof.
+#
+# Runs an ephemeral agnhost pod IN a tenant k8s cluster and has it TCP-connect to the
+# keystone public VIP. This is the exact test the dual-homed D-033 node FAILED and the
+# single-homed D-035 topology must PASS -- it proves a tenant workload egresses through
+# its router SNAT to the provider API VIP.
+#
+# Dynamic values (never hardcoded): the keystone VIP:port is derived from the tenant
+# cred's auth_url; the kubeconfig is the one tenant-acceptance leaves at
+# ~/tenant-<client>/kube/config, or fetched via `coe cluster config` AS THE TENANT.
+#
+# Mutation note: creates ONE short-lived agnhost pod and deletes it (even on failure).
+# Additive + self-cleaning, not destructive -> NOT --disruptive-gated (a post-restart
+# confidence run legitimately exercises tenant egress).
+#
+# Exit: 0 PASS (pod connected to the VIP) | 1 FAIL (egress blocked: timeout/refused)
+#       | 2 HOLD (no kubectl/creds/kubeconfig/VIP derivable).
+set -uo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=scripts/lib-validate.sh
+. "$HERE/../lib-validate.sh"
+ID=d011-03-vip-tenant; vr_begin "$ID"
+CLIENT="${VR_TENANT:-beta}"
+AGNHOST="${VR_AGNHOST:-registry.k8s.io/e2e-test-images/agnhost:2.47}"
+
+# python3 parses auth_url below, so it is required alongside kubectl and awk.
+vr_need kubectl awk python3 || { emit "$ID" "$VR_HOLD" "missing tool"; exit "$VR_HOLD"; }
+
+# --- derive keystone VIP:port from the tenant cred auth_url (dynamic) ---
+CF=""
+for c in "$HOME/tenant-${CLIENT}/${CLIENT}-cluster-cred.txt" \
+  "$HOME/tenant-${CLIENT}/${CLIENT}-svc-cred.txt"; do
+  [ -s "$c" ] && grep -qE '^auth_url=' "$c" && { CF="$c"; break; }
+done
+[ -n "$CF" ] || { emit "$ID" "$VR_HOLD" "no tenant cred for '$CLIENT' (auth_url source)"; exit "$VR_HOLD"; }
+# First auth_url= line only; keep everything after the key (a URL may itself contain "=").
+AUTHURL="$(awk '/^auth_url=/{sub(/^auth_url=/,""); print; exit}' "$CF")"
+VIP="$(python3 -c '
+import sys
+from urllib.parse import urlparse
+p=urlparse(sys.argv[1])
+port=p.port or (443 if p.scheme=="https" else 80)
+print("%s %d" % (p.hostname or "", port))' "$AUTHURL" 2>/dev/null)"
+VHOST="${VIP%% *}"; VPORT="${VIP##* }"
+[ -n "$VHOST" ] || { emit "$ID" "$VR_HOLD" "could not derive keystone VIP from auth_url"; exit "$VR_HOLD"; }
+[ -n "$VPORT" ] || { emit "$ID" "$VR_HOLD" "could not derive keystone port"; exit "$VR_HOLD"; }
+echo " target keystone VIP: $VHOST:$VPORT (from $CLIENT auth_url)"
+
+# --- obtain the tenant kubeconfig (reuse existing; else fetch as tenant) ---
+KDIR="$HOME/tenant-${CLIENT}/kube"; KCFG="$KDIR/config"
+if [ ! -s "$KCFG" ]; then
+  echo " no cached kubeconfig; fetching via coe cluster config (as tenant)"
+  command -v openstack >/dev/null || { emit "$ID" "$VR_HOLD" "no cached kubeconfig and openstack absent"; exit "$VR_HOLD"; }
+  mkdir -p "$KDIR"
+  # stderr is merged into the pipe so sed indents both streams; stdin comes from
+  # /dev/null so the client cannot block waiting for input.
+  ( vr_tenant_env "$CF" || exit 2
+    cd "$KDIR" && openstack coe cluster config "${CLIENT}-cluster" --dir "$KDIR" --force </dev/null 2>&1 ) | sed 's/^/ /'
+  [ -s "$KCFG" ] || { emit "$ID" "$VR_HOLD" "kubeconfig fetch produced no config"; exit "$VR_HOLD"; }
+  chmod 600 "$KCFG"
+fi
+export KUBECONFIG="$KCFG"
+kubectl version --client >/dev/null 2>&1 || { emit "$ID" "$VR_HOLD" "kubectl not functional"; exit "$VR_HOLD"; }
+
+# --- ephemeral agnhost pod: connect to the VIP; delete always ---
+POD="vip-probe-$$-$RANDOM"
+cleanup(){ kubectl delete pod "$POD" --ignore-not-found --now >/dev/null 2>&1 || true; }
+trap cleanup EXIT
+echo " launching probe pod $POD -> $VHOST:$VPORT"
+if ! run kubectl run "$POD" --image="$AGNHOST" --restart=Never --command -- \
+  /agnhost connect "$VHOST:$VPORT" --timeout=8s; then
+  emit "$ID" "$VR_HOLD" "could not create probe pod (image pull / RBAC?)"; exit "$VR_HOLD"
+fi
+# wait for terminal phase (Succeeded/Failed), up to ~60s
+PHASE=""
+for _ in $(seq 1 30); do
+  PHASE="$(kubectl get pod "$POD" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
+  case "$PHASE" in Succeeded|Failed) break;; esac
+  sleep 2
+done
+LOGS="$(kubectl logs "$POD" 2>/dev/null || true)"
+printf '%s\n' "$LOGS" | sed 's/^/ /'
+echo " pod phase: ${PHASE:-<none>}"
+
+if [ "$PHASE" = Succeeded ]; then
+  emit "$ID" "$VR_PASS" "tenant pod egress reached keystone VIP $VHOST:$VPORT (Option B proven)"; exit "$VR_PASS"
+fi
+if printf '%s\n' "$LOGS" | grep -qiE 'TIMEOUT|REFUSED|no route|i/o timeout'; then
+  emit "$ID" "$VR_FAIL" "tenant pod egress to $VHOST:$VPORT BLOCKED ($(printf '%s' "$LOGS" | tr -d '\n' | tail -c 40))"; exit "$VR_FAIL"
+fi
+if [ "$PHASE" = Failed ]; then
+  emit "$ID" "$VR_FAIL" "tenant pod egress to $VHOST:$VPORT failed (phase Failed)"; exit "$VR_FAIL"
+fi
+emit "$ID" "$VR_HOLD" "probe pod did not reach terminal phase (phase=${PHASE:-none})"; exit "$VR_HOLD"
diff --git a/skills/openstack-cloud-ops/openstack-cloud-ops.skill b/skills/openstack-cloud-ops/openstack-cloud-ops.skill
index e42006b..0bde5e3 100644
--- a/skills/openstack-cloud-ops/openstack-cloud-ops.skill
+++ b/skills/openstack-cloud-ops/openstack-cloud-ops.skill
Binary files differ