Newer
Older
openstack-caracal-ipv4 / scripts / phase-06-kubeconfig-gate.sh
#!/usr/bin/env bash
# scripts/phase-06-kubeconfig-gate.sh
#
# Phase-06 Step 6.5 encapsulated (D-056) with the DOCFIX-062 fix baked in.
# Runs on the jumphost.
#   1. Pull the mgmt cluster's admin kubeconfig to the jumphost.
#   2. DOCFIX-062: k8s-snap 1.32.13's `k8s config server=<url>` does NOT override
#      the emitted apiserver URL -- it writes the node's TENANT IP (unroutable from
#      the jumphost), so kubectl i/o-times-out. Fix: pull the RAW config, then
#      rewrite the server field to the FIP with `kubectl config set-cluster
#      --server` (a local file op; the cluster name is read dynamically). The FIP
#      is in the cert extra-sans (written by 6.4), so TLS holds against it.
#   3. Node check + GATE 2: the agnhost pod-egress probe to the Keystone PUBLIC
#      endpoint -- the exact test the dual-homed D-033 node FAILED; on this
#      single-NIC VM it must Complete with exitCode 0. Keystone host:port is the
#      as-run literal 10.12.4.50:5000 (6.5 tagged it verbatim); env-overridable
#      per site via KEYSTONE_HOSTPORT.
#
# [SENSITIVE] the kubeconfig it writes ($KUBECONFIG_OUT) holds a cluster-admin
# credential; it is created with mode 600 and kept on the jumphost.
# The throwaway probe pod is always cleaned up (even on gate failure).
#
# Tunables via env: ENVFILE SSH_KEY KUBECONFIG_OUT API_PORT KEYSTONE_HOSTPORT
#                   AGNHOST_IMAGE PROBE_TRIES PROBE_SLEEP
# Requires: jumphost; ssh + the VM key; kubectl; ~/capi-mgmt-net.env (from
#           phase-06-mgmt-vm.sh). All tunables DEFAULT to the as-run values.
# Usage:  bash scripts/phase-06-kubeconfig-gate.sh
# Exit:   0 GATE 2 pass (kubeconfig usable + pod egress works) | 1 gate fail | 2 precondition
# ASCII + LF.

# Strict mode: stop on an unhandled command failure, on expansion of an unset
# variable, and on a failure in any stage of a pipeline. inherit_errexit (when
# this bash has it) extends errexit into command substitutions; a shell that
# does not know the option simply carries on without it.
set -o errexit -o nounset -o pipefail
shopt -s inherit_errexit 2>/dev/null || :

# Tunables: a non-empty value already present in the environment wins;
# otherwise the default shown here is assigned.
: "${ENVFILE:=$HOME/capi-mgmt-net.env}"
: "${SSH_KEY:=$HOME/.ssh/id_ed25519}"
: "${KUBECONFIG_OUT:=$HOME/capi-mgmt.kubeconfig}"
: "${API_PORT:=6443}"
: "${AGNHOST_IMAGE:=registry.k8s.io/e2e-test-images/agnhost:2.40}"
: "${PROBE_TRIES:=20}"
: "${PROBE_SLEEP:=10}"

# Precondition checks. Any failure is reported on stderr as "FAIL: <reason>"
# and ends the run with exit status 2.
precondition_fail() {
  echo "FAIL: $1" >&2
  exit 2
}

# Both client tools must resolve on PATH.
for tool in ssh kubectl; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    precondition_fail "$tool not found"
  fi
done

# The env file must exist before it is sourced; it has to provide MGMT_FIP.
if [ ! -f "$ENVFILE" ]; then
  precondition_fail "$ENVFILE not found (run phase-06-mgmt-vm.sh first)"
fi
# shellcheck disable=SC1090
. "$ENVFILE"
if [ -z "${MGMT_FIP:-}" ]; then
  precondition_fail "MGMT_FIP unset in $ENVFILE"
fi

# The private key handed to ssh -i must be a regular file.
if [ ! -f "$SSH_KEY" ]; then
  precondition_fail "ssh key $SSH_KEY not found"
fi

# The mgmt VM is addressed by the floating IP read from the env file.
MGMT_VM="$MGMT_FIP"

# ssh options, held in an array so every option stays its own word when
# expanded: key file, non-interactive batch mode, no host-key check, no
# known_hosts file, and a 10 s connect timeout.
SSH_OPTS=(-i "$SSH_KEY")
SSH_OPTS+=(-o BatchMode=yes)
SSH_OPTS+=(-o StrictHostKeyChecking=no)
SSH_OPTS+=(-o UserKnownHostsFile=/dev/null)
SSH_OPTS+=(-o ConnectTimeout=10)

# --- Keystone public host:port -- the as-run literal is the default; a
#     non-empty KEYSTONE_HOSTPORT from the environment overrides it per site.
#     The value is NOT discovered. ---
KEYSTONE_HOSTPORT="${KEYSTONE_HOSTPORT:-10.12.4.50:5000}"
printf '%s\n' "[OK] Keystone public endpoint: $KEYSTONE_HOSTPORT"

# --- 1. pull the RAW admin kubeconfig (no server= arg; we rewrite locally) ---
# [SENSITIVE] the target file receives a cluster-admin credential. It is
# created (or, when it already exists, re-moded) to 0600 BEFORE ssh writes into
# it: `>` keeps the mode of an existing file, so a chmod issued only after the
# write would leave the fresh credential readable through a stale, wider mode
# until the chmod ran. umask 077 covers the freshly-created case.
echo "=== pull kubeconfig -> $KUBECONFIG_OUT ==="
umask 077
if ! { touch -- "$KUBECONFIG_OUT" && chmod 600 -- "$KUBECONFIG_OUT"; }; then
  echo "GATE FAIL: cannot create $KUBECONFIG_OUT with mode 600" >&2; exit 1
fi
# stdout (the kubeconfig) goes to the file; stderr is captured rather than
# discarded so the reason for a failed pull can be shown. Redirection order
# matters: `2>&1` first points stderr at the capture, then `>` moves stdout
# alone to the file. The capture is printed only on failure.
if ! pull_err=$(ssh "${SSH_OPTS[@]}" ubuntu@"$MGMT_VM" 'sudo k8s config </dev/null' 2>&1 >"$KUBECONFIG_OUT"); then
  if [ -n "$pull_err" ]; then
    printf '%s\n' "$pull_err" | sed 's/^/  /' >&2
  fi
  echo "GATE FAIL: could not pull kubeconfig from the mgmt VM" >&2; exit 1
fi
# Sanity checks: non-empty, and the first line carries the kubeconfig apiVersion.
[ -s "$KUBECONFIG_OUT" ] || { echo "GATE FAIL: $KUBECONFIG_OUT is empty" >&2; exit 1; }
head -n 1 "$KUBECONFIG_OUT" | grep -q 'apiVersion: v1' || { echo "GATE FAIL: $KUBECONFIG_OUT does not look like a kubeconfig" >&2; exit 1; }
echo "[OK] kubeconfig pulled ($(wc -l < "$KUBECONFIG_OUT") lines)"

# --- 2. DOCFIX-062: rewrite the server field to the FIP (routable; cert carries the FIP SAN) ---
# Every kubectl call from here on reads and writes the pulled file.
export KUBECONFIG="$KUBECONFIG_OUT"
# Under `set -e` a failing command substitution in a plain assignment aborts
# the script on the spot, so the lookup is guarded: a kubectl failure (for
# example the jsonpath index erroring on an empty clusters list) becomes an
# empty name and is reported by the check below instead of exiting silently.
CLUSTER=$(kubectl config view -o jsonpath='{.clusters[0].name}' 2>/dev/null) || CLUSTER=""
[ -n "$CLUSTER" ] || { echo "GATE FAIL: no cluster entry in $KUBECONFIG_OUT" >&2; exit 1; }
# Local file edit only; kubectl's own error text stays visible on stderr.
if ! kubectl config set-cluster "$CLUSTER" --server="https://${MGMT_FIP}:${API_PORT}" >/dev/null; then
  echo "GATE FAIL: kubectl could not rewrite the server field in $KUBECONFIG_OUT (DOCFIX-062)" >&2; exit 1
fi
# Confirm the file now holds exactly the FIP URL (dots in the address are
# escaped so they match literally in the regex).
grep -qE "^[[:space:]]*server:[[:space:]]*https://${MGMT_FIP//./\\.}:${API_PORT}\$" "$KUBECONFIG_OUT" \
  || { echo "GATE FAIL: server rewrite to https://${MGMT_FIP}:${API_PORT} did not take (DOCFIX-062)" >&2; exit 1; }
echo "[OK] kubeconfig server rewritten to https://${MGMT_FIP}:${API_PORT} (cluster '$CLUSTER')"

# --- 3a. node check ---
echo "=== node check ==="
# Only stdout (the node table) is captured. kubectl's stderr -- errors,
# warnings, "No resources found" -- passes straight through to the operator, so
# it can never be mistaken for a table row by the parser below.
if ! nodes=$(kubectl get nodes -o wide); then
  echo "GATE FAIL: kubectl cannot reach the apiserver via the FIP" >&2; exit 1
fi
printf '%s\n' "$nodes" | sed 's/^/  /'
# Line 1 is the header. Every further non-blank line is a node row whose second
# column (STATUS) must be exactly "Ready". awk exits 2 when there are no node
# rows at all (an empty cluster must not pass the gate), 1 when some row is not
# Ready, 0 otherwise.
node_verdict=0
printf '%s\n' "$nodes" \
  | awk 'NR>1 && NF { n++; if ($2 != "Ready") bad = 1 } END { if (n == 0) exit 2; exit bad ? 1 : 0 }' \
  || node_verdict=$?
case "$node_verdict" in
  0) ;;
  2) echo "GATE FAIL: no nodes are registered with the cluster" >&2; exit 1 ;;
  *) echo "GATE FAIL: a node is not Ready" >&2; exit 1 ;;
esac
echo "[OK] node(s) Ready"

# --- 3b. GATE 2: agnhost pod-egress probe to the Keystone public endpoint ---
echo "=== GATE 2: agnhost pod-egress probe -> $KEYSTONE_HOSTPORT ==="

# Best-effort removal of the throwaway probe pod: output is discarded and the
# function always returns success.
cleanup() {
  kubectl delete pod egress-test --now --ignore-not-found >/dev/null 2>&1 || true
}
# From here on the pod is removed on every exit path, gate failure included.
trap cleanup EXIT

# Clear any pod left over from an earlier run before creating a fresh one.
cleanup

# One-shot pod (never restarted) that runs `agnhost connect` against the
# Keystone endpoint with a 5 s timeout.
probe_cmd=(/agnhost connect "$KEYSTONE_HOSTPORT" --timeout=5s)
kubectl run egress-test --image="$AGNHOST_IMAGE" --restart=Never \
  --command -- "${probe_cmd[@]}" >/dev/null

# Print one jsonpath field of the probe pod. kubectl's stderr is discarded and
# its exit status is handed back to the caller.
probe_field() {
  kubectl get pod egress-test -o jsonpath="$1" 2>/dev/null
}

# Poll the probe pod up to PROBE_TRIES times, PROBE_SLEEP seconds apart.
# Succeeded ends the polling; Failed fails the gate immediately. A failed
# lookup shows up as phase '?' / empty state and the polling continues.
phase=""
state=""
for attempt in $(seq 1 "$PROBE_TRIES"); do
  phase=$(probe_field '{.status.phase}' || echo '?')
  state=$(probe_field '{.status.containerStatuses[0].state}' || echo '')
  echo "  [$attempt] phase=$phase state=$state"
  if [ "$phase" = Succeeded ]; then
    break
  elif [ "$phase" = Failed ]; then
    echo "GATE FAIL: probe pod Failed (egress to $KEYSTONE_HOSTPORT blocked)" >&2
    exit 1
  fi
  sleep "$PROBE_SLEEP"
done

# Verdict: the gate passes only when the last observed phase is Succeeded AND
# the recorded container state contains "exitCode":0. Anything else -- polling
# ran out, or a non-zero exit code -- fails the gate with the last observation.
if [ "$phase" != Succeeded ] || [[ "$state" != *'"exitCode":0'* ]]; then
  echo "GATE FAIL: probe pod did not reach Succeeded/exitCode 0 in $((PROBE_TRIES*PROBE_SLEEP))s (last: phase=$phase state=$state)" >&2
  exit 1
fi
echo "[OK] GATE 2 passed -- pod egress to $KEYSTONE_HOSTPORT returned exitCode 0 (D-035 proof)"

echo "Summary: kubeconfig usable via FIP; GATE 2 pod-egress proof passed. $KUBECONFIG_OUT ready for phase-07."