#!/usr/bin/env bash
# scripts/phase-06-k8s-bootstrap.sh
#
# Phase-06 Steps 6.3 + 6.4 encapsulated (D-056). Runs on the jumphost; drives the
# in-cloud CAPI management VM over ssh.
#   6.3 GATE 1 -- prove the single-homed VM's egress: it can reach the OpenStack
#       public API (the D-035 premise) and the internet (image pulls). The API
#       target is the Keystone PUBLIC endpoint -- the as-run literal 10.12.4.50:5000
#       (6.3 tagged ENV(keystone-vip)); env-overridable per site via KEYSTONE_HOSTPORT.
#   6.4 -- install k8s-snap on the VM and bootstrap it. The bootstrap config MUST
#       carry a cluster-config block (DOCFIX-024 -- without it network+dns are
#       disabled and the node never goes Ready). extra-sans MUST be the real
#       FIP + tenant IP (from ~/capi-mgmt-net.env, per-rebuild, DOCFIX-038).
#
# One-shot -- matches the as-run 6.4 block verbatim (NO idempotency guard): install +
# bootstrap run unconditionally. Re-run is not safe; purge on the VM first (retry hint
# below), exactly the runbook's documented retry path.
# DOCFIX-021: every remote `sudo` gets </dev/null (remote `bash -s` reads the
# script from stdin, so an unredirected `sudo` would consume the script body).
#
# Tunables via env (all DEFAULT to the as-run values): ENVFILE SSH_KEY CHANNEL POD_CIDR
#                   SVC_CIDR CLUSTER_NAME KEYSTONE_HOSTPORT INET_PROBE PROBE_TIMEOUT
#                   BOOT_TIMEOUT READY_TIMEOUT
# Requires: jumphost; ssh + the VM key; ~/capi-mgmt-net.env (from phase-06-mgmt-vm.sh).
# Usage:  bash scripts/phase-06-k8s-bootstrap.sh
# Exit:   0 egress gate PASS + k8s ready | 1 gate fail | 2 precondition
# ASCII + LF.

# Strict mode: stop on unhandled errors, on unset variables, and on a failure in
# any pipeline stage.
set -o errexit -o nounset -o pipefail
# Let command substitutions inherit errexit; tolerated silently when the running
# bash does not know this option.
shopt -s inherit_errexit 2>/dev/null || true

# Tunables: a non-empty value from the environment wins, otherwise the as-run
# default is assigned.
: "${ENVFILE:=$HOME/capi-mgmt-net.env}"
: "${SSH_KEY:=$HOME/.ssh/id_ed25519}"
: "${CHANNEL:=1.32-classic/stable}"
: "${POD_CIDR:=10.1.0.0/16}"
: "${SVC_CIDR:=10.152.183.0/24}"
: "${CLUSTER_NAME:=capi-mgmt-v2}"
: "${INET_PROBE:=1.1.1.1:443}"
: "${PROBE_TIMEOUT:=6}"
: "${BOOT_TIMEOUT:=10m}"
: "${READY_TIMEOUT:=5m}"

# Report a failed precondition on stderr ("FAIL: <reason>") and exit with status 2.
#   $1 - reason text
precondition_fail() {
  echo "FAIL: $1" >&2
  exit 2
}

# Preconditions, checked in order; the first one that fails ends the run.
if ! command -v ssh >/dev/null 2>&1; then
  precondition_fail "ssh not found"
fi
if [ ! -f "$ENVFILE" ]; then
  precondition_fail "$ENVFILE not found (run phase-06-mgmt-vm.sh first)"
fi
# The env file is expected to define MGMT_FIP and MGMT_TENANT_IP (verified below).
# shellcheck disable=SC1090
source "$ENVFILE"
if [ -z "${MGMT_FIP:-}" ]; then
  precondition_fail "MGMT_FIP unset in $ENVFILE"
fi
if [ -z "${MGMT_TENANT_IP:-}" ]; then
  precondition_fail "MGMT_TENANT_IP unset in $ENVFILE"
fi
if [ ! -f "$SSH_KEY" ]; then
  precondition_fail "ssh key $SSH_KEY not found"
fi

# The VM is reached through its floating IP.
MGMT_VM="$MGMT_FIP"
# Non-interactive ssh with a 10 s connect timeout. The host key is neither
# verified nor recorded -- presumably because the VM (and its FIP) is rebuilt
# between runs; confirm before reusing these options elsewhere.
SSH_OPTS=(
  -i "$SSH_KEY"
  -o BatchMode=yes
  -o StrictHostKeyChecking=no
  -o UserKnownHostsFile=/dev/null
  -o ConnectTimeout=10
)

# --- Keystone public host:port -- the as-run literal (6.3 tagged ENV(keystone-vip));
#     env-overridable per site. NOT discovered -- this is the value that ran verbatim. ---
KEYSTONE_HOSTPORT="${KEYSTONE_HOSTPORT:-10.12.4.50:5000}"

# Validate BOTH probe targets before anything is sent to the VM, so a malformed
# value is reported as a precondition failure (exit 2) instead of surfacing later
# as a misleading gate failure. A valid spec contains a ':' with a non-empty host
# before the first ':' and a non-empty port after the last ':', and no whitespace.
# The port is not forced to be numeric (bash's /dev/tcp also accepts service names).
for hp_var in KEYSTONE_HOSTPORT INET_PROBE; do
  hp_val="${!hp_var}"
  hp_host="${hp_val%%:*}"
  hp_port="${hp_val##*:}"
  if [[ "$hp_val" != *:* || -z "$hp_host" || -z "$hp_port" || "$hp_val" == *[[:space:]]* ]]; then
    echo "FAIL: bad $hp_var '$hp_val' (want host:port)" >&2; exit 2
  fi
done
unset hp_var hp_val hp_host hp_port

# Only announce the endpoint once it is known to be well-formed.
echo "[OK] Keystone public endpoint: $KEYSTONE_HOSTPORT"
# Host = text before the first ':'; port = text after the last ':'.
KHOST="${KEYSTONE_HOSTPORT%%:*}"; KPORT="${KEYSTONE_HOSTPORT##*:}"
IHOST="${INET_PROBE%%:*}";        IPORT="${INET_PROBE##*:}"

# --- 6.3 GATE 1: VM egress (API VIP + internet) ---
# Feeds a small probe script to a remote `bash -s` over ssh. The script tries a TCP
# connect from the VM to the Keystone endpoint and to the internet probe target and
# prints "GATE1: PASS" only when both succeed. This script exits 1 when that marker
# is absent from the captured output.
echo "=== 6.3 GATE 1: VM -> Keystone $KHOST:$KPORT + internet $IHOST:$IPORT ==="
# ssh joins its command words with spaces and the remote login shell parses the
# result again, so local quoting does not survive. Shell-quote every argument (%q)
# so an empty or unusual value cannot shift or split the remote positional
# parameters; `--` stops the remote bash from reading a value as an option.
printf -v g1_args '%q ' "$KHOST" "$KPORT" "$IHOST" "$IPORT" "$PROBE_TIMEOUT"
# `|| true` is deliberate: an ssh or probe failure must fall through to the marker
# check below (which prints the captured output) rather than abort under set -e.
g1=$(ssh "${SSH_OPTS[@]}" ubuntu@"$MGMT_VM" \
       "bash -s -- ${g1_args}" <<'REOF' 2>&1 || true
set -u
khost="$1"; kport="$2"; ihost="$3"; iport="$4"; t="$5"; ok=1
# probe LABEL HOST PORT -- attempt a TCP connect within $t (timeout(1) duration).
# Prints "LABEL-OK host:port" and returns 0 on success, else "LABEL-FAIL host:port"
# and returns 1. Host and port reach the inner bash as positional parameters, so
# they are never interpreted as shell code.
probe() {
  if timeout "$t" bash -c 'exec 3<>"/dev/tcp/$1/$2"' _ "$2" "$3" 2>/dev/null; then
    echo "$1-OK $2:$3"
  else
    echo "$1-FAIL $2:$3"
    return 1
  fi
}
probe VIP "$khost" "$kport" || ok=0
probe NET "$ihost" "$iport" || ok=0
if [ "$ok" = 1 ]; then echo "GATE1: PASS"; else echo "GATE1: FAIL"; fi
REOF
)
# Show the remote output indented by two spaces.
sed 's/^/  /' <<<"$g1"
# Here-string instead of `printf | grep -q`: no pipeline, so an early grep exit can
# never turn into a pipefail (SIGPIPE) false negative.
grep -q 'GATE1: PASS' <<<"$g1" || { echo "GATE FAIL: VM egress probe did not pass (see above)" >&2; exit 1; }
echo "[OK] GATE 1 passed -- single-NIC VM egress to the OpenStack public API works (D-035 premise)"

# --- 6.4 k8s-snap install + bootstrap ---
# Feeds the install/bootstrap script to a remote `bash -s` over ssh. The remote
# script runs under `set -euo pipefail`, so the final "BOOT: READY" marker is only
# printed when install, config write, bootstrap and the readiness wait all
# succeeded. Output is captured and shown after the ssh session ends (no live
# progress). This script exits 1 when the marker is absent.
echo "=== 6.4 k8s-snap install + bootstrap ($CHANNEL) ==="
# ssh joins its command words with spaces and the remote login shell parses the
# result again, so local quoting does not survive. Shell-quote every argument (%q)
# so an empty or unusual value cannot shift or split the remote positional
# parameters; `--` stops the remote bash from reading a value as an option.
printf -v boot_args '%q ' "$MGMT_FIP" "$MGMT_TENANT_IP" "$CHANNEL" "$POD_CIDR" \
  "$SVC_CIDR" "$CLUSTER_NAME" "$BOOT_TIMEOUT" "$READY_TIMEOUT"
# `|| true` is deliberate: a remote failure must fall through to the marker check
# below (which prints the captured output and the retry hint).
b=$(ssh "${SSH_OPTS[@]}" ubuntu@"$MGMT_VM" \
      "bash -s -- ${boot_args}" <<'REOF' 2>&1 || true
set -euo pipefail
FIP="$1"; TENANT="$2"; CH="$3"; POD="$4"; SVC="$5"; NAME="$6"; BT="$7"; RT="$8"

echo "=== install k8s snap $CH ==="
sudo snap install k8s --classic --channel="$CH" </dev/null

echo "=== write bootstrap config (DOCFIX-024: cluster-config block REQUIRED) ==="
# stdin of this sudo is the CFG here-doc (expanded on the VM), not the script body.
sudo tee /root/bootstrap-config.yaml >/dev/null <<CFG
cluster-config:
  network:
    enabled: true
  dns:
    enabled: true
pod-cidr: $POD
service-cidr: $SVC
extra-sans:
- $FIP
- $TENANT
CFG
# DOCFIX-021: this sudo must not inherit the script-carrying stdin either.
sudo cat /root/bootstrap-config.yaml </dev/null

echo "=== bootstrap (timeout $BT) ==="
sudo k8s bootstrap --name "$NAME" --file /root/bootstrap-config.yaml --timeout "$BT" </dev/null

echo "=== status ==="
sudo k8s status --wait-ready --timeout "$RT" </dev/null
echo "BOOT: READY"
REOF
)
# Show the remote output indented by two spaces.
sed 's/^/  /' <<<"$b"
# Here-string instead of `printf | grep -q`: no pipeline, so an early grep exit can
# never turn into a pipefail (SIGPIPE) false negative.
grep -q 'BOOT: READY' <<<"$b" || {
  echo "GATE FAIL: k8s did not reach ready (see above)." >&2
  echo "  Retry on the VM: sudo snap remove k8s --purge </dev/null   then re-run this script." >&2
  exit 1; }

echo "Summary: GATE 1 PASS; k8s ($CHANNEL) bootstrapped and ready on $CLUSTER_NAME (FIP $MGMT_FIP / tenant $MGMT_TENANT_IP)."
