#!/usr/bin/env bash
# scripts/phase-07-conductor-graft.sh
#
# Phase-07 -- Magnum conductor graft (D-031 / D-037 / D-042 / D-046 / D-047),
# encapsulating the validated 2026-07-01 as-run (Steps 7.0-7.8) with DOCFIX-063
# baked in. Runs on the jumphost; every conductor-side action ships via
# `juju ssh -m <model> <conductor> ... </dev/null` (DOCFIX-021).
#
# DOCFIX-063 (folded here so it cannot be skipped):
#   1. 7.1 is VERIFY-FIRST: the phase-06 capi-mgmt-sg already opens tcp/6443, so
#      the conductor reaches the apiserver with NO per-conductor rule. This script
#      proves reachability; it does NOT add a hardcoded source (the pre-D-052
#      10.12.4.76 literal is gone). If 6443 is NOT reachable it FAILS LOUD with the
#      measured-source fallback instructions -- it never guesses a source.
#   2. The helm auth-proof runs AFTER the driver/helm install (7.4), not in 7.2
#      (helm is absent on a fresh conductor). Integrity (sha256) + reachability
#      (7.1 TCP) gate the kubeconfig transfer without it.
#   3. 7.3 probes served versions with `kubectl api-versions` (api-resources shows
#      only the PREFERRED version -> a false "v1beta1 not served").
#   4. 7.6 runs `install -d /etc/magnum/magnum.conf.d` before the tee (absent on a
#      fresh deploy; also the 7.7 --config-dir target).
#   5. ASCII checks `sudo` the grep (a non-sudo read of the root-owned path gave a
#      false "ASCII clean").
#   6. the helm egress pre-check hits a REAL asset (bare get.helm.sh/ 404s).
#
# [SENSITIVE] Step 7.2 base64-pipes the FIP-rewritten kubeconfig into a root-written
# 0600 file owned by the conductor user (magnum) and gates on a sha256 both-sides
# match. The kubeconfig holds a cluster-admin credential; it is never staged in /tmp.
#
# Health poll + create/delete regression are NOT in this script -- on a fresh deploy
# no cluster/template exists yet; phase-08 (D-011) is the superset acceptance.
#
# Tunables via env (all DEFAULT to the as-run measured values):
#   MODEL CONDUCTOR ENVFILE KUBECONFIG_SRC DRIVER_VERSION HELM_VERSION CHART_VERSION
#   API_PORT SG_NAME CAPI_PROJECT CAPI_PROJECT_DOMAIN ADMIN_OPENRC
# Requires: jumphost; juju (model reachable); openstack (admin-openrc); base64;
#           sha256sum; kubectl (7.3 served-version probe); ~/capi-mgmt-net.env
#           (MGMT_FIP, from phase-06); ~/capi-mgmt.kubeconfig.
# Usage:  bash scripts/phase-07-conductor-graft.sh
# Exit:   0 all phase-07 mechanisms in place | 1 gate fail | 2 precondition fail
# ASCII + LF.

# shellcheck disable=SC1090  # $ADMIN_OPENRC / $ENVFILE are intentionally dynamic source paths
set -o errexit -o nounset -o pipefail
# Strict mode above: stop on unhandled failures, unset variables and failed
# pipeline stages. inherit_errexit is enabled only where this bash offers it.
shopt -s inherit_errexit 2>/dev/null || true

# Tunables: a non-empty value already present in the environment wins; otherwise
# the default given here is assigned.
: "${MODEL:=openstack}"
: "${CONDUCTOR:=magnum/0}"
: "${ENVFILE:=$HOME/capi-mgmt-net.env}"
: "${KUBECONFIG_SRC:=$HOME/capi-mgmt.kubeconfig}"
: "${DRIVER_VERSION:=1.4.0}"
: "${HELM_VERSION:=v3.17.3}"
: "${CHART_VERSION:=0.25.1}"
: "${API_PORT:=6443}"
: "${SG_NAME:=capi-mgmt-sg}"
: "${CAPI_PROJECT:=capi-mgmt}"
: "${CAPI_PROJECT_DOMAIN:=capi}"
: "${ADMIN_OPENRC:=$HOME/admin-openrc}"

# Derived / fixed values (not tunable).
MAGNUM_APP="${CONDUCTOR%%/*}"            # application name = unit name up to the first "/"
CONF_DIR="/etc/magnum/magnum.conf.d"     # conf.d directory used by steps 7.6-7.8

# say: print a section banner (blank line, then "=== <args> ===") on stdout.
say() {
  local banner="$*"
  printf '\n=== %s ===\n' "$banner"
}
# ok: print a "[OK] <args>" success line on stdout.
ok() {
  local note="$*"
  printf '[OK] %s\n' "$note"
}
# die1: report a gate failure on stderr and terminate the script with status 1.
die1() {
  local reason="$*"
  printf 'GATE FAIL: %s\n' "$reason" >&2
  exit 1
}
# die2: report a precondition failure on stderr and terminate the script with status 2.
die2() {
  local reason="$*"
  printf 'PRECONDITION FAIL: %s\n' "$reason" >&2
  exit 2
}

# ---- helper: rc -- run one command string on the conductor (DOCFIX-021) ----
# Arguments: $1 - command line handed to `juju ssh` for unit $CONDUCTOR in model $MODEL.
# stdin is redirected from /dev/null, so nothing from the caller's stdin reaches juju.
# Returns:   the exit status of `juju ssh`.
rc() {
  local remote_cmd=$1
  juju ssh -m "$MODEL" "$CONDUCTOR" "$remote_cmd" </dev/null
}
# rcap -- best-effort capture variant of rc: forwards $1 to rc, discards rc's
# stderr and always returns 0, so `VAR=$(rcap ...)` never trips `set -e`.
# Why capture instead of piping juju straight into `grep -q`/`head`: under
# `set -o pipefail` that races -- grep closes the pipe on its first match, juju
# takes SIGPIPE and exits ~141, and pipefail then fails a gate whose match
# actually succeeded. Capture the output first, then test it with a here-string.
rcap() {
  local remote_cmd=$1
  if ! rc "$remote_cmd" 2>/dev/null; then
    return 0
  fi
}

# ============================ Preconditions ============================
# Every jumphost-side tool this script invokes must resolve before any work starts.
# kubectl is on the list because the 7.3 served-version probe runs it locally with
# its stderr discarded; if it were missing, that step would report a misleading
# "mgmt cluster unreachable" gate failure instead of a precondition failure.
for c in juju openstack base64 sha256sum kubectl; do
  command -v "$c" >/dev/null 2>&1 || die2 "$c not found on the jumphost"
done
# Required input files: admin credentials, the phase-06 env file, the kubeconfig.
[ -f "$ADMIN_OPENRC" ]   || die2 "$ADMIN_OPENRC not found"
[ -f "$ENVFILE" ]        || die2 "$ENVFILE not found (run phase-06 first)"
[ -s "$KUBECONFIG_SRC" ] || die2 "$KUBECONFIG_SRC not found/empty (run phase-06 6.5 first)"
# shellcheck disable=SC1090
. "$ENVFILE"
[ -n "${MGMT_FIP:-}" ]   || die2 "MGMT_FIP unset in $ENVFILE"
# The kubeconfig must already point at the FIP: match a `server:` line whose URL is
# exactly https://<MGMT_FIP>:<API_PORT> (dots in the address are escaped for the regex).
grep -qE "^[[:space:]]*server:[[:space:]]*https://${MGMT_FIP//./\\.}:${API_PORT}\$" "$KUBECONFIG_SRC" \
  || die2 "$KUBECONFIG_SRC server is not the FIP https://${MGMT_FIP}:${API_PORT} (phase-06 DOCFIX-062 rewrite missing)"
ok "preconditions met (model=$MODEL conductor=$CONDUCTOR MGMT_FIP=$MGMT_FIP driver=$DRIVER_VERSION)"

# ============================ 7.0 domain-setup (D-046) ============================
say "7.0 magnum trustee domain-setup (D-046; idempotent)"
# -m "$MODEL": run the action in the same model every `juju ssh` in this script
# targets, instead of whichever model happens to be current on the jumphost.
juju run -m "$MODEL" "${MAGNUM_APP}/leader" domain-setup </dev/null || die1 "domain-setup action failed"
# `|| true` on each capture: the subshell exits non-zero when the object is absent;
# without it, `set -e` aborts on the ASSIGNMENT before the descriptive die1 below.
# Each lookup sources the admin credentials inside its own subshell.
DOM_ID=$( ( . "$ADMIN_OPENRC"; openstack domain show magnum -f value -c id ) 2>/dev/null </dev/null || true )
USR_ID=$( ( . "$ADMIN_OPENRC"; openstack user show magnum_domain_admin --domain magnum -f value -c id ) 2>/dev/null </dev/null || true )
[ -n "$DOM_ID" ] || die1 "keystone domain 'magnum' absent after domain-setup"
[ -n "$USR_ID" ] || die1 "keystone user 'magnum_domain_admin' absent after domain-setup"
# Capture stdout+stderr of the listing, then grep the capture (no pipe -> no SIGPIPE gate race).
COE=$( ( . "$ADMIN_OPENRC"; openstack coe service list ) 2>&1 </dev/null || true )
grep -q 'magnum-conductor' <<<"$COE" || die1 "coe service list did not return magnum-conductor (trustee 403?)"
ok "domain magnum=$DOM_ID user magnum_domain_admin=$USR_ID; coe service list OK"

# ============================ 7.1 reachability (VERIFY-FIRST; DOCFIX-063) ============================
say "7.1 conductor -> mgmt apiserver reachability (verify-first; no hardcoded SG rule)"
# Resolve the CAPI project id first; it is quoted in the manual-fallback text below.
# `|| true` keeps a failed lookup from aborting on the assignment under `set -e`.
CAPI_PID=$(
  ( . "$ADMIN_OPENRC"; openstack project show "$CAPI_PROJECT" --domain "$CAPI_PROJECT_DOMAIN" -f value -c id ) \
    2>/dev/null </dev/null || true
)
if [ -z "$CAPI_PID" ]; then
  die1 "could not resolve project $CAPI_PROJECT in domain $CAPI_PROJECT_DOMAIN"
fi
# Probe from the conductor itself: open a TCP socket to the FIP with a 6 s limit
# and have the remote side print a TCP-OK / TCP-FAIL marker.
TCP=$(rc "timeout 6 bash -c 'exec 3<>/dev/tcp/${MGMT_FIP}/${API_PORT}' && echo TCP-OK || echo TCP-FAIL" || true)
if [[ "$TCP" == *TCP-OK* ]]; then
  ok "conductor reaches ${MGMT_FIP}:${API_PORT} (phase-06 capi-mgmt-sg already permits it; no rule added)"
else
  # Anything other than the OK marker (including an empty capture) is a gate failure.
  die1 "conductor cannot reach ${MGMT_FIP}:${API_PORT}. DOCFIX-063 fallback (manual): scope to project $CAPI_PID, MEASURE the source the mgmt VM sees from $CONDUCTOR (conntrack/listener on the VM), then 'openstack security group rule create --proto tcp --dst-port ${API_PORT} --remote-ip <measured-src>/32 $SG_NAME'. Never guess the source."
fi

# ============================ 7.2 kubeconfig -> conductor [SENSITIVE] ============================
say "7.2 place the FIP kubeconfig on the conductor [SENSITIVE]"
# Discover the account magnum-conductor runs as: the systemd unit's User= value
# first, then the owner of the live process as a fallback.
# `|| true` on each capture: under `set -e` + pipefail a failed probe would
# otherwise abort on the ASSIGNMENT, skipping both the fallback and the
# descriptive die1 below.
CUSER=$(rc "systemctl show magnum-conductor -p User --value" | tr -d '\r' || true)
[ -n "$CUSER" ] || CUSER=$(rc "ps -eo user:32,args | awk '/[m]agnum-conductor/{print \$1; exit}'" | tr -d '\r' || true)
[ -n "$CUSER" ] || die1 "could not determine the conductor service user"
# $CUSER is spliced unquoted into commands that run as root on the conductor, so
# accept only a single plain account token (no whitespace, quotes or shell
# metacharacters) before using it.
[[ "$CUSER" =~ ^[A-Za-z_][A-Za-z0-9_.-]*$ ]] \
  || die1 "conductor user '$CUSER' is not a plain account name (refusing to use it in a root shell)"
rc "getent passwd $CUSER >/dev/null" || die1 "conductor user '$CUSER' does not exist on the conductor"
ok "conductor user = $CUSER"
# base64-pipe: stdin IS the payload -> NO </dev/null on this juju ssh.
# The remote root shell decodes under umask 077, re-checks the account, then
# hands the file to $CUSER with mode 0600.
base64 "$KUBECONFIG_SRC" | juju ssh -m "$MODEL" "$CONDUCTOR" \
  "sudo bash -c 'umask 077; base64 -d > /etc/magnum/kubeconfig && \
   getent passwd $CUSER >/dev/null && chown $CUSER: /etc/magnum/kubeconfig && \
   chmod 0600 /etc/magnum/kubeconfig'" \
  || die1 "kubeconfig transfer to the conductor failed"
# Integrity gate: sha256 of the local source must equal sha256 of the placed file.
# `|| true` on the remote capture so a failed read reaches the die1 with both values shown.
L_SHA=$(sha256sum "$KUBECONFIG_SRC" | cut -d' ' -f1)
R_SHA=$(rc "sudo sha256sum /etc/magnum/kubeconfig" | cut -d' ' -f1 || true)
[ -n "$R_SHA" ] && [ "$L_SHA" = "$R_SHA" ] || die1 "kubeconfig sha256 mismatch (local=$L_SHA remote=$R_SHA)"
ok "kubeconfig on conductor: 0600 $CUSER, sha256 match ($L_SHA)"

# ============================ 7.3 served CAPI versions (DOCFIX-063 probe) ============================
say "7.3 confirm v1beta1 is SERVED per core CAPI group (kubectl api-versions)"
# Runs kubectl locally against $KUBECONFIG_SRC and keeps only the cluster.x-k8s.io
# group/version lines; `|| true` lets an empty result reach the die1 below.
SERVED=$(KUBECONFIG="$KUBECONFIG_SRC" kubectl api-versions 2>/dev/null | grep -E 'cluster\.x-k8s\.io/' | sort || true)
[ -n "$SERVED" ] || die1 "no cluster.x-k8s.io api-versions returned (mgmt cluster unreachable via $KUBECONFIG_SRC)"
printf '%s\n' "$SERVED"
for g in cluster.x-k8s.io controlplane.cluster.x-k8s.io bootstrap.cluster.x-k8s.io infrastructure.cluster.x-k8s.io; do
  # Here-string instead of `printf | grep -q`: no pipeline, so no SIGPIPE/pipefail
  # gate race. -F makes the dots in the group name literal; -x requires the
  # whole line to be exactly "<group>/v1beta1".
  grep -qxF "${g}/v1beta1" <<<"$SERVED" \
    || die1 "core group ${g} does NOT serve v1beta1 -- set an api_resources override for it in 7.6 (edit CHART/driver map)"
done
ok "v1beta1 served for all core CAPI groups; empty api_resources={} is correct (D-042 premise)"

# ============================ 7.4 driver + helm install ============================
say "7.4 install helm $HELM_VERSION + magnum-capi-helm $DRIVER_VERSION on the conductor"
# (a) egress pre-check -- REAL assets (DOCFIX-063: bare get.helm.sh/ 404s).
# Prints the HTTP code of each probe. The remote status is that of the LAST curl
# (the two are joined with `;`), and without -f an HTTP error code does not make
# curl fail. The explicit die1 keeps a transport failure on the documented
# exit-1 path with a message, instead of a bare `set -e` abort carrying curl's
# raw status.
rc "curl -s -o /dev/null -w 'pypi:%{http_code}\n' https://pypi.org/simple/ ; \
    curl -s -o /dev/null -w 'helm:%{http_code}\n' https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz.sha256sum" \
  || die1 "egress pre-check failed on the conductor (see the pypi:/helm: codes above)"
# (b) helm -- checksum-verified; /usr/local/bin + /usr/bin symlink (DOCFIX-035). WANT injected from the local tunable.
# The remote script skips the download when /usr/bin/helm already reports $WANT
# (matched as a fixed string, so the dots in the version are literal).
juju ssh -m "$MODEL" "$CONDUCTOR" "WANT='$HELM_VERSION'; "'set -e
  if [ -x /usr/bin/helm ] && /usr/bin/helm version --short 2>/dev/null | grep -qF "$WANT"; then
    echo "[SKIP] /usr/bin/helm already $WANT"
  else
    T=helm-$WANT-linux-amd64.tar.gz; D=$(mktemp -d); cd "$D"
    curl -fsSLO "https://get.helm.sh/$T"
    EXP=$(curl -fsSL "https://get.helm.sh/$T.sha256sum" | cut -d" " -f1)
    GOT=$(sha256sum "$T" | cut -d" " -f1)
    [ -n "$EXP" ] && [ "$EXP" = "$GOT" ] || { echo "GATE FAIL: helm checksum exp=$EXP got=$GOT"; exit 1; }
    tar xzf "$T"
    sudo install -o root -g root -m 0755 linux-amd64/helm /usr/local/bin/helm
    sudo ln -sfn /usr/local/bin/helm /usr/bin/helm
    cd /; rm -rf "$D"; echo "[OK] installed $(/usr/bin/helm version --short)"
  fi' </dev/null || die1 "helm install/verify failed on the conductor"
# (c) DOCFIX-035 GATE: helm resolves from the RESTRICTED init PATH (no /usr/local/bin)
PATHCHK=$(rcap "env -i PATH=/usr/sbin:/usr/bin:/sbin:/bin sh -c 'command -v helm && helm version --short'")
grep -q '/usr/bin/helm' <<<"$PATHCHK" || die1 "helm not on the conductor's restricted init PATH (DOCFIX-035)"
# (d) driver -- pinned version, installed without dependency resolution (--no-deps)
rc "sudo python3 -m pip install --no-deps --upgrade 'magnum-capi-helm==$DRIVER_VERSION'" \
  || die1 "pip install magnum-capi-helm==$DRIVER_VERSION failed"
# (e) verify version + entry point. -F: the version is a literal string, not a
# regex (an unescaped "." would match any character).
grep -qF -- "$DRIVER_VERSION" <<<"$(rcap "pip show magnum-capi-helm 2>/dev/null | grep -E '^Version:'")" \
  || die1 "installed magnum-capi-helm is not $DRIVER_VERSION"
grep -q 'k8s_capi_helm_v1' <<<"$(rcap "python3 -c \"import importlib.metadata as m; print([e.name for e in m.entry_points(group='magnum.drivers')])\"")" \
  || die1 "k8s_capi_helm_v1 entry point missing after install"
ok "helm $HELM_VERSION (restricted PATH) + magnum-capi-helm $DRIVER_VERSION; entry point present"

# ---- moved 7.2 auth-proof (helm now present) ----
say "7.2/7.4 end-to-end auth proof (helm list -A as $CUSER via the FIP)"
# helm runs as $CUSER with the cluster-admin kubeconfig. Its HOME is a private
# `mktemp -d` directory created by that user and removed afterwards, rather than
# the shared world-writable /tmp: helm reads and writes config/cache/plugin paths
# under HOME, and those should not be plantable by other local accounts.
# The remote shell exits with helm's own status; `|| true` lets the grep below
# decide the gate with the captured output shown.
AUTH=$(rc "sudo -u $CUSER sh -c 'h=\$(mktemp -d) || exit 1; HOME=\"\$h\" helm --kubeconfig /etc/magnum/kubeconfig list -A; r=\$?; rm -rf -- \"\$h\"; exit \$r'" || true)
printf '%s\n' "$AUTH"
grep -q 'cert-manager' <<<"$AUTH" || die1 "conductor could not auth/list mgmt-cluster releases (expected cert-manager et al.)"
ok "conductor authenticates to the mgmt cluster; releases listed"

# ============================ 7.6 [capi_helm] drop-in (D-037) ============================
say "7.6 stage the [capi_helm] conf.d drop-in (D-037)"
# DOCFIX-063: dir absent on fresh deploy. Explicit die1 so a failure exits 1 with
# a message instead of a bare `set -e` abort.
rc "sudo install -d -o root -g root -m 0755 $CONF_DIR" \
  || die1 "could not create $CONF_DIR on the conductor"
CONF_CONTENT="[capi_helm]
kubeconfig_file = /etc/magnum/kubeconfig
helm_chart_repo = https://azimuth-cloud.github.io/capi-helm-charts
helm_chart_name = openstack-cluster
default_helm_chart_version = $CHART_VERSION
api_resources = {}"
# stdin IS the payload -> NO </dev/null
printf '%s\n' "$CONF_CONTENT" | juju ssh -m "$MODEL" "$CONDUCTOR" \
  "sudo tee $CONF_DIR/00-capi-helm.conf >/dev/null" || die1 "writing 00-capi-helm.conf failed"
rc "sudo chmod 0644 $CONF_DIR/00-capi-helm.conf" \
  || die1 "chmod 0644 on 00-capi-helm.conf failed"
# verify content + perms + ASCII (DOCFIX-063: sudo the grep)
rc "sudo grep -q '^default_helm_chart_version = $CHART_VERSION\$' $CONF_DIR/00-capi-helm.conf" \
  || die1 "00-capi-helm.conf missing default_helm_chart_version = $CHART_VERSION"
# ASCII gate: grep exits 0 when a non-ASCII byte is found and 1 when none is.
# Any other status (grep error, ssh failure) means the check did not run, which
# must NOT be reported as "ASCII clean".
ASCII_RC=0
rc "sudo env LC_ALL=C grep -nP '[^\x00-\x7F]' $CONF_DIR/00-capi-helm.conf" || ASCII_RC=$?
case "$ASCII_RC" in
  1) ;;
  0) die1 "00-capi-helm.conf has non-ASCII bytes" ;;
  *) die1 "ASCII check on 00-capi-helm.conf could not run (exit $ASCII_RC); not assuming ASCII clean" ;;
esac
ok "00-capi-helm.conf staged (chart $CHART_VERSION, api_resources={}, ASCII clean)"

# ============================ 7.7 conductor --config-dir (D-037) ============================
say "7.7 wire --config-dir into the conductor via /etc/default (LSB init)"
# Write the single DAEMON_ARGS line through tee (stdin closed via rc) and set 0644.
# NOTE(review): tee without -a replaces the whole file, whereas 7.7b appends to
# /etc/default/magnum-api only when the flag is missing -- confirm that
# overwriting /etc/default/magnum-conductor is intended.
if ! rc "echo 'DAEMON_ARGS=\"\$DAEMON_ARGS --config-dir $CONF_DIR\"' | sudo tee /etc/default/magnum-conductor >/dev/null && \
   sudo chmod 0644 /etc/default/magnum-conductor"; then
  die1 "writing /etc/default/magnum-conductor failed"
fi
# Gate: the init script's show-args output must contain the flag.
if ! grep -q -- "--config-dir $CONF_DIR" <<<"$(rcap "/etc/init.d/magnum-conductor show-args")"; then
  die1 "show-args does not assemble --config-dir (conductor /etc/default not sourced)"
fi
ok "conductor /etc/default wired; show-args carries --config-dir"

# ============================ 7.7b keystone v3 for magnum-api (D-047) ============================
say "7.7b force keystone v3 for magnum-api via drop-in (D-047)"
# The here-doc below is the stdin payload of a root bash on the conductor. Its
# delimiter is unquoted, so CONF_DIR is expanded locally while every
# backslash-escaped dollar sign is left for the remote shell. The remote script:
#   1. appends the --config-dir DAEMON_ARGS line to /etc/default/magnum-api only
#      if no such line is present, then sets the file to 0644;
#   2. reads www_authenticate_uri and auth_url from /etc/magnum/magnum.conf (first
#      match at or after the [keystone_authtoken] header);
#   3. refuses to continue when either value is empty -- otherwise the /v3
#      rewrite would turn an empty value into a bare "/v3" URL in the override;
#   4. rewrites a /v2.0 component to /v3 (or appends /v3) and writes both values
#      into 50-keystone-v3-override.conf for both keystone sections.
juju ssh -m "$MODEL" "$CONDUCTOR" sudo bash -s <<REOF || die1 "keystone-v3 drop-in failed"
set -e
grep -q -- '--config-dir $CONF_DIR' /etc/default/magnum-api 2>/dev/null \
  || echo 'DAEMON_ARGS="\$DAEMON_ARGS --config-dir $CONF_DIR"' >> /etc/default/magnum-api
chmod 0644 /etc/default/magnum-api
WWW=\$(awk -F'= ' '/^\[keystone_authtoken\]/{s=1} s&&/^www_authenticate_uri/{print \$2; exit}' /etc/magnum/magnum.conf)
AURL=\$(awk -F'= ' '/^\[keystone_authtoken\]/{s=1} s&&/^auth_url/{print \$2; exit}' /etc/magnum/magnum.conf)
[ -n "\$WWW" ] && [ -n "\$AURL" ] \
  || { echo "GATE FAIL: could not read www_authenticate_uri/auth_url from /etc/magnum/magnum.conf" >&2; exit 1; }
WWW3=\${WWW/\/v2.0//v3};   case "\$WWW3"  in */v3) ;; *) WWW3="\${WWW3%/}/v3";;  esac
AURL3=\${AURL/\/v2.0//v3}; case "\$AURL3" in */v3) ;; *) AURL3="\${AURL3%/}/v3";; esac
printf '[keystone_authtoken]\nauth_version = v3\nwww_authenticate_uri = %s\nauth_url = %s\n[keystone_auth]\nauth_version = v3\nwww_authenticate_uri = %s\nauth_url = %s\n' \
  "\$WWW3" "\$AURL3" "\$WWW3" "\$AURL3" > $CONF_DIR/50-keystone-v3-override.conf
chmod 0644 $CONF_DIR/50-keystone-v3-override.conf
REOF
# Gates: exactly two `auth_version = v3` lines in the override (one per section)
# and exactly one --config-dir line in /etc/default/magnum-api.
[ "$(rcap "sudo grep -c '^auth_version = v3\$' $CONF_DIR/50-keystone-v3-override.conf")" = 2 ] \
  || die1 "50-keystone-v3-override.conf missing auth_version=v3 in both sections"
[ "$(rcap "sudo grep -c -- '--config-dir $CONF_DIR' /etc/default/magnum-api")" = 1 ] \
  || die1 "/etc/default/magnum-api does not carry exactly one --config-dir line"
ok "keystone-v3 override written (both sections v3); magnum-api /etc/default wired"

# ============================ 7.8 restart + driver enabled ============================
say "7.8 restart conductor + api; verify both live cmdlines carry --config-dir"
# Restart both units, wait 3 s, then capture one `is-active` line per unit.
ACT=$(rcap "sudo systemctl restart magnum-conductor magnum-api && sleep 3 && systemctl is-active magnum-conductor magnum-api")
if [ "$(grep -c '^active$' <<<"$ACT")" != 2 ]; then
  die1 "magnum-conductor and/or magnum-api not active after restart"
fi
# Each entry is "<process name>:<label used in the failure message>"; the first
# args line reported by ps for that process must carry the flag.
for pair in "magnum-conductor:conductor" "magnum-api:magnum-api"; do
  svc=${pair%%:*}
  label=${pair#*:}
  if ! grep -q -- "--config-dir $CONF_DIR" <<<"$(rcap "ps -ww -C $svc -o args= | head -1")"; then
    die1 "running $label cmdline lacks --config-dir after restart"
  fi
done
# Final gate: the driver must appear in the driver-manage listing.
if ! grep -q 'k8s_capi_helm_v1' <<<"$(rcap "sudo magnum-driver-manage list-drivers 2>/dev/null")"; then
  die1 "k8s_capi_helm_v1 not enabled in magnum-driver-manage list-drivers"
fi
ok "both services active with --config-dir; k8s_capi_helm_v1 enabled"

# Completion banner, a two-line pointer to phase-08, and an explicit success exit.
say "PHASE-07 COMPLETE"
printf '%s\n' \
  "All conductor-graft mechanisms in place. HEALTHY poll + create/delete regression are" \
  "phase-08 (D-011) -- no cluster/template exists yet on a fresh deploy."
exit 0
