diff --git a/bundle.yaml b/bundle.yaml index 93c676b..c6bd4b4 100644 --- a/bundle.yaml +++ b/bundle.yaml @@ -118,7 +118,11 @@ vault: charm: vault - channel: 1.8/stable + # D-068 (2026-07-02): pinned 1.8/stable -> 1.16/stable. 1.8.8 is EOL. NB: on the LIVE env this + # is a MAJOR upgrade (multi-minor jump; requires unseal keys ready, storage compat check, and + # re-unseal after restart) -- NOT a casual `juju refresh`. Verify channel exists + rehearse the + # upgrade before applying live. Clean pin for Roosevelt/redeploy. See D-068. + channel: 1.16/stable num_units: 1 # 3 on Roosevelt (D-009); HA backend decided there (C1) to: [lxd:11] bindings: @@ -126,6 +130,14 @@ access: metal-internal certificates: metal-internal cluster: metal-internal + # BUNDLEFIX-007 (D-067 amendment, 2026-07-02): external MUST equal access. The charm computes + # vault_url (access) AND vault_url_external (external) and, because the vault-kv interface + # ignores remote_binding (deprecated, LP#1895185), the external publish CLOBBERS vault_url on + # ALL kv consumer relations. If external falls to the '' default (metal-admin), every consumer + # (barbican-vault) is told to dial the metal-admin address while its AppRole secret_id is + # CIDR-bound to its metal-internal /32 -> deterministic login reject -> Barbican 500. + # Present on charm-vault stable/1.8 AND master (checked 2026-07-02) -- keep this line on upgrades. + external: metal-internal ha: metal-internal secrets: metal-internal shared-db: metal-internal diff --git a/docs/DOCFIX-064-phase08-changelist.md b/docs/DOCFIX-064-phase08-changelist.md new file mode 100644 index 0000000..d7224b7 --- /dev/null +++ b/docs/DOCFIX-064-phase08-changelist.md @@ -0,0 +1,81 @@ +# DOCFIX-064 -- phase-08 runbook change-list (DRAFT, 2026-07-01) + +RESERVED number: DOCFIX-064 (per changelog next-free note). This is the accumulated +phase-08 operator-runbook (single-consumer acceptance) sweep. 
Written as a change-LIST with +exact anchors + evidence so the edit is mechanical when phase-08 is finalized. NOT yet applied +to runbooks/phase-08-workload-cluster-acceptance.md. + +Scope note: these are fixes to the OPERATOR single-consumer acceptance path (capi-test-1 in +capi-mgmt scope). The multi-tenant tenant->cluster flow is a SEPARATE deliverable +(tenant-onboarding-v2-DRAFT.md). Some items overlap (image --public, image-by-UUID, template +ownership scope) because both paths hit them. + +-------------------------------------------------------------------------------- +## Items +-------------------------------------------------------------------------------- + +1. IMAGE SEED MUST create the image `--public` [Step 8.0, image create] + Evidence: a shared/owner-only kube image causes magnum cluster/template create to fail with + `Cluster type (vm, Unset, kubernetes) not supported` -- a non-owner (or the driver acting in + another project) cannot read `os_distro`, so type-derivation returns Unset. Fix: the seed + `openstack image create ... --public` (and re-verify visibility=public post-create). + +2. SEED HARDENING [Step 8.0] + - curl with retry + connect/max timeout (fail loud on partial/hung download). + - sha512 verify against the published manifest is a hard GATE (already present -- keep). + - poll image to `active` as a hard-gate loop (not a fixed sleep). + - POST-active property re-verify: kube_version, os_distro, visibility=public, disk_format. + +3. IMAGE-ABSENT PRESENCE GUARD [Step 8.0] + Explicitly branch "image present -> verify props" vs "absent -> seed", so a re-run does not + double-seed and a present-but-wrong-visibility image is caught (ties to item 1). + +4. IMAGE BY UUID, not name [Step 8.0 template create; 8.1] + Evidence: a doubled-quoted image NAME resolved to the literal `'name'` (no image) -> Unset + type -> 400. Passing the resolved UUID removes the quoting/resolution surface. 
Gate the UUID + with `grep -qE '^[0-9a-f-]{36}$'` before use. + +5. TEMPLATE CREATE -- OWNER PROJECT SCOPE [Step 8.0] + Evidence: `coe cluster template create/show` and `cluster create --cluster-template <name>` + resolve the template within the CALLER'S project (templates are visible by ownership). + A private template created in capi-mgmt is NOT selectable by name from admin scope (create + 404s while `template list` still shows it). Fix: run the template create AND the cluster + create in the SAME project scope that owns the template (capi-mgmt for the operator path). + Add the capi-mgmt scope preamble (resolve `capi-mgmt` --domain capi dynamically; export + OS_PROJECT_ID) before both. + +6. FLAVOR-FLOOR PRE-CHECK [Step 8.0 template create] + Magnum requires master/node flavors >= 2 vcpu and >= 2048 MB. Pre-check the chosen flavors + against the floor and fail loud, rather than surfacing an opaque driver error later. + +7. OCTAVIA PREREQ -- CAPTURE REAL EXIT [Prerequisites / Step 8.0] + The octavia-healthy probe must capture the actual command result and test it, NOT + `... | head || echo` (which masks failure -- head succeeds on empty input). Same + capture-and-test-result discipline applied across the onboarding v2 blocks. + +8. 8.1 PRE-CHECKS -- D-039 role + keypair [Step 8.1] + Before cluster create, assert (a) the trustor holds member + load-balancer_member (+ reader) + on the cluster project (D-039 -- else CAPO 403s at the Octavia LB step), and (b) the keypair + exists in the creating scope. Fail loud pre-create. + +9. POLICYD ZIP PATH UNDER $HOME (snap confinement) [appendix-C section C.3] + Evidence: `juju attach-resource ... /tmp/overrides.zip` failed "no such file or directory" + though the shell saw the file -- the confined juju snap cannot read /tmp. Build the zip under + $HOME. Also: `zip` is absent on the jumphost -- build via python3 zipfile (arcname=top-level). 
+ Fix appendix-C C.3 to use a $HOME path and the python3 zipfile method (currently shows + `zip -j /tmp/overrides.zip`). + +-------------------------------------------------------------------------------- +## Cross-doc corrections (already staged in this package) +-------------------------------------------------------------------------------- +- appendix-C: manager domain-enumeration is own-domain-only on this cloud (2.5d finding); + the cloud-wide names-only leak does NOT manifest. (Applied in appendix-C-identity-rbac.md here.) +- appendix-D: cluster-create trust model; D.7 status updated (Stages 1-4 validated, Stage 6 + create_trust outstanding). Needs committing (was packaged, not yet in repo). + +-------------------------------------------------------------------------------- +## Sequencing +-------------------------------------------------------------------------------- +Apply items 1-8 to phase-08 and item 9 to appendix-C only AFTER Stage 6 (create_trust) is +resolved -- if the multi-tenant trust step surfaces a further phase-08-relevant fix (e.g. a +CONF.trust.roles pin), fold it into the same DOCFIX-064 sweep rather than reopening. diff --git a/docs/design-decisions.md b/docs/design-decisions.md index 45452c2..c4a63e7 100644 --- a/docs/design-decisions.md +++ b/docs/design-decisions.md @@ -1159,3 +1159,227 @@ **Related:** D-051 (reconciled here), D-046 (magnum trustee domain), D-039 (per-cluster app-cred roles), D-050 (resolved by supplying the zip), scs-0302-w1 (the authoritative standard), appendix-C (identity/RBAC reference). **Supersedes:** D-051's [LIVE-READ PENDING] gate (discharged). + + +--- + +## D-065: Fix identity:create_trust non-resolving policy template (Magnum multi-tenant cluster blocker) + +**Status:** ADOPTED 2026-07-02. Extends the D-064 defect-class fix to the keystone trust family. +Behavioral acceptance = a tenant service identity clears create_trust and the cluster converges +(pending re-attach + retest at adoption). 
+ +**Context / trigger:** multi-tenant validation (tenant acme). A tenant service identity (acme-svc, +app cred, holding member + load-balancer_member on acme-prod) creating a Magnum cluster failed at +create_trust with identity:create_trust 403 -- AFTER D-064 unblocked create_user (the trustee user +was created). The SAME 403 reproduced (a) for admin via password auth, and (b) via a DIRECT +`openstack trust create` with trustor == caller == self, no magnum in the path. That proves it is +identity-independent and not a magnum bug. + +**Root cause:** the charm-rendered base policy.json defines +identity:create_trust = "user_id:%(trust.trustor_user_id)s" -- the legacy NON-target-prefixed +template that does NOT resolve on Caracal (keystone populates target.trust.trustor_user_id). The +rule evaluates user_id == "" -> always false -> every trust create 403s regardless of caller. Same +defect class as D-064 (create_user's %(user.domain_id)s vs the resolving %(target.user.domain_id)s). +Keystone's OWN shipped default is the target-prefixed form: +"user_id:%(target.trust.trustor_user_id)s". + +**Masking:** create_trust was never reached before D-064 because create_user 403'd first. The +2026-06-09 CREATE_COMPLETE was on the pre-2026-06-11-teardown cloud; this redeploy's create_trust +had never been exercised until D-064 opened the path. + +**Causation proof (verify-before-fix):** with use-policyd-override=false (base policy only), the +direct trust create STILL 403s -> the D-064 override is NOT the cause; the base charm policy owns +the bug. Override re-enabled immediately (PO: restored) -- the cloud was not left without the +manager persona. + +**Decision:** add identity:create_trust = "user_id:%(target.trust.trustor_user_id)s" to the D-064 +override (policies/domain-manager-policy.yaml). Exactly the keystone-shipped default, and +consistent with this cloud's own already-correct sibling rules (list_trusts_for_trustor/trustee). 
+No admin fallback (matches the shipped default; trusts are self-delegation only). No manager branch +(create_trust is universal self-delegation, not part of the domain-manager persona). + +**Scope (evidence-gated; NOT a blanket family sweep):** create_trust is the ONLY demonstrably +broken trust rule. list_trusts_for_trustor / list_trusts_for_trustee already carry the +target-prefixed form (match shipped). get_trust / delete_trust / list_trusts / list_roles_for_trust +/ get_role_for_trust are live "" (empty) but code-guarded in keystone (_trustor_trustee_only; +delete checks user==trustor or is_admin) -> functional, not broken, not blockers; left unchanged. + +**Second-check clearance:** keystone's create_trust also enforces _require_trustor_has_role_in_project +(trustor must hold each delegated role on the project). Magnum delegates context.roles (the caller's +token roles), which by construction are a subset of what the caller holds on the scoped project -> +passes. acme-svc holds member + load-balancer_member on acme-prod; confirmed clear on retest. + +**Validation:** oslo.policy parses all 38 rules; YAML + ASCII + connector lint clean. + +**PROPOSED / OPEN (separate hardening, NOT actioned here):** live identity:list_trusts = "" lets any +authenticated user enumerate ALL trusts (trustor/trustee/project relationships) cloud-wide -- an +info-disclosure the newer keystone default tightens to +"rule:admin_required or (role:reader and system_scope:all)". This is a security-posture change, not +a functional blocker (create/get/delete are code-guarded), so it is recorded as an open item to rule +on rather than bundled into this blocker fix. If adopted, add list_trusts (and optionally the read +rules) at the shipped defaults in the same override. + +**Roosevelt:** carry the create_trust override with the rest of the D-064 policy. 
The base-charm +non-resolving-template defect is upstream-worthy -- report against the keystone charm's rendered +policy.json for Caracal (2024.1). Remove on 2024.2+ (native secure-RBAC ships the resolving defaults). + +**Related:** D-064 (same defect class; same override file), D-039 (the trustor roles that satisfy the +trustor-has-role check), D-046 (magnum trustee domain), D-035 (mgmt cluster), appendix-D (trust +model). **Revises:** appendix-D section D.3 -- the role-delegation hypothesis is REFUTED (a +clean-role tenant identity still 403'd; the cause was the non-resolving policy template, not role +delegation). appendix-D to be corrected on finalize. + + +--- + +## D-066: Tenant account model (Option-3 split); cluster-create requires PASSWORD auth + +**Status:** ADOPTED 2026-07-02. Standard per-tenant identity set, used from the first tenant so +v1 and Roosevelt onboard identically (no interim model to retrofit). Trust path validated live +through create_user + create_trust + cert-gen entry (see D-064/D-065); full cluster completion is +gated on D-067 (Barbican/Vault substrate). + +**The per-tenant accounts (operator creates the domain + manager; manager creates the rest):** +- `${CLIENT}-domain-admin` -- `manager` on the domain (SCS Domain Manager, D-051/D-064). Tenant + IAM self-service. Operator-provisioned; password handed over. +- `${CLIENT}-cluster` -- `member` + `load-balancer_member` on `${CLIENT}-prod`. PASSWORD auth only. + Sole purpose: `coe cluster` lifecycle (this is the identity that mints the Keystone trust). Also + owns whatever magnum-internal creds the driver mints under it (the per-cluster CAPO child cred, + D-039). Manager-created. +- `${CLIENT}-svc` -- `member` + `load-balancer_member` on `${CLIENT}-prod`. UNRESTRICTED app cred, + for tenant-authored non-trust automation (CI/pipelines). NOT used for cluster-create. Manager-created. 
+ +**Why password (not app cred) for cluster-create -- the hard constraint:** keystone on this build +blocks trust creation from application-credential tokens unconditionally. Source +(keystone/api/trusts.py `_check_application_credential`, read live 2026-07-02): if the token method +is `application_credential` it raises "Using method 'application_credential' is not allowed for +managing trusts" -- and the docstring states this applies "regardless of the 'unrestricted' flag" +(this build is STRICTER than upstream, which exempts unrestricted). Confirmed live: an unrestricted +app cred (which minted a child cred, so genuinely unrestricted) still hit this block; the same +identity via PASSWORD passed it. So the cluster-creator must be password-authenticated. + +**Security rationale (this is the correct side of the control, not a workaround):** the block exists +to stop a single-project app-cred token from extending the trust delegation chain beyond its scope. +A password token is full-user scope, so a trust minted from it delegates only what the user already +holds (member + load-balancer_member on its own project) -- no escalation, confined to the tenant. +Splitting `${CLIENT}-cluster` (trust-capable password, cluster ops only) from `${CLIENT}-svc` (app cred, everything +else) isolates the trust-capable credential to one identity doing one job. + +**REJECTED:** `CONF.security_compliance.allow_insecure_application_credential_trust_escalation` +(the escape hatch in `_check_application_credential`). Enabling a setting named "insecure" + +"trust_escalation" -- which defeats a control designed to contain a compromised tenant credential -- +undercuts the hard-isolation thesis of the whole build. Not adopted. + +**Roosevelt:** onboard every tenant with this three-account set. Script it (scripts/tenant-onboard.sh, +DRAFT this session). 
CAPO-cred ownership under `${CLIENT}-cluster` confirmed acceptable (magnum-internal, +created/destroyed with the cluster) -- the empirical "whose identity mints the child cred" check +is pending the D-067 substrate fix (cluster-create dies at cert-gen before the driver's mint step). + +**Related:** D-051/D-064 (manager persona), D-065 (create_trust template fix), D-039 (trustor roles). +**Revises:** appendix-D D.3 (the app-cred-creator assumption is wrong -- corrected in appendix-D this session). + +--- + +## D-067: barbican-vault -> Vault (vault-kv) must use the metal-internal plane (live drift; the cert-gen blocker) + +**Status:** ADOPTED 2026-07-02 (fix pending next session -- a live rebind, gated). This is the defect +that blocks multi-tenant cluster COMPLETION (cluster-create clears trust, then dies generating certs). + +**Symptom chain:** tenant cluster-create (password identity) cleared create_user + create_trust, +then `CREATE_FAILED: Failed to create certificates`. Root cause traced through magnum -> Barbican +(`POST /v1/secrets` 500) -> castellan vault_key_manager `_build_auth_headers` -> Vault AppRole login +rejected. Vault's RAW error (not an HTTP code -- the authoritative read): +`source address "10.12.8.176" unauthorized through CIDR restrictions on the secret ID`. + +**Root cause:** barbican reaches Vault at Vault's METAL-ADMIN address (vault_url=http://10.12.8.190:8200, +egress src 10.12.8.176 via eth2). Vault's barbican-vault AppRole binds secret_id to the metal-internal +CIDR (where service-to-service traffic belongs, D-052/D-053). Off-plane source -> Vault rejects. This +is NOT secret_id expiry: `juju run vault/leader refresh-secrets` DID rotate the secret_id (barbican.conf +re-rendered, barbican restarted) and the login STILL failed with the CIDR error. The earlier +"expired TTL" reading was WRONG -- corrected here. 
+ +**The bundle is CORRECT; the LIVE env drifted.** bundle.yaml already binds every secrets-path endpoint +to metal-internal: vault `secrets: metal-internal` (L130), barbican `secrets: metal-internal` (L667), +barbican-vault `secrets-storage: metal-internal` (L700). So a redeploy from the bundle would NOT have +this defect. The live deployment's effective binding for the barbican<->vault secrets path resolves to +metal-admin -- a drift from the bundle (mechanism: TBD by read-only `juju show-application` binding + +relation-data diagnosis next session; candidate causes: bindings added to bundle post-deploy and never +applied live, or the subordinate address-advertisement not following the bound space). + +**Fix (rebind, do NOT widen the CIDR):** reconcile the live binding to the bundle so barbican egresses +from its metal-internal address (10.12.12.110), which the AppRole CIDR already trusts. Widening the +AppRole CIDR to metal-admin is REJECTED: it loosens a security control, legitimizes east-west traffic +on the wrong plane, gets reverted by the charm, and leaves the next Vault consumer exposed. Next-session +sequence: (1) read-only binding diagnosis (`juju show-application vault barbican barbican-vault`, +spaces<->subnets map); (2) gated `juju bind` / relation refresh to metal-internal; (3) re-run +refresh-secrets if needed; (4) confirm barbican AppRole login = HTTP 200 from a metal-internal source; +(5) re-run tenant cluster-create -> cert-gen clears. + +**Related:** D-052/D-053 (six-plane network; metal-internal carries east-west service traffic), +D-057 (prior topology-didn't-follow-binding defect -- same family). **Corrects:** the in-session +"secret_id TTL expiry" hypothesis (refuted by the refresh-secrets test). 
+ +### D-067 -- AMENDMENT (2026-07-02, post-fix): corrected mechanism; FIXED live; CLOSED for v1 + +The "bundle correct / live drifted" framing above is WRONG and is corrected here (append-only +discipline; the body above stands as the historical record). Root cause, read from charm source at +the exact vendored versions: + +1. Live bindings were NEVER drifted -- `juju show-application` matched the bundle exactly (all + secrets-path endpoints metal-internal). The drift was one relation-data key: vault/0 advertised + binding-correct ingress (10.12.12.117) but an explicit `vault_url` of http://10.12.8.190:8200. +2. Mechanism: charm-vault `send_vault_url_and_ca()` (re-fires on every non-update-status hook) + computes vault_url from the `access` binding AND vault_url_external from the `external` binding, + publishing BOTH when they differ. The vault-kv interface (vendored commit 6f7848c, per + src/build.lock) IGNORES the `remote_binding` selector (deprecated, LP#1895185) and writes the + single `vault_url` key on ALL relations -- so the external publish CLOBBERS the access URL for + every kv consumer. Simultaneously the AppRole secret_id is CIDR-bound to the CONSUMER's + secrets-relation ingress /32 (10.12.12.110/32) -- self-inconsistent whenever access != external. +3. Why external resolved to metal-admin: the bundle OMITS vault's `external` binding, so it falls + to the `''` default (metal-admin). D-052's judgment call ("vault external -> metal-admin, + operator/unseal path") rested on a mistaken premise: in this charm the `external` endpoint's only + functional use is this kv-URL advertisement (plus an inert VIP check); operator/unseal access + does not traverse it, and the listener binds [::]:8200 regardless. So a fresh deploy from the + bundle REPRODUCES the failure -- this was a bundle+charm defect, not live drift. + +Fix as executed (2026-07-02, gated): `juju bind -m openstack vault external=metal-internal`. 
The two +computed URLs became equal, the charm's own equality guard suppressed the second publish, relation +data flipped to http://10.12.12.117:8200 within one poll cycle, barbican-vault re-rendered +barbican.conf, and no refresh-secrets was needed. Validated end-to-end: AppRole login HTTP 200 from +the authentic source (kernel route src 10.12.12.110), then a full admin `openstack secret` +store/get/payload-compare/delete round-trip through the exact POST /v1/secrets path that 500'd. +Bundle hardened with BUNDLEFIX-007 (`external: metal-internal` + comment); the double-publish is +present on charm-vault master as of 2026-07-02, so the guard is permanent, including the D-068 +1.16 pin. + +Observation logged during validation (not actioned): barbican's `Secret href` renders as +https://None:9312/... (host_href unset/None). Cosmetic for barbicanclient/castellan consumers +(clients extract the UUID and dial their own catalog endpoint) -- stage-6 cluster cert refs will +exercise it for real; investigate only if cert-ref retrieval misbehaves. + +**Status:** CLOSED for v1 (fixed + validated live 2026-07-02). **Adds:** BUNDLEFIX-007. +**Corrects:** the "bundle is CORRECT; the LIVE env drifted" mechanism above. **Related:** D-052 +(premise correction noted), D-068 (carry BUNDLEFIX-007 through the 1.16 pin). + +--- + +## D-068: PROPOSED -- Vault substrate hardening (Roosevelt) + +**Status:** PROPOSED / OPEN 2026-07-02. Grouped substrate items surfaced while diagnosing D-067. +None block v1 tenant work; all are Roosevelt-durability items. + +1. **Vault version.** Live is 1.8.8 (charm 1.8/stable) -- EOL. bundle.yaml pinned to 1.16/stable + this session (D-068). Live upgrade is a MAJOR operation (multi-minor jump; unseal keys ready, + storage-format compat, re-unseal after restart, ideally rehearsed) -- NOT a casual `juju refresh`. + Verify the channel exists and rehearse before applying live. +2. 
**Vault over cleartext HTTP.** vault_url is `http://...:8200`; barbican<->Vault secret writes + (including cluster CA private keys) cross metal-internal unencrypted. Enable Vault listener TLS + for Roosevelt. +3. **AppRole credential lifecycle.** `refresh-secrets` exists as a manual action -> implies no + automatic secret_id renewal. Audit secret_id TTLs across all vault-kv consumers; provision + long/renewable TTLs or charm auto-renewal; add a proactive health probe that validates each + consumer's Vault auth (the T4-style login check) BEFORE expiry, rather than discovering failure + via a tenant's failed cluster-create. + +**Related:** D-067 (surfaced these), D-052/D-053. diff --git a/docs/session-findings-2026-07-02.md b/docs/session-findings-2026-07-02.md new file mode 100644 index 0000000..2805d16 --- /dev/null +++ b/docs/session-findings-2026-07-02.md @@ -0,0 +1,77 @@ +# Session findings -- 2026-07-02 (multi-tenant tenant->cluster buildout) + +## Executive summary +The tenant IDENTITY/TRUST path is DONE and PROVEN. A tenant password identity now creates a Magnum +cluster through create_user (D-064), create_trust (D-065), and into certificate generation. Cluster +COMPLETION is blocked one step later by an OPERATOR-side Barbican/Vault substrate defect (D-067), +independent of the tenant model. The Option-3 tenant account model (D-066) is adopted and to be used +from the first tenant. Next session: fix D-067 (live), then full tenant buildout + tenant-facing tests. + +## The trust-blocker chain (how we got from "cluster 403s" to "done + one substrate bug") +1. D-064 (prior): create_user template fix unblocked trustee-user creation. +2. create_trust then 403'd for EVERY caller (admin included), even trustor==self via direct + `openstack trust create`. Root cause: base policy shipped identity:create_trust with the + non-resolving `user_id:%(trust.trustor_user_id)s` (Caracal populates target.trust.trustor_user_id). 
+ -> D-065: override with the target-prefixed form keystone itself ships. PROVEN by toggling the + override off (still 403 -> base policy owns it) then on. +3. After D-065, create_trust via APP CRED still failed: keystone `_check_application_credential` + blocks trust creation from app-cred tokens "regardless of the unrestricted flag" (this build's + docstring). -> D-066: cluster-create MUST be PASSWORD auth; adopt Option-3 account split. + `allow_insecure_application_credential_trust_escalation` REJECTED (isolation). +4. Password create_trust PASSED. Cluster then failed at cert-gen -> Barbican 500 -> castellan + vault_key_manager -> Vault AppRole login rejected: "source address 10.12.8.176 unauthorized + through CIDR restrictions". -> D-067. + +## D-067 root cause (and a corrected mis-diagnosis) +barbican reaches Vault on the METAL-ADMIN plane (vault_url=10.12.8.190, egress 10.12.8.176). Vault's +barbican-vault AppRole binds the secret_id to the METAL-INTERNAL CIDR (where east-west service traffic +belongs, D-052/D-053). Off-plane source -> rejected. The bundle is CORRECT (vault/barbican/barbican-vault +all bind secrets endpoints to metal-internal, lines 130/667/700); the LIVE env drifted. Fix = live +rebind to metal-internal (gated, next session), NOT CIDR-widen. +CORRECTED: mid-session I hypothesized "secret_id TTL expiry". REFUTED -- `juju run vault/leader +refresh-secrets` rotated the secret_id (barbican.conf re-rendered, service restarted) and the login +STILL failed with the CIDR error. It was never expiry; it is plane/CIDR. + +## What is validated live (tenant acme) +- Manager persona self-service via CLI (create_project/user/grant) -- D-064 G3. PASS. +- Tenant isolation: anti-escalation (admin grant DENIED); cross-domain resource reads DENIED/hidden; + domain enumeration OWN-DOMAIN-ONLY (tighter than appendix-C's SCS worst-case -- appendix-C corrected). 
+- App-cred + keypair self-mint; tenant L3 (net/subnet/router/ext-gw, SNAT proven) by a non-admin + app-cred identity. +- Cluster template create (image by UUID -- name form has a quoting/derivation hazard). +- Cluster create through create_user + create_trust (password) into cert-gen. + +## Decisions logged +- D-066: Option-3 tenant accounts (domain-admin/cluster/svc); cluster-create requires password auth. +- D-067: barbican-vault -> Vault must use metal-internal (live drift; the cert-gen blocker). ADOPTED, fix pending. +- D-068: PROPOSED -- Vault substrate hardening (1.16 pin [bundle done], TLS, AppRole lifecycle). + +## Probe-discipline lessons (now runbook conventions -- these recurred and cost time) +1. Validate raw output WHOLE, never extract-then-check. A `tr -dc 0-9` MARK guard turned an error + string ("...10.12.8.30:17070...") into MARK=123101283017070 and passed. Use `case "$raw" in + ''|*[!0-9]*) fail;; *) ok;; esac`. +2. Whitelist-print secrets, never blacklist-redact. `approle_secret_id` leaked past a `secret`-keyed + redact (the key is *_secret_id*). Print only an allowlist of safe fields; never pipe secrets. +3. No `exit`/bare-`return` in interactive PASTE blocks (they escape to the login shell and logged the + operator out). Subshell-wrap `( ... )`. NOTE: executed .sh scripts may use exit normally. +4. Privileged reads over `juju ssh` use `sudo cat file | ...`, never `sudo cmd < file` (the redirect + runs UNPRIVILEGED -> Permission denied). +5. Use the deployment's DECLARED endpoint/scheme, not the conventional one (assumed Vault https; it + serves http -- every probe errored on scheme until corrected). +6. A parser that can print NOTHING has a silent third state -- read raw + self-report inputs (field + lengths, raw body) so a malformed-request 400 can't masquerade as an auth failure. 
+ +## Roosevelt hardening backlog (from this session) +- D-067/D-068: metal-internal binding discipline for ALL vault-kv consumers; Vault 1.16 + TLS; + AppRole secret_id lifecycle (TTL/renewal + proactive auth health probe). +- Endpoint/credential "follow the topology" is now a recurring class (with D-057): consider stable + VIP/DNS endpoints for substrate services so leader/re-IP changes don't silently break consumers. + +## Next session plan +1. Repair live env: read-only binding diagnosis (`juju show-application vault barbican barbican-vault`, + spaces<->subnets), then GATED `juju bind` of the barbican<->Vault secrets path to metal-internal; + re-run refresh-secrets if needed; confirm barbican AppRole login HTTP 200 from metal-internal. +2. Re-run tenant cluster-create (acme, ${CLIENT}-cluster password) -> cert-gen clears -> watch to + CREATE_COMPLETE; capture the CAPO child-cred mint identity (confirms D-066). +3. Full tenant buildout via scripts/tenant-onboard.sh; then clean-room `beta` (zero admin fallback). +4. Tenant-facing tests: kubeconfig, nodes/CNI/CCM, a tenant LB, tenant isolation from a second tenant. diff --git a/docs/v1-redeploy-changelog.md b/docs/v1-redeploy-changelog.md index b32e51a..e76b485 100644 --- a/docs/v1-redeploy-changelog.md +++ b/docs/v1-redeploy-changelog.md @@ -1306,10 +1306,97 @@ manager can enumerate domain + role names cloud-wide (list_domains/list_roles); remove override on any 2024.2+ upgrade. +## 2026-07-02 -- multi-tenant trust blocker -> D-065 (create_trust template fix) + +Multi-tenant validation (tenant acme, manager persona) reached cluster-create and failed at +identity:create_trust 403 -- AFTER D-064 unblocked create_user (trustee user created). Reproduced +identity-independently: admin (password) and a direct `openstack trust create` with trustor==self +both 403. 
Root cause: base charm policy.json ships create_trust with the non-resolving +user_id:%(trust.trustor_user_id)s (Caracal populates target.trust.trustor_user_id) -> evaluates +false for everyone. Same defect class as D-064. Causation proven by toggling use-policyd-override=false +(still 403 -> base policy owns it, override exonerated), then re-enabling (PO: restored). + +Fix (D-065): added identity:create_trust = user_id:%(target.trust.trustor_user_id)s (keystone's +shipped default) to the override. create_trust is the ONLY broken trust rule (list_trusts_for_* already +target-prefixed; get/delete/list/roles-for-trust are empty but code-guarded -> left unchanged -- +evidence-gated, not a blanket sweep). oslo.policy parses all 38 rules; ASCII+LF+connector clean. +appendix-D D.3 (role-delegation hypothesis) REFUTED and to be revised. list_trusts="" info-disclosure +recorded PROPOSED/OPEN, not actioned. + +## 2026-07-02 -- multi-tenant tenant->cluster buildout: trust path PROVEN; Barbican/Vault substrate blocker + +Ran a full multi-tenant buildout (tenant `acme`) validating the SCS Domain Manager persona end to +end and the tenant cluster-create path. Outcomes: +- IDENTITY/TRUST PATH DONE + PROVEN: D-064 (create_user) + D-065 (create_trust template fix) + the + D-066 Option-3 password model carried cluster-create through create_user, create_trust, and INTO + cert generation. Manager self-service (create_project/user/grant) validated via CLI (G3); tenant + isolation validated (anti-escalation DENY; cross-domain DENY; domain-enumeration own-domain-only, + tighter than appendix-C's SCS worst-case -- appendix-C already corrected). +- KEY CONSTRAINT (D-066): keystone blocks trust creation from app-cred tokens regardless of + unrestricted (this build's _check_application_credential). Cluster-creator MUST be password-auth. + Adopted Option-3 3-account model (domain-admin / cluster / svc). allow_insecure_...escalation REJECTED. 
+- BLOCKER (D-067): cluster-create dies at cert-gen -- Barbican 500 -> Vault AppRole login rejected by + CIDR restriction ("source address 10.12.8.176 unauthorized"). barbican->Vault uses metal-admin; + should be metal-internal (D-052/D-053). Bundle is CORRECT (all secrets endpoints metal-internal); + LIVE drifted. Fix = live rebind (gated, next session), NOT CIDR-widen. refresh-secrets rotated the + secret_id but did not fix it (CIDR, not expiry -- "TTL" hypothesis refuted). +- D-068 PROPOSED: Vault substrate hardening (1.16 pin [bundle done], TLS, AppRole lifecycle). + +Package this session: bundle vault 1.16/stable pin; D-066/067/068; appendix-D corrected (D.3 refuted); +appendix-C account table -> Option-3; tenant-onboarding-v2 updated to Option-3 + password-create + +substrate note; scripts/tenant-onboard.sh (DRAFT); session-findings doc (incl. probe-discipline lessons). + +PROBE-DISCIPLINE lessons logged (recurred this session; now runbook conventions): (1) validate raw +output WHOLE, never extract-then-check (a `tr -dc 0-9` MARK guard synthesized a fake number from an +error string); (2) whitelist-print secrets, never blacklist-redact (an `approle_secret_id` leaked past +a `secret`-keyed redact); (3) no `exit`/bare-`return` in interactive paste -- subshell-wrap (an `exit` +logged the operator out); (4) privileged reads over juju ssh use `sudo cat`, never `sudo cmd < file` +(unprivileged redirect -> Permission denied); (5) read the deployment's DECLARED endpoint/scheme, not +the conventional one (assumed Vault https; it serves http). + ### Next-free numbers -Design decision: D-065. Doc fix: DOCFIX-065. (D-064 ASSIGNED above = reconcile D-051 to scs-0302 +Design decision: D-069. Doc fix: DOCFIX-065. (D-064 ASSIGNED above = reconcile D-051 to scs-0302 + create-op templating fix. 
DOCFIX-064 RESERVED = phase-08 runbook sweep (image --public; seed retry/timeout + poll hard-gate + post-active property re-verify; image-absent guard; template capi-mgmt scope preamble + flavor floor; 8.1 D-039 role + keypair pre-checks; octavia prereq real-exit capture), to be written at phase-08 close. D-063 = capi-mgmt-sg 0.0.0.0/0 hardening, PROPOSED/OPEN. DOCFIX-063 = phase-07 reconciliation, six fixes.) + +## 2026-07-02 (session 2) -- D-067 FIXED live + CLOSED; root cause corrected to bundle+charm defect + +Fixed the Barbican/Vault cert-gen blocker with ONE gated mutation: +`juju bind vault external=metal-internal`. Root cause CORRECTED from "live drift" to bundle+charm: +the bundle omitted vault's `external` binding (-> '' default metal-admin); charm-vault publishes +vault_url (access) then vault_url_external (external), and the vault-kv interface ignores +remote_binding (LP#1895185, verified at vendored commit 6f7848c), so the external URL clobbers +vault_url on all kv consumers while the AppRole stays CIDR-bound to the consumer's metal-internal +/32. Diagnosis was code-grounded (charm-vault stable/1.8 + build.lock-pinned interface), gated: +Gate 1 (config/network-get/hacluster preflight) -> Gate 2 (bind + relation-data + conf-render +polls; base64-to-file delivery after a raw-paste truncation) -> Gate 3 (AppRole login 200 from +authentic source .110 + admin secret store/get/payload/delete round-trip). No refresh-secrets +needed; no CIDR widening. BUNDLEFIX-007 adds `external: metal-internal` permanently (double-publish +present on charm-vault master). D-052's "vault external -> metal-admin operator path" premise +corrected (endpoint's only functional use is the kv URL advertisement). D-067 amendment appended; +tenant-onboard.sh stage-6 gating note updated to RESOLVED. 
+ +Also this session: handoff commit 22a1eef verified 7/8 (bundle.yaml was DELETED by the loose-file +commit pattern; restored + verified in 10e9186 -- diff vs pre-delete is exactly the vault 1.16 pin). +Observation (logged, not actioned): barbican Secret href renders https://None:9312/... (host_href +None) -- cosmetic for UUID-extracting clients; watch during stage-6 cert refs. + +### Next-free numbers +Design decision: D-069. Doc fix: DOCFIX-065 (unchanged). Bundle fix: BUNDLEFIX-008 (007 ASSIGNED +above; repo tree grep showed 002-004 but 001-006 exist in history, so 007 is the first safe free). + +### 2026-07-02 (session 2, addendum) -- tenant-onboard.sh Option-3 keypair defect fixed pre-run + +Code-review catch before the first clean-room run: stage3 created the nova keypair as -svc while +stage6 creates the cluster as -cluster. Nova keypairs are USER-scoped; magnum validates the keypair +in the REQUEST context (magnum 18.0.0 attr_validator.validate_keypair -> cli.nova().keypairs.get, +invoked from cluster.py:545 at cluster create) and node boot resolves it as the TRUSTOR (the +per-cluster trust app-cred impersonates the trustor). A -svc-owned key is invisible to -cluster -> +deterministic KeyPairNotFound 400 at stage6. Option-3-specific: all prior validations used a single +identity that owned its own key. Fix: keypair created by -cluster (stage3, second subshell); +template create (stage5) drops --keypair (keypair_id optional, default=None -- verified in source); +stage6 already supplies --keypair as -cluster. Rejected alternative: creating the template as +-cluster (blurs the Option-3 svc/cluster division for no gain). 
diff --git a/policies/domain-manager-policy.yaml b/policies/domain-manager-policy.yaml index 893b9df..796995b 100644 --- a/policies/domain-manager-policy.yaml +++ b/policies/domain-manager-policy.yaml @@ -92,6 +92,7 @@ "identity:list_projects": "(rule:is_domain_manager and token.domain.id:%(target.domain_id)s) or rule:cloud_admin or rule:admin_and_matching_domain_id" "identity:get_project": "(rule:is_domain_manager and token.domain.id:%(target.project.domain_id)s) or rule:cloud_admin or rule:admin_and_matching_target_project_domain_id or project_id:%(target.project.id)s" "identity:create_project": "(rule:is_domain_manager and token.domain.id:%(target.project.domain_id)s) or rule:cloud_admin or rule:admin_and_matching_target_project_domain_id" +"identity:create_trust": "user_id:%(target.trust.trustor_user_id)s" "identity:update_project": "(rule:is_domain_manager and token.domain.id:%(target.project.domain_id)s) or rule:cloud_admin or rule:admin_and_matching_target_project_domain_id" "identity:delete_project": "(rule:is_domain_manager and token.domain.id:%(target.project.domain_id)s) or rule:cloud_admin or rule:admin_and_matching_target_project_domain_id" "identity:list_user_projects": "(rule:is_domain_manager and token.domain.id:%(target.user.domain_id)s) or rule:owner or rule:admin_and_matching_domain_id" @@ -115,3 +116,4 @@ "identity:remove_user_from_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s and token.domain.id:%(target.user.domain_id)s) or rule:cloud_admin or rule:admin_and_matching_target_group_domain_id" "identity:check_user_in_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s and token.domain.id:%(target.user.domain_id)s) or rule:cloud_admin or rule:admin_and_matching_target_group_domain_id" "identity:add_user_to_group": "(rule:is_domain_manager and token.domain.id:%(target.group.domain_id)s and token.domain.id:%(target.user.domain_id)s) or rule:cloud_admin or 
rule:admin_and_matching_target_group_domain_id" + diff --git a/runbooks/appendix-C-identity-rbac.md b/runbooks/appendix-C-identity-rbac.md index 5c8a95a..3988239 100644 --- a/runbooks/appendix-C-identity-rbac.md +++ b/runbooks/appendix-C-identity-rbac.md @@ -37,10 +37,16 @@ | admin (operator super-admin) | Admin | admin domain + admin project (= cloud_admin) | Cloud operator; full authority. Bootstrap identity. | | admin (as Magnum trustor) | member + load-balancer_member + reader | capi-mgmt project | So the app-cred Magnum mints per cluster carries Octavia authority for the apiserver LB (D-039). These are the frozen trustor roles delegated into each cluster trust. | | magnum_domain_admin | Admin | magnum domain | Magnum trustee domain admin; creates the per-cluster trustee USER at cluster-create (D-046; Magnum docs). Works via the D-064 create-op fix -- no extra grant needed. Recreated by the `domain-setup` charm action after every teardown/redeploy (D-046). | -| -domain-admin | manager | the tenant's domain | SCS Domain Manager persona (D-051/D-064). Operator provisions the domain + this one account; the tenant self-services users, projects, and member/load-balancer_member grants from there. | -| human users | member (+ load-balancer_member if they use LBs or Magnum) | the tenant's project(s) | Created and assigned by the tenant's own domain-manager via Horizon/CLI. Operator is not in the loop. | -| -ci / service accounts | member + load-balancer_member | the tenant's project | Backing identity for the application credential that CI/automation authenticates with. load-balancer_member so tenant CI can drive Magnum/LBs. | -| per-cluster trustee | (delegated via trust -- not a direct grant) | -- | Magnum mints this at cluster-create and deletes it at cluster-delete. It carries the trustor's frozen roles through the trust (D-039). Never assign roles to it by hand. | +| -domain-admin | manager | the tenant's domain | SCS Domain Manager persona (D-051/D-064). 
PASSWORD identity. Operator provisions the domain + this one account; the tenant self-services everything below via CLI/Horizon. | +| -cluster | member + load-balancer_member | the tenant's project | D-066 Option-3. PASSWORD identity, trust-capable. SOLE purpose: `coe cluster` lifecycle (mints the Keystone trust -- MUST be password, since keystone blocks trust creation from app-cred tokens; see D-066). Also owns the magnum-internal CAPO child cred the driver mints per cluster (D-039). Manager-created. | +| -svc | member + load-balancer_member | the tenant's project | D-066 Option-3. UNRESTRICTED app cred, for tenant-authored non-trust automation (CI/pipelines). NOT used for cluster-create (app-cred trust creation is blocked). Manager-created. | +| human users | member (+ load-balancer_member if they use LBs) | the tenant's project(s) | Created/assigned by the tenant's domain-manager. Operator not in the loop. | +| per-cluster trustee | (delegated via trust -- not a direct grant) | -- | Magnum mints at cluster-create, deletes at cluster-delete. Carries the trustor (-cluster) roles through the trust (D-039). Never assign roles by hand. | + +IMPORTANT (D-066): the cluster-creator (`-cluster`) authenticates by PASSWORD, not app cred. +Keystone blocks trust creation from application-credential tokens regardless of the unrestricted flag +(this build's `_check_application_credential`), and Magnum needs a Keystone trust at cluster-create. +The app-cred identity (`-svc`) is for non-trust automation only. See appendix-D. Provisioning direction: the operator creates a tenant's DOMAIN and its single `manager` account, then hands off. Everything below the domain (users, projects, member/LB grants) is tenant @@ -102,8 +108,16 @@ ## C.5 Known limitations (carried from scs-0302) -- A domain manager can enumerate ALL domain names/ids (`list_domains`) and ALL role names - (`list_roles`) cloud-wide. 
This is names/ids only -- no access to other domains' resources -- +- CORRECTION (verified live 2026-07-01): on THIS deployment a domain manager's `domain list` + returns ONLY its own domain -- keystone scope-filters the result even though the D-064 policy + authorizes `list_domains` for a manager. So the cloud-wide domain-name enumeration described + in the SCS worst-case does NOT manifest here; isolation is tighter than the standard warns. + Role names (`list_roles`) may still be broadly visible (the persona needs to resolve role + names); this remains names-only and confers no resource access. The original SCS caveat is + retained below as version/config-dependent, NOT asserted as this cloud's behavior: +- [SCS worst-case, not observed here] A domain manager could enumerate ALL domain names/ids + (`list_domains`) and ALL role names (`list_roles`) cloud-wide. This is names/ids only -- + no access to other domains' resources -- and is required for the manager to resolve domains/roles by name. It is inherent to the pre-2024.2 transitional policy; upstream RBAC-scoping of domain listing is a pending fix. - The persona relies on `enforce_scope=False` (old-style policy). It is a bridge, not the diff --git a/runbooks/appendix-D-magnum-trust-model.md b/runbooks/appendix-D-magnum-trust-model.md index f161763..6df63f7 100644 --- a/runbooks/appendix-D-magnum-trust-model.md +++ b/runbooks/appendix-D-magnum-trust-model.md @@ -1,173 +1,100 @@ -# Appendix D -- Magnum cluster-create trust model (multi-tenant) +# Appendix D -- Magnum cluster-create trust model (multi-tenant) [REVISED 2026-07-02] -Fills the gap the onboarding runbook Stage 7 marks [PENDING]: exactly which identity -creates a Magnum cluster, and why the Keystone trust delegation constrains that choice. -Grounded in the magnum source (magnum/common/keystone.py, read live 2026-07-01) and the -D-039 / D-051 / D-064 identity model. 
Supersedes the single-consumer shortcut used on -2026-06-09 (admin creates in the admin-owned capi-mgmt project), which sidesteps -- rather -than exercises -- the trust constraint and therefore does NOT validate the tenant path. +Fills onboarding Stage 7. Grounded in the magnum + keystone source (read live 2026-07-02) and the +live multi-tenant validation (tenant `acme`). This revision CORRECTS the 2026-07-01 draft, whose +central hypothesis (D.3, "a clean-role tenant identity delegates the trust") was REFUTED live -- the +real blockers were a keystone policy template (D-065) and an app-cred trust restriction (D-066), +neither of which is about role delegation. + +VALIDATION STATUS: the identity/trust path is PROVEN end to end -- a tenant password identity clears +create_user (D-064) and create_trust (D-065), then magnum proceeds to certificate generation. Cluster +COMPLETION is currently blocked one step later at the Barbican/Vault cert substrate (D-067), an +operator-side defect independent of the tenant model. -------------------------------------------------------------------------------- -## D.1 What magnum does at cluster-create (the mechanism) +## D.1 What magnum does at cluster-create (the mechanism, in order) -------------------------------------------------------------------------------- -Two Keystone writes happen before any infrastructure is touched -(magnum/conductor/handlers/common/trust_manager.py -> create_trustee_and_trust): - -1. create_trustee -> `identity:create_user` - Magnum's trustee_domain_admin (magnum_domain_admin, Admin on the magnum domain) - creates a per-cluster service user in the magnum domain. This is the step D-064 - unblocked (the create_user policy templating fix). VALIDATED live 2026-07-01: - trustee user is created successfully. - -2. create_trust -> `identity:create_trust` - Magnum creates a Keystone trust delegating the CALLER's roles to that trustee. 
- From magnum/common/keystone.py: - - def create_trust(self, trustee_user): - trustor_user_id = self.session.get_user_id() # the CALLER's user - trustor_project_id = self.session.get_project_id() # the CALLER's project - if CONF.trust.roles: - roles = CONF.trust.roles # (unset on this deploy) - else: - roles = self.context.roles # -> the roles in the CALLER's token - self.client.trusts.create( - trustor_user=trustor_user_id, project=trustor_project_id, - trustee_user=trustee_user, impersonation=True, role_names=roles) - -Two facts follow directly from that code, and they are the whole model: - - A. The TRUSTOR is the identity that issued `openstack coe cluster create` - (`self.session` is the request-context client). The Keystone policy - `identity:create_trust = "user_id:%(trust.trustor_user_id)s"` is therefore - satisfied by construction -- caller == trustor. (So the create_trust 403 is - NOT a trustor-identity policy failure.) - - B. The DELEGATED ROLES are `self.context.roles` -- the roles present in the - CALLER's token on `trustor_project_id`. Keystone's create_trust REFUSES to - delegate any role the trustor does not actually hold on that project - (a trust cannot grant more than the trustor has). `CONF.trust.roles` is unset - here, so magnum delegates the caller's token roles verbatim -- whatever they are. +1. create_trustee -> identity:create_user (magnum_domain_admin creates the per-cluster trustee user in + the magnum domain). Unblocked by D-064. PROVEN live. +2. create_trust -> identity:create_trust (the cluster CREATOR is the trustor; trustee is the step-1 + user; impersonation=True; roles = the creator's token roles). Unblocked by D-065 + password auth. + PROVEN live. +3. generate_certificates_to_cluster -> stores the cluster CA cert in BARBICAN, which stores it in Vault + (castellan vault_key_manager). CURRENT BLOCKER -- see D-067. +4. (then) the capi-helm driver mints the per-cluster CAPO child app credential (D-039) and provisions + via helm/CAPI. 
NOT YET REACHED on the multi-tenant path. -------------------------------------------------------------------------------- -## D.2 Why the 2026-06-09 single-consumer path "worked" (and why we retired it) +## D.2 Two hard constraints on WHO creates the cluster -------------------------------------------------------------------------------- -On 2026-06-09 the cluster was created by ADMIN, scoped to the admin-owned capi-mgmt -project. Admin trivially holds (or cloud-admin-bypasses) every role it delegates to -itself, so create_trust never exercised the delegation constraint. That is a -SINGLE-CONSUMER shortcut: one privileged operator standing in for the tenant. It -proves the driver/CAPI plumbing but NOT the multi-tenant identity path, because in -the real product the cluster creator is a TENANT, not the cloud operator. +Constraint 1 -- the create_trust policy template (D-065). This cloud's charm-rendered base policy +shipped identity:create_trust = "user_id:%(trust.trustor_user_id)s", a non-resolving template on +Caracal (keystone populates target.trust.trustor_user_id). It evaluated false for EVERY caller (admin +included), regardless of roles -- proven by a direct `openstack trust create` with trustor==self still +403ing. Fixed by D-065 (override with the target-prefixed form keystone itself ships). This is why the +2026-07-01 role-delegation hypothesis was wrong: the failure was templating, not roles. -The admin-in-capi-mgmt attempt on 2026-07-01 then 403'd at create_trust because that -mixed scope (admin user, capi-mgmt project) is not a clean delegatable-role identity -on capi-mgmt -- and, under D-064, admin scoped to capi-mgmt is a RESTRICTED identity -there (it is not cloud_admin outside the admin domain; `list_role_assignments` 403s -in that scope, confirmed live). It is the wrong identity for the tenant model on two -counts: it is the operator, and its token roles are not the tenant delegatable set. 
+Constraint 2 -- app credentials cannot create trusts (D-066). After D-065, an app-cred-authenticated +create_trust STILL failed -- keystone's _check_application_credential (trusts.py) blocks trust creation +from any application-credential token, and on this build the docstring states this applies "regardless +of the 'unrestricted' flag". Confirmed live: an unrestricted app cred was refused; the same identity via +PASSWORD passed. Therefore the cluster-creator MUST authenticate with a PASSWORD. + +Consequence: the cluster-creator is -cluster (password, member + load-balancer_member on the +tenant project) per the D-066 Option-3 account model. The app cred (-svc) is for non-trust +automation only. See appendix-C for the full account set. + +Keystone's create_trust ALSO enforces _require_trustor_has_role_in_project (the trustor must hold each +delegated role on the project). Magnum delegates the creator's token roles, which are by construction a +subset of what the creator holds on the scoped project -> passes. -------------------------------------------------------------------------------- -## D.3 The multi-tenant rule (what identity must create the cluster) +## D.3 The multi-tenant rule (CORRECTED) -------------------------------------------------------------------------------- -RULE: a Magnum cluster is created by the TENANT's own project-scoped identity, whose -token carries EXACTLY the delegatable tenant roles -- `member` and -`load-balancer_member` (and `reader` where used) -- and NOT `admin`. +A Magnum cluster is created by the tenant's -cluster identity, authenticating by PASSWORD, +project-scoped to -prod, holding exactly member + load-balancer_member. Not admin, not an app +cred. This satisfies: create_trust policy (D-065), the app-cred block (password, D-066), the +trustor==caller check (by construction), and the trustor-has-role check (D-039-style grants). -Rationale, straight from D.1.B: - - The trust delegates `context.roles`. 
If the creator's token carries `admin`, - magnum tries to delegate `admin` into the trust; Keystone refuses a trust that - grants a role the trustor does not properly hold as a delegatable project grant, - and even if it did, delegating `admin` into a long-lived cluster credential is a - privilege-escalation footgun (the trustee impersonates the trustor with - impersonation=True). The tenant set (member + load-balancer_member) is the - correct, least-privilege delegation. - - `load-balancer_member` MUST be in the creator's token: the magnum-capi-helm - driver provisions an Octavia LB for the apiserver, and the trust must carry - Octavia authority or CAPO 403s at LB reconcile (D-039). This is exactly why - D-039 grants the trustor `load-balancer_member` on the cluster project. - - `member` provides the compute/network/volume authority the cluster's CCM/CSI - need via the trust. - -WHO THIS IS, per the onboarding model (tenant-onboarding-runbook Stage 2/4): - - The tenant's SERVICE identity: `-ci` / `-svc`, holding - `member` + `load-balancer_member` on `-prod`, authenticating with its - UNRESTRICTED application credential (the app cred is required so the driver can - mint the per-cluster CAPO child cred -- D-039 / onboarding Stage 4). - - Equivalently a tenant human user with `member` + `load-balancer_member` on the - project, but the service/app-cred identity is the production path (Jenkins/CI). - -The operator (admin / cloud_admin) does NOT create tenant clusters. The capi-mgmt -project is the MANAGEMENT-plane project (where the CAPI mgmt cluster VM and the -operator's own D-039 roles live for the mgmt cluster itself); tenant clusters are -created in the TENANT's project by the TENANT's identity. +The 2026-06-09 single-consumer path (admin creates in the admin-owned capi-mgmt project) sidesteps the +trust-delegation constraint and does NOT validate the tenant model -- retired. 
-------------------------------------------------------------------------------- -## D.4 Trustor role-set validation (run before the create) +## D.4 The current blocker (D-067) -- operator-side, not tenant-side -------------------------------------------------------------------------------- -Confirm the creating identity's TOKEN carries the delegatable set and nothing that -cannot be delegated. Run AS the tenant creator identity (app cred or password): +Step D.1(3) fails: magnum -> Barbican POST /v1/secrets returns 500 -> castellan vault_key_manager -> +Vault AppRole login rejected: source address "10.12.8.176" unauthorized through CIDR restrictions. +barbican reaches Vault on the metal-admin plane; Vault's AppRole binds the secret_id to metal-internal +(D-052/D-053). The bundle is correct (all secrets endpoints metal-internal); the LIVE binding drifted. +Fix = live rebind to metal-internal (gated, next session), NOT CIDR-widen. Full detail in D-067. - # as the tenant service identity, project-scoped to -prod - openstack token issue -f value -c user_id -c project_id # confirm scope - # roles in THIS token == what magnum will delegate (context.roles): - openstack role assignment list --user \ - --project --effective --names -f value -c Role | sort - -GATE: the role set is a subset of { member, load-balancer_member, reader }, and -INCLUDES load-balancer_member. If `admin` appears, this is the wrong identity -- -do not create with it. - -Note: a tenant/app-cred identity cannot run `role assignment list` for other users -(policy 403, by design). Query only its own assignment, or read it as admin -beforehand during onboarding. +This is independent of the tenant identity model: it blocks cert-gen for ANY creator. Once D-067 is +fixed, cluster-create should proceed past cert-gen into the capi-helm driver's provisioning, where the +CAPO child-cred mint (D-039) happens under -cluster. 
-------------------------------------------------------------------------------- -## D.5 The create (tenant identity), and the trust it produces +## D.5 The create (tenant -cluster, PASSWORD) -------------------------------------------------------------------------------- - # authenticate as the tenant service identity via its app cred (onboarding Stage 4) - # OS_AUTH_TYPE=v3applicationcredential + the app cred id/secret from the 0600 file - # then, project-scoped to the tenant project: - openstack coe cluster create \ - --cluster-template \ - --keypair \ - --master-count 1 --node-count 2 - - # verify the trust was created and carries the tenant roles: - openstack coe cluster show -f value -c status -c trustee_user_id - # status -> CREATE_IN_PROGRESS (past trustee+trust), NOT CREATE_FAILED at ~3s. - -Expected: create_user (D-064) AND create_trust both pass, because the creator is the -trustor and its token roles (member + load-balancer_member) are cleanly delegatable -on the tenant project. The driver then proceeds to helm/CAPI provisioning. + # authenticate as -cluster via PASSWORD (NOT app cred), project-scoped to -prod + # OS_USERNAME=-cluster OS_USER_DOMAIN_ID= OS_PROJECT_ID= OS_PASSWORD=... + # OS_CACERT= OS_AUTH_URL=https://:5000/v3 + openstack coe cluster create --cluster-template -k8s \ + --keypair -key --master-count 1 --node-count 1 + openstack coe cluster show -f value -c status -c status_reason + # expect (post D-067): CREATE_IN_PROGRESS -> ... -> CREATE_COMPLETE -------------------------------------------------------------------------------- -## D.6 Roosevelt +## D.6 Open validation items (next session) -------------------------------------------------------------------------------- - - Cluster-create is a TENANT self-service operation, performed by the tenant's - app-cred identity carrying member + load-balancer_member on the tenant project. - Wire it into the tenant CI (Jenkins) path (onboarding Stage 7), never the - operator admin. 
- - Optionally pin `CONF.trust.roles = member,load-balancer_member` in magnum.conf - (via the D-037 conf.d mechanism) to make the delegated set EXPLICIT and - independent of whatever roles happen to be in the caller's token -- a hardening - that removes the "wrong token roles" failure mode entirely. Decide as a tracked - item; unset (inherit context.roles) is the upstream default and works when the - creator identity is correct. - - The management-plane capi-mgmt project + the operator's D-039 roles there remain - for the MGMT cluster; they are not the tenant cluster-create path. - --------------------------------------------------------------------------------- -## D.7 Open validation item --------------------------------------------------------------------------------- - -This appendix establishes the model from the magnum source and the identity design. -The live behavioral confirmation on THIS cloud -- create a cluster as a tenant -app-cred identity (member + load-balancer_member) and observe create_trust succeed -- -is the acceptance step, and folds into onboarding Stage 7 (currently [PENDING]) and -the D-011 gate. Until run, D.3 is design-derived-from-source, not yet live-verified -on the multi-tenant path. +1. Fix D-067 (barbican<->Vault metal-internal rebind), then re-run the create -> cert-gen clears. +2. Watch to CREATE_COMPLETE; capture where, and under whose identity, the CAPO child cred is minted (confirms D-066's + -cluster-owns-CAPO-cred design empirically). +3. kubeconfig + nodes/CNI/CCM (phase-08 8.3 pattern). +4. Clean-room beta pass: onboard a fresh tenant from ONLY handed-over credentials (zero admin + fallback) via scripts/tenant-onboard.sh, and complete the tenant-facing tests. 
diff --git a/runbooks/tenant-onboarding-v2-DRAFT.md b/runbooks/tenant-onboarding-v2-DRAFT.md new file mode 100644 index 0000000..2533a8d --- /dev/null +++ b/runbooks/tenant-onboarding-v2-DRAFT.md @@ -0,0 +1,466 @@ +# >>> 2026-07-02 REVISION BANNER (read first) <<< +# This 2026-07-01 draft is SUPERSEDED on three points by the 2026-07-02 session. Where they conflict, +# the following + appendix-D + scripts/tenant-onboard.sh are AUTHORITATIVE: +# 1. ACCOUNT MODEL is now Option-3 (D-066): -domain-admin (manager, password), +# -cluster (PASSWORD, trust-capable, cluster lifecycle), -svc (unrestricted +# app cred, non-trust automation). The single-'-svc'-does-everything model below is retired. +# 2. CLUSTER-CREATE identity is -cluster via PASSWORD, NOT an app cred. Keystone blocks +# trust creation from app-cred tokens regardless of unrestricted (D-066/appendix-D). The +# Stage 6 "create as app cred" below is WRONG -- use the password identity. +# 3. Cluster COMPLETION is gated on D-067 (barbican<->Vault metal-internal rebind); cluster-create +# currently dies at cert-gen until that live fix lands. +# The runnable, corrected procedure is scripts/tenant-onboard.sh (Option-3). Stages 0-5 validated; +# stage 6 gated on D-067. +# >>> END REVISION BANNER <<< + +# Tenant Onboarding v2 -- multi-tenant self-service + cluster creation (DRAFT) + +STATUS: DRAFT, built 2026-07-01 from the live multi-tenant validation run (tenant `acme`). +SUPERSEDES the 2026-06-22 tenant-onboarding-runbook.md identity model: that draft used an +`admin`-on-domain tenant administrator (which needed an out-of-band bandaid and could not +self-service reliably). This v2 uses the SCS Domain Manager persona (`manager` role) per +D-051 / D-064, validated end-to-end via CLI this session. 
+ +VALIDATION LEGEND (honesty markers -- do not finalize past what is proven): + [VALIDATED 2026-07-01] confirmed live this session (output captured) + [CORRECTED-PENDING] a failure was root-caused and the corrected block is staged, but + the corrected form has NOT been re-run yet + [PROCEDURE-PENDING] design-derived (appendix-D from magnum source); NOT yet live-verified + +DESIGN REFS: D-051 (Domain Manager persona), D-064 (policy reconciliation to scs-0302 + +create-op templating fix), D-039 (per-cluster app-cred carries load-balancer_member), +appendix-C (identity/RBAC reference), appendix-D (Magnum cluster-create trust model). + +-------------------------------------------------------------------------------- +## Validation status (this draft) +-------------------------------------------------------------------------------- + +| Stage | What | Status | +|-------|--------------------------------------------------|--------| +| 0 | Operator pre-flight (perimeters exist) | [VALIDATED 2026-07-01] | +| 1 | Operator provisions domain + manager + quotas | [VALIDATED 2026-07-01] | +| 2 | Manager self-services project + svc-user + grants| [VALIDATED 2026-07-01] (D-064 G3) | +| 2.5 | Tenant isolation (anti-escalation + cross-domain)| [VALIDATED 2026-07-01] (+ finding, below) | +| 3 | Service user mints app cred + keypair | [VALIDATED 2026-07-01] | +| 4 | Tenant builds L3 (net/subnet/router/ext-gw) | [VALIDATED 2026-07-01] | +| 5 | Tenant creates its own cluster template | [CORRECTED-PENDING] (image-by-UUID fix staged) | +| 6/7 | Tenant creates cluster (trustee + trust) | [PROCEDURE-PENDING] (appendix-D; trust step not yet confirmed) | + +FINDING (Stage 2.5d, 2026-07-01): on THIS deployment the manager's `domain list` returns +ONLY its own domain -- keystone scope-filters the result even though the D-064 policy +authorizes `list_domains` for a manager. 
So the "manager can enumerate all domain names +cloud-wide" limitation documented in appendix-C / D-064 does NOT manifest here; isolation is +TIGHTER than documented. appendix-C must be corrected to state the observed own-domain-only +behavior (see the appendix-C change in this package). + +-------------------------------------------------------------------------------- +## Model (operator provides vs. tenant self-services) -- v2 +-------------------------------------------------------------------------------- + +OPERATOR PROVIDES (minimal): + - A Keystone DOMAIN per client. + - ONE domain-admin account holding the `manager` role on that domain (SCS Domain Manager). + - Domain/project quotas (the envelope). + - The shared perimeters: public flavor catalog, public Magnum-ready image, external network + (provider-ext) + FIP pool, the Vault root CA. + +TENANT SELF-SERVICES (via the manager, then a manager-created service identity): + - Their own projects, users, and role assignments WITHIN their domain (member + + load-balancer_member only -- `manager` cannot grant admin or manager; anti-escalation). + - Their own application credential (the cluster-creator identity). + - Their own network / subnet / router / external gateway. + - Their own Magnum cluster template (visible by OWNERSHIP -- created in their project). + - Their own clusters. + +TRUST BOUNDARY: the operator never holds the tenant's working credentials. The manager owns +its domain's users (including resetting the service user's password). Admin never mutates +tenant resources. + +CONVENTIONS (carried; all exercised this session): + - env-clean before every identity switch: `for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done` + - OS_CACERT MUST be threaded into every session (Vault-issued keystone cert). A stripped CA + yields an opaque SSL error -- the Stage 2 pre-auth guard fails loud instead. + - Subshell-isolate identity switches `( ... )` so the operator admin shell is untouched. 
+ - Dynamic resolution only -- never hardcode ids (domain/project/image regenerate per rebuild). + - Verify-before-mutate; capture real command output and test the RESULT (not `head||echo`). + - Secrets straight to 0600 files under $HOME (snap confinement: never /tmp). + - ASCII-only committed files; LF endings. + +AS-RUN REFERENCE (tenant `acme`, 2026-07-01 -- ids are per-run, shown for traceability only): + domain acme=7b65248e33e041c78793b7d0939ef631; project acme-prod=780fa2f0761541ba8bc283c346b6af4d; + svc-user acme-svc=af73b67aa8b24b07904f4d463c1528b2; app-cred=930b0b027b0e465f89f04ef53c4db18c + (unrestricted, 86-char secret); net=193a8915..., subnet 10.20.24.0/24, router SNAT 10.12.7.194. + +================================================================================ +## STAGE 0 -- Operator pre-flight (READ-ONLY) [VALIDATED 2026-07-01] +================================================================================ +Confirms the shared perimeters exist and the D-064 override is live BEFORE onboarding. 
+ +--- BEGIN block: onboard-v2-00-preflight (RUN -- jumphost, admin) --- +for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done +source ~/admin-openrc +CLIENT=; fail=0 + +# D-064 override MUST be live (manager self-service depends on it) +juju status keystone -m openstack --format=yaml 2>/dev/null | python3 -c ' +import sys,yaml; m=yaml.safe_load(sys.stdin)["applications"]["keystone"]["units"]["keystone/0"].get("workload-status",{}).get("message","") +print("keystone:",m); sys.exit(0 if m.startswith("PO:") else 1)' || { echo "*** keystone not PO: active -- STOP ***"; fail=1; } + +# roles present +for R in manager member load-balancer_member reader; do + openstack role show "$R" -f value -c id &1 | grep -qE '^[0-9a-f]{32}$' \ + && echo "role $R ok" || { echo "*** role $R MISSING ***"; fail=1; } +done + +# public Magnum-ready image (kube_version + os_distro + public) +IMG=$(openstack image list --public -f value -c ID -c Name &1 | awk '/kube/{print $1;exit}') +openstack image show "$IMG" -f json &1 | python3 -c ' +import sys,json; d=json.load(sys.stdin); p=d.get("properties",{}) +ok=d.get("visibility")=="public" and p.get("kube_version") and p.get("os_distro") +print("image:",d.get("name"),d.get("visibility"),p.get("kube_version"),p.get("os_distro")) +sys.exit(0 if ok else 1)' || { echo "*** kube image not public / missing props -- STOP ***"; fail=1; } + +# a Magnum-capable public flavor (>=2 vcpu, >=2048 MB) +openstack flavor list --public -f json &1 | python3 -c ' +import sys,json; fs=json.load(sys.stdin) +ok=[f for f in fs if (f.get("VCPUs") or 0)>=2 and (f.get("RAM") or 0)>=2048] +print("magnum-capable flavors:",[f["Name"] for f in ok]); sys.exit(0 if ok else 1)' || fail=1 + +# external net + FIP capacity (note: ip availability columns print alphabetically -- total then used) +openstack network show provider-ext -f value -c id &1 | grep -qE '^[0-9a-f-]{36}$' \ + && echo "provider-ext ok" || { echo "*** provider-ext missing ***"; fail=1; } + +# clean 
slate for this client +openstack domain show "$CLIENT" -f value -c id &1 | grep -qE '^[0-9a-f]{32}$' \ + && { echo "*** $CLIENT domain EXISTS -- decide reuse/clean ***"; fail=1; } || echo "no $CLIENT domain (clean)" + +echo "=== PRE-FLIGHT $([ $fail -eq 0 ] && echo PASS || echo FAIL) ===" +--- END block --- +GATE: PRE-FLIGHT PASS. + +================================================================================ +## STAGE 1 -- Operator provisions domain + manager + quotas [VALIDATED 2026-07-01] +================================================================================ +The ONLY operator-side tenant provisioning. Everything after is tenant self-service. + +--- BEGIN block: onboard-v2-01-operator-domain (RUN -- jumphost, admin) --- +for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done +source ~/admin-openrc +CLIENT=; fail=0 + +openstack domain create --description "Client: ${CLIENT}" "$CLIENT" /dev/null 2>&1 +DOM=$(openstack domain show "$CLIENT" -f value -c id &1) +echo "$DOM" | grep -qE '^[0-9a-f]{32}$' && echo "domain $CLIENT id=$DOM" || { echo "*** domain FAIL ***"; fail=1; } + +# manager account (NOT admin -- admin is not safely domain-confinable; the persona is `manager`) +MPW=$(python3 -c 'import secrets;print(secrets.token_urlsafe(24))') +openstack user create --domain "$DOM" --password "$MPW" \ + --description "${CLIENT} domain manager (SCS Domain Manager; D-051/D-064)" \ + "${CLIENT}-domain-admin" /dev/null 2>&1 +MUID=$(openstack user show "${CLIENT}-domain-admin" --domain "$DOM" -f value -c id &1) +echo "$MUID" | grep -qE '^[0-9a-f]{32}$' && echo "manager user id=$MUID" || { echo "*** user FAIL ***"; fail=1; } + +openstack role add --domain "$DOM" --user "$MUID" manager &1 +# confine check: EXACTLY one assignment (manager on this domain), nothing else +echo "manager assignments (expect exactly: manager on $CLIENT):" +openstack role assignment list --user "$MUID" --names -f value &1 | sed 's/^/ /' + +# stash the manager credential (tenant 
handoff) -> 0600 file +MF="$HOME/${CLIENT}-domain-admin-cred.txt"; umask 077; : > "$MF"; chmod 600 "$MF" +printf 'domain=%s\ndomain_id=%s\nusername=%s-domain-admin\nuser_id=%s\npassword=%s\nauth_url=https://:5000/v3\n' \ + "$CLIENT" "$DOM" "$CLIENT" "$MUID" "$MPW" > "$MF"; chmod 600 "$MF"; unset MPW +echo "credential -> $MF" + +# quotas (the envelope) -- span nova/neutron/cinder; set explicitly (documents the record) +openstack quota set "${CLIENT}-prod" --instances 10 --cores 20 --ram 51200 /dev/null || true +# NOTE: run quota AFTER the manager creates the project (Stage 2), OR set on the project once it exists. +echo "=== STAGE 1 $([ $fail -eq 0 ] && echo PASS || echo FAIL) ===" +--- END block --- +GATE: STAGE 1 PASS -- domain created, manager holds EXACTLY `manager` on the domain (no +other assignment), credential stashed 0600. + +DELIVER TO CLIENT: Horizon URL, domain name, `-domain-admin` + password, Vault CA. +Nothing else. (Quotas: set on `-prod` after Stage 2 creates it, or pre-create the +project as operator; the 2026-07-01 run set quotas post-project.) + +================================================================================ +## STAGE 2 -- Manager self-service (project + service user + grants) [VALIDATED 2026-07-01] +================================================================================ +This is the D-064 G3 acceptance: the manager performs, VIA CLI, exactly the identity +operations the pre-D-064 policy rejected (create_project / create_user / create_grant), and +is correctly DENIED admin-grant and cross-domain access. 
+ +--- BEGIN block: onboard-v2-02-manager-selfservice (RUN -- jumphost, AS manager) --- +CLIENT= +MF="$HOME/${CLIENT}-domain-admin-cred.txt" +CA="$HOME/vault-init/vault-ca-root.pem" # confirm current path from admin-openrc +[ -s "$MF" ] || { echo "missing $MF -- run Stage 1"; return 2>/dev/null||exit 1; } + +( for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done + DOM=$(awk -F= '/^domain_id=/{print $2}' "$MF") + export OS_AUTH_URL="https://:5000/v3" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA" + export OS_USERNAME="${CLIENT}-domain-admin" OS_USER_DOMAIN_ID="$DOM" OS_DOMAIN_ID="$DOM" + export OS_PASSWORD="$(awk -F= '/^password=/{print $2}' "$MF")" + fail=0 + + # HARDENING GUARD: TLS trust material present before ANY call (fails loud, not opaque SSL) + [ -n "$OS_CACERT" ] && [ -s "$OS_CACERT" ] || { echo "*** OS_CACERT unset/missing -- STOP ***"; exit 3; } + openssl x509 -in "$OS_CACERT" -noout -checkend 0 >/dev/null 2>&1 || { echo "*** CA expired/unreadable -- STOP ***"; exit 3; } + + scope=$(openstack token issue -f value -c domain_id &1) + [ "$scope" = "$DOM" ] && echo "manager authenticated, domain-scoped" || { echo "*** auth FAIL: $scope ***"; exit 1; } + + # 2.1 create_project (PASS) + openstack project create --domain "$DOM" --description "${CLIENT} production" "${CLIENT}-prod" /dev/null 2>&1 + PID=$(openstack project show "${CLIENT}-prod" --domain "$DOM" -f value -c id &1) + echo "$PID" | grep -qE '^[0-9a-f]{32}$' && echo "project ${CLIENT}-prod=$PID (create_project PASS)" || { echo "*** create_project FAIL ***"; fail=1; } + + # 2.2 create_user (service account) (PASS) + SPW=$(python3 -c 'import secrets;print(secrets.token_urlsafe(24))') + openstack user create --domain "$DOM" --password "$SPW" --description "${CLIENT} CI/service (cluster creator)" "${CLIENT}-svc" /dev/null 2>&1 + SUID=$(openstack user show "${CLIENT}-svc" --domain "$DOM" -f value -c id &1) + echo "$SUID" | grep -qE '^[0-9a-f]{32}$' && echo "user ${CLIENT}-svc=$SUID (create_user 
PASS)" || { echo "*** create_user FAIL ***"; fail=1; } + + # 2.3 grant member + load-balancer_member on the project (create_grant) -- capture RESULT, not exit + for R in member load-balancer_member; do + openstack role add --project "$PID" --user "$SUID" "$R" &1 + openstack role assignment list --project "$PID" --user "$SUID" --names -f value -c Role &1 | grep -qw "$R" \ + && echo "granted $R" || { echo "*** grant $R FAIL ***"; fail=1; } + done + + # 2.4 anti-escalation: admin grant MUST be denied (verify it did NOT take -- ground truth, not error text) + openstack role add --project "$PID" --user "$SUID" admin /dev/null 2>&1 + if openstack role assignment list --project "$PID" --user "$SUID" --names -f value -c Role &1 | grep -qw admin; then + echo "*** ESCALATION: manager granted admin -- POLICY FAILURE, STOP ***"; fail=1 + else echo "admin grant DENIED (anti-escalation holds)"; fi + + echo "=== STAGE 2 $([ $fail -eq 0 ] && echo PASS || echo FAIL) ===" + unset SPW OS_PASSWORD +) +--- END block --- +GATE: STAGE 2 PASS -- 2.1/2.2/2.3 succeed, 2.4 DENIED. + +NOTE (service-user password): the Stage-2 password is ephemeral inside the subshell. Stage 3 +resets it (as the MANAGER) to a fresh 0600-stashed value -- the manager owns its domain's users. + +--- BEGIN block: onboard-v2-02b-isolation (CHECK read-only -- AS manager) --- +# Cross-domain isolation. Resolve REAL other-domain targets AS ADMIN first (prove they exist), +# then confirm the manager is refused. Keystone returns "does not exist" for cross-scope reads +# (no-enumeration-oracle design) -- treat does-not-exist on a proven-real resource as isolation-holding. 
+source ~/admin-openrc +OTHER_DOM=$(openstack domain show admin_domain -f value -c id &1) +OTHER_USER=$(openstack user list --domain "$OTHER_DOM" -f value -c ID &1 | head -1) +OTHER_PROJ=$(openstack project show admin --domain admin_domain -f value -c id &1) +( for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done + DOM=$(awk -F= '/^domain_id=/{print $2}' "$MF") + export OS_AUTH_URL="https://:5000/v3" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA" + export OS_USERNAME="${CLIENT}-domain-admin" OS_USER_DOMAIN_ID="$DOM" OS_DOMAIN_ID="$DOM" + export OS_PASSWORD="$(awk -F= '/^password=/{print $2}' "$MF")" + refused(){ echo "$1" | grep -qiE 'forbidden|not authorized|403|could not be found|does not exist|No .* exists|HTTP 40[34]'; } + out=$(openstack user show "$OTHER_USER" &1); refused "$out" && echo "cross-domain user read DENIED/hidden" || echo "*** GAP ***" + out=$(openstack project show "$OTHER_PROJ" &1); refused "$out" && echo "admin project read DENIED/hidden" || echo "*** GAP ***" + echo "manager domain list (observed own-domain-only on this cloud, 2026-07-01):" + openstack domain list -f value -c Name &1 | sed 's/^/ /' ) +--- END block --- +GATE: both cross-domain reads DENIED/hidden; domain list shows only the client's own domain +(the appendix-C names-only-leak does NOT manifest here -- tighter isolation; see appendix-C fix). + +================================================================================ +## STAGE 3 -- Service user mints app cred + keypair (cluster-creator identity) [VALIDATED 2026-07-01] +================================================================================ +`-svc` is the cluster creator. Its token roles are EXACTLY member + load-balancer_member +(from Stage 2.3) -- the clean delegatable set the trust needs (appendix-D). App cred MUST be +unrestricted (the driver mints a per-cluster CAPO child cred; D-039). Secrets -> 0600 files. 
+ +--- BEGIN block: onboard-v2-03-appcred-keypair (RUN -- jumphost) --- +CLIENT=; CA="$HOME/vault-init/vault-ca-root.pem" +MF="$HOME/${CLIENT}-domain-admin-cred.txt"; SF="$HOME/${CLIENT}-svc-cred.txt"; ACF="$HOME/${CLIENT}-svc-appcred.txt" +source ~/admin-openrc +DOM=$(openstack domain show "$CLIENT" -f value -c id &1) +PID=$(openstack project show "${CLIENT}-prod" --domain "$DOM" -f value -c id &1) +SUID=$(openstack user show "${CLIENT}-svc" --domain "$DOM" -f value -c id &1) + +# 3.1 manager sets the svc password -> 0600 (manager owns its domain's users; admin does NOT) +SPW=$(python3 -c 'import secrets;print(secrets.token_urlsafe(24))') +( for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done + export OS_AUTH_URL="https://:5000/v3" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA" + export OS_USERNAME="${CLIENT}-domain-admin" OS_USER_DOMAIN_ID="$DOM" OS_DOMAIN_ID="$DOM" + export OS_PASSWORD="$(awk -F= '/^password=/{print $2}' "$MF")" + openstack user set --password "$SPW" "$SUID" &1 && echo "svc password set" || echo "*** set FAIL ***" ) +umask 077; : > "$SF"; chmod 600 "$SF" +printf 'username=%s-svc\nuser_id=%s\nuser_domain_id=%s\nproject_id=%s\nauth_url=https://:5000/v3\npassword=%s\n' \ + "$CLIENT" "$SUID" "$DOM" "$PID" "$SPW" > "$SF"; chmod 600 "$SF"; unset SPW + +# 3.2 svc self-mints UNRESTRICTED app cred (project-scoped) +( for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done + export OS_AUTH_URL="https://:5000/v3" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA" + export OS_USERNAME="${CLIENT}-svc" OS_USER_DOMAIN_ID="$DOM" OS_PROJECT_ID="$PID" + export OS_PASSWORD="$(awk -F= '/^password=/{print $2}' "$SF")" + chk=$(openstack token issue -f value -c project_id &1) + [ "$chk" = "$PID" ] && echo "svc authenticated, project-scoped" || { echo "*** svc auth FAIL: $chk ***"; exit 1; } + umask 077; : > "$ACF"; chmod 600 "$ACF" + openstack application credential create "${CLIENT}-cluster-cred" --unrestricted \ + --description "${CLIENT} cluster-creator" -f 
shell "$ACF" 2>&1 + grep -qE '^id=' "$ACF" && { chmod 600 "$ACF"; echo "app cred minted -> $ACF"; \ + awk -F'"' '/^secret=/{print " secret length (measured): "length($2)}' "$ACF"; \ + grep -E '^unrestricted=|^project_id=' "$ACF" | sed 's/^/ /'; } || { echo "*** appcred FAIL ***"; cat "$ACF"; } ) + +# 3.3 svc creates keypair -> 0600 +( for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done + export OS_AUTH_URL="https://:5000/v3" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA" + export OS_USERNAME="${CLIENT}-svc" OS_USER_DOMAIN_ID="$DOM" OS_PROJECT_ID="$PID" + export OS_PASSWORD="$(awk -F= '/^password=/{print $2}' "$SF")" + KF="$HOME/${CLIENT}-key.pem"; umask 077 + openstack keypair create "${CLIENT}-key" "$KF" 2>&1 + head -1 "$KF" | grep -q 'PRIVATE KEY' && { chmod 600 "$KF"; echo "keypair -> $KF"; } || { echo "*** keypair FAIL ***"; cat "$KF"; } ) +--- END block --- +GATE: app cred unrestricted, project_id = -prod, secret length measured (86 on this +cloud -- do NOT assert; measure); keypair present, 0600. + +================================================================================ +## STAGE 4 -- Tenant builds L3 (net/subnet/router/ext-gw) [VALIDATED 2026-07-01] +================================================================================ +The tenant (app-cred identity) self-serves its own L3, INCLUDING the external gateway. +FINDING (2026-07-01): a non-admin app-cred identity CAN set the external gateway on this cloud +(confirms the onboarding Stage-5 finding for the automation identity, not just a Horizon human). 
+ +--- BEGIN block: onboard-v2-04-network (RUN -- jumphost; L3 as app cred, checks as admin) --- +CLIENT=; CA="$HOME/vault-init/vault-ca-root.pem"; ACF="$HOME/${CLIENT}-svc-appcred.txt" +TENANT_CIDR=10.20..0/24 # pick from the tenant pool; MUST NOT collide (checked below) + +# 4.0 CIDR collision pre-check (operator IPAM concern; read-only as admin) +source ~/admin-openrc +if openstack subnet list -f value -c Subnet &1 | grep -qw "$TENANT_CIDR"; then + echo "*** $TENANT_CIDR IN USE -- pick another, STOP ***"; COLL=1 +else echo "$TENANT_CIDR free"; COLL=0; fi + +# 4.1-4.5 build L3 as the app-cred identity +[ "$COLL" = 0 ] && ( for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done + export OS_AUTH_TYPE=v3applicationcredential OS_AUTH_URL="https://:5000/v3" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA" + export OS_APPLICATION_CREDENTIAL_ID=$(awk -F'"' '/^id=/{print $2}' "$ACF") + export OS_APPLICATION_CREDENTIAL_SECRET=$(awk -F'"' '/^secret=/{print $2}' "$ACF") + openstack token issue -f value -c project_id &1 | grep -qE '^[0-9a-f]{32}$' || { echo "*** app-cred auth FAIL ***"; exit 1; } + openstack network create "${CLIENT}-net" /dev/null 2>&1 && echo "net ok" + openstack subnet create "${CLIENT}-subnet" --network "${CLIENT}-net" --subnet-range "$TENANT_CIDR" --dns-nameserver 8.8.8.8 /dev/null 2>&1 && echo "subnet ok" + openstack router create "${CLIENT}-router" /dev/null 2>&1 && echo "router ok" + openstack router set "${CLIENT}-router" --external-gateway provider-ext &1 && echo "ext-gw set" || echo "*** ext-gw FAIL (operator may need to attach) ***" + openstack router add subnet "${CLIENT}-router" "${CLIENT}-subnet" &1 && echo "interface added" ) + +# 4.6 verify SNAT (proof egress) -- read-only as admin +source ~/admin-openrc +P=$(openstack project show "${CLIENT}-prod" --domain "$CLIENT" -f value -c id &1) +RID=$(openstack router list --project "$P" -f value -c ID &1 | head -1) +openstack router show "$RID" -f json &1 | python3 -c ' +import sys,json; 
d=json.load(sys.stdin); g=d.get("external_gateway_info") or {} +print("router",d.get("name"),d.get("status"),"snat",g.get("enable_snat"), + "snat_ip",(g.get("external_fixed_ips") or [{}])[0].get("ip_address","none"))' +--- END block --- +GATE: router ACTIVE, snat=True, snat_ip allocated from provider-ext. + +================================================================================ +## STAGE 5 -- Tenant creates its OWN cluster template [CORRECTED-PENDING] +================================================================================ +Templates are visible by OWNERSHIP -- the tenant creates its own in its project (it cannot use +another project's private template). IMAGE PASSED BY UUID (not name): a name is subject to a +quoting/resolution hazard -- the first Stage-5 attempt 2026-07-01 failed with +`Cluster type (vm, Unset, kubernetes) not supported` because a doubled-quoted name resolved to +no image, so magnum could not derive the type. UUID removes the failure surface entirely. +STATUS: the corrected (UUID) block below is staged but was NOT re-run before this draft. 
+ +--- BEGIN block: onboard-v2-05-template (RUN -- jumphost; template as app cred) --- +CLIENT=<client>; CA="$HOME/vault-init/vault-ca-root.pem"; ACF="$HOME/${CLIENT}-svc-appcred.txt" +source ~/admin-openrc +IMG_ID=$(openstack image list --public -f value -c ID -c Name </dev/null 2>&1 | awk '/kube/{print $1;exit}') +if echo "$IMG_ID" | grep -qE '^[0-9a-f-]{36}$'; then IMG_OK=1; else IMG_OK=0; echo "*** image uuid resolve FAIL -- STOP ***"; fi +# hard gate: the subshell below runs ONLY when the image UUID resolved (same pattern as the Stage-4 COLL guard) +[ "$IMG_OK" = 1 ] && ( for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done + export OS_AUTH_TYPE=v3applicationcredential OS_AUTH_URL="https://<keystone-vip>:5000/v3" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA" + export OS_APPLICATION_CREDENTIAL_ID=$(awk -F'"' '/^id=/{print $2}' "$ACF") + export OS_APPLICATION_CREDENTIAL_SECRET=$(awk -F'"' '/^secret=/{print $2}' "$ACF") + openstack token issue -f value -c project_id </dev/null 2>&1 | grep -qE '^[0-9a-f]{32}$' || { echo "*** auth FAIL ***"; exit 1; } + # idempotent pre-clean + openstack coe cluster template show "${CLIENT}-k8s" -f value -c uuid </dev/null >/dev/null 2>&1 \ + && openstack coe cluster template delete "${CLIENT}-k8s" </dev/null 2>&1 + openstack coe cluster template create "${CLIENT}-k8s" \ + --image "$IMG_ID" \ + --external-network provider-ext \ + --master-flavor gp.mid --flavor capi.node \ + --coe kubernetes --network-driver calico --docker-storage-driver overlay2 \ + --master-lb-enabled --floating-ip-enabled \ + --fixed-network "${CLIENT}-net" --fixed-subnet "${CLIENT}-subnet" \ + --keypair "${CLIENT}-key" </dev/null 2>&1 + TID=$(openstack coe cluster template show "${CLIENT}-k8s" -f value -c uuid </dev/null 2>&1) + echo "$TID" | grep -qE '^[0-9a-f-]{36}$' && echo "template ${CLIENT}-k8s=$TID" || echo "*** template FAIL ***" ) +--- END block --- +GATE: template created, coe=kubernetes, network_driver=calico, master_lb+floating_ip enabled, +image_id = the public kube image. +OPEN QUESTION (one variable at a time): the `--fixed-network/--fixed-subnet` pin is the strict +tenant-isolation posture.
If the corrected create fails on the network params (image now moot), +drop those two flags and let the capi-helm driver manage the cluster network -- and record which +model this driver expects. + +================================================================================ +## STAGE 6/7 -- Tenant creates the cluster (trustee + trust) [PROCEDURE-PENDING] +================================================================================ +THE MULTI-TENANT TRUST TEST. Design basis: appendix-D (magnum/common/keystone.py, read live). +create_trust delegates `context.roles` (the CALLER's token roles) from the caller (trustor) to +the per-cluster trustee. The caller MUST be the tenant service identity whose token carries +EXACTLY member + load-balancer_member -- NOT admin (a trust cannot delegate a role the trustor +does not hold; and delegating admin is a privilege-escalation footgun). This is why the creator +is `-svc` via app cred, not the operator. + +STATUS: NOT YET LIVE-VERIFIED on the multi-tenant path as of this draft. D-064 fixed the +`create_user` step (trustee user creation -- confirmed live earlier this session). The +`create_trust` step under a clean tenant identity is the specific thing this stage confirms. +If it 403s despite a clean delegatable-role identity, that is a genuine finding (look at the +conductor's trust-session construction), NOT a policy gap -- do not loosen create_trust. 
+ +--- BEGIN block: onboard-v2-06-cluster-create (RUN -- jumphost; create as app cred) --- +CLIENT=; CA="$HOME/vault-init/vault-ca-root.pem"; ACF="$HOME/${CLIENT}-svc-appcred.txt" + +# 6.0 mark conductor log (numeric-or-STOP guard) +source ~/admin-openrc +MARK=$(juju ssh -m openstack magnum/0 'sudo cat /var/log/magnum/magnum-conductor.log | wc -l' /dev/null | tr -dc '0-9') +[ -n "$MARK" ] || { echo "MARK empty -- STOP"; return 2>/dev/null||exit 1; } +echo "MARK=$MARK" + +# 6.1/6.2 create as the tenant app-cred identity +( for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done + export OS_AUTH_TYPE=v3applicationcredential OS_AUTH_URL="https://:5000/v3" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA" + export OS_APPLICATION_CREDENTIAL_ID=$(awk -F'"' '/^id=/{print $2}' "$ACF") + export OS_APPLICATION_CREDENTIAL_SECRET=$(awk -F'"' '/^secret=/{print $2}' "$ACF") + openstack token issue -f value -c project_id &1 | grep -qE '^[0-9a-f]{32}$' || { echo "*** auth FAIL ***"; exit 1; } + openstack coe cluster create "${CLIENT}-cluster" --cluster-template "${CLIENT}-k8s" \ + --keypair "${CLIENT}-key" --master-count 1 --node-count 1 &1 + sleep 12 + openstack coe cluster show "${CLIENT}-cluster" -f value -c uuid -c status -c status_reason &1 | sed 's/^/ /' ) + +# 6.3 conductor log since MARK -- trustee + trust outcome (the verdict) +juju ssh -m openstack magnum/0 "sudo tail -n +$((MARK+1)) /var/log/magnum/magnum-conductor.log 2>/dev/null | grep -iE 'trustee|create_user|create_trust|403|forbidden|created trust|CREATE_|ERROR' | tail -30" /dev/null +--- END block --- +GATE (expected if the model holds): status CREATE_IN_PROGRESS (not a ~3s CREATE_FAILED); log +shows trustee created and NO create_trust 403; driver proceeds to helm/CAPI. Then watch to +CREATE_COMPLETE (phase-08 Step 8.2 pattern) and verify nodes/CNI/CCM (Step 8.3). 
+ +================================================================================ +## Changes folded from the 2026-07-01 session +================================================================================ +- Identity model: `manager` persona (D-051/D-064) replaces `admin`-on-domain (2026-06-22). + Manager CLI self-service VALIDATED (G3) -- the 2026-06-22 out-of-band bandaid is retired. +- appendix-C correction: manager domain-enumeration is own-domain-only on this cloud (the + documented names-only cloud-wide leak does NOT manifest); isolation tighter than documented. +- Cluster-creator identity + trust model documented in appendix-D (magnum source-derived). +- Template: create in owner project; image by UUID (quoting/resolution hazard). +- Hardening throughout: OS_CACERT pre-auth guard; numeric MARK guard; capture-and-test-result + (not head||echo); subshell isolation; dynamic id resolution; secrets 0600 under $HOME. + +## Open items (before this DRAFT becomes VALIDATED) +1. Re-run Stage 5 (corrected UUID form) -- confirm template creates. +2. Run Stage 6 -- confirm create_trust succeeds under the tenant identity (or capture the + finding if it does not). THIS IS THE OUTSTANDING TRUST VALIDATION. +3. Clean-room pass ("beta"): operate from ONLY the handed-over tenant credentials (zero + admin fallback), logging every point where admin is currently used for a read -- classify + each as legitimate operator-perimeter vs. a tenant-accessible lookup. +4. On completion: fold this into tenant-onboarding-runbook.md (Stage 2 rewrite + Stage 7 fill), + commit appendix-D, apply the appendix-C correction, and assign the D-06x number for the + manager-persona-validated onboarding model. 
diff --git a/scripts/tenant-acceptance.sh b/scripts/tenant-acceptance.sh new file mode 100644 index 0000000..235125d --- /dev/null +++ b/scripts/tenant-acceptance.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash +# tenant-acceptance.sh -- tenant-facing acceptance test for an Option-3 tenant cluster. +# Usage: tenant-acceptance.sh <client> [foil-appcred-file] +# <client> tenant name; cluster <client>-cluster, creds in ~/tenant-<client>/ +# [foil-appcred-file] app-cred file (-f shell format) of a DIFFERENT tenant used as the +# isolation foil (default: $HOME/acme-svc-appcred.txt) +# P0 health_status (single-column) + trustee identity confirmation (dynamic) +# P1 kubeconfig via coe cluster config AS TENANT -> kubectl nodes/pods +# P2 k8s Service type=LoadBalancer via OCCM -> Octavia -> FIP -> curl from jumphost +# P3 isolation: the foil identity must see NOTHING of <client> (any visibility = CRITICAL) +# Exit: 0 pass | 11 kube fail | 12 LB fail | 13 ISOLATION VIOLATION | 14 precondition +# health not-yet-HEALTHY is WARN-only (driver reconcile lag), never a gate failure. +# NOTE: P2 leaves the lbtest deployment + LB in place for inspection (cleanup is a separate gate). +# Provenance: gate t3-01 of the 2026-07-02 session (D-066/D-067 acceptance); mock-tested 4-branch.
+set -u
+CLIENT="${1:-}"
+[ -n "$CLIENT" ] || { echo "usage: tenant-acceptance.sh CLIENT [foil-appcred-file]"; exit 14; }
+CF="$HOME/tenant-${CLIENT}/${CLIENT}-cluster-cred.txt"
+FOIL_ACF="${2:-$HOME/acme-svc-appcred.txt}"
+# Resolve the CA bundle and the foil keystone endpoint ONCE, up front: every *_env helper
+# below unsets all OS_* variables first, so a caller-supplied OS_CACERT has to be captured
+# here or it is wiped before it can be used. FOIL_AUTH_URL / KEYSTONE_VIP are optional
+# overrides; the default is the endpoint this script has always used.
+CA="${OS_CACERT:-$HOME/vault-init/vault-ca-root.pem}"
+FOIL_AUTH_URL="${FOIL_AUTH_URL:-https://${KEYSTONE_VIP:-10.12.4.50}:5000/v3}"
+# Exit codes: 14 = precondition, 11 = P1 (kubeconfig/nodes), 12 = P2 (LB), 13 = P3 (isolation).
+[ -s "$CF" ] || { echo "PRECOND: no $CF"; exit 14; }
+[ -s "$FOIL_ACF" ] || { echo "PRECOND: no foil app-cred file $FOIL_ACF (isolation probe needs it)"; exit 14; }
+command -v kubectl >/dev/null || { echo "PRECOND: kubectl absent on jumphost"; exit 14; }
+CUID=$(awk -F= '/^user_id=/{print $2}' "$CF")
+
+# Drop every exported OS_* variable so one identity can never leak into the next.
+scrub_os_env() {
+  local v
+  for v in $(env | awk -F= '/^OS_/{print $1}'); do unset "$v"; done
+}
+
+# The identity helpers rewrite the environment, so every caller runs them inside ( ... ).
+# tenant_env: password auth as ${CLIENT}-cluster; every field is read from the handover file $CF.
+tenant_env() {
+  scrub_os_env
+  export OS_AUTH_URL="$(awk -F= '/^auth_url=/{print $2}' "$CF")" OS_IDENTITY_API_VERSION=3
+  export OS_CACERT="$CA"
+  export OS_USERNAME="${CLIENT}-cluster" OS_USER_DOMAIN_ID="$(awk -F= '/^user_domain_id=/{print $2}' "$CF")"
+  export OS_PROJECT_ID="$(awk -F= '/^project_id=/{print $2}' "$CF")" OS_PASSWORD="$(awk -F= '/^password=/{print $2}' "$CF")"
+}
+# admin_env: operator identity, taken from the sourced openrc file.
+admin_env() { scrub_os_env; source "$HOME/admin-openrc"; }
+# foil_env: application-credential auth as the foil tenant; id/secret are the double-quoted
+# values on the id= / secret= lines of $FOIL_ACF.
+foil_env() {
+  scrub_os_env
+  export OS_AUTH_TYPE=v3applicationcredential OS_AUTH_URL="$FOIL_AUTH_URL" OS_IDENTITY_API_VERSION=3
+  export OS_CACERT="$CA"
+  export OS_APPLICATION_CREDENTIAL_ID="$(awk -F'"' '/^id=/{print $2}' "$FOIL_ACF")"
+  export OS_APPLICATION_CREDENTIAL_SECRET="$(awk -F'"' '/^secret=/{print $2}' "$FOIL_ACF")"
+}
+
+echo "=== P0: health + trust identities ==="
+# Every openstack call below runs in a subshell under one identity, with stdin on /dev/null
+# (the CLI can never block on a prompt) and stderr folded into the captured text, because
+# several checks further down match on the CLI's error message.
+H=$( (tenant_env; openstack coe cluster show "${CLIENT}-cluster" -f value -c health_status < /dev/null 2>&1) || true )
+BUUID=$( (tenant_env; openstack coe cluster show "${CLIENT}-cluster" -f value -c uuid < /dev/null 2>&1) || true )
+echo " health_status=$H uuid=$BUUID"
+# health is advisory only (WARN); a malformed uuid is a hard precondition failure
+case "$H" in HEALTHY) echo " health: PASS";; *) echo " health: WARN (not HEALTHY yet -- reconcile lag or D-042-class issue; re-check later)";; esac
+case "$BUUID" in *[!0-9a-f-]*|'') echo "PRECOND: bad beta uuid"; exit 14;; esac
+# The trustor id is handed to python as an argument instead of being spliced into the program
+# text, so an unexpected character in the cred file cannot alter the python code.
+TRUSTEE=$( (admin_env; openstack trust list -f json < /dev/null 2>&1) | python3 -c '
+import sys, json
+try:
+    ts = [t for t in json.load(sys.stdin) if t.get("Trustor User ID") == sys.argv[1]]
+    print(ts[0]["Trustee User ID"] if ts else "")
+except Exception:
+    print("")
+' "$CUID" )
+if [ -n "$TRUSTEE" ]; then
+  TNAME=$( (admin_env; openstack user show "$TRUSTEE" -f value -c name < /dev/null 2>&1) || true )
+  echo " trustee: $TRUSTEE name=$TNAME (expect magnum_domain_admin)"
+else
+  echo " trustee: WARN could not resolve"
+fi
+
+echo
+echo "=== P1: kubeconfig (as tenant) + nodes/pods ==="
+KDIR="$HOME/tenant-${CLIENT}/kube"; mkdir -p "$KDIR"; chmod 700 "$KDIR"
+# Remove any kubeconfig left by an earlier run: the -s test below must prove that THIS fetch,
+# made with the tenant's own credentials, wrote the file.
+rm -f -- "$KDIR/config"
+CFGOUT=$( (tenant_env; cd "$KDIR" && openstack coe cluster config "${CLIENT}-cluster" --dir "$KDIR" --force < /dev/null 2>&1) || true )
+printf '%s\n' "$CFGOUT" | head -2
+[ -s "$KDIR/config" ] || { echo "FAIL: no kubeconfig written"; exit 11; }
+chmod 600 "$KDIR/config"; export KUBECONFIG="$KDIR/config"
+echo "--- nodes ---"
+NODES=$(kubectl get nodes -o wide 2>&1 || true); printf '%s\n' "$NODES" | sed 's/^/ /'
+# column 2 of `kubectl get nodes` is STATUS; only an exact "Ready" counts
+READY=$(printf '%s\n' "$NODES" | awk '$2=="Ready"{c++} END{print c+0}')
+[ "$READY" -ge 2 ] || { echo "FAIL: expected >=2 Ready nodes, got $READY"; exit 11; }
+echo "--- pods not Running/Completed (want none) ---"
+# column 4 of `kubectl get pods -A --no-headers` is STATUS; anything listed is a warning, not a failure
+BAD=$(kubectl get pods -A --no-headers 2>&1 | awk '$4!="Running" && $4!="Completed"' || true)
+if [ -n "$BAD" ]; then printf '%s\n' "$BAD" | sed 's/^/ /'; echo " WARN: non-Running pods above"; else echo " (none)"; fi
+TOT=$(kubectl get pods -A --no-headers 2>/dev/null | wc -l)
+echo " pods total=$TOT P1: PASS (>=2 Ready nodes, kubeconfig fetched by TENANT)"
+
+echo
+echo "=== P2: tenant LB via OCCM (Service type=LoadBalancer) ==="
+kubectl create deployment lbtest --image=registry.k8s.io/e2e-test-images/agnhost:2.47 -- /agnhost netexec --http-port=8080 2>&1 | sed 's/^/ /'
+kubectl expose deployment lbtest --port=80 --target-port=8080 --type=LoadBalancer 2>&1 | sed 's/^/ /'
+# wait for the Service to receive an external IP: 24 polls, 25 s apart
+LBIP=""
+for i in $(seq 1 24); do
+  LBIP=$(kubectl get svc lbtest -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true)
+  printf '%s [%02d] external-ip=%s\n' "$(date +%T)" "$i" "${LBIP:-(pending)}"
+  [ -n "$LBIP" ] && break
+  sleep 25
+done
+[ -n "$LBIP" ] || { echo "FAIL: no EXTERNAL-IP within ~10 min"; kubectl describe svc lbtest 2>&1 | tail -12; exit 12; }
+# the data path is proven only when the reply is a pod hostname (lbtest-*): 12 tries, 10 s apart
+CURL_OK=0
+for i in $(seq 1 12); do
+  R=$(curl -s --max-time 8 "http://$LBIP/hostname" 2>&1 || true)
+  printf '%s curl[%02d]: %s\n' "$(date +%T)" "$i" "${R:-(empty)}"
+  case "$R" in lbtest-*) CURL_OK=1; break;; esac
+  sleep 10
+done
+[ "$CURL_OK" = 1 ] || { echo "FAIL: LB not serving pod hostname"; exit 12; }
+LBOS=$( (admin_env; openstack loadbalancer list -f value -c name -c provisioning_status -c operating_status < /dev/null 2>&1) || true )
+echo "--- octavia view (admin) ---"; printf '%s\n' "$LBOS" | sed 's/^/ /'
+echo " P2: PASS (OCCM -> Octavia -> FIP $LBIP -> pod)"
+
+echo
+echo "=== P3: isolation -- foil tenant must see NOTHING of ${CLIENT} ==="
+VIOL=0
+# the probes are meaningless unless the foil identity really authenticates: expect a bare hex id
+AT=$( (foil_env; openstack token issue -f value -c project_id < /dev/null 2>&1) || true )
+case "$AT" in *[!0-9a-f]*|'') echo "PRECOND: foil auth failed -- raw: $AT"; exit 14;; esac
+echo " foil authenticated (project $AT)"
+# list probe: the client's cluster name must not appear (literal match, not a regex)
+C1=$( (foil_env; openstack coe cluster list -f value -c name < /dev/null 2>&1) || true )
+echo " foil coe cluster list: ${C1:-(none)}"
+printf '%s\n' "$C1" | grep -qF -- "${CLIENT}-cluster" && { echo " *** VIOLATION: foil sees ${CLIENT}-cluster ***"; VIOL=1; }
+# direct probes fail closed: any answer that is not an explicit not-found/denied counts as a leak
+C2=$( (foil_env; openstack coe cluster show "$BUUID" -f value -c name < /dev/null 2>&1) || true )
+echo " foil show ${CLIENT} cluster uuid: $C2"
+printf '%s\n' "$C2" | grep -qiE 'not.*found|404|denied|403' || { echo " *** VIOLATION: foil can read ${CLIENT} cluster ***"; VIOL=1; }
+C3=$( (foil_env; openstack network show "${CLIENT}-net" -f value -c id < /dev/null 2>&1) || true )
+echo " foil show ${CLIENT}-net: $C3"
+printf '%s\n' "$C3" | grep -qiE 'no network|not.*found|404|unable' || { echo " *** VIOLATION: foil sees ${CLIENT}-net ***"; VIOL=1; }
+# list probes, continued: stdin on /dev/null, stderr folded into the captured text
+C4=$( (foil_env; openstack server list -f value -c Name < /dev/null 2>&1) || true )
+echo " foil server list: ${C4:-(none)}"
+printf '%s\n' "$C4" | grep -qiF -- "${CLIENT}-cluster" && { echo " *** VIOLATION: foil sees ${CLIENT} nodes ***"; VIOL=1; }
+C5=$( (foil_env; openstack loadbalancer list -f value -c name < /dev/null 2>&1) || true )
+echo " foil loadbalancer list: ${C5:-(none)}"
+# NOTE(review): this matches ANY load balancer whose name contains kube/lbtest/k8s, so it presumes
+# the foil project runs no Kubernetes load balancers of its own -- confirm for the chosen foil.
+printf '%s\n' "$C5" | grep -qE 'kube|lbtest|k8s' && { echo " *** VIOLATION: foil sees ${CLIENT} LB ***"; VIOL=1; }
+[ "$VIOL" = 0 ] || { echo "P3: ISOLATION VIOLATION -- CRITICAL"; exit 13; }
+echo " P3: PASS (all ${CLIENT} resources invisible to foil)"
+echo
+echo "RESULT: t3-01 TENANT ACCEPTANCE PASS (P1 kube, P2 LB, P3 isolation; health=$H)"
+exit 0
diff --git a/scripts/tenant-onboard.sh b/scripts/tenant-onboard.sh
new file mode 100644
index 0000000..8523c88
--- /dev/null
+++ b/scripts/tenant-onboard.sh
@@ -0,0 +1,246 @@
+#!/usr/bin/env bash
+# tenant-onboard.sh -- Option-3 multi-tenant onboarding (D-066), Omega Cloud v1
+# STATUS: DRAFT 2026-07-02. Stages 0-5 validated live (tenant acme). Stage 6 gate CLEARED:
+# D-067 (barbican<->Vault) FIXED + validated live 2026-07-02 (see D-067 amendment / BUNDLEFIX-007).
+# KEYPAIR FIX (2026-07-02 pre-first-clean-room-run): nova keypairs are USER-scoped and magnum
+# validates + boots the keypair in the CLUSTER-CREATOR/trustor context (magnum 18.0.0
+# attr_validator.validate_keypair via cluster.py:545; trust app-cred impersonates the trustor).
+# So the keypair is created by -cluster (stage3), and the template (stage5) omits --keypair
+# (keypair_id optional, default=None); stage6 supplies it. A -svc-owned key would 400 at stage6.
+#
+# Model (D-066): operator creates domain + manager; manager creates project + -cluster (password,
+# trust-capable, cluster lifecycle) + -svc (unrestricted app cred, non-trust automation). Cluster
+# create MUST be password (keystone blocks app-cred trust creation; see appendix-D/D-066).
+#
+# Hardening conventions (learned 2026-07-02): validate raw output WHOLE (never extract-then-check);
+# whitelist-write secrets to 0600 files, never echo; dynamic ID resolution; CA threaded; verify
+# before mutate. This is an EXECUTED script (bash tenant-onboard.sh ...), so exit-on-error is correct.
+#
+# Usage: tenant-onboard.sh CLIENT [stage0|1|2|3|4|5|6|all]   (default: all = stages 0-5)
+# Env:   TENANT_CIDR (required by stage 4), KEYSTONE_VIP, OS_CACERT.
+# Re-run safety: a credential file is replaced only after the matching create call succeeded, so
+# re-running a stage against an existing user / app cred / keypair keeps the working secret.
+# Every openstack call reads stdin from /dev/null so the CLI can never block on a prompt.
+set -uo pipefail
+
+# ---- inputs ----
+CLIENT="${1:-}"; STAGE="${2:-all}"
+TENANT_CIDR="${TENANT_CIDR:-}" # required for stage 4 (e.g. 10.20.24.0/24); must not collide
+KEYSTONE_VIP="${KEYSTONE_VIP:-10.12.4.50}"
+CA="${OS_CACERT:-$HOME/vault-init/vault-ca-root.pem}"
+OUT="$HOME/tenant-${CLIENT}" # 0600 credential handover dir
+AUTH_URL="https://${KEYSTONE_VIP}:5000/v3"
+die(){ echo "FATAL: $*" >&2; exit 1; }
+[ -n "$CLIENT" ] || die "usage: tenant-onboard.sh CLIENT [stage0|1|2|3|4|5|6|all]"
+[ -s "$CA" ] || die "OS_CACERT not found: $CA"
+openssl x509 -in "$CA" -noout -checkend 0 >/dev/null 2>&1 || die "CA expired/unreadable: $CA"
+umask 077; mkdir -p "$OUT"; chmod 700 "$OUT"
+
+is_id(){ [[ "$1" =~ ^[0-9a-f]{32}$ ]]; } # keystone id
+newpw(){ python3 -c 'import secrets;print(secrets.token_urlsafe(24))'; }
+
+# ---- OS_* scrub: no identity may leak from one context into the next ----
+scrub_os_env(){ local v; for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done; }
+# ---- admin context helper (operator) ----
+admin_env(){ scrub_os_env; source ~/admin-openrc; }
+# ---- named-identity password context (subshell-scoped by callers) ----
+# pw_env USER DOMAIN_ID CREDFILE [PROJECT_ID] -- project-scoped when PROJECT_ID is given,
+# otherwise scoped to DOMAIN_ID; the password is read from CREDFILE's password= line.
+pw_env(){
+  scrub_os_env
+  export OS_AUTH_URL="$AUTH_URL" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA"
+  export OS_USERNAME="$1" OS_USER_DOMAIN_ID="$2"
+  if [ -n "${4:-}" ]; then export OS_PROJECT_ID="$4"; else export OS_DOMAIN_ID="$2"; fi
+  export OS_PASSWORD="$(awk -F= '/^password=/{print $2}' "$3")"
+}
+# ---- application-credential context (subshell-scoped by callers) ----
+# appcred_env FILE -- FILE is the `-f shell` output saved by stage3 (id="..." / secret="..." lines).
+appcred_env(){
+  scrub_os_env
+  export OS_AUTH_TYPE=v3applicationcredential OS_AUTH_URL="$AUTH_URL" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA"
+  export OS_APPLICATION_CREDENTIAL_ID="$(awk -F'"' '/^id=/{print $2}' "$1")"
+  export OS_APPLICATION_CREDENTIAL_SECRET="$(awk -F'"' '/^secret=/{print $2}' "$1")"
+}
+
+stage0(){ # operator preflight (read-only)
+  admin_env; local fail=0 R img
+  echo "== stage0: preflight =="
+  juju status keystone -m openstack --format=yaml 2>/dev/null | python3 -c 'import sys,yaml;m=yaml.safe_load(sys.stdin)["applications"]["keystone"]["units"]["keystone/0"].get("workload-status",{}).get("message","");print("keystone:",m);sys.exit(0 if m.startswith("PO:") else 1)' || { echo " keystone override not PO: active"; fail=1; }
+  for R in manager member load-balancer_member; do
+    is_id "$(openstack role show "$R" -f value -c id < /dev/null 2>&1)" || { echo " role $R MISSING"; fail=1; }
+  done
+  img=$(openstack image list --public -f value -c ID -c Name < /dev/null 2>&1 | awk '/kube/{print $1;exit}')
+  is_id "${img//-/}" 2>/dev/null || [[ "$img" =~ ^[0-9a-f-]{36}$ ]] || { echo " no public kube image"; fail=1; }
+  # an existing domain means an earlier (possibly partial) onboarding: stop, the operator decides
+  if is_id "$(openstack domain show "$CLIENT" -f value -c id < /dev/null 2>&1)"; then echo " domain $CLIENT already EXISTS -- decide reuse/clean"; fail=1; fi
+  if [ "$fail" = 0 ]; then echo " PREFLIGHT PASS"; else die "preflight failed"; fi
+}
+
+stage1(){ # operator: domain + manager + quota
+  admin_env
+  echo "== stage1: operator provisions domain + manager =="
+  openstack domain create --description "Client: $CLIENT" "$CLIENT" < /dev/null >/dev/null 2>&1 || true
+  local DOM; DOM=$(openstack domain show "$CLIENT" -f value -c id < /dev/null 2>&1); is_id "$DOM" || die "domain create failed"
+  local MPW; MPW=$(newpw); [ -n "$MPW" ] || die "password generation failed"
+  # created=1 only when keystone accepted THIS password; a pre-existing user still has its old one
+  local created=0
+  openstack user create --domain "$DOM" --password "$MPW" --description "$CLIENT domain manager (D-051/D-064)" "${CLIENT}-domain-admin" < /dev/null >/dev/null 2>&1 && created=1
+  local MUID; MUID=$(openstack user show "${CLIENT}-domain-admin" --domain "$DOM" -f value -c id < /dev/null 2>&1); is_id "$MUID" || die "manager create failed"
+  openstack role add --domain "$DOM" --user "$MUID" manager < /dev/null 2>&1 || true
+  # capture first, then match: under pipefail a grep -q that closes the pipe early could fail the check
+  local ROLES; ROLES=$(openstack role assignment list --user "$MUID" --names -f value -c Role < /dev/null 2>&1)
+  grep -qw manager <<<"$ROLES" || die "manager grant failed"
+  local MF="$OUT/${CLIENT}-domain-admin-cred.txt"
+  if [ "$created" = 1 ]; then
+    : > "$MF"; chmod 600 "$MF"
+    printf 'domain=%s\ndomain_id=%s\nusername=%s-domain-admin\nuser_id=%s\npassword=%s\nauth_url=%s\n' "$CLIENT" "$DOM" "$CLIENT" "$MUID" "$MPW" "$AUTH_URL" > "$MF"; chmod 600 "$MF"
+  elif [ -s "$MF" ]; then
+    echo " manager already existed -- keeping existing cred file (password NOT changed)"
+  else
+    die "manager ${CLIENT}-domain-admin exists but $MF is missing -- reset its password, then re-run"
+  fi
+  echo " domain=$DOM manager=$MUID cred -> $MF"
+  echo " (set project quota after stage2 creates ${CLIENT}-prod, or pre-create the project as operator)"
+}
+
+stage2(){ # manager self-service: project + -cluster + -svc + grants
+  admin_env
+  local DOM; DOM=$(openstack domain show "$CLIENT" -f value -c id < /dev/null 2>&1); is_id "$DOM" || die "no domain"
+  local MF="$OUT/${CLIENT}-domain-admin-cred.txt"; [ -s "$MF" ] || die "run stage1 first"
+  echo "== stage2: manager self-services project + -cluster + -svc + grants (D-064 G3) =="
+  ( pw_env "${CLIENT}-domain-admin" "$DOM" "$MF"
+    [ "$(openstack token issue -f value -c domain_id < /dev/null 2>&1)" = "$DOM" ] || { echo "manager auth FAIL"; exit 1; }
+    openstack project create --domain "$DOM" --description "$CLIENT production" "${CLIENT}-prod" < /dev/null >/dev/null 2>&1 || true
+    local PID; PID=$(openstack project show "${CLIENT}-prod" --domain "$DOM" -f value -c id < /dev/null 2>&1); is_id "$PID" || { echo "project FAIL"; exit 1; }
+    # -cluster (password, trust-capable) and -svc (app-cred automation)
+    local U R PW UID2 F created
+    for U in cluster svc; do
+      PW=$(newpw); [ -n "$PW" ] || { echo "password generation FAIL"; exit 1; }
+      created=0
+      openstack user create --domain "$DOM" --password "$PW" --description "$CLIENT $U" "${CLIENT}-$U" < /dev/null >/dev/null 2>&1 && created=1
+      UID2=$(openstack user show "${CLIENT}-$U" --domain "$DOM" -f value -c id < /dev/null 2>&1); is_id "$UID2" || { echo "user $U FAIL"; exit 1; }
+      for R in member load-balancer_member; do openstack role add --project "$PID" --user "$UID2" "$R" < /dev/null 2>&1 || true; done
+      F="$OUT/${CLIENT}-$U-cred.txt"
+      if [ "$created" = 1 ]; then
+        : > "$F"; chmod 600 "$F"
+        printf 'username=%s-%s\nuser_id=%s\nuser_domain_id=%s\nproject_id=%s\nauth_url=%s\npassword=%s\n' "$CLIENT" "$U" "$UID2" "$DOM" "$PID" "$AUTH_URL" "$PW" > "$F"; chmod 600 "$F"
+      elif [ -s "$F" ]; then
+        echo " ${CLIENT}-$U already existed -- keeping existing cred file (password NOT changed)"
+      else
+        echo "user ${CLIENT}-$U exists but $F is missing -- reset its password, then re-run"; exit 1
+      fi
+      echo " ${CLIENT}-$U=$UID2 (member+load-balancer_member) cred -> $F"
+    done
+    # anti-escalation self-check (must be DENIED)
+    local SU ROLES; SU=$(openstack user show "${CLIENT}-svc" --domain "$DOM" -f value -c id < /dev/null 2>&1)
+    openstack role add --project "$PID" --user "$SU" admin < /dev/null >/dev/null 2>&1 || true
+    ROLES=$(openstack role assignment list --project "$PID" --user "$SU" --names -f value -c Role < /dev/null 2>&1)
+    if grep -qw admin <<<"$ROLES"; then echo " *** ESCALATION: manager granted admin -- STOP ***"; exit 1; else echo " anti-escalation OK (admin grant denied)"; fi
+  ) || die "stage2 failed"
+}
+
+stage3(){ # -svc mints unrestricted app cred; -cluster owns the keypair (trustor-owned)
+  admin_env
+  local DOM; DOM=$(openstack domain show "$CLIENT" -f value -c id < /dev/null 2>&1); is_id "$DOM" || die "no domain"
+  local SF="$OUT/${CLIENT}-svc-cred.txt"; [ -s "$SF" ] || die "run stage2 first (no svc cred)"
+  local PID; PID=$(awk -F= '/^project_id=/{print $2}' "$SF"); is_id "$PID" || die "no project_id in $SF"
+  echo "== stage3: -svc mints unrestricted app cred + keypair =="
+  ( pw_env "${CLIENT}-svc" "$DOM" "$SF" "$PID"
+    [ "$(openstack token issue -f value -c project_id < /dev/null 2>&1)" = "$PID" ] || { echo "svc auth FAIL"; exit 1; }
+    # create into a temp file in the 0700 handover dir; the live file is replaced only by a verified result
+    local ACF="$OUT/${CLIENT}-svc-appcred.txt" TMP
+    TMP=$(mktemp "$OUT/.appcred.XXXXXX") || { echo "mktemp FAIL"; exit 1; }
+    openstack application credential create "${CLIENT}-svc-cred" --unrestricted --description "$CLIENT non-trust automation" -f shell < /dev/null >"$TMP" 2>&1
+    if grep -qE '^id=' "$TMP"; then
+      mv -f "$TMP" "$ACF"; chmod 600 "$ACF"
+    elif [ -s "$ACF" ] && grep -qE '^id=' "$ACF"; then
+      rm -f "$TMP"; echo " app cred not re-created -- keeping existing $ACF"
+    else
+      echo "appcred FAIL"; cat "$TMP"; rm -f "$TMP"; exit 1
+    fi
+    echo " app cred -> $ACF (unrestricted; secret len $(awk -F'"' '/^secret=/{print length($2)}' "$ACF"))"
+  ) || die "stage3 (svc app cred) failed"
+  # keypair as -CLUSTER: magnum validates it in the cluster-creator nova context at cluster
+  # create, and node boot resolves it as the trustor -- both are the -cluster identity.
+  local CF="$OUT/${CLIENT}-cluster-cred.txt"; [ -s "$CF" ] || die "run stage2 first (no cluster cred)"
+  ( pw_env "${CLIENT}-cluster" "$DOM" "$CF" "$PID"
+    [ "$(openstack token issue -f value -c project_id < /dev/null 2>&1)" = "$PID" ] || { echo "cluster-user auth FAIL"; exit 1; }
+    local KF="$OUT/${CLIENT}-key.pem" TMP
+    TMP=$(mktemp "$OUT/.key.XXXXXX") || { echo "mktemp FAIL"; exit 1; }
+    openstack keypair create "${CLIENT}-key" < /dev/null >"$TMP" 2>&1
+    if head -1 "$TMP" | grep -q 'PRIVATE KEY'; then
+      mv -f "$TMP" "$KF"; chmod 600 "$KF"; echo " keypair -> $KF (owner: ${CLIENT}-cluster)"
+    elif [ -s "$KF" ] && head -1 "$KF" | grep -q 'PRIVATE KEY'; then
+      rm -f "$TMP"; echo " keypair not re-created -- keeping existing $KF"
+    else
+      # show the CLI error, but never print a file that carries key material
+      echo "keypair FAIL"; grep -q 'PRIVATE KEY' "$TMP" || cat "$TMP"; rm -f "$TMP"; exit 1
+    fi
+  ) || die "stage3 (cluster keypair) failed"
+}
+
+stage4(){ # tenant L3 via app cred
+  [ -n "$TENANT_CIDR" ] || die "set TENANT_CIDR (e.g. 10.20.24.0/24)"
+  admin_env
+  # literal whole-line match on captured output: a CIDR is not a regex, and grep must not race the CLI
+  local NETS; NETS=$(openstack subnet list -f value -c Subnet < /dev/null 2>&1)
+  grep -qxF -- "$TENANT_CIDR" <<<"$NETS" && die "CIDR $TENANT_CIDR in use"
+  local ACF="$OUT/${CLIENT}-svc-appcred.txt"; [ -s "$ACF" ] || die "run stage3 first (no app cred)"
+  echo "== stage4: tenant L3 (net/subnet/router/ext-gw) via app cred =="
+  ( appcred_env "$ACF"
+    is_id "$(openstack token issue -f value -c project_id < /dev/null 2>&1)" || { echo "appcred auth FAIL"; exit 1; }
+    openstack network create "${CLIENT}-net" < /dev/null >/dev/null 2>&1 && echo " net ok"
+    openstack subnet create "${CLIENT}-subnet" --network "${CLIENT}-net" --subnet-range "$TENANT_CIDR" --dns-nameserver 8.8.8.8 < /dev/null >/dev/null 2>&1 && echo " subnet ok"
+    openstack router create "${CLIENT}-router" < /dev/null >/dev/null 2>&1 && echo " router ok"
+    # a failed gateway attach is reported but tolerated; the interface attach below decides the stage
+    if openstack router set "${CLIENT}-router" --external-gateway provider-ext < /dev/null 2>&1; then echo " ext-gw ok"; else echo " *** ext-gw FAILED (operator may need to attach) ***"; fi
+    openstack router add subnet "${CLIENT}-router" "${CLIENT}-subnet" < /dev/null 2>&1 && echo " interface ok"
+  ) || die "stage4 failed"
+}
+
+stage5(){ # tenant template (image by UUID)
+  admin_env
+  local IMG; IMG=$(openstack image list --public -f value -c ID -c Name < /dev/null 2>&1 | awk '/kube/{print $1;exit}')
+  [[ "$IMG" =~ ^[0-9a-f-]{36}$ ]] || die "kube image uuid resolve failed"
+  local ACF="$OUT/${CLIENT}-svc-appcred.txt"; [ -s "$ACF" ] || die "run stage3 first (no app cred)"
+  echo "== stage5: tenant cluster template (image by UUID) =="
+  ( appcred_env "$ACF"
+    # replace any earlier template of the same name
+    openstack coe cluster template show "${CLIENT}-k8s" -f value -c uuid < /dev/null >/dev/null 2>&1 && openstack coe cluster template delete "${CLIENT}-k8s" < /dev/null 2>&1 || true
+    # NO --keypair here: template is -svc-created but the key is -cluster-owned; keypair_id is
+    # optional (magnum default=None) and stage6 supplies --keypair in the -cluster context.
+    openstack coe cluster template create "${CLIENT}-k8s" --image "$IMG" --external-network provider-ext \
+      --master-flavor gp.mid --flavor capi.node --coe kubernetes --network-driver calico \
+      --docker-storage-driver overlay2 --master-lb-enabled --floating-ip-enabled \
+      --fixed-network "${CLIENT}-net" --fixed-subnet "${CLIENT}-subnet" < /dev/null 2>&1
+    if [[ "$(openstack coe cluster template show "${CLIENT}-k8s" -f value -c uuid < /dev/null 2>&1)" =~ ^[0-9a-f-]{36}$ ]]; then echo " template ${CLIENT}-k8s ok"; else echo "template FAIL"; exit 1; fi
+  ) || die "stage5 failed"
+}
+
+stage6(){ # cluster create as -cluster PASSWORD [GATED ON D-067]
+  echo "== stage6: cluster create as ${CLIENT}-cluster (PASSWORD) =="
+  echo " NOTE: D-067 RESOLVED 2026-07-02 (vault external binding -> metal-internal; BUNDLEFIX-007)."
+  admin_env
+  local DOM; DOM=$(openstack domain show "$CLIENT" -f value -c id < /dev/null 2>&1); is_id "$DOM" || die "no domain"
+  local CF="$OUT/${CLIENT}-cluster-cred.txt"; [ -s "$CF" ] || die "run stage2 first (no cluster cred)"
+  local PID; PID=$(awk -F= '/^project_id=/{print $2}' "$CF"); is_id "$PID" || die "no project_id in $CF"
+  ( pw_env "${CLIENT}-cluster" "$DOM" "$CF" "$PID"
+    [ "$(openstack token issue -f value -c project_id < /dev/null 2>&1)" = "$PID" ] || { echo "cluster-user auth FAIL"; exit 1; }
+    openstack coe cluster create "${CLIENT}-cluster" --cluster-template "${CLIENT}-k8s" --keypair "${CLIENT}-key" --master-count 1 --node-count 1 < /dev/null 2>&1
+    sleep 15
+    openstack coe cluster show "${CLIENT}-cluster" -f value -c uuid -c status -c status_reason < /dev/null 2>&1 | sed 's/^/ /'
+  ) || die "stage6 failed"
+}
+
+case "$STAGE" in
+  stage0|0) stage0 ;;
+  stage1|1) stage1 ;;
+  stage2|2) stage2 ;;
+  stage3|3) stage3 ;;
+  stage4|4) stage4 ;;
+  stage5|5) stage5 ;;
+  stage6|6) stage6 ;;
+  all) stage0; stage1; stage2; stage3; stage4; stage5; echo "== stages 0-5 done. run stage6 (cluster) explicitly: tenant-onboard.sh $CLIENT 6 ==" ;;
+  *) die "unknown stage: $STAGE" ;;
+esac
+echo "handover creds in: $OUT (0600)"