diff --git a/bundle.yaml b/bundle.yaml deleted file mode 100644 index 93c676b..0000000 --- a/bundle.yaml +++ /dev/null @@ -1,903 +0,0 @@ -# ============================================================ -# Caracal 2024.1 -- VR0 DC0 Omega Cloud testcloud rebuild bundle -# ============================================================ -# Generated: 2026-05-22 (rebuild revision 2026-06-01, bundle-cleanup change-set) -# Replaces: bundle-pre-destroy.yaml (Bobcat 2023.2) -# Charm channels: verified against Charmhub 2026-05-22 (see Caracal_Rebuild handoff D-002) -# Bindings: public:provider, else:metal for API charms; all-metal for backend charms. -# Ceph data nets via public/cluster BINDINGS on ceph-mon/ceph-osd (these provision the -# container/host NICs; ceph-*-network config would NOT). Ceph CLIENTS bind ceph->storage, -# and each subordinate's storage/data binding is mirrored on its PRINCIPAL (subset rule). (C2) -# Endpoints: IP-ONLY -- os-public-hostname dropped on all API charms; the dual VIPs ARE the -# catalog endpoints (public 10.12.4.N / internal+admin 10.12.8.N). Vault issues -# per-VIP IP-SAN certs. No control-plane DNS dependency. (B5) -# HA chain: hacluster subordinates + dual VIPs + :ha relations ACTIVE for 11 API charms -# (10 prior + ceph-radosgw, un-deferred). VIPs front-loaded into the MAAS-reserved -# /26: provider 10.12.4.2-.63, metal 10.12.8.2-.63 (supersedes .224-.254). (B1) -# Vault: single unit, MYSQL storage backend (via vault-mysql-router). etcd + easyrsa -# REMOVED -- the etcd backend was never used (live storage = mysql) and is moot at -# 1 unit; HA backend (Raft vs etcd) is a Roosevelt rehearsal item. (C1; revises D-006) -# Ceph networks: FULL separation via network-space BINDINGS -- ceph-mon/ceph-osd public->storage -# (10.12.16.0/22), ceph-osd cluster->replication (10.12.20.0/22). Bindings, NOT -# ceph-*-network config, so the LXD-contained mon actually gets a storage NIC. 
-# Clients bind ceph->storage; container principals carry it too (subset rule). (C2) -# Magnum: Layer A only -- CAPI driver graft is Layer B (runbooks/phase-06..08) -# Octavia: lb-mgmt PKI options supplied via overlays/octavia-pki.yaml (gitignored). -# Amphora-pipeline options baked (use-internal-endpoints etc.). (B4) -# OVN tunnels: geneve overlay on the DATA space (10.12.12.0/22) -- ovn-chassis + ovn-chassis-octavia -# 'data' binding; their principals also carry data (nova-compute:neutron-plugin bare-metal, -# octavia:ovsdb-cms provisions the container NIC) per the subset rule. Prereq: enp8s0 -# link-subnet to 10.12.12.4N (rebuild-prep, machines Ready). -# Resources: omitted -- let charms use latest available resource revisions -# ============================================================ - -name: vr0-dc0-omega-caracal-testcloud -description: | - Charmed OpenStack Caracal (2024.1) on Ubuntu 22.04 LTS (Jammy) deployed via Juju 3.6 bundle - against MAAS-managed VMs (openstack0-3, virsh). - Decisions referenced (see Caracal_Rebuild handoff + 2026-06-01 bundle-cleanup change-set): - D-001 Path 2A (Juju-bundle paradigm) - D-002 channel matrix - D-003 Option B (provider /22 carries FIPs + API VIPs) - D-005 Ceph Squid - D-006 Vault HA backend -- REVISED: etcd/easyrsa dropped for testcloud; Raft-vs-etcd is a Roosevelt item (C1) - D-007 Magnum Layer A + Layer B graft - D-019 (supersedes D-008) Designate deferred to v2 - D-009 hacluster subordinates (decorative on testcloud) - D-016 IPv4-only v1 - D-018 MAAS-release-direct teardown - Bundle-cleanup (2026-06-01): B5 IP-only endpoints; C1 vault-on-mysql (etcd/easyrsa removed); - C2 full Ceph network separation; B1 VIP front-load + radosgw HA un-defer; B2 ovn prefer-chassis-as-gw; - B3 nova Ceph-RBD ephemeral; B4 octavia amphora-pipeline options. C3 radosgw unchanged (already correct). 
- -default-base: ubuntu@22.04/stable - -variables: - # ----- UCA pocket + Ceph source ---------------------------------------------- - openstack-origin: &openstack-origin cloud:jammy-caracal - ceph-source: &ceph-source cloud:jammy-caracal - - # ----- Bindings for external-API-facing charms (public on provider) ---------- -machines: - "8": - constraints: arch=amd64 tags=openstack - "9": - constraints: arch=amd64 tags=openstack - "10": - constraints: arch=amd64 tags=openstack - "11": - constraints: arch=amd64 tags=openstack - -# ===================================================================== -# Network-space bindings (D-052): EXPLICIT per-application blocks, no anchors. -# "" -> metal-admin (operator/MAAS/monitoring; admin API; default) -# internal/shared-db/amqp/certificates/cluster/identity/ovsdb -> metal-internal -# public -> provider-public (public API + floating IPs) -# ceph public -> storage ; ceph cluster -> replication -# geneve overlay -> data-tenant (nova-compute:neutron-plugin, ovn-chassis:data, -# ovn-chassis-octavia:data, octavia:ovsdb-cms) -# Subordinate subset rule: a subordinate's spaces are a subset of its principal's; -# nova-compute keeps data-tenant (via neutron-plugin) for the ovn-chassis geneve. -# Re-IP is MAAS-side only (no CIDR options here). See docs/design-decisions.md D-052. -# ===================================================================== -applications: - - # ===================================================================== - # Datastores: MySQL InnoDB Cluster, RabbitMQ, Vault - # ===================================================================== - # C1: etcd + easyrsa REMOVED. Vault is single-unit and uses the MySQL storage backend via - # vault-mysql-router (matches the live deploy; the etcd HA backend was never exercised and is - # moot at one unit). Vault HA backend (Raft vs etcd) is a Roosevelt rehearsal item. 
- - mysql-innodb-cluster: - charm: mysql-innodb-cluster - channel: 8.0/stable - num_units: 3 - to: [lxd:8, lxd:9, lxd:10] - bindings: - '': metal-admin - certificates: metal-internal - cluster: metal-internal - coordinator: metal-internal - db-router: metal-internal - shared-db: metal-internal - constraints: arch=amd64 - - rabbitmq-server: - charm: rabbitmq-server - channel: 3.9/stable - num_units: 1 - to: [lxd:10] - bindings: - '': metal-admin - amqp: metal-internal - certificates: metal-internal - cluster: metal-internal - ha: metal-internal - constraints: arch=amd64 - - vault: - charm: vault - channel: 1.8/stable - num_units: 1 # 3 on Roosevelt (D-009); HA backend decided there (C1) - to: [lxd:11] - bindings: - '': metal-admin - access: metal-internal - certificates: metal-internal - cluster: metal-internal - ha: metal-internal - secrets: metal-internal - shared-db: metal-internal - constraints: arch=amd64 - - vault-mysql-router: - charm: mysql-router - channel: 8.0/stable - bindings: - '': metal-admin - certificates: metal-internal - db-router: metal-internal - shared-db: metal-internal - - # ===================================================================== - # Identity: Keystone - # ===================================================================== - - keystone: - charm: keystone - channel: 2024.1/stable - num_units: 1 # 3 on Roosevelt (D-009) - to: [lxd:8] - options: - vip: "10.12.4.50 10.12.8.50 10.12.12.50" # B1 front-loaded VIP; IS the catalog endpoint (B5, no os-public-hostname) - use-policyd-override: true # as-built reconcile 2026-06-09 (origin untraced -- Review-later) - bindings: - '': metal-admin - certificates: metal-internal - cluster: metal-internal - domain-backend: metal-internal - ha: metal-internal - identity-admin: metal-internal - identity-credentials: metal-internal - identity-notifications: metal-internal - identity-service: metal-internal - internal: metal-internal - keystone-fid-service-provider: metal-internal - 
keystone-middleware: metal-internal - public: provider-public - shared-db: metal-internal - websso-trusted-dashboard: metal-internal - constraints: arch=amd64 - - keystone-mysql-router: - charm: mysql-router - channel: 8.0/stable - bindings: - '': metal-admin - certificates: metal-internal - db-router: metal-internal - shared-db: metal-internal - - # ===================================================================== - # Image: Glance + simplestreams-sync - # ===================================================================== - - glance: - charm: glance - channel: 2024.1/stable - num_units: 1 - to: [lxd:11] - options: - vip: "10.12.4.53 10.12.8.53 10.12.12.53" # B1 - image-conversion: true # as-built; image conversion enabled (raw on Ceph-backed glance) - bindings: - '': metal-admin - amqp: metal-internal - ceph: storage - certificates: metal-internal - cinder-volume-service: metal-internal - cluster: metal-internal - ha: metal-internal - identity-service: metal-internal - image-service: metal-internal - internal: metal-internal - object-store: metal-internal - public: provider-public - shared-db: metal-internal - storage-backend: metal-internal - constraints: arch=amd64 - - glance-mysql-router: - charm: mysql-router - channel: 8.0/stable - bindings: - '': metal-admin - certificates: metal-internal - db-router: metal-internal - shared-db: metal-internal - - glance-simplestreams-sync: - charm: glance-simplestreams-sync - channel: 2024.1/stable - num_units: 1 - to: [lxd:8] - options: # B4 amphora-pipeline - use-internal-endpoints: true # use internal (IP) catalog endpoints - use_swift: false # skip swift index; sidesteps radosgw object path for the amphora seed - bindings: - '': metal-admin - certificates: metal-internal - identity-service: metal-internal - image-modifier: metal-internal - simplestreams-image-service: metal-internal - constraints: arch=amd64 - - # ===================================================================== - # Compute: Nova 
cloud-controller + compute + Placement - # ===================================================================== - - nova-cloud-controller: - charm: nova-cloud-controller - channel: 2024.1/stable - num_units: 1 - to: [lxd:11] - options: - console-access-protocol: novnc - network-manager: Neutron - vip: "10.12.4.56 10.12.8.56 10.12.12.56" # B1 - bindings: - '': metal-admin - amqp: metal-internal - amqp-cell: metal-internal - certificates: metal-internal - cinder-volume-service: metal-internal - cloud-compute: metal-internal - cloud-controller: metal-internal - cluster: metal-internal - dashboard: metal-internal - ha: metal-internal - identity-service: metal-internal - image-service: metal-internal - internal: metal-internal - memcache: metal-internal - neutron-api: metal-internal - nova-cell-api: metal-internal - placement: metal-internal - public: provider-public - shared-db: metal-internal - shared-db-cell: metal-internal - constraints: arch=amd64 - - nova-compute: - charm: nova-compute - channel: 2024.1/stable - num_units: 3 - to: ["9", "10", "11"] - options: - config-flags: default_ephemeral_format=ext4 - enable-live-migration: true # now genuinely usable -- shared Ceph storage = memory-only migrate (B3) - enable-resize: true - libvirt-image-backend: rbd # B3 Ceph-RBD ephemeral: DISK_GB from the Ceph pool, not local fs; unlocks Magnum - migration-auth-type: ssh - resume-guests-state-on-host-boot: true - virt-type: qemu # Testcloud nested-KVM; Roosevelt will use 'kvm' - reserved-host-memory: 8192 # ENV(testcloud 16GiB hosts) D-040 OOM fix; charm default 512 -- DO NOT drop - bindings: - '': metal-admin - amqp: metal-internal - ceph: storage - ceph-access: storage - cloud-compute: metal-internal - cloud-credentials: metal-internal - compute-peer: metal-internal - image-service: metal-internal - internal: metal-internal - migration: metal-internal - neutron-plugin: data-tenant - secrets-storage: metal-internal - storage-backend: metal-internal - constraints: 
arch=amd64 - - ncc-mysql-router: - charm: mysql-router - channel: 8.0/stable - bindings: - '': metal-admin - certificates: metal-internal - db-router: metal-internal - shared-db: metal-internal - - placement: - charm: placement - channel: 2024.1/stable - num_units: 1 - to: [lxd:11] - options: - vip: "10.12.4.59 10.12.8.59 10.12.12.59" # B1 - bindings: - '': metal-admin - amqp: metal-internal - certificates: metal-internal - cluster: metal-internal - ha: metal-internal - identity-service: metal-internal - internal: metal-internal - placement: metal-internal - public: provider-public - shared-db: metal-internal - constraints: arch=amd64 - - placement-mysql-router: - charm: mysql-router - channel: 8.0/stable - bindings: - '': metal-admin - certificates: metal-internal - db-router: metal-internal - shared-db: metal-internal - - # ===================================================================== - # Networking: Neutron + OVN - # ===================================================================== - - neutron-api: - charm: neutron-api - channel: 2024.1/stable - num_units: 1 - to: [lxd:9] - options: - enable-ml2-port-security: true - flat-network-providers: physnet1 - neutron-security-groups: true - vip: "10.12.4.55 10.12.8.55 10.12.12.55" # B1 - bindings: - '': metal-admin - amqp: metal-internal - certificates: metal-internal - cluster: metal-internal - ha: metal-internal - identity-service: metal-internal - internal: metal-internal - neutron-api: metal-internal - neutron-plugin-api: metal-internal - neutron-plugin-api-subordinate: metal-internal - public: provider-public - shared-db: metal-internal - constraints: arch=amd64 - - neutron-api-mysql-router: - charm: mysql-router - channel: 8.0/stable - bindings: - '': metal-admin - certificates: metal-internal - db-router: metal-internal - shared-db: metal-internal - - neutron-api-plugin-ovn: - charm: neutron-api-plugin-ovn - channel: 2024.1/stable - - bindings: - '': metal-admin - certificates: metal-internal - 
neutron-plugin: metal-internal - ovsdb-cms: metal-internal - ovn-central: - charm: ovn-central - channel: 24.03/stable - num_units: 3 - to: [lxd:8, lxd:9, lxd:10] - bindings: - '': metal-admin - certificates: metal-internal - coordinator: metal-internal - ovsdb: metal-internal - ovsdb-cms: metal-internal - ovsdb-peer: metal-internal - ovsdb-server: metal-internal - constraints: arch=amd64 - - # ovn-chassis: subordinate to nova-compute. MAC-based bridge-interface-mappings captured from - # MAAS 2026-05-22 (Bobcat used hardcoded 'enp1s0' -- anti-pattern fix). The charm picks whichever - # MAC is found locally per unit; non-matching MACs are ignored. - ovn-chassis: - charm: ovn-chassis - channel: 24.03/stable - options: - ovn-bridge-mappings: physnet1:br-ex - prefer-chassis-as-gw: true # B2 -- elects gateway chassis so tenant routers get external egress - bridge-interface-mappings: >- - br-ex:52:54:00:3d:fd:54 - br-ex:52:54:00:9d:63:77 - br-ex:52:54:00:89:7f:ce - br-ex:52:54:00:99:fc:c2 - bindings: - '': metal-admin - amqp: metal-internal - certificates: metal-internal - data: data-tenant - ovsdb: metal-internal - ovsdb-subordinate: metal-internal - ovn-chassis-octavia: - charm: ovn-chassis - channel: 24.03/stable - bindings: - '': metal-admin - amqp: metal-internal - certificates: metal-internal - data: data-tenant - ovsdb: metal-internal - ovsdb-subordinate: metal-internal - cinder: - charm: cinder - channel: 2024.1/stable - num_units: 1 - to: [lxd:9] - options: - block-device: None - glance-api-version: 2 - vip: "10.12.4.52 10.12.8.52 10.12.12.52" # B1 - bindings: - '': metal-admin - amqp: metal-internal - backup-backend: metal-internal - ceph: storage - certificates: metal-internal - cinder-volume-service: metal-internal - cluster: metal-internal - ha: metal-internal - identity-credentials: metal-internal - identity-service: metal-internal - image-service: metal-internal - internal: metal-internal - public: provider-public - shared-db: metal-internal - 
storage-backend: metal-internal - constraints: arch=amd64 # owns the relation -- but the binding still provisions the NIC. - - cinder-mysql-router: - charm: mysql-router - channel: 8.0/stable - bindings: - '': metal-admin - certificates: metal-internal - db-router: metal-internal - shared-db: metal-internal - - cinder-ceph: - charm: cinder-ceph - channel: 2024.1/stable - bindings: - '': metal-admin - ceph: storage - ceph-access: storage - storage-backend: metal-internal - ceph-mon: - charm: ceph-mon - channel: squid/stable - num_units: 3 - to: [lxd:8, lxd:9, lxd:10] - options: - source: *ceph-source - expected-osd-count: 4 - monitor-count: 3 - bindings: - '': metal-admin - bootstrap-source: storage - client: storage - cluster: replication - mds: storage - mon: storage - osd: storage - public: storage - radosgw: storage - rbd-mirror: storage - constraints: arch=amd64 # provisions the NIC and sets the Ceph public net. Mons use only the - # public net (no cluster binding needed). - - ceph-osd: - charm: ceph-osd - channel: squid/stable - num_units: 4 - to: ["8", "9", "10", "11"] - options: - source: *ceph-source - osd-devices: /dev/vdb # libvirt-attached, MAAS-untracked, wiped pre-deploy - bindings: - '': metal-admin - cluster: replication - mon: storage - public: storage - secrets-storage: metal-internal - constraints: arch=amd64 tags=openstack - - ceph-radosgw: - charm: ceph-radosgw - channel: squid/stable - num_units: 1 - to: [lxd:8] - options: - source: *ceph-source - vip: "10.12.4.60 10.12.8.60 10.12.12.60" # B1 -- radosgw HA un-deferred for Roosevelt fidelity (decorative HA on testcloud) - bindings: - '': metal-admin - certificates: metal-internal - cluster: metal-internal - gateway: metal-internal - ha: metal-internal - identity-service: metal-internal - internal: metal-internal - mon: storage - object-store: metal-internal - public: provider-public - radosgw-user: metal-internal - s3: metal-internal - constraints: arch=amd64 - - # 
===================================================================== - # Dashboard: openstack-dashboard (Horizon) - # ===================================================================== - - openstack-dashboard: - charm: openstack-dashboard - channel: 2024.1/stable - num_units: 1 - to: [lxd:10] - options: - debug: "false" - vip: "10.12.4.58 10.12.8.58 10.12.12.58" # B1 -- browse HTTPS by IP (B5); ALLOWED_HOSTS must permit the VIP IP (verify at deploy) - bindings: - '': metal-admin - application-dashboard: metal-internal - certificates: metal-internal - cluster: metal-internal - dashboard: metal-internal - dashboard-plugin: metal-internal - ha: metal-internal - identity-service: metal-internal - public: provider-public - shared-db: metal-internal - website: metal-internal - websso-fid-service-provider: metal-internal - websso-trusted-dashboard: metal-internal - constraints: arch=amd64 - - dashboard-mysql-router: - charm: mysql-router - channel: 8.0/stable - bindings: - '': metal-admin - certificates: metal-internal - db-router: metal-internal - shared-db: metal-internal - - # ===================================================================== - # Load Balancer: Octavia - # ===================================================================== - # CRITICAL: vault:certificates must be in bundle from day-one (post-deploy add causes the - # documented apache2/octavia-api masking bug -- see test deployment v3 handoff). - - octavia: - charm: octavia - channel: 2024.1/stable - num_units: 1 - to: [lxd:11] - options: - debug: false - openstack-origin: *openstack-origin - amp-image-tag: octavia-amphora # B4 -- MUST match the tag octavia-diskimage-retrofit stamps - # ----- PKI material ------------------------------------------------- - # 5 lb-mgmt-* options are supplied via overlays/octavia-pki.yaml - # (gitignored). Generated per runbooks/01a-octavia-pki-generation.md. 
- # Deploy with: - # juju deploy ./bundle.yaml \ - # --overlay overlays/vr0-dc0-testcloud.yaml \ - # --overlay overlays/octavia-pki.yaml - vip: "10.12.4.57 10.12.8.57 10.12.12.57" # B1 - bindings: - '': metal-admin - amqp: metal-internal - certificates: metal-internal - cluster: metal-internal - ha: metal-internal - identity-service: metal-internal - internal: metal-internal - neutron-api: metal-internal - neutron-openvswitch: metal-internal - ovsdb-cms: data-tenant - ovsdb-subordinate: metal-internal - public: provider-public - shared-db: metal-internal - constraints: arch=amd64 # subset for the subordinate's data binding (subset rule). - - octavia-mysql-router: - charm: mysql-router - channel: 8.0/stable - bindings: - '': metal-admin - certificates: metal-internal - db-router: metal-internal - shared-db: metal-internal - - octavia-dashboard: - charm: octavia-dashboard - channel: 2024.1/stable - - bindings: - '': metal-admin - certificates: metal-internal - dashboard: metal-internal - octavia-diskimage-retrofit: - charm: octavia-diskimage-retrofit - channel: 2024.1/stable - options: - amp-image-tag: octavia-amphora - use-internal-endpoints: true # B4 -- charm ships FALSE; required so the retrofit glance client uses the internal (IP) endpoint - image-format: raw # B4 -- RAW, not the qcow2 default: glance is Ceph-backed, and the charm - # + Ceph docs recommend raw so RBD can fast-clone the amphora (qcow2 - # forces a convert-on-import and defeats CoW). 
- - # ===================================================================== - # Secrets: Barbican - # ===================================================================== - - bindings: - '': metal-admin - certificates: metal-internal - identity-credentials: metal-internal - barbican: - charm: barbican - channel: 2024.1/stable - num_units: 1 - to: [lxd:11] - options: - openstack-origin: *openstack-origin - vip: "10.12.4.51 10.12.8.51 10.12.12.51" # B1 - bindings: - '': metal-admin - amqp: metal-internal - certificates: metal-internal - cluster: metal-internal - ha: metal-internal - identity-service: metal-internal - internal: metal-internal - public: provider-public - secrets: metal-internal - shared-db: metal-internal - constraints: arch=amd64 - - barbican-mysql-router: - charm: mysql-router - channel: 8.0/stable - bindings: - '': metal-admin - certificates: metal-internal - db-router: metal-internal - shared-db: metal-internal - - barbican-vault: - charm: barbican-vault - channel: 2024.1/stable - - # ===================================================================== - # Kubernetes-as-a-Service: Magnum (Layer A -- CAPI graft is Layer B) - # ===================================================================== - # NOTE: After bundle deploys, magnum/0 will show active/idle but CANNOT create K8s clusters. - # Layer B (post-deploy) brings it to life -- see runbooks/phase-06..08: - # 1. In-cloud single-homed mgmt VM (capi-mgmt-v2) with k8s-snap + CAPI/CAPO (phase-06; D-035) - # 2. magnum-capi-helm==1.4.0 grafted onto the conductor (phase-07; D-037/D-042) - # 3. /etc/magnum/magnum.conf.d/00-capi-helm.conf (driver) + 50-keystone-v3-override.conf, - # both read via --config-dir wired into /etc/default/magnum-{conductor,api} (D-037/D-047) - # 4. kubeconfig at /etc/magnum/kubeconfig (server = the mgmt FIP) (phase-07) - # 5. 
magnum trustee domain-setup (REQUIRED; D-046); per-cluster app-creds are - # minted by magnum at cluster-create -- NO static capo user/app-cred (D-039) - - bindings: - '': metal-admin - certificates: metal-internal - secrets: metal-internal - secrets-storage: metal-internal - magnum: - charm: magnum - channel: 2024.1/stable - num_units: 1 - to: [lxd:9] - options: - openstack-origin: *openstack-origin - region: RegionOne - vip: "10.12.4.54 10.12.8.54 10.12.12.54" # B1 - bindings: - '': metal-admin - amqp: metal-internal - certificates: metal-internal - cluster: metal-internal - ha: metal-internal - identity-service: metal-internal - internal: metal-internal - public: provider-public - shared-db: metal-internal - constraints: arch=amd64 - - magnum-mysql-router: - charm: mysql-router - channel: 8.0/stable - bindings: - '': metal-admin - certificates: metal-internal - db-router: metal-internal - shared-db: metal-internal - - magnum-dashboard: - charm: magnum-dashboard - channel: 2024.1/stable - - # ===================================================================== - # HA Cluster Subordinates (11 active for v1: 10 API charms + ceph-radosgw) - # ===================================================================== - # Channel: 2.4/stable (per Caracal Charm Delivery table, D-002 verified 2026-05-22). - # cluster_count: 1 (decorative on single-unit testcloud, D-009 / BUNDLEFIX-003). - # VIPs front-loaded into the MAAS-reserved provider/metal /26 per B1 (.2-.63). - # vault-hacluster stays commented (vault single-unit on mysql, C1 / BUNDLEFIX-002). - # designate-hacluster stays deferred (D-019). 
- # - bindings: - '': metal-admin - certificates: metal-internal - dashboard: metal-internal - keystone-hacluster: {charm: hacluster, channel: 2.4/stable, options: {cluster_count: 1}, bindings: {'': metal-admin, ha: metal-internal, hanode: metal-internal, pacemaker-remote: metal-internal, peer-availability: metal-internal}} - glance-hacluster: {charm: hacluster, channel: 2.4/stable, options: {cluster_count: 1}, bindings: {'': metal-admin, ha: metal-internal, hanode: metal-internal, pacemaker-remote: metal-internal, peer-availability: metal-internal}} - neutron-api-hacluster: {charm: hacluster, channel: 2.4/stable, options: {cluster_count: 1}, bindings: {'': metal-admin, ha: metal-internal, hanode: metal-internal, pacemaker-remote: metal-internal, peer-availability: metal-internal}} - nova-cloud-controller-hacluster: {charm: hacluster, channel: 2.4/stable, options: {cluster_count: 1}, bindings: {'': metal-admin, ha: metal-internal, hanode: metal-internal, pacemaker-remote: metal-internal, peer-availability: metal-internal}} - placement-hacluster: {charm: hacluster, channel: 2.4/stable, options: {cluster_count: 1}, bindings: {'': metal-admin, ha: metal-internal, hanode: metal-internal, pacemaker-remote: metal-internal, peer-availability: metal-internal}} - openstack-dashboard-hacluster: {charm: hacluster, channel: 2.4/stable, options: {cluster_count: 1}, bindings: {'': metal-admin, ha: metal-internal, hanode: metal-internal, pacemaker-remote: metal-internal, peer-availability: metal-internal}} - cinder-hacluster: {charm: hacluster, channel: 2.4/stable, options: {cluster_count: 1}, bindings: {'': metal-admin, ha: metal-internal, hanode: metal-internal, pacemaker-remote: metal-internal, peer-availability: metal-internal}} - octavia-hacluster: {charm: hacluster, channel: 2.4/stable, options: {cluster_count: 1}, bindings: {'': metal-admin, ha: metal-internal, hanode: metal-internal, pacemaker-remote: metal-internal, peer-availability: metal-internal}} - 
barbican-hacluster: {charm: hacluster, channel: 2.4/stable, options: {cluster_count: 1}, bindings: {'': metal-admin, ha: metal-internal, hanode: metal-internal, pacemaker-remote: metal-internal, peer-availability: metal-internal}} - magnum-hacluster: {charm: hacluster, channel: 2.4/stable, options: {cluster_count: 1}, bindings: {'': metal-admin, ha: metal-internal, hanode: metal-internal, pacemaker-remote: metal-internal, peer-availability: metal-internal}} - ceph-radosgw-hacluster: {charm: hacluster, channel: 2.4/stable, options: {cluster_count: 1}, bindings: {'': metal-admin, ha: metal-internal, hanode: metal-internal, pacemaker-remote: metal-internal, peer-availability: metal-internal}} # B1 -- un-deferred - # vault-hacluster: { charm: hacluster, channel: 2.4/stable } # C1: vault single-unit on mysql; HA at Roosevelt - # v2-deferred (D-019): designate-hacluster: { charm: hacluster, channel: 2.4/stable } - - # memcached: nova-cloud-controller token/cell caching (BUNDLEFIX-004) - memcached: - charm: memcached - channel: latest/stable - num_units: 1 - to: [lxd:8] - bindings: - '': metal-admin - cache: metal-internal - cluster: metal-internal - constraints: arch=amd64 - -relations: - - [nova-cloud-controller:memcache, memcached:cache] - - # ---- Vault (single unit, MySQL storage backend via vault-mysql-router; C1 -- etcd+easyrsa removed) - - [vault-mysql-router:db-router, mysql-innodb-cluster:db-router] - - [vault:shared-db, vault-mysql-router:shared-db] - - [mysql-innodb-cluster:certificates, vault:certificates] - # - [vault:ha, vault-hacluster:ha] # vault de-HA'd on testcloud (C1/BUNDLEFIX-002); HA backend a Roosevelt item - - # ---- Keystone (identity, hub of all OS service relations) - - [keystone-mysql-router:db-router, mysql-innodb-cluster:db-router] - - [keystone-mysql-router:shared-db, keystone:shared-db] - - [keystone:certificates, vault:certificates] - - [keystone:ha, keystone-hacluster:ha] - - # ---- Glance (image) - - [glance-mysql-router:db-router, 
mysql-innodb-cluster:db-router] - - [glance-mysql-router:shared-db, glance:shared-db] - - [glance:identity-service, keystone:identity-service] - - [glance:certificates, vault:certificates] - - [glance:ha, glance-hacluster:ha] - - # ---- Glance simplestreams sync (Octavia amphora pipeline source) - - [glance-simplestreams-sync:identity-service, keystone:identity-service] - - [glance-simplestreams-sync:certificates, vault:certificates] - - # ---- Nova cloud controller (NCC) - - [ncc-mysql-router:db-router, mysql-innodb-cluster:db-router] - - [ncc-mysql-router:shared-db, nova-cloud-controller:shared-db] - - [nova-cloud-controller:identity-service, keystone:identity-service] - - [nova-cloud-controller:amqp, rabbitmq-server:amqp] - - [nova-cloud-controller:image-service, glance:image-service] - - [nova-cloud-controller:neutron-api, neutron-api:neutron-api] - - [nova-cloud-controller:cloud-compute, nova-compute:cloud-compute] - - [nova-cloud-controller:cinder-volume-service, cinder:cinder-volume-service] - - [nova-cloud-controller:certificates, vault:certificates] - - [nova-cloud-controller:ha, nova-cloud-controller-hacluster:ha] - - # ---- Nova compute - - [nova-compute:amqp, rabbitmq-server:amqp] - - [nova-compute:image-service, glance:image-service] - - # ---- Placement - - [placement-mysql-router:db-router, mysql-innodb-cluster:db-router] - - [placement-mysql-router:shared-db, placement:shared-db] - - [placement:identity-service, keystone:identity-service] - - [placement:placement, nova-cloud-controller:placement] - - [placement:certificates, vault:certificates] - - [placement:ha, placement-hacluster:ha] - - # ---- Neutron API + OVN - - [neutron-api-mysql-router:db-router, mysql-innodb-cluster:db-router] - - [neutron-api-mysql-router:shared-db, neutron-api:shared-db] - - [neutron-api:identity-service, keystone:identity-service] - - [neutron-api:amqp, rabbitmq-server:amqp] - - [neutron-api:certificates, vault:certificates] - - [neutron-api-plugin-ovn:neutron-plugin, 
neutron-api:neutron-plugin-api-subordinate] - - [neutron-api-plugin-ovn:ovsdb-cms, ovn-central:ovsdb-cms] - - [neutron-api-plugin-ovn:certificates, vault:certificates] - - [ovn-central:certificates, vault:certificates] - - [ovn-chassis:ovsdb, ovn-central:ovsdb] - - [ovn-chassis:nova-compute, nova-compute:neutron-plugin] - - [ovn-chassis:certificates, vault:certificates] - - [neutron-api:ha, neutron-api-hacluster:ha] - - # ---- Cinder + cinder-ceph - - [cinder-mysql-router:db-router, mysql-innodb-cluster:db-router] - - [cinder-mysql-router:shared-db, cinder:shared-db] - - [cinder:identity-service, keystone:identity-service] - - [cinder:amqp, rabbitmq-server:amqp] - - [cinder:image-service, glance:image-service] - - [cinder:certificates, vault:certificates] - - [cinder-ceph:storage-backend, cinder:storage-backend] - - [cinder-ceph:ceph, ceph-mon:client] - - [cinder-ceph:ceph-access, nova-compute:ceph-access] - - [cinder:ha, cinder-hacluster:ha] - - # ---- Ceph mon + osd + radosgw - - [ceph-mon:osd, ceph-osd:mon] - - [ceph-mon:client, nova-compute:ceph] - - [ceph-mon:client, glance:ceph] - - [ceph-radosgw:mon, ceph-mon:radosgw] - - [ceph-radosgw:identity-service, keystone:identity-service] - - [ceph-radosgw:certificates, vault:certificates] - - [ceph-radosgw:ha, ceph-radosgw-hacluster:ha] # B1 -- un-deferred - - # ---- OpenStack Dashboard (Horizon) - - [dashboard-mysql-router:db-router, mysql-innodb-cluster:db-router] - - [dashboard-mysql-router:shared-db, openstack-dashboard:shared-db] - - [openstack-dashboard:identity-service, keystone:identity-service] - - [openstack-dashboard:certificates, vault:certificates] - - [openstack-dashboard:ha, openstack-dashboard-hacluster:ha] - - # ---- Octavia (LBaaS) - # CRITICAL: octavia:certificates <-> vault:certificates MUST be present at deploy time - - [octavia-mysql-router:db-router, mysql-innodb-cluster:db-router] - - [octavia-mysql-router:shared-db, octavia:shared-db] - - [octavia:identity-service, keystone:identity-service] 
- - [octavia:amqp, rabbitmq-server:amqp] - - [octavia:neutron-api, neutron-api:neutron-load-balancer] - - [octavia:certificates, vault:certificates] - - [octavia-dashboard:dashboard, openstack-dashboard:dashboard-plugin] - - [ovn-chassis-octavia:ovsdb, ovn-central:ovsdb] - - [ovn-chassis-octavia:ovsdb-subordinate, octavia:ovsdb-subordinate] - - [ovn-chassis-octavia:certificates, vault:certificates] - # Octavia amphora image pipeline - - [octavia-diskimage-retrofit:juju-info, glance-simplestreams-sync:juju-info] - - [octavia-diskimage-retrofit:identity-credentials, keystone:identity-credentials] - - [octavia:ha, octavia-hacluster:ha] - - # ---- Barbican (secrets) - - [barbican-mysql-router:db-router, mysql-innodb-cluster:db-router] - - [barbican-mysql-router:shared-db, barbican:shared-db] - - [barbican:identity-service, keystone:identity-service] - - [barbican:amqp, rabbitmq-server:amqp] - - [barbican:certificates, vault:certificates] - - [barbican:secrets, barbican-vault:secrets] - - [barbican-vault:certificates, vault:certificates] - - [barbican-vault:secrets-storage, vault:secrets] - - [barbican:ha, barbican-hacluster:ha] - - # ---- Magnum (Layer A only; CAPI graft is Layer B/runbooks phase-06..08) - - [magnum-mysql-router:db-router, mysql-innodb-cluster:db-router] - - [magnum:shared-db, magnum-mysql-router:shared-db] - - [magnum:identity-service, keystone:identity-service] - - [magnum:amqp, rabbitmq-server:amqp] - - [magnum:certificates, vault:certificates] - - [magnum-dashboard:dashboard, openstack-dashboard:dashboard-plugin] - - [magnum:ha, magnum-hacluster:ha] diff --git a/docs/design-decisions.md b/docs/design-decisions.md index b828737..9f8384c 100644 --- a/docs/design-decisions.md +++ b/docs/design-decisions.md @@ -1229,3 +1229,115 @@ model). **Revises:** appendix-D section D.3 -- the role-delegation hypothesis is REFUTED (a clean-role tenant identity still 403'd; the cause was the non-resolving policy template, not role delegation). 
appendix-D to be corrected on finalize. + + +--- + +## D-066: Tenant account model (Option-3 split); cluster-create requires PASSWORD auth + +**Status:** ADOPTED 2026-07-02. Standard per-tenant identity set, used from the first tenant so +v1 and Roosevelt onboard identically (no interim model to retrofit). Trust path validated live +through create_user + create_trust + cert-gen entry (see D-064/D-065); full cluster completion is +gated on D-067 (Barbican/Vault substrate). + +**The per-tenant accounts (operator creates the domain + manager; manager creates the rest):** +- `<client>-domain-admin` -- `manager` on the domain (SCS Domain Manager, D-051/D-064). Tenant + IAM self-service. Operator-provisioned; password handed over. +- `<client>-cluster` -- `member` + `load-balancer_member` on `<client>-prod`. PASSWORD auth only. + Sole purpose: `coe cluster` lifecycle (this is the identity that mints the Keystone trust). Also + owns whatever magnum-internal creds the driver mints under it (the per-cluster CAPO child cred, + D-039). Manager-created. +- `<client>-svc` -- `member` + `load-balancer_member` on `<client>-prod`. UNRESTRICTED app cred, + for tenant-authored non-trust automation (CI/pipelines). NOT used for cluster-create. Manager-created. + +**Why password (not app cred) for cluster-create -- the hard constraint:** keystone on this build +blocks trust creation from application-credential tokens unconditionally. Source +(keystone/api/trusts.py `_check_application_credential`, read live 2026-07-02): if the token method +is `application_credential` it raises "Using method 'application_credential' is not allowed for +managing trusts" -- and the docstring states this applies "regardless of the 'unrestricted' flag" +(this build is STRICTER than upstream, which exempts unrestricted). Confirmed live: an unrestricted +app cred (which minted a child cred, so genuinely unrestricted) still hit this block; the same +identity via PASSWORD passed it. So the cluster-creator must be password-authenticated.
+ +**Security rationale (this is the correct side of the control, not a workaround):** the block exists +to stop a single-project app-cred token from extending the trust delegation chain beyond its scope. +A password token is full-user scope, so a trust minted from it delegates only what the user already +holds (member + load-balancer_member on its own project) -- no escalation, confined to the tenant. +Splitting `<client>-cluster` (trust-capable password, cluster ops only) from `<client>-svc` (app cred, everything +else) isolates the trust-capable credential to one identity doing one job. + +**REJECTED:** `CONF.security_compliance.allow_insecure_application_credential_trust_escalation` +(the escape hatch in `_check_application_credential`). Enabling a setting named "insecure" + +"trust_escalation" -- which defeats a control designed to contain a compromised tenant credential -- +undercuts the hard-isolation thesis of the whole build. Not adopted. + +**Roosevelt:** onboard every tenant with this three-account set. Script it (scripts/tenant-onboard.sh, +DRAFT this session). CAPO-cred ownership under `<client>-cluster` confirmed acceptable (magnum-internal, +created/destroyed with the cluster) -- the empirical "whose identity mints the child cred" check +is pending the D-067 substrate fix (cluster-create dies at cert-gen before the driver's mint step). + +**Related:** D-051/D-064 (manager persona), D-065 (create_trust template fix), D-039 (trustor roles). +**Revises:** appendix-D D.3 (the app-cred-creator assumption is wrong -- corrected in appendix-D this session). + +--- + +## D-067: barbican-vault -> Vault (vault-kv) must use the metal-internal plane (live drift; the cert-gen blocker) + +**Status:** ADOPTED 2026-07-02 (fix pending next session -- a live rebind, gated). This is the defect +that blocks multi-tenant cluster COMPLETION (cluster-create clears trust, then dies generating certs).
+ +**Symptom chain:** tenant cluster-create (password identity) cleared create_user + create_trust, +then `CREATE_FAILED: Failed to create certificates`. Root cause traced through magnum -> Barbican +(`POST /v1/secrets` 500) -> castellan vault_key_manager `_build_auth_headers` -> Vault AppRole login +rejected. Vault's RAW error (not an HTTP code -- the authoritative read): +`source address "10.12.8.176" unauthorized through CIDR restrictions on the secret ID`. + +**Root cause:** barbican reaches Vault at Vault's METAL-ADMIN address (vault_url=http://10.12.8.190:8200, +egress src 10.12.8.176 via eth2). Vault's barbican-vault AppRole binds secret_id to the metal-internal +CIDR (where service-to-service traffic belongs, D-052/D-053). Off-plane source -> Vault rejects. This +is NOT secret_id expiry: `juju run vault/leader refresh-secrets` DID rotate the secret_id (barbican.conf +re-rendered, barbican restarted) and the login STILL failed with the CIDR error. The earlier +"expired TTL" reading was WRONG -- corrected here. + +**The bundle is CORRECT; the LIVE env drifted.** bundle.yaml already binds every secrets-path endpoint +to metal-internal: vault `secrets: metal-internal` (L130), barbican `secrets: metal-internal` (L667), +barbican-vault `secrets-storage: metal-internal` (L700). So a redeploy from the bundle would NOT have +this defect. The live deployment's effective binding for the barbican<->vault secrets path resolves to +metal-admin -- a drift from the bundle (mechanism: TBD by read-only `juju show-application` binding + +relation-data diagnosis next session; candidate causes: bindings added to bundle post-deploy and never +applied live, or the subordinate address-advertisement not following the bound space). + +**Fix (rebind, do NOT widen the CIDR):** reconcile the live binding to the bundle so barbican egresses +from its metal-internal address (10.12.12.110), which the AppRole CIDR already trusts. 
Widening the +AppRole CIDR to metal-admin is REJECTED: it loosens a security control, legitimizes east-west traffic +on the wrong plane, gets reverted by the charm, and leaves the next Vault consumer exposed. Next-session +sequence: (1) read-only binding diagnosis (`juju show-application vault barbican barbican-vault`, +spaces<->subnets map); (2) gated `juju bind` / relation refresh to metal-internal; (3) re-run +refresh-secrets if needed; (4) confirm barbican AppRole login = HTTP 200 from a metal-internal source; +(5) re-run tenant cluster-create -> cert-gen clears. + +**Related:** D-052/D-053 (six-plane network; metal-internal carries east-west service traffic), +D-057 (prior topology-didn't-follow-binding defect -- same family). **Corrects:** the in-session +"secret_id TTL expiry" hypothesis (refuted by the refresh-secrets test). + +--- + +## D-068: PROPOSED -- Vault substrate hardening (Roosevelt) + +**Status:** PROPOSED / OPEN 2026-07-02. Grouped substrate items surfaced while diagnosing D-067. +None block v1 tenant work; all are Roosevelt-durability items. + +1. **Vault version.** Live is 1.8.8 (charm 1.8/stable) -- EOL. bundle.yaml pinned to 1.16/stable + this session (D-068). Live upgrade is a MAJOR operation (multi-minor jump; unseal keys ready, + storage-format compat, re-unseal after restart, ideally rehearsed) -- NOT a casual `juju refresh`. + Verify the channel exists and rehearse before applying live. +2. **Vault over cleartext HTTP.** vault_url is `http://...:8200`; barbican<->Vault secret writes + (including cluster CA private keys) cross metal-internal unencrypted. Enable Vault listener TLS + for Roosevelt. +3. **AppRole credential lifecycle.** `refresh-secrets` exists as a manual action -> implies no + automatic secret_id renewal. 
Audit secret_id TTLs across all vault-kv consumers; provision + long/renewable TTLs or charm auto-renewal; add a proactive health probe that validates each + consumer's Vault auth (the T4-style login check) BEFORE expiry, rather than discovering failure + via a tenant's failed cluster-create. + +**Related:** D-067 (surfaced these), D-052/D-053. diff --git a/docs/session-findings-2026-07-02.md b/docs/session-findings-2026-07-02.md new file mode 100644 index 0000000..2805d16 --- /dev/null +++ b/docs/session-findings-2026-07-02.md @@ -0,0 +1,77 @@ +# Session findings -- 2026-07-02 (multi-tenant tenant->cluster buildout) + +## Executive summary +The tenant IDENTITY/TRUST path is DONE and PROVEN. A tenant password identity now creates a Magnum +cluster through create_user (D-064), create_trust (D-065), and into certificate generation. Cluster +COMPLETION is blocked one step later by an OPERATOR-side Barbican/Vault substrate defect (D-067), +independent of the tenant model. The Option-3 tenant account model (D-066) is adopted and to be used +from the first tenant. Next session: fix D-067 (live), then full tenant buildout + tenant-facing tests. + +## The trust-blocker chain (how we got from "cluster 403s" to "done + one substrate bug") +1. D-064 (prior): create_user template fix unblocked trustee-user creation. +2. create_trust then 403'd for EVERY caller (admin included), even trustor==self via direct + `openstack trust create`. Root cause: base policy shipped identity:create_trust with the + non-resolving `user_id:%(trust.trustor_user_id)s` (Caracal populates target.trust.trustor_user_id). + -> D-065: override with the target-prefixed form keystone itself ships. PROVEN by toggling the + override off (still 403 -> base policy owns it) then on. +3. After D-065, create_trust via APP CRED still failed: keystone `_check_application_credential` + blocks trust creation from app-cred tokens "regardless of the unrestricted flag" (this build's + docstring). 
-> D-066: cluster-create MUST be PASSWORD auth; adopt Option-3 account split. + `allow_insecure_application_credential_trust_escalation` REJECTED (isolation). +4. Password create_trust PASSED. Cluster then failed at cert-gen -> Barbican 500 -> castellan + vault_key_manager -> Vault AppRole login rejected: "source address 10.12.8.176 unauthorized + through CIDR restrictions". -> D-067. + +## D-067 root cause (and a corrected mis-diagnosis) +barbican reaches Vault on the METAL-ADMIN plane (vault_url=10.12.8.190, egress 10.12.8.176). Vault's +barbican-vault AppRole binds the secret_id to the METAL-INTERNAL CIDR (where east-west service traffic +belongs, D-052/D-053). Off-plane source -> rejected. The bundle is CORRECT (vault/barbican/barbican-vault +all bind secrets endpoints to metal-internal, lines 130/667/700); the LIVE env drifted. Fix = live +rebind to metal-internal (gated, next session), NOT CIDR-widen. +CORRECTED: mid-session I hypothesized "secret_id TTL expiry". REFUTED -- `juju run vault/leader +refresh-secrets` rotated the secret_id (barbican.conf re-rendered, service restarted) and the login +STILL failed with the CIDR error. It was never expiry; it is plane/CIDR. + +## What is validated live (tenant acme) +- Manager persona self-service via CLI (create_project/user/grant) -- D-064 G3. PASS. +- Tenant isolation: anti-escalation (admin grant DENIED); cross-domain resource reads DENIED/hidden; + domain enumeration OWN-DOMAIN-ONLY (tighter than appendix-C's SCS worst-case -- appendix-C corrected). +- App-cred + keypair self-mint; tenant L3 (net/subnet/router/ext-gw, SNAT proven) by a non-admin + app-cred identity. +- Cluster template create (image by UUID -- name form has a quoting/derivation hazard). +- Cluster create through create_user + create_trust (password) into cert-gen. + +## Decisions logged +- D-066: Option-3 tenant accounts (domain-admin/cluster/svc); cluster-create requires password auth. 
+- D-067: barbican-vault -> Vault must use metal-internal (live drift; the cert-gen blocker). ADOPTED, fix pending. +- D-068: PROPOSED -- Vault substrate hardening (1.16 pin [bundle done], TLS, AppRole lifecycle). + +## Probe-discipline lessons (now runbook conventions -- these recurred and cost time) +1. Validate raw output WHOLE, never extract-then-check. A `tr -dc 0-9` MARK guard turned an error + string ("...10.12.8.30:17070...") into MARK=123101283017070 and passed. Use `case "$raw" in + ''|*[!0-9]*) fail;; *) ok;; esac`. +2. Whitelist-print secrets, never blacklist-redact. `approle_secret_id` leaked past a `secret`-keyed + redact (the key is *_secret_id*). Print only an allowlist of safe fields; never pipe secrets. +3. No `exit`/bare-`return` in interactive PASTE blocks (they escape to the login shell and logged the + operator out). Subshell-wrap `( ... )`. NOTE: executed .sh scripts may use exit normally. +4. Privileged reads over `juju ssh` use `sudo cat file | ...`, never `sudo cmd < file` (the redirect + runs UNPRIVILEGED -> Permission denied). +5. Use the deployment's DECLARED endpoint/scheme, not the conventional one (assumed Vault https; it + serves http -- every probe errored on scheme until corrected). +6. A parser that can print NOTHING has a silent third state -- read raw + self-report inputs (field + lengths, raw body) so a malformed-request 400 can't masquerade as an auth failure. + +## Roosevelt hardening backlog (from this session) +- D-067/D-068: metal-internal binding discipline for ALL vault-kv consumers; Vault 1.16 + TLS; + AppRole secret_id lifecycle (TTL/renewal + proactive auth health probe). +- Endpoint/credential "follow the topology" is now a recurring class (with D-057): consider stable + VIP/DNS endpoints for substrate services so leader/re-IP changes don't silently break consumers. + +## Next session plan +1. 
Repair live env: read-only binding diagnosis (`juju show-application vault barbican barbican-vault`, + spaces<->subnets), then GATED `juju bind` of the barbican<->Vault secrets path to metal-internal; + re-run refresh-secrets if needed; confirm barbican AppRole login HTTP 200 from metal-internal. +2. Re-run tenant cluster-create (acme, ${CLIENT}-cluster password) -> cert-gen clears -> watch to + CREATE_COMPLETE; capture the CAPO child-cred mint identity (confirms D-066). +3. Full tenant buildout via scripts/tenant-onboard.sh; then clean-room `beta` (zero admin fallback). +4. Tenant-facing tests: kubeconfig, nodes/CNI/CCM, a tenant LB, tenant isolation from a second tenant. diff --git a/docs/v1-redeploy-changelog.md b/docs/v1-redeploy-changelog.md index db523ff..0c7ee16 100644 --- a/docs/v1-redeploy-changelog.md +++ b/docs/v1-redeploy-changelog.md @@ -1323,8 +1323,39 @@ appendix-D D.3 (role-delegation hypothesis) REFUTED and to be revised. list_trusts="" info-disclosure recorded PROPOSED/OPEN, not actioned. +## 2026-07-02 -- multi-tenant tenant->cluster buildout: trust path PROVEN; Barbican/Vault substrate blocker + +Ran a full multi-tenant buildout (tenant `acme`) validating the SCS Domain Manager persona end to +end and the tenant cluster-create path. Outcomes: +- IDENTITY/TRUST PATH DONE + PROVEN: D-064 (create_user) + D-065 (create_trust template fix) + the + D-066 Option-3 password model carried cluster-create through create_user, create_trust, and INTO + cert generation. Manager self-service (create_project/user/grant) validated via CLI (G3); tenant + isolation validated (anti-escalation DENY; cross-domain DENY; domain-enumeration own-domain-only, + tighter than appendix-C's SCS worst-case -- appendix-C already corrected). +- KEY CONSTRAINT (D-066): keystone blocks trust creation from app-cred tokens regardless of + unrestricted (this build's _check_application_credential). Cluster-creator MUST be password-auth. 
+ Adopted Option-3 3-account model (domain-admin / cluster / svc). allow_insecure_...escalation REJECTED. +- BLOCKER (D-067): cluster-create dies at cert-gen -- Barbican 500 -> Vault AppRole login rejected by + CIDR restriction ("source address 10.12.8.176 unauthorized"). barbican->Vault uses metal-admin; + should be metal-internal (D-052/D-053). Bundle is CORRECT (all secrets endpoints metal-internal); + LIVE drifted. Fix = live rebind (gated, next session), NOT CIDR-widen. refresh-secrets rotated the + secret_id but did not fix it (CIDR, not expiry -- "TTL" hypothesis refuted). +- D-068 PROPOSED: Vault substrate hardening (1.16 pin [bundle done], TLS, AppRole lifecycle). + +Package this session: bundle vault 1.16/stable pin; D-066/067/068; appendix-D corrected (D.3 refuted); +appendix-C account table -> Option-3; tenant-onboarding-v2 updated to Option-3 + password-create + +substrate note; scripts/tenant-onboard.sh (DRAFT); session-findings doc (incl. probe-discipline lessons). + +PROBE-DISCIPLINE lessons logged (recurred this session; now runbook conventions): (1) validate raw +output WHOLE, never extract-then-check (a `tr -dc 0-9` MARK guard synthesized a fake number from an +error string); (2) whitelist-print secrets, never blacklist-redact (an `approle_secret_id` leaked past +a `secret`-keyed redact); (3) no `exit`/bare-`return` in interactive paste -- subshell-wrap (an `exit` +logged the operator out); (4) privileged reads over juju ssh use `sudo cat`, never `sudo cmd < file` +(unprivileged redirect -> Permission denied); (5) read the deployment's DECLARED endpoint/scheme, not +the conventional one (assumed Vault https; it serves http). + ### Next-free numbers -Design decision: D-066. Doc fix: DOCFIX-065. (D-064 ASSIGNED above = reconcile D-051 to scs-0302 +Design decision: D-069. Doc fix: DOCFIX-065. (D-064 ASSIGNED above = reconcile D-051 to scs-0302 + create-op templating fix. 
DOCFIX-064 RESERVED = phase-08 runbook sweep (image --public; seed retry/timeout + poll hard-gate + post-active property re-verify; image-absent guard; template capi-mgmt scope preamble + flavor floor; 8.1 D-039 role + keypair pre-checks; octavia prereq diff --git a/runbooks/appendix-C-identity-rbac.md b/runbooks/appendix-C-identity-rbac.md index e8a1f55..3988239 100644 --- a/runbooks/appendix-C-identity-rbac.md +++ b/runbooks/appendix-C-identity-rbac.md @@ -37,10 +37,16 @@ | admin (operator super-admin) | Admin | admin domain + admin project (= cloud_admin) | Cloud operator; full authority. Bootstrap identity. | | admin (as Magnum trustor) | member + load-balancer_member + reader | capi-mgmt project | So the app-cred Magnum mints per cluster carries Octavia authority for the apiserver LB (D-039). These are the frozen trustor roles delegated into each cluster trust. | | magnum_domain_admin | Admin | magnum domain | Magnum trustee domain admin; creates the per-cluster trustee USER at cluster-create (D-046; Magnum docs). Works via the D-064 create-op fix -- no extra grant needed. Recreated by the `domain-setup` charm action after every teardown/redeploy (D-046). | -| -domain-admin | manager | the tenant's domain | SCS Domain Manager persona (D-051/D-064). Operator provisions the domain + this one account; the tenant self-services users, projects, and member/load-balancer_member grants from there. | -| human users | member (+ load-balancer_member if they use LBs or Magnum) | the tenant's project(s) | Created and assigned by the tenant's own domain-manager via Horizon/CLI. Operator is not in the loop. | -| -ci / service accounts | member + load-balancer_member | the tenant's project | Backing identity for the application credential that CI/automation authenticates with. load-balancer_member so tenant CI can drive Magnum/LBs. 
| -| per-cluster trustee | (delegated via trust -- not a direct grant) | -- | Magnum mints this at cluster-create and deletes it at cluster-delete. It carries the trustor's frozen roles through the trust (D-039). Never assign roles to it by hand. | +| `<client>-domain-admin` | manager | the tenant's domain | SCS Domain Manager persona (D-051/D-064). PASSWORD identity. Operator provisions the domain + this one account; the tenant self-services everything below via CLI/Horizon. | +| `<client>-cluster` | member + load-balancer_member | the tenant's project | D-066 Option-3. PASSWORD identity, trust-capable. SOLE purpose: `coe cluster` lifecycle (mints the Keystone trust -- MUST be password, since keystone blocks trust creation from app-cred tokens; see D-066). Also owns the magnum-internal CAPO child cred the driver mints per cluster (D-039). Manager-created. | +| `<client>-svc` | member + load-balancer_member | the tenant's project | D-066 Option-3. UNRESTRICTED app cred, for tenant-authored non-trust automation (CI/pipelines). NOT used for cluster-create (app-cred trust creation is blocked). Manager-created. | +| human users | member (+ load-balancer_member if they use LBs) | the tenant's project(s) | Created/assigned by the tenant's domain-manager. Operator not in the loop. | +| per-cluster trustee | (delegated via trust -- not a direct grant) | -- | Magnum mints at cluster-create, deletes at cluster-delete. Carries the trustor (`<client>-cluster`) roles through the trust (D-039). Never assign roles by hand. | + +IMPORTANT (D-066): the cluster-creator (`<client>-cluster`) authenticates by PASSWORD, not app cred. +Keystone blocks trust creation from application-credential tokens regardless of the unrestricted flag +(this build's `_check_application_credential`), and Magnum needs a Keystone trust at cluster-create. +The app-cred identity (`<client>-svc`) is for non-trust automation only. See appendix-D. Provisioning direction: the operator creates a tenant's DOMAIN and its single `manager` account, then hands off.
Everything below the domain (users, projects, member/LB grants) is tenant diff --git a/runbooks/appendix-D-magnum-trust-model.md b/runbooks/appendix-D-magnum-trust-model.md index b6d193c..6df63f7 100644 --- a/runbooks/appendix-D-magnum-trust-model.md +++ b/runbooks/appendix-D-magnum-trust-model.md @@ -1,176 +1,100 @@ -# Appendix D -- Magnum cluster-create trust model (multi-tenant) +# Appendix D -- Magnum cluster-create trust model (multi-tenant) [REVISED 2026-07-02] -Fills the gap the onboarding runbook Stage 7 marks [PENDING]: exactly which identity -creates a Magnum cluster, and why the Keystone trust delegation constrains that choice. -Grounded in the magnum source (magnum/common/keystone.py, read live 2026-07-01) and the -D-039 / D-051 / D-064 identity model. Supersedes the single-consumer shortcut used on -2026-06-09 (admin creates in the admin-owned capi-mgmt project), which sidesteps -- rather -than exercises -- the trust constraint and therefore does NOT validate the tenant path. +Fills onboarding Stage 7. Grounded in the magnum + keystone source (read live 2026-07-02) and the +live multi-tenant validation (tenant `acme`). This revision CORRECTS the 2026-07-01 draft, whose +central hypothesis (D.3, "a clean-role tenant identity delegates the trust") was REFUTED live -- the +real blockers were a keystone policy template (D-065) and an app-cred trust restriction (D-066), +neither of which is about role delegation. + +VALIDATION STATUS: the identity/trust path is PROVEN end to end -- a tenant password identity clears +create_user (D-064) and create_trust (D-065), then magnum proceeds to certificate generation. Cluster +COMPLETION is currently blocked one step later at the Barbican/Vault cert substrate (D-067), an +operator-side defect independent of the tenant model. 
-------------------------------------------------------------------------------- -## D.1 What magnum does at cluster-create (the mechanism) +## D.1 What magnum does at cluster-create (the mechanism, in order) -------------------------------------------------------------------------------- -Two Keystone writes happen before any infrastructure is touched -(magnum/conductor/handlers/common/trust_manager.py -> create_trustee_and_trust): - -1. create_trustee -> `identity:create_user` - Magnum's trustee_domain_admin (magnum_domain_admin, Admin on the magnum domain) - creates a per-cluster service user in the magnum domain. This is the step D-064 - unblocked (the create_user policy templating fix). VALIDATED live 2026-07-01: - trustee user is created successfully. - -2. create_trust -> `identity:create_trust` - Magnum creates a Keystone trust delegating the CALLER's roles to that trustee. - From magnum/common/keystone.py: - - def create_trust(self, trustee_user): - trustor_user_id = self.session.get_user_id() # the CALLER's user - trustor_project_id = self.session.get_project_id() # the CALLER's project - if CONF.trust.roles: - roles = CONF.trust.roles # (unset on this deploy) - else: - roles = self.context.roles # -> the roles in the CALLER's token - self.client.trusts.create( - trustor_user=trustor_user_id, project=trustor_project_id, - trustee_user=trustee_user, impersonation=True, role_names=roles) - -Two facts follow directly from that code, and they are the whole model: - - A. The TRUSTOR is the identity that issued `openstack coe cluster create` - (`self.session` is the request-context client). The Keystone policy - `identity:create_trust = "user_id:%(trust.trustor_user_id)s"` is therefore - satisfied by construction -- caller == trustor. (So the create_trust 403 is - NOT a trustor-identity policy failure.) - - B. The DELEGATED ROLES are `self.context.roles` -- the roles present in the - CALLER's token on `trustor_project_id`. 
Keystone's create_trust REFUSES to - delegate any role the trustor does not actually hold on that project - (a trust cannot grant more than the trustor has). `CONF.trust.roles` is unset - here, so magnum delegates the caller's token roles verbatim -- whatever they are. +1. create_trustee -> identity:create_user (magnum_domain_admin creates the per-cluster trustee user in + the magnum domain). Unblocked by D-064. PROVEN live. +2. create_trust -> identity:create_trust (the cluster CREATOR is the trustor; trustee is the step-1 + user; impersonation=True; roles = the creator's token roles). Unblocked by D-065 + password auth. + PROVEN live. +3. generate_certificates_to_cluster -> stores the cluster CA cert in BARBICAN, which stores it in Vault + (castellan vault_key_manager). CURRENT BLOCKER -- see D-067. +4. (then) the capi-helm driver mints the per-cluster CAPO child app credential (D-039) and provisions + via helm/CAPI. NOT YET REACHED on the multi-tenant path. -------------------------------------------------------------------------------- -## D.2 Why the 2026-06-09 single-consumer path "worked" (and why we retired it) +## D.2 Two hard constraints on WHO creates the cluster -------------------------------------------------------------------------------- -On 2026-06-09 the cluster was created by ADMIN, scoped to the admin-owned capi-mgmt -project. Admin trivially holds (or cloud-admin-bypasses) every role it delegates to -itself, so create_trust never exercised the delegation constraint. That is a -SINGLE-CONSUMER shortcut: one privileged operator standing in for the tenant. It -proves the driver/CAPI plumbing but NOT the multi-tenant identity path, because in -the real product the cluster creator is a TENANT, not the cloud operator. +Constraint 1 -- the create_trust policy template (D-065). 
This cloud's charm-rendered base policy +shipped identity:create_trust = "user_id:%(trust.trustor_user_id)s", a non-resolving template on +Caracal (keystone populates target.trust.trustor_user_id). It evaluated false for EVERY caller (admin +included), regardless of roles -- proven by a direct `openstack trust create` with trustor==self still +403ing. Fixed by D-065 (override with the target-prefixed form keystone itself ships). This is why the +2026-07-01 role-delegation hypothesis was wrong: the failure was templating, not roles. -The admin-in-capi-mgmt attempt on 2026-07-01 then 403'd at create_trust because that -mixed scope (admin user, capi-mgmt project) is not a clean delegatable-role identity -on capi-mgmt -- and, under D-064, admin scoped to capi-mgmt is a RESTRICTED identity -there (it is not cloud_admin outside the admin domain; `list_role_assignments` 403s -in that scope, confirmed live). It is the wrong identity for the tenant model on two -counts: it is the operator, and its token roles are not the tenant delegatable set. +Constraint 2 -- app credentials cannot create trusts (D-066). After D-065, an app-cred-authenticated +create_trust STILL failed -- keystone's _check_application_credential (trusts.py) blocks trust creation +from any application-credential token, and on this build the docstring states this applies "regardless +of the 'unrestricted' flag". Confirmed live: an unrestricted app cred was refused; the same identity via +PASSWORD passed. Therefore the cluster-creator MUST authenticate with a PASSWORD. + +Consequence: the cluster-creator is `<client>-cluster` (password, member + load-balancer_member on the +tenant project) per the D-066 Option-3 account model. The app cred (`<client>-svc`) is for non-trust +automation only. See appendix-C for the full account set. + +Keystone's create_trust ALSO enforces _require_trustor_has_role_in_project (the trustor must hold each +delegated role on the project).
Magnum delegates the creator's token roles, which are by construction a +subset of what the creator holds on the scoped project -> passes. -------------------------------------------------------------------------------- -## D.3 The multi-tenant rule (what identity must create the cluster) +## D.3 The multi-tenant rule (CORRECTED) -------------------------------------------------------------------------------- -RULE: a Magnum cluster is created by the TENANT's own project-scoped identity, whose -token carries EXACTLY the delegatable tenant roles -- `member` and -`load-balancer_member` (and `reader` where used) -- and NOT `admin`. +A Magnum cluster is created by the tenant's `<client>-cluster` identity, authenticating by PASSWORD, +project-scoped to `<client>-prod`, holding exactly member + load-balancer_member. Not admin, not an app +cred. This satisfies: create_trust policy (D-065), the app-cred block (password, D-066), the +trustor==caller check (by construction), and the trustor-has-role check (D-039-style grants). -Rationale, straight from D.1.B: - - The trust delegates `context.roles`. If the creator's token carries `admin`, - magnum tries to delegate `admin` into the trust; Keystone refuses a trust that - grants a role the trustor does not properly hold as a delegatable project grant, - and even if it did, delegating `admin` into a long-lived cluster credential is a - privilege-escalation footgun (the trustee impersonates the trustor with - impersonation=True). The tenant set (member + load-balancer_member) is the - correct, least-privilege delegation. - `load-balancer_member` MUST be in the creator's token: the magnum-capi-helm - driver provisions an Octavia LB for the apiserver, and the trust must carry - Octavia authority or CAPO 403s at LB reconcile (D-039). This is exactly why - D-039 grants the trustor `load-balancer_member` on the cluster project. - `member` provides the compute/network/volume authority the cluster's CCM/CSI - need via the trust.
- -WHO THIS IS, per the onboarding model (tenant-onboarding-runbook Stage 2/4): - - The tenant's SERVICE identity: `-ci` / `-svc`, holding - `member` + `load-balancer_member` on `-prod`, authenticating with its - UNRESTRICTED application credential (the app cred is required so the driver can - mint the per-cluster CAPO child cred -- D-039 / onboarding Stage 4). - - Equivalently a tenant human user with `member` + `load-balancer_member` on the - project, but the service/app-cred identity is the production path (Jenkins/CI). - -The operator (admin / cloud_admin) does NOT create tenant clusters. The capi-mgmt -project is the MANAGEMENT-plane project (where the CAPI mgmt cluster VM and the -operator's own D-039 roles live for the mgmt cluster itself); tenant clusters are -created in the TENANT's project by the TENANT's identity. +The 2026-06-09 single-consumer path (admin creates in the admin-owned capi-mgmt project) sidesteps the +trust-delegation constraint and does NOT validate the tenant model -- retired. -------------------------------------------------------------------------------- -## D.4 Trustor role-set validation (run before the create) +## D.4 The current blocker (D-067) -- operator-side, not tenant-side -------------------------------------------------------------------------------- -Confirm the creating identity's TOKEN carries the delegatable set and nothing that -cannot be delegated. Run AS the tenant creator identity (app cred or password): +Step D.1(3) fails: magnum -> Barbican POST /v1/secrets returns 500 -> castellan vault_key_manager -> +Vault AppRole login rejected: source address "10.12.8.176" unauthorized through CIDR restrictions. +barbican reaches Vault on the metal-admin plane; Vault's AppRole binds the secret_id to metal-internal +(D-052/D-053). The bundle is correct (all secrets endpoints metal-internal); the LIVE binding drifted. +Fix = live rebind to metal-internal (gated, next session), NOT CIDR-widen. Full detail in D-067. 
- # as the tenant service identity, project-scoped to -prod - openstack token issue -f value -c user_id -c project_id # confirm scope - # roles in THIS token == what magnum will delegate (context.roles): - openstack role assignment list --user \ - --project --effective --names -f value -c Role | sort - -GATE: the role set is a subset of { member, load-balancer_member, reader }, and -INCLUDES load-balancer_member. If `admin` appears, this is the wrong identity -- -do not create with it. - -Note: a tenant/app-cred identity cannot run `role assignment list` for other users -(policy 403, by design). Query only its own assignment, or read it as admin -beforehand during onboarding. +This is independent of the tenant identity model: it blocks cert-gen for ANY creator. Once D-067 is +fixed, cluster-create should proceed past cert-gen into the capi-helm driver's provisioning, where the +CAPO child-cred mint (D-039) happens under -cluster. -------------------------------------------------------------------------------- -## D.5 The create (tenant identity), and the trust it produces +## D.5 The create (tenant -cluster, PASSWORD) -------------------------------------------------------------------------------- - # authenticate as the tenant service identity via its app cred (onboarding Stage 4) - # OS_AUTH_TYPE=v3applicationcredential + the app cred id/secret from the 0600 file - # then, project-scoped to the tenant project: - openstack coe cluster create \ - --cluster-template \ - --keypair \ - --master-count 1 --node-count 2 - - # verify the trust was created and carries the tenant roles: - openstack coe cluster show -f value -c status -c trustee_user_id - # status -> CREATE_IN_PROGRESS (past trustee+trust), NOT CREATE_FAILED at ~3s. - -Expected: create_user (D-064) AND create_trust both pass, because the creator is the -trustor and its token roles (member + load-balancer_member) are cleanly delegatable -on the tenant project. 
The driver then proceeds to helm/CAPI provisioning. + # authenticate as -cluster via PASSWORD (NOT app cred), project-scoped to -prod + # OS_USERNAME=-cluster OS_USER_DOMAIN_ID= OS_PROJECT_ID= OS_PASSWORD=... + # OS_CACERT= OS_AUTH_URL=https://:5000/v3 + openstack coe cluster create --cluster-template -k8s \ + --keypair -key --master-count 1 --node-count 1 + openstack coe cluster show -f value -c status -c status_reason + # expect (post D-067): CREATE_IN_PROGRESS -> ... -> CREATE_COMPLETE -------------------------------------------------------------------------------- -## D.6 Roosevelt +## D.6 Open validation items (next session) -------------------------------------------------------------------------------- - - Cluster-create is a TENANT self-service operation, performed by the tenant's - app-cred identity carrying member + load-balancer_member on the tenant project. - Wire it into the tenant CI (Jenkins) path (onboarding Stage 7), never the - operator admin. - - Optionally pin `CONF.trust.roles = member,load-balancer_member` in magnum.conf - (via the D-037 conf.d mechanism) to make the delegated set EXPLICIT and - independent of whatever roles happen to be in the caller's token -- a hardening - that removes the "wrong token roles" failure mode entirely. Decide as a tracked - item; unset (inherit context.roles) is the upstream default and works when the - creator identity is correct. - - The management-plane capi-mgmt project + the operator's D-039 roles there remain - for the MGMT cluster; they are not the tenant cluster-create path. - --------------------------------------------------------------------------------- -## D.7 Open validation item --------------------------------------------------------------------------------- - -This appendix establishes the model from the magnum source and the identity design. 
-The live behavioral confirmation on THIS cloud -- create a cluster as a tenant -app-cred identity (member + load-balancer_member) and observe create_trust succeed -- -is the acceptance step, and folds into onboarding Stage 7 (currently [PENDING]) and -the D-011 gate. Until run, D.3 is design-derived-from-source, not yet live-verified -on the multi-tenant path. (UPDATE 2026-07-01: onboarding Stages 1-4 VALIDATED live as tenant acme -- manager -self-service, app-cred cluster-creator with member+load-balancer_member, tenant L3. Stage 5 -template = corrected-pending (image-by-UUID). Stage 6 create_trust = the outstanding item; -the create_user half (D-064) is confirmed live.) +1. Fix D-067 (barbican<->Vault metal-internal rebind), then re-run the create -> cert-gen clears. +2. Watch to CREATE_COMPLETE; capture where/whose the CAPO child cred is minted (confirms D-066's + -cluster-owns-CAPO-cred design empirically). +3. kubeconfig + nodes/CNI/CCM (phase-08 8.3 pattern). +4. Clean-room beta pass: onboard a fresh tenant from ONLY handed-over credentials (zero admin + fallback) via scripts/tenant-onboard.sh, and complete the tenant-facing tests. diff --git a/runbooks/tenant-onboarding-v2-DRAFT.md b/runbooks/tenant-onboarding-v2-DRAFT.md index cdda314..2533a8d 100644 --- a/runbooks/tenant-onboarding-v2-DRAFT.md +++ b/runbooks/tenant-onboarding-v2-DRAFT.md @@ -1,3 +1,18 @@ +# >>> 2026-07-02 REVISION BANNER (read first) <<< +# This 2026-07-01 draft is SUPERSEDED on three points by the 2026-07-02 session. Where they conflict, +# the following + appendix-D + scripts/tenant-onboard.sh are AUTHORITATIVE: +# 1. ACCOUNT MODEL is now Option-3 (D-066): -domain-admin (manager, password), +# -cluster (PASSWORD, trust-capable, cluster lifecycle), -svc (unrestricted +# app cred, non-trust automation). The single-'-svc'-does-everything model below is retired. +# 2. CLUSTER-CREATE identity is -cluster via PASSWORD, NOT an app cred.
Keystone blocks +# trust creation from app-cred tokens regardless of unrestricted (D-066/appendix-D). The +# Stage 6 "create as app cred" below is WRONG -- use the password identity. +# 3. Cluster COMPLETION is gated on D-067 (barbican<->Vault metal-internal rebind); cluster-create +# currently dies at cert-gen until that live fix lands. +# The runnable, corrected procedure is scripts/tenant-onboard.sh (Option-3). Stages 0-5 validated; +# stage 6 gated on D-067. +# >>> END REVISION BANNER <<< + # Tenant Onboarding v2 -- multi-tenant self-service + cluster creation (DRAFT) STATUS: DRAFT, built 2026-07-01 from the live multi-tenant validation run (tenant `acme`). diff --git a/scripts/tenant-onboard.sh b/scripts/tenant-onboard.sh new file mode 100644 index 0000000..e4a3cbf --- /dev/null +++ b/scripts/tenant-onboard.sh @@ -0,0 +1,178 @@ +#!/usr/bin/env bash +# tenant-onboard.sh -- Option-3 multi-tenant onboarding (D-066), Omega Cloud v1 +# STATUS: DRAFT 2026-07-02. Stages 0-5 validated live (tenant acme). Stage 6 (cluster create) is +# GATED on D-067 (barbican<->Vault metal-internal rebind) -- it will fail at cert-gen until that is +# fixed. Run stages 0-5 to build the tenant; run stage 6 only after D-067 is resolved. +# +# Model (D-066): operator creates domain + manager; manager creates project + -cluster (password, +# trust-capable, cluster lifecycle) + -svc (unrestricted app cred, non-trust automation). Cluster +# create MUST be password (keystone blocks app-cred trust creation; see appendix-D/D-066). +# +# Hardening conventions (learned 2026-07-02): validate raw output WHOLE (never extract-then-check); +# whitelist-write secrets to 0600 files, never echo; dynamic ID resolution; CA threaded; verify +# before mutate. This is an EXECUTED script (bash tenant-onboard.sh ...), so exiting the process on error (die -> exit 1) is correct; errexit (set -e) is NOT enabled -- failures are checked explicitly. +set -uo pipefail + +# ---- inputs ---- +CLIENT="${1:-}"; STAGE="${2:-all}" +TENANT_CIDR="${TENANT_CIDR:-}" # required for stage 4 (e.g.
10.20.24.0/24); must not collide +KEYSTONE_VIP="${KEYSTONE_VIP:-10.12.4.50}" +CA="${OS_CACERT:-$HOME/vault-init/vault-ca-root.pem}" +OUT="$HOME/tenant-${CLIENT}" # 0600 credential handover dir +AUTH_URL="https://${KEYSTONE_VIP}:5000/v3" +die(){ echo "FATAL: $*" >&2; exit 1; } +[ -n "$CLIENT" ] || die "usage: tenant-onboard.sh [stage0|1|2|3|4|5|6|all]" +[ -s "$CA" ] || die "OS_CACERT not found: $CA" +openssl x509 -in "$CA" -noout -checkend 0 >/dev/null 2>&1 || die "CA expired/unreadable: $CA" +umask 077; mkdir -p "$OUT"; chmod 700 "$OUT" + +is_id(){ [[ "$1" =~ ^[0-9a-f]{32}$ ]]; } # keystone id +newpw(){ python3 -c 'import secrets;print(secrets.token_urlsafe(24))'; } + +# ---- admin context helper (operator) ---- +admin_env(){ for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done; source ~/admin-openrc; } +# ---- named-identity password context (subshell-scoped by callers) ---- + +stage0(){ # operator preflight (read-only) + admin_env; local fail=0 + echo "== stage0: preflight ==" + juju status keystone -m openstack --format=yaml 2>/dev/null | python3 -c 'import sys,yaml;m=yaml.safe_load(sys.stdin)["applications"]["keystone"]["units"]["keystone/0"].get("workload-status",{}).get("message","");print("keystone:",m);sys.exit(0 if m.startswith("PO:") else 1)' || { echo " keystone override not PO: active"; fail=1; } + for R in manager member load-balancer_member; do is_id "$(openstack role show "$R" -f value -c id &1)" || { echo " role $R MISSING"; fail=1; }; done + local img; img=$(openstack image list --public -f value -c ID -c Name &1 | awk '/kube/{print $1;exit}') + is_id "${img//-/}" 2>/dev/null || [[ "$img" =~ ^[0-9a-f-]{36}$ ]] || { echo " no public kube image"; fail=1; } + is_id "$(openstack domain show "$CLIENT" -f value -c id &1)" && { echo " domain $CLIENT already EXISTS -- decide reuse/clean"; fail=1; } || true + [ "$fail" = 0 ] && echo " PREFLIGHT PASS" || die "preflight failed" +} + +stage1(){ # operator: domain + manager + quota + admin_env + echo 
"== stage1: operator provisions domain + manager ==" + openstack domain create --description "Client: $CLIENT" "$CLIENT" /dev/null 2>&1 || true + local DOM; DOM=$(openstack domain show "$CLIENT" -f value -c id &1); is_id "$DOM" || die "domain create failed" + local MPW; MPW=$(newpw) + openstack user create --domain "$DOM" --password "$MPW" --description "$CLIENT domain manager (D-051/D-064)" "${CLIENT}-domain-admin" /dev/null 2>&1 || true + local MUID; MUID=$(openstack user show "${CLIENT}-domain-admin" --domain "$DOM" -f value -c id &1); is_id "$MUID" || die "manager create failed" + openstack role add --domain "$DOM" --user "$MUID" manager &1 || true + openstack role assignment list --user "$MUID" --names -f value -c Role &1 | grep -qw manager || die "manager grant failed" + local MF="$OUT/${CLIENT}-domain-admin-cred.txt"; : > "$MF"; chmod 600 "$MF" + printf 'domain=%s\ndomain_id=%s\nusername=%s-domain-admin\nuser_id=%s\npassword=%s\nauth_url=%s\n' "$CLIENT" "$DOM" "$CLIENT" "$MUID" "$MPW" "$AUTH_URL" > "$MF"; chmod 600 "$MF" + echo " domain=$DOM manager=$MUID cred -> $MF" + echo " (set project quota after stage2 creates ${CLIENT}-prod, or pre-create the project as operator)" +} + +stage2(){ # manager self-service: project + -cluster + -svc + grants + admin_env + local DOM; DOM=$(openstack domain show "$CLIENT" -f value -c id &1); is_id "$DOM" || die "no domain" + local MF="$OUT/${CLIENT}-domain-admin-cred.txt"; [ -s "$MF" ] || die "run stage1 first" + echo "== stage2: manager self-services project + -cluster + -svc + grants (D-064 G3) ==" + ( for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done + export OS_AUTH_URL="$AUTH_URL" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA" + export OS_USERNAME="${CLIENT}-domain-admin" OS_USER_DOMAIN_ID="$DOM" OS_DOMAIN_ID="$DOM" + export OS_PASSWORD="$(awk -F= '/^password=/{print $2}' "$MF")" + [ "$(openstack token issue -f value -c domain_id &1)" = "$DOM" ] || { echo "manager auth FAIL"; exit 1; } + openstack project 
create --domain "$DOM" --description "$CLIENT production" "${CLIENT}-prod" /dev/null 2>&1 || true + local PID; PID=$(openstack project show "${CLIENT}-prod" --domain "$DOM" -f value -c id &1); [[ "$PID" =~ ^[0-9a-f]{32}$ ]] || { echo "project FAIL"; exit 1; } + # -cluster (password, trust-capable) and -svc (app-cred automation) + for U in cluster svc; do + local PW; PW=$(python3 -c 'import secrets;print(secrets.token_urlsafe(24))') + openstack user create --domain "$DOM" --password "$PW" --description "$CLIENT $U" "${CLIENT}-$U" /dev/null 2>&1 || true + local UID2; UID2=$(openstack user show "${CLIENT}-$U" --domain "$DOM" -f value -c id &1); [[ "$UID2" =~ ^[0-9a-f]{32}$ ]] || { echo "user $U FAIL"; exit 1; } + for R in member load-balancer_member; do openstack role add --project "$PID" --user "$UID2" "$R" &1 || true; done + local F="$OUT/${CLIENT}-$U-cred.txt"; umask 077; : > "$F"; chmod 600 "$F" + printf 'username=%s-%s\nuser_id=%s\nuser_domain_id=%s\nproject_id=%s\nauth_url=%s\npassword=%s\n' "$CLIENT" "$U" "$UID2" "$DOM" "$PID" "$AUTH_URL" "$PW" > "$F"; chmod 600 "$F" + echo " ${CLIENT}-$U=$UID2 (member+load-balancer_member) cred -> $F" + done + # anti-escalation self-check (must be DENIED) + local SU; SU=$(openstack user show "${CLIENT}-svc" --domain "$DOM" -f value -c id &1) + openstack role add --project "$PID" --user "$SU" admin /dev/null 2>&1 || true + if openstack role assignment list --project "$PID" --user "$SU" --names -f value -c Role &1 | grep -qw admin; then echo " *** ESCALATION: manager granted admin -- STOP ***"; exit 1; else echo " anti-escalation OK (admin grant denied)"; fi + ) || die "stage2 failed" +} + +stage3(){ # -svc mints unrestricted app cred; -cluster keeps password; keypair + admin_env + local DOM; DOM=$(openstack domain show "$CLIENT" -f value -c id &1) + local SF="$OUT/${CLIENT}-svc-cred.txt"; local PID; PID=$(awk -F= '/^project_id=/{print $2}' "$SF") + echo "== stage3: -svc mints unrestricted app cred + keypair ==" + ( for v in 
$(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done + export OS_AUTH_URL="$AUTH_URL" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA" + export OS_USERNAME="${CLIENT}-svc" OS_USER_DOMAIN_ID="$DOM" OS_PROJECT_ID="$PID" + export OS_PASSWORD="$(awk -F= '/^password=/{print $2}' "$SF")" + [ "$(openstack token issue -f value -c project_id &1)" = "$PID" ] || { echo "svc auth FAIL"; exit 1; } + local ACF="$OUT/${CLIENT}-svc-appcred.txt"; umask 077; : > "$ACF"; chmod 600 "$ACF" + openstack application credential create "${CLIENT}-svc-cred" --unrestricted --description "$CLIENT non-trust automation" -f shell "$ACF" 2>&1 + grep -qE '^id=' "$ACF" || { echo "appcred FAIL"; cat "$ACF"; exit 1; }; chmod 600 "$ACF" + echo " app cred -> $ACF (unrestricted; secret len $(awk -F'"' '/^secret=/{print length($2)}' "$ACF"))" + local KF="$OUT/${CLIENT}-key.pem"; umask 077; openstack keypair create "${CLIENT}-key" "$KF" 2>&1 + head -1 "$KF" | grep -q 'PRIVATE KEY' && { chmod 600 "$KF"; echo " keypair -> $KF"; } || { echo "keypair FAIL"; cat "$KF"; exit 1; } + ) || die "stage3 failed" +} + +stage4(){ # tenant L3 via app cred + [ -n "$TENANT_CIDR" ] || die "set TENANT_CIDR (e.g. 
10.20.24.0/24)" + admin_env + openstack subnet list -f value -c Subnet &1 | grep -qw "$TENANT_CIDR" && die "CIDR $TENANT_CIDR in use" + local ACF="$OUT/${CLIENT}-svc-appcred.txt" + echo "== stage4: tenant L3 (net/subnet/router/ext-gw) via app cred ==" + ( for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done + export OS_AUTH_TYPE=v3applicationcredential OS_AUTH_URL="$AUTH_URL" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA" + export OS_APPLICATION_CREDENTIAL_ID="$(awk -F'"' '/^id=/{print $2}' "$ACF")" + export OS_APPLICATION_CREDENTIAL_SECRET="$(awk -F'"' '/^secret=/{print $2}' "$ACF")" + [[ "$(openstack token issue -f value -c project_id &1)" =~ ^[0-9a-f]{32}$ ]] || { echo "appcred auth FAIL"; exit 1; } + openstack network create "${CLIENT}-net" /dev/null 2>&1 && echo " net ok" + openstack subnet create "${CLIENT}-subnet" --network "${CLIENT}-net" --subnet-range "$TENANT_CIDR" --dns-nameserver 8.8.8.8 /dev/null 2>&1 && echo " subnet ok" + openstack router create "${CLIENT}-router" /dev/null 2>&1 && echo " router ok" + openstack router set "${CLIENT}-router" --external-gateway provider-ext &1 && echo " ext-gw ok" || echo " *** ext-gw FAILED (operator may need to attach) ***" + openstack router add subnet "${CLIENT}-router" "${CLIENT}-subnet" &1 && echo " interface ok" + ) || die "stage4 failed" +} + +stage5(){ # tenant template (image by UUID) + admin_env + local IMG; IMG=$(openstack image list --public -f value -c ID -c Name &1 | awk '/kube/{print $1;exit}') + [[ "$IMG" =~ ^[0-9a-f-]{36}$ ]] || die "kube image uuid resolve failed" + local ACF="$OUT/${CLIENT}-svc-appcred.txt" + echo "== stage5: tenant cluster template (image by UUID) ==" + ( for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done + export OS_AUTH_TYPE=v3applicationcredential OS_AUTH_URL="$AUTH_URL" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA" + export OS_APPLICATION_CREDENTIAL_ID="$(awk -F'"' '/^id=/{print $2}' "$ACF")" + export OS_APPLICATION_CREDENTIAL_SECRET="$(awk -F'"' 
'/^secret=/{print $2}' "$ACF")" + openstack coe cluster template show "${CLIENT}-k8s" -f value -c uuid /dev/null 2>&1 && openstack coe cluster template delete "${CLIENT}-k8s" &1 || true + openstack coe cluster template create "${CLIENT}-k8s" --image "$IMG" --external-network provider-ext \ + --master-flavor gp.mid --flavor capi.node --coe kubernetes --network-driver calico \ + --docker-storage-driver overlay2 --master-lb-enabled --floating-ip-enabled \ + --fixed-network "${CLIENT}-net" --fixed-subnet "${CLIENT}-subnet" --keypair "${CLIENT}-key" &1 + [[ "$(openstack coe cluster template show "${CLIENT}-k8s" -f value -c uuid &1)" =~ ^[0-9a-f-]{36}$ ]] && echo " template ${CLIENT}-k8s ok" || { echo "template FAIL"; exit 1; } + ) || die "stage5 failed" +} + +stage6(){ # cluster create as -cluster PASSWORD [GATED ON D-067] + echo "== stage6: cluster create as ${CLIENT}-cluster (PASSWORD) ==" + echo " NOTE: GATED ON D-067 -- until the barbican<->Vault metal-internal rebind is done, this dies" + echo " at cert-gen (Barbican 500 / Vault AppRole CIDR reject). Proceed only post-D-067." 
+ admin_env + local DOM; DOM=$(openstack domain show "$CLIENT" -f value -c id &1) + local CF="$OUT/${CLIENT}-cluster-cred.txt"; local PID; PID=$(awk -F= '/^project_id=/{print $2}' "$CF") + ( for v in $(env|awk -F= '/^OS_/{print $1}'); do unset "$v"; done + export OS_AUTH_URL="$AUTH_URL" OS_IDENTITY_API_VERSION=3 OS_CACERT="$CA" + export OS_USERNAME="${CLIENT}-cluster" OS_USER_DOMAIN_ID="$DOM" OS_PROJECT_ID="$PID" + export OS_PASSWORD="$(awk -F= '/^password=/{print $2}' "$CF")" + [ "$(openstack token issue -f value -c project_id &1)" = "$PID" ] || { echo "cluster-user auth FAIL"; exit 1; } + openstack coe cluster create "${CLIENT}-cluster" --cluster-template "${CLIENT}-k8s" --keypair "${CLIENT}-key" --master-count 1 --node-count 1 &1 + sleep 15 + openstack coe cluster show "${CLIENT}-cluster" -f value -c uuid -c status -c status_reason &1 | sed 's/^/ /' + ) || die "stage6 failed" +} + +case "$STAGE" in + stage0|0) stage0 ;; + stage1|1) stage1 ;; + stage2|2) stage2 ;; + stage3|3) stage3 ;; + stage4|4) stage4 ;; + stage5|5) stage5 ;; + stage6|6) stage6 ;; + all) stage0; stage1; stage2; stage3; stage4; stage5; echo "== stages 0-5 done. stage6 (cluster) is gated on D-067 -- run explicitly: tenant-onboard.sh $CLIENT 6 ==" ;; + *) die "unknown stage: $STAGE" ;; +esac +echo "handover creds in: $OUT (0600)"