diff --git a/.github/workflows/overcloud-host-image-build.yml b/.github/workflows/overcloud-host-image-build.yml
index 79eb472a1a..adeb7ffe75 100644
--- a/.github/workflows/overcloud-host-image-build.yml
+++ b/.github/workflows/overcloud-host-image-build.yml
@@ -23,22 +23,39 @@ on:
 env:
   ANSIBLE_FORCE_COLOR: True
+  KAYOBE_ENVIRONMENT: ci-builder
+  KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}

 jobs:
   overcloud-host-image-build:
     name: Build overcloud host images
     if: github.repository == 'stackhpc/stackhpc-kayobe-config'
-    runs-on: [self-hosted, stackhpc-kayobe-config-kolla-builder-rl9]
+    runs-on: arc-skc-host-image-builder-runner
     permissions: {}
     steps:
-      - uses: actions/checkout@v4
+      - name: Install Package
+        uses: ConorMacBride/install-package@main
+        with:
+          apt: git unzip nodejs python3-pip python3-venv openssh-server openssh-client jq
+
+      - name: Start the SSH service
+        run: |
+          sudo /etc/init.d/ssh start
+
+      - name: Checkout
+        uses: actions/checkout@v4
         with:
           path: src/kayobe-config

+      - name: Output image tag of the builder
+        id: builder_image_tag
+        run: |
+          echo image_tag=$(grep stackhpc_rocky_9_overcloud_host_image_version: etc/kayobe/pulp-host-image-versions.yml | awk '{print $2}') >> $GITHUB_OUTPUT
+
       - name: Determine OpenStack release
         id: openstack_release
         run: |
           BRANCH=$(awk -F'=' '/defaultbranch/ {print $2}' src/kayobe-config/.gitreview)
-          echo "openstack_release=${BRANCH}" | sed "s|stable/||" >> $GITHUB_OUTPUT
+          echo "openstack_release=${BRANCH}" | sed -E "s,(stable|unmaintained)/,," >> $GITHUB_OUTPUT

       # Generate a tag to apply to all built overcloud host images.
       - name: Generate overcloud host image tag
@@ -50,10 +67,6 @@ jobs:
         run: |
           echo "${{ steps.host_image_tag.outputs.host_image_tag }}"

-      - name: Clean any previous build artifact
-        run: |
-          rm -f /tmp/updated_images.txt
-
       - name: Clone StackHPC Kayobe repository
         uses: actions/checkout@v4
         with:
@@ -61,25 +74,6 @@ jobs:
           ref: refs/heads/stackhpc/${{ steps.openstack_release.outputs.openstack_release }}
           path: src/kayobe

-      - name: Install dependencies
-        run: |
-          sudo dnf -y install zstd debootstrap
-
-      - name: Setup networking
-        run: |
-          if ! ip l show breth1 >/dev/null 2>&1; then
-              sudo ip l add breth1 type bridge
-          fi
-          sudo ip l set breth1 up
-          if ! ip a show breth1 | grep 192.168.33.3/24; then
-              sudo ip a add 192.168.33.3/24 dev breth1
-          fi
-          if ! ip l show dummy1 >/dev/null 2>&1; then
-              sudo ip l add dummy1 type dummy
-          fi
-          sudo ip l set dummy1 up
-          sudo ip l set dummy1 master breth1
-
       - name: Install Kayobe
         run: |
           mkdir -p venvs &&
@@ -89,36 +83,132 @@ jobs:
           pip install -U pip &&
           pip install ../src/kayobe

+      - name: Install terraform
+        uses: hashicorp/setup-terraform@v2
+
+      - name: Initialise terraform
+        run: terraform init
+        working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio
+
+      - name: Generate SSH keypair
+        run: ssh-keygen -f id_rsa -N ''
+        working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio
+
+      - name: Generate clouds.yaml
+        run: |
+          cat << EOF > clouds.yaml
+          ${{ secrets.CLOUDS_YAML }}
+          EOF
+        working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio
+
+      - name: Generate terraform.tfvars
+        run: |
+          cat << EOF > terraform.tfvars
+          ssh_public_key = "id_rsa.pub"
+          ssh_username = "rocky"
+          aio_vm_name = "skc-host-image-builder"
+          # Must be a Rocky Linux 9 host to successfully build all images
+          # This MUST NOT be an LVM image. It can cause confusing conflicts with the built image.
+          aio_vm_image = "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2"
+          aio_vm_flavor = "en1.medium"
+          aio_vm_network = "stackhpc-ci"
+          aio_vm_subnet = "stackhpc-ci"
+          aio_vm_interface = "eth0"
+          EOF
+        working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio
+
+      - name: Terraform Plan
+        run: terraform plan
+        working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio
+        env:
+          OS_CLOUD: "openstack"
+          OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }}
+          OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}
+
+      - name: Terraform Apply
+        run: |
+          for attempt in $(seq 5); do
+              if terraform apply -auto-approve; then
+                  echo "Created infrastructure on attempt $attempt"
+                  exit 0
+              fi
+              echo "Failed to create infrastructure on attempt $attempt"
+              sleep 10
+              terraform destroy -auto-approve
+              sleep 60
+          done
+          echo "Failed to create infrastructure after $attempt attempts"
+          exit 1
+        working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio
+        env:
+          OS_CLOUD: "openstack"
+          OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }}
+          OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}
+
+      - name: Get Terraform outputs
+        id: tf_outputs
+        run: |
+          terraform output -json
+        working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio
+
+      - name: Write Terraform outputs
+        run: |
+          cat << EOF > src/kayobe-config/etc/kayobe/environments/ci-builder/tf-outputs.yml
+          ${{ steps.tf_outputs.outputs.stdout }}
+          EOF
+
+      - name: Write Terraform network config
+        run: |
+          cat << EOF > src/kayobe-config/etc/kayobe/environments/ci-builder/tf-network-allocation.yml
+          ---
+          aio_ips:
+            builder: "{{ access_ip_v4.value }}"
+          EOF
+
+      - name: Write Terraform network interface config
+        run: |
+          mkdir -p src/kayobe-config/etc/kayobe/environments/$KAYOBE_ENVIRONMENT/inventory/group_vars/seed
+          rm -f src/kayobe-config/etc/kayobe/environments/$KAYOBE_ENVIRONMENT/inventory/group_vars/seed/network-interfaces
+          cat << EOF > src/kayobe-config/etc/kayobe/environments/$KAYOBE_ENVIRONMENT/inventory/group_vars/seed/network-interfaces
+          admin_interface: "{{ access_interface.value }}"
+          aio_interface: "{{ access_interface.value }}"
+          EOF
+
+      - name: Manage SSH keys
+        run: |
+          mkdir -p ~/.ssh
+          touch ~/.ssh/authorized_keys
+          cat src/kayobe-config/terraform/aio/id_rsa.pub >> ~/.ssh/authorized_keys
+          cp src/kayobe-config/terraform/aio/id_rsa* ~/.ssh/
+
       - name: Bootstrap the control host
         run: |
           source venvs/kayobe/bin/activate &&
           source src/kayobe-config/kayobe-env --environment ci-builder &&
           kayobe control host bootstrap

-      - name: Configure the seed host
+      - name: Configure the seed host (Builder VM)
+        run: |
+          source venvs/kayobe/bin/activate &&
+          source src/kayobe-config/kayobe-env --environment ci-builder &&
+          kayobe seed host configure -e seed_bootstrap_user=rocky --skip-tags network
+
+      - name: Install dependencies
         run: |
           source venvs/kayobe/bin/activate &&
           source src/kayobe-config/kayobe-env --environment ci-builder &&
-          kayobe seed host configure
+          kayobe seed host command run \
+            --command "sudo dnf config-manager --set-enabled crb && sudo dnf -y install epel-release && sudo dnf -y install zstd debootstrap kpartx cloud-init" --show-output
         env:
           KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}

       - name: Create bifrost_httpboot Docker volume
-        run: |
-          if [[ $(sudo docker volume ls -f Name=bifrost_httpboot -q | wc -l) = 0 ]]; then
-              sudo docker volume create bifrost_httpboot
-          fi
-
-      - name: Generate clouds.yaml
-        run: |
-          cat << EOF > clouds.yaml
-          ${{ secrets.CLOUDS_YAML }}
-          EOF
-
-      - name: Install OpenStack client
         run: |
           source venvs/kayobe/bin/activate &&
-          pip install python-openstackclient -c https://opendev.org/openstack/requirements/raw/branch/stable/${{ steps.openstack_release.outputs.openstack_release }}/upper-constraints.txt
+          source src/kayobe-config/kayobe-env --environment ci-builder &&
+          kayobe seed host command run --command "sudo mkdir -p /var/lib/docker/volumes/bifrost_httpboot/_data" --show-output
+        env:
+          KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}

       - name: Build a Rocky Linux 9 overcloud host image
         id: build_rocky_9
@@ -134,6 +224,16 @@ jobs:
           KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
         if: inputs.rocky9

+      - name: Show last error logs
+        continue-on-error: true
+        run: |
+          source venvs/kayobe/bin/activate &&
+          source src/kayobe-config/kayobe-env --environment ci-builder &&
+          kayobe seed host command run --command "tail -200 /opt/kayobe/images/overcloud-rocky-9/overcloud-rocky-9.stdout" --show-output
+        env:
+          KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
+        if: steps.build_rocky_9.outcome == 'failure'
+
       - name: Upload Rocky Linux 9 overcloud host image to Ark
         run: |
           source venvs/kayobe/bin/activate &&
@@ -147,19 +247,17 @@ jobs:
         env:
           KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
         if: inputs.rocky9 && steps.build_rocky_9.outcome == 'success'
-
-      - name: Upload Rocky Linux 9 overcloud host image to SMS
+
+      - name: Upload Rocky Linux 9 overcloud host image to Dev Cloud
         run: |
           source venvs/kayobe/bin/activate &&
-          openstack image create \
-            overcloud-rocky-9-${{ steps.host_image_tag.outputs.host_image_tag }} \
-            --container-format bare \
-            --disk-format qcow2 \
-            --file /opt/kayobe/images/overcloud-rocky-9/overcloud-rocky-9.qcow2 \
-            --private \
-            --os-cloud sms-lab-release \
-            --progress
+          source src/kayobe-config/kayobe-env --environment ci-builder &&
+          kayobe playbook run \
+            src/kayobe-config/etc/kayobe/ansible/openstack-host-image-upload.yml \
+            -e local_image_path="/opt/kayobe/images/overcloud-rocky-9/overcloud-rocky-9.qcow2" \
+            -e image_name=overcloud-rocky-9-${{ steps.host_image_tag.outputs.host_image_tag }}
         env:
+          CLOUDS_YAML: ${{ secrets.CLOUDS_YAML }}
           OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }}
           OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}
         if: inputs.rocky9 && steps.build_rocky_9.outcome == 'success'
@@ -178,6 +276,16 @@ jobs:
           KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
         if: inputs.ubuntu-jammy

+      - name: Show last error logs
+        continue-on-error: true
+        run: |
+          source venvs/kayobe/bin/activate &&
+          source src/kayobe-config/kayobe-env --environment ci-builder &&
+          kayobe seed host command run --command "tail -200 /opt/kayobe/images/overcloud-ubuntu-jammy/overcloud-ubuntu-jammy.stdout" --show-output
+        env:
+          KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
+        if: steps.build_ubuntu_jammy.outcome == 'failure'
+
       - name: Upload Ubuntu Jammy 22.04 overcloud host image to Ark
         run: |
           source venvs/kayobe/bin/activate &&
@@ -192,50 +300,27 @@ jobs:
           KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
         if: inputs.ubuntu-jammy && steps.build_ubuntu_jammy.outcome == 'success'

-      - name: Upload Ubuntu Jammy 22.04 overcloud host image to SMS
+      - name: Upload Ubuntu Jammy overcloud host image to Dev Cloud
         run: |
           source venvs/kayobe/bin/activate &&
-          openstack image create \
-            overcloud-ubuntu-jammy-${{ steps.host_image_tag.outputs.host_image_tag }} \
-            --container-format bare \
-            --disk-format qcow2 \
-            --file /opt/kayobe/images/overcloud-ubuntu-jammy/overcloud-ubuntu-jammy.qcow2 \
-            --private \
-            --os-cloud sms-lab-release \
-            --progress
+          source src/kayobe-config/kayobe-env --environment ci-builder &&
+          kayobe playbook run \
+            src/kayobe-config/etc/kayobe/ansible/openstack-host-image-upload.yml \
+            -e local_image_path="/opt/kayobe/images/overcloud-ubuntu-jammy/overcloud-ubuntu-jammy.qcow2" \
+            -e image_name=overcloud-ubuntu-jammy-${{ steps.host_image_tag.outputs.host_image_tag }}
         env:
+          CLOUDS_YAML: ${{ secrets.CLOUDS_YAML }}
           OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }}
           OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}
         if: inputs.ubuntu-jammy && steps.build_ubuntu_jammy.outcome == 'success'

-      - name: Upload updated images artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: Updated images list
-          path: /tmp/updated_images.txt
-          retention-days: 7
-        if: steps.build_rocky_9.outcome == 'success' ||
-            steps.build_ubuntu_jammy.outcome == 'success'
-
-      - name: Upload Rocky 9 build logs if build failed
-        uses: actions/upload-artifact@v4
-        with:
-          name: Rocky 9 build logs
-          path: |
-            /opt/kayobe/images/overcloud-rocky-9/overcloud-rocky-9.stdout
-            /opt/kayobe/images/overcloud-rocky-9/overcloud-rocky-9.stderr
-          retention-days: 7
-        if: steps.build_rocky_9.outcome == 'failure'
-
-      - name: Upload Ubuntu Jammy 22.04 build logs if build failed
-        uses: actions/upload-artifact@v4
-        with:
-          name: Ubuntu Jammy 22.04 build logs
-          path: |
-            /opt/kayobe/images/overcloud-ubuntu-jammy/overcloud-ubuntu-jammy.stdout
-            /opt/kayobe/images/overcloud-ubuntu-jammy/overcloud-ubuntu-jammy.stderr
-          retention-days: 7
-        if: steps.build_ubuntu_jammy.outcome == 'failure'
+      - name: Copy logs back
+        continue-on-error: true
+        run: |
+          mkdir logs
+          scp -r rocky@$(jq -r .access_ip_v4.value src/kayobe-config/etc/kayobe/environments/ci-builder/tf-outputs.yml):/opt/kayobe/images/*/*.std* ./logs/
+          scp -r rocky@$(jq -r .access_ip_v4.value src/kayobe-config/etc/kayobe/environments/ci-builder/tf-outputs.yml):/tmp/updated_images.txt ./logs/ || true
+        if: always()

       - name: Fail if any overcloud host image builds failed
         run: |
@@ -244,7 +329,18 @@ jobs:
         if: steps.build_rocky_9.outcome == 'failure' ||
            steps.build_ubuntu_jammy.outcome == 'failure'

-      - name: Clean up build artifacts
-        run: |
-          sudo rm -rf /opt/kayobe/images/
+      - name: Upload logs artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: Build logs
+          path: ./logs
+        if: always()
+
+      - name: Destroy
+        run: terraform destroy -auto-approve
+        working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio
+        env:
+          OS_CLOUD: openstack
+          OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }}
+          OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}
         if: always()
diff --git a/.github/workflows/overcloud-host-image-promote.yml b/.github/workflows/overcloud-host-image-promote.yml
index 449068f6ff..a8a64f60f9 100644
--- a/.github/workflows/overcloud-host-image-promote.yml
+++ b/.github/workflows/overcloud-host-image-promote.yml
@@ -31,7 +31,7 @@ jobs:
         id: openstack_release
         run: |
           BRANCH=$(awk -F'=' '/defaultbranch/ {print $2}' .gitreview)
-          echo "openstack_release=${BRANCH}" | sed "s|stable/||" >> $GITHUB_OUTPUT
+          echo "openstack_release=${BRANCH}" | sed -E "s,(stable|unmaintained)/,," >> $GITHUB_OUTPUT
         working-directory: src/kayobe-config

       - name: Clone StackHPC Kayobe repository
diff --git a/.github/workflows/overcloud-host-image-upload.yml b/.github/workflows/overcloud-host-image-upload.yml
index 9892a6bf23..b08f573812 100644
--- a/.github/workflows/overcloud-host-image-upload.yml
+++ b/.github/workflows/overcloud-host-image-upload.yml
@@ -47,7 +47,7 @@ jobs:
         id: openstack_release
         run: |
           BRANCH=$(awk -F'=' '/defaultbranch/ {print $2}' src/kayobe-config/.gitreview)
-          echo "openstack_release=${BRANCH}" | sed "s|stable/||" >> $GITHUB_OUTPUT
+          echo "openstack_release=${BRANCH}" | sed -E "s,(stable|unmaintained)/,," >> $GITHUB_OUTPUT

       - name: Clone StackHPC Kayobe repository
         uses: actions/checkout@v4
@@ -80,7 +80,7 @@ jobs:
       - name: Install OpenStack client
         run: |
           source venvs/kayobe/bin/activate &&
-          pip install python-openstackclient -c https://opendev.org/openstack/requirements/raw/branch/stable/${{ steps.openstack_release.outputs.openstack_release }}/upper-constraints.txt
+          pip install python-openstackclient -c https://releases.openstack.org/constraints/upper/${{ steps.openstack_release.outputs.openstack_release }}

       - name: Output Rocky Linux 9 image tag
         id: rocky_9_image_tag
diff --git a/.github/workflows/stackhpc-all-in-one.yml b/.github/workflows/stackhpc-all-in-one.yml
index c84ae6e0b9..c967fbe0ff 100644
--- a/.github/workflows/stackhpc-all-in-one.yml
+++ b/.github/workflows/stackhpc-all-in-one.yml
@@ -38,7 +38,7 @@ on:
       vm_flavor:
         description: Flavor for the all-in-one VM
         type: string
-        default: en1.large
+        default: en1.medium
       vm_network:
         description: Network for the all-in-one VM
         type: string
@@ -73,7 +73,7 @@ jobs:
   # NOTE: Runner needs unzip and nodejs packages.
   all-in-one:
     name: All in one
-    if: inputs.if
+    if: ${{ inputs.if && !cancelled() }}
     runs-on: arc-skc-aio-runner
     permissions: {}
     env:
@@ -156,6 +156,7 @@ jobs:
           aio_vm_network = "${{ env.VM_NETWORK }}"
           aio_vm_subnet = "${{ env.VM_SUBNET }}"
           aio_vm_volume_size = "${{ env.VM_VOLUME_SIZE }}"
+          aio_vm_tags = ${{ env.VM_TAGS }}
           EOF
         working-directory: ${{ github.workspace }}/terraform/aio
         env:
@@ -167,6 +168,7 @@ jobs:
           VM_SUBNET: ${{ inputs.vm_subnet }}
           VM_INTERFACE: ${{ inputs.vm_interface }}
           VM_VOLUME_SIZE: ${{ inputs.upgrade && '45' || '35' }}
+          VM_TAGS: '["skc-ci-aio", "PR=${{ github.event.number }}"]'

       - name: Terraform Plan
         run: terraform plan
@@ -181,13 +183,15 @@ jobs:
           for attempt in $(seq 5); do
               if terraform apply -auto-approve; then
                   echo "Created infrastructure on attempt $attempt"
-                  break
+                  exit 0
               fi
               echo "Failed to create infrastructure on attempt $attempt"
               sleep 10
               terraform destroy -auto-approve
               sleep 60
           done
+          echo "Failed to create infrastructure after $attempt attempts"
+          exit 1
         working-directory: ${{ github.workspace }}/terraform/aio
         env:
           OS_CLOUD: ${{ inputs.OS_CLOUD }}
diff --git a/.github/workflows/stackhpc-check-tags.yml b/.github/workflows/stackhpc-check-tags.yml
index f5a12a714f..4016c00e9e 100644
--- a/.github/workflows/stackhpc-check-tags.yml
+++ b/.github/workflows/stackhpc-check-tags.yml
@@ -23,7 +23,7 @@ env:
 jobs:
   check-tags:
     name: Check container image tags
-    if: inputs.if
+    if: ${{ inputs.if && !cancelled() }}
     runs-on: arc-skc-aio-runner
     permissions: {}
     env:
diff --git a/.github/workflows/stackhpc-ci-cleanup.yml b/.github/workflows/stackhpc-ci-cleanup.yml
new file mode 100644
index 0000000000..ed9ec327c3
--- /dev/null
+++ b/.github/workflows/stackhpc-ci-cleanup.yml
@@ -0,0 +1,77 @@
+---
+name: Clean up stale CI resources
+on:
+  schedule:
+    # Every 2 hours at quarter past
+    - cron: '15 0/2 * * *'
+
+jobs:
+  ci-cleanup:
+    name: Clean up stale CI resources
+    if: github.repository == 'stackhpc/stackhpc-kayobe-config'
+    runs-on: ubuntu-latest
+    permissions: {}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          path: src/kayobe-config
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+
+      - name: Generate clouds.yaml
+        run: |
+          cat << EOF > clouds.yaml
+          ${{ secrets.CLOUDS_YAML }}
+          EOF
+
+      - name: Determine OpenStack release
+        id: openstack_release
+        run: |
+          BRANCH=$(awk -F'=' '/defaultbranch/ {print $2}' src/kayobe-config/.gitreview)
+          echo "openstack_release=${BRANCH}" | sed -E "s,(stable|unmaintained)/,," >> $GITHUB_OUTPUT
+
+      - name: Install OpenStack client
+        run: |
+          pip install python-openstackclient -c https://releases.openstack.org/constraints/upper/${{ steps.openstack_release.outputs.openstack_release }}
+
+      - name: Clean up aio instances over 3 hours old
+        run: |
+          result=0
+          changes_before=$(date -Imin -d -3hours)
+          for status in ACTIVE BUILD ERROR SHUTOFF; do
+              for instance in $(openstack server list --tags skc-ci-aio --os-compute-api-version 2.66 --format value --column ID --changes-before $changes_before --status $status); do
+                  echo "Cleaning up $status instance $instance"
+                  openstack server show $instance
+                  if ! openstack server delete $instance; then
+                      echo "Failed to delete $status instance $instance"
+                      result=1
+                  fi
+              done
+          done
+          exit $result
+        env:
+          OS_CLOUD: openstack
+          OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }}
+          OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}
+
+      - name: Clean up host image builder instances over 5 hours old
+        run: |
+          result=0
+          changes_before=$(date -Imin -d -5hours)
+          for status in ACTIVE BUILD ERROR SHUTOFF; do
+              for instance in $(openstack server list --tags skc-host-image-build --os-compute-api-version 2.66 --format value --column ID --changes-before $changes_before --status $status); do
+                  echo "Cleaning up $status instance $instance"
+                  openstack server show $instance
+                  if ! openstack server delete $instance; then
+                      echo "Failed to delete $status instance $instance"
+                      result=1
+                  fi
+              done
+          done
+          exit $result
+        env:
+          OS_CLOUD: openstack
+          OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }}
+          OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}
diff --git a/.github/workflows/stackhpc-container-image-build.yml b/.github/workflows/stackhpc-container-image-build.yml
index 6a8055dedc..4478e6fe5c 100644
--- a/.github/workflows/stackhpc-container-image-build.yml
+++ b/.github/workflows/stackhpc-container-image-build.yml
@@ -33,6 +33,12 @@ on:
         type: boolean
         required: false
         default: true
+      push-dirty:
+        description: Push scanned images that have vulnerabilities?
+        type: boolean
+        required: false
+        # NOTE(Alex-Welsh): This default should be flipped once we resolve existing failures
+        default: true

 env:
   ANSIBLE_FORCE_COLOR: True
@@ -54,7 +60,7 @@ jobs:
         id: openstack_release
         run: |
           BRANCH=$(awk -F'=' '/defaultbranch/ {print $2}' .gitreview)
-          echo "openstack_release=${BRANCH}" | sed "s|stable/||" >> $GITHUB_OUTPUT
+          echo "openstack_release=${BRANCH}" | sed -E "s,(stable|unmaintained)/,," >> $GITHUB_OUTPUT

       # Generate a tag to apply to all built container images.
       # Without this, each kayobe * container image build command would use a different tag.
@@ -100,7 +106,15 @@ jobs:
       - name: Install package dependencies
         run: |
           sudo apt update
-          sudo apt install -y build-essential git unzip nodejs python3-wheel python3-pip python3-venv
+          sudo apt install -y build-essential git unzip nodejs python3-wheel python3-pip python3-venv curl jq wget
+
+      - name: Install gh
+        run: |
+          sudo mkdir -p -m 755 /etc/apt/keyrings && wget -qO- https://cli.github.com/packages/githubcli-archive-keyring.gpg | sudo tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null
+          sudo chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg
+          echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null
+          sudo apt update
+          sudo apt install gh -y

       - name: Checkout
         uses: actions/checkout@v4
@@ -118,6 +132,10 @@ jobs:
         run: |
           docker ps

+      - name: Install Trivy
+        run: |
+          curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin v0.49.0
+
       - name: Install Kayobe
         run: |
           mkdir -p venvs &&
@@ -132,6 +150,10 @@ jobs:
       - name: Install Docker Python SDK
         run: |
           sudo pip install docker
+
+      - name: Get Kolla tag
+        id: write-kolla-tag
+        run: echo "kolla-tag=${{ needs.generate-tag.outputs.openstack_release }}-${{ matrix.distro }}-${{ matrix.distro == 'rocky' && '9' || 'jammy' }}-${{ needs.generate-tag.outputs.datetime_tag }}" >> $GITHUB_OUTPUT

       - name: Configure localhost as a seed
         run: |
@@ -153,67 +175,124 @@ jobs:
         env:
           KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}

-      - name: Build and push kolla overcloud images
+      - name: Create build logs output directory
+        run: mkdir image-build-logs
+
+      - name: Build kolla overcloud images
+        id: build_overcloud_images
+        continue-on-error: true
         run: |
-          args="${{ github.event.inputs.regexes }}"
+          args="${{ inputs.regexes }}"
           args="$args -e kolla_base_distro=${{ matrix.distro }}"
-          args="$args -e kolla_tag=$KOLLA_TAG"
+          args="$args -e kolla_tag=${{ steps.write-kolla-tag.outputs.kolla-tag }}"
           args="$args -e stackhpc_repo_mirror_auth_proxy_enabled=true"
-          if ${{ inputs.push }} == 'true'; then
-              args="$args --push"
-          fi
           source venvs/kayobe/bin/activate &&
           source src/kayobe-config/kayobe-env --environment ci-builder &&
           kayobe overcloud container image build $args
         env:
           KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
-          KOLLA_TAG: "${{ needs.generate-tag.outputs.openstack_release }}-${{ matrix.distro }}-${{ matrix.distro == 'rocky' && '9' || 'jammy' }}-${{ needs.generate-tag.outputs.datetime_tag }}"
-        if: github.event.inputs.overcloud == 'true'
+        if: inputs.overcloud
+
+      - name: Copy overcloud container image build logs to output directory
+        run: sudo mv /var/log/kolla-build.log image-build-logs/kolla-build-overcloud.log
+        if: inputs.overcloud

-      - name: Build and push kolla seed images
+      - name: Build kolla seed images
+        id: build_seed_images
+        continue-on-error: true
         run: |
           args="-e kolla_base_distro=${{ matrix.distro }}"
-          args="$args -e kolla_tag=$KOLLA_TAG"
+          args="$args -e kolla_tag=${{ steps.write-kolla-tag.outputs.kolla-tag }}"
           args="$args -e stackhpc_repo_mirror_auth_proxy_enabled=true"
-          if ${{ inputs.push }} == 'true'; then
-              args="$args --push"
-          fi
           source venvs/kayobe/bin/activate &&
           source src/kayobe-config/kayobe-env --environment ci-builder &&
           kayobe seed container image build $args
         env:
           KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
-          KOLLA_TAG: "${{ needs.generate-tag.outputs.openstack_release }}-${{ matrix.distro }}-${{ matrix.distro == 'rocky' && '9' || 'jammy' }}-${{ needs.generate-tag.outputs.datetime_tag }}"
-        if: github.event.inputs.seed == 'true'
+        if: inputs.seed
+
+      - name: Copy seed container image build logs to output directory
+        run: sudo mv /var/log/kolla-build.log image-build-logs/kolla-build-seed.log
+        if: inputs.seed

       - name: Get built container images
-        run: |
-          docker image ls --filter "reference=ark.stackhpc.com/stackhpc-dev/*:*${{ matrix.distro }}*${{ needs.generate-tag.outputs.datetime_tag }}" > ${{ matrix.distro }}-container-images
+        run: docker image ls --filter "reference=ark.stackhpc.com/stackhpc-dev/*:${{ steps.write-kolla-tag.outputs.kolla-tag }}" > ${{ matrix.distro }}-container-images

       - name: Fail if no images have been built
         run: if [ $(wc -l < ${{ matrix.distro }}-container-images) -le 1 ]; then exit 1; fi

-      - name: Upload container images artifact
+      - name: Scan built container images
+        run: src/kayobe-config/tools/scan-images.sh ${{ matrix.distro }} ${{ steps.write-kolla-tag.outputs.kolla-tag }}
+
+      - name: Move image scan logs to output artifact
+        run: mv image-scan-output image-build-logs/image-scan-output
+
+      - name: Fail if no images have passed scanning
+        run: if [ $(wc -l < image-build-logs/image-scan-output/clean-images.txt) -le 0 ]; then exit 1; fi
+        if: ${{ !inputs.push-dirty }}
+
+      - name: Copy clean images to push-attempt-images list
+        run: cp image-build-logs/image-scan-output/clean-images.txt image-build-logs/push-attempt-images.txt
+        if: inputs.push
+
+      - name: Append dirty images to push list
+        run: |
+          cat image-build-logs/image-scan-output/dirty-images.txt >> image-build-logs/push-attempt-images.txt
+        if: ${{ inputs.push && inputs.push-dirty }}
+
+      - name: Push images
+        run: |
+          touch image-build-logs/push-failed-images.txt
+          source venvs/kayobe/bin/activate &&
+          source src/kayobe-config/kayobe-env --environment ci-builder &&
+          kayobe playbook run ${KAYOBE_CONFIG_PATH}/ansible/docker-registry-login.yml &&
+
+          while read -r image; do
+            # Retries!
+            for i in {1..5}; do
+              if docker push $image; then
+                echo "Pushed $image"
+                break
+              elif [ "$i" -eq 5 ]; then
+                echo "Failed to push $image"
+                echo $image >> image-build-logs/push-failed-images.txt
+              else
+                echo "Failed on retry $i"
+                sleep 5
+              fi;
+            done
+          done < image-build-logs/push-attempt-images.txt
+        shell: bash
+        env:
+          KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
+        if: inputs.push
+
+      - name: Upload output artifact
         uses: actions/upload-artifact@v4
         with:
-          name: ${{ matrix.distro }} container images
-          path: ${{ matrix.distro }}-container-images
+          name: ${{ matrix.distro }}-logs
+          path: image-build-logs
           retention-days: 7
+        if: ${{ !cancelled() }}
+
+      - name: Fail when images failed to build
+        run: echo "An image build failed. Check the workflow artifact for build logs" && exit 1
+        if: ${{ steps.build_overcloud_images.outcome == 'failure' || steps.build_seed_images.outcome == 'failure' }}
+
+      - name: Fail when images failed to push
+        run: if [ $(wc -l < image-build-logs/push-failed-images.txt) -gt 0 ]; then cat image-build-logs/push-failed-images.txt && exit 1; fi
+        if: ${{ !cancelled() }}
+
+      - name: Fail when images failed scanning
+        run: if [ $(wc -l < image-build-logs/image-scan-output/dirty-images.txt) -gt 0 ]; then cat image-build-logs/image-scan-output/dirty-images.txt && exit 1; fi
+        if: ${{ !inputs.push-dirty && !cancelled() }}

-  sync-container-repositories:
-    name: Trigger container image repository sync
-    needs:
-      - container-image-build
-    if: github.repository == 'stackhpc/stackhpc-kayobe-config' && inputs.push
-    runs-on: ubuntu-latest
-    permissions: {}
-    steps:
       # NOTE(mgoddard): Trigger another CI workflow in the
       # stackhpc-release-train repository.
       - name: Trigger container image repository sync
         run: |
           filter='${{ inputs.regexes }}'
-          if [[ -n $filter ]] && [[ ${{ github.event.inputs.seed }} == 'true' ]]; then
+          if [[ -n $filter ]] && [[ ${{ inputs.seed }} == 'true' ]]; then
             filter="$filter bifrost"
           fi
           gh workflow run \
@@ -224,7 +303,9 @@ jobs:
             -f sync-old-images=false
         env:
           GITHUB_TOKEN: ${{ secrets.STACKHPC_RELEASE_TRAIN_TOKEN }}
+        if: ${{ github.repository == 'stackhpc/stackhpc-kayobe-config' && inputs.push && !cancelled() }}

       - name: Display link to container image repository sync workflows
         run: |
           echo "::notice Container image repository sync workflows: https://github.com/stackhpc/stackhpc-release-train/actions/workflows/container-sync.yml"
+        if: ${{ github.repository == 'stackhpc/stackhpc-kayobe-config' && inputs.push && !cancelled() }}
diff --git a/.github/workflows/stackhpc-pull-request.yml b/.github/workflows/stackhpc-pull-request.yml
index fbb6e378a6..34d644bd96 100644
--- a/.github/workflows/stackhpc-pull-request.yml
+++ b/.github/workflows/stackhpc-pull-request.yml
@@ -87,7 +87,7 @@ jobs:
       kayobe_image: ${{ needs.build-kayobe-image.outputs.kayobe_image }}
       if: ${{ needs.check-changes.outputs.check-tags == 'true' }}
     secrets: inherit
-    if: ${{ ! failure() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}
+    if: ${{ ! failure() && ! cancelled() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}

   all-in-one-ubuntu-jammy-ovs:
     name: aio (Ubuntu Jammy OVS)
@@ -104,7 +104,7 @@ jobs:
       OS_CLOUD: openstack
       if: ${{ needs.check-changes.outputs.aio == 'true' }}
     secrets: inherit
-    if: ${{ ! failure() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}
+    if: ${{ ! failure() && ! cancelled() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}

   all-in-one-ubuntu-jammy-ovn:
     name: aio (Ubuntu Jammy OVN)
@@ -121,7 +121,7 @@ jobs:
       OS_CLOUD: openstack
       if: ${{ needs.check-changes.outputs.aio == 'true' }}
     secrets: inherit
-    if: ${{ ! failure() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}
+    if: ${{ ! failure() && ! cancelled() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}

   all-in-one-rocky-9-ovs:
     name: aio (Rocky 9 OVS)
@@ -138,7 +138,7 @@ jobs:
       OS_CLOUD: openstack
       if: ${{ needs.check-changes.outputs.aio == 'true' }}
     secrets: inherit
-    if: ${{ ! failure() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}
+    if: ${{ ! failure() && ! cancelled() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}

   all-in-one-rocky-9-ovn:
     name: aio (Rocky 9 OVN)
@@ -155,7 +155,7 @@ jobs:
       OS_CLOUD: openstack
       if: ${{ needs.check-changes.outputs.aio == 'true' }}
     secrets: inherit
-    if: ${{ ! failure() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}
+    if: ${{ ! failure() && ! cancelled() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}

   # Test two upgrade scenarios: Ubuntu Jammy OVS and Rocky 9 OVN.
   all-in-one-upgrade-ubuntu-jammy-ovs:
     name: aio upgrade (Ubuntu Jammy OVS)
@@ -175,7 +175,7 @@ jobs:
       if: ${{ needs.check-changes.outputs.aio == 'true' }}
       upgrade: true
     secrets: inherit
-    if: ${{ ! failure() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}
+    if: ${{ ! failure() && ! cancelled() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}

   all-in-one-upgrade-rocky-9-ovn:
     name: aio upgrade (Rocky 9 OVN)
@@ -193,4 +193,4 @@ jobs:
       if: ${{ needs.check-changes.outputs.aio == 'true' }}
       upgrade: true
     secrets: inherit
-    if: ${{ ! failure() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}
+    if: ${{ ! failure() && ! cancelled() && github.repository == 'stackhpc/stackhpc-kayobe-config' }}
diff --git a/.yamllint b/.yamllint
index 96b2b10ddf..1c115e29b7 100644
--- a/.yamllint
+++ b/.yamllint
@@ -20,3 +20,4 @@ ignore: |
   .github/
   .gitlab/
   .gitlab-ci.yml
+  etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml
diff --git a/doc/source/_static/images/capi-architecture-diagram.png b/doc/source/_static/images/capi-architecture-diagram.png
new file mode 100644
index 0000000000..259cb89390
Binary files /dev/null and b/doc/source/_static/images/capi-architecture-diagram.png differ
diff --git a/doc/source/configuration/cephadm.rst b/doc/source/configuration/cephadm.rst
index c9607d600f..d7f41c91da 100644
--- a/doc/source/configuration/cephadm.rst
+++ b/doc/source/configuration/cephadm.rst
@@ -448,7 +448,7 @@ Configure the Ceph hosts:

 .. code:: bash

-   kayobe overcloud host configure --limit storage --kolla-limit storage
+   kayobe overcloud host configure --limit storage

 Ceph deployment
 ---------------
diff --git a/doc/source/configuration/magnum-capi.rst b/doc/source/configuration/magnum-capi.rst
index ab7dc88737..c05a80bcf2 100644
--- a/doc/source/configuration/magnum-capi.rst
+++ b/doc/source/configuration/magnum-capi.rst
@@ -1,45 +1,120 @@
 =========================
 Magnum Cluster API Driver
 =========================
-A new driver for magnum has been written. It is an alternative to heat (as heat gets phased out due to maintenance burden) that allows the definition of clusters as Kubernetes CRDs as opposed to heat templates. The two are compatible and can both be active on the same deployment, and the decision of which driver is used for a given template depends on certain parameters inferred from the template. For the new driver, these are `{'server_type' : 'vm', 'os' : 'ubuntu', 'coe': kubernetes'}`.
-Drivers can be enabled and disabled via the `disabled_drivers` parameter of `[drivers]` under `magnum.conf`.
-Prerequisites for deploying the CAPI driver in magnum:
+A new driver for Magnum has been written which is an alternative to Heat (as Heat gets phased out due to maintenance burden) and instead uses the Kubernetes `Cluster API project `_ to manage the OpenStack infrastructure required by Magnum clusters. The idea behind the Cluster API (CAPI) project is that infrastructure is managed using Kubernetes-style declarative APIs, which in practice means a set of Custom Resource Definitions (CRDs) and Kubernetes `operators `_ to translate instances of those custom Kubernetes resources into the required OpenStack API resources. These same operators also handle resource reconciliation (i.e. when the Kubernetes custom resource is modified, the operator will make the required OpenStack API calls to reflect those changes).

-Management Cluster
-===================
-The CAPI driver relies on a management Kubernetes cluster, installed inside the cloud, to manage tenant Kubernetes clusters.
-The easiest way to get one is by deploying `this `__ branch of azimuth-config, and look at the `capi-mgmt-example` environment. Refer to the `azimuth-config wiki `__ for detailed steps on how to deploy.
+The new CAPI driver and the old Heat driver are compatible and can both be active on the same deployment, and the decision of which driver is used for a given template depends on certain parameters inferred from the Magnum cluster template. For the new driver, these parameters are ``{'server_type': 'vm', 'os': 'ubuntu', 'coe': 'kubernetes'}``. Drivers can be enabled and disabled using the ``disabled_drivers`` parameter in the ``[drivers]`` section of ``magnum.conf``.

-Ensure that you have set `capi_cluster_apiserver_floating_ip: true`, as the management cluster will need an externally accessible IP. The external network this corresponds to is whatever you have set `azimuth_capi_operator_external_network_id` to. This network needs to be reachable from wherever the magnum container is running.
+Deployment Prerequisites
+========================

-It's preferable that most Day 2 ops be done via a `CD Pipeline `__.
+The Cluster API architecture relies on a CAPI management cluster in order to run the aforementioned Kubernetes operators which interact directly with the OpenStack APIs. The two requirements for this management cluster are:

-Kayobe Config
-==============
-Ensure that your kayobe-config branch is up to date on |current_release_git_branch_name|.
+1. It must be capable of reaching the public OpenStack APIs.
+
+2. It must be reachable from the control plane nodes (either controllers or dedicated network hosts) on which the Magnum containers are running (so that Magnum can reach the IP listed in the management cluster's ``kubeconfig`` file).
+
+For testing purposes, a simple `k3s `_ cluster would suffice. For production deployments, the recommended solution is to instead set up a separate HA management cluster in an isolated OpenStack project by leveraging the CAPI management cluster configuration used in `Azimuth `_. This approach will provide a resilient HA management cluster with a standard set of component versions that are regularly tested in Azimuth CI.
+
+The general process for setting up this CAPI management cluster using Azimuth tooling is described here, but the `Azimuth operator documentation `_ should be consulted for additional information if required.
+
+The diagram below shows the general architecture of the CAPI management cluster provisioned using Azimuth tooling. It consists of a Seed VM (a Terraform-provisioned OpenStack VM) running a small k3s cluster (which is itself a CAPI management cluster, but only for the purpose of managing the HA cluster) as well as a HA management cluster made up of (by default) 3 control plane VMs and 3 worker VMs. This HA cluster runs the various Kubernetes components responsible for managing Magnum tenant clusters.
+
+.. image:: /_static/images/capi-architecture-diagram.png
+   :width: 100%
+
+The setup and configuration of a CAPI management cluster using Azimuth tooling follow a pattern that should be familiar to Kayobe operators. There is an 'upstream' `azimuth-config `_ repository which contains recommended defaults for various configuration options (equivalent to stackhpc-kayobe-config), and then each client site will maintain an independent copy of this repository which will contain site-specific configuration. Together, these upstream and site-specific configuration repositories can set or override Ansible variables for the `azimuth-ops `_ Ansible collection, which contains the playbooks required to deploy or update a CAPI management cluster (or a full Azimuth deployment).
+
+In order to deploy a CAPI management cluster for use with Magnum, first create a copy of the upstream Azimuth config repository in the client's GitHub/GitLab. To do so, follow the instructions found in the `initial repository setup `_ section of the Azimuth operator docs. The site-specific repository should then be encrypted following `these instructions `_ to avoid leaking any secrets (such as cloud credentials) that will be added to the configuration later on.
+
+Next, rather than copying the ``example`` environment as recommended in the Azimuth docs, instead copy the ``capi-mgmt-example`` environment and give it a suitable site-specific name:
+
+.. code-block:: bash
+
-Copy the kubeconfig found at `kubeconfig-capi-mgmt-.yaml` to your kayobe environment (e.g. `/kolla/config/magnum/kubeconfig`. It is highly likely you'll want to add this file to ansible vault.
+   cp -r ./environments/capi-mgmt-example ./environments/<site-name>
-Ensure that your magnum.conf has the following set:
+
+By default, both the seed VM name and the CAPI cluster VM names will be derived by prefixing the environment name with `capi-mgmt-` so naming the environment after the cloud (e.g. `sms-lab-prod`) is recommended.
+
+Having created this concrete environment to hold site-specific configuration, next open ``environments/<site-name>/inventory/group_vars/all/variables.yml`` and, at a minimum, set the following options to the desired values for the target cloud:

 .. code-block:: yaml

-   [nova_client]
-   endpoint_type = publicURL
+   infra_external_network_id:
+   infra_flavor_id:
+   capi_cluster_control_plane_flavor:
+   capi_cluster_worker_flavor:
+
+The comments surrounding each option in the ``variables.yml`` provide some tips on choosing sensible values (e.g. resource requirements for each flavor). In most cases, other configuration options can be left blank since they will fall back to the upstream defaults; however, if the default configuration is not suitable, the roles in `ansible-collection-azimuth-ops `_ contain a range of config variables which can be overridden in ``variables.yml`` as required. In particular, the `infra role variables `_ are mostly relevant to the seed VM configuration, and the `capi_cluster role variables `_ are relevant for HA cluster config.
+
+.. note::
+
+   One important distinction between azimuth-config and stackhpc-kayobe-config is that the environments in azimuth-config are `layered`. This can be seen in the ``ansible.cfg`` file for each environment, which will contain a line of the form ``inventory = <path>`` showing the inheritance chain for variables defined in each environment. See `these docs `_ for more details.
+
+In addition to setting the required infrastructure variables, Terraform must also be configured to use a remote state store (either GitLab or S3) for the seed VM state. To do so, follow the instructions found `here `_.
+
+The HA cluster also contains a deployment of `kube-prometheus-stack `_ for monitoring and alerting. To send the cluster alerts to Slack, the ``alertmanager_config_slack_webhook_url`` variable should be set in ``environments/<site-name>/inventory/group_vars/all/secrets.yml``. If the repository was encrypted correctly above, this file will automatically be encrypted before a git push. Run ``git-crypt status -e`` to verify that this file is included in the encrypted list before git-committing the webhook URL.
+
+The final step before beginning deployment of the CAPI management cluster is to provide some cloud credentials. It is recommended that the CAPI management cluster is deployed in an isolated OpenStack project. After creating the target project (preferably using `openstack-config `_), generate an application credential for the project using the Identity tab in Horizon and then download the corresponding ``clouds.yaml`` file and place it in ``environments/<site-name>/clouds.yaml``.
+
+To deploy the CAPI management cluster using this site-specific environment, run
+
+.. code-block:: bash
+
+   # Activate the environment
+   ./bin/activate <site-name>
-This is used to generate the application credential config injected into the tenant Kubernetes clusters, such that it is usable from within an OpenStack project, so you can't use the "internal API" end point here.
+
+   # Install or update the local Ansible Python venv
+   ./bin/ensure-venv
-Control Plane
+
+   # Install or update Ansible dependencies
+   ansible-galaxy install -f -r ./requirements.yml
+
+   # Run the provision playbook from the azimuth-ops collection
+   # NOTE: THIS COMMAND RUNS A DIFFERENT PLAYBOOK FROM
+   # THE STANDARD AZIMUTH DEPLOYMENT INSTRUCTIONS
+   ansible-playbook stackhpc.azimuth_ops.provision_capi_mgmt
+
+The general running order of the provisioning playbook is the following:
+
+- Ensure Terraform is installed locally
+
+- Use Terraform to provision the seed VM (and create any required internal networks, volumes etc.)
+
+- Install k3s on the seed (with all k3s data stored on the attached Cinder volume)
+
+- Install the required components on the k3s cluster to provision the HA cluster
+
+- Provision the HA cluster
+
+- Install the required components on the HA cluster to manage Magnum user clusters
+
+Once the seed VM has been provisioned, it can be accessed via SSH by running ``./bin/seed-ssh`` from the root of the azimuth-config repository. Within the seed VM, the k3s cluster and the HA cluster can both be accessed using the pre-installed ``kubectl`` and ``helm`` command line tools. Both of these tools will target the k3s cluster by default; however, the ``kubeconfig`` file for the HA cluster can be found in the seed's home directory (named e.g. ``kubeconfig-capi-mgmt-<environment>.yaml``).
+
+.. note::
+
+   The provision playbook is responsible for copying the HA ``kubeconfig`` to this location *after* the HA cluster is up and running. If you need to access the HA cluster while it is still deploying, the ``kubeconfig`` file can be found stored as a Kubernetes secret on the k3s cluster.
+
+It is possible to reconfigure or upgrade the management cluster after initial deployment by simply re-running the ``provision_capi_mgmt`` playbook. However, it's preferable that most Day 2 ops (i.e. reconfigures and upgrades) be done via a CD Pipeline. See `these Azimuth docs `_ for more information.
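+
+It can be useful to sanity check the HA cluster from the seed VM before pointing Magnum at it. The commands below are a minimal sketch, assuming the kubeconfig follows the ``kubeconfig-capi-mgmt-<environment>.yaml`` naming convention described above:
+
+.. code-block:: bash
+
+   # Target the HA cluster instead of the default k3s cluster
+   export KUBECONFIG=$HOME/kubeconfig-capi-mgmt-<environment>.yaml
+
+   # All nodes should report Ready
+   kubectl get nodes
+
+   # The Cluster API operators run as Deployments; they should all be available
+   kubectl get deployments --all-namespaces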
+
+Kayobe Config
 ==============

-Ensure that the nodes (either controllers or dedicated network hosts) that you are running the magnum containers on have connectivity to the network on which your management cluster has a floating IP (so that the magnum containers can reach the IP listed in the kubeconfig).
-Magnum Templates
-================
+To configure the Magnum service with the Cluster API driver enabled, first ensure that your kayobe-config branch is up to date with |current_release_git_branch_name|.
+
+Next, copy the CAPI management cluster's kubeconfig file into your stackhpc-kayobe-config environment (e.g. ``<environment>/kolla/config/magnum/kubeconfig``). This file must be Ansible vault encrypted.
+
+The following config should also be set in your stackhpc-kayobe-config environment:
+
+.. code-block:: yaml
+   :caption: kolla/globals.yml
+
+   magnum_capi_helm_driver_enabled: true

-`azimuth-images `__ builds the required Ubuntu Kubernetes images, and `capi-helm-charts `__ CI runs conformance tests on each image built.
+To apply the configuration, run ``kayobe overcloud service reconfigure -kt magnum``.

-Magnum templates can be deployed using `openstack-config `__. Typically, you would create a fork `-config` of this repository, move the resources defined in `examples/capi-templates-images.yml` into `etc/openstack-config/openstack-config.yml`, and then follow the instructions in the readme to deploy these.
+Magnum Cluster Templates
+========================
+
+The clusters deployed by the Cluster API driver make use of the Ubuntu Kubernetes images built in the `azimuth-images `_ repository and then use `capi-helm-charts `_ to provide the Helm charts which define the clusters based on these images. Between them, these two repositories have CI jobs that regularly build and test images and Helm charts for the latest Kubernetes versions. It is therefore important to update the cluster templates on each cloud regularly to make use of these new releases.
+
+Magnum templates should be defined within an existing client-specific `openstack-config `_ repository. See the openstack-config `README `_ for more details.
diff --git a/doc/source/configuration/monitoring.rst b/doc/source/configuration/monitoring.rst
index 1bd0185347..069bf47007 100644
--- a/doc/source/configuration/monitoring.rst
+++ b/doc/source/configuration/monitoring.rst
@@ -42,17 +42,6 @@ The configuration options can be found in
 .. literalinclude:: ../../../etc/kayobe/stackhpc-monitoring.yml
    :language: yaml

-In order to enable stock monitoring configuration within a particular
-environment, create the following symbolic links:
-
-.. code-block:: console
-
-   cd $KAYOBE_CONFIG_PATH
-   ln -s ../../../../kolla/config/grafana/ environments/$KAYOBE_ENVIRONMENT/kolla/config/
-   ln -s ../../../../kolla/config/prometheus/ environments/$KAYOBE_ENVIRONMENT/kolla/config/
-
-and commit them to the config repository.
-
 SMART Drive Monitoring
 ======================

@@ -140,33 +129,41 @@ enable the ceph mgr exporter.
 OpenStack Capacity
 ==================

-OpenStack Capacity allows you to see how much space you have avaliable
-in your cloud. StackHPC Kayobe Config includes this exporter by default
-and it's necessary that some variables are set to allow deployment.
+OpenStack Capacity allows you to see how much space you have available
+in your cloud. StackHPC Kayobe Config will deploy OpenStack Capacity
+by default on a service deploy; this can be disabled by setting
+``stackhpc_enable_os_capacity`` to false.
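+
+For example, to opt out of deploying the exporter, set the variable in your
+Kayobe configuration (a minimal sketch; only the variable name comes from
+this document):
+
+.. code-block:: yaml
+   :caption: stackhpc-monitoring.yml
+
+   # Disable deployment of the OpenStack Capacity exporter
+   stackhpc_enable_os_capacity: false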

-To successfully deploy OpenStack Capacity, you are required to specify
-the OpenStack application credentials in ``kayobe/secrets.yml`` as:
+OpenStack Capacity is deployed automatically using a service deploy hook
+with the generated kolla-ansible admin credentials. You can override these
+by setting the authentication URL, username, password, project name and
+project domain name in ``stackhpc-monitoring.yml``:

 .. code-block:: yaml

-   secrets_os_exporter_auth_url:
-   secrets_os_exporter_credential_id:
-   secrets_os_exporter_credential_secret:
+   stackhpc_os_capacity_auth_url:
+   stackhpc_os_capacity_username:
+   stackhpc_os_capacity_password:
+   stackhpc_os_capacity_project_name:
+   stackhpc_os_capacity_domain_name:
+   stackhpc_os_capacity_openstack_region_name:

-After defining your credentials, You may deploy OpenStack Capacity
-using the ``ansible/deploy-os-capacity-exporter.yml`` Ansible playbook
-via Kayobe.
+Additionally, you should ensure these credentials have the correct permissions
+for the exporter. If you are deploying in a cloud with internal TLS, you may need
+to disable certificate verification for the OpenStack Capacity exporter
+if your certificate is not signed by a trusted CA.

-.. code-block:: console
+.. code-block:: yaml

-   kayobe playbook run ansible/deploy-os-capacity-exporter.yml
+   stackhpc_os_capacity_openstack_verify: false

-It is required that you re-configure the Prometheus, Grafana and HAProxy
-services following deployment, to do this run the following Kayobe command.
+If you've modified your credentials, you will need to re-deploy OpenStack Capacity
+using the ``deploy-os-capacity-exporter.yml`` Ansible playbook
+via Kayobe.

 .. code-block:: console

-   kayobe overcloud service reconfigure -kt grafana,prometheus,haproxy
+   kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/deploy-os-capacity-exporter.yml

 If you notice ``HaproxyServerDown`` or ``HaproxyBackendDown`` prometheus
 alerts after deployment it's likely the os_exporter secrets have not been
diff --git a/doc/source/configuration/vault.rst b/doc/source/configuration/vault.rst
index 64ad4efb26..a1a8429e2e 100644
--- a/doc/source/configuration/vault.rst
+++ b/doc/source/configuration/vault.rst
@@ -6,6 +6,18 @@ This document describes how to deploy Hashicorp Vault for internal PKI purposes
 using the `StackHPC Hashicorp collection
 `_

+Vault may be used as a Certificate Authority to generate certificates for:
+
+* OpenStack internal API
+* OpenStack backend APIs
+* RabbitMQ
+
+TLS support is described in the :kolla-ansible-doc:`Kolla Ansible documentation
+` and the :kayobe-doc:`Kayobe documentation
+`.
+
+Vault may also be used as the secret store for Barbican.
+
 Background
 ==========

@@ -111,7 +123,7 @@ Setup HAProxy config for Vault

 .. code-block::

-   kayobe overcloud service deploy -kt haproxy
+   kayobe overcloud service deploy --skip-tags os_capacity -kt haproxy

 Setup Vault HA on the overcloud hosts
 -------------------------------------
@@ -296,7 +308,9 @@ Configure Barbican
     [vault_plugin]
     vault_url = https://{{ kolla_internal_vip_address }}:8200
     use_ssl = True
-    ssl_ca_crt_file = {% raw %}{{ openstack_cacert }}{% endraw %}
+    {% raw %}
+    ssl_ca_crt_file = {{ openstack_cacert }}
+    {% endraw %}
     approle_role_id = {{ secrets_barbican_approle_role_id }}
     approle_secret_id = {{ secrets_barbican_approle_secret_id }}
     kv_mountpoint = barbican
diff --git a/doc/source/contributor/environments/aufn-ceph.rst b/doc/source/contributor/environments/aufn-ceph.rst
index c88498982f..a5e8d72cc8 100644
--- a/doc/source/contributor/environments/aufn-ceph.rst
+++ b/doc/source/contributor/environments/aufn-ceph.rst
@@ -14,7 +14,7 @@ This environment creates a Universe-from-nothing_-style deployment of Kayobe con
 Prerequisites
 =============

-* a baremetal node with at least 64GB of RAM running CentOS Stream 8 (or Ubuntu)
+* a baremetal node with at least 64GB of RAM running Rocky Linux 9 or Ubuntu Jammy.
 * access to the test pulp server on SMS lab

diff --git a/doc/source/contributor/environments/ci-builder.rst b/doc/source/contributor/environments/ci-builder.rst
index ae1f3e86e6..15a4560447 100644
--- a/doc/source/contributor/environments/ci-builder.rst
+++ b/doc/source/contributor/environments/ci-builder.rst
@@ -25,7 +25,7 @@ Access the host via SSH.

 Install package dependencies.

-On CentOS:
+On Rocky Linux:

 .. parsed-literal::

diff --git a/doc/source/contributor/package-updates.rst b/doc/source/contributor/package-updates.rst
index 51473b8601..5577fce653 100644
--- a/doc/source/contributor/package-updates.rst
+++ b/doc/source/contributor/package-updates.rst
@@ -102,7 +102,7 @@ For Rocky Linux 9, bump the snapshot versions in /etc/yum/repos.d with:

 .. code-block:: console

-   kayobe overcloud host configure -t dnf -kt none
+   kayobe overcloud host configure -t dnf

 Install new packages:

diff --git a/doc/source/operations/index.rst b/doc/source/operations/index.rst
index 94880ba6a8..39f1bb847e 100644
--- a/doc/source/operations/index.rst
+++ b/doc/source/operations/index.rst
@@ -7,8 +7,10 @@ This guide is for operators of the StackHPC Kayobe configuration project.
 .. toctree::
    :maxdepth: 1

-   upgrading
-   rabbitmq
-   octavia
    hotfix-playbook
+   nova-compute-ironic
+   octavia
+   rabbitmq
    secret-rotation
+   tempest
+   upgrading
diff --git a/doc/source/operations/nova-compute-ironic.rst b/doc/source/operations/nova-compute-ironic.rst
new file mode 100644
index 0000000000..6cbe00550f
--- /dev/null
+++ b/doc/source/operations/nova-compute-ironic.rst
@@ -0,0 +1,307 @@
+===================
+Nova Compute Ironic
+===================
+
+This section describes the deployment of the OpenStack Nova Compute
+Ironic service. The Nova Compute Ironic service is used to integrate
+OpenStack Ironic into Nova as a 'hypervisor' driver. The end users of Nova
+can then deploy and manage baremetal hardware, in a similar way to VMs.
+
+High Availability (HA)
+======================
+
+The OpenStack Nova Compute service is designed to be installed once on every
+hypervisor in an OpenStack deployment. In this configuration, it makes little
+sense to run additional service instances. Even if you wanted to, it's not
+supported by design. This pattern breaks down with the Ironic baremetal
+service, which must run on the OpenStack control plane. It is not feasible
+to have a 1:1 mapping of Nova Compute Ironic services to baremetal nodes.
+
+The obvious HA solution is to run multiple instances of Nova Compute Ironic
+on the control plane, so that if one fails, the others can take over. However,
+due to assumptions long baked into the Nova source code, this is not trivial.
+The HA feature provided by the Nova Compute Ironic service has proven to be
+unstable, and the direction upstream is to switch to an active/passive
+solution [1].
+
+However, challenges still exist with the active/passive solution. Since the
+Nova Compute Ironic HA feature is 'always on', one must ensure that only a
+single instance (per Ironic conductor group) is ever running. It is not
+possible to simply put multiple service instances behind HAProxy and use the
+active/passive mode.
+
+Such problems are commonly solved with a technology such as Pacemaker, or in
+the modern world, with a container orchestration engine such as Kubernetes.
+Kolla Ansible provides neither, because in general it doesn't need to. Its
+goal is simplicity.
+
+The interim solution is therefore to run a single Nova Compute Ironic
+service. If the service goes down, remedial action must be taken before
+Ironic nodes can be managed. In many environments the loss of the Ironic
+API for short periods is acceptable, providing that it can be easily
+resurrected. The purpose of this document is to facilitate that.
+
+.. note::
+
+   The new sharding mode is not covered here and it is assumed that you are
+   not using it. See [1] for further information. This will be updated in
+   the future.
+
+Optimal configuration of Nova Compute Ironic
+============================================
+
+Determine the current configuration for the site. How many Nova Compute
+Ironic instances are running on the control plane?
+
+.. code-block:: console
+
+   $ openstack compute service list
+
+Typically you will see either three or one. By default the host will be
+marked with a postfix, e.g. ``controller1-ironic``. If you find more than
+one, you will need to remove some instances. You must complete the
+following section.
+
+Moving from multiple Nova Compute Instances to a single instance
+----------------------------------------------------------------
+
+1. Decide where the single instance should run. This should normally be
+   one of the three OpenStack control plane hosts. By convention, pick
+   the first one, unless you can think of a good reason not to. Once you
+   have chosen, set the following variable in ``etc/kayobe/nova.yml``.
+   Here we have picked ``controller1``.
+
+   .. code-block:: yaml
+
+      kolla_nova_compute_ironic_host: controller1
+
+2. Ensure that you have organised a maintenance window, during which
+   there will be no Ironic operations. You will be breaking the Ironic
+   API.
+
+3. Perform a database backup.
+
+   .. code-block:: console
+
+      $ kayobe overcloud database backup -vvv
+
+   Check the output of the command, and locate the backup files.
+
+4. Identify baremetal nodes associated with Nova Compute Ironic instances
+   that will be removed. You don't need to do anything with these
+   specifically; it's just for reference later. For example:
+
+   .. code-block:: console
+
+      $ openstack baremetal node list --long -c "Instance Info" | grep controller3-ironic | wc -l
+      61
+      $ openstack baremetal node list --long -c "Instance Info" | grep controller2-ironic | wc -l
+      35
+      $ openstack baremetal node list --long -c "Instance Info" | grep controller1-ironic | wc -l
+      55
+
+5. Disable the redundant Nova Compute Ironic services:
+5. Disable the redundant Nova Compute Ironic services:
+
+   .. code-block:: console
+
+      $ openstack compute service set controller3-ironic nova-compute --disable
+      $ openstack compute service set controller2-ironic nova-compute --disable
+
+6. Delete the redundant Nova Compute Ironic services. You will need the service
+   ID. For example:
+
+   .. code-block:: console
+
+      $ ID=$(openstack compute service list | grep foo | awk '{print $2}')
+      $ openstack compute service delete --os-compute-api-version 2.53 $ID
+
+   In older releases, you may hit a bug where the service can't be deleted if it
+   is not managing any instances. In this case just move on and leave the service
+   disabled, e.g.:
+
+   .. code-block:: console
+
+      $ openstack compute service delete --os-compute-api-version 2.53 c993b57e-f60c-4652-8328-5fb0e17c99c0
+      Failed to delete compute service with ID 'c993b57e-f60c-4652-8328-5fb0e17c99c0': HttpException: 500: Server Error for url:
+      https://acme.pl-2.internal.hpc.is:8774/v2.1/os-services/c993b57e-f60c-4652-8328-5fb0e17c99c0, Unexpected API Error.
+      Please report this at http://bugs.launchpad.net/nova/ and attach the Nova API log if possible.
+
+7. Remove the Docker containers for the redundant Nova Compute Ironic services:
+
+   .. code-block:: console
+
+      $ ssh controller2 sudo docker rm -f nova_compute_ironic
+      $ ssh controller3 sudo docker rm -f nova_compute_ironic
+
+8. Ensure that all Ironic nodes are using the single remaining Nova Compute
+   Ironic instance. Note that baremetal nodes in use by compute instances will
+   not automatically fail over to the remaining Nova Compute Ironic service;
+   the Nova database must be updated by hand. Here, the active service is
+   running on ``controller1``:
+
+   .. code-block:: console
+
+      $ ssh controller1
+      $ sudo docker exec -it mariadb mysql -u nova -p$(sudo grep 'mysql+pymysql://nova:' /etc/kolla/nova-api/nova.conf | awk -F'[:,@]' '{print $3}')
+      MariaDB [(none)]> use nova;
+
+   Proceed with caution. It is good practice to update one record first:
+
+   .. code-block:: console
+
+      MariaDB [nova]> update instances set host='controller1-ironic' where deleted=0 and host='controller3-ironic' limit 1;
+      Query OK, 1 row affected (0.002 sec)
+      Rows matched: 1  Changed: 1  Warnings: 0
+
+   At this stage you should go back to step 4 and check that the numbers have
+   changed as expected. When you are happy, update the remaining records for all
+   services which have been removed:
+
+   .. code-block:: console
+
+      MariaDB [nova]> update instances set host='controller1-ironic' where deleted=0 and host='controller3-ironic';
+      Query OK, 59 rows affected (0.009 sec)
+      Rows matched: 59  Changed: 59  Warnings: 0
+      MariaDB [nova]> update instances set host='controller1-ironic' where deleted=0 and host='controller2-ironic';
+      Query OK, 35 rows affected (0.003 sec)
+      Rows matched: 35  Changed: 35  Warnings: 0
+
+9. Repeat step 4. Verify that all Ironic nodes are using the single remaining
+   Nova Compute Ironic instance.
+
+
+Making it easy to re-deploy Nova Compute Ironic
+-----------------------------------------------
+
+In the previous section we saw that at any given time, a baremetal node is
+associated with a single Nova Compute Ironic instance. At this stage, assuming
+that you have diligently followed the instructions, you are in the situation
+where all Ironic baremetal nodes are managed by a single Nova Compute Ironic
+instance. If this service goes down, you will not be able to manage *any*
+baremetal nodes.
+
+By default, the single remaining Nova Compute Ironic instance will be named
+after the host on which it is deployed. The host name is passed to the Nova
+Compute Ironic instance via the ``[DEFAULT]`` section of the ``nova.conf``
+file, using the ``host`` field.
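+For illustration, the rendered ``nova.conf`` on the host running the service
+would then contain something like the following (the host name is assumed from
+the examples in this guide):
+
+.. code-block:: ini
+
+   [DEFAULT]
+   # The service name that Nova records for this instance.
+   host = controller1-ironic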
+
+If you wish to re-deploy this instance, for example because the original host
+was permanently mangled in the World Server Throwing Championship [2], you
+must ensure that the new instance has the same name as the old one. Simply
+setting ``kolla_nova_compute_ironic_host`` to another controller and
+re-deploying the service is not enough; the new instance will be named after
+the new host.
+
+To work around this you should set the ``host`` field in ``nova.conf`` to a
+constant, such that the new Nova Compute Ironic instance comes up with the
+same name as the one it replaces.
+
+For example, if the original instance resides on ``controller1``, then set the
+following in ``etc/kayobe/nova.yml``:
+
+.. code-block:: yaml
+
+   kolla_nova_compute_ironic_static_host_name: controller1-ironic
+
+Note that an ``-ironic`` postfix is added to the hostname. This comes from
+a convention in Kolla Ansible. It is worth making this change ahead of time,
+even if you don't need to immediately re-deploy the service.
+
+It is also possible to use an arbitrary ``host`` name, but you will need
+to edit the database again. That is an optional exercise left for the reader.
+See [1] for further details.
+
+.. note::
+
+   There is a bug when overriding the host name in Kolla Ansible, where it
+   is currently assumed that it will be set to the actual hostname + an
+   -ironic postfix. The service will come up correctly, but Kolla Ansible
+   will not detect it. See here:
+   https://bugs.launchpad.net/kolla-ansible/+bug/2056571
+
+Re-deploying Nova Compute Ironic
+--------------------------------
+
+The decision to re-deploy Nova Compute Ironic to another host should only be
+taken if there is a strong reason to do so. The objective is to minimise
+the chance of the old instance starting up alongside the new one. If the
+original host has been re-imaged or physically replaced, there is no risk.
+However, if the original host has been taken down for non-destructive
+maintenance, it is better to avoid re-deploying the service if the end users
+can tolerate the wait. If you are forced to re-deploy the service, knowing
+that the original instance may start when the host comes back online, you
+must plan accordingly. For example, by booting the original host in maintenance
+mode and removing the old service before it can start, or by stopping the
+new instance before the original one comes back up, and then reverting the
+config to move the service back to the original host.
+
+There are essentially two scenarios for re-deploying Nova Compute Ironic.
+These are described in the following sub-sections.
+
+Current host is accessible
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Adjust the ``kolla_nova_compute_ironic_host`` variable to point to the
+new host, e.g.:
+
+.. code-block:: diff
+
+   -kolla_nova_compute_ironic_host: controller1
+   +kolla_nova_compute_ironic_host: controller2
+
+Remove the old container:
+
+.. code-block:: console
+
+   $ ssh controller1 sudo docker rm -f nova_compute_ironic
+
+Deploy the new service:
+
+.. code-block:: console
+
+   $ kayobe overcloud service deploy -kl controller2 -l controller2 -kt nova
+
+Verify that the new service appears as 'up' and 'enabled':
+
+.. code-block:: console
+
+   $ openstack compute service list
+
+Current host is not accessible
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In this case you will need to remove the inaccessible host from the inventory.
+For example, in ``etc/kayobe/inventory/hosts``, remove ``controller1`` from
+the ``controllers`` group.
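+A minimal sketch of the resulting ``etc/kayobe/inventory/hosts`` edit (host
+names follow the examples in this guide):
+
+.. code-block:: ini
+
+   [controllers]
+   # controller1 removed while it is unreachable
+   controller2
+   controller3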
+
+Adjust the ``kolla_nova_compute_ironic_host`` variable to point to the
+new host, e.g.:
+
+.. code-block:: diff
+
+   -kolla_nova_compute_ironic_host: controller1
+   +kolla_nova_compute_ironic_host: controller2
+
+Deploy the new service:
+
+.. code-block:: console
+
+   $ kayobe overcloud service reconfigure -kl controller2 -l controller2 -kt nova
+
+Verify that the new service appears as 'up' and 'enabled':
+
+.. code-block:: console
+
+   $ openstack compute service list
+
+.. note::
+
+   It is important to ensure that the original service cannot start up again;
+   it is up to you to prevent this.
+
+.. note::
+
+   Once merged, the work on 'Kayobe reliability' may allow this step to run
+   without modifying the inventory to remove the broken host.
+
+[1] https://specs.openstack.org/openstack/nova-specs/specs/2024.1/approved/ironic-shards.html#migrate-from-peer-list-to-shard-key
+[2] https://www.cloudfest.com/world-server-throwing-championship
diff --git a/doc/source/operations/tempest.rst b/doc/source/operations/tempest.rst
new file mode 100644
index 0000000000..c747b53774
--- /dev/null
+++ b/doc/source/operations/tempest.rst
@@ -0,0 +1,329 @@
+======================================
+Running Tempest with Kayobe Automation
+======================================
+
+Overview
+========
+
+This document describes how to configure and run `Tempest
+<https://docs.openstack.org/tempest/latest/>`_ using `kayobe-automation
+<https://github.com/stackhpc/kayobe-automation>`_ from the ``.automation``
+submodule included with ``stackhpc-kayobe-config``.
+
+The best way of running Tempest is to use CI/CD workflows. Before proceeding,
+consider whether it would be possible to use or set up a CI/CD workflow
+instead. For more information, see the :doc:`CI/CD workflows page `.
+
+The following guide will assume all commands are run from your
+``kayobe-config`` root and the environment has been configured to run Kayobe
+commands unless stated otherwise.
+
+Prerequisites
+=============
+
+Installing Docker
+-----------------
+
+``kayobe-automation`` runs in a container on the Ansible control host. This
+means that Docker must be installed on the Ansible control host if it is not
+already.
+
+.. warning::
+
+   Docker can cause networking issues when it is installed. By default, it
+   will create a bridge and change ``iptables`` rules. These can be disabled
+   by setting the following in ``/etc/docker/daemon.json``:
+
+   .. code-block:: json
+
+      {
+        "bridge": "none",
+        "iptables": false
+      }
+
+   The bridge is the most common cause of issues and is *usually* safe to
+   disable. Disabling the ``iptables`` rules will break any GitHub Actions
+   runners running on the host.
+
+To install Docker on Ubuntu:
+
+.. code-block:: bash
+
+   # Add Docker's official GPG key:
+   sudo apt-get update
+   sudo apt-get install ca-certificates curl
+   sudo install -m 0755 -d /etc/apt/keyrings
+   sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+   sudo chmod a+r /etc/apt/keyrings/docker.asc
+
+   # Add the repository to Apt sources:
+   echo \
+     "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+     $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+     sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+   sudo apt-get update
+   sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+
+Installing Docker on Rocky:
+
+.. code-block:: bash
+
+   sudo dnf install -y dnf-utils
+   sudo dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
+   sudo dnf install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+
+Ensure Docker is running & enabled:
+
+.. code-block:: bash
+
+   sudo systemctl start docker
+   sudo systemctl enable docker
+
+The Docker ``buildx`` plugin must be installed. If you are using an existing
+installation of Docker, you may need to install it with:
+
+.. code-block:: bash
+
+   sudo dnf install docker-buildx-plugin  # or: sudo apt install docker-buildx-plugin
+   sudo docker buildx install
+   # or if that fails:
+   sudo docker plugin install buildx
+
+Building a Kayobe container
+---------------------------
+
+Build a Kayobe automation image:
+
+.. code-block:: bash
+
+   git submodule init
+   git submodule update
+   # If running on Ubuntu, the fact cache can confuse Kayobe in the Rocky-based container
+   mv etc/kayobe/facts{,-old}
+   sudo DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE=rockylinux:9 --file .automation/docker/kayobe/Dockerfile --tag kayobe:latest .
+
+Configuration
+=============
+
+Kayobe automation configuration files are stored in the ``.automation.conf/``
+directory. It contains:
+
+- A script used to export environment variables for meta configuration of
+  Tempest - ``.automation.conf/config.sh``.
+- Tempest configuration override files, stored in ``.automation.conf/tempest/``
+  and conventionally named ``tempest.overrides.conf`` or
+  ``tempest-<environment>.overrides.conf``.
+- Tempest load lists, stored in ``.automation.conf/tempest/load-lists``.
+- Tempest skip lists, stored in ``.automation.conf/tempest/skip-lists``.
+
+config.sh
+---------
+
+``config.sh`` is a mandatory shell script, primarily used to export environment
+variables for the meta configuration of Tempest.
+
+See:
+https://github.com/stackhpc/docker-rally/blob/master/bin/rally-verify-wrapper.sh
+for a full list of Tempest parameters that can be overridden.
+
+The most common variables to override are:
+
+- ``TEMPEST_CONCURRENCY`` - The maximum number of tests to run in parallel at
+  one time. Higher values are faster but increase the risk of timeouts. 1-2 is
+  safest in CI/Tenks/Multinode/AIO etc. 8-32 is typical in production. Default
+  value is 2.
+- ``KAYOBE_AUTOMATION_TEMPEST_LOADLIST``: the filename of a load list in the
+  ``load-lists`` directory. Default value is ``default`` (symlink to refstack).
+- ``KAYOBE_AUTOMATION_TEMPEST_SKIPLIST``: the filename of a skip list in the
+  ``skip-lists`` directory. Default value is unset.
+- ``TEMPEST_OPENRC``: The **contents** of an ``openrc.sh`` file, to be used by
+  Tempest to create resources on the cloud. Default is to read in the contents
+  of ``etc/kolla/public-openrc.sh``.
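+
+For illustration, a minimal ``config.sh`` might export values such as the
+following (the skip list file name here is hypothetical; the variables are
+those described above):
+
+.. code-block:: bash
+
+   # Run up to 8 tests in parallel; 1-2 is safer in small test environments.
+   export TEMPEST_CONCURRENCY=8
+   # Select a load list from .automation.conf/tempest/load-lists/.
+   export KAYOBE_AUTOMATION_TEMPEST_LOADLIST=default
+   # Select a skip list from .automation.conf/tempest/skip-lists/ (the file must exist).
+   export KAYOBE_AUTOMATION_TEMPEST_SKIPLIST=known-failures
+   # Credentials used by Tempest to create resources on the cloud.
+   export TEMPEST_OPENRC="$(cat etc/kolla/public-openrc.sh)"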
+
+tempest.overrides.conf
+----------------------
+
+Tempest uses a configuration file to define which tests are run and how to run
+them. A full sample configuration file can be found in the `Tempest
+documentation <https://docs.openstack.org/tempest/latest/sampleconf.html>`_.
+Sensible defaults exist for all values and in most situations, a blank
+``*overrides.conf`` file will successfully run many tests. It will, however,
+also skip many tests which may otherwise be appropriate to run.
+
+`Shakespeare `_ is a tool for
+generating Tempest configuration files. It contains elements for different
+cloud features, which can be combined to template out a detailed configuration
+file. This is the best-practice approach.
+
+Below is an example of a manually generated file including many of the most
+common overrides. It makes many assumptions about the environment, so make
+sure you understand all the options before applying them.
+
+.. NOTE(upgrade): Microversions change for each release
+.. code-block:: ini
+
+   [openstack]
+   # Use a StackHPC-built image without a default password.
+   img_url=https://github.com/stackhpc/cirros/releases/download/20231206/cirros-d231206-x86_64-disk.img
+
+   [auth]
+   # Expect unlimited quotas for CPU cores and RAM
+   compute_quotas = cores:-1,ram:-1
+
+   [compute]
+   # Required for migration testing
+   min_compute_nodes = 2
+   # Required to test some API features
+   min_microversion = 2.1
+   max_microversion = 2.95
+   # Flavors for creating test servers and server resize. The ``alt`` flavor should be larger.
+   flavor_ref =
+   flavor_ref_alt =
+   volume_multiattach = true
+
+   [compute-feature-enabled]
+   # Required for migration testing
+   resize = true
+   live_migration = true
+   block_migration_for_live_migration = false
+   volume_backed_live_migration = true
+
+   [placement]
+   min_microversion = 1.0
+   max_microversion = 1.39
+
+   [volume]
+   storage_protocol = ceph
+   # Required to test some API features
+   min_microversion = 3.0
+   max_microversion = 3.70
+
+Tempest configuration override files are stored in
+``.automation.conf/tempest/``. The default file used is
+``tempest.overrides.conf`` or ``tempest-<environment>.overrides.conf``
+depending on whether a Kayobe environment is enabled. This can be changed by
+setting ``KAYOBE_AUTOMATION_TEMPEST_CONF_OVERRIDES`` to a different file path.
+An ``overrides.conf`` file must be supplied, even if it is blank.
+
+Load Lists
+----------
+
+Load lists are a newline-separated list of tests to run. They are stored in
+``.automation.conf/tempest/load-lists/``. The directory contains three objects
+by default:
+
+- ``tempest-full`` - A complete list of all possible tests.
+- ``platform.2022.11-test-list.txt`` - A reduced list of tests to match the
+  `Refstack <https://refstack.openstack.org/>`_ standard.
+- ``default`` - A symlink to ``platform.2022.11-test-list.txt``.
+
+Test lists can be selected by changing ``KAYOBE_AUTOMATION_TEMPEST_LOADLIST``
+in ``config.sh``. The default value is ``default``, which symlinks to
+``platform.2022.11-test-list.txt``.
+
+A common use case is to use the ``failed-tests`` list output from a previous
+Tempest run as a load list, to retry the failed tests after making changes.
+
+Skip Lists
+----------
+
+Skip lists are a newline-separated list of tests to skip. They are stored in
+``.automation.conf/tempest/skip-lists/``. Each line consists of a pattern to
+match against test names, and a string explaining why the test is being
+skipped, e.g.:
+
+.. code-block::
+
+   tempest.scenario.test_network_basic_ops.TestNetworkBasicOps.test_subnet_details.*: "Cirros image doesn't have /var/run/udhcpc.eth0.pid"
+
+There is no requirement for a skip list, and none is selected by default. A
+skip list can be selected by setting ``KAYOBE_AUTOMATION_TEMPEST_SKIPLIST`` in
+``config.sh``.
+
+Tempest runner
+--------------
+
+While the Kayobe automation container is always deployed to the Ansible
+control host, the Tempest container is deployed to the host in the
+``tempest_runner`` group, which can be any host in the Kayobe inventory. The
+group should only ever contain one host. The seed is usually used as the
+Tempest runner; however, it is also common to use the Ansible control host or
+an infrastructure VM. The main requirement is that the host can reach the
+OpenStack API.
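+For illustration, a minimal inventory snippet placing the seed in the group
+might look like this (the host name is site-specific):
+
+.. code-block:: ini
+
+   [tempest_runner]
+   # A single host that can reach the OpenStack API; the seed is typical.
+   seed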
+
+Running Tempest
+===============
+
+Kayobe automation will need to SSH to the Tempest runner (even if they are on
+the same host), so it requires an SSH key, exported as
+``KAYOBE_AUTOMATION_SSH_PRIVATE_KEY``, e.g.:
+
+.. code-block:: bash
+
+   export KAYOBE_AUTOMATION_SSH_PRIVATE_KEY=$(cat ~/.ssh/id_rsa)
+
+Tempest outputs will be sent to the ``tempest-artifacts/`` directory. Create
+it if it does not exist.
+
+.. code-block:: bash
+
+   mkdir tempest-artifacts
+
+The contents of ``tempest-artifacts`` will be overwritten. Ensure any previous
+test results have been copied away.
+
+The Tempest playbook is invoked through the Kayobe container using this
+command from the base of the ``kayobe-config`` directory:
+
+.. code-block:: bash
+
+   sudo -E docker run --name kayobe-automation --detach -it --rm --network host \
+     -v $(pwd):/stack/kayobe-automation-env/src/kayobe-config -v $(pwd)/tempest-artifacts:/stack/tempest-artifacts \
+     -e KAYOBE_ENVIRONMENT -e KAYOBE_VAULT_PASSWORD -e KAYOBE_AUTOMATION_SSH_PRIVATE_KEY kayobe:latest \
+     /stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/tempest.sh -e ansible_user=stack
+
+By default, ``no_log`` is set to stop credentials from leaking. This can be
+disabled by adding ``-e rally_no_sensitive_log=false`` to the end.
+
+To follow the progress of the Kayobe automation container, either remove
+``--detach`` from the above command, or follow the Docker logs of the
+``kayobe-automation`` container.
+
+To follow the progress of the Tempest tests themselves, follow the logs of the
+``tempest`` container on the ``tempest_runner`` host.
+
+.. code-block:: bash
+
+   ssh <tempest runner host>
+   sudo docker logs -f tempest
+
+Tempest will keep running until completion if the ``kayobe-automation``
+container is stopped. The ``tempest`` container must be stopped manually.
+Doing so will, however, stop test resources (such as networks, images, and
+VMs) from being automatically cleaned up. They must instead be removed
+manually. They should be clearly labeled with either ``rally`` or ``tempest``
+in the name, often alongside a randomly generated string.
+
+Outputs
+-------
+
+Tempest outputs will be sent to the ``tempest-artifacts/`` directory. It
+contains the following artifacts:
+
+- ``docker.log`` - The logs from the ``tempest`` Docker container.
+- ``failed-tests`` - A simple list of tests that failed.
+- ``rally-junit.xml`` - An XML file listing all tests in the test list and
+  their status (skipped/succeeded/failed). Usually not useful.
+- ``rally-verify-report.html`` - An HTML page with all test results including
+  an error trace for failed tests. It is often best to ``scp`` this file back
+  to your local machine to view it (see the example after this list). This is
+  the most user-friendly way to view the test results, however it can be
+  awkward to host.
+- ``rally-verify-report.json`` - A JSON blob with all test results including an
+  error trace for failed tests. It contains all the same data as the HTML
+  report but without formatting.
+- ``stderr.log`` - The stderr log. Usually not useful.
+- ``stdout.log`` - The stdout log. Usually not useful.
+- ``tempest-load-list`` - The load list that Tempest was invoked with.
+- ``tempest.log`` - Detailed logs from Tempest. Contains more data than the
+  ``verify`` reports, but can be difficult to parse. Useful for tracing
+  specific errors.
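+
+For example, to fetch the HTML report for local viewing (the host name and
+remote path here are illustrative):
+
+.. code-block:: bash
+
+   # Run from your local machine; adjust the remote path to your kayobe-config checkout.
+   scp control1:kayobe-config/tempest-artifacts/rally-verify-report.html .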
diff --git a/doc/source/operations/upgrading.rst b/doc/source/operations/upgrading.rst index a5d781ddc5..89f8f6aa8c 100644 --- a/doc/source/operations/upgrading.rst +++ b/doc/source/operations/upgrading.rst @@ -199,14 +199,14 @@ Known issues * Docker log-opts are currently not configured in Antelope. You will see these being removed when running a host configure in check+diff mode. See bug for - details (fix in progress): + details (fix released): https://bugs.launchpad.net/ansible-collection-kolla/+bug/2040105 * /etc/hosts are not templated correctly when running a host configure with ``--limit``. To work around this, run your host configures with ``--skip-tags etc-hosts``. If you do need to change ``/etc/hosts``, for example with any newly-added hosts, run a full host configure afterward with - ``--tags etc-hosts``. See bug for details (fix in progress): + ``--tags etc-hosts``. See bug for details (fix released): https://bugs.launchpad.net/kayobe/+bug/2051714 Security baseline @@ -948,7 +948,7 @@ least start with a small number of hosts.: .. code-block:: console - kayobe overcloud host configure --limit --kolla-limit + kayobe overcloud host configure --limit Alternatively, to apply the configuration to all hosts: diff --git a/etc/kayobe/ansible.cfg b/etc/kayobe/ansible.cfg index 515af8d324..b38cb8239d 100644 --- a/etc/kayobe/ansible.cfg +++ b/etc/kayobe/ansible.cfg @@ -8,6 +8,12 @@ bin_ansible_callbacks = True inject_facts_as_vars = False # Add timing information to output callbacks_enabled = ansible.posix.profile_tasks +# Silence warning about invalid characters found in group names +force_valid_group_names = ignore + +[inventory] +# Fail when any inventory source cannot be parsed. +any_unparsed_is_failed = True [ssh_connection] pipelining = True diff --git a/etc/kayobe/ansible/deploy-os-capacity-exporter.yml b/etc/kayobe/ansible/deploy-os-capacity-exporter.yml index 4eeb69431e..cc3afa7b0e 100644 --- a/etc/kayobe/ansible/deploy-os-capacity-exporter.yml +++ b/etc/kayobe/ansible/deploy-os-capacity-exporter.yml @@ -1,20 +1,51 @@ --- -- hosts: monitoring +- name: Remove legacy os_exporter.cfg file + hosts: network + tags: os_capacity gather_facts: false + tasks: + - name: Ensure legacy os_exporter.cfg config file is deleted + ansible.builtin.file: + path: /etc/kolla/haproxy/services.d/os_exporter.cfg + state: absent + become: true +- name: Deploy os-capacity exporter + hosts: monitoring + tags: os_capacity + gather_facts: false tasks: - name: Create os-capacity directory ansible.builtin.file: path: /opt/kayobe/os-capacity/ state: directory + when: stackhpc_enable_os_capacity + + - name: Read admin-openrc credential file + ansible.builtin.command: + cmd: "cat {{ lookup('ansible.builtin.env', 'KOLLA_CONFIG_PATH') }}/admin-openrc.sh" + delegate_to: localhost + register: credential + when: stackhpc_enable_os_capacity + + - name: Set facts for admin credentials + ansible.builtin.set_fact: + stackhpc_os_capacity_auth_url: "{{ credential.stdout_lines | select('match', '.*OS_AUTH_URL*.') | first | split('=') | last | replace(\"'\",'') }}" + stackhpc_os_capacity_project_name: "{{ credential.stdout_lines | select('match', '.*OS_PROJECT_NAME*.') | first | split('=') | last | replace(\"'\",'') }}" + stackhpc_os_capacity_domain_name: "{{ credential.stdout_lines | select('match', '.*OS_PROJECT_DOMAIN_NAME*.') | first | split('=') | last | replace(\"'\",'') }}" + stackhpc_os_capacity_openstack_region_name: "{{ credential.stdout_lines | select('match', '.*OS_REGION_NAME*.') | first | split('=') | last | 
replace(\"'\",'') }}" + stackhpc_os_capacity_username: "{{ credential.stdout_lines | select('match', '.*OS_USERNAME*.') | first | split('=') | last | replace(\"'\",'') }}" + stackhpc_os_capacity_password: "{{ credential.stdout_lines | select('match', '.*OS_PASSWORD*.') | first | split('=') | last | replace(\"'\",'') }}" + when: stackhpc_enable_os_capacity - name: Template clouds.yml ansible.builtin.template: src: templates/os_capacity-clouds.yml.j2 dest: /opt/kayobe/os-capacity/clouds.yaml + when: stackhpc_enable_os_capacity - name: Ensure os_capacity container is running - docker_container: + community.docker.docker_container: name: os_capacity image: ghcr.io/stackhpc/os-capacity:master env: @@ -27,3 +58,4 @@ network_mode: host restart_policy: unless-stopped become: true + when: stackhpc_enable_os_capacity diff --git a/etc/kayobe/ansible/docker-registry-login.yml b/etc/kayobe/ansible/docker-registry-login.yml new file mode 100644 index 0000000000..39ad036001 --- /dev/null +++ b/etc/kayobe/ansible/docker-registry-login.yml @@ -0,0 +1,11 @@ +--- +- name: Login to docker registry + gather_facts: false + hosts: container-image-builders + tasks: + - name: Login to docker registry + docker_login: + registry_url: "{{ kolla_docker_registry or omit }}" + username: "{{ kolla_docker_registry_username }}" + password: "{{ kolla_docker_registry_password }}" + reauthorize: yes diff --git a/etc/kayobe/ansible/fix-houston.yml b/etc/kayobe/ansible/fix-houston.yml new file mode 100644 index 0000000000..6fa865792b --- /dev/null +++ b/etc/kayobe/ansible/fix-houston.yml @@ -0,0 +1,44 @@ +--- +# When OVS HW offloading is enabled - typically in conjunction with VF-LAG and ASAP^2 +# the DMESG log reports frequent errors on the internal OVS Bridge interface: +# "tc mirred to Houston: device bond0-ovs is down". +# This interface is down by default. The errors are mitigated by bringing the interface up. 
+# For further context, see: +# https://bugs.launchpad.net/charm-neutron-openvswitch/+bug/1899364 +# https://patchwork.kernel.org/project/netdevbpf/patch/c2ef23da1d9a4eb62f4e7b7c4540f9bafb553c15.1658420239.git.dcaratti@redhat.com/ +# To deploy this playbook, use the following commands: +# kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/fix-houston.yml +# Enable with Kayobe Hooks by running: +# mkdir -p ${KAYOBE_CONFIG_PATH}/hooks/overcloud-service-deploy/post.d +# cd ${KAYOBE_CONFIG_PATH}/hooks/overcloud-service-deploy/post.d +# ln -s ../../../ansible/fix-houston.yml 90-fix-houston.yml + +- name: Create Systemd Unit to workaround 'tc mirred to Houston' error + hosts: network,compute + become: yes + + tasks: + - name: Include kolla-ansible host vars + include_vars: "{{ kolla_config_path }}/inventory/overcloud/host_vars/{{ inventory_hostname }}" + + - name: Create systemd service for -ovs network interface + template: + src: fix-houston-interface.service.j2 + dest: "/etc/systemd/system/fix-houston-{{ item }}.service" + loop: "{{ neutron_bridge_name.split(',') }}" + vars: + interface_name: "{{ item }}" + when: neutron_bridge_name | length > 0 + notify: reload systemd + + - name: Enable and start systemd service for -ovs network interface + systemd: + name: "fix-houston-{{ item }}" + enabled: yes + state: started + when: neutron_bridge_name | length > 0 + loop: "{{ neutron_bridge_name.split(',') }}" + + handlers: + - name: reload systemd + command: systemctl daemon-reload diff --git a/etc/kayobe/ansible/fix-networking.yml b/etc/kayobe/ansible/fix-networking.yml index 0b14f9ddfa..01a8332642 100644 --- a/etc/kayobe/ansible/fix-networking.yml +++ b/etc/kayobe/ansible/fix-networking.yml @@ -10,11 +10,16 @@ # Work around no known_hosts entry on first boot. ansible_ssh_common_args: "-o StrictHostKeyChecking=no" tasks: - - name: Ensure `hosts` file contains pulp entries + - name: Ensure hosts are reachable + ansible.builtin.wait_for_connection: + + - name: Ensure `hosts` file contains pulp and API entries blockinfile: path: /etc/hosts - marker: "# {mark} Kayobe Pulp entries" + marker: "# {mark} Kayobe entries" block: | 10.0.0.34 pelican pelican.service.compute.sms-lab.cloud 10.205.3.187 pulp-server pulp-server.internal.sms-cloud + 192.168.37.2 internal.infra.mos.{{ root_domain }} + 192.168.39.2 public.infra.mos.{{ root_domain }} become: true diff --git a/etc/kayobe/ansible/growroot.yml b/etc/kayobe/ansible/growroot.yml index 337453ed7a..333991aa01 100644 --- a/etc/kayobe/ansible/growroot.yml +++ b/etc/kayobe/ansible/growroot.yml @@ -75,7 +75,7 @@ vars: pv: "{{ pvs.stdout | from_json }}" disk_tmp: "{{ pv.report[0].pv[0].pv_name[:-1] }}" - disk: "{{ disk_tmp[:-1] if disk_tmp[-1] == 'p' and disk_tmp[:4] == 'nvme' else disk_tmp }}" + disk: "{{ disk_tmp[:-1] if disk_tmp[-1] == 'p' and disk_tmp[:9] == '/dev/nvme' else disk_tmp }}" part_num: "{{ pv.report[0].pv[0].pv_name[-1] }}" become: true failed_when: "growpart.rc != 0 and 'NOCHANGE' not in growpart.stdout" diff --git a/etc/kayobe/ansible/hotfix-containers.yml b/etc/kayobe/ansible/hotfix-containers.yml index b6a811801a..677105f3e7 100644 --- a/etc/kayobe/ansible/hotfix-containers.yml +++ b/etc/kayobe/ansible/hotfix-containers.yml @@ -30,13 +30,13 @@ - name: Set fact for containers list set_fact: - containers_list: host_containers.stdout + containers_list: "{{ host_containers.stdout }}" - name: Fail if no containers match given regex vars: hotfix_containers: "{{ containers_list | split('\n') | regex_search(container_hotfix_container_regex) }}" fail: - 
msg: "No containers matched. Please check your regex. Containers running on host: {{ host_containers | split('\n') }}" + msg: "No containers matched. Please check your regex. Containers running on host: {{ host_containers.stdout_lines }}" when: hotfix_containers == "" - name: Ensure hotfix-files directory exists on the remote host diff --git a/etc/kayobe/ansible/openstack-host-image-upload.yml b/etc/kayobe/ansible/openstack-host-image-upload.yml new file mode 100644 index 0000000000..2c92d24463 --- /dev/null +++ b/etc/kayobe/ansible/openstack-host-image-upload.yml @@ -0,0 +1,54 @@ +--- +# This playbook is designed to be used by the overcloud-host-image-build.yml +# GitHub workflow to upload newly-built images to a development cloud for +# testing and use in CI. +- name: Upload an OS image to Glance + hosts: seed + vars: + local_image_path: "/opt/kayobe/images/overcloud-{{ os_distribution }}-{{ os_release }}/overcloud-{{ os_distribution }}-{{ os_release }}.qcow2" + image_name: "overcloud-{{ os_distribution }}-{{ os_release }}" + tasks: + - block: + - name: Write out clouds.yaml + copy: + content: "{{ lookup('ansible.builtin.env', 'CLOUDS_YAML') }}" + dest: clouds.yaml + mode: 0600 + + - name: Write out secure.yaml + no_log: true + vars: + os_secrets: + clouds: + openstack: + auth: + application_credential_id: "{{ lookup('ansible.builtin.env', 'OS_APPLICATION_CREDENTIAL_ID') }}" + application_credential_secret: "{{ lookup('ansible.builtin.env', 'OS_APPLICATION_CREDENTIAL_SECRET') }}" + copy: + content: "{{ os_secrets | to_nice_yaml }}" + dest: secure.yaml + mode: 0600 + + - name: Ensure dependencies are installed + pip: + name: openstacksdk + + - name: Upload an image to Glance + openstack.cloud.image: + cloud: openstack + name: "{{ image_name }}" + container_format: bare + disk_format: qcow2 + state: present + filename: "{{ local_image_path }}" + + always: + - name: Remove clouds.yaml + file: + path: clouds.yaml + state: absent + + - name: Remove secure.yaml + file: + path: secure.yaml + state: absent diff --git a/etc/kayobe/ansible/ovn-fix-chassis-priorities.yml b/etc/kayobe/ansible/ovn-fix-chassis-priorities.yml index 36566b6a3f..9ba469ce7f 100644 --- a/etc/kayobe/ansible/ovn-fix-chassis-priorities.yml +++ b/etc/kayobe/ansible/ovn-fix-chassis-priorities.yml @@ -21,22 +21,25 @@ - name: Find OVN DB DB Leader hosts: "{{ ovn_nb_db_group | default('controllers') }}" tasks: - - name: Find the OVN NB DB leader - ansible.builtin.command: docker exec ovn_nb_db ovn-nbctl get-connection - changed_when: false - failed_when: false - register: ovn_check_result - check_mode: false + - name: Find OVN DB Leader + when: kolla_enable_ovn | bool + block: + - name: Find the OVN NB DB leader + ansible.builtin.command: docker exec ovn_nb_db ovn-nbctl get-connection + changed_when: false + failed_when: false + register: ovn_check_result + check_mode: false - - name: Group hosts by leader/follower role - ansible.builtin.group_by: - key: "ovn_nb_{{ 'leader' if ovn_check_result.rc == 0 else 'follower' }}" - changed_when: false + - name: Group hosts by leader/follower role + ansible.builtin.group_by: + key: "ovn_nb_{{ 'leader' if ovn_check_result.rc == 0 else 'follower' }}" + changed_when: false - - name: Assert one leader exists - ansible.builtin.assert: - that: - - groups['ovn_nb_leader'] | default([]) | length == 1 + - name: Assert one leader exists + ansible.builtin.assert: + that: + - groups['ovn_nb_leader'] | default([]) | length == 1 - name: Fix OVN chassis priorities hosts: ovn_nb_leader diff --git 
a/etc/kayobe/ansible/pulp-host-image-promote.yml b/etc/kayobe/ansible/pulp-host-image-promote.yml index d93d71d51e..42f98b423e 100644 --- a/etc/kayobe/ansible/pulp-host-image-promote.yml +++ b/etc/kayobe/ansible/pulp-host-image-promote.yml @@ -19,6 +19,9 @@ name: "{{ repository_name }}_{{ promotion_tag }}" base_path: "{{ base_path }}/{{ promotion_tag }}" register: distribution_details + until: distribution_details is success + retries: 3 + delay: 5 - name: Fail if the image does not exist fail: @@ -34,6 +37,10 @@ base_path: "{{ base_path }}/{{ promotion_tag }}" content_guard: release state: present + register: content_guard_result + until: content_guard_result is success + retries: 3 + delay: 5 - name: Print version tag and os debug: diff --git a/etc/kayobe/ansible/pulp-host-image-upload.yml b/etc/kayobe/ansible/pulp-host-image-upload.yml index a06897d90f..cc48760801 100644 --- a/etc/kayobe/ansible/pulp-host-image-upload.yml +++ b/etc/kayobe/ansible/pulp-host-image-upload.yml @@ -1,12 +1,12 @@ --- - name: Upload and create a distribution for an image - hosts: localhost + hosts: seed vars: remote_pulp_url: "{{ stackhpc_release_pulp_url }}" remote_pulp_username: "{{ stackhpc_image_repository_username }}" remote_pulp_password: "{{ stackhpc_image_repository_password }}" repository_name: "kayobe-images-{{ openstack_release }}-{{ os_distribution }}-{{ os_release }}" - base_path: "kayobe-images/{{ openstack_release }}/{{ os_distribution }}/{{ os_release }}" + pulp_base_path: "kayobe-images/{{ openstack_release }}/{{ os_distribution }}/{{ os_release }}" tasks: - name: Print image tag debug: @@ -25,6 +25,10 @@ password: "{{ remote_pulp_password }}" file: "{{ found_files.files[0].path }}" state: present + register: upload_result + until: upload_result is success + retries: 3 + delay: 60 - name: Get sha256 hash ansible.builtin.stat: @@ -40,6 +44,10 @@ sha256: "{{ file_stats.stat.checksum }}" relative_path: "{{ found_files.files[0].path | basename }}" state: present + register: file_content_result + until: file_content_result is success + retries: 3 + delay: 5 - name: Ensure file repo exists pulp.squeezer.file_repository: @@ -48,6 +56,10 @@ password: "{{ remote_pulp_password }}" name: "{{ repository_name }}" state: present + register: file_repo_result + until: file_repo_result is success + retries: 3 + delay: 5 - name: Add content to file repo pulp.squeezer.file_repository_content: @@ -58,6 +70,10 @@ present_content: - relative_path: "{{ found_files.files[0].path | basename }}" sha256: "{{ file_stats.stat.checksum }}" + register: file_repo_content_result + until: file_repo_content_result is success + retries: 3 + delay: 5 - name: Create a new publication to point to this version pulp.squeezer.file_publication: @@ -67,6 +83,9 @@ repository: "{{ repository_name }}" state: present register: publication_details + until: publication_details is success + retries: 3 + delay: 5 - name: Update distribution for latest version pulp.squeezer.file_distribution: @@ -74,11 +93,14 @@ username: "{{ remote_pulp_username }}" password: "{{ remote_pulp_password }}" name: "{{ repository_name }}_latest" - base_path: "{{ base_path }}/latest" + base_path: "{{ pulp_base_path }}/latest" publication: "{{ publication_details.publication.pulp_href }}" content_guard: development state: present register: latest_distribution_details + until: latest_distribution_details is success + retries: 3 + delay: 5 - name: Create distribution for given version pulp.squeezer.file_distribution: @@ -86,35 +108,39 @@ username: "{{ 
remote_pulp_username }}" password: "{{ remote_pulp_password }}" name: "{{ repository_name }}_{{ host_image_tag }}" - base_path: "{{ base_path }}/{{ host_image_tag }}" + base_path: "{{ pulp_base_path }}/{{ host_image_tag }}" publication: "{{ publication_details.publication.pulp_href }}" content_guard: development state: present when: latest_distribution_details.changed + register: distribution_result + until: distribution_result is success + retries: 3 + delay: 5 - name: Update new images file with versioned path lineinfile: path: /tmp/updated_images.txt - line: "{{ remote_pulp_url }}/pulp/content/{{ base_path }}/\ + line: "{{ remote_pulp_url }}/pulp/content/{{ pulp_base_path }}/\ {{ host_image_tag }}/{{ found_files.files[0].path | basename }}" create: true - name: Update new images file with latest path lineinfile: path: /tmp/updated_images.txt - line: "{{ remote_pulp_url }}/pulp/content/{{ base_path }}/\ + line: "{{ remote_pulp_url }}/pulp/content/{{ pulp_base_path }}/\ latest/{{ found_files.files[0].path | basename }}" when: latest_distribution_details.changed - name: Print versioned path debug: - msg: "New versioned path: {{ remote_pulp_url }}/pulp/content/{{ base_path }}/\ + msg: "New versioned path: {{ remote_pulp_url }}/pulp/content/{{ pulp_base_path }}/\ {{ host_image_tag }}/{{ found_files.files[0].path | basename }}" when: latest_distribution_details.changed - name: Print latest path debug: - msg: "New latest path: {{ remote_pulp_url }}/pulp/content/{{ base_path }}/\ + msg: "New latest path: {{ remote_pulp_url }}/pulp/content/{{ pulp_base_path }}/\ latest/{{ found_files.files[0].path | basename }}" when: latest_distribution_details.changed diff --git a/etc/kayobe/ansible/requirements.yml b/etc/kayobe/ansible/requirements.yml index e09ed5b855..f1ae3bb631 100644 --- a/etc/kayobe/ansible/requirements.yml +++ b/etc/kayobe/ansible/requirements.yml @@ -1,7 +1,7 @@ --- collections: - name: stackhpc.cephadm - version: 1.14.0 + version: 1.15.1 # NOTE: Pinning pulp.squeezer to 0.0.13 because 0.0.14+ depends on the # pulp_glue Python library being installed. 
- name: pulp.squeezer diff --git a/etc/kayobe/ansible/run-container-hotfix.yml b/etc/kayobe/ansible/run-container-hotfix.yml index 582ade5dac..de652e451d 100644 --- a/etc/kayobe/ansible/run-container-hotfix.yml +++ b/etc/kayobe/ansible/run-container-hotfix.yml @@ -20,3 +20,4 @@ - name: Run container_hotfix_command command: "{{ kolla_container_engine | default('docker')}} exec {{ '-u 0' if container_hotfix_become else '' }} {{ hotfix_container }} {{ container_hotfix_command }}" + when: container_hotfix_command diff --git a/etc/kayobe/ansible/smartmon-tools.yml b/etc/kayobe/ansible/smartmon-tools.yml index bb5cf5dca1..b4a064b637 100644 --- a/etc/kayobe/ansible/smartmon-tools.yml +++ b/etc/kayobe/ansible/smartmon-tools.yml @@ -12,6 +12,13 @@ state: present become: true + - name: Ensure the cron/crond service is running + service: + name: "{{ 'cron' if ansible_facts['distribution'] == 'Ubuntu' else 'crond' }}" + state: started + enabled: true + become: true + - name: Copy smartmon.sh and nvmemon.sh from scripts folder copy: src: "scripts/{{ item }}" diff --git a/etc/kayobe/ansible/templates/fix-houston-interface.service.j2 b/etc/kayobe/ansible/templates/fix-houston-interface.service.j2 new file mode 100644 index 0000000000..24696b13ea --- /dev/null +++ b/etc/kayobe/ansible/templates/fix-houston-interface.service.j2 @@ -0,0 +1,20 @@ +[Unit] +# This service addresses a specific issue when OVS HW offloading is enabled +# typically in conjunction with VF-LAG and ASAP^2 +# the DMESG log reports frequent errors on the internal OVS Bridge interface: +# "tc mirred to Houston: device bond0-ovs is down". +# This interface is down by default. The errors are mitigated by bringing the interface up. +# For further context, see: +# https://bugs.launchpad.net/charm-neutron-openvswitch/+bug/1899364 +# https://patchwork.kernel.org/project/netdevbpf/patch/c2ef23da1d9a4eb62f4e7b7c4540f9bafb553c15.1658420239.git.dcaratti@redhat.com/ +Description=Bring up {{ interface_name }} interface +After=kolla-openvswitch_vswitchd-container.service + +[Service] +Type=oneshot +ExecStartPre=/usr/bin/timeout 60s /bin/bash -c 'until ip link show {{ interface_name }}; do sleep 1; done' +ExecStart=/sbin/ip link set {{ interface_name }} up +RemainAfterExit=yes + +[Install] +WantedBy=multi-user.target diff --git a/etc/kayobe/ansible/templates/os_capacity-clouds.yml.j2 b/etc/kayobe/ansible/templates/os_capacity-clouds.yml.j2 index 89d66c0bc3..ef3c8d7a50 100644 --- a/etc/kayobe/ansible/templates/os_capacity-clouds.yml.j2 +++ b/etc/kayobe/ansible/templates/os_capacity-clouds.yml.j2 @@ -1,10 +1,15 @@ clouds: openstack: auth: - auth_url: "{{ secrets_os_exporter_auth_url }}" - application_credential_id: "{{ secrets_os_exporter_credential_id }}" - application_credential_secret: "{{ secrets_os_exporter_credential_secret }}" - region_name: "RegionOne" + auth_url: "{{ stackhpc_os_capacity_auth_url }}" + project_name: "{{ stackhpc_os_capacity_project_name }}" + domain_name: "{{ stackhpc_os_capacity_domain_name }}" + username: "{{ stackhpc_os_capacity_username }}" + password: "{{ stackhpc_os_capacity_password }}" + region_name: "{{ stackhpc_os_capacity_openstack_region_name }}" interface: "internal" identity_api_version: 3 - auth_type: "v3applicationcredential" + auth_type: "password" +{% if not stackhpc_os_capacity_openstack_verify | bool %} + verify: False +{% endif %} diff --git a/etc/kayobe/ansible/ubuntu-upgrade.yml b/etc/kayobe/ansible/ubuntu-upgrade.yml new file mode 100644 index 0000000000..928e1c52d0 --- /dev/null +++ 
b/etc/kayobe/ansible/ubuntu-upgrade.yml @@ -0,0 +1,106 @@ +--- +# To prevent Ansible role dependency errors, this playbook requires that environment variable +# ANSIBLE_ROLES_PATH is defined and includes '$KAYOBE_PATH/ansible/roles' on the Ansible control host. +- name: Migrate hosts from Ubuntu Focal 20.04 to Jammy 22.04 + hosts: overcloud:infra-vms:seed:seed-hypervisor + vars: + ansible_python_interpreter: /usr/bin/python3 + tasks: + - name: Assert that hosts are running Ubuntu Focal + assert: + that: + - ansible_facts.distribution == 'Ubuntu' + - ansible_facts.distribution_major_version == '20' + - ansible_facts.distribution_release == 'focal' + - os_distribution == 'ubuntu' + fail_msg: >- + This playbook is only designed for Ubuntu Focal 20.04 hosts. Ensure + that you are limiting it to only run on Focal hosts and + os_distribution is set to ubuntu. + + - name: Ensure apt packages are up to date + apt: + update_cache: true + upgrade: yes + become: true + + - name: Ensure do-release-upgrade is installed + package: + name: ubuntu-release-upgrader-core + state: latest + become: true + + - name: Check whether a reboot is required + stat: + path: /var/run/reboot-required + register: file_status + + - name: Reboot to apply updates + reboot: + reboot_timeout: 1200 + connect_timeout: 600 + become: true + when: file_status.stat.exists + + # NOTE: We cannot use apt_repository here because definitions must exist within the standard repos.list + - name: Ensure Jammy repo definitions exist in sources.list + blockinfile: + path: /etc/apt/sources.list + block: | + deb {{ stackhpc_repo_ubuntu_jammy_url }} jammy main restricted universe multiverse + deb {{ stackhpc_repo_ubuntu_jammy_url }} jammy-updates main restricted universe multiverse + deb {{ stackhpc_repo_ubuntu_jammy_url }} jammy-backports main restricted universe multiverse + deb {{ stackhpc_repo_ubuntu_jammy_security_url }} jammy-security main restricted universe multiverse + become: true + + - name: Do release upgrade + command: do-release-upgrade -f DistUpgradeViewNonInteractive + become: true + + - name: Ensure old venvs do not exist + file: + path: "/opt/kayobe/venvs/{{ item }}" + state: absent + loop: + - kayobe + - kolla-ansible + become: true + + - name: Update Python and current user facts before re-creating Kayobe venv + ansible.builtin.setup: + filter: "{{ kayobe_ansible_setup_filter }}" + gather_subset: "{{ kayobe_ansible_setup_gather_subset }}" + +- name: Run the Kayobe kayobe-target-venv playbook to ensure kayobe venv exists on remote host + import_playbook: "{{ lookup('ansible.builtin.env', 'VIRTUAL_ENV') }}/share/kayobe/ansible/kayobe-target-venv.yml" + +- name: Run the Kayobe network configuration playbook, to ensure definitions are not lost on reboot + import_playbook: "{{ lookup('ansible.builtin.env', 'VIRTUAL_ENV') }}/share/kayobe/ansible/network.yml" + +- name: Reboot and confirm the host is upgraded to Jammy 22.04 + hosts: overcloud:infra-vms:seed:seed-hypervisor + vars: + ansible_python_interpreter: /usr/bin/python3 + tasks: + - name: Ensure Jammy repo definitions do not exist in sources.list + blockinfile: + path: /etc/apt/sources.list + state: absent + become: true + + - name: Reboot and wait + reboot: + reboot_timeout: 1200 + connect_timeout: 600 + become: true + + - name: Update distribution facts + ansible.builtin.setup: + filter: "{{ kayobe_ansible_setup_filter }}" + gather_subset: "{{ kayobe_ansible_setup_gather_subset }}" + + - name: Assert that hosts are now using Ubuntu 22 + assert: + that: + - 
ansible_facts.distribution_major_version == '22' + - ansible_facts.distribution_release == 'jammy' diff --git a/etc/kayobe/bifrost.yml b/etc/kayobe/bifrost.yml index a9eba19dd3..d15d186130 100644 --- a/etc/kayobe/bifrost.yml +++ b/etc/kayobe/bifrost.yml @@ -116,6 +116,9 @@ # Ironic inspector deployment ramdisk location. #kolla_bifrost_inspector_deploy_ramdisk: +# Ironic inspector legacy deployment kernel location. +#kolla_bifrost_inspector_legacy_deploy_kernel: + # Timeout of hardware inspection on overcloud nodes, in seconds. Default is # {{ inspector_inspection_timeout }}. #kolla_bifrost_inspection_timeout: diff --git a/etc/kayobe/compute.yml b/etc/kayobe/compute.yml index b3940f6492..75ff73c5f0 100644 --- a/etc/kayobe/compute.yml +++ b/etc/kayobe/compute.yml @@ -63,15 +63,15 @@ ############################################################################### # Compute node LVM configuration. -# List of compute volume groups. See mrlesmithjr.manage-lvm role for +# List of compute volume groups. See mrlesmithjr.manage_lvm role for # format. #compute_lvm_groups: -# Default list of compute volume groups. See mrlesmithjr.manage-lvm role for +# Default list of compute volume groups. See mrlesmithjr.manage_lvm role for # format. #compute_lvm_groups_default: -# Additional list of compute volume groups. See mrlesmithjr.manage-lvm role +# Additional list of compute volume groups. See mrlesmithjr.manage_lvm role # for format. #compute_lvm_groups_extra: @@ -82,7 +82,7 @@ # 'docker_storage_driver' is set to 'devicemapper', or false otherwise. #compute_lvm_group_data_enabled: -# Compute LVM volume group for data. See mrlesmithjr.manage-lvm role for +# Compute LVM volume group for data. See mrlesmithjr.manage_lvm role for # format. #compute_lvm_group_data: diff --git a/etc/kayobe/containers/pulp/post.yml b/etc/kayobe/containers/pulp/post.yml index fec1abb944..967c4e37d7 100644 --- a/etc/kayobe/containers/pulp/post.yml +++ b/etc/kayobe/containers/pulp/post.yml @@ -27,3 +27,10 @@ when: - stackhpc_pulp_sync_for_local_container_build | bool - pulp_settings.changed + +- name: Login to docker registry + docker_login: + registry_url: "{{ kolla_docker_registry or omit }}" + username: "{{ kolla_docker_registry_username }}" + password: "{{ kolla_docker_registry_password }}" + reauthorize: yes diff --git a/etc/kayobe/controllers.yml b/etc/kayobe/controllers.yml index d51da14873..cdb26a592c 100644 --- a/etc/kayobe/controllers.yml +++ b/etc/kayobe/controllers.yml @@ -72,15 +72,15 @@ ############################################################################### # Controller node LVM configuration. -# List of controller volume groups. See mrlesmithjr.manage-lvm role for +# List of controller volume groups. See mrlesmithjr.manage_lvm role for # format. #controller_lvm_groups: -# Default list of controller volume groups. See mrlesmithjr.manage-lvm role for +# Default list of controller volume groups. See mrlesmithjr.manage_lvm role for # format. #controller_lvm_groups_default: -# Additional list of controller volume groups. See mrlesmithjr.manage-lvm role +# Additional list of controller volume groups. See mrlesmithjr.manage_lvm role # for format. #controller_lvm_groups_extra: @@ -91,7 +91,7 @@ # 'docker_storage_driver' is set to 'devicemapper', or false otherwise. #controller_lvm_group_data_enabled: -# Controller LVM volume group for data. See mrlesmithjr.manage-lvm role for +# Controller LVM volume group for data. See mrlesmithjr.manage_lvm role for # format. 
#controller_lvm_group_data: diff --git a/etc/kayobe/environments/aufn-ceph/a-universe-from-nothing.sh b/etc/kayobe/environments/aufn-ceph/a-universe-from-nothing.sh index 43c47c8e3a..0af69fc693 100755 --- a/etc/kayobe/environments/aufn-ceph/a-universe-from-nothing.sh +++ b/etc/kayobe/environments/aufn-ceph/a-universe-from-nothing.sh @@ -87,7 +87,7 @@ kayobe seed vm provision kayobe seed host configure # Deploy local pulp server as a container on the seed VM -kayobe seed service deploy --tags seed-deploy-containers --kolla-tags none -e deploy_containers_registry_attempt_login=False +kayobe seed service deploy --tags seed-deploy-containers --kolla-tags none # Deploying the seed restarts networking interface, run configure-local-networking.sh again to re-add routes. $KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/configure-local-networking.sh diff --git a/etc/kayobe/environments/aufn-ceph/configure-local-networking.sh b/etc/kayobe/environments/aufn-ceph/configure-local-networking.sh index ab3602d2a5..c22bbd5180 100755 --- a/etc/kayobe/environments/aufn-ceph/configure-local-networking.sh +++ b/etc/kayobe/environments/aufn-ceph/configure-local-networking.sh @@ -43,7 +43,7 @@ if ! sudo ip l show brcloud >/dev/null 2>&1; then sudo ip l set brcloud up fi -# On CentOS 8, bridges without a port are DOWN, which causes network +# On Rocky Linux, bridges without a port are DOWN, which causes network # configuration to fail. Add a dummy interface and plug it into the bridge. for i in mgmt prov cloud; do if ! sudo ip l show dummy-$i >/dev/null 2>&1; then diff --git a/etc/kayobe/environments/aufn-ceph/seed-hypervisor.yml b/etc/kayobe/environments/aufn-ceph/seed-hypervisor.yml index 6a1b7ffdf0..2f288f0303 100644 --- a/etc/kayobe/environments/aufn-ceph/seed-hypervisor.yml +++ b/etc/kayobe/environments/aufn-ceph/seed-hypervisor.yml @@ -10,5 +10,5 @@ seed_hypervisor_extra_network_interfaces: - "{{ public_net_name }}" - "{{ external_net_names[0] }}" -# Workaround change to cloud-user default login name on CentOS-Stream8 +# Workaround change to cloud-user default login name on Rocky Linux seed_hypervisor_bootstrap_user: "{{ lookup('env', 'USER') }}" diff --git a/etc/kayobe/environments/aufn-ceph/tenks.yml b/etc/kayobe/environments/aufn-ceph/tenks.yml index 9b0e9e9f42..25eac03744 100644 --- a/etc/kayobe/environments/aufn-ceph/tenks.yml +++ b/etc/kayobe/environments/aufn-ceph/tenks.yml @@ -87,3 +87,9 @@ bridge_type: linuxbridge # No placement service. wait_for_placement: false + +# NOTE(priteau): Disable libvirt_vm_trust_guest_rx_filters, which when enabled +# triggers the following errors when booting baremetal instances with Tenks on +# Libvirt 9: Cannot set interface flags on 'macvtap1': Value too large for +# defined data type +libvirt_vm_trust_guest_rx_filters: false diff --git a/etc/kayobe/environments/ci-aio/automated-setup.sh b/etc/kayobe/environments/ci-aio/automated-setup.sh index 9d05ea7ca7..d0f9a390d4 100644 --- a/etc/kayobe/environments/ci-aio/automated-setup.sh +++ b/etc/kayobe/environments/ci-aio/automated-setup.sh @@ -2,30 +2,35 @@ set -eux -cat << EOF | sudo tee -a /etc/hosts -10.205.3.187 pulp-server pulp-server.internal.sms-cloud -EOF - -if sudo vgdisplay | grep -q lvm2; then - sudo pvresize $(sudo pvs --noheadings | head -n 1 | awk '{print $1}') - sudo lvextend -L 4G /dev/rootvg/lv_home -r || true - sudo lvextend -L 4G /dev/rootvg/lv_tmp -r || true -fi - BASE_PATH=~ KAYOBE_BRANCH=stackhpc/2023.1 KAYOBE_CONFIG_BRANCH=stackhpc/2023.1 +KAYOBE_AIO_LVM=true if [[ ! 
-f $BASE_PATH/vault-pw ]]; then echo "Vault password file not found at $BASE_PATH/vault-pw" exit 1 fi +if sudo vgdisplay | grep -q lvm2; then + sudo pvresize $(sudo pvs --noheadings | head -n 1 | awk '{print $1}') + sudo lvextend -L 4G /dev/rootvg/lv_home -r || true + sudo lvextend -L 4G /dev/rootvg/lv_tmp -r || true +elif $KAYOBE_AIO_LVM; then + echo "This environment is only designed for LVM images. If possible, switch to an LVM image. + To ignore this warning, set KAYOBE_AIO_LVM to false in this script." + exit 1 +fi + +cat << EOF | sudo tee -a /etc/hosts +10.205.3.187 pulp-server pulp-server.internal.sms-cloud +EOF + if type dnf; then sudo dnf -y install git else sudo apt update - sudo apt -y install gcc git libffi-dev python3-dev python-is-python3 + sudo apt -y install gcc git libffi-dev python3-dev python-is-python3 python3-venv fi cd $BASE_PATH @@ -35,6 +40,11 @@ pushd src [[ -d kayobe-config ]] || git clone https://github.com/stackhpc/stackhpc-kayobe-config kayobe-config -b $KAYOBE_CONFIG_BRANCH popd +if ! sudo vgdisplay | grep -q lvm2; then + rm $BASE_PATH/src/kayobe-config/etc/kayobe/environments/ci-aio/inventory/group_vars/controllers/lvm.yml + sed -i -e '/controller_lvm_groups/,+2d' $BASE_PATH/src/kayobe-config/etc/kayobe/environments/ci-aio/controllers.yml +fi + mkdir -p venvs pushd venvs if [[ ! -d kayobe ]]; then @@ -62,13 +72,17 @@ fi sudo ip l set dummy1 up sudo ip l set dummy1 master breth1 +if type apt; then + sudo cp /run/systemd/network/* /etc/systemd/network +fi + export KAYOBE_VAULT_PASSWORD=$(cat $BASE_PATH/vault-pw) pushd $BASE_PATH/src/kayobe-config source kayobe-env --environment ci-aio kayobe control host bootstrap -kayobe playbook run etc/kayobe/ansible/growroot.yml +kayobe playbook run etc/kayobe/ansible/growroot.yml etc/kayobe/ansible/purge-command-not-found.yml kayobe overcloud host configure diff --git a/etc/kayobe/environments/ci-aio/kolla/globals.yml b/etc/kayobe/environments/ci-aio/kolla/globals.yml index 1d15c4473e..3967a5075a 100644 --- a/etc/kayobe/environments/ci-aio/kolla/globals.yml +++ b/etc/kayobe/environments/ci-aio/kolla/globals.yml @@ -10,7 +10,7 @@ openstack_service_workers: "1" openstack_service_rpc_workers: "1" # OpenSearch memory tuning -opensearch_heap_size: 1g +opensearch_heap_size: 200m # Increase Grafana timeout grafana_start_first_node_retries: 20 diff --git a/etc/kayobe/environments/ci-aio/stackhpc-ci.yml b/etc/kayobe/environments/ci-aio/stackhpc-ci.yml index f7f69d01ab..2814a7efae 100644 --- a/etc/kayobe/environments/ci-aio/stackhpc-ci.yml +++ b/etc/kayobe/environments/ci-aio/stackhpc-ci.yml @@ -5,12 +5,8 @@ # Docker namespace to use for Kolla images. Default is 'kolla'. kolla_docker_namespace: stackhpc-dev -############################################################################### -# Network configuration. - -# Don't touch resolv.conf: use Neutron DNS for accessing Pulp server via -# hostname. -resolv_is_managed: false +# Disable some services to reduce memory footprint. +kolla_enable_heat: false ############################################################################### # StackHPC configuration. @@ -19,8 +15,14 @@ resolv_is_managed: false # Build and deploy the development Pulp service repositories. # Use Ark's package repositories to install packages. 
stackhpc_repo_mirror_url: "{{ stackhpc_release_pulp_url }}" -stackhpc_repo_mirror_username: "{{ stackhpc_docker_registry_username }}" -stackhpc_repo_mirror_password: "{{ stackhpc_docker_registry_password }}" +stackhpc_repo_mirror_username: "skc-ci-aio" +stackhpc_repo_mirror_password: !vault | + $ANSIBLE_VAULT;1.1;AES256 + 31386366383365666135336331663635396237623139306362633933636233613765663731666338 + 3633633736333936383439623066653663333964343234350a393137383537316164323837386437 + 36613139323161643766666565643739373037623363636234343965343436653261326238393566 + 3837336661653962340a316631366463623138623530373133336665376433633437306631383666 + 30333461333535363433363336663664316634343432633766346564323833346663 # Build against released Pulp repository versions. stackhpc_repo_grafana_version: "{{ stackhpc_pulp_repo_grafana_version }}" @@ -55,19 +57,11 @@ stackhpc_include_os_minor_version_in_repo_url: true # Host and port of container registry. # Push built images to the development Pulp service registry. stackhpc_docker_registry: "{{ stackhpc_repo_mirror_url | regex_replace('^https?://', '') }}" - -# Username and password of container registry. -stackhpc_docker_registry_username: "release-train-ci" -stackhpc_docker_registry_password: !vault | - $ANSIBLE_VAULT;1.1;AES256 - 38356134376436656165303634626531653836366233383531343439646433376334396438373735 - 3135643664353934356237376134623235356137383263300a333165386562396134633534376532 - 34386133383366326639353432386235336132663839333337323739633434613934346462363031 - 3265323831663964360a643962346231386462323236373963633066393736323234303833363535 - 3664 +stackhpc_docker_registry_username: "{{ stackhpc_repo_mirror_username }}" +stackhpc_docker_registry_password: "{{ stackhpc_repo_mirror_password }}" # Override Pulp credentials to allow querying container image tags in the # check-tags.yml custom playbook. pulp_url: "{{ stackhpc_repo_mirror_url }}" -pulp_username: "{{ stackhpc_docker_registry_username }}" -pulp_password: "{{ stackhpc_docker_registry_password }}" +pulp_username: "{{ stackhpc_repo_mirror_username }}" +pulp_password: "{{ stackhpc_repo_mirror_password }}" diff --git a/etc/kayobe/environments/ci-builder/stackhpc-ci.yml b/etc/kayobe/environments/ci-builder/stackhpc-ci.yml index 4f79929193..dcd6582bbd 100644 --- a/etc/kayobe/environments/ci-builder/stackhpc-ci.yml +++ b/etc/kayobe/environments/ci-builder/stackhpc-ci.yml @@ -26,13 +26,7 @@ kolla_enable_octavia: true kolla_enable_opensearch: true kolla_enable_prometheus: true kolla_enable_redis: true - -############################################################################### -# Network configuration. - -# Don't touch resolv.conf: use Neutron DNS for accessing Pulp server via -# hostname. -resolv_is_managed: false +kolla_build_neutron_ovs: true ############################################################################### # StackHPC configuration. @@ -41,8 +35,14 @@ resolv_is_managed: false # Build against the development Pulp service repositories. # Use Ark's package repositories to install packages. 
stackhpc_repo_mirror_url: "{{ stackhpc_repo_mirror_auth_proxy_url if stackhpc_repo_mirror_auth_proxy_enabled | bool else stackhpc_release_pulp_url }}" -stackhpc_repo_mirror_username: "{{ stackhpc_docker_registry_username }}" -stackhpc_repo_mirror_password: "{{ stackhpc_docker_registry_password }}" +stackhpc_repo_mirror_username: "skc-ci-aio" +stackhpc_repo_mirror_password: !vault | + $ANSIBLE_VAULT;1.1;AES256 + 31386366383365666135336331663635396237623139306362633933636233613765663731666338 + 3633633736333936383439623066653663333964343234350a393137383537316164323837386437 + 36613139323161643766666565643739373037623363636234343965343436653261326238393566 + 3837336661653962340a316631366463623138623530373133336665376433633437306631383666 + 30333461333535363433363336663664316634343432633766346564323833346663 # Build against released Pulp repository versions. stackhpc_repo_grafana_version: "{{ stackhpc_pulp_repo_grafana_version }}" diff --git a/etc/kayobe/environments/ci-multinode/cephadm.yml b/etc/kayobe/environments/ci-multinode/cephadm.yml index 7885a57359..4a9d3f4488 100644 --- a/etc/kayobe/environments/ci-multinode/cephadm.yml +++ b/etc/kayobe/environments/ci-multinode/cephadm.yml @@ -2,6 +2,12 @@ ############################################################################### # Cephadm deployment configuration. +# Ceph release name. +cephadm_ceph_release: "{{ 'quincy' if (ansible_facts['distribution_release'] == 'jammy' or ansible_facts.distribution_major_version == '9') else 'pacific' }}" + +# Ceph container image tag. +cephadm_image_tag: "{{ 'v17.2.7' if cephadm_ceph_release == 'quincy' else 'v16.2.14' }}" + # Ceph OSD specification. cephadm_osd_spec: service_type: osd diff --git a/etc/kayobe/environments/ci-multinode/kolla/globals.yml b/etc/kayobe/environments/ci-multinode/kolla/globals.yml index f7f7ca77df..0f9dfe6f01 100644 --- a/etc/kayobe/environments/ci-multinode/kolla/globals.yml +++ b/etc/kayobe/environments/ci-multinode/kolla/globals.yml @@ -1,4 +1,9 @@ --- +# Most development environments will use nested virtualisation, and we can't +# guarantee that nested KVM support is available. Use QEMU as a lowest common +# denominator. +nova_compute_virt_type: qemu + # Reduce the control plane's memory footprint by limiting the number of worker # processes to two per-service when running in a VM. openstack_service_workers: "{% raw %}{{ [ansible_facts.processor_vcpus, 2 if ansible_facts.virtualization_role == 'guest' else 5] | min }}{% endraw %}" diff --git a/etc/kayobe/environments/ci-multinode/stackhpc-ci.yml b/etc/kayobe/environments/ci-multinode/stackhpc-ci.yml index f312ed479a..d7cc442d60 100644 --- a/etc/kayobe/environments/ci-multinode/stackhpc-ci.yml +++ b/etc/kayobe/environments/ci-multinode/stackhpc-ci.yml @@ -5,19 +5,21 @@ # Docker namespace to use for Kolla images. Default is 'kolla'. kolla_docker_namespace: stackhpc-dev -############################################################################### -# Network configuration. - -# Don't touch resolv.conf: use Neutron DNS for accessing Pulp server via -# hostname. -resolv_is_managed: false - ############################################################################### # StackHPC configuration. # Host and port of a package repository mirror. # Build and deploy the development Pulp service repositories. -stackhpc_repo_mirror_url: "http://pulp-server.internal.sms-cloud:8080" +# Use Ark's package repositories to install packages. 
+stackhpc_repo_mirror_url: "{{ stackhpc_release_pulp_url }}" +stackhpc_repo_mirror_username: "skc-ci-aio" +stackhpc_repo_mirror_password: !vault | + $ANSIBLE_VAULT;1.1;AES256 + 36373536303261313239613761653261663437356566343865383563346334396136653666383765 + 6634396534653865633936653038383132396532386665370a366562383166353966663838316266 + 65333133636330623936623438666632316238376264313234346333346461623765633163353635 + 6565326136313564320a303231383438333062643533333335663034613439393665656162626137 + 65356232656164663831316530333136336362393636656566353635306565626636 # Build and deploy released Pulp repository versions. stackhpc_repo_grafana_version: "{{ stackhpc_pulp_repo_grafana_version }}" @@ -53,12 +55,5 @@ stackhpc_include_os_minor_version_in_repo_url: true # Push built images to the development Pulp service registry. stackhpc_docker_registry: "{{ stackhpc_repo_mirror_url | regex_replace('^https?://', '') }}" -# Username and password of container registry. -stackhpc_docker_registry_username: "stackhpc-kayobe-ci" -stackhpc_docker_registry_password: !vault | - $ANSIBLE_VAULT;1.1;AES256 - 33356166343730633865363431306535613736663764373034396132356131343066636530393534 - 3262646436663034633131316438633230383330633533350a386365313239303464383636376338 - 61656662333939333063343131633963636431663136643137636664633233633133396339613861 - 3038613063626138610a333566393937643630366564653163613364323965396130613433316537 - 39653335393831633362343934363866346262613166393561666336623062393935 +stackhpc_docker_registry_username: "{{ stackhpc_repo_mirror_username }}" +stackhpc_docker_registry_password: "{{ stackhpc_repo_mirror_password }}" diff --git a/etc/kayobe/hooks/overcloud-service-deploy/post.d/deploy-os-capacity-exporter.yml b/etc/kayobe/hooks/overcloud-service-deploy/post.d/deploy-os-capacity-exporter.yml new file mode 120000 index 0000000000..0cc70aace5 --- /dev/null +++ b/etc/kayobe/hooks/overcloud-service-deploy/post.d/deploy-os-capacity-exporter.yml @@ -0,0 +1 @@ +../../../ansible/deploy-os-capacity-exporter.yml \ No newline at end of file diff --git a/etc/kayobe/infra-vms.yml b/etc/kayobe/infra-vms.yml index d89e3653f9..4fb76ddf75 100644 --- a/etc/kayobe/infra-vms.yml +++ b/etc/kayobe/infra-vms.yml @@ -92,15 +92,15 @@ ############################################################################### # Infrastructure VM node LVM configuration. -# List of infrastructure vm volume groups. See mrlesmithjr.manage-lvm role for +# List of infrastructure vm volume groups. See mrlesmithjr.manage_lvm role for # format. #infra_vm_lvm_groups: -# Default list of infrastructure vm volume groups. See mrlesmithjr.manage-lvm +# Default list of infrastructure vm volume groups. See mrlesmithjr.manage_lvm # role for format. #infra_vm_lvm_groups_default: -# Additional list of infrastructure vm volume groups. See mrlesmithjr.manage-lvm +# Additional list of infrastructure vm volume groups. See mrlesmithjr.manage_lvm # role for format. #infra_vm_lvm_groups_extra: @@ -111,7 +111,7 @@ # 'docker_storage_driver' is set to 'devicemapper', or false otherwise. #infra_vm_lvm_group_data_enabled: -# Infrastructure VM LVM volume group for data. See mrlesmithjr.manage-lvm role +# Infrastructure VM LVM volume group for data. See mrlesmithjr.manage_lvm role # for format. 
#infra_vm_lvm_group_data: diff --git a/etc/kayobe/kolla-image-tags.yml b/etc/kayobe/kolla-image-tags.yml index b9fa290c17..df3b5f4b64 100644 --- a/etc/kayobe/kolla-image-tags.yml +++ b/etc/kayobe/kolla-image-tags.yml @@ -5,23 +5,24 @@ kolla_image_tags: openstack: rocky-9: 2023.1-rocky-9-20240202T105928 - ubuntu-jammy: 2023.1-ubuntu-jammy-20231011T200357 - bifrost: - ubuntu-jammy: 2023.1-ubuntu-jammy-20231228T140806 - cloudkitty: - ubuntu-jammy: 2023.1-ubuntu-jammy-20231115T110235 + ubuntu-jammy: 2023.1-ubuntu-jammy-20240129T151608 haproxy_ssh: - ubuntu-jammy: 2023.1-ubuntu-jammy-20240104T071640 rocky-9: 2023.1-rocky-9-20240205T162323 + ubuntu-jammy: 2023.1-ubuntu-jammy-20240221T133905 + heat: + rocky-9: 2023.1-rocky-9-20240319T134201 + ubuntu-jammy: 2023.1-ubuntu-jammy-20240319T134201 + horizon: + ubuntu-jammy: 2023.1-ubuntu-jammy-20240402T104530 letsencrypt: - ubuntu-jammy: 2023.1-ubuntu-jammy-20240104T071640 rocky-9: 2023.1-rocky-9-20240205T162323 + ubuntu-jammy: 2023.1-ubuntu-jammy-20240221T133905 + magnum: + rocky-9: 2023.1-rocky-9-20240422T152338 + ubuntu-jammy: 2023.1-ubuntu-jammy-20240422T152338 neutron: rocky-9: 2023.1-rocky-9-20240202T145927 - ubuntu-jammy: 2023.1-ubuntu-jammy-20231220T222020 - nova: - ubuntu-jammy: 2023.1-ubuntu-jammy-20231220T222020 - octavia: - ubuntu-jammy: 2023.1-ubuntu-jammy-20231220T222020 - opensearch: - ubuntu-jammy: 2023.1-ubuntu-jammy-20231214T151917 + ubuntu-jammy: 2023.1-ubuntu-jammy-20240221T103817 + grafana: + rocky-9: 2023.1-rocky-9-20240313T165255 + ubuntu-jammy: 2023.1-ubuntu-jammy-20240313T165255 diff --git a/etc/kayobe/kolla.yml b/etc/kayobe/kolla.yml index c0d59610c0..9479ba6c1a 100644 --- a/etc/kayobe/kolla.yml +++ b/etc/kayobe/kolla.yml @@ -320,13 +320,15 @@ kolla_build_blocks: ADD additions-archive / grafana_plugins_install: | RUN grafana-cli plugins install vonage-status-panel \ - && grafana-cli plugins install grafana-piechart-panel + && grafana-cli plugins install grafana-piechart-panel \ + && grafana-cli plugins install grafana-opensearch-datasource \ + && grafana-cli plugins install gnocchixyz-gnocchi-datasource ironic_inspector_header: | ADD additions-archive / magnum_base_footer: | RUN curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | head -n -1 | bash {% raw %} - {% set magnum_capi_packages = ['git+https://github.com/stackhpc/magnum-capi-helm.git@v0.1.0'] %} + {% set magnum_capi_packages = ['git+https://github.com/stackhpc/magnum-capi-helm.git@v0.13.0'] %} RUN {{ macros.install_pip(magnum_capi_packages | customizable("pip_packages")) }} {% endraw %} # Dict mapping image customization variable names to their values. @@ -365,7 +367,7 @@ kolla_build_customizations: "{{ kolla_build_customizations_common | combine(koll # Dict mapping Kolla Dockerfile ARG names to their values. kolla_build_args: - node_exporter_version: "1.5.0" # kolla has 1.4.0 + node_exporter_version: "1.5.0" # kolla has 1.4.0 node_exporter_sha256sum: "af999fd31ab54ed3a34b9f0b10c28e9acee9ef5ac5a5d5edfdde85437db7acbb" ############################################################################### @@ -761,6 +763,10 @@ kolla_enable_prometheus: true # Kolla passwords file. #kolla_ansible_default_custom_passwords: +# Dictionary containing extra custom passwords to add or override in the Kolla +# passwords file. +#kolla_ansible_extra_custom_passwords: + # Dictionary containing custom passwords to add or override in the Kolla # passwords file. 
#kolla_ansible_custom_passwords: @@ -800,7 +806,7 @@ kolla_enable_prometheus: true # Path to a CA certificate file to use for the OS_CACERT environment variable # in public-openrc.sh file when TLS is enabled, instead of Kolla-Ansible's # default. -#kolla_external_fqdn_cacert: +#kolla_public_openrc_cacert: # Internal API certificate bundle. # @@ -813,7 +819,7 @@ kolla_enable_prometheus: true # Path to a CA certificate file to use for the OS_CACERT environment variable # in admin-openrc.sh file when TLS is enabled, instead of Kolla-Ansible's # default. -#kolla_internal_fqdn_cacert: +#kolla_admin_openrc_cacert: ############################################################################### # Proxy configuration diff --git a/etc/kayobe/kolla/config/grafana/dashboards/ceph/ceph_osds.json b/etc/kayobe/kolla/config/grafana/dashboards/ceph/ceph_osds.json index 6c6f525a7e..826701984b 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/ceph/ceph_osds.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/ceph/ceph_osds.json @@ -1,10 +1,47 @@ {% raw %} { + "__inputs": [], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.1.4" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph (old)", + "version": "" + }, + { + "type": "panel", + "id": "piechart", + "name": "Pie chart", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table-old", + "name": "Table (old)", + "version": "" + } + ], "annotations": { "list": [ { "builtIn": 1, - "datasource": "-- Grafana --", + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", @@ -14,11 +51,11 @@ ] }, "editable": true, - "gnetId": null, + "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 191, - "iteration": 1616693984817, + "id": null, "links": [], + "liveNow": false, "panels": [ { "aliasColors": { @@ -27,7 +64,15 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$datasource", + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -52,9 +97,10 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "10.1.4", "pointradius": 5, "points": false, "renderer": "flot", @@ -64,6 +110,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "avg (irate(ceph_osd_op_r_latency_sum[5m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[5m]) * 1000)", "format": "time_series", "intervalFactor": 1, @@ -71,6 +120,9 @@ "refId": "A" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "max (irate(ceph_osd_op_r_latency_sum[5m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[5m]) * 1000)", "format": "time_series", "intervalFactor": 1, @@ -78,6 +130,9 @@ "refId": "B" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "quantile(0.95,\n (irate(ceph_osd_op_r_latency_sum[5m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[5m]) * 1000)\n)", "format": "time_series", "intervalFactor": 1, @@ -86,9 +141,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "OSD Read Latencies", "tooltip": { "shared": true, @@ -97,38 +150,32 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, 
"show": true, "values": [] }, "yaxes": [ { "format": "ms", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": false } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { "columns": [], - "datasource": "$datasource", + "datasource": { + "uid": "$datasource" + }, "description": "This table shows the osd's that are delivering the 10 highest read latencies within the cluster", "fontSize": "100%", "gridPos": { @@ -139,19 +186,15 @@ }, "id": 15, "links": [], - "options": {}, - "pageSize": null, "scroll": true, "showHeader": true, "sort": { - "col": null, "desc": false }, "styles": [ { "alias": "OSD ID", "align": "auto", - "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -167,7 +210,6 @@ { "alias": "Latency (ms)", "align": "auto", - "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -183,7 +225,6 @@ { "alias": "", "align": "auto", - "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -199,6 +240,9 @@ ], "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "topk(10,\n (sort(\n (irate(ceph_osd_op_r_latency_sum[5m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[5m]) * 1000)\n ))\n)\n\n", "format": "table", "instant": true, @@ -209,7 +253,7 @@ ], "title": "Highest READ Latencies", "transform": "table", - "type": "table" + "type": "table-old" }, { "aliasColors": { @@ -218,7 +262,15 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$datasource", + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -243,9 +295,10 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "10.1.4", "pointradius": 5, "points": false, "renderer": "flot", @@ -255,6 +308,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "avg (rate(ceph_osd_op_w_latency_sum[10m]) / on (ceph_daemon) rate(ceph_osd_op_w_latency_count[10m]) * 1000)", "format": "time_series", "intervalFactor": 1, @@ -262,6 +318,9 @@ "refId": "A" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "max (irate(ceph_osd_op_w_latency_sum[5m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[5m]) * 1000)", "format": "time_series", "hide": false, @@ -270,6 +329,9 @@ "refId": "B" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "quantile(0.95,\n (irate(ceph_osd_op_w_latency_sum[5m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[5m]) * 1000)\n)", "format": "time_series", "hide": false, @@ -279,9 +341,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "OSD Write Latencies", "tooltip": { "shared": true, @@ -290,38 +350,32 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ms", - "label": null, "logBase": 1, - "max": null, "min": "0", "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": false } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { "columns": [], - "datasource": "$datasource", + "datasource": { + "uid": "$datasource" + }, "description": "This table shows the osd's that are delivering the 10 highest 
write latencies within the cluster", "fontSize": "100%", "gridPos": { @@ -332,19 +386,15 @@ }, "id": 16, "links": [], - "options": {}, - "pageSize": null, "scroll": true, "showHeader": true, "sort": { - "col": null, "desc": false }, "styles": [ { "alias": "OSD ID", "align": "auto", - "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -360,7 +410,6 @@ { "alias": "Latency (ms)", "align": "auto", - "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -376,7 +425,6 @@ { "alias": "", "align": "auto", - "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -392,6 +440,9 @@ ], "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "topk(10,\n (sort(\n (irate(ceph_osd_op_w_latency_sum[5m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[5m]) * 1000)\n ))\n)\n\n", "format": "table", "instant": true, @@ -402,12 +453,13 @@ ], "title": "Highest WRITE Latencies", "transform": "table", - "type": "table" + "type": "table-old" }, { - "cacheTimeout": null, "columns": [], - "datasource": "$datasource", + "datasource": { + "uid": "$datasource" + }, "fontSize": "100%", "gridPos": { "h": 8, @@ -417,8 +469,6 @@ }, "id": 2, "links": [], - "options": {}, - "pageSize": null, "pluginVersion": "6.6.1", "showHeader": true, "sort": { @@ -436,7 +486,6 @@ { "alias": "", "align": "right", - "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -451,6 +500,9 @@ ], "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "count by (device_class) (ceph_osd_metadata)", "format": "table", "instant": true, @@ -459,16 +511,15 @@ "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "OSD Types Summary", "transform": "table", - "type": "table" + "type": "table-old" }, { - "cacheTimeout": null, "columns": [], - "datasource": "$datasource", + "datasource": { + "uid": "$datasource" + }, "fontSize": "100%", "gridPos": { "h": 8, @@ -479,8 +530,6 @@ "hideTimeOverride": true, "id": 4, "links": [], - "options": {}, - "pageSize": null, "showHeader": true, "sort": { "col": 0, @@ -497,7 +546,6 @@ { "alias": "", "align": "right", - "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", @@ -512,6 +560,9 @@ ], "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "count(ceph_bluefs_wal_total_bytes)", "format": "time_series", "instant": true, @@ -521,6 +572,9 @@ "step": 240 }, { + "datasource": { + "uid": "$datasource" + }, "expr": "count(ceph_osd_metadata) - count(ceph_bluefs_wal_total_bytes)", "format": "time_series", "instant": true, @@ -530,6 +584,9 @@ "step": 240 }, { + "datasource": { + "uid": "$datasource" + }, "expr": "absent(ceph_bluefs_wal_total_bytes)*count(ceph_osd_metadata)", "format": "time_series", "instant": true, @@ -539,18 +596,36 @@ "step": 240 } ], - "timeFrom": null, - "timeShift": null, "title": "OSD Objectstore Types", "transform": "timeseries_to_columns", - "type": "table" + "type": "table-old" }, { - "cacheTimeout": null, - "columns": [], - "datasource": "$datasource", + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, "description": "The pie chart shows the various OSD sizes used within the cluster", - "fontSize": "100%", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "decimals": 2, + "displayName": "", + "mappings": [], + "noValue": "0", + "unit": 
"short" + }, + "overrides": [] + }, "gridPos": { "h": 8, "w": 4, @@ -560,39 +635,35 @@ "hideTimeOverride": true, "id": 8, "links": [], - "options": {}, - "pageSize": null, - "showHeader": true, - "sort": { - "col": 0, - "desc": true - }, - "styles": [ - { - "alias": "Time", - "align": "auto", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "date" + "options": { + "displayLabels": [ + "name" + ], + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - { - "alias": "", - "align": "right", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "decimals": 2, - "pattern": "/.*/", - "thresholds": [], - "type": "number", - "unit": "short" + "fields": "", + "values": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" } - ], + }, + "pluginVersion": "10.1.4", "targets": [ { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", "expr": "count(ceph_osd_stat_bytes < 1099511627776)", "format": "time_series", "instant": true, @@ -602,6 +673,10 @@ "step": 2 }, { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", "expr": "count(ceph_osd_stat_bytes >= 1099511627776 < 2199023255552)", "format": "time_series", "instant": true, @@ -611,6 +686,10 @@ "step": 2 }, { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", "expr": "count(ceph_osd_stat_bytes >= 2199023255552 < 3298534883328)", "format": "time_series", "instant": true, @@ -620,6 +699,10 @@ "step": 2 }, { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", "expr": "count(ceph_osd_stat_bytes >= 3298534883328 < 4398046511104)", "format": "time_series", "instant": true, @@ -629,6 +712,10 @@ "step": 2 }, { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", "expr": "count(ceph_osd_stat_bytes >= 4398046511104 < 6597069766656)", "format": "time_series", "instant": true, @@ -638,7 +725,11 @@ "step": 2 }, { - "expr": "count(ceph_osd_stat_bytes >= 6597069766656 < 8796093022208)", + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "count(ceph_osd_stat_bytes >= 6597069766656 < 8796093022208) ", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -647,6 +738,10 @@ "step": 2 }, { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", "expr": "count(ceph_osd_stat_bytes >= 8796093022208 < 10995116277760)", "format": "time_series", "instant": true, @@ -656,6 +751,10 @@ "step": 2 }, { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", "expr": "count(ceph_osd_stat_bytes >= 10995116277760 < 13194139533312)", "format": "time_series", "instant": true, @@ -665,6 +764,10 @@ "step": 2 }, { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", "expr": "count(ceph_osd_stat_bytes >= 13194139533312)", "format": "time_series", "instant": true, @@ -674,19 +777,25 @@ "step": 2 } ], - "timeFrom": null, - "timeShift": null, "title": "OSD Size Summary", - "transform": "timeseries_to_columns", - "type": "table" + "transformations": [], + "type": "piechart" }, { "aliasColors": {}, "bars": true, "dashLength": 10, "dashes": false, - "datasource": "$datasource", + "datasource": { + "uid": "$datasource" + }, "description": "Each bar indicates the number of OSD's that have a PG count in a specific range as shown on the x axis.", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 
1, "fillGradient": 0, "gridPos": { @@ -715,9 +824,10 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "10.1.4", "pointradius": 5, "points": false, "renderer": "flot", @@ -727,6 +837,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "ceph_osd_numpg\n", "format": "time_series", "instant": true, @@ -736,9 +849,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Distribution of PGs per OSD", "tooltip": { "shared": false, @@ -749,7 +860,6 @@ "xaxis": { "buckets": 20, "mode": "histogram", - "name": null, "show": true, "values": [ "total" @@ -761,27 +871,25 @@ "format": "short", "label": "# of OSDs", "logBase": 1, - "max": null, "min": "0", "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": false } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { "collapsed": false, - "datasource": null, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, @@ -790,6 +898,15 @@ }, "id": 20, "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], "title": "R/W Profile", "type": "row" }, @@ -798,8 +915,16 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "$datasource", + "datasource": { + "uid": "$datasource" + }, "description": "Show the read/write workload profile overtime", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -824,9 +949,10 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "10.1.4", "pointradius": 5, "points": false, "renderer": "flot", @@ -836,6 +962,9 @@ "steppedLine": false, "targets": [ { + "datasource": { + "uid": "$datasource" + }, "expr": "round(sum(irate(ceph_pool_rd[5m])))", "format": "time_series", "intervalFactor": 1, @@ -843,6 +972,9 @@ "refId": "A" }, { + "datasource": { + "uid": "$datasource" + }, "expr": "round(sum(irate(ceph_pool_wr[5m])))", "format": "time_series", "intervalFactor": 1, @@ -851,9 +983,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Read/Write Profile", "tooltip": { "shared": true, @@ -862,46 +992,38 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } } ], - "refresh": false, - "schemaVersion": 22, + "refresh": "", + "schemaVersion": 38, "style": "dark", "tags": [], "templating": { "list": [ { "current": { + "selected": false, "text": "Prometheus", - "value": "Prometheus" + "value": "PBFA97CFB590B2093" }, "hide": 0, "includeAll": false, @@ -949,6 +1071,7 @@ "timezone": "", "title": "Ceph OSD Overview", "uid": "lo02I1Aiz", - "version": 9 + "version": 1, + "weekStart": "" } {% endraw %} diff --git a/etc/kayobe/kolla/config/grafana/dashboards/ceph/ceph_overview.json b/etc/kayobe/kolla/config/grafana/dashboards/ceph/ceph_overview.json index e041d8ff0d..e5258168a6 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/ceph/ceph_overview.json 
+++ b/etc/kayobe/kolla/config/grafana/dashboards/ceph/ceph_overview.json @@ -1924,23 +1924,25 @@ } ], "spaceLength": 10, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "ceph_cluster_total_objects", + "datasource": { + "uid": "$datasource" + }, + "expr": "ceph_pool_objects * on(pool_id) group_left(instance,name) ceph_pool_metadata", "format": "time_series", "interval": "$interval", "intervalFactor": 1, - "legendFormat": "Total", + "legendFormat": "{{name}}", + "range": true, "refId": "A", "step": 300 } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Objects in the Cluster", "tooltip": { "msResolution": false, diff --git a/etc/kayobe/kolla/config/grafana/dashboards/ceph/ceph_pools.json b/etc/kayobe/kolla/config/grafana/dashboards/ceph/ceph_pools.json index f2882ed603..b3a4af1bda 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/ceph/ceph_pools.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/ceph/ceph_pools.json @@ -657,7 +657,7 @@ ], "targets": [ { - "expr": "topk(1,((ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)) * on(pool_id) group_left(name) ceph_pool_metadata))", + "expr": "topk($topk,((ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)) * on(pool_id) group_left(name) ceph_pool_metadata))", "format": "table", "hide": false, "instant": true, diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_cloud_dashboard.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_cloud_dashboard.json index a777c332e0..7bdbdee9ff 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_cloud_dashboard.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_cloud_dashboard.json @@ -25,7 +25,6 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 2084495, "links": [], "liveNow": false, "panels": [ @@ -66,7 +65,7 @@ }, "gridPos": { "h": 4, - "w": 2.4, + "w": 4.8, "x": 0, "y": 1 }, @@ -86,7 +85,7 @@ }, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "10.1.5", "repeat": "flavors", "repeatDirection": "h", "targets": [ @@ -96,7 +95,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "openstack_free_capacity_by_flavor_total{flavor_name=~\"$flavors\"}", + "expr": "round(avg_over_time(openstack_free_capacity_by_flavor_total{flavor_name=~\"$flavors\"}[30m]), 1)", "legendFormat": "__auto", "range": true, "refId": "A" @@ -424,6 +423,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -465,6 +465,7 @@ "y": 17 }, "id": 5, + "interval": "10m", "options": { "legend": { "calcs": [ @@ -489,7 +490,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "openstack_project_usage{placement_resource=\"MEMORY_MB\"}", + "expr": "avg_over_time(openstack_project_usage{placement_resource=\"MEMORY_MB\"}[30m])", "legendFormat": "{{project_name}}", "range": true, "refId": "A" @@ -522,6 +523,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -552,7 +554,7 @@ } ] }, - "unit": "decmbytes" + "unit": "none" }, "overrides": [] }, @@ -563,6 +565,7 @@ "y": 17 }, "id": 16, + "interval": "10m", "options": { "legend": { "calcs": [ @@ -587,7 +590,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "openstack_project_usage{placement_resource=\"VCPU\"}", + "expr": "avg_over_time(openstack_project_usage{placement_resource=\"VCPU\"}[30m])", 
"legendFormat": "VCPU {{project_name}}", "range": true, "refId": "A" @@ -598,7 +601,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "openstack_project_usage{placement_resource=\"PCPU\"}", + "expr": "avg_over_time(openstack_project_usage{placement_resource=\"PCPU\"}[30m])", "hide": false, "legendFormat": "PCPU {{project_name}}", "range": true, @@ -646,6 +649,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "smooth", "lineStyle": { "fill": "solid" @@ -689,6 +693,7 @@ "y": 26 }, "id": 6, + "interval": "10m", "options": { "legend": { "calcs": [ @@ -715,15 +720,15 @@ }, "editorMode": "code", "exemplar": false, - "expr": "openstack_free_capacity_hypervisor_by_flavor{flavor_name=~\"$flavors\"}", + "expr": "avg_over_time(openstack_free_capacity_hypervisor_by_flavor{flavor_name=~\"$flavors\"}[30m])", "format": "time_series", "instant": false, "legendFormat": "{{flavor_name}} on {{hypervisor}}", "range": true, - "refId": "Avaliable Capacity on Hypervisors" + "refId": "Available Capacity on Hypervisors" } ], - "title": "Avaliable Capacity for $flavors", + "title": "Available Capacity for $flavors", "type": "timeseries" }, { @@ -750,6 +755,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -791,6 +797,7 @@ "y": 26 }, "id": 4, + "interval": "10m", "options": { "legend": { "calcs": [ @@ -814,8 +821,8 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "editorMode": "builder", - "expr": "openstack_hypervisor_placement_allocatable_capacity{resource=\"MEMORY_MB\"} - on(hypervisor) openstack_hypervisor_placement_allocated{resource=\"MEMORY_MB\"}", + "editorMode": "code", + "expr": "avg_over_time(openstack_hypervisor_placement_allocatable_capacity{resource=\"MEMORY_MB\"}[30m]) - on(hypervisor) avg_over_time(openstack_hypervisor_placement_allocated{resource=\"MEMORY_MB\"}[30m])", "legendFormat": "{{hypervisor}}", "range": true, "refId": "A" @@ -885,7 +892,7 @@ ] }, "time": { - "from": "now-24h", + "from": "now-2d", "to": "now" }, "timepicker": {}, @@ -895,4 +902,4 @@ "version": 1, "weekStart": "" } -{% endraw %} +{% endraw %} \ No newline at end of file diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_project_dashboard.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_project_dashboard.json index c3a483cf93..acb37f1953 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_project_dashboard.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_project_dashboard.json @@ -25,7 +25,6 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 2084480, "links": [], "liveNow": false, "panels": [ @@ -89,9 +88,10 @@ "fields": "", "values": false }, - "showUnfilled": true + "showUnfilled": true, + "valueMode": "color" }, - "pluginVersion": "9.4.7", + "pluginVersion": "10.1.5", "targets": [ { "datasource": { @@ -134,6 +134,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -175,6 +176,7 @@ "y": 11 }, "id": 5, + "interval": "10m", "options": { "legend": { "calcs": [ @@ -199,7 +201,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(openstack_project_usage{project_id=~\"${project_id}\"}) by (placement_resource)", + "expr": "sum(avg_over_time(openstack_project_usage{project_id=~\"${project_id}\"}[30m:])) by (placement_resource)", "hide": false, "legendFormat": "{{placement_resource}}", "range": true, @@ 
-234,6 +236,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -275,6 +278,7 @@ "y": 11 }, "id": 19, + "interval": "10m", "options": { "legend": { "calcs": [ @@ -299,7 +303,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "openstack_project_quota{project_id=~\"${project_id}\"}", + "expr": "avg_over_time(openstack_project_quota{project_id=~\"${project_id}\"}[30m:])", "hide": false, "legendFormat": "{{project_name}}:{{quota_resource}}", "range": true, @@ -333,6 +337,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -433,6 +438,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -474,6 +480,7 @@ "y": 20 }, "id": 20, + "interval": "30m", "options": { "legend": { "calcs": [ @@ -498,7 +505,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(irate(libvirt_domain_vcpu_time_seconds_total{}[5m]) / ignoring(instance,vcpu) group_left(domain) libvirt_domain_info_virtual_cpus{}) by (domain) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}", + "expr": "avg(sum(irate(libvirt_domain_vcpu_time_seconds_total{}[5m]) / ignoring(instance,vcpu) group_left(domain) libvirt_domain_info_virtual_cpus{}) by (domain) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}) by (project_name)", "hide": false, "legendFormat": "{{instance_name}}", "range": true, @@ -532,6 +539,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -598,7 +606,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "libvirt_domain_memory_stats_used_percent * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}", + "expr": "avg(libvirt_domain_memory_stats_used_percent * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}) by (project_name)", "hide": false, "legendFormat": "{{instance_name}}", "range": true, @@ -633,6 +641,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -700,9 +709,9 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(libvirt_domain_block_stats_read_time_seconds_total[5m]) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}", + "expr": "avg(rate(libvirt_domain_block_stats_read_time_seconds_total[5m]) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}) by (project_name)", "hide": false, - "legendFormat": "{{instance_name}} : read {{target_device}}", + "legendFormat": "read: {{project_name}}", "range": true, "refId": "B" }, @@ -712,9 +721,9 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(libvirt_domain_block_stats_write_time_seconds_total[5m]) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"} * -1", + "expr": "avg(rate(libvirt_domain_block_stats_write_time_seconds_total[5m]) * on(domain) group_left(instance_name,project_name,project_uuid) 
libvirt_domain_info_meta{project_uuid=~\"${project_id}\"} * -1) by (project_name)", "hide": false, - "legendFormat": "{{instance_name}} : write {{target_device}}", + "legendFormat": " write: {{project_name}}", "range": true, "refId": "C" } @@ -732,7 +741,7 @@ }, "id": 15, "panels": [], - "title": "Per Hypervisor Free Capacity", + "title": "Per Instance Utilization", "type": "row" }, { @@ -740,67 +749,319 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "", "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 23, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "semi-dark-yellow", + "color": "green", "value": null }, { - "color": "green", - "value": 4 + "color": "red", + "value": 80 } ] - } + }, + "unit": "percentunit" }, "overrides": [] }, "gridPos": { - "h": 10, - "w": 24, + "h": 9, + "w": 8, "x": 0, "y": 30 }, - "id": 2, + "id": 23, + "interval": "30m", "options": { - "displayMode": "basic", - "minVizHeight": 10, - "minVizWidth": 0, - "orientation": "horizontal", - "reduceOptions": { + "legend": { "calcs": [ - "lastNotNull" + "min", + "max" ], - "fields": "", - "values": false + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true }, - "showUnfilled": true + "tooltip": { + "mode": "single", + "sort": "none" + } }, - "pluginVersion": "9.4.7", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "builder", - "expr": "openstack_free_capacity_by_flavor_total", - "format": "time_series", - "legendFormat": "{{flavor_name}}", + "editorMode": "code", + "expr": "sum(irate(libvirt_domain_vcpu_time_seconds_total{}[5m]) / ignoring(instance,vcpu) group_left(domain) libvirt_domain_info_virtual_cpus{}) by (domain) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}", + "hide": false, + "legendFormat": "{{instance_name}}", + "range": true, + "refId": "B" + } + ], + "title": "CPU utilization per instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 23, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + 
}, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 30 + }, + "id": 24, + "interval": "30m", + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "libvirt_domain_memory_stats_used_percent * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}", + "hide": false, + "legendFormat": "{{instance_name}}", "range": true, "refId": "A" } ], - "title": "Free Capacity by Flavor", - "type": "bargauge" + "title": "Memory utilization per instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": true, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 23, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "min": -1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 30 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(libvirt_domain_block_stats_read_time_seconds_total[5m]) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}", + "hide": false, + "legendFormat": "{{instance_name}} : read {{target_device}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "avg(rate(libvirt_domain_block_stats_write_time_seconds_total[5m]) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"} * -1) by (project_name)", + "hide": false, + "legendFormat": "{{instance_name}} : write {{target_device}}", + "range": true, + "refId": "C" + } + ], + "title": "Disk utilization per instance", + "type": "timeseries" } ], "refresh": "", @@ -817,7 +1078,7 @@ "current": { "selected": false, "text": "Prometheus", - "value": "Prometheus" + "value": "PBFA97CFB590B2093" }, "description": "The prometheus datasource used for queries.", "hide": 0, @@ -867,14 +1128,14 @@ ] }, "time": { - "from": "now-3h", + "from": "now-2d", "to": "now" }, "timepicker": {}, "timezone": "", 
"title": "OpenStack Project Metrics", "uid": "mXiuBDe7z", - "version": 2, + "version": 1, "weekStart": "" } -{% endraw %} +{% endraw %} \ No newline at end of file diff --git a/etc/kayobe/kolla/config/haproxy/services.d/os_exporter.cfg b/etc/kayobe/kolla/config/haproxy/services.d/os_capacity.cfg similarity index 73% rename from etc/kayobe/kolla/config/haproxy/services.d/os_exporter.cfg rename to etc/kayobe/kolla/config/haproxy/services.d/os_capacity.cfg index e40c27a380..cebacb98b4 100644 --- a/etc/kayobe/kolla/config/haproxy/services.d/os_exporter.cfg +++ b/etc/kayobe/kolla/config/haproxy/services.d/os_capacity.cfg @@ -6,7 +6,11 @@ frontend os_capacity_frontend option httplog option forwardfor http-request set-header X-Forwarded-Proto https if { ssl_fc } - bind {{ kolla_internal_vip_address }}:9000 +{% if kolla_enable_tls_internal | bool %} + bind {{ kolla_internal_vip_address }}:9090 ssl crt /etc/haproxy/certificates/haproxy-internal.pem +{% else %} + bind {{ kolla_internal_vip_address }}:9090 +{% endif %} default_backend os_capacity_backend backend os_capacity_backend diff --git a/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf b/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf new file mode 100644 index 0000000000..9f6db7a552 --- /dev/null +++ b/etc/kayobe/kolla/config/nova/nova-compute-ironic.conf @@ -0,0 +1,4 @@ +{% if kolla_enable_ironic|bool and kolla_nova_compute_ironic_host is not none %} +[DEFAULT] +host = {{ kolla_nova_compute_ironic_static_host_name | mandatory('You must set a static host name to help with service failover. See the operations documentation, Ironic section.') }} +{% endif %} diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml index 659c26047f..afed8d9159 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml +++ b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml @@ -6,8 +6,11 @@ scrape_configs: - job_name: os-capacity static_configs: - targets: - - '{{ kolla_internal_vip_address | put_address_in_context('url') }}:9000' + - '{{ kolla_internal_fqdn | put_address_in_context('url') }}:9090' scrape_interval: 15m scrape_timeout: 10m +{% if kolla_enable_tls_internal | bool %} + scheme: https +{% endif %} {% endraw %} {% endif %} diff --git a/etc/kayobe/kolla/config/prometheus/system.rules b/etc/kayobe/kolla/config/prometheus/system.rules index c82bed16ee..b7c757a562 100644 --- a/etc/kayobe/kolla/config/prometheus/system.rules +++ b/etc/kayobe/kolla/config/prometheus/system.rules @@ -96,6 +96,30 @@ groups: summary: Host clock not synchronising (instance {{ $labels.instance }}) description: "Clock not synchronising. Ensure NTP is configured on this host." 
+ - alert: HostNetworkBondDegraded + expr: (node_bonding_active - node_bonding_slaves) != 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host network bond degraded (instance {{ $labels.instance }}) + description: "Bond {{ $labels.master }} degraded on {{ $labels.instance }}" +{% endraw %} + +{% if alertmanager_warn_network_bond_single_link | bool %} +{% raw %} + - alert: HostNetworkBondSingleLink + expr: node_bonding_slaves == 1 + for: 2m + labels: + severity: warning + annotations: + summary: Host network bond with a single link (instance {{ $labels.instance }}) + description: "Bond {{ $labels.master }} configured with a single link on {{ $labels.instance }}" +{% endraw %} +{% endif %} + +{% raw %} - alert: HostConntrackLimit expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 for: 5m @@ -104,4 +128,24 @@ groups: annotations: summary: Host conntrack limit (instance {{ $labels.instance }}) description: "The number of conntrack is approaching limit" + + - alert: NodeRAIDDegraded + expr: | + node_md_disks_required{job="node",device!=""} - ignoring (state) (node_md_disks{state="active",job="node",device!=""}) > 0 + for: "15m" + labels: + severity: critical + annotations: + description: "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically." + summary: "RAID Array is degraded." + + - alert: NodeRAIDDiskFailure + expr: | + node_md_disks{state="failed",job="node",device!=""} > 0 + labels: + severity: warning + annotations: + description: "At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap." + summary: "Failed device in RAID array." + {% endraw %} diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index 1f4530ffb8..c0663d9393 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -5,6 +5,10 @@ # This is necessary for os migrations where mixed clouds might be deployed kolla_base_distro: "{% raw %}{{ ansible_facts.distribution | lower }}{% endraw %}" +# Use facts so this is determined correctly when the control host OS differs +# from os_distribution. +kolla_base_distro_version: "{% raw %}{{ kolla_base_distro_version_default_map[kolla_base_distro] }}{% endraw %}" + # Convenience variable for base distro and version string. kolla_base_distro_and_version: "{% raw %}{{ kolla_base_distro }}-{{ kolla_base_distro_version }}{% endraw %}" diff --git a/etc/kayobe/kolla/kolla-build.conf b/etc/kayobe/kolla/kolla-build.conf index d88c98ef66..b444eae176 100644 --- a/etc/kayobe/kolla/kolla-build.conf +++ b/etc/kayobe/kolla/kolla-build.conf @@ -9,3 +9,8 @@ base_tag = jammy-20231004 base_tag = 9.{{ stackhpc_pulp_repo_rocky_9_minor_version }} {% endif %} build_args = {{ kolla_build_args.items() | map('join', ':') | join(',') }} + +[openstack-base] +type = git +location = https://github.com/stackhpc/requirements +reference = stackhpc/{{ openstack_release }} diff --git a/etc/kayobe/monitoring.yml b/etc/kayobe/monitoring.yml index 34ccf02e1a..7b555bc822 100644 --- a/etc/kayobe/monitoring.yml +++ b/etc/kayobe/monitoring.yml @@ -63,15 +63,15 @@ ############################################################################### # Monitoring node LVM configuration. -# List of monitoring node volume groups. 
See mrlesmithjr.manage_lvm role for # format. #monitoring_lvm_groups: -# Default list of monitoring node volume groups. See mrlesmithjr.manage-lvm +# Default list of monitoring node volume groups. See mrlesmithjr.manage_lvm # role for format. #monitoring_lvm_groups_default: -# Additional list of monitoring node volume groups. See mrlesmithjr.manage-lvm +# Additional list of monitoring node volume groups. See mrlesmithjr.manage_lvm # role for format. #monitoring_lvm_groups_extra: diff --git a/etc/kayobe/overcloud-dib.yml b/etc/kayobe/overcloud-dib.yml index 638fdfbfb6..73ae57302c 100644 --- a/etc/kayobe/overcloud-dib.yml +++ b/etc/kayobe/overcloud-dib.yml @@ -71,7 +71,7 @@ overcloud_dib_host_packages_extra: overcloud_dib_git_elements_extra: - repo: "https://github.com/stackhpc/stackhpc-image-elements" local: "{{ source_checkout_path }}/stackhpc-image-elements" - version: "v1.6.0" + version: "v1.6.1" elements_path: "elements" # List of git repositories containing Diskimage Builder (DIB) elements. See diff --git a/etc/kayobe/seed-hypervisor.yml b/etc/kayobe/seed-hypervisor.yml index ba2d413b6e..0a98f421a5 100644 --- a/etc/kayobe/seed-hypervisor.yml +++ b/etc/kayobe/seed-hypervisor.yml @@ -36,7 +36,7 @@ ############################################################################### # Seed hypervisor node LVM configuration. -# List of seed hypervisor volume groups. See mrlesmithjr.manage-lvm role for +# List of seed hypervisor volume groups. See mrlesmithjr.manage_lvm role for # format. Set to "{{ seed_hypervisor_lvm_groups_with_data }}" to create a # volume group for libvirt storage. #seed_hypervisor_lvm_groups: @@ -45,7 +45,7 @@ # default. #seed_hypervisor_lvm_groups_with_data: -# Seed LVM volume group for data. See mrlesmithjr.manage-lvm role for format. +# Seed LVM volume group for data. See mrlesmithjr.manage_lvm role for format. #seed_hypervisor_lvm_group_data: # List of disks for use by seed hypervisor LVM data volume group. Default to an diff --git a/etc/kayobe/seed.yml b/etc/kayobe/seed.yml index 3bef4f8787..2303006e3f 100644 --- a/etc/kayobe/seed.yml +++ b/etc/kayobe/seed.yml @@ -36,14 +36,14 @@ ############################################################################### # Seed node LVM configuration. -# List of seed volume groups. See mrlesmithjr.manage-lvm role for format. +# List of seed volume groups. See mrlesmithjr.manage_lvm role for format. #seed_lvm_groups: -# Default list of seed volume groups. See mrlesmithjr.manage-lvm role for +# Default list of seed volume groups. See mrlesmithjr.manage_lvm role for # format. #seed_lvm_groups_default: -# Additional list of seed volume groups. See mrlesmithjr.manage-lvm role for +# Additional list of seed volume groups. See mrlesmithjr.manage_lvm role for # format. #seed_lvm_groups_extra: @@ -54,7 +54,7 @@ # 'docker_storage_driver' is set to 'devicemapper', or false otherwise. #seed_lvm_group_data_enabled: -# Seed LVM volume group for data. See mrlesmithjr.manage-lvm role for format. +# Seed LVM volume group for data. See mrlesmithjr.manage_lvm role for format. #seed_lvm_group_data: # List of disks for use by seed LVM data volume group. 
Default to an invalid @@ -106,7 +106,7 @@ seed_pulp_container: image: pulp/pulp pre: "{{ kayobe_config_path }}/containers/pulp/pre.yml" post: "{{ kayobe_config_path }}/containers/pulp/post.yml" - tag: "3.24.0" + tag: "3.43.1" network_mode: host # Override deploy_containers_defaults.init == true to ensure # s6-overlay-suexec starts as pid 1 @@ -152,6 +152,10 @@ seed_containers: >- seed_extra_containers: {} +# Whether to attempt a basic authentication login to a registry when +# deploying seed containers +seed_deploy_containers_registry_attempt_login: "{{ not seed_pulp_container_enabled | bool }}" + ############################################################################### # Seed node firewalld configuration. diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml index b48646e792..e8e0bb91f5 100644 --- a/etc/kayobe/stackhpc-monitoring.yml +++ b/etc/kayobe/stackhpc-monitoring.yml @@ -8,10 +8,18 @@ # of free memory is lower than this value an alert will be triggered. alertmanager_low_memory_threshold_gib: 5 +# Whether to raise an alert if any network bond is configured with a single +# link. Change to false to disable this alert. +alertmanager_warn_network_bond_single_link: true + ############################################################################### # Exporter configuration # Whether the OpenStack Capacity exporter is enabled. # Enabling this flag will result in HAProxy configuration and Prometheus scrape # targets being templated during deployment. -stackhpc_enable_os_capacity: false +stackhpc_enable_os_capacity: true + +# Whether TLS certificate verification is enabled for the OpenStack Capacity +# exporter during Keystone authentication. +stackhpc_os_capacity_openstack_verify: true diff --git a/etc/kayobe/storage.yml b/etc/kayobe/storage.yml index 8270dba2f5..32a7382aa6 100644 --- a/etc/kayobe/storage.yml +++ b/etc/kayobe/storage.yml @@ -68,15 +68,15 @@ ############################################################################### # Storage node LVM configuration. -# List of storage volume groups. See mrlesmithjr.manage-lvm role for +# List of storage volume groups. See mrlesmithjr.manage_lvm role for # format. #storage_lvm_groups: -# Default list of storage volume groups. See mrlesmithjr.manage-lvm role for +# Default list of storage volume groups. See mrlesmithjr.manage_lvm role for # format. #storage_lvm_groups_default: -# Additional list of storage volume groups. See mrlesmithjr.manage-lvm role +# Additional list of storage volume groups. See mrlesmithjr.manage_lvm role # for format. #storage_lvm_groups_extra: @@ -87,7 +87,7 @@ # 'docker_storage_driver' is set to 'devicemapper', or false otherwise. #storage_lvm_group_data_enabled: -# Storage LVM volume group for data. See mrlesmithjr.manage-lvm role for +# Storage LVM volume group for data. See mrlesmithjr.manage_lvm role for # format. 
diff --git a/etc/kayobe/storage.yml b/etc/kayobe/storage.yml
index 8270dba2f5..32a7382aa6 100644
--- a/etc/kayobe/storage.yml
+++ b/etc/kayobe/storage.yml
@@ -68,15 +68,15 @@
 ###############################################################################
 # Storage node LVM configuration.

-# List of storage volume groups. See mrlesmithjr.manage-lvm role for
+# List of storage volume groups. See mrlesmithjr.manage_lvm role for
 # format.
 #storage_lvm_groups:

-# Default list of storage volume groups. See mrlesmithjr.manage-lvm role for
+# Default list of storage volume groups. See mrlesmithjr.manage_lvm role for
 # format.
 #storage_lvm_groups_default:

-# Additional list of storage volume groups. See mrlesmithjr.manage-lvm role
+# Additional list of storage volume groups. See mrlesmithjr.manage_lvm role
 # for format.
 #storage_lvm_groups_extra:

@@ -87,7 +87,7 @@
 # 'docker_storage_driver' is set to 'devicemapper', or false otherwise.
 #storage_lvm_group_data_enabled:

-# Storage LVM volume group for data. See mrlesmithjr.manage-lvm role for
+# Storage LVM volume group for data. See mrlesmithjr.manage_lvm role for
 # format.
 #storage_lvm_group_data:
diff --git a/kayobe-env b/kayobe-env
index 5137927e53..28b1cccdbf 100644
--- a/kayobe-env
+++ b/kayobe-env
@@ -30,8 +30,8 @@ export KOLLA_CONFIG_PATH=$KAYOBE_CONFIG_ROOT/etc/kolla
 #     kayobe/
 #     kolla-ansible/
 base_path=$(realpath $KAYOBE_CONFIG_ROOT/../../)
-export KOLLA_SOURCE_PATH=${KOLLA_SOURCE_PATH:-${base_path}/src/kolla-ansible}
-export KOLLA_VENV_PATH=${KOLLA_VENV_PATH:-${base_path}/venvs/kolla-ansible}
+export KOLLA_SOURCE_PATH=${base_path}/src/kolla-ansible
+export KOLLA_VENV_PATH=${base_path}/venvs/kolla-ansible

 function check_and_export_env {
     # Look for existing Kayobe environments
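With the kayobe-env change above, values of ``KOLLA_SOURCE_PATH`` and ``KOLLA_VENV_PATH`` set before sourcing the script are no longer honoured. Roughly (paths illustrative):

```bash
# Before: a pre-set value took precedence over the derived default.
export KOLLA_SOURCE_PATH=/opt/src/kolla-ansible
source kayobe-env            # kept /opt/src/kolla-ansible

# After: both paths are always derived from the config checkout.
source kayobe-env            # KOLLA_SOURCE_PATH=${base_path}/src/kolla-ansible
```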
diff --git a/releasenotes/notes/add-grafana-plugins-f4856a30529ac686.yaml b/releasenotes/notes/add-grafana-plugins-f4856a30529ac686.yaml
new file mode 100644
index 0000000000..b4235388b3
--- /dev/null
+++ b/releasenotes/notes/add-grafana-plugins-f4856a30529ac686.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    The grafana image now includes the ``gnocchixyz-gnocchi-datasource`` and
+    the ``grafana-opensearch-datasource`` plugins, which are the default
+    upstream plugins.
diff --git a/releasenotes/notes/add-nova-compute-ironic-failover-doc-a0c4f45b1fb48c4a.yaml b/releasenotes/notes/add-nova-compute-ironic-failover-doc-a0c4f45b1fb48c4a.yaml
new file mode 100644
index 0000000000..c5b52984fe
--- /dev/null
+++ b/releasenotes/notes/add-nova-compute-ironic-failover-doc-a0c4f45b1fb48c4a.yaml
@@ -0,0 +1,12 @@
+---
+fixes:
+  - |
+    Adds basic support and a document explaining how to migrate to a single
+    nova-compute-ironic instance, and how to re-deploy the instance to another
+    machine in the event of failure. See the operations / nova-compute-ironic
+    doc for further details.
+upgrade:
+  - |
+    Ensure that your deployment has only one nova-compute-ironic service running
+    per conductor group. See the operations / nova-compute-ironic doc for further
+    details.
diff --git a/releasenotes/notes/adds-mdraid-alerts-309fb79e61389325.yaml b/releasenotes/notes/adds-mdraid-alerts-309fb79e61389325.yaml
new file mode 100644
index 0000000000..8312816093
--- /dev/null
+++ b/releasenotes/notes/adds-mdraid-alerts-309fb79e61389325.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Adds alerts for software RAID failures.
diff --git a/releasenotes/notes/bump-horizon-694d426decbf7df3.yaml b/releasenotes/notes/bump-horizon-694d426decbf7df3.yaml
new file mode 100644
index 0000000000..780797d9ed
--- /dev/null
+++ b/releasenotes/notes/bump-horizon-694d426decbf7df3.yaml
@@ -0,0 +1,5 @@
+---
+security:
+  - |
+    Update Horizon on Ubuntu to include apache2 package ``2.4.52-1ubuntu4.8``,
+    which fixes CVE-2023-31122.
diff --git a/releasenotes/notes/bump-magnum-51e03a61ae8aa5a4.yaml b/releasenotes/notes/bump-magnum-51e03a61ae8aa5a4.yaml
new file mode 100644
index 0000000000..b287648009
--- /dev/null
+++ b/releasenotes/notes/bump-magnum-51e03a61ae8aa5a4.yaml
@@ -0,0 +1,3 @@
+---
+fixes:
+  - Updates Magnum CAPI Helm driver version to v0.10.0
diff --git a/releasenotes/notes/bump-magnum-capi-helm-6723d89456e6a590.yaml b/releasenotes/notes/bump-magnum-capi-helm-6723d89456e6a590.yaml
new file mode 100644
index 0000000000..7fc3cca1a8
--- /dev/null
+++ b/releasenotes/notes/bump-magnum-capi-helm-6723d89456e6a590.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Updates Magnum CAPI Helm driver version to v0.11.0
diff --git a/releasenotes/notes/bump-magnum-capi-helm-6febfe840e81cea5.yaml b/releasenotes/notes/bump-magnum-capi-helm-6febfe840e81cea5.yaml
new file mode 100644
index 0000000000..6677583fb1
--- /dev/null
+++ b/releasenotes/notes/bump-magnum-capi-helm-6febfe840e81cea5.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Updates Magnum CAPI Helm driver version to v0.12.0
diff --git a/releasenotes/notes/bump-magnum-capi-helm-7e4ad37d3d9eecce.yaml b/releasenotes/notes/bump-magnum-capi-helm-7e4ad37d3d9eecce.yaml
new file mode 100644
index 0000000000..8835210877
--- /dev/null
+++ b/releasenotes/notes/bump-magnum-capi-helm-7e4ad37d3d9eecce.yaml
@@ -0,0 +1,4 @@
+---
+upgrade:
+  - |
+    Updates Magnum CAPI Helm driver version to v0.11.0
diff --git a/releasenotes/notes/bump-magnum-capi-helm-d766b5956de65d31.yaml b/releasenotes/notes/bump-magnum-capi-helm-d766b5956de65d31.yaml
new file mode 100644
index 0000000000..eb1e376407
--- /dev/null
+++ b/releasenotes/notes/bump-magnum-capi-helm-d766b5956de65d31.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Updates Magnum CAPI Helm driver version to v0.13.0
diff --git a/releasenotes/notes/container-image-scanning-e5adf2c6b540b502.yaml b/releasenotes/notes/container-image-scanning-e5adf2c6b540b502.yaml
new file mode 100644
index 0000000000..67a99f9c26
--- /dev/null
+++ b/releasenotes/notes/container-image-scanning-e5adf2c6b540b502.yaml
@@ -0,0 +1,6 @@
+---
+security:
+  - |
+    Kolla container images created using the
+    ``stackhpc-container-image-build.yml`` workflow are now automatically
+    scanned for vulnerabilities.
diff --git a/releasenotes/notes/fail-unparsed-inventory-c3b4e2ffcb620a6b.yaml b/releasenotes/notes/fail-unparsed-inventory-c3b4e2ffcb620a6b.yaml
new file mode 100644
index 0000000000..335691c30d
--- /dev/null
+++ b/releasenotes/notes/fail-unparsed-inventory-c3b4e2ffcb620a6b.yaml
@@ -0,0 +1,7 @@
+---
+upgrade:
+  - |
+    Updates the Ansible configuration to `fail on any unparsed inventory source
+    `__.
+    If you are using a separate Ansible configuration for Kolla Ansible, you
+    may wish to add this setting in ``etc/kayobe/kolla/ansible.cfg``.
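For a separate Kolla Ansible configuration, the equivalent setting would look something like the snippet below in ``etc/kayobe/kolla/ansible.cfg``. The option name assumes the standard Ansible ``any_unparsed_is_failed`` inventory setting is the one in play, which is an assumption rather than something stated by this change:

```ini
[inventory]
# Assumption: fail, rather than warn, when any inventory source cannot be parsed.
any_unparsed_is_failed = True
```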
diff --git a/releasenotes/notes/fix-ceph-pools-top-capacity-used-panel-26d495c45f2678c8.yaml b/releasenotes/notes/fix-ceph-pools-top-capacity-used-panel-26d495c45f2678c8.yaml
new file mode 100644
index 0000000000..cdf38bb3a1
--- /dev/null
+++ b/releasenotes/notes/fix-ceph-pools-top-capacity-used-panel-26d495c45f2678c8.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    Fixes the Grafana panel of top Ceph pools by capacity used. This panel was
+    only showing the most used pool instead of as many pools as configured
+    with the ``$topk`` variable.
diff --git a/releasenotes/notes/fix-houston-tc-mirred-bfb16c89f63b472a.yaml b/releasenotes/notes/fix-houston-tc-mirred-bfb16c89f63b472a.yaml
new file mode 100644
index 0000000000..64c619cee7
--- /dev/null
+++ b/releasenotes/notes/fix-houston-tc-mirred-bfb16c89f63b472a.yaml
@@ -0,0 +1,12 @@
+---
+fixes:
+  - |
+    Adds a custom ``fix-houston.yml`` playbook to address dmesg errors, specifically:
+    "tc mirred to Houston: device bond0-ovs is down". This error typically appears
+    when OVS HW offloading is enabled, often in conjunction with VF-LAG and ASAP^2.
+    Detailed usage instructions are provided within the playbook's comments.
+    Additional context is available at the following links:
+    `LP#1899364
+    `__
+    `Kernel Patch
+    `__
diff --git a/releasenotes/notes/fixes-osd-size-summary-9924ef4aac61d2b6.yaml b/releasenotes/notes/fixes-osd-size-summary-9924ef4aac61d2b6.yaml
new file mode 100644
index 0000000000..d2c9fd1d3c
--- /dev/null
+++ b/releasenotes/notes/fixes-osd-size-summary-9924ef4aac61d2b6.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Fixes an issue with the OSD summary pie chart not showing any data.
diff --git a/releasenotes/notes/magnum-remove-cert-append-8797b640f25644ea.yaml b/releasenotes/notes/magnum-remove-cert-append-8797b640f25644ea.yaml
new file mode 100644
index 0000000000..20d195596b
--- /dev/null
+++ b/releasenotes/notes/magnum-remove-cert-append-8797b640f25644ea.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Fixes appending to ca.crt in make-cert-client.sh, which caused multiple
+    identical CA certs to be added to /etc/kubernetes/certs/ca.crt.
diff --git a/releasenotes/notes/network-bond-degraded-alert-d2a0b05002609ac1.yaml b/releasenotes/notes/network-bond-degraded-alert-d2a0b05002609ac1.yaml
new file mode 100644
index 0000000000..c987c7959c
--- /dev/null
+++ b/releasenotes/notes/network-bond-degraded-alert-d2a0b05002609ac1.yaml
@@ -0,0 +1,5 @@
+---
+features:
+  - |
+    Adds a new Prometheus alert ``HostNetworkBondDegraded`` which will be
+    raised when at least one bond member is down.
diff --git a/releasenotes/notes/network-bond-single-link-766adf41a3c2fd4e.yaml b/releasenotes/notes/network-bond-single-link-766adf41a3c2fd4e.yaml
new file mode 100644
index 0000000000..66d66f40b4
--- /dev/null
+++ b/releasenotes/notes/network-bond-single-link-766adf41a3c2fd4e.yaml
@@ -0,0 +1,8 @@
+---
+features:
+  - |
+    Adds a new Prometheus alert ``HostNetworkBondSingleLink`` which will be
+    raised when a bond is configured with only one member. This can happen when
+    NetworkManager detects that a bond member is down at boot time. This alert
+    can be disabled by setting ``alertmanager_warn_network_bond_single_link``
+    to ``false``.
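These alerts presumably key off node_exporter's bonding collector. A sketch of equivalent rules, assuming the standard ``node_bonding_active`` and ``node_bonding_slaves`` metrics (illustrative, not the shipped rule definitions):

```yaml
groups:
  - name: network-bond-alerts
    rules:
      # At least one configured bond member is not active.
      - alert: HostNetworkBondDegraded
        expr: node_bonding_active < node_bonding_slaves
        labels:
          severity: warning
      # The bond has only a single configured member link.
      - alert: HostNetworkBondSingleLink
        expr: node_bonding_slaves == 1
        labels:
          severity: warning
```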
diff --git a/releasenotes/notes/os-capacity-94006f03f16583e4.yaml b/releasenotes/notes/os-capacity-94006f03f16583e4.yaml
index f9d76b7f44..ca317682b6 100644
--- a/releasenotes/notes/os-capacity-94006f03f16583e4.yaml
+++ b/releasenotes/notes/os-capacity-94006f03f16583e4.yaml
@@ -9,7 +9,20 @@ upgrade:
   - |
     To deploy the OpenStack Capacity Grafana dashboard, you must define
     OpenStack application credential variables:
-    ``secrets_os_exporter_auth_url``,
-    ``secrets_os_exporter_credential_id`` and
-    ``secrets_os_exporter_credential_secret`` as laid out in the
+    ``secrets_os_capacity_credential_id`` and
+    ``secrets_os_capacity_credential_secret`` as laid out in the
     'Monitoring' documentation.
+
+    You must also enable the ``stackhpc_enable_os_capacity``
+    flag for OpenStack Capacity HAProxy and Prometheus configuration
+    to be templated.
+
+    You may also change the default authentication URL from the
+    kolla_internal_fqdn and change the default OpenStack region
+    from RegionOne with the variables:
+    ``stackhpc_os_capacity_auth_url`` and
+    ``stackhpc_os_capacity_openstack_region_name``.
+
+    To disable certificate verification for the OpenStack Capacity
+    exporter, you can set ``stackhpc_os_capacity_openstack_verify``
+    to false.
diff --git a/releasenotes/notes/os-capacity-deploy-hook-b52e87c0819df6fd.yaml b/releasenotes/notes/os-capacity-deploy-hook-b52e87c0819df6fd.yaml
new file mode 100644
index 0000000000..5479391995
--- /dev/null
+++ b/releasenotes/notes/os-capacity-deploy-hook-b52e87c0819df6fd.yaml
@@ -0,0 +1,9 @@
+---
+features:
+  - |
+    OpenStack Capacity is now deployed automatically via a Kayobe service
+    deploy hook, using Kolla admin credentials.
+upgrade:
+  - |
+    OpenStack Capacity no longer uses application credentials. Please
+    delete any previously generated application credentials.
\ No newline at end of file
diff --git a/releasenotes/notes/rebuild-heat-with-yaql-3.0.0-4415d8232bc547df.yaml b/releasenotes/notes/rebuild-heat-with-yaql-3.0.0-4415d8232bc547df.yaml
new file mode 100644
index 0000000000..da3cb5cbb1
--- /dev/null
+++ b/releasenotes/notes/rebuild-heat-with-yaql-3.0.0-4415d8232bc547df.yaml
@@ -0,0 +1,7 @@
+---
+security:
+  - |
+    The Heat container images are rebuilt with yaql 3.0.0 to include a patch
+    for vulnerability OSSN/OSSN-0093. It is recommended that you redeploy Heat
+    services with the current version of the Heat images from StackHPC Release
+    Train.
diff --git a/releasenotes/notes/smartmontools-bc8176f45d58a75d.yaml b/releasenotes/notes/smartmontools-bc8176f45d58a75d.yaml
new file mode 100644
index 0000000000..ac34513473
--- /dev/null
+++ b/releasenotes/notes/smartmontools-bc8176f45d58a75d.yaml
@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    The smartmon-tools playbook now ensures that the cron service is running,
+    as in some cases it may not be running by default.
+
diff --git a/requirements.txt b/requirements.txt
index 631266a78e..c2792b36b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 kayobe@git+https://github.com/stackhpc/kayobe@stackhpc/2023.1
-ansible-modules-hashivault
+ansible-modules-hashivault>=5.2.1
 jmespath
diff --git a/terraform/aio/vm.tf b/terraform/aio/vm.tf
index 606b30b70c..50c0cc3dd4 100644
--- a/terraform/aio/vm.tf
+++ b/terraform/aio/vm.tf
@@ -38,6 +38,11 @@ variable "aio_vm_volume_size" {
   default = 35
 }

+variable "aio_vm_tags" {
+  type    = list(string)
+  default = []
+}
+
 locals {
   image_is_uuid = length(regexall("^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", var.aio_vm_image)) > 0
 }
@@ -69,6 +74,8 @@ resource "openstack_compute_instance_v2" "kayobe-aio" {
     destination_type      = "volume"
     delete_on_termination = true
   }
+
+  tags = var.aio_vm_tags
 }

 # Wait for the instance to be accessible via SSH before progressing.
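The new ``aio_vm_tags`` input can then be set alongside the other variables in ``terraform.tfvars``; the tag values below are illustrative:

```hcl
# terraform/aio/terraform.tfvars
aio_vm_tags = ["skc-ci-aio", "host-image-builder"]
```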
diff --git a/tools/scan-images.sh b/tools/scan-images.sh
new file mode 100755
index 0000000000..74223ad902
--- /dev/null
+++ b/tools/scan-images.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+# Check correct usage
+if [[ ! $2 ]]; then
+    echo "Usage: scan-images.sh <os-distribution> <image-tag>"
+    exit 2
+fi
+
+set -u
+
+# Check that trivy is installed
+if ! trivy --version; then
+    echo 'Please install trivy: curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin v0.49.1'
+    exit 1
+fi
+
+# Clear any previous outputs
+rm -rf image-scan-output
+
+# Make a fresh output directory
+mkdir -p image-scan-output
+
+# Get built container images
+docker image ls --filter "reference=ark.stackhpc.com/stackhpc-dev/*:$2" > $1-scanned-container-images.txt
+
+# Make a file of imagename:tag
+images=$(grep --invert-match --no-filename ^REPOSITORY $1-scanned-container-images.txt | sed 's/ \+/:/g' | cut -f 1,2 -d:)
+
+# Ensure output files exist
+touch image-scan-output/clean-images.txt image-scan-output/dirty-images.txt
+
+# If Trivy detects no vulnerabilities, add the image name to clean-images.txt.
+# If there are vulnerabilities detected, add it to dirty-images.txt and
+# generate a csv summary
+for image in $images; do
+    filename=$(basename $image | sed 's/:/\./g')
+    if trivy image \
+        --quiet \
+        --exit-code 1 \
+        --scanners vuln \
+        --format json \
+        --severity HIGH,CRITICAL \
+        --output image-scan-output/${filename}.json \
+        --ignore-unfixed \
+        $image; then
+        # Clean up the output file for any images with no vulnerabilities
+        rm -f image-scan-output/${filename}.json
+
+        # Add the image to the clean list
+        echo "${image}" >> image-scan-output/clean-images.txt
+    else
+        # Add the image to the dirty list
+        echo "${image}" >> image-scan-output/dirty-images.txt
+
+        # Write a header for the summary CSV
+        echo '"PkgName","PkgPath","PkgID","VulnerabilityID","FixedVersion","PrimaryURL","Severity"' > image-scan-output/${filename}.summary.csv
+
+        # Write the summary CSV data
+        jq -r '.Results[]
+            | select(.Vulnerabilities)
+            | .Vulnerabilities
+            # Ignore packages with "kernel" in the PkgName
+            | map(select(.PkgName | test("kernel") | not ))
+            | group_by(.VulnerabilityID)
+            | map(
+                [
+                    (map(.PkgName) | unique | join(";")),
+                    (map(.PkgPath | select( . != null )) | join(";")),
+                    .[0].PkgID,
+                    .[0].VulnerabilityID,
+                    .[0].FixedVersion,
+                    .[0].PrimaryURL,
+                    .[0].Severity
+                ]
+            )
+            | .[]
+            | @csv' image-scan-output/${filename}.json >> image-scan-output/${filename}.summary.csv
+    fi
+done
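An illustrative invocation, assuming images have already been built and tagged locally (the distribution label and tag are examples):

```bash
# Scan all locally built images with the given tag. Results land in
# image-scan-output/: clean-images.txt, dirty-images.txt and one
# <image>.summary.csv per image with HIGH/CRITICAL findings.
tools/scan-images.sh rocky-9 2023.1-20240308T120000
```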
diff --git a/tools/ubuntu-upgrade-infra-vm.sh b/tools/ubuntu-upgrade-infra-vm.sh
new file mode 100755
index 0000000000..8d58101748
--- /dev/null
+++ b/tools/ubuntu-upgrade-infra-vm.sh
@@ -0,0 +1,34 @@
+#! /usr/bin/bash
+
+set -e
+
+if [[ ! $1 ]]; then
+    echo "Usage: ubuntu-upgrade-infra-vm.sh <limit>"
+    exit 2
+fi
+
+if [[ ! $KAYOBE_PATH ]]; then
+    echo "Environment variable \$KAYOBE_PATH is not defined"
+    exit 2
+fi
+
+if [[ ! $KAYOBE_CONFIG_PATH ]]; then
+    echo "Environment variable \$KAYOBE_CONFIG_PATH is not defined"
+    exit 2
+fi
+
+if [[ ! $ANSIBLE_ROLES_PATH ]]; then
+    set -x
+    export ANSIBLE_ROLES_PATH=$KAYOBE_PATH/ansible/roles
+    set +x
+else
+    set -x
+    export ANSIBLE_ROLES_PATH=$ANSIBLE_ROLES_PATH:$KAYOBE_PATH/ansible/roles
+    set +x
+fi
+
+set -x
+
+kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ubuntu-upgrade.yml -e os_release=jammy --limit $1
+
+kayobe infra vm host configure --limit $1 -e os_release=jammy
diff --git a/tools/ubuntu-upgrade-overcloud.sh b/tools/ubuntu-upgrade-overcloud.sh
new file mode 100755
index 0000000000..50959c263c
--- /dev/null
+++ b/tools/ubuntu-upgrade-overcloud.sh
@@ -0,0 +1,36 @@
+#! /usr/bin/bash
+
+set -e
+
+if [[ ! $1 ]]; then
+    echo "Usage: ubuntu-upgrade-overcloud.sh <limit>"
+    exit 2
+fi
+
+if [[ ! $KAYOBE_PATH ]]; then
+    echo "Environment variable \$KAYOBE_PATH is not defined"
+    exit 2
+fi
+
+if [[ ! $KAYOBE_CONFIG_PATH ]]; then
+    echo "Environment variable \$KAYOBE_CONFIG_PATH is not defined"
+    exit 2
+fi
+
+if [[ ! $ANSIBLE_ROLES_PATH ]]; then
+    set -x
+    export ANSIBLE_ROLES_PATH=$KAYOBE_PATH/ansible/roles
+    set +x
+else
+    set -x
+    export ANSIBLE_ROLES_PATH=$ANSIBLE_ROLES_PATH:$KAYOBE_PATH/ansible/roles
+    set +x
+fi
+
+set -x
+
+kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ubuntu-upgrade.yml -e os_release=jammy --limit $1
+
+kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ovn-fix-chassis-priorities.yml
+
+kayobe overcloud host configure --limit $1 --kolla-limit $1 -e os_release=jammy
diff --git a/tools/ubuntu-upgrade-seed-hypervisor.sh b/tools/ubuntu-upgrade-seed-hypervisor.sh
new file mode 100755
index 0000000000..ad09f2b34c
--- /dev/null
+++ b/tools/ubuntu-upgrade-seed-hypervisor.sh
@@ -0,0 +1,29 @@
+#! /usr/bin/bash
+
+set -e
+
+if [[ ! $KAYOBE_PATH ]]; then
+    echo "Environment variable \$KAYOBE_PATH is not defined"
+    exit 2
+fi
+
+if [[ ! $KAYOBE_CONFIG_PATH ]]; then
+    echo "Environment variable \$KAYOBE_CONFIG_PATH is not defined"
+    exit 2
+fi
+
+if [[ ! $ANSIBLE_ROLES_PATH ]]; then
+    set -x
+    export ANSIBLE_ROLES_PATH=$KAYOBE_PATH/ansible/roles
+    set +x
+else
+    set -x
+    export ANSIBLE_ROLES_PATH=$ANSIBLE_ROLES_PATH:$KAYOBE_PATH/ansible/roles
+    set +x
+fi
+
+set -x
+
+kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ubuntu-upgrade.yml -e os_release=jammy --limit seed-hypervisor
+
+kayobe seed hypervisor host configure
diff --git a/tools/ubuntu-upgrade-seed.sh b/tools/ubuntu-upgrade-seed.sh
new file mode 100755
index 0000000000..4a48d5f366
--- /dev/null
+++ b/tools/ubuntu-upgrade-seed.sh
@@ -0,0 +1,29 @@
+#! /usr/bin/bash
+
+set -e
+
+if [[ ! $KAYOBE_PATH ]]; then
+    echo "Environment variable \$KAYOBE_PATH is not defined"
+    exit 2
+fi
+
+if [[ ! $KAYOBE_CONFIG_PATH ]]; then
+    echo "Environment variable \$KAYOBE_CONFIG_PATH is not defined"
+    exit 2
+fi
+
+if [[ ! $ANSIBLE_ROLES_PATH ]]; then
+    set -x
+    export ANSIBLE_ROLES_PATH=$KAYOBE_PATH/ansible/roles
+    set +x
+else
+    set -x
+    export ANSIBLE_ROLES_PATH=$ANSIBLE_ROLES_PATH:$KAYOBE_PATH/ansible/roles
+    set +x
+fi
+
+set -x
+
+kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ubuntu-upgrade.yml -e os_release=jammy --limit seed
+
+kayobe seed host configure
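An illustrative session using the upgrade helpers above, assuming a Kayobe source tree and kayobe-config checkout side by side (paths and limit patterns are examples):

```bash
# Assumed layout: kayobe source and kayobe-config checked out side by side.
export KAYOBE_PATH=~/src/kayobe
export KAYOBE_CONFIG_PATH=~/src/kayobe-config/etc/kayobe

# Seed hypervisor and seed first, then overcloud hosts in batches.
tools/ubuntu-upgrade-seed-hypervisor.sh
tools/ubuntu-upgrade-seed.sh
tools/ubuntu-upgrade-overcloud.sh controllers
tools/ubuntu-upgrade-infra-vm.sh infra-vm0
```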