diff --git a/.github/workflows/canary-integration-test.yml b/.github/workflows/canary-integration-test.yml index a5d7b8b5264b..49d4f7e81f37 100644 --- a/.github/workflows/canary-integration-test.yml +++ b/.github/workflows/canary-integration-test.yml @@ -1334,7 +1334,7 @@ jobs: # ceph-image: # use default - name: upload test result - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: always() with: name: rgw-multisite-testing @@ -1366,7 +1366,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} - name: upload test result - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: always() with: name: encryption-pvc-kms-ibm-kp diff --git a/.github/workflows/canary-test-config/action.yaml b/.github/workflows/canary-test-config/action.yaml index c13eb4570ef6..24ae8ae9f18d 100644 --- a/.github/workflows/canary-test-config/action.yaml +++ b/.github/workflows/canary-test-config/action.yaml @@ -23,7 +23,7 @@ runs: - name: Setup Minikube shell: bash --noprofile --norc -eo pipefail -x {0} run: | - tests/scripts/github-action-helper.sh install_minikube_with_none_driver v1.28.4 + tests/scripts/github-action-helper.sh install_minikube_with_none_driver v1.29.0 - name: install deps shell: bash --noprofile --norc -eo pipefail -x {0} diff --git a/.github/workflows/daily-nightly-jobs.yml b/.github/workflows/daily-nightly-jobs.yml index 886fd3ffa0b1..07fefa5ddd88 100644 --- a/.github/workflows/daily-nightly-jobs.yml +++ b/.github/workflows/daily-nightly-jobs.yml @@ -107,7 +107,7 @@ jobs: name: canary-arm64 - name: upload canary test result - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: always() with: name: canary-arm64 @@ -147,7 +147,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-smoke-suite-quincy-artifact @@ -187,7 +187,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-smoke-suite-reef-artifact @@ -227,7 +227,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-smoke-suite-master-artifact @@ -267,7 +267,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-object-suite-quincy-artifact @@ -307,7 +307,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-object-suite-master-artifact @@ -347,7 +347,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-upgrade-suite-reef-artifact @@ -387,7 +387,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-upgrade-suite-quincy-artifact @@ -418,7 +418,7 @@ jobs: ceph-image: quay.io/ceph/daemon-base:${{ matrix.ceph-image-tag }} - name: upload test result - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: always() with: name: rgw-multisite-testing-ceph-${{ matrix.ceph-image-tag }} @@ -449,7 +449,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} - name: upload test result - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: always() 
with: name: encryption-pvc-kms-ibm-kp diff --git a/.github/workflows/integration-test-helm-suite.yaml b/.github/workflows/integration-test-helm-suite.yaml index 2640df307e4c..0e5a9a41095d 100644 --- a/.github/workflows/integration-test-helm-suite.yaml +++ b/.github/workflows/integration-test-helm-suite.yaml @@ -25,7 +25,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-versions: ["v1.23.17", "v1.28.4"] + kubernetes-versions: ["v1.23.17", "v1.29.0"] steps: - name: checkout uses: actions/checkout@v4 @@ -59,11 +59,10 @@ jobs: run: | export LOG_DIR="/home/runner/work/rook/rook/tests/integration/_output/tests/" export CLUSTER_NAMESPACE="helm-ns" - export OPERATOR_NAMESPACE="helm-ns-system" tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-helm-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/.github/workflows/integration-test-mgr-suite.yaml b/.github/workflows/integration-test-mgr-suite.yaml index 6eee2317dfcb..d0bea34a95f5 100644 --- a/.github/workflows/integration-test-mgr-suite.yaml +++ b/.github/workflows/integration-test-mgr-suite.yaml @@ -24,7 +24,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-versions: ["v1.28.4"] + kubernetes-versions: ["v1.29.0"] steps: - name: checkout uses: actions/checkout@v4 @@ -58,7 +58,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-mgr-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/.github/workflows/integration-test-multi-cluster-suite.yaml b/.github/workflows/integration-test-multi-cluster-suite.yaml index f552df5d8b9c..b826fe4d22b0 100644 --- a/.github/workflows/integration-test-multi-cluster-suite.yaml +++ b/.github/workflows/integration-test-multi-cluster-suite.yaml @@ -25,7 +25,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-versions: ["v1.28.4"] + kubernetes-versions: ["v1.29.0"] steps: - name: checkout uses: actions/checkout@v4 @@ -59,7 +59,7 @@ jobs: CLUSTER_NAMESPACE="multi-external" tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-multi-cluster-deploy-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/.github/workflows/integration-test-object-suite.yaml b/.github/workflows/integration-test-object-suite.yaml index ccecbeb7f83c..9e4c1c24a3ad 100644 --- a/.github/workflows/integration-test-object-suite.yaml +++ b/.github/workflows/integration-test-object-suite.yaml @@ -25,7 +25,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-versions: ["v1.23.17", "v1.28.4"] + kubernetes-versions: ["v1.23.17", "v1.29.0"] steps: - name: checkout uses: actions/checkout@v4 @@ -58,7 +58,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-object-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/.github/workflows/integration-test-smoke-suite.yaml b/.github/workflows/integration-test-smoke-suite.yaml index c6bb24bc959e..418a9ad6a496 100644 --- a/.github/workflows/integration-test-smoke-suite.yaml +++ b/.github/workflows/integration-test-smoke-suite.yaml @@ -25,7 +25,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-versions: ["v1.23.17", "v1.28.4"] + kubernetes-versions: ["v1.23.17", "v1.29.0"] steps: - name: checkout uses: actions/checkout@v4 @@ -58,7 +58,7 @@ jobs: 
tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-smoke-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/.github/workflows/integration-test-upgrade-suite.yaml b/.github/workflows/integration-test-upgrade-suite.yaml index 10122b68bce9..03d46b4b2dd2 100644 --- a/.github/workflows/integration-test-upgrade-suite.yaml +++ b/.github/workflows/integration-test-upgrade-suite.yaml @@ -25,7 +25,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-versions: ["v1.23.17", "v1.28.4"] + kubernetes-versions: ["v1.23.17", "v1.29.0"] steps: - name: checkout uses: actions/checkout@v4 @@ -58,7 +58,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-upgrade-suite-artifact-${{ matrix.kubernetes-versions }} @@ -70,7 +70,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-versions: ["v1.23.17", "v1.28.4"] + kubernetes-versions: ["v1.23.17", "v1.29.0"] steps: - name: checkout uses: actions/checkout@v4 @@ -106,7 +106,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-upgrade-helm-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/.github/workflows/integration-tests-on-release.yaml b/.github/workflows/integration-tests-on-release.yaml index 9f714501b1d1..83aa41341c95 100644 --- a/.github/workflows/integration-tests-on-release.yaml +++ b/.github/workflows/integration-tests-on-release.yaml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-versions: ["v1.23.17", "v1.24.17", "v1.26.11", "v1.28.4"] + kubernetes-versions: ["v1.23.17", "v1.25.16", "v1.27.8", "v1.29.0"] steps: - name: checkout uses: actions/checkout@v4 @@ -48,7 +48,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-helm-suite-artifact-${{ matrix.kubernetes-versions }} @@ -59,7 +59,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-versions: ["v1.23.17", "v1.24.17", "v1.26.11", "v1.28.4"] + kubernetes-versions: ["v1.23.17", "v1.25.16", "v1.27.8", "v1.29.0"] steps: - name: checkout uses: actions/checkout@v4 @@ -88,7 +88,7 @@ jobs: CLUSTER_NAMESPACE="multi-external" tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-multi-cluster-deploy-suite-artifact-${{ matrix.kubernetes-versions }} @@ -99,7 +99,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-versions: ["v1.23.17", "v1.24.17", "v1.26.11", "v1.28.4"] + kubernetes-versions: ["v1.23.17", "v1.25.16", "v1.27.8", "v1.29.0"] steps: - name: checkout uses: actions/checkout@v4 @@ -127,7 +127,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-smoke-suite-artifact-${{ matrix.kubernetes-versions }} @@ -138,7 +138,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-versions: ["v1.23.17", "v1.24.17", "v1.26.11", "v1.28.4"] + kubernetes-versions: ["v1.23.17", "v1.25.16", "v1.27.8", "v1.29.0"] steps: - name: checkout uses: actions/checkout@v4 @@ -166,7 +166,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: 
ceph-upgrade-suite-artifact-${{ matrix.kubernetes-versions }} @@ -177,7 +177,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-versions: ["v1.23.17", "v1.25.16", "v1.28.4"] + kubernetes-versions: ["v1.23.17", "v1.26.11", "v1.29.0"] steps: - name: checkout uses: actions/checkout@v4 @@ -208,7 +208,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-upgrade-suite-artifact-${{ matrix.kubernetes-versions }} @@ -219,7 +219,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-versions: ["v1.23.17", "v1.28.4"] + kubernetes-versions: ["v1.23.17", "v1.29.0"] steps: - name: checkout uses: actions/checkout@v4 @@ -247,7 +247,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: ceph-object-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/Documentation/CRDs/specification.md b/Documentation/CRDs/specification.md index e7592e645de3..f946897f7e4c 100644 --- a/Documentation/CRDs/specification.md +++ b/Documentation/CRDs/specification.md @@ -8420,7 +8420,8 @@ NetworkProviderType (Optional) -

Provider is what provides network connectivity to the cluster e.g. “host” or “multus”

+

Provider is what provides network connectivity to the cluster e.g. “host” or “multus”. +If the Provider is updated from being empty to “host” on a running cluster, then the operator will automatically fail over all the mons to apply the “host” network settings.

@@ -8492,7 +8493,9 @@ bool (Optional) -

HostNetwork to enable host network

+

HostNetwork to enable host network. +If host networking is enabled or disabled on a running cluster, then the operator will automatically fail over all the mons to +apply the new network settings.
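The two descriptions above boil down to one check the operator makes per mon: does the cluster-level host-networking setting still match how that mon is actually running? Below is a minimal standalone sketch of that check; it mirrors the `isMonIPUpdateRequiredForHostNetwork` helper added to `pkg/operator/ceph/cluster/mon/mon.go` later in this diff, and the `monNeedsFailover` name and `main` driver are illustrative only, not part of the Rook code.

```go
// Sketch only: mirrors the host-network mismatch check that this diff adds in
// pkg/operator/ceph/cluster/mon/mon.go (isMonIPUpdateRequiredForHostNetwork).
package main

import (
	"fmt"

	cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
)

// monNeedsFailover reports whether a mon must be failed over because the
// cluster's host-networking setting no longer matches how the mon is running.
// NetworkSpec.IsHost() covers both `hostNetwork: true` and `provider: host`.
func monNeedsFailover(monUsesHostNetwork bool, network *cephv1.NetworkSpec) bool {
	return network.IsHost() != monUsesHostNetwork
}

func main() {
	// `provider: host` was set on a running cluster whose mons still use pod IPs,
	// so every mon gets scheduled for failover.
	spec := &cephv1.NetworkSpec{Provider: cephv1.NetworkProviderHost}
	fmt.Println(monNeedsFailover(false, spec)) // true
	fmt.Println(monNeedsFailover(true, spec))  // false, already on a host IP
}
```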

diff --git a/Documentation/Getting-Started/quickstart.md b/Documentation/Getting-Started/quickstart.md index 42b528dfa519..1fc735fd88eb 100644 --- a/Documentation/Getting-Started/quickstart.md +++ b/Documentation/Getting-Started/quickstart.md @@ -36,7 +36,7 @@ To configure the Ceph storage cluster, at least one of these local storage optio A simple Rook cluster is created for Kubernetes with the following `kubectl` commands and [example manifests](https://github.com/rook/rook/blob/master/deploy/examples). ```console -$ git clone --single-branch --branch v1.13.0 https://github.com/rook/rook.git +$ git clone --single-branch --branch v1.13.1 https://github.com/rook/rook.git cd rook/deploy/examples kubectl create -f crds.yaml -f common.yaml -f operator.yaml kubectl create -f cluster.yaml diff --git a/Documentation/Helm-Charts/operator-chart.md b/Documentation/Helm-Charts/operator-chart.md index 4aa24f5bfe69..ec825d8dd5ef 100644 --- a/Documentation/Helm-Charts/operator-chart.md +++ b/Documentation/Helm-Charts/operator-chart.md @@ -59,12 +59,12 @@ The following table lists the configurable parameters of the rook-operator chart | `csi.cephFSKernelMountOptions` | Set CephFS Kernel mount options to use https://docs.ceph.com/en/latest/man/8/mount.ceph/#options. Set to "ms_mode=secure" when connections.encrypted is enabled in CephCluster CR | `nil` | | `csi.cephFSPluginUpdateStrategy` | CSI CephFS plugin daemonset update strategy, supported values are OnDelete and RollingUpdate | `RollingUpdate` | | `csi.cephFSPluginUpdateStrategyMaxUnavailable` | A maxUnavailable parameter of CSI cephFS plugin daemonset update strategy. | `1` | -| `csi.cephcsi.image` | Ceph CSI image | `quay.io/cephcsi/cephcsi:v3.10.0` | +| `csi.cephcsi.image` | Ceph CSI image | `quay.io/cephcsi/cephcsi:v3.10.1` | | `csi.cephfsLivenessMetricsPort` | CSI CephFS driver metrics port | `9081` | | `csi.cephfsPodLabels` | Labels to add to the CSI CephFS Deployments and DaemonSets Pods | `nil` | | `csi.clusterName` | Cluster name identifier to set as metadata on the CephFS subvolume and RBD images. This will be useful in cases like for example, when two container orchestrator clusters (Kubernetes/OCP) are using a single ceph cluster | `nil` | | `csi.csiAddons.enabled` | Enable CSIAddons | `false` | -| `csi.csiAddons.image` | CSIAddons Sidecar image | `"quay.io/csiaddons/k8s-sidecar:v0.7.0"` | +| `csi.csiAddons.image` | CSIAddons Sidecar image | `"quay.io/csiaddons/k8s-sidecar:v0.8.0"` | | `csi.csiAddonsPort` | CSI Addons server port | `9070` | | `csi.csiCephFSPluginResource` | CEPH CSI CephFS plugin resource requirement list | see values.yaml | | `csi.csiCephFSPluginVolume` | The volume of the CephCSI CephFS plugin DaemonSet | `nil` | diff --git a/Documentation/Storage-Configuration/Advanced/ceph-mon-health.md b/Documentation/Storage-Configuration/Advanced/ceph-mon-health.md index 169c78b6efa2..ce80aa6fc355 100644 --- a/Documentation/Storage-Configuration/Advanced/ceph-mon-health.md +++ b/Documentation/Storage-Configuration/Advanced/ceph-mon-health.md @@ -115,3 +115,10 @@ $ ceph -s osd: 3 osds: 3 up (since 10m), 3 in (since 10m) [...] ``` + +## Automatic Monitor Failover + +Rook will automatically fail over the mons when the following settings are updated in the CephCluster CR: +- `spec.network.hostNetwork`: When enabled or disabled, Rook fails over all monitors, configuring them to enable or disable host networking. 
+- `spec.network.provider`: When updated from being empty to "host", Rook fails over all monitors, configuring them to use host networking. +- `spec.network.multiClusterService`: When enabled or disabled, Rook fails over all monitors, configuring them to start (or stop) using service IPs compatible with the multi-cluster service. diff --git a/Documentation/Storage-Configuration/Block-Storage-RBD/block-storage.md b/Documentation/Storage-Configuration/Block-Storage-RBD/block-storage.md index ba0d33440bd2..5ce95ead3c4d 100644 --- a/Documentation/Storage-Configuration/Block-Storage-RBD/block-storage.md +++ b/Documentation/Storage-Configuration/Block-Storage-RBD/block-storage.md @@ -204,9 +204,9 @@ If a node goes down where a pod is running where a RBD RWO volume is mounted, th Deploy the csi-addons manifests: ```console -kubectl create -f https://raw.githubusercontent.com/csi-addons/kubernetes-csi-addons/v0.7.0/deploy/controller/crds.yaml -kubectl create -f https://raw.githubusercontent.com/csi-addons/kubernetes-csi-addons/v0.7.0/deploy/controller/rbac.yaml -kubectl create -f https://raw.githubusercontent.com/csi-addons/kubernetes-csi-addons/v0.7.0/deploy/controller/setup-controller.yaml +kubectl create -f https://raw.githubusercontent.com/csi-addons/kubernetes-csi-addons/v0.8.0/deploy/controller/crds.yaml +kubectl create -f https://raw.githubusercontent.com/csi-addons/kubernetes-csi-addons/v0.8.0/deploy/controller/rbac.yaml +kubectl create -f https://raw.githubusercontent.com/csi-addons/kubernetes-csi-addons/v0.8.0/deploy/controller/setup-controller.yaml ``` Enable the `csi-addons` sidecar in the Rook operator configuration. diff --git a/Documentation/Storage-Configuration/Ceph-CSI/ceph-csi-drivers.md b/Documentation/Storage-Configuration/Ceph-CSI/ceph-csi-drivers.md index 38ee97104994..fd8a79f25014 100644 --- a/Documentation/Storage-Configuration/Ceph-CSI/ceph-csi-drivers.md +++ b/Documentation/Storage-Configuration/Ceph-CSI/ceph-csi-drivers.md @@ -127,9 +127,9 @@ that the controller inspects and forwards to one or more CSI-Addons sidecars for Deploy the controller by running the following commands: ```console -kubectl create -f https://raw.githubusercontent.com/csi-addons/kubernetes-csi-addons/v0.7.0/deploy/controller/crds.yaml -kubectl create -f https://raw.githubusercontent.com/csi-addons/kubernetes-csi-addons/v0.7.0/deploy/controller/rbac.yaml -kubectl create -f https://raw.githubusercontent.com/csi-addons/kubernetes-csi-addons/v0.7.0/deploy/controller/setup-controller.yaml +kubectl create -f https://raw.githubusercontent.com/csi-addons/kubernetes-csi-addons/v0.8.0/deploy/controller/crds.yaml +kubectl create -f https://raw.githubusercontent.com/csi-addons/kubernetes-csi-addons/v0.8.0/deploy/controller/rbac.yaml +kubectl create -f https://raw.githubusercontent.com/csi-addons/kubernetes-csi-addons/v0.8.0/deploy/controller/setup-controller.yaml ``` This creates the required CRDs and configures permissions. @@ -157,15 +157,15 @@ will start automatically in the RBD CSI provisioner and nodeplugin pods.
CSI-Addons supports the following operations: * Reclaim Space - * [Creating a ReclaimSpaceJob](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.7.0/docs/reclaimspace.md#reclaimspacejob) - * [Creating a ReclaimSpaceCronJob](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.7.0/docs/reclaimspace.md#reclaimspacecronjob) - * [Annotating PersistentVolumeClaims](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.7.0/docs/reclaimspace.md#annotating-perstentvolumeclaims) - * [Annotating Namespace](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.7.0/docs/reclaimspace.md#annotating-namespace) + * [Creating a ReclaimSpaceJob](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.8.0/docs/reclaimspace.md#reclaimspacejob) + * [Creating a ReclaimSpaceCronJob](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.8.0/docs/reclaimspace.md#reclaimspacecronjob) + * [Annotating PersistentVolumeClaims](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.8.0/docs/reclaimspace.md#annotating-perstentvolumeclaims) + * [Annotating Namespace](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.8.0/docs/reclaimspace.md#annotating-namespace) * Network Fencing - * [Creating a NetworkFence](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.7.0/docs/networkfence.md) + * [Creating a NetworkFence](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.8.0/docs/networkfence.md) * Volume Replication - * [Creating VolumeReplicationClass](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.7.0/docs/volumereplicationclass.md) - * [Creating VolumeReplication CR](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.7.0/docs/volumereplication.md) + * [Creating VolumeReplicationClass](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.8.0/docs/volumereplicationclass.md) + * [Creating VolumeReplication CR](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.8.0/docs/volumereplication.md) ## Enable RBD Encryption Support diff --git a/Documentation/Storage-Configuration/Ceph-CSI/custom-images.md b/Documentation/Storage-Configuration/Ceph-CSI/custom-images.md index a45b5e04eded..cf805fe30f95 100644 --- a/Documentation/Storage-Configuration/Ceph-CSI/custom-images.md +++ b/Documentation/Storage-Configuration/Ceph-CSI/custom-images.md @@ -18,13 +18,13 @@ kubectl -n $ROOK_OPERATOR_NAMESPACE edit configmap rook-ceph-operator-config The default upstream images are included below, which you can change to your desired images. ```yaml -ROOK_CSI_CEPH_IMAGE: "quay.io/cephcsi/cephcsi:v3.10.0" +ROOK_CSI_CEPH_IMAGE: "quay.io/cephcsi/cephcsi:v3.10.1" ROOK_CSI_REGISTRAR_IMAGE: "registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.9.1" ROOK_CSI_PROVISIONER_IMAGE: "registry.k8s.io/sig-storage/csi-provisioner:v3.6.2" ROOK_CSI_ATTACHER_IMAGE: "registry.k8s.io/sig-storage/csi-attacher:v4.4.2" ROOK_CSI_RESIZER_IMAGE: "registry.k8s.io/sig-storage/csi-resizer:v1.9.2" ROOK_CSI_SNAPSHOTTER_IMAGE: "registry.k8s.io/sig-storage/csi-snapshotter:v6.3.2" -ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.7.0" +ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.8.0" ``` ### **Use private repository** @@ -32,7 +32,7 @@ ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.7.0" If image version is not passed along with the image name in any of the variables above, Rook will add the corresponding default version to that image. 
Example: if `ROOK_CSI_CEPH_IMAGE: "quay.io/private-repo/cephcsi"` is passed, -Rook will add internal default version and consume it as `"quay.io/private-repo/cephcsi:v3.10.0"`. +Rook will add internal default version and consume it as `"quay.io/private-repo/cephcsi:v3.10.1"`. ### **Use default images** diff --git a/Documentation/Storage-Configuration/Monitoring/ceph-monitoring.md b/Documentation/Storage-Configuration/Monitoring/ceph-monitoring.md index fead6dc6962d..750dea9c72a7 100644 --- a/Documentation/Storage-Configuration/Monitoring/ceph-monitoring.md +++ b/Documentation/Storage-Configuration/Monitoring/ceph-monitoring.md @@ -44,7 +44,7 @@ There are two sources for metrics collection: From the root of your locally cloned Rook repo, go the monitoring directory: ```console -$ git clone --single-branch --branch v1.13.0 https://github.com/rook/rook.git +$ git clone --single-branch --branch v1.13.1 https://github.com/rook/rook.git cd rook/deploy/examples/monitoring ``` diff --git a/Documentation/Upgrade/rook-upgrade.md b/Documentation/Upgrade/rook-upgrade.md index 98659fa7125b..06ab0fdf199c 100644 --- a/Documentation/Upgrade/rook-upgrade.md +++ b/Documentation/Upgrade/rook-upgrade.md @@ -128,8 +128,8 @@ In order to successfully upgrade a Rook cluster, the following prerequisites mus ## Rook Operator Upgrade -The examples given in this guide upgrade a live Rook cluster running `v1.12.9` to -the version `v1.13.0`. This upgrade should work from any official patch release of Rook v1.12 to any +The examples given in this guide upgrade a live Rook cluster running `v1.12.10` to +the version `v1.13.1`. This upgrade should work from any official patch release of Rook v1.12 to any official patch release of v1.13. Let's get started! @@ -156,7 +156,7 @@ by the Operator. Also update the Custom Resource Definitions (CRDs). Get the latest common resources manifests that contain the latest changes. ```console -git clone --single-branch --depth=1 --branch v1.13.0 https://github.com/rook/rook.git +git clone --single-branch --depth=1 --branch v1.13.1 https://github.com/rook/rook.git cd rook/deploy/examples ``` @@ -195,7 +195,7 @@ The largest portion of the upgrade is triggered when the operator's image is upd When the operator is updated, it will proceed to update all of the Ceph daemons. ```console -kubectl -n $ROOK_OPERATOR_NAMESPACE set image deploy/rook-ceph-operator rook-ceph-operator=rook/ceph:v1.13.0 +kubectl -n $ROOK_OPERATOR_NAMESPACE set image deploy/rook-ceph-operator rook-ceph-operator=rook/ceph:v1.13.1 ``` ### **3. Update Ceph CSI** @@ -225,18 +225,18 @@ watch --exec kubectl -n $ROOK_CLUSTER_NAMESPACE get deployments -l rook_cluster= ``` As an example, this cluster is midway through updating the OSDs. When all deployments report `1/1/1` -availability and `rook-version=v1.13.0`, the Ceph cluster's core components are fully updated. +availability and `rook-version=v1.13.1`, the Ceph cluster's core components are fully updated. ```console Every 2.0s: kubectl -n rook-ceph get deployment -o j... 
-rook-ceph-mgr-a req/upd/avl: 1/1/1 rook-version=v1.13.0 -rook-ceph-mon-a req/upd/avl: 1/1/1 rook-version=v1.13.0 -rook-ceph-mon-b req/upd/avl: 1/1/1 rook-version=v1.13.0 -rook-ceph-mon-c req/upd/avl: 1/1/1 rook-version=v1.13.0 -rook-ceph-osd-0 req/upd/avl: 1// rook-version=v1.13.0 -rook-ceph-osd-1 req/upd/avl: 1/1/1 rook-version=v1.12.9 -rook-ceph-osd-2 req/upd/avl: 1/1/1 rook-version=v1.12.9 +rook-ceph-mgr-a req/upd/avl: 1/1/1 rook-version=v1.13.1 +rook-ceph-mon-a req/upd/avl: 1/1/1 rook-version=v1.13.1 +rook-ceph-mon-b req/upd/avl: 1/1/1 rook-version=v1.13.1 +rook-ceph-mon-c req/upd/avl: 1/1/1 rook-version=v1.13.1 +rook-ceph-osd-0 req/upd/avl: 1// rook-version=v1.13.1 +rook-ceph-osd-1 req/upd/avl: 1/1/1 rook-version=v1.12.10 +rook-ceph-osd-2 req/upd/avl: 1/1/1 rook-version=v1.12.10 ``` An easy check to see if the upgrade is totally finished is to check that there is only one @@ -245,14 +245,14 @@ An easy check to see if the upgrade is totally finished is to check that there i ```console # kubectl -n $ROOK_CLUSTER_NAMESPACE get deployment -l rook_cluster=$ROOK_CLUSTER_NAMESPACE -o jsonpath='{range .items[*]}{"rook-version="}{.metadata.labels.rook-version}{"\n"}{end}' | sort | uniq This cluster is not yet finished: - rook-version=v1.12.9 - rook-version=v1.13.0 + rook-version=v1.12.10 + rook-version=v1.13.1 This cluster is finished: - rook-version=v1.13.0 + rook-version=v1.13.1 ``` ### **5. Verify the updated cluster** -At this point, the Rook operator should be running version `rook/ceph:v1.13.0`. +At this point, the Rook operator should be running version `rook/ceph:v1.13.1`. Verify the CephCluster health using the [health verification doc](health-verification.md). diff --git a/deploy/charts/rook-ceph-cluster/templates/cephcluster.yaml b/deploy/charts/rook-ceph-cluster/templates/cephcluster.yaml index 3a568e22a383..4f5c78ca1e07 100644 --- a/deploy/charts/rook-ceph-cluster/templates/cephcluster.yaml +++ b/deploy/charts/rook-ceph-cluster/templates/cephcluster.yaml @@ -15,6 +15,9 @@ spec: {{- if .Values.monitoring.externalMgrPrometheusPort }} externalMgrPrometheusPort: {{ toYaml .Values.monitoring.externalMgrPrometheusPort }} {{- end }} +{{- if .Values.monitoring.interval }} + interval: {{ .Values.monitoring.interval }} +{{- end }} {{- end }} {{ toYaml .Values.cephClusterSpec | indent 2 }} diff --git a/deploy/charts/rook-ceph-cluster/values.yaml b/deploy/charts/rook-ceph-cluster/values.yaml index 67ed48f4bc41..fb2499c47221 100644 --- a/deploy/charts/rook-ceph-cluster/values.yaml +++ b/deploy/charts/rook-ceph-cluster/values.yaml @@ -61,6 +61,8 @@ monitoring: # Monitoring settings for external clusters: # externalMgrEndpoints: # externalMgrPrometheusPort: + # Scrape interval for prometheus + # interval: 5s # allow adding custom labels and annotations to the prometheus rule prometheusRule: # -- Labels applied to PrometheusRule diff --git a/deploy/charts/rook-ceph/templates/deployment.yaml b/deploy/charts/rook-ceph/templates/deployment.yaml index e7feeb64b8a7..a0d2be74fb91 100644 --- a/deploy/charts/rook-ceph/templates/deployment.yaml +++ b/deploy/charts/rook-ceph/templates/deployment.yaml @@ -32,6 +32,9 @@ spec: key: node.kubernetes.io/unreachable operator: Exists tolerationSeconds: 5 +{{- if .Values.tolerations }} +{{ toYaml .Values.tolerations | indent 8 }} +{{- end }} containers: - name: rook-ceph-operator image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" @@ -110,10 +113,6 @@ spec: nodeSelector: {{ toYaml .Values.nodeSelector | indent 8 }} {{- end }} -{{- if 
.Values.tolerations }} - tolerations: -{{ toYaml .Values.tolerations | indent 8 }} -{{- end }} {{- if .Values.rbacEnable }} serviceAccountName: rook-ceph-system {{- end }} diff --git a/deploy/charts/rook-ceph/templates/resources.yaml b/deploy/charts/rook-ceph/templates/resources.yaml index dae49adf1ede..c188bca43c04 100644 --- a/deploy/charts/rook-ceph/templates/resources.yaml +++ b/deploy/charts/rook-ceph/templates/resources.yaml @@ -2232,7 +2232,7 @@ spec: description: DualStack determines whether Ceph daemons should listen on both IPv4 and IPv6 type: boolean hostNetwork: - description: HostNetwork to enable host network + description: HostNetwork to enable host network. If host networking is enabled or disabled on a running cluster, then the operator will automatically fail over all the mons to apply the new network settings. type: boolean ipFamily: description: IPFamily is the single stack IPv6 or IPv4 protocol @@ -2252,7 +2252,7 @@ spec: type: boolean type: object provider: - description: Provider is what provides network connectivity to the cluster e.g. "host" or "multus" + description: Provider is what provides network connectivity to the cluster e.g. "host" or "multus". If the Provider is updated from being empty to "host" on a running cluster, then the operator will automatically fail over all the mons to apply the "host" network settings. enum: - "" - host diff --git a/deploy/charts/rook-ceph/values.yaml b/deploy/charts/rook-ceph/values.yaml index 28d29289dee2..e06dcc4cdc66 100644 --- a/deploy/charts/rook-ceph/values.yaml +++ b/deploy/charts/rook-ceph/values.yaml @@ -479,7 +479,7 @@ csi: cephcsi: # -- Ceph CSI image - # @default -- `quay.io/cephcsi/cephcsi:v3.10.0` + # @default -- `quay.io/cephcsi/cephcsi:v3.10.1` image: registrar: @@ -523,7 +523,7 @@ csi: # -- Enable CSIAddons enabled: false # -- CSIAddons Sidecar image - image: "quay.io/csiaddons/k8s-sidecar:v0.7.0" + image: "quay.io/csiaddons/k8s-sidecar:v0.8.0" nfs: # -- Enable the nfs csi driver diff --git a/deploy/examples/cluster-external-management.yaml b/deploy/examples/cluster-external-management.yaml index d0a79b088b98..8201e3b9e7ad 100644 --- a/deploy/examples/cluster-external-management.yaml +++ b/deploy/examples/cluster-external-management.yaml @@ -19,4 +19,4 @@ spec: dataDirHostPath: /var/lib/rook # providing an image is required, if you want to create other CRs (rgw, mds, nfs) cephVersion: - image: quay.io/ceph/ceph:v18.2.0 # Should match external cluster version + image: quay.io/ceph/ceph:v18.2.1 # Should match external cluster version diff --git a/deploy/examples/cluster-on-local-pvc.yaml b/deploy/examples/cluster-on-local-pvc.yaml index 0418c7e79d69..96f6ac6c0345 100644 --- a/deploy/examples/cluster-on-local-pvc.yaml +++ b/deploy/examples/cluster-on-local-pvc.yaml @@ -173,7 +173,7 @@ spec: requests: storage: 10Gi cephVersion: - image: quay.io/ceph/ceph:v18.2.0 + image: quay.io/ceph/ceph:v18.2.1 allowUnsupported: false skipUpgradeChecks: false continueUpgradeAfterChecksEvenIfNotHealthy: false diff --git a/deploy/examples/cluster-on-pvc.yaml b/deploy/examples/cluster-on-pvc.yaml index 1c5bde156b1e..a55773d6de79 100644 --- a/deploy/examples/cluster-on-pvc.yaml +++ b/deploy/examples/cluster-on-pvc.yaml @@ -33,7 +33,7 @@ spec: requests: storage: 10Gi cephVersion: - image: quay.io/ceph/ceph:v18.2.0 + image: quay.io/ceph/ceph:v18.2.1 allowUnsupported: false skipUpgradeChecks: false continueUpgradeAfterChecksEvenIfNotHealthy: false diff --git a/deploy/examples/cluster-stretched-aws.yaml 
b/deploy/examples/cluster-stretched-aws.yaml index 1414586685eb..20a3a1f9fb4a 100644 --- a/deploy/examples/cluster-stretched-aws.yaml +++ b/deploy/examples/cluster-stretched-aws.yaml @@ -44,7 +44,7 @@ spec: mgr: count: 2 cephVersion: - image: quay.io/ceph/ceph:v18.2.0 + image: quay.io/ceph/ceph:v18.2.1 allowUnsupported: true skipUpgradeChecks: false continueUpgradeAfterChecksEvenIfNotHealthy: false diff --git a/deploy/examples/cluster-stretched.yaml b/deploy/examples/cluster-stretched.yaml index 9feed4a742b2..adb19a347c3f 100644 --- a/deploy/examples/cluster-stretched.yaml +++ b/deploy/examples/cluster-stretched.yaml @@ -38,7 +38,7 @@ spec: mgr: count: 2 cephVersion: - image: quay.io/ceph/ceph:v18.2.0 + image: quay.io/ceph/ceph:v18.2.1 allowUnsupported: true skipUpgradeChecks: false continueUpgradeAfterChecksEvenIfNotHealthy: false diff --git a/deploy/examples/cluster.yaml b/deploy/examples/cluster.yaml index 1045a283c5e9..9b3451112fab 100644 --- a/deploy/examples/cluster.yaml +++ b/deploy/examples/cluster.yaml @@ -21,7 +21,7 @@ spec: # versions running within the cluster. See tags available at https://hub.docker.com/r/ceph/ceph/tags/. # If you want to be more precise, you can always use a timestamp tag such as quay.io/ceph/ceph:v17.2.6-20231027 # This tag might not contain a new Ceph version, just security fixes from the underlying operating system, which will reduce vulnerabilities - image: quay.io/ceph/ceph:v18.2.0 + image: quay.io/ceph/ceph:v18.2.1 # Whether to allow unsupported versions of Ceph. Currently `quincy` and `reef` are supported. # Future versions such as `squid` (v19) would require this to be set to `true`. # Do not set to true in production. diff --git a/deploy/examples/crds.yaml b/deploy/examples/crds.yaml index fa232557c08a..f381e9c3b604 100644 --- a/deploy/examples/crds.yaml +++ b/deploy/examples/crds.yaml @@ -2230,7 +2230,7 @@ spec: description: DualStack determines whether Ceph daemons should listen on both IPv4 and IPv6 type: boolean hostNetwork: - description: HostNetwork to enable host network + description: HostNetwork to enable host network. If host networking is enabled or disabled on a running cluster, then the operator will automatically fail over all the mons to apply the new network settings. type: boolean ipFamily: description: IPFamily is the single stack IPv6 or IPv4 protocol @@ -2250,7 +2250,7 @@ spec: type: boolean type: object provider: - description: Provider is what provides network connectivity to the cluster e.g. "host" or "multus" + description: Provider is what provides network connectivity to the cluster e.g. "host" or "multus". If the Provider is updated from being empty to "host" on a running cluster, then the operator will automatically fail over all the mons to apply the "host" network settings. 
enum: - "" - host diff --git a/deploy/examples/direct-mount.yaml b/deploy/examples/direct-mount.yaml index 90cae9979a88..6cc56827bd0f 100644 --- a/deploy/examples/direct-mount.yaml +++ b/deploy/examples/direct-mount.yaml @@ -18,7 +18,7 @@ spec: dnsPolicy: ClusterFirstWithHostNet containers: - name: rook-direct-mount - image: rook/ceph:v1.13.0 + image: rook/ceph:v1.13.1 command: ["/bin/bash"] args: ["-m", "-c", "/usr/local/bin/toolbox.sh"] imagePullPolicy: IfNotPresent diff --git a/deploy/examples/images.txt b/deploy/examples/images.txt index de7ad9f7eb4b..63d10c8be30b 100644 --- a/deploy/examples/images.txt +++ b/deploy/examples/images.txt @@ -1,11 +1,11 @@ gcr.io/k8s-staging-sig-storage/objectstorage-sidecar/objectstorage-sidecar:v20230130-v0.1.0-24-gc0cf995 - quay.io/ceph/ceph:v18.2.0 + quay.io/ceph/ceph:v18.2.1 quay.io/ceph/cosi:v0.1.1 - quay.io/cephcsi/cephcsi:v3.10.0 - quay.io/csiaddons/k8s-sidecar:v0.7.0 + quay.io/cephcsi/cephcsi:v3.10.1 + quay.io/csiaddons/k8s-sidecar:v0.8.0 registry.k8s.io/sig-storage/csi-attacher:v4.4.2 registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.9.1 registry.k8s.io/sig-storage/csi-provisioner:v3.6.2 registry.k8s.io/sig-storage/csi-resizer:v1.9.2 registry.k8s.io/sig-storage/csi-snapshotter:v6.3.2 - rook/ceph:v1.13.0 + rook/ceph:v1.13.1 diff --git a/deploy/examples/multus-validation.yaml b/deploy/examples/multus-validation.yaml index c99160fb6153..febd364850b0 100644 --- a/deploy/examples/multus-validation.yaml +++ b/deploy/examples/multus-validation.yaml @@ -101,7 +101,7 @@ spec: serviceAccountName: rook-ceph-multus-validation containers: - name: multus-validation - image: rook/ceph:v1.13.0 + image: rook/ceph:v1.13.1 command: ["rook"] args: - "multus" diff --git a/deploy/examples/operator-openshift.yaml b/deploy/examples/operator-openshift.yaml index a66924f8d134..0d2f912fb3df 100644 --- a/deploy/examples/operator-openshift.yaml +++ b/deploy/examples/operator-openshift.yaml @@ -190,7 +190,7 @@ data: # The default version of CSI supported by Rook will be started. To change the version # of the CSI driver to something other than what is officially supported, change # these images to the desired release of the CSI driver. - # ROOK_CSI_CEPH_IMAGE: "quay.io/cephcsi/cephcsi:v3.10.0" + # ROOK_CSI_CEPH_IMAGE: "quay.io/cephcsi/cephcsi:v3.10.1" # ROOK_CSI_REGISTRAR_IMAGE: "registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.9.1" # ROOK_CSI_RESIZER_IMAGE: "registry.k8s.io/sig-storage/csi-resizer:v1.9.2" # ROOK_CSI_PROVISIONER_IMAGE: "registry.k8s.io/sig-storage/csi-provisioner:v3.6.2" @@ -572,7 +572,7 @@ data: CSI_ENABLE_CSIADDONS: "false" # Enable watch for faster recovery from rbd rwo node loss ROOK_WATCH_FOR_NODE_FAILURE: "true" - # ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.5.0" + # ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.8.0" # The GCSI RPC timeout value (in seconds). It should be >= 120. If this variable is not set or is an invalid value, it's default to 150. CSI_GRPC_TIMEOUT_SECONDS: "150" @@ -672,7 +672,7 @@ spec: serviceAccountName: rook-ceph-system containers: - name: rook-ceph-operator - image: rook/ceph:v1.13.0 + image: rook/ceph:v1.13.1 args: ["ceph", "operator"] securityContext: runAsNonRoot: true diff --git a/deploy/examples/operator.yaml b/deploy/examples/operator.yaml index 76a10d5479af..94169106693d 100644 --- a/deploy/examples/operator.yaml +++ b/deploy/examples/operator.yaml @@ -106,7 +106,7 @@ data: # The default version of CSI supported by Rook will be started. 
To change the version # of the CSI driver to something other than what is officially supported, change # these images to the desired release of the CSI driver. - # ROOK_CSI_CEPH_IMAGE: "quay.io/cephcsi/cephcsi:v3.10.0" + # ROOK_CSI_CEPH_IMAGE: "quay.io/cephcsi/cephcsi:v3.10.1" # ROOK_CSI_REGISTRAR_IMAGE: "registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.9.1" # ROOK_CSI_RESIZER_IMAGE: "registry.k8s.io/sig-storage/csi-resizer:v1.9.2" # ROOK_CSI_PROVISIONER_IMAGE: "registry.k8s.io/sig-storage/csi-provisioner:v3.6.2" @@ -499,7 +499,7 @@ data: CSI_ENABLE_CSIADDONS: "false" # Enable watch for faster recovery from rbd rwo node loss ROOK_WATCH_FOR_NODE_FAILURE: "true" - # ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.7.0" + # ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.8.0" # The CSI GRPC timeout value (in seconds). It should be >= 120. If this variable is not set or is an invalid value, it's default to 150. CSI_GRPC_TIMEOUT_SECONDS: "150" @@ -598,7 +598,7 @@ spec: serviceAccountName: rook-ceph-system containers: - name: rook-ceph-operator - image: rook/ceph:v1.13.0 + image: rook/ceph:v1.13.1 args: ["ceph", "operator"] securityContext: runAsNonRoot: true diff --git a/deploy/examples/osd-purge.yaml b/deploy/examples/osd-purge.yaml index aeda0e13f990..f5fd57942b6d 100644 --- a/deploy/examples/osd-purge.yaml +++ b/deploy/examples/osd-purge.yaml @@ -28,7 +28,7 @@ spec: serviceAccountName: rook-ceph-purge-osd containers: - name: osd-removal - image: rook/ceph:v1.13.0 + image: rook/ceph:v1.13.1 # TODO: Insert the OSD ID in the last parameter that is to be removed # The OSD IDs are a comma-separated list. For example: "0" or "0,2". # If you want to preserve the OSD PVCs, set `--preserve-pvc true`. diff --git a/deploy/examples/toolbox-job.yaml b/deploy/examples/toolbox-job.yaml index 10afd44b2cc0..e5778b132205 100644 --- a/deploy/examples/toolbox-job.yaml +++ b/deploy/examples/toolbox-job.yaml @@ -10,7 +10,7 @@ spec: spec: initContainers: - name: config-init - image: rook/ceph:v1.13.0 + image: rook/ceph:v1.13.1 command: ["/usr/local/bin/toolbox.sh"] args: ["--skip-watch"] imagePullPolicy: IfNotPresent @@ -29,7 +29,7 @@ spec: mountPath: /var/lib/rook-ceph-mon containers: - name: script - image: rook/ceph:v1.13.0 + image: rook/ceph:v1.13.1 volumeMounts: - mountPath: /etc/ceph name: ceph-config diff --git a/deploy/examples/toolbox-operator-image.yaml b/deploy/examples/toolbox-operator-image.yaml index 21d693bcea88..07fc123d8bd5 100644 --- a/deploy/examples/toolbox-operator-image.yaml +++ b/deploy/examples/toolbox-operator-image.yaml @@ -24,7 +24,7 @@ spec: dnsPolicy: ClusterFirstWithHostNet containers: - name: rook-ceph-tools-operator-image - image: rook/ceph:v1.13.0 + image: rook/ceph:v1.13.1 command: - /bin/bash - -c diff --git a/deploy/examples/toolbox.yaml b/deploy/examples/toolbox.yaml index fe9350f29e5a..d90bb52c94fd 100644 --- a/deploy/examples/toolbox.yaml +++ b/deploy/examples/toolbox.yaml @@ -18,7 +18,7 @@ spec: dnsPolicy: ClusterFirstWithHostNet containers: - name: rook-ceph-tools - image: quay.io/ceph/ceph:v18.2.0 + image: quay.io/ceph/ceph:v18.2.1 command: - /bin/bash - -c diff --git a/images/ceph/Makefile b/images/ceph/Makefile index 992fb1e4e747..495fc037f87e 100755 --- a/images/ceph/Makefile +++ b/images/ceph/Makefile @@ -18,9 +18,9 @@ include ../image.mk # Image Build Options ifeq ($(GOARCH),amd64) -CEPH_VERSION ?= v18.2.0-20231018 +CEPH_VERSION ?= v18.2.1-20231215 else -CEPH_VERSION ?= v18.2.0-20231018 +CEPH_VERSION ?= v18.2.1-20231215 endif REGISTRY_NAME = 
quay.io BASEIMAGE = $(REGISTRY_NAME)/ceph/ceph-$(GOARCH):$(CEPH_VERSION) diff --git a/pkg/apis/ceph.rook.io/v1/types.go b/pkg/apis/ceph.rook.io/v1/types.go index 050977f420f5..70806eb7e7af 100755 --- a/pkg/apis/ceph.rook.io/v1/types.go +++ b/pkg/apis/ceph.rook.io/v1/types.go @@ -2317,7 +2317,8 @@ type SSSDSidecarAdditionalFile struct { // NetworkSpec for Ceph includes backward compatibility code // +kubebuilder:validation:XValidation:message="at least one network selector must be specified when using multus",rule="!has(self.provider) || (self.provider != 'multus' || (self.provider == 'multus' && size(self.selectors) > 0))" type NetworkSpec struct { - // Provider is what provides network connectivity to the cluster e.g. "host" or "multus" + // Provider is what provides network connectivity to the cluster e.g. "host" or "multus". + // If the Provider is updated from being empty to "host" on a running cluster, then the operator will automatically fail over all the mons to apply the "host" network settings. // +kubebuilder:validation:XValidation:message="network provider must be disabled (reverted to empty string) before a new provider is enabled",rule="self == '' || self == oldSelf" // +nullable // +optional @@ -2363,7 +2364,9 @@ type NetworkSpec struct { // +optional Connections *ConnectionsSpec `json:"connections,omitempty"` - // HostNetwork to enable host network + // HostNetwork to enable host network. + // If host networking is enabled or disabled on a running cluster, then the operator will automatically fail over all the mons to + // apply the new network settings. // +optional HostNetwork bool `json:"hostNetwork,omitempty"` diff --git a/pkg/operator/ceph/cluster/mon/mon.go b/pkg/operator/ceph/cluster/mon/mon.go index f3012d668cff..a5961793aa11 100644 --- a/pkg/operator/ceph/cluster/mon/mon.go +++ b/pkg/operator/ceph/cluster/mon/mon.go @@ -1343,6 +1343,12 @@ func (c *Cluster) startMon(m *monConfig, schedule *controller.MonScheduleInfo) e return nil } + // skip update if mon fail over is required due to change in hostnetwork settings + if isMonIPUpdateRequiredForHostNetwork(m.DaemonName, m.UseHostNetwork, &c.spec.Network) { + c.monsToFailover.Insert(m.DaemonName) + return nil + } + // the existing deployment may have a node selector. if the cluster // isn't using host networking and the deployment is using pvc storage, // then the node selector can be removed. 
this may happen after @@ -1408,6 +1414,19 @@ func (c *Cluster) startMon(m *monConfig, schedule *controller.MonScheduleInfo) e return nil } +func isMonIPUpdateRequiredForHostNetwork(mon string, isMonUsingHostNetwork bool, network *cephv1.NetworkSpec) bool { + isHostNetworkEnabledInSpec := network.IsHost() + if isHostNetworkEnabledInSpec && !isMonUsingHostNetwork { + logger.Infof("host network is enabled for the cluster but mon %q is not running on host IP address", mon) + return true + } else if !isHostNetworkEnabledInSpec && isMonUsingHostNetwork { + logger.Infof("host network is disabled for the cluster but mon %q is still running on host IP address", mon) + return true + } + + return false +} + func hasMonPathChanged(d *apps.Deployment, claim *v1.PersistentVolumeClaim) bool { if d.Labels["pvc_name"] == "" && claim != nil { logger.Infof("skipping update for mon %q where path has changed from hostPath to pvc", d.Name) diff --git a/pkg/operator/ceph/cluster/mon/mon_test.go b/pkg/operator/ceph/cluster/mon/mon_test.go index 49be715a0bae..c23d130e36bc 100644 --- a/pkg/operator/ceph/cluster/mon/mon_test.go +++ b/pkg/operator/ceph/cluster/mon/mon_test.go @@ -44,6 +44,7 @@ import ( apps "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" ) // generate a standard mon config from a mon id w/ default port and IP 2.4.6.{1,2,3,...} @@ -120,7 +121,8 @@ func newCluster(context *clusterd.Context, namespace string, allowMultiplePerNod mapping: &opcontroller.Mapping{ Schedule: map[string]*opcontroller.MonScheduleInfo{}, }, - ownerInfo: ownerInfo, + ownerInfo: ownerInfo, + monsToFailover: sets.New[string](), } } @@ -917,3 +919,40 @@ func TestHasMonPathChanged(t *testing.T) { assert.False(t, hasMonPathChanged(monDeployment, nil)) }) } + +func TestIsMonIPUpdateRequiredForHostNetwork(t *testing.T) { + t.Run("both cluster and mon are set to use host network", func(t *testing.T) { + hostNetwork := &cephv1.NetworkSpec{HostNetwork: true} + monUsingHostNetwork := true + assert.False(t, isMonIPUpdateRequiredForHostNetwork("a", monUsingHostNetwork, hostNetwork)) + }) + + t.Run("both cluster and mon are not set for host network", func(t *testing.T) { + hostNetwork := &cephv1.NetworkSpec{} + monUsingHostNetwork := false + assert.False(t, isMonIPUpdateRequiredForHostNetwork("a", monUsingHostNetwork, hostNetwork)) + }) + t.Run("cluster is set for host networking but mon pod is not", func(t *testing.T) { + hostNetwork := &cephv1.NetworkSpec{HostNetwork: true} + monUsingHostNetwork := false + assert.True(t, isMonIPUpdateRequiredForHostNetwork("a", monUsingHostNetwork, hostNetwork)) + }) + + t.Run("mon is using host networking but cluster is updated to not use host network ", func(t *testing.T) { + hostNetwork := &cephv1.NetworkSpec{} + monUsingHostNetwork := true + assert.True(t, isMonIPUpdateRequiredForHostNetwork("a", monUsingHostNetwork, hostNetwork)) + }) + + t.Run("mon is using host networking and cluster is set host network via NetworkProviderHost ", func(t *testing.T) { + hostNetwork := &cephv1.NetworkSpec{Provider: cephv1.NetworkProviderHost} + monUsingHostNetwork := true + assert.False(t, isMonIPUpdateRequiredForHostNetwork("a", monUsingHostNetwork, hostNetwork)) + }) + + t.Run("mon is not using host networking but cluster is updated to use host network via NetworkProviderHost ", func(t *testing.T) { + hostNetwork := &cephv1.NetworkSpec{Provider: cephv1.NetworkProviderHost} + monUsingHostNetwork := false + assert.True(t, 
isMonIPUpdateRequiredForHostNetwork("a", monUsingHostNetwork, hostNetwork)) + }) +} diff --git a/pkg/operator/ceph/cluster/watcher.go b/pkg/operator/ceph/cluster/watcher.go index da55e20ce641..25edc67d074d 100644 --- a/pkg/operator/ceph/cluster/watcher.go +++ b/pkg/operator/ceph/cluster/watcher.go @@ -201,47 +201,79 @@ func (c *clientCluster) fenceNode(ctx context.Context, node *corev1.Node, cluste } logger.Debugf("volumesInuse %s", volumesInuse) - rbdVolumesInUse := getCephVolumesInUse(cluster, volumesInuse) - if len(rbdVolumesInUse) == 0 { - logger.Debugf("no rbd volumes in use for out of service node %q", node.Name) + rbdVolumesInUse, cephFSVolumeInUse := getCephVolumesInUse(cluster, volumesInuse) + if len(rbdVolumesInUse) == 0 && len(cephFSVolumeInUse) == 0 { + logger.Debugf("no rbd or cephFS volumes in use for out of service node %q", node.Name) return nil } - logger.Infof("node %q require fencing, found rbd volumes in use", node.Name) listPVs, err := c.context.Clientset.CoreV1().PersistentVolumes().List(ctx, metav1.ListOptions{}) if err != nil { return pkgerror.Wrapf(err, "failed to list PV") } - rbdPVList := listRBDPV(listPVs, cluster, rbdVolumesInUse) - if len(rbdPVList) == 0 { - logger.Debug("No rbd PVs found on the node") - return nil - } + if len(rbdVolumesInUse) != 0 { + rbdPVList := listRBDPV(listPVs, cluster, rbdVolumesInUse) + if len(rbdPVList) == 0 { + logger.Debug("No rbd PVs found on the node") + } else { + logger.Infof("node %q require fencing, found rbd volumes in use", node.Name) + clusterInfo, _, _, err := opcontroller.LoadClusterInfo(c.context, ctx, cluster.Namespace, &cluster.Spec) + if err != nil { + return pkgerror.Wrapf(err, "Failed to load cluster info.") + } - clusterInfo, _, _, err := opcontroller.LoadClusterInfo(c.context, ctx, cluster.Namespace, &cluster.Spec) - if err != nil { - return pkgerror.Wrapf(err, "Failed to load cluster info.") + for i := range rbdPVList { + err = c.fenceRbdImage(ctx, node, cluster, clusterInfo, rbdPVList[i]) + // We only need to create the network fence for any one of rbd pv. + if err == nil { + break + } + + if i == len(rbdPVList)-1 { + return pkgerror.Wrapf(err, "failed to fence rbd volumes") + } + logger.Errorf("failed to fence rbd volumes %q, trying next rbd volume", rbdPVList[i].Name) + } + } } - for i := range rbdPVList { - err = c.fenceRbdImage(ctx, node, cluster, clusterInfo, rbdPVList[i]) - // We only need to create the network fence for any one of rbd pv. - if err == nil { - break + if len(cephFSVolumeInUse) != 0 { + cephFSVolumeInUseMap := make(map[string]struct{}) + for _, vol := range cephFSVolumeInUse { + cephFSVolumeInUseMap[vol] = struct{}{} } + cephFSPVList := listRWOCephFSPV(listPVs, cluster, cephFSVolumeInUseMap) + if len(cephFSPVList) == 0 { + logger.Debug("No cephFS PVs found on the node") + return nil + } + logger.Infof("node %q require fencing, found cephFS volumes in use", node.Name) + clusterInfo, _, _, err := opcontroller.LoadClusterInfo(c.context, ctx, cluster.Namespace, &cluster.Spec) + if err != nil { + return pkgerror.Wrapf(err, "Failed to load cluster info.") + } + + for i := range cephFSPVList { + err = c.fenceCephFSVolume(ctx, node, cluster, clusterInfo, cephFSPVList[i]) + // We only need to create the network fence for any one of cephFS pv. 
+ if err == nil { + break + } - if i == len(rbdPVList)-1 { - return pkgerror.Wrapf(err, "failed to fence rbd volumes") + if i == len(cephFSPVList)-1 { + return pkgerror.Wrapf(err, "failed to fence cephFS volumes") + } + logger.Errorf("failed to fence cephFS volumes %q, trying next cephFS volume", cephFSPVList[i].Name) } - logger.Errorf("failed to fence rbd volumes %q, trying next rbd volume", rbdPVList[i].Name) + } return nil } -func getCephVolumesInUse(cluster *cephv1.CephCluster, volumesInUse []corev1.UniqueVolumeName) []string { - var rbdVolumesInUse []string +func getCephVolumesInUse(cluster *cephv1.CephCluster, volumesInUse []corev1.UniqueVolumeName) ([]string, []string) { + var rbdVolumesInUse, cephFSVolumeInUse []string for _, volume := range volumesInUse { splitVolumeInUseBased := trimeVolumeInUse(volume) @@ -250,8 +282,13 @@ func getCephVolumesInUse(cluster *cephv1.CephCluster, volumesInUse []corev1.Uniq if len(splitVolumeInUseBased) == 2 && splitVolumeInUseBased[0] == fmt.Sprintf("%s.rbd.csi.ceph.com", cluster.Namespace) { rbdVolumesInUse = append(rbdVolumesInUse, splitVolumeInUseBased[1]) } + + if len(splitVolumeInUseBased) == 2 && splitVolumeInUseBased[0] == fmt.Sprintf("%s.cephfs.csi.ceph.com", cluster.Namespace) { + cephFSVolumeInUse = append(cephFSVolumeInUse, splitVolumeInUseBased[1]) + } } - return rbdVolumesInUse + + return rbdVolumesInUse, cephFSVolumeInUse } func trimeVolumeInUse(volume corev1.UniqueVolumeName) []string { @@ -290,6 +327,36 @@ func listRBDPV(listPVs *corev1.PersistentVolumeList, cluster *cephv1.CephCluster return listRbdPV } +func listRWOCephFSPV(listPVs *corev1.PersistentVolumeList, cluster *cephv1.CephCluster, cephFSVolumesInUse map[string]struct{}) []corev1.PersistentVolume { + var listCephFSPV []corev1.PersistentVolume + + for _, pv := range listPVs.Items { + // Skip if pv is not provisioned by CSI + if pv.Spec.CSI == nil { + logger.Debugf("pv %q is not provisioned by CSI", pv.Name) + continue + } + + if pv.Spec.CSI.Driver == fmt.Sprintf("%s.cephfs.csi.ceph.com", cluster.Namespace) { + // Ignore PVs that support multinode access (RWX, ROX), since they can be mounted on multiple nodes. + if pvSupportsMultiNodeAccess(pv.Spec.AccessModes) { + continue + } + + if pv.Spec.CSI.VolumeAttributes["staticVolume"] == "true" || pv.Spec.CSI.VolumeAttributes["pool"] == "" { + logger.Debugf("skipping, static pv %q", pv.Name) + continue + } + // Check if the volume is in use + if _, exists := cephFSVolumesInUse[pv.Spec.CSI.VolumeHandle]; exists { + listCephFSPV = append(listCephFSPV, pv) + } + } + + } + return listCephFSPV +} + // pvSupportsMultiNodeAccess returns true if the PV access modes contain ReadWriteMany or ReadOnlyMany. func pvSupportsMultiNodeAccess(accessModes []corev1.PersistentVolumeAccessMode) bool { for _, accessMode := range accessModes { @@ -330,6 +397,80 @@ func (c *clientCluster) fenceRbdImage( return nil } +func (c *clientCluster) fenceCephFSVolume( + ctx context.Context, node *corev1.Node, cluster *cephv1.CephCluster, + clusterInfo *cephclient.ClusterInfo, cephFSPV corev1.PersistentVolume) error { + + logger.Infof("fencing cephfs volume %q on node %q", cephFSPV.Name, node.Name) + + status, err := cephclient.StatusWithUser(c.context, clusterInfo) + if err != nil { + return fmt.Errorf("failed to get ceph status for check active mds. 
%v", err) + } + + var activeMDS string + for _, fsRank := range status.Fsmap.ByRank { + if fsRank.Status == "up:active" { + activeMDS = fsRank.Name + } + } + + args := []string{"tell", fmt.Sprintf("mds.%s", activeMDS), "client", "ls", "--format", "json"} + cmd := cephclient.NewCephCommand(c.context, clusterInfo, args) + cmd.JsonOutput = true + + buf, err := cmd.Run() + if err != nil { + return fmt.Errorf("failed to list watchers for cephfs pool/subvoumeName %s/%s. %v", cephFSPV.Spec.CSI.VolumeAttributes["pool"], cephFSPV.Spec.CSI.VolumeAttributes["subvolumeName"], err) + } + ips, err := cephFSMDSClientMarshal(buf, cephFSPV) + if err != nil || ips == nil { + return fmt.Errorf("failed to unmarshal cephfs mds output. %v", err) + } + + err = c.createNetworkFence(ctx, cephFSPV, node, cluster, ips) + if err != nil { + return fmt.Errorf("failed to create network fence for node %q. %v", node.Name, err) + } + + return nil +} + +func cephFSMDSClientMarshal(output []byte, cephFSPV corev1.PersistentVolume) ([]string, error) { + type entity struct { + Addr struct { + Addr string `json:"addr"` + Nonce int `json:"nonce"` + } `json:"addr"` + } + + type clientMetadata struct { + Root string `json:"root"` + } + + type cephFSData struct { + Entity entity `json:"entity"` + ClientMetadata clientMetadata `json:"client_metadata"` + } + + var data []cephFSData + err := json.Unmarshal([]byte(output), &data) + if err != nil { + return []string{}, pkgerror.Wrapf(err, "failed to unmarshal cephFS data output") + } + + watcherIPlist := []string{} + for _, d := range data { + if cephFSPV.Spec.CSI.VolumeAttributes["subvolumePath"] == d.ClientMetadata.Root { + logger.Infof("cephfs mds client ips to fence %v", d.Entity.Addr) + watcherIP := concatenateWatcherIp(d.Entity.Addr.Addr) + watcherIPlist = append(watcherIPlist, watcherIP) + } + } + + return watcherIPlist, nil +} + func rbdStatusUnMarshal(output []byte) ([]string, error) { type rbdStatus struct { Watchers []struct { @@ -352,9 +493,10 @@ func rbdStatusUnMarshal(output []byte) ([]string, error) { } func concatenateWatcherIp(address string) string { - // address is in format `10.63.0.5:0/1254753579` - // split with separation ':0/' to remove nounce and concatenating `/32` to define a network with only one IP address - watcherIP := strings.Split(address, ":0/")[0] + "/32" + // address is in format `10.63.0.5:0/1254753579` for rbd and + // in the format '10.244.0.12:0' for cephfs + // split with separation ':0' to remove nounce and concatenating `/32` to define a network with only one IP address + watcherIP := strings.Split(address, ":0")[0] + "/32" return watcherIP } diff --git a/pkg/operator/ceph/cluster/watcher_test.go b/pkg/operator/ceph/cluster/watcher_test.go index 19d659437ac1..d37cb84c15d9 100644 --- a/pkg/operator/ceph/cluster/watcher_test.go +++ b/pkg/operator/ceph/cluster/watcher_test.go @@ -175,9 +175,11 @@ func TestHandleNodeFailure(t *testing.T) { switch { case command == "rbd" && args[0] == "status": return `{"watchers":[{"address":"192.168.39.137:0/3762982934","client":4307,"cookie":18446462598732840961}]}`, nil + case command == "ceph" && args[0] == "tell": + return `{"watchers":[{"id":5201,"entity":[{"addr": [{"addr": "10.244.0.12:0", "nonce":3247243972}]}]]}`, nil } - return "", errors.Errorf("unexpected rbd command %q", args) + return "", errors.Errorf("unexpected rbd/ceph command %q", args) } node := &corev1.Node{ @@ -201,6 +203,7 @@ func TestHandleNodeFailure(t *testing.T) { }, VolumesInUse: []corev1.UniqueVolumeName{ 
"kubernetes.io/csi/rook-ceph.rbd.csi.ceph.com^0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4002", + "kubernetes.io/csi/rook-ceph.cephfs.csi.ceph.com^0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4001", }, }, } @@ -228,7 +231,7 @@ func TestHandleNodeFailure(t *testing.T) { }, } - staticPV := &corev1.PersistentVolume{ + staticRbdPV := &corev1.PersistentVolume{ ObjectMeta: metav1.ObjectMeta{ Name: "pvc-58469d41-f6c0-4720-b23a-0a0826b841cb", Annotations: map[string]string{ @@ -248,6 +251,26 @@ func TestHandleNodeFailure(t *testing.T) { }, } + staticCephfsPV := &corev1.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pvc-58469d41-f6c0-4720-b23a-0a0826b842cb", + Annotations: map[string]string{ + "pv.kubernetes.io/provisioned-by": fmt.Sprintf("%s.cephfs.csi.ceph.com", ns), + "volume.kubernetes.io/provisioner-deletion-secret-name": "rook-csi-cephfs-provisioner", + "volume.kubernetes.io/provisioner-deletion-secret-namespace": ns, + }, + }, + Spec: corev1.PersistentVolumeSpec{ + PersistentVolumeSource: corev1.PersistentVolumeSource{ + CSI: &corev1.CSIPersistentVolumeSource{ + Driver: fmt.Sprintf("%s.cephfs.csi.ceph.com", ns), + VolumeHandle: "0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4001", + VolumeAttributes: map[string]string{}, + }, + }, + }, + } + pvNotProvisionByCSI := &corev1.PersistentVolume{ ObjectMeta: metav1.ObjectMeta{ Name: "pvc-58469d41-f6c0-4720-b23a-0a0826b841cc", @@ -302,30 +325,48 @@ func TestHandleNodeFailure(t *testing.T) { err = c.client.Get(ctx, types.NamespacedName{Name: node.Name, Namespace: cephCluster.Namespace}, networkFence) assert.NoError(t, err) - // For static pv - _, err = c.context.Clientset.CoreV1().PersistentVolumes().Create(ctx, staticPV, metav1.CreateOptions{}) + // For static rbd pv + _, err = c.context.Clientset.CoreV1().PersistentVolumes().Create(ctx, staticRbdPV, metav1.CreateOptions{}) assert.NoError(t, err) pvList, err := c.context.Clientset.CoreV1().PersistentVolumes().List(ctx, metav1.ListOptions{}) assert.NoError(t, err) - volumeInUse := getCephVolumesInUse(cephCluster, node.Status.VolumesInUse) - rbdPVList := listRBDPV(pvList, cephCluster, volumeInUse) - assert.Equal(t, len(rbdPVList), 1) // it will be equal to once since we have one pv provisioned by csi named `PV` + rbdVolumesInUse, _ := getCephVolumesInUse(cephCluster, node.Status.VolumesInUse) + rbdPVList := listRBDPV(pvList, cephCluster, rbdVolumesInUse) + assert.Equal(t, len(rbdPVList), 1) // it will be equal to one since we have one pv provisioned by csi named `PV` err = c.handleNodeFailure(ctx, cephCluster, node) assert.NoError(t, err) - // For static pv + // For static cephfs pv + _, err = c.context.Clientset.CoreV1().PersistentVolumes().Create(ctx, staticCephfsPV, metav1.CreateOptions{}) + assert.NoError(t, err) + + pvList, err = c.context.Clientset.CoreV1().PersistentVolumes().List(ctx, metav1.ListOptions{}) + assert.NoError(t, err) + + _, cephFSVolumesInUse := getCephVolumesInUse(cephCluster, node.Status.VolumesInUse) + cephFSVolumesInUseMap := make(map[string]struct{}) + for _, vol := range cephFSVolumesInUse { + cephFSVolumesInUseMap[vol] = struct{}{} + } + cephFSPVList := listRWOCephFSPV(pvList, cephCluster, cephFSVolumesInUseMap) + assert.Equal(t, len(cephFSPVList), 0) + + err = c.handleNodeFailure(ctx, cephCluster, node) + assert.NoError(t, err) + + // For pv not provisioned by CSI _, err = c.context.Clientset.CoreV1().PersistentVolumes().Create(ctx, pvNotProvisionByCSI, metav1.CreateOptions{}) 
@@ -302,30 +325,48 @@ func TestHandleNodeFailure(t *testing.T) {
     err = c.client.Get(ctx, types.NamespacedName{Name: node.Name, Namespace: cephCluster.Namespace}, networkFence)
     assert.NoError(t, err)
 
-    // For static pv
-    _, err = c.context.Clientset.CoreV1().PersistentVolumes().Create(ctx, staticPV, metav1.CreateOptions{})
+    // For static rbd pv
+    _, err = c.context.Clientset.CoreV1().PersistentVolumes().Create(ctx, staticRbdPV, metav1.CreateOptions{})
     assert.NoError(t, err)
 
     pvList, err := c.context.Clientset.CoreV1().PersistentVolumes().List(ctx, metav1.ListOptions{})
     assert.NoError(t, err)
 
-    volumeInUse := getCephVolumesInUse(cephCluster, node.Status.VolumesInUse)
-    rbdPVList := listRBDPV(pvList, cephCluster, volumeInUse)
-    assert.Equal(t, len(rbdPVList), 1) // it will be equal to once since we have one pv provisioned by csi named `PV`
+    rbdVolumesInUse, _ := getCephVolumesInUse(cephCluster, node.Status.VolumesInUse)
+    rbdPVList := listRBDPV(pvList, cephCluster, rbdVolumesInUse)
+    assert.Equal(t, len(rbdPVList), 1) // it will be equal to one since we have one pv provisioned by csi named `PV`
 
     err = c.handleNodeFailure(ctx, cephCluster, node)
     assert.NoError(t, err)
 
-    // For static pv
+    // For static cephfs pv
+    _, err = c.context.Clientset.CoreV1().PersistentVolumes().Create(ctx, staticCephfsPV, metav1.CreateOptions{})
+    assert.NoError(t, err)
+
+    pvList, err = c.context.Clientset.CoreV1().PersistentVolumes().List(ctx, metav1.ListOptions{})
+    assert.NoError(t, err)
+
+    _, cephFSVolumesInUse := getCephVolumesInUse(cephCluster, node.Status.VolumesInUse)
+    cephFSVolumesInUseMap := make(map[string]struct{})
+    for _, vol := range cephFSVolumesInUse {
+        cephFSVolumesInUseMap[vol] = struct{}{}
+    }
+    cephFSPVList := listRWOCephFSPV(pvList, cephCluster, cephFSVolumesInUseMap)
+    assert.Equal(t, len(cephFSPVList), 0)
+
+    err = c.handleNodeFailure(ctx, cephCluster, node)
+    assert.NoError(t, err)
+
+    // For pv not provisioned by CSI
     _, err = c.context.Clientset.CoreV1().PersistentVolumes().Create(ctx, pvNotProvisionByCSI, metav1.CreateOptions{})
     assert.NoError(t, err)
 
     pvList, err = c.context.Clientset.CoreV1().PersistentVolumes().List(ctx, metav1.ListOptions{})
     assert.NoError(t, err)
 
-    volumeInUse = getCephVolumesInUse(cephCluster, node.Status.VolumesInUse)
-    rbdPVList = listRBDPV(pvList, cephCluster, volumeInUse)
-    assert.Equal(t, len(rbdPVList), 1) // it will be equal to once since we have one pv provisioned by csi named `PV`
+    rbdVolumesInUse, _ = getCephVolumesInUse(cephCluster, node.Status.VolumesInUse)
+    rbdPVList = listRBDPV(pvList, cephCluster, rbdVolumesInUse)
+    assert.Equal(t, len(rbdPVList), 1) // it will be equal to one since we have one pv provisioned by csi named `PV`
 
     err = c.handleNodeFailure(ctx, cephCluster, node)
     assert.NoError(t, err)
@@ -345,6 +386,8 @@ func TestGetCephVolumesInUse(t *testing.T) {
     volInUse := []corev1.UniqueVolumeName{
         "kubernetes.io/csi/rook-ceph.rbd.csi.ceph.com^0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4002",
         "kubernetes.io/csi/rook-ceph.rbd.csi.ceph.com^0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4003",
+        "kubernetes.io/csi/rook-ceph.cephfs.csi.ceph.com^0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4001",
+        "kubernetes.io/csi/rook-ceph.cephfs.csi.ceph.com^0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4004",
     }
 
     splitVolInUse := trimeVolumeInUse(volInUse[0])
@@ -355,9 +398,21 @@ func TestGetCephVolumesInUse(t *testing.T) {
     assert.Equal(t, splitVolInUse[0], "rook-ceph.rbd.csi.ceph.com")
     assert.Equal(t, splitVolInUse[1], "0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4003")
 
-    trimVolInUse := getCephVolumesInUse(cephCluster, volInUse)
-    expected := []string{"0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4002", "0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4003"}
-    assert.Equal(t, expected, trimVolInUse)
+    splitVolInUse = trimeVolumeInUse(volInUse[2])
+    assert.Equal(t, splitVolInUse[0], "rook-ceph.cephfs.csi.ceph.com")
+    assert.Equal(t, splitVolInUse[1], "0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4001")
+
+    splitVolInUse = trimeVolumeInUse(volInUse[3])
+    assert.Equal(t, splitVolInUse[0], "rook-ceph.cephfs.csi.ceph.com")
+    assert.Equal(t, splitVolInUse[1], "0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4004")
+
+    trimRbdVolInUse, trimCephFSVolInUse := getCephVolumesInUse(cephCluster, volInUse)
+
+    expectedRbd := []string{"0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4002", "0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4003"}
+    expectedCephfs := []string{"0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4001", "0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4004"}
+
+    assert.Equal(t, expectedRbd, trimRbdVolInUse)
+    assert.Equal(t, expectedCephfs, trimCephFSVolInUse)
 }
 
 func TestRBDStatusUnMarshal(t *testing.T) {
diff --git a/pkg/operator/ceph/csi/secrets.go b/pkg/operator/ceph/csi/secrets.go
index 324c26a706f6..5f11dd2e1d9f 100644
--- a/pkg/operator/ceph/csi/secrets.go
+++ b/pkg/operator/ceph/csi/secrets.go
@@ -105,9 +105,10 @@ func cephCSIKeyringCephFSNodeCaps() []string {
 
 func cephCSIKeyringCephFSProvisionerCaps() []string {
     return []string{
-        "mon", "allow r",
+        "mon", "allow r, allow command 'osd blocklist'",
         "mgr", "allow rw",
         "osd", "allow rw tag cephfs metadata=*",
+        "mds", "allow *", // TODO: replace '*' with required permissions
     }
 }
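The caps change above widens the CephFS provisioner user: the `osd blocklist` mon command appears to be needed so client addresses can be blocklisted during network fencing, and the mds cap supports the `tell mds ... client ls` query. The sketch below only shows how the flat key/value caps slice pairs up into per-daemon capability strings; `renderCaps` is a hypothetical helper for illustration, not part of Rook:

```go
package main

import (
	"fmt"
	"strings"
)

// renderCaps pairs up the flat {"mon", "caps", "mgr", "caps", ...} layout used
// by the CSI keyring functions into a single "mon 'caps' mgr 'caps' ..." line.
func renderCaps(caps []string) string {
	var parts []string
	for i := 0; i+1 < len(caps); i += 2 {
		parts = append(parts, fmt.Sprintf("%s %q", caps[i], caps[i+1]))
	}
	return strings.Join(parts, " ")
}

func main() {
	// The updated cephfs provisioner caps from the change above.
	caps := []string{
		"mon", "allow r, allow command 'osd blocklist'",
		"mgr", "allow rw",
		"osd", "allow rw tag cephfs metadata=*",
		"mds", "allow *", // the PR leaves a TODO to narrow this down
	}
	fmt.Println(renderCaps(caps))
}
```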
diff --git a/pkg/operator/ceph/csi/secrets_test.go b/pkg/operator/ceph/csi/secrets_test.go
index c7c908a94b50..d76b9db2ae9f 100644
--- a/pkg/operator/ceph/csi/secrets_test.go
+++ b/pkg/operator/ceph/csi/secrets_test.go
@@ -39,5 +39,5 @@ func TestCephCSIKeyringCephFSNodeCaps(t *testing.T) {
 func TestCephCSIKeyringCephFSProvisionerCaps(t *testing.T) {
     caps := cephCSIKeyringCephFSProvisionerCaps()
 
-    assert.Equal(t, caps, []string{"mon", "allow r", "mgr", "allow rw", "osd", "allow rw tag cephfs metadata=*"})
+    assert.Equal(t, caps, []string{"mon", "allow r, allow command 'osd blocklist'", "mgr", "allow rw", "osd", "allow rw tag cephfs metadata=*", "mds", "allow *"})
 }
diff --git a/pkg/operator/ceph/csi/spec.go b/pkg/operator/ceph/csi/spec.go
index 6e8f0318b056..01c0f248b6eb 100644
--- a/pkg/operator/ceph/csi/spec.go
+++ b/pkg/operator/ceph/csi/spec.go
@@ -131,13 +131,13 @@ var (
 // manually challenging.
 var (
     // image names
-    DefaultCSIPluginImage   = "quay.io/cephcsi/cephcsi:v3.10.0"
+    DefaultCSIPluginImage   = "quay.io/cephcsi/cephcsi:v3.10.1"
     DefaultRegistrarImage   = "registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.9.1"
     DefaultProvisionerImage = "registry.k8s.io/sig-storage/csi-provisioner:v3.6.2"
     DefaultAttacherImage    = "registry.k8s.io/sig-storage/csi-attacher:v4.4.2"
     DefaultSnapshotterImage = "registry.k8s.io/sig-storage/csi-snapshotter:v6.3.2"
     DefaultResizerImage     = "registry.k8s.io/sig-storage/csi-resizer:v1.9.2"
-    DefaultCSIAddonsImage   = "quay.io/csiaddons/k8s-sidecar:v0.7.0"
+    DefaultCSIAddonsImage   = "quay.io/csiaddons/k8s-sidecar:v0.8.0"
 
     // image pull policy
     DefaultCSIImagePullPolicy = string(corev1.PullIfNotPresent)
diff --git a/pkg/operator/ceph/csi/template/cephfs/csi-cephfsplugin-provisioner-dep.yaml b/pkg/operator/ceph/csi/template/cephfs/csi-cephfsplugin-provisioner-dep.yaml
index 7b05a2c3ff34..cdf5c9c5115f 100644
--- a/pkg/operator/ceph/csi/template/cephfs/csi-cephfsplugin-provisioner-dep.yaml
+++ b/pkg/operator/ceph/csi/template/cephfs/csi-cephfsplugin-provisioner-dep.yaml
@@ -173,6 +173,10 @@ spec:
             - "--namespace=$(POD_NAMESPACE)"
             - "--pod-uid=$(POD_UID)"
             - "--stagingpath={{ .KubeletDirPath }}/plugins/kubernetes.io/csi/"
+            - "--leader-election-namespace={{ .Namespace }}"
+            - "--leader-election-lease-duration=137s"
+            - "--leader-election-renew-deadline=107s"
+            - "--leader-election-retry-period=26s"
           ports:
             - containerPort: {{ .CSIAddonsPort }}
           env:
diff --git a/pkg/operator/ceph/csi/template/cephfs/csi-cephfsplugin-svc.yaml b/pkg/operator/ceph/csi/template/cephfs/csi-cephfsplugin-svc.yaml
index 2eb4b036c9c7..890b6861466c 100644
--- a/pkg/operator/ceph/csi/template/cephfs/csi-cephfsplugin-svc.yaml
+++ b/pkg/operator/ceph/csi/template/cephfs/csi-cephfsplugin-svc.yaml
@@ -12,9 +12,5 @@ spec:
       port: 8080
       protocol: TCP
       targetPort: {{ .CephFSLivenessMetricsPort }}
-    - name: csi-grpc-metrics
-      port: 8081
-      protocol: TCP
-      targetPort: {{ .CephFSGRPCMetricsPort }}
   selector:
     contains: csi-cephfsplugin-metrics
diff --git a/pkg/operator/ceph/csi/template/rbd/csi-rbdplugin-provisioner-dep.yaml b/pkg/operator/ceph/csi/template/rbd/csi-rbdplugin-provisioner-dep.yaml
index 99a710d05be0..3abc84de1b97 100644
--- a/pkg/operator/ceph/csi/template/rbd/csi-rbdplugin-provisioner-dep.yaml
+++ b/pkg/operator/ceph/csi/template/rbd/csi-rbdplugin-provisioner-dep.yaml
@@ -145,6 +145,10 @@ spec:
             - "--namespace=$(POD_NAMESPACE)"
             - "--pod-uid=$(POD_UID)"
             - "--stagingpath={{ .KubeletDirPath }}/plugins/kubernetes.io/csi/"
+            - "--leader-election-namespace={{ .Namespace }}"
+            - "--leader-election-lease-duration=137s"
+            - "--leader-election-renew-deadline=107s"
+            - "--leader-election-retry-period=26s"
           ports:
             - containerPort: {{ .CSIAddonsPort }}
           env:
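The leader-election durations added to both csi-addons sidecars follow the ordering that, as far as I understand client-go's leader election validation, is required: lease duration above the renew deadline, and the renew deadline comfortably above the retry period. A small standalone sanity check of those particular values (the 1.2 jitter factor is an assumption about client-go's internal check):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Values from the new sidecar arguments above.
	lease, _ := time.ParseDuration("137s")
	renew, _ := time.ParseDuration("107s")
	retry, _ := time.ParseDuration("26s")

	// lease-duration must exceed renew-deadline, and renew-deadline should
	// exceed retry-period with some headroom (assumed jitter factor 1.2).
	fmt.Println(lease > renew)                       // true
	fmt.Println(float64(renew) > float64(retry)*1.2) // true
}
```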
+ - "--leader-election-retry-period=26s" ports: - containerPort: {{ .CSIAddonsPort }} env: diff --git a/pkg/operator/ceph/csi/template/rbd/csi-rbdplugin-svc.yaml b/pkg/operator/ceph/csi/template/rbd/csi-rbdplugin-svc.yaml index 6c432669c038..0493f06c7779 100644 --- a/pkg/operator/ceph/csi/template/rbd/csi-rbdplugin-svc.yaml +++ b/pkg/operator/ceph/csi/template/rbd/csi-rbdplugin-svc.yaml @@ -12,9 +12,5 @@ spec: port: 8080 protocol: TCP targetPort: {{ .RBDLivenessMetricsPort }} - - name: csi-grpc-metrics - port: 8081 - protocol: TCP - targetPort: {{ .RBDGRPCMetricsPort }} selector: contains: csi-rbdplugin-metrics diff --git a/pkg/operator/ceph/csi/util_test.go b/pkg/operator/ceph/csi/util_test.go index 9beaa1b3ce97..66ba19e74791 100644 --- a/pkg/operator/ceph/csi/util_test.go +++ b/pkg/operator/ceph/csi/util_test.go @@ -267,7 +267,7 @@ func Test_getImage(t *testing.T) { args: args{ data: map[string]string{}, settingName: "ROOK_CSI_CEPH_IMAGE", - defaultImage: "quay.io/cephcsi/cephcsi:v3.10.0", + defaultImage: "quay.io/cephcsi/cephcsi:v3.10.1", }, want: DefaultCSIPluginImage, }, @@ -278,7 +278,7 @@ func Test_getImage(t *testing.T) { "ROOK_CSI_CEPH_IMAGE": "registry.io/private/cephcsi:v8", }, settingName: "ROOK_CSI_CEPH_IMAGE", - defaultImage: "quay.io/cephcsi/cephcsi:v3.10.0", + defaultImage: "quay.io/cephcsi/cephcsi:v3.10.1", }, want: "registry.io/private/cephcsi:v8", }, @@ -289,9 +289,9 @@ func Test_getImage(t *testing.T) { "ROOK_CSI_CEPH_IMAGE": "registry.io/private/cephcsi", }, settingName: "ROOK_CSI_CEPH_IMAGE", - defaultImage: "quay.io/cephcsi/cephcsi:v3.10.0", + defaultImage: "quay.io/cephcsi/cephcsi:v3.10.1", }, - want: "registry.io/private/cephcsi:v3.10.0", + want: "registry.io/private/cephcsi:v3.10.1", }, } for _, tt := range tests { diff --git a/tests/framework/installer/ceph_helm_installer.go b/tests/framework/installer/ceph_helm_installer.go index 332a8f84d9f4..95a5f43592bc 100644 --- a/tests/framework/installer/ceph_helm_installer.go +++ b/tests/framework/installer/ceph_helm_installer.go @@ -173,6 +173,9 @@ func (h *CephInstaller) removeCephClusterHelmResources() { if err := h.k8shelper.RookClientset.CephV1().CephBlockPools(h.settings.Namespace).Delete(context.TODO(), BlockPoolName, v1.DeleteOptions{}); err != nil { assert.True(h.T(), kerrors.IsNotFound(err)) } + if err := h.k8shelper.RookClientset.CephV1().CephFilesystemSubVolumeGroups(h.settings.Namespace).Delete(context.TODO(), FilesystemName+"-csi", v1.DeleteOptions{}); err != nil { + assert.True(h.T(), kerrors.IsNotFound(err)) + } if err := h.k8shelper.RookClientset.CephV1().CephFilesystems(h.settings.Namespace).Delete(context.TODO(), FilesystemName, v1.DeleteOptions{}); err != nil { assert.True(h.T(), kerrors.IsNotFound(err)) } diff --git a/tests/scripts/collect-logs.sh b/tests/scripts/collect-logs.sh index c78259366921..aea3bf015cc8 100755 --- a/tests/scripts/collect-logs.sh +++ b/tests/scripts/collect-logs.sh @@ -18,11 +18,15 @@ $CEPH_CMD osd dump >"${LOG_DIR}"/ceph-osd-dump.txt $CEPH_CMD report >"${LOG_DIR}"/ceph-report.txt NAMESPACES=("$CLUSTER_NAMESPACE") -NAMESPACES+=("$KUBE_SYSTEM_NAMESPACE") if [[ "$OPERATOR_NAMESPACE" != "$CLUSTER_NAMESPACE" ]]; then NAMESPACES+=("$OPERATOR_NAMESPACE") fi +# Add kube-system namespace for multus test only as we need to debug network in multus test +if [ "$1" == "canary-multus" ]; then + NAMESPACES+=("$KUBE_SYSTEM_NAMESPACE") +fi + for NAMESPACE in "${NAMESPACES[@]}"; do # each namespace is a sub-directory for easier debugging NS_DIR="${LOG_DIR}"/namespace-"${NAMESPACE}"