From 52ca68d46e3f0ef3e1df8393ad8760ab5cf01efa Mon Sep 17 00:00:00 2001 From: Mathias Chapelain Date: Fri, 23 Aug 2024 15:32:56 +0200 Subject: [PATCH 1/8] osd: discover metadata and wal devices for raw device cleanup When doing the clean-up of raw device OSDs, it was not taking metadata and wal devices into account. This commit discover them via the output of `ceph-volume raw list`, which is returning `device_db` and/or `device_wal` if they are present. After discovering the metadata and wal device it also clean them and close the encrypted device if any. Signed-off-by: Mathias Chapelain --- pkg/daemon/ceph/cleanup/disk.go | 30 +++++++++++++++++++----------- pkg/daemon/ceph/osd/volume.go | 22 +++++++++++++++------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/pkg/daemon/ceph/cleanup/disk.go b/pkg/daemon/ceph/cleanup/disk.go index 484c48bf9557..e6d148a8cd85 100644 --- a/pkg/daemon/ceph/cleanup/disk.go +++ b/pkg/daemon/ceph/cleanup/disk.go @@ -179,21 +179,29 @@ func (s *DiskSanitizer) executeSanitizeCommand(osdInfo oposd.OSDInfo, wg *sync.W // On return, notify the WaitGroup that we’re done defer wg.Done() - output, err := s.context.Executor.ExecuteCommandWithCombinedOutput(shredUtility, s.buildShredArgs(osdInfo.BlockPath)...) - if err != nil { - logger.Errorf("failed to sanitize osd disk %q. %s. %v", osdInfo.BlockPath, output, err) - } + for _, device := range []string{osdInfo.BlockPath, osdInfo.MetadataPath, osdInfo.WalPath} { + if device == "" { + continue + } - logger.Infof("%s\n", output) - logger.Infof("successfully sanitized osd disk %q", osdInfo.BlockPath) + output, err := s.context.Executor.ExecuteCommandWithCombinedOutput(shredUtility, s.buildShredArgs(device)...) + + logger.Infof("%s\n", output) - // If the device is encrypted let's close it after sanitizing its content - if osdInfo.Encrypted { - err := osd.CloseEncryptedDevice(s.context, osdInfo.BlockPath) if err != nil { - logger.Errorf("failed to close encrypted osd disk %q. %v", osdInfo.BlockPath, err) + logger.Errorf("failed to sanitize osd disk %q. %s. %v", device, output, err) } else { - logger.Infof("successfully closed encrypted osd disk %q", osdInfo.BlockPath) + logger.Infof("successfully sanitized osd disk %q", device) + } + + // If the device is encrypted let's close it after sanitizing its content + if osdInfo.Encrypted { + err := osd.CloseEncryptedDevice(s.context, device) + if err != nil { + logger.Errorf("failed to close encrypted osd disk %q. 
%v", device, err) + } else { + logger.Infof("successfully closed encrypted osd disk %q", device) + } } } } diff --git a/pkg/daemon/ceph/osd/volume.go b/pkg/daemon/ceph/osd/volume.go index a01b67055e84..75029082b193 100644 --- a/pkg/daemon/ceph/osd/volume.go +++ b/pkg/daemon/ceph/osd/volume.go @@ -64,11 +64,13 @@ var ( ) type osdInfoBlock struct { - CephFsid string `json:"ceph_fsid"` - Device string `json:"device"` - OsdID int `json:"osd_id"` - OsdUUID string `json:"osd_uuid"` - Type string `json:"type"` + CephFsid string `json:"ceph_fsid"` + Device string `json:"device"` + DeviceDb string `json:"device_db"` + DeviceWal string `json:"device_wal"` + OsdID int `json:"osd_id"` + OsdUUID string `json:"osd_uuid"` + Type string `json:"type"` } type osdInfo struct { @@ -993,6 +995,8 @@ func GetCephVolumeRawOSDs(context *clusterd.Context, clusterInfo *client.Cluster // blockPath represents the path of the OSD block // it can be the one passed from the function's call or discovered by the c-v list command var blockPath string + var blockMetadataPath string + var blockWalPath string // If block is passed, check if it's an encrypted device, this is needed to get the correct // device path and populate the OSDInfo for that OSD @@ -1123,8 +1127,12 @@ func GetCephVolumeRawOSDs(context *clusterd.Context, clusterInfo *client.Cluster // If no block is specified let's take the one we discovered if setDevicePathFromList { blockPath = osdInfo.Device + blockMetadataPath = osdInfo.DeviceDb + blockWalPath = osdInfo.DeviceWal } else { blockPath = block + blockMetadataPath = metadataBlock + blockWalPath = walBlock } osdStore := osdInfo.Type @@ -1139,8 +1147,8 @@ func GetCephVolumeRawOSDs(context *clusterd.Context, clusterInfo *client.Cluster // Thus in the activation sequence we might activate the wrong OSD and have OSDInfo messed up // Hence, let's use the PVC name instead which will always remain consistent BlockPath: blockPath, - MetadataPath: metadataBlock, - WalPath: walBlock, + MetadataPath: blockMetadataPath, + WalPath: blockWalPath, SkipLVRelease: true, LVBackedPV: lvBackedPV, CVMode: cvMode, From 1ee58cc5bc7ddbdd121edd46202c41ade411af07 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:13:48 +0000 Subject: [PATCH 2/8] build(deps): bump actions/setup-python from 5.1.1 to 5.2.0 Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5.1.1 to 5.2.0. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/39cd14951b08e74b54015e9e001cdefcf80e669f...f677139bbe7f9c59b41e40162b753c062f5d49a3) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- .github/workflows/docs-check.yml | 2 +- .github/workflows/helm-lint.yaml | 2 +- .github/workflows/linters.yaml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docs-check.yml b/.github/workflows/docs-check.yml index b443286fdc85..c060ce5d4b74 100644 --- a/.github/workflows/docs-check.yml +++ b/.github/workflows/docs-check.yml @@ -32,7 +32,7 @@ jobs: with: go-version: "1.22" - - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.9 diff --git a/.github/workflows/helm-lint.yaml b/.github/workflows/helm-lint.yaml index f06e9edd970c..013074f2ec48 100644 --- a/.github/workflows/helm-lint.yaml +++ b/.github/workflows/helm-lint.yaml @@ -35,7 +35,7 @@ jobs: with: version: v3.6.2 - - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.9 diff --git a/.github/workflows/linters.yaml b/.github/workflows/linters.yaml index 3ce34e4ef14a..d995b051c75e 100644 --- a/.github/workflows/linters.yaml +++ b/.github/workflows/linters.yaml @@ -28,7 +28,7 @@ jobs: fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.9 @@ -46,7 +46,7 @@ jobs: fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.9 From fc9cea164e05ce9eaf4b29d41fe6ab4bdac3241e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:13:57 +0000 Subject: [PATCH 3/8] build(deps): bump actions/upload-artifact from 4.3.6 to 4.4.0 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.3.6 to 4.4.0. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/834a144ee995460fba8ed112a2fc961b36a5ec5a...50769540e7f4bd5e21e526ee35c689e35e0d6874) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- .github/workflows/daily-nightly-jobs.yml | 20 +++++++++---------- .../integration-test-helm-suite.yaml | 2 +- .../integration-test-keystone-auth-suite.yaml | 2 +- .../workflows/integration-test-mgr-suite.yaml | 2 +- .../integration-test-multi-cluster-suite.yaml | 2 +- .../integration-test-object-suite.yaml | 2 +- .../integration-test-smoke-suite.yaml | 2 +- .../integration-test-upgrade-suite.yaml | 4 ++-- .../integration-tests-on-release.yaml | 12 +++++------ .github/workflows/scorecards.yml | 2 +- 10 files changed, 25 insertions(+), 25 deletions(-) diff --git a/.github/workflows/daily-nightly-jobs.yml b/.github/workflows/daily-nightly-jobs.yml index 735129fa4c77..af1676170776 100644 --- a/.github/workflows/daily-nightly-jobs.yml +++ b/.github/workflows/daily-nightly-jobs.yml @@ -112,7 +112,7 @@ jobs: name: canary-arm64 - name: upload canary test result - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: always() with: name: canary-arm64 @@ -152,7 +152,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-smoke-suite-quincy-artifact @@ -192,7 +192,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-smoke-suite-reef-artifact @@ -232,7 +232,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-smoke-suite-squid-artifact @@ -272,7 +272,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-smoke-suite-master-artifact @@ -312,7 +312,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-object-suite-quincy-artifact @@ -352,7 +352,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-object-suite-master-artifact @@ -392,7 +392,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-upgrade-suite-squid-artifact @@ -433,7 +433,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-upgrade-suite-reef-artifact @@ -473,7 +473,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: 
actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-upgrade-suite-quincy-artifact diff --git a/.github/workflows/integration-test-helm-suite.yaml b/.github/workflows/integration-test-helm-suite.yaml index 334d5992a277..8b9d7e42682d 100644 --- a/.github/workflows/integration-test-helm-suite.yaml +++ b/.github/workflows/integration-test-helm-suite.yaml @@ -64,7 +64,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-helm-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/.github/workflows/integration-test-keystone-auth-suite.yaml b/.github/workflows/integration-test-keystone-auth-suite.yaml index e52f7932636d..dada9abca9cf 100644 --- a/.github/workflows/integration-test-keystone-auth-suite.yaml +++ b/.github/workflows/integration-test-keystone-auth-suite.yaml @@ -62,7 +62,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-keystone-auth-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/.github/workflows/integration-test-mgr-suite.yaml b/.github/workflows/integration-test-mgr-suite.yaml index 6389f2a3c764..d88e14f67321 100644 --- a/.github/workflows/integration-test-mgr-suite.yaml +++ b/.github/workflows/integration-test-mgr-suite.yaml @@ -59,7 +59,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-mgr-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/.github/workflows/integration-test-multi-cluster-suite.yaml b/.github/workflows/integration-test-multi-cluster-suite.yaml index 874074875fee..101fbf2e3841 100644 --- a/.github/workflows/integration-test-multi-cluster-suite.yaml +++ b/.github/workflows/integration-test-multi-cluster-suite.yaml @@ -63,7 +63,7 @@ jobs: CLUSTER_NAMESPACE="multi-external" tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-multi-cluster-deploy-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/.github/workflows/integration-test-object-suite.yaml b/.github/workflows/integration-test-object-suite.yaml index 7513b7c144fd..b50e26f52c39 100644 --- a/.github/workflows/integration-test-object-suite.yaml +++ b/.github/workflows/integration-test-object-suite.yaml @@ -60,7 +60,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-object-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/.github/workflows/integration-test-smoke-suite.yaml b/.github/workflows/integration-test-smoke-suite.yaml index 8b1d18fba568..829f1a14459a 100644 --- a/.github/workflows/integration-test-smoke-suite.yaml +++ 
b/.github/workflows/integration-test-smoke-suite.yaml @@ -60,7 +60,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-smoke-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/.github/workflows/integration-test-upgrade-suite.yaml b/.github/workflows/integration-test-upgrade-suite.yaml index 694d60fe7de3..26c6e2fbf7dc 100644 --- a/.github/workflows/integration-test-upgrade-suite.yaml +++ b/.github/workflows/integration-test-upgrade-suite.yaml @@ -60,7 +60,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-upgrade-suite-artifact-${{ matrix.kubernetes-versions }} @@ -107,7 +107,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-upgrade-helm-suite-artifact-${{ matrix.kubernetes-versions }} diff --git a/.github/workflows/integration-tests-on-release.yaml b/.github/workflows/integration-tests-on-release.yaml index 3a0ec705212d..acc16b6987b9 100644 --- a/.github/workflows/integration-tests-on-release.yaml +++ b/.github/workflows/integration-tests-on-release.yaml @@ -50,7 +50,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-helm-suite-artifact-${{ matrix.kubernetes-versions }} @@ -91,7 +91,7 @@ jobs: CLUSTER_NAMESPACE="multi-external" tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-multi-cluster-deploy-suite-artifact-${{ matrix.kubernetes-versions }} @@ -129,7 +129,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-smoke-suite-artifact-${{ matrix.kubernetes-versions }} @@ -167,7 +167,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-upgrade-suite-artifact-${{ matrix.kubernetes-versions }} @@ -208,7 +208,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-upgrade-suite-artifact-${{ matrix.kubernetes-versions }} @@ -246,7 +246,7 @@ jobs: tests/scripts/collect-logs.sh - name: Artifact - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 if: failure() with: name: ceph-object-suite-artifact-${{ 
matrix.kubernetes-versions }} diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 00f556c37324..f21a5f2d275b 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -55,7 +55,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: name: SARIF file path: results.sarif From 4458eae5c858cf321fbb48801c7ed850730f5aff Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:14:04 +0000 Subject: [PATCH 4/8] build(deps): bump github/codeql-action from 3.26.5 to 3.26.6 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.26.5 to 3.26.6. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/2c779ab0d087cd7fe7b826087247c2c81f27bfa6...4dd16135b69a43b6c8efb853346f8437d92d3c93) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/scorecards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 00f556c37324..09d2301d4241 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -64,6 +64,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard (optional). # Commenting out will disable upload of results to your repo's Code Scanning dashboard - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@2c779ab0d087cd7fe7b826087247c2c81f27bfa6 # v3.26.5 + uses: github/codeql-action/upload-sarif@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 with: sarif_file: results.sarif From bf99a14c71b962082464f49ce06633f060e877b9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:31:43 +0000 Subject: [PATCH 5/8] build(deps): bump github.com/csi-addons/kubernetes-csi-addons Bumps the github-dependencies group with 1 update: [github.com/csi-addons/kubernetes-csi-addons](https://github.com/csi-addons/kubernetes-csi-addons). Updates `github.com/csi-addons/kubernetes-csi-addons` from 0.9.0 to 0.9.1 - [Release notes](https://github.com/csi-addons/kubernetes-csi-addons/releases) - [Commits](https://github.com/csi-addons/kubernetes-csi-addons/compare/v0.9.0...v0.9.1) --- updated-dependencies: - dependency-name: github.com/csi-addons/kubernetes-csi-addons dependency-type: direct:production update-type: version-update:semver-patch dependency-group: github-dependencies ... 
Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index e770421d9085..076aa298e388 100644 --- a/go.mod +++ b/go.mod @@ -21,7 +21,7 @@ require ( github.com/ceph/ceph-csi-operator/api v0.0.0-20240819112305-88e6db254d6c github.com/ceph/go-ceph v0.29.0 github.com/coreos/pkg v0.0.0-20230601102743-20bbbf26f4d8 - github.com/csi-addons/kubernetes-csi-addons v0.9.0 + github.com/csi-addons/kubernetes-csi-addons v0.9.1 github.com/gemalto/kmip-go v0.0.10 github.com/go-ini/ini v1.67.0 github.com/google/go-cmp v0.6.0 diff --git a/go.sum b/go.sum index 87ef32262286..1faf19988f81 100644 --- a/go.sum +++ b/go.sum @@ -213,8 +213,8 @@ github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7Do github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= -github.com/csi-addons/kubernetes-csi-addons v0.9.0 h1:Hhb44WcrxtbzmpLY+uqX+DBWCI6HgA/rwQMPyvsyCc8= -github.com/csi-addons/kubernetes-csi-addons v0.9.0/go.mod h1:/YROZDdEi1N/1Ls9rdU5W2VNjm8MK7HHApl8W4Sqt9s= +github.com/csi-addons/kubernetes-csi-addons v0.9.1 h1:2m8/Pls7Ws3ld1zr/w6lL6BWwsXqqLg2JnW0jP8AX6I= +github.com/csi-addons/kubernetes-csi-addons v0.9.1/go.mod h1:32kTa/Ngp7hMK2GEjx+Zk8yfKupR5WG4JG+oRzkM1TM= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= From 05d579b607226b35ccdb83033ef79caff5e68ebf Mon Sep 17 00:00:00 2001 From: Madhu Rajanna Date: Mon, 2 Sep 2024 12:20:04 +0200 Subject: [PATCH 6/8] csi: update csi-addons to v0.9.1 updating csi-addons to latest v0.9.1 release. Signed-off-by: Madhu Rajanna --- Documentation/Helm-Charts/operator-chart.md | 2 +- .../Ceph-CSI/ceph-csi-drivers.md | 20 +++++++++---------- .../Ceph-CSI/custom-images.md | 2 +- deploy/charts/rook-ceph/values.yaml | 2 +- deploy/examples/images.txt | 2 +- deploy/examples/operator-openshift.yaml | 2 +- deploy/examples/operator.yaml | 2 +- go.mod | 2 +- go.sum | 4 ++-- pkg/operator/ceph/csi/spec.go | 2 +- tests/scripts/csiaddons.sh | 2 +- 11 files changed, 21 insertions(+), 21 deletions(-) diff --git a/Documentation/Helm-Charts/operator-chart.md b/Documentation/Helm-Charts/operator-chart.md index 2a7173c73949..f3816c76c320 100644 --- a/Documentation/Helm-Charts/operator-chart.md +++ b/Documentation/Helm-Charts/operator-chart.md @@ -67,7 +67,7 @@ The following table lists the configurable parameters of the rook-operator chart | `csi.clusterName` | Cluster name identifier to set as metadata on the CephFS subvolume and RBD images. 
This will be useful in cases like for example, when two container orchestrator clusters (Kubernetes/OCP) are using a single ceph cluster | `nil` | | `csi.csiAddons.enabled` | Enable CSIAddons | `false` | | `csi.csiAddons.repository` | CSIAddons sidecar image repository | `"quay.io/csiaddons/k8s-sidecar"` | -| `csi.csiAddons.tag` | CSIAddons sidecar image tag | `"v0.9.0"` | +| `csi.csiAddons.tag` | CSIAddons sidecar image tag | `"v0.9.1"` | | `csi.csiAddonsPort` | CSI Addons server port | `9070` | | `csi.csiCephFSPluginResource` | CEPH CSI CephFS plugin resource requirement list | see values.yaml | | `csi.csiCephFSPluginVolume` | The volume of the CephCSI CephFS plugin DaemonSet | `nil` | diff --git a/Documentation/Storage-Configuration/Ceph-CSI/ceph-csi-drivers.md b/Documentation/Storage-Configuration/Ceph-CSI/ceph-csi-drivers.md index 9b6e986cd36e..b904d2da8ab5 100644 --- a/Documentation/Storage-Configuration/Ceph-CSI/ceph-csi-drivers.md +++ b/Documentation/Storage-Configuration/Ceph-CSI/ceph-csi-drivers.md @@ -166,9 +166,9 @@ that the controller inspects and forwards to one or more CSI-Addons sidecars for Deploy the controller by running the following commands: ```console -kubectl create -f https://github.com/csi-addons/kubernetes-csi-addons/releases/download/v0.9.0/crds.yaml -kubectl create -f https://github.com/csi-addons/kubernetes-csi-addons/releases/download/v0.9.0/rbac.yaml -kubectl create -f https://github.com/csi-addons/kubernetes-csi-addons/releases/download/v0.9.0/setup-controller.yaml +kubectl create -f https://github.com/csi-addons/kubernetes-csi-addons/releases/download/v0.9.1/crds.yaml +kubectl create -f https://github.com/csi-addons/kubernetes-csi-addons/releases/download/v0.9.1/rbac.yaml +kubectl create -f https://github.com/csi-addons/kubernetes-csi-addons/releases/download/v0.9.1/setup-controller.yaml ``` This creates the required CRDs and configures permissions. 
@@ -196,15 +196,15 @@ Execute the following to enable the CSI-Addons sidecars: CSI-Addons supports the following operations: * Reclaim Space - * [Creating a ReclaimSpaceJob](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.0/docs/reclaimspace.md#reclaimspacejob) - * [Creating a ReclaimSpaceCronJob](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.0/docs/reclaimspace.md#reclaimspacecronjob) - * [Annotating PersistentVolumeClaims](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.0/docs/reclaimspace.md#annotating-perstentvolumeclaims) - * [Annotating Namespace](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.0/docs/reclaimspace.md#annotating-namespace) + * [Creating a ReclaimSpaceJob](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.1/docs/reclaimspace.md#reclaimspacejob) + * [Creating a ReclaimSpaceCronJob](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.1/docs/reclaimspace.md#reclaimspacecronjob) + * [Annotating PersistentVolumeClaims](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.1/docs/reclaimspace.md#annotating-perstentvolumeclaims) + * [Annotating Namespace](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.1/docs/reclaimspace.md#annotating-namespace) * Network Fencing - * [Creating a NetworkFence](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.0/docs/networkfence.md) + * [Creating a NetworkFence](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.1/docs/networkfence.md) * Volume Replication - * [Creating VolumeReplicationClass](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.0/docs/volumereplicationclass.md) - * [Creating VolumeReplication CR](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.0/docs/volumereplication.md) + * [Creating VolumeReplicationClass](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.1/docs/volumereplicationclass.md) + * [Creating VolumeReplication CR](https://github.com/csi-addons/kubernetes-csi-addons/blob/v0.9.1/docs/volumereplication.md) ## Enable RBD and CephFS Encryption Support diff --git a/Documentation/Storage-Configuration/Ceph-CSI/custom-images.md b/Documentation/Storage-Configuration/Ceph-CSI/custom-images.md index 59c0575c5a9a..e3dedf6351c6 100644 --- a/Documentation/Storage-Configuration/Ceph-CSI/custom-images.md +++ b/Documentation/Storage-Configuration/Ceph-CSI/custom-images.md @@ -24,7 +24,7 @@ ROOK_CSI_PROVISIONER_IMAGE: "registry.k8s.io/sig-storage/csi-provisioner:v5.0.1" ROOK_CSI_ATTACHER_IMAGE: "registry.k8s.io/sig-storage/csi-attacher:v4.6.1" ROOK_CSI_RESIZER_IMAGE: "registry.k8s.io/sig-storage/csi-resizer:v1.11.1" ROOK_CSI_SNAPSHOTTER_IMAGE: "registry.k8s.io/sig-storage/csi-snapshotter:v8.0.1" -ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.9.0" +ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.9.1" ``` ### **Use private repository** diff --git a/deploy/charts/rook-ceph/values.yaml b/deploy/charts/rook-ceph/values.yaml index f93548f38619..1b931a492693 100644 --- a/deploy/charts/rook-ceph/values.yaml +++ b/deploy/charts/rook-ceph/values.yaml @@ -537,7 +537,7 @@ csi: # -- CSIAddons sidecar image repository repository: quay.io/csiaddons/k8s-sidecar # -- CSIAddons sidecar image tag - tag: v0.9.0 + tag: v0.9.1 nfs: # -- Enable the nfs csi driver diff --git a/deploy/examples/images.txt b/deploy/examples/images.txt index 88a25cdd0df7..12ccd7dfd6ee 100644 --- a/deploy/examples/images.txt +++ b/deploy/examples/images.txt @@ -3,7 +3,7 @@ quay.io/ceph/ceph:v18.2.4 
quay.io/ceph/cosi:v0.1.2 quay.io/cephcsi/cephcsi:v3.12.0 - quay.io/csiaddons/k8s-sidecar:v0.9.0 + quay.io/csiaddons/k8s-sidecar:v0.9.1 registry.k8s.io/sig-storage/csi-attacher:v4.6.1 registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.11.1 registry.k8s.io/sig-storage/csi-provisioner:v5.0.1 diff --git a/deploy/examples/operator-openshift.yaml b/deploy/examples/operator-openshift.yaml index 84e4aeae7383..7d0c052d59e2 100644 --- a/deploy/examples/operator-openshift.yaml +++ b/deploy/examples/operator-openshift.yaml @@ -558,7 +558,7 @@ data: CSI_ENABLE_CSIADDONS: "false" # Enable watch for faster recovery from rbd rwo node loss ROOK_WATCH_FOR_NODE_FAILURE: "true" - # ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.9.0" + # ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.9.1" # The GCSI RPC timeout value (in seconds). It should be >= 120. If this variable is not set or is an invalid value, it's default to 150. CSI_GRPC_TIMEOUT_SECONDS: "150" diff --git a/deploy/examples/operator.yaml b/deploy/examples/operator.yaml index 454380848316..7f86068f50d9 100644 --- a/deploy/examples/operator.yaml +++ b/deploy/examples/operator.yaml @@ -508,7 +508,7 @@ data: CSI_ENABLE_CSIADDONS: "false" # Enable watch for faster recovery from rbd rwo node loss ROOK_WATCH_FOR_NODE_FAILURE: "true" - # ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.9.0" + # ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.9.1" # The CSI GRPC timeout value (in seconds). It should be >= 120. If this variable is not set or is an invalid value, it's default to 150. CSI_GRPC_TIMEOUT_SECONDS: "150" diff --git a/go.mod b/go.mod index e770421d9085..076aa298e388 100644 --- a/go.mod +++ b/go.mod @@ -21,7 +21,7 @@ require ( github.com/ceph/ceph-csi-operator/api v0.0.0-20240819112305-88e6db254d6c github.com/ceph/go-ceph v0.29.0 github.com/coreos/pkg v0.0.0-20230601102743-20bbbf26f4d8 - github.com/csi-addons/kubernetes-csi-addons v0.9.0 + github.com/csi-addons/kubernetes-csi-addons v0.9.1 github.com/gemalto/kmip-go v0.0.10 github.com/go-ini/ini v1.67.0 github.com/google/go-cmp v0.6.0 diff --git a/go.sum b/go.sum index 87ef32262286..1faf19988f81 100644 --- a/go.sum +++ b/go.sum @@ -213,8 +213,8 @@ github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7Do github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= -github.com/csi-addons/kubernetes-csi-addons v0.9.0 h1:Hhb44WcrxtbzmpLY+uqX+DBWCI6HgA/rwQMPyvsyCc8= -github.com/csi-addons/kubernetes-csi-addons v0.9.0/go.mod h1:/YROZDdEi1N/1Ls9rdU5W2VNjm8MK7HHApl8W4Sqt9s= +github.com/csi-addons/kubernetes-csi-addons v0.9.1 h1:2m8/Pls7Ws3ld1zr/w6lL6BWwsXqqLg2JnW0jP8AX6I= +github.com/csi-addons/kubernetes-csi-addons v0.9.1/go.mod h1:32kTa/Ngp7hMK2GEjx+Zk8yfKupR5WG4JG+oRzkM1TM= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= diff --git a/pkg/operator/ceph/csi/spec.go b/pkg/operator/ceph/csi/spec.go index 631b0446f01c..4be362ed25bb 100644 --- a/pkg/operator/ceph/csi/spec.go +++ b/pkg/operator/ceph/csi/spec.go @@ -157,7 +157,7 @@ var ( DefaultAttacherImage = "registry.k8s.io/sig-storage/csi-attacher:v4.6.1" DefaultSnapshotterImage = 
"registry.k8s.io/sig-storage/csi-snapshotter:v8.0.1" DefaultResizerImage = "registry.k8s.io/sig-storage/csi-resizer:v1.11.1" - DefaultCSIAddonsImage = "quay.io/csiaddons/k8s-sidecar:v0.9.0" + DefaultCSIAddonsImage = "quay.io/csiaddons/k8s-sidecar:v0.9.1" // image pull policy DefaultCSIImagePullPolicy = string(corev1.PullIfNotPresent) diff --git a/tests/scripts/csiaddons.sh b/tests/scripts/csiaddons.sh index 401061da3305..60daf61e217e 100755 --- a/tests/scripts/csiaddons.sh +++ b/tests/scripts/csiaddons.sh @@ -16,7 +16,7 @@ set -xEo pipefail -CSIADDONS_VERSION="v0.9.0" +CSIADDONS_VERSION="v0.9.1" CSIADDONS_CRD_NAME="csiaddonsnodes.csiaddons.openshift.io" CSIADDONS_CONTAINER_NAME="csi-addons" From 8987d26a6d89d1be9cb68097579238a6d32798b7 Mon Sep 17 00:00:00 2001 From: Travis Nielsen Date: Wed, 28 Aug 2024 15:20:47 -0600 Subject: [PATCH 7/8] mon: remove extra mon from quorum before taking down pod When removing a mon from quorum, there is a race condition that can result in mon quorum going being lost at least temporarily. The mon pod was being deleted first, and then the mon removed from quorum. If any other mon went down between the time the pod of the bad mon was deleted and when the mon was removed from quorum, there may not be sufficient quorum to complete the action of removing the mon from quorum and the operator would be stuck. For example, there could be 4 mons temporarily due to timing of upgrading K8s nodes where mons may be taken down for some number of minutes. Say a new mon is started while the down mon also comes back up. Now the operator sees it can remove the 4th mon from quorum, so it starts to remove it. Now say another mon goes down on another node that is being updated or otherwise drained. Since the 4th mon pod was deleted and another mon is down, there are only two mons remaining in quorum, but 3 mons are required in quorum when there are 4 mons. Therefore, the quorum is stuck until the third mon comes back up. The solution is to first remove the extra mon from quorum before taking down the mon pod. Signed-off-by: Travis Nielsen --- pkg/operator/ceph/cluster/mon/health.go | 56 +++++++++++++---------- pkg/operator/ceph/cluster/mon/mon.go | 34 +++++++++++++- pkg/operator/ceph/cluster/mon/mon_test.go | 44 ++++++++++++++++++ 3 files changed, 108 insertions(+), 26 deletions(-) diff --git a/pkg/operator/ceph/cluster/mon/health.go b/pkg/operator/ceph/cluster/mon/health.go index df8de52878a0..a8fe149fe02c 100644 --- a/pkg/operator/ceph/cluster/mon/health.go +++ b/pkg/operator/ceph/cluster/mon/health.go @@ -730,20 +730,6 @@ func (c *Cluster) removeMonWithOptionalQuorum(daemonName string, shouldRemoveFro } logger.Infof("ensuring removal of unhealthy monitor %s", daemonName) - resourceName := resourceName(daemonName) - - // Remove the mon pod if it is still there - var gracePeriod int64 - propagation := metav1.DeletePropagationForeground - options := &metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod, PropagationPolicy: &propagation} - if err := c.context.Clientset.AppsV1().Deployments(c.Namespace).Delete(c.ClusterInfo.Context, resourceName, *options); err != nil { - if kerrors.IsNotFound(err) { - logger.Infof("dead mon %s was already gone", resourceName) - } else { - logger.Errorf("failed to remove dead mon deployment %q. 
%v", resourceName, err) - } - } - // Remove the bad monitor from quorum if shouldRemoveFromQuorum { if err := c.removeMonitorFromQuorum(daemonName); err != nil { @@ -751,10 +737,33 @@ func (c *Cluster) removeMonWithOptionalQuorum(daemonName string, shouldRemoveFro } } delete(c.ClusterInfo.Monitors, daemonName) - delete(c.mapping.Schedule, daemonName) + if err := c.saveMonConfig(); err != nil { + return errors.Wrapf(err, "failed to save mon config after failing over mon %s", daemonName) + } + + // Update cluster-wide RBD bootstrap peer token since Monitors have changed + _, err := controller.CreateBootstrapPeerSecret(c.context, c.ClusterInfo, &cephv1.CephCluster{ObjectMeta: metav1.ObjectMeta{Name: c.ClusterInfo.NamespacedName().Name, Namespace: c.Namespace}}, c.ownerInfo) + if err != nil { + return errors.Wrap(err, "failed to update cluster rbd bootstrap peer token") + } + + // When the mon is removed from quorum, it is possible that the operator will be restarted + // before the mon pod is deleted. In this case, the operator will need to delete the mon + // during the next reconcile. If the reconcile finds an extra mon pod, it will be removed + // at that later reconcile. Thus, we delete the mon pod last during the failover + // and in case the failover is interrupted, the operator can detect the resources to finish the cleanup. + c.removeMonResources(daemonName) + return nil +} + +func (c *Cluster) removeMonResources(daemonName string) { // Remove the service endpoint + resourceName := resourceName(daemonName) + var gracePeriod int64 + propagation := metav1.DeletePropagationForeground + options := &metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod, PropagationPolicy: &propagation} if err := c.context.Clientset.CoreV1().Services(c.Namespace).Delete(c.ClusterInfo.Context, resourceName, *options); err != nil { if kerrors.IsNotFound(err) { logger.Infof("dead mon service %s was already gone", resourceName) @@ -772,17 +781,14 @@ func (c *Cluster) removeMonWithOptionalQuorum(daemonName string, shouldRemoveFro } } - if err := c.saveMonConfig(); err != nil { - return errors.Wrapf(err, "failed to save mon config after failing over mon %s", daemonName) - } - - // Update cluster-wide RBD bootstrap peer token since Monitors have changed - _, err := controller.CreateBootstrapPeerSecret(c.context, c.ClusterInfo, &cephv1.CephCluster{ObjectMeta: metav1.ObjectMeta{Name: c.ClusterInfo.NamespacedName().Name, Namespace: c.Namespace}}, c.ownerInfo) - if err != nil { - return errors.Wrap(err, "failed to update cluster rbd bootstrap peer token") + // Remove the mon pod if it is still there + if err := c.context.Clientset.AppsV1().Deployments(c.Namespace).Delete(c.ClusterInfo.Context, resourceName, *options); err != nil { + if kerrors.IsNotFound(err) { + logger.Infof("dead mon %s was already gone", resourceName) + } else { + logger.Errorf("failed to remove dead mon deployment %q. 
%v", resourceName, err) + } } - - return nil } func (c *Cluster) removeMonitorFromQuorum(name string) error { diff --git a/pkg/operator/ceph/cluster/mon/mon.go b/pkg/operator/ceph/cluster/mon/mon.go index d80839d65c2c..17258a5322d5 100644 --- a/pkg/operator/ceph/cluster/mon/mon.go +++ b/pkg/operator/ceph/cluster/mon/mon.go @@ -1060,7 +1060,39 @@ func (c *Cluster) startDeployments(mons []*monConfig, requireAllInQuorum bool) e requireAllInQuorum = true } } - return c.waitForMonsToJoin(mons, requireAllInQuorum) + err = c.waitForMonsToJoin(mons, requireAllInQuorum) + + // Check for the rare case of an extra mon deployment that needs to be cleaned up + c.checkForExtraMonResources(mons, deployments.Items) + return err +} + +func (c *Cluster) checkForExtraMonResources(mons []*monConfig, deployments []apps.Deployment) string { + if len(deployments) <= c.spec.Mon.Count || len(deployments) <= len(mons) { + return "" + } + + // If there are more deployments than expected mons from the ceph quorum, + // find the extra mon deployment and clean it up. + logger.Infof("there is an extra mon deployment that is not needed and not in quorum") + for _, deploy := range deployments { + monName := deploy.Labels[controller.DaemonIDLabel] + found := false + // Search for the mon in the list of mons expected in quorum + for _, monDaemon := range mons { + if monName == monDaemon.DaemonName { + found = true + break + } + } + if !found { + logger.Infof("deleting extra mon deployment %q", deploy.Name) + c.removeMonResources(monName) + return monName + } + } + + return "" } func (c *Cluster) waitForMonsToJoin(mons []*monConfig, requireAllInQuorum bool) error { diff --git a/pkg/operator/ceph/cluster/mon/mon_test.go b/pkg/operator/ceph/cluster/mon/mon_test.go index fb8643d8c251..9d27be4014c9 100644 --- a/pkg/operator/ceph/cluster/mon/mon_test.go +++ b/pkg/operator/ceph/cluster/mon/mon_test.go @@ -676,6 +676,50 @@ func TestMonVolumeClaimTemplate(t *testing.T) { }) } } + +func TestRemoveExtraMonDeployments(t *testing.T) { + namespace := "ns" + context, err := newTestStartCluster(t, namespace) + assert.NoError(t, err) + c := newCluster(context, namespace, true, v1.ResourceRequirements{}) + c.ClusterInfo = clienttest.CreateTestClusterInfo(1) + + // Nothing to remove when the mons match the deployment + mons := []*monConfig{ + {ResourceName: "rook-ceph-mon-a", DaemonName: "a"}, + } + deployments := []apps.Deployment{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "rook-ceph-mon-a", + Labels: map[string]string{"ceph_daemon_id": "a"}, + }, + }, + } + c.spec.Mon.Count = 1 + removed := c.checkForExtraMonResources(mons, deployments) + assert.Equal(t, "", removed) + + // Remove an extra mon deployment + deployments = append(deployments, apps.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rook-ceph-mon-b", + Labels: map[string]string{"ceph_daemon_id": "b"}, + }}) + removed = c.checkForExtraMonResources(mons, deployments) + assert.Equal(t, "b", removed) + + // Nothing to remove when there are not enough deployments for the expected mons + mons = []*monConfig{ + {ResourceName: "rook-ceph-mon-a", DaemonName: "a"}, + {ResourceName: "rook-ceph-mon-b", DaemonName: "b"}, + {ResourceName: "rook-ceph-mon-c", DaemonName: "c"}, + } + c.spec.Mon.Count = 3 + removed = c.checkForExtraMonResources(mons, deployments) + assert.Equal(t, "", removed) +} + func TestStretchMonVolumeClaimTemplate(t *testing.T) { generalSC := "generalSC" zoneSC := "zoneSC" From e3785883591ca6710a04e89bd4f0f88079775d81 Mon Sep 17 00:00:00 2001 From: Michael Adam Date: 
Fri, 16 Aug 2024 19:04:01 +0200 Subject: [PATCH 8/8] network: add a new operator config setting ROOK_ENFORCE_HOSTNETWORK This new setting is of Boolean type and defaults to "false". When set to "true", it changes the behavior of the rook operator to nable host network on all pods created by the cephcluster controller new method to check the setting: opcontroller.EnForceHostNetwork() Signed-off-by: Michael Adam --- pkg/apis/ceph.rook.io/v1/network.go | 17 ++++++++++-- pkg/apis/ceph.rook.io/v1/network_test.go | 24 ++++++++++++++++- pkg/operator/ceph/cluster/cleanup.go | 2 ++ .../ceph/cluster/osd/provision_spec.go | 3 ++- pkg/operator/ceph/controller.go | 1 + .../ceph/controller/controller_utils.go | 20 ++++++++++++-- .../ceph/controller/controller_utils_test.go | 27 +++++++++++++++++++ pkg/operator/ceph/csi/spec.go | 3 +++ pkg/operator/ceph/object/cosi/spec.go | 2 ++ pkg/operator/discover/discover.go | 3 ++- .../k8sutil/cmdreporter/cmdreporter.go | 2 ++ 11 files changed, 97 insertions(+), 7 deletions(-) diff --git a/pkg/apis/ceph.rook.io/v1/network.go b/pkg/apis/ceph.rook.io/v1/network.go index c950eed22b43..debe0d39f7c5 100644 --- a/pkg/apis/ceph.rook.io/v1/network.go +++ b/pkg/apis/ceph.rook.io/v1/network.go @@ -27,6 +27,11 @@ import ( "github.com/pkg/errors" ) +// enforceHostNetwork is a private package variable that can be set via the rook-operator-config +// setting "ROOK_ENFORCE_HOST_NETWORK". when set to "true", it lets rook create all pods with host network enabled. +// This can be used, for example, to run Rook in k8s clusters with no CNI where host networking is required +var enforceHostNetwork bool = false + // IsMultus get whether to use multus network provider func (n *NetworkSpec) IsMultus() bool { return n.Provider == NetworkProviderMultus @@ -40,7 +45,7 @@ func (n *NetworkSpec) IsMultus() bool { // together with an empty or unset network provider has the same effect as // network.Provider set to "host" func (n *NetworkSpec) IsHost() bool { - return (n.HostNetwork && n.Provider == NetworkProviderDefault) || n.Provider == NetworkProviderHost + return enforceHostNetwork || (n.HostNetwork && n.Provider == NetworkProviderDefault) || n.Provider == NetworkProviderHost } func ValidateNetworkSpec(clusterNamespace string, spec NetworkSpec) error { @@ -62,7 +67,7 @@ func ValidateNetworkSpec(clusterNamespace string, spec NetworkSpec) error { if !spec.AddressRanges.IsEmpty() { if !spec.IsMultus() && !spec.IsHost() { - // TODO: be sure to update docs that AddressRanges can be specified for host networking as + // TODO: be sure to update docs that AddressRanges can be specified for host networking as // well as multus so that the override configmap doesn't need to be set return errors.Errorf("network ranges can only be specified for %q and %q network providers", NetworkProviderHost, NetworkProviderMultus) } @@ -181,3 +186,11 @@ func (l *CIDRList) String() string { } return strings.Join(sl, ", ") } + +func SetEnforceHostNetwork(val bool) { + enforceHostNetwork = val +} + +func EnforceHostNetwork() bool { + return enforceHostNetwork +} diff --git a/pkg/apis/ceph.rook.io/v1/network_test.go b/pkg/apis/ceph.rook.io/v1/network_test.go index 6c6980f2ce8b..7269a86df979 100644 --- a/pkg/apis/ceph.rook.io/v1/network_test.go +++ b/pkg/apis/ceph.rook.io/v1/network_test.go @@ -75,6 +75,8 @@ func TestValidateNetworkSpec(t *testing.T) { // test the NetworkSpec.IsHost method with different network providers // Also test it in combination with the legacy // "HostNetwork" setting. 
+// Also test the effect of the operator config setting +// ROOK_ENFORCE_HOST_NETWORK. func TestNetworkCephIsHost(t *testing.T) { net := NetworkSpec{HostNetwork: false} @@ -85,6 +87,14 @@ func TestNetworkCephIsHost(t *testing.T) { net.HostNetwork = true assert.True(t, net.IsHost()) + // enforcing does not change the result if host network is selected + // anyway in the cluster. + SetEnforceHostNetwork(true) + assert.True(t, net.IsHost()) + + SetEnforceHostNetwork(false) + assert.True(t, net.IsHost()) + net = NetworkSpec{} net.Provider = NetworkProviderDefault net.HostNetwork = false @@ -95,16 +105,28 @@ func TestNetworkCephIsHost(t *testing.T) { net.HostNetwork = false assert.False(t, net.IsHost()) + // test that not enforcing does not change the result. + SetEnforceHostNetwork(false) + assert.False(t, net.IsHost()) + + // test enforcing of host network + SetEnforceHostNetwork(true) + assert.True(t, net.IsHost()) + + SetEnforceHostNetwork(false) net = NetworkSpec{} net.Provider = NetworkProviderMultus net.HostNetwork = true assert.False(t, net.IsHost()) - // test with nonempty but invalid provider + // test with nonempty but invalid provider net = NetworkSpec{} net.HostNetwork = true net.Provider = "foo" + SetEnforceHostNetwork(false) assert.False(t, net.IsHost()) + SetEnforceHostNetwork(true) + assert.True(t, net.IsHost()) } diff --git a/pkg/operator/ceph/cluster/cleanup.go b/pkg/operator/ceph/cluster/cleanup.go index 92b2933591ab..eb50d1a3354d 100644 --- a/pkg/operator/ceph/cluster/cleanup.go +++ b/pkg/operator/ceph/cluster/cleanup.go @@ -30,6 +30,7 @@ import ( "github.com/rook/rook/pkg/operator/ceph/cluster/osd" "github.com/rook/rook/pkg/operator/ceph/cluster/rbd" "github.com/rook/rook/pkg/operator/ceph/controller" + opcontroller "github.com/rook/rook/pkg/operator/ceph/controller" "github.com/rook/rook/pkg/operator/ceph/file/mds" "github.com/rook/rook/pkg/operator/ceph/file/mirror" "github.com/rook/rook/pkg/operator/ceph/object" @@ -162,6 +163,7 @@ func (c *ClusterController) cleanUpJobTemplateSpec(cluster *cephv1.CephCluster, RestartPolicy: v1.RestartPolicyOnFailure, PriorityClassName: cephv1.GetCleanupPriorityClassName(cluster.Spec.PriorityClassNames), ServiceAccountName: k8sutil.DefaultServiceAccount, + HostNetwork: opcontroller.EnforceHostNetwork(), }, } diff --git a/pkg/operator/ceph/cluster/osd/provision_spec.go b/pkg/operator/ceph/cluster/osd/provision_spec.go index 9e3673dfbb0b..4839ea6a230b 100644 --- a/pkg/operator/ceph/cluster/osd/provision_spec.go +++ b/pkg/operator/ceph/cluster/osd/provision_spec.go @@ -28,6 +28,7 @@ import ( "github.com/rook/rook/pkg/operator/ceph/cluster/mon" "github.com/rook/rook/pkg/operator/ceph/cluster/osd/config" "github.com/rook/rook/pkg/operator/ceph/controller" + opcontroller "github.com/rook/rook/pkg/operator/ceph/controller" "github.com/rook/rook/pkg/operator/k8sutil" batch "k8s.io/api/batch/v1" v1 "k8s.io/api/core/v1" @@ -154,7 +155,7 @@ func (c *Cluster) provisionPodTemplateSpec(osdProps osdProperties, restart v1.Re }, RestartPolicy: restart, Volumes: volumes, - HostNetwork: c.spec.Network.IsHost(), + HostNetwork: opcontroller.EnforceHostNetwork(), PriorityClassName: cephv1.GetOSDPriorityClassName(c.spec.PriorityClassNames), SchedulerName: osdProps.schedulerName, } diff --git a/pkg/operator/ceph/controller.go b/pkg/operator/ceph/controller.go index 0e32a898a1a5..238dd29bd43b 100644 --- a/pkg/operator/ceph/controller.go +++ b/pkg/operator/ceph/controller.go @@ -132,6 +132,7 @@ func (r *ReconcileConfig) reconcile(request reconcile.Request) 
(reconcile.Result } opcontroller.SetAllowLoopDevices(r.config.Parameters) + opcontroller.SetEnforceHostNetwork(r.config.Parameters) logger.Infof("%s done reconciling", controllerName) return reconcile.Result{}, nil diff --git a/pkg/operator/ceph/controller/controller_utils.go b/pkg/operator/ceph/controller/controller_utils.go index b726e2e97aed..f5cc1ca1da27 100644 --- a/pkg/operator/ceph/controller/controller_utils.go +++ b/pkg/operator/ceph/controller/controller_utils.go @@ -50,7 +50,9 @@ type ClusterHealth struct { const ( // OperatorSettingConfigMapName refers to ConfigMap that configures rook ceph operator - OperatorSettingConfigMapName string = "rook-ceph-operator-config" + OperatorSettingConfigMapName string = "rook-ceph-operator-config" + enforceHostNetworkSettingName string = "ROOK_ENFORCE_HOST_NETWORK" + enforceHostNetworkDefaultValue string = "false" // UninitializedCephConfigError refers to the error message printed by the Ceph CLI when there is no ceph configuration file // This typically is raised when the operator has not finished initializing @@ -116,6 +118,21 @@ func LoopDevicesAllowed() bool { return loopDevicesAllowed } +func SetEnforceHostNetwork(data map[string]string) { + strval := k8sutil.GetValue(data, enforceHostNetworkSettingName, enforceHostNetworkDefaultValue) + val, err := strconv.ParseBool(strval) + if err != nil { + logger.Warningf("failed to parse value %q for %q. assuming false value", strval, enforceHostNetworkSettingName) + cephv1.SetEnforceHostNetwork(false) + return + } + cephv1.SetEnforceHostNetwork(val) +} + +func EnforceHostNetwork() bool { + return cephv1.EnforceHostNetwork() +} + // canIgnoreHealthErrStatusInReconcile determines whether a status of HEALTH_ERR in the CephCluster can be ignored safely. func canIgnoreHealthErrStatusInReconcile(cephCluster cephv1.CephCluster, controllerName string) bool { // Get a list of all the keys causing the HEALTH_ERR status. 
@@ -153,7 +170,6 @@ func IsReadyToReconcile(ctx context.Context, c client.Client, namespacedName typ return cephCluster, false, cephClusterExists, WaitForRequeueIfCephClusterNotReady } cephCluster = clusterList.Items[0] - // If the cluster has a cleanup policy to destroy the cluster and it has been marked for deletion, treat it as if it does not exist if cephCluster.Spec.CleanupPolicy.HasDataDirCleanPolicy() && !cephCluster.DeletionTimestamp.IsZero() { logger.Infof("%q: CephCluster has a destructive cleanup policy, allowing %q to be deleted", controllerName, namespacedName) diff --git a/pkg/operator/ceph/controller/controller_utils_test.go b/pkg/operator/ceph/controller/controller_utils_test.go index eadcb2376c40..532cc0bcbecc 100644 --- a/pkg/operator/ceph/controller/controller_utils_test.go +++ b/pkg/operator/ceph/controller/controller_utils_test.go @@ -106,6 +106,33 @@ func TestSetAllowLoopDevices(t *testing.T) { assert.True(t, LoopDevicesAllowed()) } +func TestSetEnforceHostNetwork(t *testing.T) { + logger.Infof("testing default value for %v", enforceHostNetworkSettingName) + opConfig := map[string]string{} + SetEnforceHostNetwork(opConfig) + assert.False(t, EnforceHostNetwork()) + + // test invalid setting + var value string = "foo" + logger.Infof("testing invalid value'%v' for %v", value, enforceHostNetworkSettingName) + opConfig[enforceHostNetworkSettingName] = value + SetEnforceHostNetwork(opConfig) + assert.False(t, EnforceHostNetwork()) + + // test valid settings + value = "true" + logger.Infof("testing valid value'%v' for %v", value, enforceHostNetworkSettingName) + opConfig[enforceHostNetworkSettingName] = value + SetEnforceHostNetwork(opConfig) + assert.True(t, EnforceHostNetwork()) + + value = "false" + logger.Infof("testing valid value'%v' for %v", value, enforceHostNetworkSettingName) + opConfig[enforceHostNetworkSettingName] = value + SetEnforceHostNetwork(opConfig) + assert.False(t, EnforceHostNetwork()) +} + func TestIsReadyToReconcile(t *testing.T) { scheme := scheme.Scheme scheme.AddKnownTypes(cephv1.SchemeGroupVersion, &cephv1.CephCluster{}, &cephv1.CephClusterList{}) diff --git a/pkg/operator/ceph/csi/spec.go b/pkg/operator/ceph/csi/spec.go index 4be362ed25bb..d63e7bc9da2b 100644 --- a/pkg/operator/ceph/csi/spec.go +++ b/pkg/operator/ceph/csi/spec.go @@ -382,6 +382,7 @@ func (r *ReconcileCSI) startDrivers(ver *version.Info, ownerInfo *k8sutil.OwnerI if tp.CSILogRotation { applyLogrotateSidecar(&rbdProvisionerDeployment.Spec.Template, "csi-rbd-deployment-log-collector", LogrotateTemplatePath, tp) } + rbdProvisionerDeployment.Spec.Template.Spec.HostNetwork = opcontroller.EnforceHostNetwork() // Create service if either liveness or GRPC metrics are enabled. if CSIParam.EnableLiveness { @@ -419,6 +420,7 @@ func (r *ReconcileCSI) startDrivers(ver *version.Info, ownerInfo *k8sutil.OwnerI if tp.CSILogRotation { applyLogrotateSidecar(&cephfsProvisionerDeployment.Spec.Template, "csi-cephfs-deployment-log-collector", LogrotateTemplatePath, tp) } + cephfsProvisionerDeployment.Spec.Template.Spec.HostNetwork = opcontroller.EnforceHostNetwork() // Create service if either liveness or GRPC metrics are enabled. 
if CSIParam.EnableLiveness { @@ -457,6 +459,7 @@ func (r *ReconcileCSI) startDrivers(ver *version.Info, ownerInfo *k8sutil.OwnerI if tp.CSILogRotation { applyLogrotateSidecar(&nfsProvisionerDeployment.Spec.Template, "csi-nfs-deployment-log-collector", LogrotateTemplatePath, tp) } + nfsProvisionerDeployment.Spec.Template.Spec.HostNetwork = opcontroller.EnforceHostNetwork() enabledDrivers = append(enabledDrivers, driverDetails{ name: NFSDriverShortName, diff --git a/pkg/operator/ceph/object/cosi/spec.go b/pkg/operator/ceph/object/cosi/spec.go index f01ebabbd662..50bf737c5bab 100644 --- a/pkg/operator/ceph/object/cosi/spec.go +++ b/pkg/operator/ceph/object/cosi/spec.go @@ -20,6 +20,7 @@ import ( "github.com/pkg/errors" cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1" "github.com/rook/rook/pkg/operator/ceph/controller" + opcontroller "github.com/rook/rook/pkg/operator/ceph/controller" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -83,6 +84,7 @@ func createCOSIPodSpec(cephCOSIDriver *cephv1.CephCOSIDriver) (corev1.PodTemplat cosiSideCarContainer := createCOSISideCarContainer(cephCOSIDriver) podSpec := corev1.PodSpec{ + HostNetwork: opcontroller.EnforceHostNetwork(), Containers: []corev1.Container{ cosiDriverContainer, cosiSideCarContainer, diff --git a/pkg/operator/discover/discover.go b/pkg/operator/discover/discover.go index bf543524b328..ee4cdb3f6369 100644 --- a/pkg/operator/discover/discover.go +++ b/pkg/operator/discover/discover.go @@ -31,6 +31,7 @@ import ( "github.com/rook/rook/pkg/clusterd" discoverDaemon "github.com/rook/rook/pkg/daemon/discover" "github.com/rook/rook/pkg/operator/ceph/controller" + opcontroller "github.com/rook/rook/pkg/operator/ceph/controller" k8sutil "github.com/rook/rook/pkg/operator/k8sutil" "github.com/rook/rook/pkg/util/sys" @@ -173,7 +174,7 @@ func (d *Discover) createDiscoverDaemonSet(ctx context.Context, namespace, disco }, }, }, - HostNetwork: false, + HostNetwork: opcontroller.EnforceHostNetwork(), PriorityClassName: k8sutil.GetValue(data, discoverDaemonsetPriorityClassNameEnv, ""), }, }, diff --git a/pkg/operator/k8sutil/cmdreporter/cmdreporter.go b/pkg/operator/k8sutil/cmdreporter/cmdreporter.go index affc87558f9e..b651def6d24a 100644 --- a/pkg/operator/k8sutil/cmdreporter/cmdreporter.go +++ b/pkg/operator/k8sutil/cmdreporter/cmdreporter.go @@ -25,6 +25,7 @@ import ( "github.com/coreos/pkg/capnslog" "github.com/pkg/errors" + cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1" "github.com/rook/rook/pkg/daemon/util" "github.com/rook/rook/pkg/operator/k8sutil" batch "k8s.io/api/batch/v1" @@ -302,6 +303,7 @@ func (cr *cmdReporterCfg) initJobSpec() (*batch.Job, error) { }, RestartPolicy: v1.RestartPolicyOnFailure, ServiceAccountName: k8sutil.DefaultServiceAccount, + HostNetwork: cephv1.EnforceHostNetwork(), } copyBinsVol, _ := copyBinariesVolAndMount() podSpec.Volumes = []v1.Volume{copyBinsVol}