
Merge pull request #621 from red-hat-storage/sync_us--master
Syncing latest changes from upstream master for rook
subhamkrai authored Apr 12, 2024
2 parents f8e5f81 + 5d2c92b commit eb8c76f
Showing 28 changed files with 1,073 additions and 35 deletions.
1 change: 1 addition & 0 deletions Documentation/CRDs/Cluster/ceph-cluster-crd.md
@@ -38,6 +38,7 @@ Settings can be specified at the global level to apply to the cluster as a whole
If this value is empty, each pod will get an ephemeral directory to store their config files that is tied to the lifetime of the pod running on that node. More details can be found in the Kubernetes [empty dir docs](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir).
* `skipUpgradeChecks`: if set to true, Rook won't perform any upgrade checks on Ceph daemons during an upgrade. Use this at **YOUR OWN RISK**, only if you know what you're doing. To understand Rook's upgrade process for Ceph, read the [upgrade doc](../../Upgrade/rook-upgrade.md#ceph-version-upgrades).
* `continueUpgradeAfterChecksEvenIfNotHealthy`: if set to true, Rook will continue the OSD daemon upgrade process even if the PGs are not clean, or continue with the MDS upgrade even if the file system is not healthy.
* `upgradeOSDRequiresHealthyPGs`: if set to true, the OSD upgrade process won't start until PGs are healthy (see the combined sketch below).
* `dashboard`: Settings for the Ceph dashboard. To view the dashboard in your browser see the [dashboard guide](../../Storage-Configuration/Monitoring/ceph-dashboard.md).
* `enabled`: Whether to enable the dashboard to view cluster status
* `urlPrefix`: Allows serving the dashboard under a subpath (useful when accessing the dashboard via a reverse proxy)
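
For orientation, here is a minimal sketch of how the upgrade-gating settings and the dashboard block from this file sit together in a `CephCluster` spec. The metadata names, image tag, and URL subpath are illustrative assumptions, not values from this commit:

```yaml
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: rook-ceph                      # hypothetical cluster name
  namespace: rook-ceph
spec:
  cephVersion:
    image: quay.io/ceph/ceph:v18.2.2   # hypothetical image tag
  dataDirHostPath: /var/lib/rook
  skipUpgradeChecks: false
  continueUpgradeAfterChecksEvenIfNotHealthy: false
  # The new gate added by this commit: OSD upgrades wait for healthy PGs.
  # Ignored when skipUpgradeChecks is true.
  upgradeOSDRequiresHealthyPGs: true
  dashboard:
    enabled: true
    urlPrefix: /ceph-dashboard         # hypothetical subpath behind a reverse proxy
```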
@@ -32,6 +32,10 @@ spec:
distributed: 1 # distributed=<0, 1> (disabled=0)
# export: # export=<0-256> (disabled=-1)
# random: # random=[0.0, 1.0](disabled=0.0)
# Quota size of the subvolume group.
#quota: 10G
# data pool name for the subvolume group layout instead of the default data pool.
#dataPoolName: myfs-replicated
```

## Settings
@@ -48,7 +52,11 @@ If any setting is unspecified, a suitable default will be used automatically.

* `filesystemName`: The metadata name of the CephFilesystem CR where the subvolume group will be created.

* `pinning`: To distribute load across MDS ranks in predictable and stable ways. Reference: https://docs.ceph.com/en/latest/cephfs/fs-volumes/#pinning-subvolumes-and-subvolume-groups.
* `quota`: Quota size of the Ceph Filesystem subvolume group.

* `dataPoolName`: The data pool name for the subvolume group layout instead of the default data pool (shown in context in the combined example below).

* `pinning`: To distribute load across MDS ranks in predictable and stable ways. See the Ceph doc for [Pinning subvolume groups](https://docs.ceph.com/en/latest/cephfs/fs-volumes/#pinning-subvolumes-and-subvolume-groups).
    * `distributed`: Range: <0, 1>; set to 0 to disable
    * `export`: Range: <0-256>; set to -1 to disable
    * `random`: Range: [0.0, 1.0]; set to 0.0 to disable
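
Putting the settings above together, a minimal hypothetical `CephFilesystemSubVolumeGroup` CR might look like the following. The names are illustrative; `myfs`/`myfs-replicated` follow the defaults used elsewhere in these docs:

```yaml
apiVersion: ceph.rook.io/v1
kind: CephFilesystemSubVolumeGroup
metadata:
  name: group-a                   # hypothetical subvolume group name
  namespace: rook-ceph
spec:
  filesystemName: myfs            # metadata name of an existing CephFilesystem CR
  pinning:
    distributed: 1                # only one of distributed/export/random may be set
  quota: 10G                      # any Kubernetes resource.Quantity (e.g. 10G, 10Gi) is accepted
  dataPoolName: myfs-replicated   # overrides the default data pool for the layout
```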
76 changes: 76 additions & 0 deletions Documentation/CRDs/specification.md
@@ -945,6 +945,20 @@ The default wait timeout is 10 minutes.</p>
</tr>
<tr>
<td>
<code>upgradeOSDRequiresHealthyPGs</code><br/>
<em>
bool
</em>
</td>
<td>
<em>(Optional)</em>
<p>UpgradeOSDRequiresHealthyPGs defines whether the OSD upgrade requires PGs to be clean. If set to <code>true</code>, the OSD upgrade process won&rsquo;t start until PGs are healthy.
This configuration will be ignored if <code>skipUpgradeChecks</code> is <code>true</code>.
Default is false.</p>
</td>
</tr>
<tr>
<td>
<code>disruptionManagement</code><br/>
<em>
<a href="#ceph.rook.io/v1.DisruptionManagementSpec">
@@ -1562,6 +1576,30 @@ reference <a href="https://docs.ceph.com/en/latest/cephfs/fs-volumes/#pinning-su
only one out of (export, distributed, random) can be set at a time</p>
</td>
</tr>
<tr>
<td>
<code>quota</code><br/>
<em>
k8s.io/apimachinery/pkg/api/resource.Quantity
</em>
</td>
<td>
<em>(Optional)</em>
<p>Quota size of the Ceph Filesystem subvolume group.</p>
</td>
</tr>
<tr>
<td>
<code>dataPoolName</code><br/>
<em>
string
</em>
</td>
<td>
<em>(Optional)</em>
<p>The data pool name for the Ceph Filesystem subvolume group layout, if the default CephFS pool is not desired.</p>
</td>
</tr>
</table>
</td>
</tr>
@@ -3702,6 +3740,30 @@ reference <a href="https://docs.ceph.com/en/latest/cephfs/fs-volumes/#pinning-su
only one out of (export, distributed, random) can be set at a time</p>
</td>
</tr>
<tr>
<td>
<code>quota</code><br/>
<em>
k8s.io/apimachinery/pkg/api/resource.Quantity
</em>
</td>
<td>
<em>(Optional)</em>
<p>Quota size of the Ceph Filesystem subvolume group.</p>
</td>
</tr>
<tr>
<td>
<code>dataPoolName</code><br/>
<em>
string
</em>
</td>
<td>
<em>(Optional)</em>
<p>The data pool name for the Ceph Filesystem subvolume group layout, if the default CephFS pool is not desired.</p>
</td>
</tr>
</tbody>
</table>
<h3 id="ceph.rook.io/v1.CephFilesystemSubVolumeGroupSpecPinning">CephFilesystemSubVolumeGroupSpecPinning
@@ -4370,6 +4432,20 @@ The default wait timeout is 10 minutes.</p>
</tr>
<tr>
<td>
<code>upgradeOSDRequiresHealthyPGs</code><br/>
<em>
bool
</em>
</td>
<td>
<em>(Optional)</em>
<p>UpgradeOSDRequiresHealthyPGs defines whether the OSD upgrade requires PGs to be clean. If set to <code>true</code>, the OSD upgrade process won&rsquo;t start until PGs are healthy.
This configuration will be ignored if <code>skipUpgradeChecks</code> is <code>true</code>.
Default is false.</p>
</td>
</tr>
<tr>
<td>
<code>disruptionManagement</code><br/>
<em>
<a href="#ceph.rook.io/v1.DisruptionManagementSpec">
24 changes: 24 additions & 0 deletions Documentation/Storage-Configuration/ceph-teardown.md
@@ -2,6 +2,11 @@
title: Cleanup
---

Rook provides the following cleanup options:

1. [Uninstall: Clean up the entire cluster and delete all data](#cleaning-up-a-cluster)
1. [Force delete individual resources](#force-delete-resources)

## Cleaning up a Cluster

To tear down the cluster, the following resources need to be cleaned up:
@@ -179,3 +184,22 @@ If the operator is not able to remove the finalizers (i.e., the operator is not
kubectl -n rook-ceph patch configmap rook-ceph-mon-endpoints --type merge -p '{"metadata":{"finalizers": []}}'
kubectl -n rook-ceph patch secrets rook-ceph-mon --type merge -p '{"metadata":{"finalizers": []}}'
```

## Force Delete Resources

To keep your data safe in the cluster, Rook disallows deleting critical cluster resources by default. To override this behavior and force delete a specific custom resource, add the annotation `rook.io/force-deletion="true"` to the resource and then delete it. Rook will start a cleanup job that deletes all the related Ceph resources created by that custom resource.

For example, run the following commands to clean up the `CephFilesystemSubVolumeGroup` resource named `my-subvolumegroup`:

``` console
kubectl -n rook-ceph annotate cephfilesystemsubvolumegroups.ceph.rook.io my-subvolumegroup rook.io/force-deletion="true"
kubectl -n rook-ceph delete cephfilesystemsubvolumegroups.ceph.rook.io my-subvolumegroup
```

Once the cleanup job has completed successfully, Rook will remove the finalizers from the deleted custom resource.

This cleanup is supported only for the following custom resources:

| Custom Resource | Ceph Resources to be cleaned up |
| -------- | ------- |
| CephFilesystemSubVolumeGroup | CSI-stored RADOS OMAP details for PVCs/volume snapshots, subvolume snapshots, subvolume clones, subvolumes |
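
The same opt-in can be expressed declaratively by setting the annotation in the resource manifest before deletion; a hypothetical fragment:

```yaml
apiVersion: ceph.rook.io/v1
kind: CephFilesystemSubVolumeGroup
metadata:
  name: my-subvolumegroup
  namespace: rook-ceph
  annotations:
    rook.io/force-deletion: "true"   # opts this resource into Rook's cleanup job when deleted
```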
2 changes: 2 additions & 0 deletions build/csv/ceph/ceph.rook.io_cephclusters.yaml
@@ -2951,6 +2951,8 @@ spec:
type: object
type: array
type: object
upgradeOSDRequiresHealthyPGs:
type: boolean
waitTimeoutForHealthyOSDInMinutes:
format: int64
type: integer
@@ -33,6 +33,8 @@ spec:
type: object
spec:
properties:
dataPoolName:
type: string
filesystemName:
type: string
x-kubernetes-validations:
@@ -67,6 +69,12 @@ spec:
|| (!has(self.export) && has(self.distributed) && !has(self.random))
|| (!has(self.export) && !has(self.distributed) && has(self.random))
|| (!has(self.export) && !has(self.distributed) && !has(self.random))
quota:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
required:
- filesystemName
type: object
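
Both scalar forms below satisfy the int-or-string `quota` schema above; the values are illustrative fragments, not full CRs:

```yaml
# integer form (bytes)
quota: 10737418240
---
# string form, matching the quantity pattern (10Gi == 10737418240 bytes)
quota: 10Gi
```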
71 changes: 62 additions & 9 deletions cmd/rook/ceph/cleanup.go
@@ -17,12 +17,14 @@ limitations under the License.
package ceph

import (
"fmt"
"os"

"github.com/rook/rook/cmd/rook/rook"
cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
cleanup "github.com/rook/rook/pkg/daemon/ceph/cleanup"
"github.com/rook/rook/pkg/daemon/ceph/client"
opcontroller "github.com/rook/rook/pkg/operator/ceph/controller"
"github.com/rook/rook/pkg/operator/k8sutil"
"github.com/rook/rook/pkg/util/flags"
"github.com/spf13/cobra"
@@ -40,22 +42,39 @@ var (

var cleanUpCmd = &cobra.Command{
Use: "clean",
Short: "Starts the cleanup process on the disks after ceph cluster is deleted",
Short: "Starts the cleanup process",
}

var cleanUpHostCmd = &cobra.Command{
Use: "host",
Short: "Starts the cleanup process on a host after the ceph cluster is deleted",
}

var cleanUpSubVolumeGroupCmd = &cobra.Command{
// the subcommand name matches the CRD kind of the custom resource to be cleaned up
Use: "CephFilesystemSubVolumeGroup",
Short: "Starts the cleanup process of a CephFilesystemSubVolumeGroup",
}

func init() {
cleanUpCmd.Flags().StringVar(&dataDirHostPath, "data-dir-host-path", "", "dataDirHostPath on the node")
cleanUpCmd.Flags().StringVar(&namespaceDir, "namespace-dir", "", "dataDirHostPath on the node")
cleanUpCmd.Flags().StringVar(&monSecret, "mon-secret", "", "monitor secret from the keyring")
cleanUpCmd.Flags().StringVar(&clusterFSID, "cluster-fsid", "", "ceph cluster fsid")
cleanUpCmd.Flags().StringVar(&sanitizeMethod, "sanitize-method", string(cephv1.SanitizeMethodQuick), "sanitize method to use (metadata or data)")
cleanUpCmd.Flags().StringVar(&sanitizeDataSource, "sanitize-data-source", string(cephv1.SanitizeDataSourceZero), "data source to sanitize the disk (zero or random)")
cleanUpCmd.Flags().Int32Var(&sanitizeIteration, "sanitize-iteration", 1, "overwrite N times the disk")
flags.SetFlagsFromEnv(cleanUpCmd.Flags(), rook.RookEnvVarPrefix)
cleanUpCmd.RunE = startCleanUp
cleanUpHostCmd.Flags().StringVar(&monSecret, "mon-secret", "", "monitor secret from the keyring")
cleanUpHostCmd.Flags().StringVar(&clusterFSID, "cluster-fsid", "", "ceph cluster fsid")
cleanUpHostCmd.Flags().StringVar(&sanitizeMethod, "sanitize-method", string(cephv1.SanitizeMethodQuick), "sanitize method to use (metadata or data)")
cleanUpHostCmd.Flags().StringVar(&sanitizeDataSource, "sanitize-data-source", string(cephv1.SanitizeDataSourceZero), "data source to sanitize the disk (zero or random)")
cleanUpHostCmd.Flags().Int32Var(&sanitizeIteration, "sanitize-iteration", 1, "overwrite N times the disk")
flags.SetFlagsFromEnv(cleanUpHostCmd.Flags(), rook.RookEnvVarPrefix)

flags.SetFlagsFromEnv(cleanUpSubVolumeGroupCmd.Flags(), rook.RookEnvVarPrefix)

cleanUpCmd.AddCommand(cleanUpHostCmd, cleanUpSubVolumeGroupCmd)

cleanUpHostCmd.RunE = startHostCleanUp
cleanUpSubVolumeGroupCmd.RunE = startSubVolumeGroupCleanUp
}

func startCleanUp(cmd *cobra.Command, args []string) error {
func startHostCleanUp(cmd *cobra.Command, args []string) error {
rook.SetLogLevel()
rook.LogStartupInfo(cleanUpCmd.Flags())

@@ -87,3 +106,37 @@

return nil
}

func startSubVolumeGroupCleanUp(cmd *cobra.Command, args []string) error {
rook.SetLogLevel()
rook.LogStartupInfo(cleanUpSubVolumeGroupCmd.Flags())

ctx := cmd.Context()
context := createContext()
namespace := os.Getenv(k8sutil.PodNamespaceEnvVar)
clusterInfo := client.AdminClusterInfo(ctx, namespace, "")

fsName := os.Getenv(opcontroller.CephFSNameEnv)
if fsName == "" {
rook.TerminateFatal(fmt.Errorf("ceph filesystem name is not available in the pod environment variables"))
}
subVolumeGroupName := os.Getenv(opcontroller.CephFSSubVolumeGroupNameEnv)
if subVolumeGroupName == "" {
rook.TerminateFatal(fmt.Errorf("cephFS SubVolumeGroup name is not available in the pod environment variables"))
}
csiNamespace := os.Getenv(opcontroller.CSICephFSRadosNamesaceEnv)
if csiNamespace == "" {
rook.TerminateFatal(fmt.Errorf("CSI rados namespace name is not available in the pod environment variables"))
}
poolName := os.Getenv(opcontroller.CephFSMetaDataPoolNameEnv)
if poolName == "" {
rook.TerminateFatal(fmt.Errorf("cephFS metadata pool name is not available in the pod environment variables"))
}

err := cleanup.SubVolumeGroupCleanup(context, clusterInfo, fsName, subVolumeGroupName, poolName, csiNamespace)
if err != nil {
rook.TerminateFatal(fmt.Errorf("failed to cleanup cephFS %q SubVolumeGroup %q in the namespace %q. %v", fsName, subVolumeGroupName, namespace, err))
}

return nil
}
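
To illustrate how this subcommand is meant to be driven, here is a hypothetical sketch of a cleanup Job that the operator could create. The container args follow the cobra tree above (`clean` → `CephFilesystemSubVolumeGroup`), assuming the image entrypoint is the `rook` binary; the env var names are placeholders, since the actual string values of `opcontroller.CephFSNameEnv`, `CephFSSubVolumeGroupNameEnv`, `CSICephFSRadosNamesaceEnv`, and `CephFSMetaDataPoolNameEnv` are not shown in this diff:

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: cleanup-my-subvolumegroup            # hypothetical
  namespace: rook-ceph
spec:
  template:
    spec:
      restartPolicy: OnFailure
      containers:
        - name: cleanup
          image: rook/ceph:v1.14.0           # hypothetical image tag
          # entrypoint assumed to be the rook binary
          args: ["ceph", "clean", "CephFilesystemSubVolumeGroup"]
          env:
            - name: POD_NAMESPACE            # read via k8sutil.PodNamespaceEnvVar
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: CEPH_FS_NAME                   # placeholder for opcontroller.CephFSNameEnv
              value: myfs
            - name: CEPH_FS_SUBVOLUME_GROUP_NAME   # placeholder for opcontroller.CephFSSubVolumeGroupNameEnv
              value: my-subvolumegroup
            - name: CSI_CEPHFS_RADOS_NAMESPACE     # placeholder for opcontroller.CSICephFSRadosNamesaceEnv
              value: csi
            - name: CEPH_FS_METADATA_POOL_NAME     # placeholder for opcontroller.CephFSMetaDataPoolNameEnv
              value: myfs-metadata
```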
5 changes: 5 additions & 0 deletions deploy/charts/rook-ceph-cluster/values.yaml
@@ -121,6 +121,11 @@ cephClusterSpec:
# The default wait timeout is 10 minutes.
waitTimeoutForHealthyOSDInMinutes: 10

# Whether PGs must be clean before an OSD upgrade starts. If set to `true`, the OSD upgrade process won't start until PGs are healthy.
# This configuration will be ignored if `skipUpgradeChecks` is `true`.
# Default is false.
upgradeOSDRequiresHealthyPGs: false

mon:
# Set the number of mons to be started. Generally recommended to be 3.
# For highest availability, an odd number of mons should be specified.
16 changes: 16 additions & 0 deletions deploy/charts/rook-ceph/templates/resources.yaml
@@ -5004,6 +5004,12 @@ spec:
type: object
type: array
type: object
upgradeOSDRequiresHealthyPGs:
description: |-
UpgradeOSDRequiresHealthyPGs defines whether the OSD upgrade requires PGs to be clean. If set to `true`, the OSD upgrade process won't start until PGs are healthy.
This configuration will be ignored if `skipUpgradeChecks` is `true`.
Default is false.
type: boolean
waitTimeoutForHealthyOSDInMinutes:
description: |-
WaitTimeoutForHealthyOSDInMinutes defines the time the operator would wait before an OSD can be stopped for upgrade or restart.
@@ -7977,6 +7983,9 @@ spec:
spec:
description: Spec represents the specification of a Ceph Filesystem SubVolumeGroup
properties:
dataPoolName:
description: The data pool name for the Ceph Filesystem subvolume group layout, if the default CephFS pool is not desired.
type: string
filesystemName:
description: |-
FilesystemName is the name of Ceph Filesystem SubVolumeGroup volume name. Typically it's the name of
Expand Down Expand Up @@ -8018,6 +8027,13 @@ spec:
x-kubernetes-validations:
- message: only one pinning type should be set
rule: (has(self.export) && !has(self.distributed) && !has(self.random)) || (!has(self.export) && has(self.distributed) && !has(self.random)) || (!has(self.export) && !has(self.distributed) && has(self.random)) || (!has(self.export) && !has(self.distributed) && !has(self.random))
quota:
anyOf:
- type: integer
- type: string
description: Quota size of the Ceph Filesystem subvolume group.
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
required:
- filesystemName
type: object
4 changes: 4 additions & 0 deletions deploy/examples/cluster.yaml
@@ -43,6 +43,10 @@ spec:
# continue with the upgrade of an OSD even if it's not ok to stop after the timeout. This timeout won't be applied if `skipUpgradeChecks` is `true`.
# The default wait timeout is 10 minutes.
waitTimeoutForHealthyOSDInMinutes: 10
# Whether PGs must be clean before an OSD upgrade starts. If set to `true`, the OSD upgrade process won't start until PGs are healthy.
# This configuration will be ignored if `skipUpgradeChecks` is `true`.
# Default is false.
upgradeOSDRequiresHealthyPGs: false
mon:
# Set the number of mons to be started. Generally recommended to be 3.
# For highest availability, an odd number of mons should be specified.