From b4a2285aa6ef08ceb27fafcb96bbcb7e1491eeae Mon Sep 17 00:00:00 2001 From: Blaine Gardner Date: Fri, 2 Aug 2024 13:04:52 -0600 Subject: [PATCH] object: use advertise endpoint for admin ops RGW can only serve a single certificate. This limitation means that the prior behavior of using the default service for admin ops when TLS is enabled may mean it requires additional complex certificate management to make sure the object store uses a certificate valid for Rook internal admin ops and user connections. This is needlessly complex for users. Instead, change Rook's behavior and documentation to clarify that it will use the same endpoint intended for S3 client applications. This means that users have a more straightforward path to enabling both Rook and consuming applications. More info: https://github.com/rook/rook/issues/14530 Signed-off-by: Blaine Gardner --- .../Object-Storage-RGW/object-storage.md | 68 ++++++++++++------- design/ceph/object/store.md | 23 ++++--- pkg/operator/ceph/object/admin.go | 25 ++----- pkg/operator/ceph/object/admin_test.go | 56 ++++++--------- 4 files changed, 81 insertions(+), 91 deletions(-) diff --git a/Documentation/Storage-Configuration/Object-Storage-RGW/object-storage.md b/Documentation/Storage-Configuration/Object-Storage-RGW/object-storage.md index d5234172145e..241159791e44 100644 --- a/Documentation/Storage-Configuration/Object-Storage-RGW/object-storage.md +++ b/Documentation/Storage-Configuration/Object-Storage-RGW/object-storage.md @@ -200,7 +200,7 @@ Then create a secret with the user credentials: kubectl -n rook-ceph create secret generic --type="kubernetes.io/rook" rgw-admin-ops-user --from-literal=accessKey= --from-literal=secretKey= ``` -If you have an external `CephCluster` CR, you can instruct Rook to consume external gateways with the following: +For an external CephCluster, configure Rook to consume external RGW servers with the following: ```yaml apiVersion: ceph.rook.io/v1 @@ -216,42 +216,35 @@ spec: # hostname: 
example.com ``` -Use the existing `object-external.yaml` file. Even though multiple endpoints can be specified, it is recommend to use only one endpoint. This endpoint is randomly added to `configmap` of OBC and secret of the `cephobjectstoreuser`. Rook never guarantees the randomly picked endpoint is a working one or not. -If there are multiple endpoints, please add load balancer in front of them and use the load balancer endpoint in the `externalRgwEndpoints` list. +See `object-external.yaml` for a more detailed example. -When ready, the message in the `cephobjectstore` status similar to this one: - -```console -kubectl -n rook-ceph get cephobjectstore external-store -NAME PHASE -external-store Ready - -``` - -Any pod from your cluster can now access this endpoint: - -```console -$ curl 192.168.39.182:8080 -anonymous -``` +Even though multiple `externalRgwEndpoints` can be specified, it is best to use a single endpoint. +Only the first endpoint in the list will be advertised to any consuming resources like +CephObjectStoreUsers, ObjectBucketClaims, or COSI resources. If there are multiple external RGW +endpoints, add a load balancer in front of them, then use the single load balancer endpoint in the +`externalRgwEndpoints` list. ## Object store endpoint The CephObjectStore resource `status.info` contains `endpoint` (and `secureEndpoint`) fields, which -report the endpoint that can be used to access the object store as a client. +report the endpoint that can be used to access the object store as a client. This endpoint is also +advertised as the default endpoint for CephObjectStoreUsers, ObjectBucketClaims, and +Container Object Store Interface (COSI) resources. Each object store also creates a Kubernetes service that can be used as a client endpoint from within the Kubernetes cluster. The DNS name of the service is `rook-ceph-rgw-..svc`. This service DNS name is the default `endpoint` (and `secureEndpoint`).
-For [external clusters](#connect-to-an-external-object-store), the default endpoints contain the -first `spec.gateway.externalRgwEndpoint` instead of the service DNS name. +For [external clusters](#connect-to-an-external-object-store), the default endpoint is the first +`spec.gateway.externalRgwEndpoint` instead of the service DNS name. -Rook always uses the default endpoint to perform management operations against the object store. -When TLS is enabled, the TLS certificate must always specify the default endpoint DNS name to allow -secure management operations. TLS configuration specification can be found in object -[gateway `securePort` documentation](../../CRDs/Object-Storage/ceph-object-store-crd.md#gateway-settings). +The advertised endpoint can be overridden using `advertiseEndpoint` in the +[`spec.hosting` config](../../CRDs/Object-Storage/ceph-object-store-crd.md#hosting-settings). + +Rook always uses the advertised endpoint to perform management operations against the object store. +When [TLS is enabled](#enable-tls), the TLS certificate must always specify the endpoint DNS name to +allow secure management operations. ## Create a Bucket @@ -508,6 +501,29 @@ kubectl -n rook-ceph get secret rook-ceph-object-user-my-store-my-user -o jsonpa kubectl -n rook-ceph get secret rook-ceph-object-user-my-store-my-user -o jsonpath='{.data.SecretKey}' | base64 --decode ``` +## Enable TLS + +TLS is critical for securing object storage data access, and it is assumed as a default by many S3 +clients. TLS is enabled for CephObjectStores by configuring +[`gateway` options](../../CRDs/Object-Storage/ceph-object-store-crd.md#gateway-settings). +Set `securePort`, and give Rook access to a TLS certificate using `sslCertificateRef`. +`caBundleRef` may be necessary as well to give the deployed gateway (RGW) access to the TLS +certificate's CA signing bundle. + +Ceph RGW only supports a **single** TLS certificate. 
If the given TLS certificate is a concatenation +of multiple certificates, only the first certificate will be used by the RGW as the server +certificate. Therefore, the TLS certificate given must include all endpoints that clients will use +for access as subject alternative names (SANs). + +The [CephObjectStore service endpoint](#object-store-endpoint) must be added as a SAN on the TLS +certificate. If it is not possible to add the service DNS name as a SAN on the TLS certificate, +set `hosting.advertiseEndpoint` to a TLS-approved endpoint to help ensure Rook and clients use +secure data access. + +!!! note + OpenShift users can add `service.beta.openshift.io/serving-cert-secret-name` as a service + annotation instead of using `sslCertificateRef`. + ## Virtual host-style Bucket Access The Ceph Object Gateway supports accessing buckets using @@ -530,7 +546,7 @@ Wildcard addressing can be configured in myriad ways. Some options: The minimum recommended `hosting` configuration is exemplified below. It is important to ensure that Rook advertises the wildcard-addressable endpoint as a priority over the default. TLS is also -recommended for security. +recommended for security, and the configured TLS certificate should specify the advertise endpoint. ```yaml spec: diff --git a/design/ceph/object/store.md b/design/ceph/object/store.md index 2bfcf442a59b..4b6c23f0ca73 100644 --- a/design/ceph/object/store.md +++ b/design/ceph/object/store.md @@ -397,8 +397,19 @@ the HTTPS (`securePort`) endpoint. Because the advertised endpoint is primarily resources internal to the Kubernetes cluster, this default should be sufficient for most users, and this is the behavior expected by users when `dnsNames` is not configured, so it should be familiar. -When this feature is enabled, there is also ambiguity about which endpoint Rook should use for Admin -Ops API communication.
Some users have reported issues with Rook using a `dnsNames` endpoint +When this feature is enabled, there should be no ambiguity about which endpoint Rook will use for +Admin Ops API communication. As an HTTP server, RGW is only able to return a single TLS certificate +to S3 clients ([more detail](https://github.com/rook/rook/issues/14530)). For maximum compatibility +while TLS is enabled, Rook should connect to the same endpoint that users do. Internally, Rook will +use the advertise endpoint as configured. + +Rook documentation will inform users that if TLS is enabled, they must give Rook a certificate that +accepts the service endpoint. Alternately, if that is not possible, Rook will add an +`insecureSkipTlsVerification` option to the CephObjectStore to allow users to provision a healthy +CephObjectStore. This opens users up to machine-in-the-middle attacks, so users should be advised to +only use it for test/proof-of-concept clusters, or to work around bugs temporarily. + +Some users have reported issues with Rook using a `dnsNames` endpoint (or `advertiseEndpoint`) when they wish to set up ingress certificates after Rook deployment. The obvious alternative is to have Rook always use the CephObjectStore service, but other users have expressed troubles creating certificates or CAs that allow the service endpoint in the past. @@ -424,14 +435,6 @@ While Rook add endpoints to the list for safety and convenience, users might add which Rook should not treat as a configuration bug. Rook should also ensure the list ordering is consistent between reconciles. -In order to attempt to strike the best balance for everyone, and to provide the best clarity for -users and Rook internally, Rook will always use the service endpoint for admin ops. Rook -documentation must inform users that if TLS is enabled, they must give Rook a certificate that -accepts the service endpoint. 
Alternately, if that is not possible, Rook will add an -`insecureSkipTlsVerification` option to the CephObjectStore to allow users to provision a healthy -CephObjectStore. This opens users up to machine-in-the-middle attacks, so users should be advised to -only use it for test/proof-of-concept clusters, or to work around bugs temporarily. - Rook can refer users to this Kubernetes doc for a suggested way that they can manage certificates in a Kubernetes cluster that work with Kubernetes services like the CephObjectStore service: https://kubernetes.io/docs/tasks/tls/managing-tls-in-a-cluster/ diff --git a/pkg/operator/ceph/object/admin.go b/pkg/operator/ceph/object/admin.go index 3289cc9e80ad..5e5a2596f515 100644 --- a/pkg/operator/ceph/object/admin.go +++ b/pkg/operator/ceph/object/admin.go @@ -19,7 +19,6 @@ package object import ( "encoding/json" "fmt" - "math/rand" "net/http" "net/http/httputil" "regexp" @@ -64,9 +63,6 @@ type debugHTTPClient struct { logger *capnslog.PackageLogger } -// global rand source that can be overridden for unit tests -var randSrc = rand.New(rand.NewSource(rand.Int63())) //nolint:gosec // G404: cryptographically weak RNG is fine here - // NewDebugHTTPClient helps us mutating the HTTP client to debug the request/response func NewDebugHTTPClient(client admin.HTTPClient, logger *capnslog.PackageLogger) *debugHTTPClient { return &debugHTTPClient{client, logger} @@ -136,28 +132,15 @@ func NewMultisiteContext(context *clusterd.Context, clusterInfo *cephclient.Clus } // GetAdminOpsEndpoint returns an endpoint that can be used to perform RGW admin ops -// It returns an HTTPS endpoint if available. It prefers direct routes to the RGW(s). 
func GetAdminOpsEndpoint(s *cephv1.CephObjectStore) (string, error) { nsName := fmt.Sprintf("%s/%s", s.Namespace, s.Name) - port, err := s.Spec.GetPort() + // advertise endpoint should be most likely to have a valid cert, so use it for admin ops + endpoint, err := s.GetAdvertiseEndpointUrl() if err != nil { - return "", errors.Wrapf(err, "failed to get port for object store %q", nsName) + return "", errors.Wrapf(err, "failed to get advertise endpoint for object store %q", nsName) } - - domain := s.GetServiceDomainName() - if s.Spec.IsExternal() { - // if the store is external, pick a random external endpoint to use. if the endpoint is down, this - // reconcile may fail, but a future reconcile will eventually pick a different endpoint to try - endpoints := []string{} - for _, e := range s.Spec.Gateway.ExternalRgwEndpoints { - endpoints = append(endpoints, e.String()) - } - idx := randSrc.Intn(len(endpoints)) - domain = endpoints[idx] - } - - return BuildDNSEndpoint(domain, port, s.Spec.IsTLSEnabled()), nil + return endpoint, nil } // UpdateEndpointForAdminOps updates the object.Context endpoint with the latest admin ops endpoint diff --git a/pkg/operator/ceph/object/admin_test.go b/pkg/operator/ceph/object/admin_test.go index 047c440affeb..122b4ce86ef1 100644 --- a/pkg/operator/ceph/object/admin_test.go +++ b/pkg/operator/ceph/object/admin_test.go @@ -18,7 +18,6 @@ package object import ( "encoding/json" - "math/rand" "testing" "time" @@ -742,14 +741,9 @@ func TestGetAdminOpsEndpoint(t *testing.T) { }, Spec: cephv1.ObjectStoreSpec{ Gateway: cephv1.GatewaySpec{}, - // configure hosting settings to ensure they don't affect admin ops endpoints Hosting: &cephv1.ObjectStoreHostingSpec{ - AdvertiseEndpoint: &cephv1.ObjectEndpointSpec{ - DnsName: "should.not.appear", - Port: 7777, - UseTls: false, - }, - DNSNames: []string{"also.should.not.appear"}, + // dnsNames shouldn't affect admin ops endpoints + DNSNames: []string{"should.not.appear"}, }, }, } @@ -800,16 +794,9 @@ 
func TestGetAdminOpsEndpoint(t *testing.T) { } s.Spec.Gateway.Port = 8080 - // override rand src with known seed to keep tests stable - randSrc = rand.New(rand.NewSource(3)) //nolint:gosec // G404: cryptographically weak RNG is fine here - got, err := GetAdminOpsEndpoint(s) assert.NoError(t, err) assert.Equal(t, "http://192.168.1.1:8080", got) - - got, err = GetAdminOpsEndpoint(s) - assert.NoError(t, err) - assert.Equal(t, "http://s3.host.com:8080", got) }) t.Run("securePort, no cert", func(t *testing.T) { @@ -820,16 +807,9 @@ func TestGetAdminOpsEndpoint(t *testing.T) { } s.Spec.Gateway.SecurePort = 8443 - // override rand src with known seed to keep tests stable - randSrc = rand.New(rand.NewSource(3)) //nolint:gosec // G404: cryptographically weak RNG is fine here - got, err := GetAdminOpsEndpoint(s) assert.Error(t, err) assert.Equal(t, "", got) - - got, err = GetAdminOpsEndpoint(s) - assert.Error(t, err) - assert.Equal(t, "", got) }) t.Run("securePort", func(t *testing.T) { @@ -841,16 +821,9 @@ func TestGetAdminOpsEndpoint(t *testing.T) { s.Spec.Gateway.SecurePort = 8443 s.Spec.Gateway.SSLCertificateRef = "my-cert" - // override rand src with known seed to keep tests stable - randSrc = rand.New(rand.NewSource(3)) //nolint:gosec // G404: cryptographically weak RNG is fine here - got, err := GetAdminOpsEndpoint(s) assert.NoError(t, err) assert.Equal(t, "https://192.168.1.1:8443", got) - - got, err = GetAdminOpsEndpoint(s) - assert.NoError(t, err) - assert.Equal(t, "https://s3.host.com:8443", got) }) t.Run("port + securePort", func(t *testing.T) { @@ -863,16 +836,31 @@ func TestGetAdminOpsEndpoint(t *testing.T) { s.Spec.Gateway.SecurePort = 8443 s.Spec.Gateway.SSLCertificateRef = "my-cert" - // override rand src with known seed to keep tests stable - randSrc = rand.New(rand.NewSource(3)) //nolint:gosec // G404: cryptographically weak RNG is fine here - got, err := GetAdminOpsEndpoint(s) assert.NoError(t, err) assert.Equal(t, "https://192.168.1.1:8443", got) + }) + 
}) - got, err = GetAdminOpsEndpoint(s) + t.Run("advertise", func(t *testing.T) { + t.Run("port + securePort", func(t *testing.T) { + s := s.DeepCopy() + s.Spec.Gateway.ExternalRgwEndpoints = []cephv1.EndpointAddress{ + {IP: "192.168.1.1"}, + {Hostname: "s3.host.com"}, + } + s.Spec.Gateway.Port = 8080 + s.Spec.Gateway.SecurePort = 8443 + s.Spec.Gateway.SSLCertificateRef = "my-cert" + s.Spec.Hosting.AdvertiseEndpoint = &cephv1.ObjectEndpointSpec{ + DnsName: "advertise.me", + Port: 80, + UseTls: false, + } + + got, err := GetAdminOpsEndpoint(s) assert.NoError(t, err) - assert.Equal(t, "https://s3.host.com:8443", got) + assert.Equal(t, "http://advertise.me:80", got) }) }) }