diff --git a/README.md b/README.md index 83e30bab1..27f07a65a 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,13 @@ +# Fork Notes + +We experienced issues with having a root server in the cluster: routes can't be created and the node will be labeled as unscheduable because of networking issues. + +Changes in this fork make it work for our use-case. This is as follows: + +* a root server has a label `instance.hetzner.cloud/is-root-server` set to `true` +* when kubernetes wants this cloud provider to create/delete routes, nodes with that label are ignored (and no error is returned) +* when kubernetes wants this cloud provider to list routes we return a route for the root server based on it's pod CIDR and it's internal IP (our setup) + # Kubernetes Cloud Controller Manager for Hetzner Cloud [![GitHub Actions status](https://github.com/hetznercloud/hcloud-cloud-controller-manager/workflows/Run%20tests/badge.svg)](https://github.com/hetznercloud/hcloud-cloud-controller-manager/actions) diff --git a/hcloud/cloud.go b/hcloud/cloud.go index f427db683..e5940f30b 100644 --- a/hcloud/cloud.go +++ b/hcloud/cloud.go @@ -32,6 +32,7 @@ import ( "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/metadata" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/rootserver" ) const ( @@ -64,6 +65,7 @@ type cloud struct { routes *routes loadBalancer *loadBalancers networkID int64 + rootServerQueries rootserver.Queries } func newCloud(_ io.Reader) (cloudprovider.Interface, error) { @@ -164,12 +166,15 @@ func newCloud(_ io.Reader) (cloudprovider.Interface, error) { return nil, fmt.Errorf("%s: %w", op, err) } + rootServerQueries := rootserver.NewQueries() + return &cloud{ - client: client, - instances: newInstances(client, instancesAddressFamily, networkID), - loadBalancer: loadBalancers, - routes: nil, - networkID: networkID, + client: client, + instances: newInstances(client, instancesAddressFamily, networkID, rootServerQueries), + loadBalancer: loadBalancers, + routes: nil, + networkID: networkID, + rootServerQueries: rootServerQueries, }, nil } @@ -203,7 +208,7 @@ func (c *cloud) Clusters() (cloudprovider.Clusters, bool) { func (c *cloud) Routes() (cloudprovider.Routes, bool) { if c.networkID > 0 && os.Getenv(hcloudNetworkRoutesEnabledENVVar) != "false" { - r, err := newRoutes(c.client, c.networkID) + r, err := newRoutes(c.client, c.networkID, c.rootServerQueries) if err != nil { klog.ErrorS(err, "create routes provider", "networkID", c.networkID) return nil, false diff --git a/hcloud/instances.go b/hcloud/instances.go index caa00d322..e1adcebe7 100644 --- a/hcloud/instances.go +++ b/hcloud/instances.go @@ -24,6 +24,7 @@ import ( cloudprovider "k8s.io/cloud-provider" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/rootserver" "github.com/hetznercloud/hcloud-go/v2/hcloud" ) @@ -39,10 +40,16 @@ type instances struct { client *hcloud.Client addressFamily addressFamily networkID int64 + rootServerQueries rootserver.Queries } -func newInstances(client *hcloud.Client, addressFamily addressFamily, networkID int64) *instances { - return &instances{client, addressFamily, networkID} +func newInstances( + client *hcloud.Client, + addressFamily addressFamily, + networkID int64, + rootServerQueries rootserver.Queries, +) *instances { + return &instances{client, addressFamily, networkID, rootServerQueries} } // lookupServer attempts to locate the corresponding hcloud.Server for a given corev1.Node @@ -74,6 +81,10 @@ func (i *instances) InstanceExists(ctx context.Context, node *corev1.Node) (bool const op = "hcloud/instancesv2.InstanceExists" metrics.OperationCalled.WithLabelValues(op).Inc() + if i.rootServerQueries.IsRootServerByNode(node) { + return true, nil + } + server, err := i.lookupServer(ctx, node) if err != nil { return false, err @@ -101,6 +112,10 @@ func (i *instances) InstanceMetadata(ctx context.Context, node *corev1.Node) (*c const op = "hcloud/instancesv2.InstanceMetadata" metrics.OperationCalled.WithLabelValues(op).Inc() + if i.rootServerQueries.IsRootServerByNode(node) { + return i.rootServerQueries.GetInstanceMetadata(node) + } + server, err := i.lookupServer(ctx, node) if err != nil { return nil, err diff --git a/hcloud/instances_test.go b/hcloud/instances_test.go index 5276f99ea..05cb1f74b 100644 --- a/hcloud/instances_test.go +++ b/hcloud/instances_test.go @@ -30,6 +30,7 @@ import ( "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/rootserver" ) // TestInstances_InstanceExists also tests [lookupServer]. The other tests @@ -58,7 +59,7 @@ func TestInstances_InstanceExists(t *testing.T) { json.NewEncoder(w).Encode(schema.ServerListResponse{Servers: servers}) }) - instances := newInstances(env.Client, AddressFamilyIPv4, 0) + instances := newInstances(env.Client, AddressFamilyIPv4, 0, rootserver.NewQueriesMock()) tests := []struct { name string @@ -131,7 +132,7 @@ func TestInstances_InstanceShutdown(t *testing.T) { }) }) - instances := newInstances(env.Client, AddressFamilyIPv4, 0) + instances := newInstances(env.Client, AddressFamilyIPv4, 0, rootserver.NewQueriesMock()) tests := []struct { name string @@ -188,7 +189,7 @@ func TestInstances_InstanceMetadata(t *testing.T) { }) }) - instances := newInstances(env.Client, AddressFamilyIPv4, 0) + instances := newInstances(env.Client, AddressFamilyIPv4, 0, rootserver.NewQueriesMock()) metadata, err := instances.InstanceMetadata(context.TODO(), &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hcloud://1"}, diff --git a/hcloud/routes.go b/hcloud/routes.go index 9221265d6..c16cc1115 100644 --- a/hcloud/routes.go +++ b/hcloud/routes.go @@ -13,16 +13,18 @@ import ( "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/hcops" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/rootserver" "github.com/hetznercloud/hcloud-go/v2/hcloud" ) type routes struct { - client *hcloud.Client - network *hcloud.Network - serverCache *hcops.AllServersCache + client *hcloud.Client + network *hcloud.Network + serverCache *hcops.AllServersCache + rootServerQueries rootserver.Queries } -func newRoutes(client *hcloud.Client, networkID int64) (*routes, error) { +func newRoutes(client *hcloud.Client, networkID int64, rootServerQueries rootserver.Queries) (*routes, error) { const op = "hcloud/newRoutes" metrics.OperationCalled.WithLabelValues(op).Inc() @@ -43,6 +45,7 @@ func newRoutes(client *hcloud.Client, networkID int64) (*routes, error) { LoadFunc: client.Server.All, Network: networkObj, }, + rootServerQueries: rootServerQueries, }, nil } @@ -78,6 +81,14 @@ func (r *routes) ListRoutes(ctx context.Context, _ string) ([]*cloudprovider.Rou } routes = append(routes, ro) } + + rootServerRoutes, err := r.rootServerQueries.GetRootServerRoutes(ctx) + if err != nil { + klog.ErrorS(err, "failed to query root server routes; won't add root server routes", "op", op) + } else { + routes = append(routes, rootServerRoutes...) + } + return routes, nil } @@ -88,6 +99,15 @@ func (r *routes) CreateRoute(ctx context.Context, clusterName string, nameHint s const op = "hcloud/CreateRoute" metrics.OperationCalled.WithLabelValues(op).Inc() + isRootServer, err := r.rootServerQueries.IsRootServer(ctx, route.TargetNode) + if err != nil { + klog.ErrorS(err, "failed to query if node is root server; will try to create routes as if it's not one", "op", op, "node", route.TargetNode) + } else if isRootServer { + // root server has it's own routing + klog.InfoS("skipping root server for route creation", "op", op, "node", route.TargetNode) + return nil + } + srv, err := r.serverCache.ByName(string(route.TargetNode)) if err != nil { return fmt.Errorf("%s: %v", op, err) @@ -151,6 +171,15 @@ func (r *routes) DeleteRoute(ctx context.Context, _ string, route *cloudprovider const op = "hcloud/DeleteRoute" metrics.OperationCalled.WithLabelValues(op).Inc() + isRootServer, err := r.rootServerQueries.IsRootServer(ctx, route.TargetNode) + if err != nil { + klog.ErrorS(err, "failed to query if node is root server; will try to delete routes as if it's not one", "op", op, "node", route.TargetNode) + } else if isRootServer { + // root server has it's own routing + klog.InfoS("skipping root server for route deletion", "op", op, "node", route.TargetNode) + return nil + } + // Get target IP from current list of routes, routes can be uniquely identified by their destination cidr. var ip net.IP for _, cloudRoute := range r.network.Routes { diff --git a/hcloud/routes_test.go b/hcloud/routes_test.go index a0f67e18a..8b213bd09 100644 --- a/hcloud/routes_test.go +++ b/hcloud/routes_test.go @@ -8,6 +8,7 @@ import ( cloudprovider "k8s.io/cloud-provider" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/rootserver" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" ) @@ -68,7 +69,7 @@ func TestRoutes_CreateRoute(t *testing.T) { }, }) }) - routes, err := newRoutes(env.Client, 1) + routes, err := newRoutes(env.Client, 1, rootserver.NewQueriesMock()) if err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -117,7 +118,7 @@ func TestRoutes_ListRoutes(t *testing.T) { }, }) }) - routes, err := newRoutes(env.Client, 1) + routes, err := newRoutes(env.Client, 1, rootserver.NewQueriesMock()) if err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -183,7 +184,7 @@ func TestRoutes_DeleteRoute(t *testing.T) { }, }) }) - routes, err := newRoutes(env.Client, 1) + routes, err := newRoutes(env.Client, 1, rootserver.NewQueriesMock()) if err != nil { t.Fatalf("Unexpected error: %v", err) } diff --git a/internal/hcops/load_balancer.go b/internal/hcops/load_balancer.go index ee420f641..d95d0b348 100644 --- a/internal/hcops/load_balancer.go +++ b/internal/hcops/load_balancer.go @@ -596,7 +596,7 @@ func (l *LoadBalancerOps) ReconcileHCLBTargets( for _, node := range nodes { id, err := providerIDToServerID(node.Spec.ProviderID) if err != nil { - return changed, fmt.Errorf("%s: %w", op, err) + klog.InfoS("failed to extract server ID from provider ID; skipping it", "op", op, "err", err, "provider_id", node.Spec.ProviderID) } k8sNodeIDs[id] = true k8sNodeNames[id] = node.Name diff --git a/internal/rootserver/queries.go b/internal/rootserver/queries.go new file mode 100644 index 000000000..750ac2aae --- /dev/null +++ b/internal/rootserver/queries.go @@ -0,0 +1,45 @@ +package rootserver + +import ( + "context" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + cloudprovider "k8s.io/cloud-provider" +) + +type Queries interface { + // IsRootServer finds out if the given node is one looking at the k8s node + // label instance.hetzner.cloud/is-root-server + IsRootServer(ctx context.Context, nodeName types.NodeName) (bool, error) + IsRootServerByNode(node *corev1.Node) bool + GetInstanceMetadata(node *corev1.Node) (*cloudprovider.InstanceMetadata, error) + GetRootServerRoutes(ctx context.Context) ([]*cloudprovider.Route, error) +} + +type queriesMock struct { +} + +func (m *queriesMock) IsRootServer(context.Context, types.NodeName) (bool, error) { + return false, nil +} + +func (m *queriesMock) IsRootServerByNode(*corev1.Node) bool { + return false +} + +func (m *queriesMock) NodeAddresses(context.Context, types.NodeName) ([]corev1.NodeAddress, error) { + return []corev1.NodeAddress{}, nil +} + +func (m *queriesMock) GetInstanceMetadata(*corev1.Node) (*cloudprovider.InstanceMetadata, error) { + return &cloudprovider.InstanceMetadata{}, nil +} + +func (m *queriesMock) GetRootServerRoutes(context.Context) ([]*cloudprovider.Route, error) { + return []*cloudprovider.Route{}, nil +} + +func NewQueriesMock() Queries { + return &queriesMock{} +} diff --git a/internal/rootserver/queries_impl.go b/internal/rootserver/queries_impl.go new file mode 100644 index 000000000..50801fc77 --- /dev/null +++ b/internal/rootserver/queries_impl.go @@ -0,0 +1,179 @@ +package rootserver + +import ( + "context" + "fmt" + "strconv" + + "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + cloudprovider "k8s.io/cloud-provider" + "k8s.io/klog/v2" +) + +const ( + instanceIsRootServer = "instance.hetzner.cloud/is-root-server" +) + +type queries struct { + k8sClientSet *kubernetes.Clientset +} + +func NewQueries() Queries { + var k8sClientSet *kubernetes.Clientset + + k8sConfig, err := rest.InClusterConfig() + if err != nil { + klog.ErrorS(err, "k8s cluster config can't be created") + } else { + k8sClientSet, err = kubernetes.NewForConfig(k8sConfig) + if err != nil { + klog.ErrorS(err, "k8s clients can't be initialized") + } + } + + return &queries{k8sClientSet} +} + +func (q *queries) IsRootServer(ctx context.Context, nodeName types.NodeName) (bool, error) { + if err := q.validateK8sConnection(); err != nil { + return false, err + } + + node, err := q.k8sClientSet.CoreV1().Nodes().Get(ctx, string(nodeName), metav1.GetOptions{}) + if err != nil { + return false, errors.WithMessagef(err, "failed to retrieve k8s node info for node '%v'", nodeName) + } + + isRootServer, err := hasRootServerLabel(node) + if err != nil { + return false, err + } + + return isRootServer, nil +} + +func (q *queries) IsRootServerByNode(node *corev1.Node) bool { + if err := q.validateK8sConnection(); err != nil { + klog.Error(err) + return false + } + + isRootServer, err := hasRootServerLabel(node) + if err != nil { + klog.ErrorS(err, "failed to query root server label; assuming it's a cloud node", "node", node.Name) + return false + } + + return isRootServer +} + +func (q *queries) GetInstanceMetadata(node *corev1.Node) (*cloudprovider.InstanceMetadata, error) { + if err := q.validateK8sConnection(); err != nil { + return nil, err + } + + return &cloudprovider.InstanceMetadata{ + ProviderID: "", + InstanceType: "", + NodeAddresses: node.Status.Addresses, + Zone: "", + Region: "", + }, nil +} + +func (q *queries) GetRootServerRoutes(ctx context.Context) ([]*cloudprovider.Route, error) { + rootNodes, err := q.getRootServerNodes(ctx) + if err != nil { + return nil, err + } + + rootServerRoutes := make([]*cloudprovider.Route, 0, len(rootNodes)) + + for _, node := range rootNodes { + if len(node.Spec.PodCIDR) < 1 { + continue + } + + destination := node.Spec.PodCIDR + + var gateway string + + for _, nodeAddress := range node.Status.Addresses { + if nodeAddress.Type == corev1.NodeInternalIP && len(nodeAddress.Address) > 0 { + gateway = nodeAddress.Address + break + } + } + + if len(gateway) < 1 { + continue + } + + route := &cloudprovider.Route{ + DestinationCIDR: destination, + Name: fmt.Sprintf("%s-%s", gateway, destination), + TargetNode: types.NodeName(node.Name), + } + + rootServerRoutes = append(rootServerRoutes, route) + } + + return rootServerRoutes, nil +} + +func hasRootServerLabel(node *corev1.Node) (bool, error) { + value, ok := node.Labels[instanceIsRootServer] + if !ok { + return false, nil + } + + boolValue, err := strconv.ParseBool(value) + if err != nil { + return false, fmt.Errorf( + "node %s has invalid label '%s': %v", + node.Name, + instanceIsRootServer, + err, + ) + } + + return boolValue, nil +} + +func (q *queries) getRootServerNodes(ctx context.Context) ([]*corev1.Node, error) { + if err := q.validateK8sConnection(); err != nil { + return []*corev1.Node{}, err + } + + nodeList, err := q.k8sClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return []*corev1.Node{}, errors.WithMessage(err, "failed to list k8s nodes") + } + + rootServerNodes := make([]*corev1.Node, 0, len(nodeList.Items)) + + for _, node := range nodeList.Items { + pNode := node + isRootServer, err := hasRootServerLabel(&pNode) + if err != nil || !isRootServer { + continue + } + + rootServerNodes = append(rootServerNodes, &pNode) + } + + return rootServerNodes, nil +} + +func (q *queries) validateK8sConnection() error { + if q.k8sClientSet == nil { + return errors.New("no connection to kubernetes API") + } + + return nil +}