deployer: add info generating commands under resource-allocation
consideRatio committed Oct 28, 2023
1 parent 221992f commit d7fcda1
Showing 6 changed files with 664 additions and 0 deletions.
6 changes: 6 additions & 0 deletions deployer/README.md
@@ -264,6 +264,12 @@ This sub-command can be used to generate the resource allocation choices for giv
##### `generate resource-allocation choices`
This generates a custom number of resource allocation choices for a certain instance type, depending on a certain chosen strategy that can be used in the profile list of a hub.

##### `generate resource-allocation daemonset-overhead`
Updates `daemonset_overhead.yaml` with the combined CPU and memory requests of an individual cluster's DaemonSets that have running pods, excluding GPU-related DaemonSets.
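
For example, to refresh the file for every cluster at once, the `xargs` one-liner documented in `daemonset_overhead.yaml` can be used, or a Python loop along these lines (a minimal sketch, assuming the `deployer` CLI is on `PATH` and that cluster names match the directory names under `config/clusters`):

```python
# Sketch: regenerate the daemonset overhead info for every cluster, mirroring
# the `ls config/clusters | xargs ...` one-liner in daemonset_overhead.yaml.
import subprocess
from pathlib import Path

for cluster_dir in sorted(Path("config/clusters").iterdir()):
    if cluster_dir.is_dir():
        subprocess.run(
            ["deployer", "generate", "resource-allocation", "daemonset-overhead", cluster_dir.name],
            check=True,
        )
```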

##### `generate resource-allocation instance-capacity`
Updates `instance_capacity.yaml` with each instance type's total and allocatable capacity as reported by `kubectl get node`.

##### `generate resource-allocation node-info-update`
This updates the JSON file `node-capacity-info.json` with information about the capacity of a node of a certain type. This file is then used when generating the resource allocation choices.

2 changes: 2 additions & 0 deletions deployer/__main__.py
@@ -9,7 +9,9 @@
import deployer.commands.generate.dedicated_cluster.aws # noqa: F401
import deployer.commands.generate.dedicated_cluster.gcp # noqa: F401
import deployer.commands.generate.helm_upgrade.jobs # noqa: F401
import deployer.commands.generate.resource_allocation.daemonset_overhead # noqa: F401
import deployer.commands.generate.resource_allocation.generate_choices # noqa: F401
import deployer.commands.generate.resource_allocation.instance_capacity # noqa: F401
import deployer.commands.generate.resource_allocation.update_nodeinfo # noqa: F401
import deployer.commands.grafana.central_grafana # noqa: F401
import deployer.commands.grafana.deploy_dashboards # noqa: F401
152 changes: 152 additions & 0 deletions deployer/commands/generate/resource_allocation/daemonset_overhead.py
@@ -0,0 +1,152 @@
import json
import math
import subprocess
from pathlib import Path

import typer
from kubernetes.utils.quantity import parse_quantity
from ruamel.yaml import YAML

from deployer.infra_components.cluster import Cluster
from deployer.utils.file_acquisition import find_absolute_path_to_cluster_file

from .resource_allocation_app import resource_allocation_app

HERE = Path(__file__).parent
yaml = YAML()
yaml.preserve_quotes = True
yaml.indent(mapping=2, sequence=4, offset=2)


def get_k8s_distribution():
"""
    Returns a 2-tuple with the guessed k8s distribution, based on the k8s
    api-server's reported version (Google's GKE, Amazon's EKS, or Azure's
    AKS), and the server's reported gitVersion.
"""
output = subprocess.check_output(
[
"kubectl",
"version",
"--output=json",
],
text=True,
)
version_info = json.loads(output)
server_version_info = version_info["serverVersion"]["gitVersion"]
if "gke" in server_version_info:
return "gke", server_version_info
if "eks" in server_version_info:
return "eks", server_version_info
else:
return "aks", server_version_info


def get_daemon_sets_requests():
"""
    Returns a list of info about DaemonSets with pods desired to be scheduled
    on some nodes in the k8s cluster.
"""
output = subprocess.check_output(
[
"kubectl",
"get",
"ds",
"--all-namespaces",
"--output=jsonpath-as-json={.items[*]}",
],
text=True,
)
daemon_sets = json.loads(output)

# filter out DaemonSets that aren't desired on any node
daemon_sets = [ds for ds in daemon_sets if ds["status"]["desiredNumberScheduled"]]

info = []
for ds in daemon_sets:
name = ds["metadata"]["name"]
req_mem = req_cpu = lim_mem = lim_cpu = 0
for c in ds["spec"]["template"]["spec"]["containers"]:
resources = c.get("resources", {})
requests = resources.get("requests", {})
limits = resources.get("limits", {})
req_mem += parse_quantity(requests.get("memory", 0))
lim_mem += parse_quantity(limits.get("memory", 0))
req_cpu += parse_quantity(requests.get("cpu", 0))
lim_cpu += parse_quantity(limits.get("cpu", 0))

info.append(
{
"name": name,
"cpu_request": float(req_cpu),
"cpu_limit": float(lim_cpu),
"memory_request": int(req_mem),
"memory_limit": int(lim_mem),
}
)

return info
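
The summing above works because `kubernetes.utils.quantity.parse_quantity` normalizes Kubernetes quantity strings into `decimal.Decimal` values; a minimal sketch of that behavior:

```python
# parse_quantity turns k8s quantity strings into decimal.Decimal values, which
# is what lets CPU ("100m") and memory ("128Mi") requests be summed as numbers.
from kubernetes.utils.quantity import parse_quantity

assert float(parse_quantity("100m")) == 0.1           # 100 millicores
assert int(parse_quantity("128Mi")) == 128 * 1024**2  # Mi converted to bytes
assert parse_quantity(0) == 0                         # absent requests default to 0
```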


def get_daemon_sets_overhead():
"""
Returns a summary of the overhead from get_daemon_sets_requests.
"""
daemon_sets = get_daemon_sets_requests()
# filter out DaemonSets related to nvidia GPUs
daemon_sets = [ds for ds in daemon_sets if "nvidia" not in ds["name"]]
    # separate DaemonSets with requests from those without, as only requests
    # impact pod scheduling and reduce a node's remaining allocatable capacity
req_daemon_sets = [
ds for ds in daemon_sets if ds["cpu_request"] or ds["memory_request"]
]
other_daemon_sets = [
ds for ds in daemon_sets if not ds["cpu_request"] and not ds["memory_request"]
]

cpu_requests = sum([ds["cpu_request"] for ds in req_daemon_sets])
memory_requests = sum([ds["memory_request"] for ds in req_daemon_sets])
info = {
"requesting_daemon_sets": ",".join(
sorted([ds["name"] for ds in req_daemon_sets])
),
"other_daemon_sets": ",".join(sorted([ds["name"] for ds in other_daemon_sets])),
"cpu_requests": str(math.ceil(cpu_requests * 1000)) + "m",
"memory_requests": str(math.ceil(memory_requests / 1024**2)) + "Mi",
}
return info
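
The summary strings round the summed `Decimal` values up into whole millicores and mebibytes; a small sketch with hypothetical sums, chosen to reproduce the `gke/2i2c` entry in the YAML file below:

```python
# Hypothetical summed requests illustrating the "m"/"Mi" formatting above;
# rounding up ensures the recorded overhead never understates the requests.
import math
from decimal import Decimal

cpu_requests = Decimal("0.3415")      # e.g. 341.5 millicores in total
memory_requests = Decimal(593494016)  # e.g. 566 MiB expressed in bytes

assert str(math.ceil(cpu_requests * 1000)) + "m" == "342m"
assert str(math.ceil(memory_requests / 1024**2)) + "Mi" == "566Mi"
```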


@resource_allocation_app.command()
def daemonset_overhead(
cluster_name: str = typer.Argument(..., help="Name of cluster to operate on"),
):
"""
    Updates `daemonset_overhead.yaml` with the combined CPU and memory requests
    of an individual cluster's DaemonSets that have running pods, excluding
    GPU-related DaemonSets.
"""
file_path = HERE / "daemonset_overhead.yaml"
file_path.touch(exist_ok=True)

# acquire a Cluster object
config_file_path = find_absolute_path_to_cluster_file(cluster_name)
with open(config_file_path) as f:
cluster = Cluster(yaml.load(f), config_file_path.parent)

# auth and inspect cluster
with cluster.auth():
k8s_dist, k8s_version = get_k8s_distribution()
ds_overhead = get_daemon_sets_overhead()

# read
with open(file_path) as f:
info = yaml.load(f) or {}

# update
ds_overhead["k8s_version"] = k8s_version
info.setdefault(k8s_dist, {})[cluster_name] = ds_overhead

# write
with open(file_path, "w") as f:
yaml.dump(info, f)
180 changes: 180 additions & 0 deletions deployer/commands/generate/resource_allocation/daemonset_overhead.yaml
@@ -0,0 +1,180 @@
# This file contains generated information about cpu/memory requests made by
# DaemonSets with running pods in our clusters. This information is relevant
# when planning cpu/memory requests for other pods, as the DaemonSets' requests
# reduce the available allocatable capacity.
#
# The requests vary between cloud providers, clusters, and k8s versions for
# reasons like:
#
# - Cloud providers' managed k8s provides different DaemonSets by default
# - DaemonSets may be coupled to managed k8s features (calico-node)
# - DaemonSets' requests may be coupled to managed k8s version (netd)
# - DaemonSets may have a vertical autoscaler changing requests dynamically over
# time if needed (calico-node-vertical-autoscaler)
# - We may deploy or change a DaemonSet's requests over time (support-cryptnono,
# support-prometheus-node-exporter)
#
# This file isn't updated by automation, but can easily be updated manually by
# running a command once for each cluster:
#
# ls config/clusters | xargs -I {} deployer generate resource-allocation daemonset-overhead {}
#
gke:
2i2c:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: binder-staging-dind,binder-staging-image-cleaner,continuous-image-puller,imagebuilding-demo-binderhub-service-docker-api,netd
cpu_requests: 342m
memory_requests: 566Mi
k8s_version: v1.26.5-gke.2100
2i2c-uk:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 344m
memory_requests: 596Mi
k8s_version: v1.27.4-gke.900
awi-ciroh:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: netd
cpu_requests: 342m
memory_requests: 566Mi
k8s_version: v1.25.10-gke.2700
callysto:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 344m
memory_requests: 596Mi
k8s_version: v1.27.4-gke.900
catalystproject-latam:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 338m
memory_requests: 496Mi
k8s_version: v1.27.3-gke.100
cloudbank:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: continuous-image-puller,continuous-image-puller,continuous-image-puller,netd
cpu_requests: 342m
memory_requests: 566Mi
k8s_version: v1.26.5-gke.2100
hhmi:
requesting_daemon_sets: fluentbit-gke,gke-metadata-server,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 228m
memory_requests: 480Mi
k8s_version: v1.27.3-gke.100
leap:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: netd
cpu_requests: 342m
memory_requests: 566Mi
k8s_version: v1.25.10-gke.2700
linked-earth:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 344m
memory_requests: 596Mi
k8s_version: v1.27.4-gke.900
m2lines:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 344m
memory_requests: 596Mi
k8s_version: v1.27.4-gke.900
meom-ige:
requesting_daemon_sets: fluentbit-gke,gke-metadata-server,gke-metrics-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 234m
memory_requests: 580Mi
k8s_version: v1.27.4-gke.900
pangeo-hubs:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: netd
cpu_requests: 342m
memory_requests: 566Mi
k8s_version: v1.26.5-gke.2100
qcl:
requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: continuous-image-puller,continuous-image-puller,netd
cpu_requests: 336m
memory_requests: 466Mi
k8s_version: v1.25.10-gke.2700
eks:
2i2c-aws-us:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
carbonplan:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.24.16-eks-2d98532
catalystproject-africa:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.27.4-eks-2d98532
gridsst:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
jupyter-meets-the-earth:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
nasa-cryo:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
nasa-ghg:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.27.4-eks-2d98532
nasa-veda:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
openscapes:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.24.16-eks-2d98532
smithsonian:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
ubc-eoas:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.24.17-eks-f8587cb
victor:
requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: ""
cpu_requests: 170m
memory_requests: 250Mi
k8s_version: v1.25.12-eks-2d98532
aks:
utoronto:
requesting_daemon_sets: cloud-node-manager,csi-azuredisk-node,csi-azurefile-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
other_daemon_sets: calico-node,continuous-image-puller,continuous-image-puller,continuous-image-puller,continuous-image-puller
cpu_requests: 226m
memory_requests: 300Mi
k8s_version: v1.26.3
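
As the header comment notes, this data is meant to inform capacity planning: subtracting a cluster's DaemonSet overhead from a node's allocatable resources gives what remains for other pods. A rough sketch of that calculation, where the allocatable figures are hypothetical placeholders (in practice they would come from the instance capacity data rather than being hard-coded):

```python
# Sketch: estimate what remains for non-DaemonSet pods on a node in the
# gke/2i2c cluster after subtracting the recorded DaemonSet overhead.
# The allocatable figures below are hypothetical placeholders.
from kubernetes.utils.quantity import parse_quantity
from ruamel.yaml import YAML

yaml = YAML()
with open("daemonset_overhead.yaml") as f:
    overhead = yaml.load(f)

entry = overhead["gke"]["2i2c"]
allocatable_cpu = parse_quantity("3920m")  # hypothetical per-node allocatable CPU
allocatable_mem = parse_quantity("12Gi")   # hypothetical per-node allocatable memory

remaining_cpu = allocatable_cpu - parse_quantity(entry["cpu_requests"])
remaining_mem = allocatable_mem - parse_quantity(entry["memory_requests"])
print(f"remaining: {remaining_cpu} CPU cores, {remaining_mem} bytes of memory")
```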