deployer: add info generating commands under resource-allocation
Commit d7fcda1 (1 parent: 221992f)
Showing 6 changed files with 664 additions and 0 deletions.
deployer/commands/generate/resource_allocation/daemonset_overhead.py
152 additions & 0 deletions
import json
import math
import subprocess
from pathlib import Path

import typer
from kubernetes.utils.quantity import parse_quantity
from ruamel.yaml import YAML

from deployer.infra_components.cluster import Cluster
from deployer.utils.file_acquisition import find_absolute_path_to_cluster_file

from .resource_allocation_app import resource_allocation_app

HERE = Path(__file__).parent
yaml = YAML()
yaml.preserve_quotes = True
yaml.indent(mapping=2, sequence=4, offset=2)

def get_k8s_distribution():
    """
    Returns a 2-tuple with the guessed k8s distribution based on the k8s
    api-server's reported version, either Google's GKE, Amazon's EKS, or
    Azure's AKS, and the server's reported gitVersion.
    """
    output = subprocess.check_output(
        [
            "kubectl",
            "version",
            "--output=json",
        ],
        text=True,
    )
    version_info = json.loads(output)
    server_version_info = version_info["serverVersion"]["gitVersion"]
    if "gke" in server_version_info:
        return "gke", server_version_info
    if "eks" in server_version_info:
        return "eks", server_version_info
    else:
        return "aks", server_version_info


def get_daemon_sets_requests():
    """
    Returns a list of info about DaemonSets with pods desired to be scheduled
    on some nodes in the k8s cluster.
    """
    output = subprocess.check_output(
        [
            "kubectl",
            "get",
            "ds",
            "--all-namespaces",
            "--output=jsonpath-as-json={.items[*]}",
        ],
        text=True,
    )
    daemon_sets = json.loads(output)

    # filter out DaemonSets that aren't desired on any node
    daemon_sets = [ds for ds in daemon_sets if ds["status"]["desiredNumberScheduled"]]

    info = []
    for ds in daemon_sets:
        name = ds["metadata"]["name"]
        req_mem = req_cpu = lim_mem = lim_cpu = 0
        for c in ds["spec"]["template"]["spec"]["containers"]:
            resources = c.get("resources", {})
            requests = resources.get("requests", {})
            limits = resources.get("limits", {})
            req_mem += parse_quantity(requests.get("memory", 0))
            lim_mem += parse_quantity(limits.get("memory", 0))
            req_cpu += parse_quantity(requests.get("cpu", 0))
            lim_cpu += parse_quantity(limits.get("cpu", 0))

        info.append(
            {
                "name": name,
                "cpu_request": float(req_cpu),
                "cpu_limit": float(lim_cpu),
                "memory_request": int(req_mem),
                "memory_limit": int(lim_mem),
            }
        )

    return info
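
# An entry in the returned list looks like this (illustrative values for a
# hypothetical kube-proxy DaemonSet, not measured output; 104857600 is 100Mi):
#
#   {
#       "name": "kube-proxy",
#       "cpu_request": 0.1,
#       "cpu_limit": 0.0,
#       "memory_request": 104857600,
#       "memory_limit": 0,
#   }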


def get_daemon_sets_overhead():
    """
    Returns a summary of the overhead from get_daemon_sets_requests.
    """
    daemon_sets = get_daemon_sets_requests()
    # filter out DaemonSets related to nvidia GPUs
    daemon_sets = [ds for ds in daemon_sets if "nvidia" not in ds["name"]]
    # separate DaemonSets without requests, as only requests impact scheduling
    # of pods and reduce a node's remaining allocatable resources
    req_daemon_sets = [
        ds for ds in daemon_sets if ds["cpu_request"] or ds["memory_request"]
    ]
    other_daemon_sets = [
        ds for ds in daemon_sets if not ds["cpu_request"] and not ds["memory_request"]
    ]

    cpu_requests = sum([ds["cpu_request"] for ds in req_daemon_sets])
    memory_requests = sum([ds["memory_request"] for ds in req_daemon_sets])
    info = {
        "requesting_daemon_sets": ",".join(
            sorted([ds["name"] for ds in req_daemon_sets])
        ),
        "other_daemon_sets": ",".join(sorted([ds["name"] for ds in other_daemon_sets])),
        "cpu_requests": str(math.ceil(cpu_requests * 1000)) + "m",
        "memory_requests": str(math.ceil(memory_requests / 1024**2)) + "Mi",
    }
    return info
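
# The returned summary maps directly onto a cluster entry in
# daemonset_overhead.yaml, e.g. (values matching the eks/openscapes entry
# below; k8s_version is added afterwards by the command itself):
#
#   {
#       "requesting_daemon_sets": "aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter",
#       "other_daemon_sets": "",
#       "cpu_requests": "170m",
#       "memory_requests": "250Mi",
#   }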


@resource_allocation_app.command()
def daemonset_overhead(
    cluster_name: str = typer.Argument(..., help="Name of cluster to operate on"),
):
    """
    Updates `daemonset_overhead.yaml` with the combined CPU and memory requests
    of an individual cluster's DaemonSets with running pods, excluding GPU
    related DaemonSets.
    """
    file_path = HERE / "daemonset_overhead.yaml"
    file_path.touch(exist_ok=True)

    # acquire a Cluster object
    config_file_path = find_absolute_path_to_cluster_file(cluster_name)
    with open(config_file_path) as f:
        cluster = Cluster(yaml.load(f), config_file_path.parent)

    # auth and inspect cluster
    with cluster.auth():
        k8s_dist, k8s_version = get_k8s_distribution()
        ds_overhead = get_daemon_sets_overhead()

    # read
    with open(file_path) as f:
        info = yaml.load(f) or {}

    # update
    ds_overhead["k8s_version"] = k8s_version
    info.setdefault(k8s_dist, {})[cluster_name] = ds_overhead

    # write
    with open(file_path, "w") as f:
        yaml.dump(info, f)
deployer/commands/generate/resource_allocation/daemonset_overhead.yaml
180 additions & 0 deletions
# This file contains generated information about cpu/memory requests made by
# DaemonSets with running pods in our clusters. This information is relevant
# when planning cpu/memory requests for other pods, as the DaemonSets' requests
# reduce the available allocatable capacity.
#
# The requests vary between cloud providers, clusters, and k8s versions for
# reasons like:
#
# - Cloud providers' managed k8s provides different DaemonSets by default
# - DaemonSets may be coupled to managed k8s features (calico-node)
# - DaemonSets' requests may be coupled to managed k8s version (netd)
# - DaemonSets may have a vertical autoscaler changing requests dynamically over
#   time if needed (calico-node-vertical-autoscaler)
# - We may deploy or change a DaemonSet's requests over time (support-cryptnono,
#   support-prometheus-node-exporter)
#
# This file isn't updated by automation, but can easily be updated manually
# by running a command once for each cluster:
#
# ls config/clusters | xargs -I {} deployer generate resource-allocation daemonset-overhead {}
#
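# or for a single cluster, e.g.:
#
# deployer generate resource-allocation daemonset-overhead 2i2c
#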
gke:
  2i2c:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: binder-staging-dind,binder-staging-image-cleaner,continuous-image-puller,imagebuilding-demo-binderhub-service-docker-api,netd
    cpu_requests: 342m
    memory_requests: 566Mi
    k8s_version: v1.26.5-gke.2100
  2i2c-uk:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 344m
    memory_requests: 596Mi
    k8s_version: v1.27.4-gke.900
  awi-ciroh:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: netd
    cpu_requests: 342m
    memory_requests: 566Mi
    k8s_version: v1.25.10-gke.2700
  callysto:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 344m
    memory_requests: 596Mi
    k8s_version: v1.27.4-gke.900
  catalystproject-latam:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 338m
    memory_requests: 496Mi
    k8s_version: v1.27.3-gke.100
  cloudbank:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: continuous-image-puller,continuous-image-puller,continuous-image-puller,netd
    cpu_requests: 342m
    memory_requests: 566Mi
    k8s_version: v1.26.5-gke.2100
  hhmi:
    requesting_daemon_sets: fluentbit-gke,gke-metadata-server,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 228m
    memory_requests: 480Mi
    k8s_version: v1.27.3-gke.100
  leap:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: netd
    cpu_requests: 342m
    memory_requests: 566Mi
    k8s_version: v1.25.10-gke.2700
  linked-earth:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 344m
    memory_requests: 596Mi
    k8s_version: v1.27.4-gke.900
  m2lines:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 344m
    memory_requests: 596Mi
    k8s_version: v1.27.4-gke.900
  meom-ige:
    requesting_daemon_sets: fluentbit-gke,gke-metadata-server,gke-metrics-agent,netd,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 234m
    memory_requests: 580Mi
    k8s_version: v1.27.4-gke.900
  pangeo-hubs:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,gke-metrics-agent,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: netd
    cpu_requests: 342m
    memory_requests: 566Mi
    k8s_version: v1.26.5-gke.2100
  qcl:
    requesting_daemon_sets: calico-node,fluentbit-gke,gke-metadata-server,ip-masq-agent,pdcsi-node,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: continuous-image-puller,continuous-image-puller,netd
    cpu_requests: 336m
    memory_requests: 466Mi
    k8s_version: v1.25.10-gke.2700
eks:
  2i2c-aws-us:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
  carbonplan:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.24.16-eks-2d98532
  catalystproject-africa:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.27.4-eks-2d98532
  gridsst:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
  jupyter-meets-the-earth:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
  nasa-cryo:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
  nasa-ghg:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.27.4-eks-2d98532
  nasa-veda:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
  openscapes:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.24.16-eks-2d98532
  smithsonian:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
  ubc-eoas:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.24.17-eks-f8587cb
  victor:
    requesting_daemon_sets: aws-node,ebs-csi-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: ""
    cpu_requests: 170m
    memory_requests: 250Mi
    k8s_version: v1.25.12-eks-2d98532
aks:
  utoronto:
    requesting_daemon_sets: cloud-node-manager,csi-azuredisk-node,csi-azurefile-node,kube-proxy,support-cryptnono,support-prometheus-node-exporter
    other_daemon_sets: calico-node,continuous-image-puller,continuous-image-puller,continuous-image-puller,continuous-image-puller
    cpu_requests: 226m
    memory_requests: 300Mi
    k8s_version: v1.26.3
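
As a sketch of how these generated numbers might be consumed when planning
resource allocations: subtract a cluster's DaemonSet overhead from a node's
allocatable capacity to see what remains for other pods. The node capacity
figures below are hypothetical placeholders, not measured values, and
remaining_allocatable is an illustrative helper, not part of this commit;
parse_quantity is the same helper the generating command uses.

from kubernetes.utils.quantity import parse_quantity

def remaining_allocatable(allocatable_cpu, allocatable_memory, overhead):
    """Return (cpu_cores, memory_bytes) left for non-DaemonSet pods on a node."""
    cpu = parse_quantity(allocatable_cpu) - parse_quantity(overhead["cpu_requests"])
    memory = parse_quantity(allocatable_memory) - parse_quantity(overhead["memory_requests"])
    return float(cpu), int(memory)

# e.g. the gke/2i2c entry above, against a node reporting a hypothetical
# allocatable capacity of 3920m CPU and 12700Mi memory:
overhead = {"cpu_requests": "342m", "memory_requests": "566Mi"}
print(remaining_allocatable("3920m", "12700Mi", overhead))
# -> (3.578, 12723421184), i.e. about 3.58 CPU and 12134Mi left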