diff --git a/deployer/__main__.py b/deployer/__main__.py
index b94f3f9ae0..30385dff4c 100644
--- a/deployer/__main__.py
+++ b/deployer/__main__.py
@@ -10,6 +10,8 @@
 import deployer.grafana.central_grafana  # noqa: F401
 import deployer.grafana.grafana_tokens  # noqa: F401
 import deployer.keys.decrypt_age  # noqa: F401
+import deployer.resource_allocation.generate_choices  # noqa: F401
+import deployer.resource_allocation.update_nodeinfo  # noqa: F401
 
 from .cli_app import app
diff --git a/deployer/resource_allocation/__init__.py b/deployer/resource_allocation/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/deployer/resource_allocation/generate_choices.py b/deployer/resource_allocation/generate_choices.py
new file mode 100644
index 0000000000..e7d19cb6ec
--- /dev/null
+++ b/deployer/resource_allocation/generate_choices.py
@@ -0,0 +1,117 @@
+import json
+import sys
+from enum import Enum
+from pathlib import Path
+
+import typer
+from ruamel.yaml import YAML
+
+from ..cli_app import app
+
+yaml = YAML(typ="rt")
+
+HERE = Path(__file__).parent
+
+
+class ResourceAllocationStrategies(str, Enum):
+    PROPORTIONAL_MEMORY_STRATEGY = "proportional-memory-strategy"
+
+
+def proportional_memory_strategy(nodeinfo: dict, num_allocations: int):
+    """
+    Generate choices for resource allocation based on proportional changes to memory.
+
+    Used primarily in research cases where:
+    1. Workloads are more memory constrained than CPU constrained.
+    2. End users can be expected to select the appropriate amount of memory they
+       need for a given workload, either from their own knowledge or as instructed
+       by an instructor.
+
+    It features:
+    1. No memory overcommit at all, as end users are expected to ask for as much
+       memory as they need.
+    2. CPU *guarantees* proportional to the memory guarantee - the more memory you
+       ask for, the more CPU you are guaranteed. This lets end users pick resources
+       based on memory alone, simplifying the mental model. It also allows maximum
+       packing of user pods onto a node, as we will *not* run out of CPU on a node
+       before running out of memory.
+    3. No CPU limits at all, as CPU is a more flexible resource. The CPU guarantee
+       ensures that users will not be starved of CPU.
+    4. Each choice the user can make has approximately half as many resources as
+       the next largest choice, with the largest being a full node. This offers a
+       decent compromise - if you pick the largest option, you will most likely
+       have to wait for a full node to spawn, while smaller options are much more
+       likely to be shared.
+    """
+
+    # We operate on *available* memory, which already accounts for system
+    # components (like kubelet & systemd) as well as daemonsets we run on every
+    # node. This represents the resources that are available for user pods.
+    available_node_mem = nodeinfo["available"]["memory"]
+    available_node_cpu = nodeinfo["available"]["cpu"]
+
+    # We always start from the top, and provide a choice that takes up the whole node.
+    mem_limit = available_node_mem
+
+    choices = {}
+    for i in range(num_allocations):
+        # CPU guarantee is proportional to the memory limit for this particular
+        # choice. This makes sure we utilize all the memory on a node all the time.
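+        # Worked example (using the n2-highmem-4 numbers recorded in
+        # node-capacity-info.json below): for the half-node choice,
+        # cpu_guarantee = 0.5 * 3.196 = 1.598 CPUs.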
+        cpu_guarantee = (mem_limit / available_node_mem) * available_node_cpu
+
+        # Memory is in bytes, let's convert it to GB for display
+        mem_display = mem_limit / 1024 / 1024 / 1024
+        display_name = f"{mem_display:.1f} GB RAM"
+
+        choice = {
+            "display_name": display_name,
+            "kubespawner_override": {
+                # Guarantee and limit are the same - this strategy has no
+                # oversubscription
+                "mem_guarantee": int(mem_limit),
+                "mem_limit": int(mem_limit),
+                "cpu_guarantee": cpu_guarantee,
+                # CPU limit is set to the entire available CPU of the node, making
+                # sure no single user can starve the node of critical kubelet /
+                # systemd resources. Leaving it unset sets it to the same as the
+                # guarantee, which we do not want.
+                "cpu_limit": available_node_cpu,
+            },
+        }
+        choices[f"mem_{num_allocations - i}"] = choice
+
+        # Halve the mem_limit for the next choice
+        mem_limit = mem_limit / 2
+
+    # Reverse the choices so the smallest one is first
+    choices = dict(reversed(choices.items()))
+
+    # Make the smallest choice the default explicitly
+    choices[list(choices.keys())[0]]["default"] = True
+
+    return choices
+
+
+@app.command()
+def generate_resource_allocation_choices(
+    instance_type: str = typer.Argument(
+        ..., help="Instance type to generate Resource Allocation options for"
+    ),
+    num_allocations: int = typer.Option(5, help="Number of choices to generate"),
+    strategy: ResourceAllocationStrategies = typer.Option(
+        ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY,
+        help="Strategy to use for generating resource allocation choices",
+    ),
+):
+    with open(HERE / "node-capacity-info.json") as f:
+        nodeinfo = json.load(f)
+
+    if instance_type not in nodeinfo:
+        print(
+            f"Capacity information about {instance_type} not available", file=sys.stderr
+        )
+        print("TODO: Provide information on how to update it", file=sys.stderr)
+        sys.exit(1)
+
+    # Call the appropriate function based on what strategy we want to use
+    if strategy == ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY:
+        choices = proportional_memory_strategy(nodeinfo[instance_type], num_allocations)
+    else:
+        raise ValueError(f"Strategy {strategy} is not currently supported")
+    yaml.dump(choices, sys.stdout)
diff --git a/deployer/resource_allocation/node-capacity-info.json b/deployer/resource_allocation/node-capacity-info.json
new file mode 100644
index 0000000000..c12c0a0b5e
--- /dev/null
+++ b/deployer/resource_allocation/node-capacity-info.json
@@ -0,0 +1,32 @@
+{
+    "r5.xlarge": {
+        "capacity": {
+            "cpu": 4.0,
+            "memory": 33186611200
+        },
+        "available": {
+            "cpu": 3.75,
+            "memory": 31883231232
+        }
+    },
+    "r5.16xlarge": {
+        "capacity": {
+            "cpu": 64.0,
+            "memory": 535146246144
+        },
+        "available": {
+            "cpu": 63.6,
+            "memory": 526011052032
+        }
+    },
+    "n2-highmem-4": {
+        "capacity": {
+            "cpu": 4.0,
+            "memory": 33670004736
+        },
+        "available": {
+            "cpu": 3.196,
+            "memory": 28975529984
+        }
+    }
+}
\ No newline at end of file
diff --git a/deployer/resource_allocation/update_nodeinfo.py b/deployer/resource_allocation/update_nodeinfo.py
new file mode 100644
index 0000000000..0a5dda2198
--- /dev/null
+++ b/deployer/resource_allocation/update_nodeinfo.py
@@ -0,0 +1,129 @@
+import json
+import subprocess
+from pathlib import Path
+
+import typer
+from kubernetes.utils.quantity import parse_quantity
+from ruamel.yaml import YAML
+
+from ..cli_app import app
+
+HERE = Path(__file__).parent
+
+yaml = YAML(typ="rt")
+
+
+def get_node_capacity_info(instance_type: str):
+    # Get the full JSON spec of all nodes with this instance_type
+    nodes = json.loads(
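+        # kubectl's -o json output for a list query is a single JSON List
+        # object; its "items" key holds the matching Node objects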
+        subprocess.check_output(
+            [
+                "kubectl",
+                "get",
+                "node",
+                "-l",
+                f"node.kubernetes.io/instance-type={instance_type}",
+                "-o",
+                "json",
+            ]
+        ).decode()
+    )
+
+    if not nodes.get("items"):
+        # No nodes with the given instance_type found!
+        # A node with this instance_type needs to be actively running for us to
+        # accurately calculate how many resources are available, as that relies
+        # on the non-jupyter pods running at that time.
+        raise ValueError(
+            f"No nodes with instance-type={instance_type} found in current kubernetes cluster"
+        )
+
+    # Just pick one node
+    node = nodes["items"][0]
+
+    # This is the total amount of RAM and CPU on the node.
+    capacity = node["status"]["capacity"]
+    cpu_capacity = parse_quantity(capacity["cpu"])
+    mem_capacity = parse_quantity(capacity["memory"])
+
+    # Total amount of RAM and CPU available to kubernetes as a whole.
+    # This accounts for things running on the node, such as kubelet, the
+    # container runtime, systemd, etc. This does *not* account for daemonsets
+    # and pods running on the kubernetes cluster.
+    allocatable = node["status"]["allocatable"]
+    cpu_allocatable = parse_quantity(allocatable["cpu"])
+    mem_allocatable = parse_quantity(allocatable["memory"])
+
+    # Find all pods running on this node
+    all_pods = json.loads(
+        subprocess.check_output(
+            [
+                "kubectl",
+                "get",
+                "pod",
+                "-A",
+                "--field-selector",
+                f'spec.nodeName={node["metadata"]["name"]}',
+                "-o",
+                "json",
+            ]
+        ).decode()
+    )["items"]
+
+    # Filter out jupyterhub user pods
+    # TODO: Filter out dask scheduler and worker pods
+    pods = [
+        p
+        for p in all_pods
+        if p["metadata"].get("labels", {}).get("component") not in ("singleuser-server",)
+    ]
+
+    # This is the amount of resources available for our workloads - jupyter and dask.
+    # We start with the allocatable resources, and subtract the resource *requests*
+    # of all the non-user pods running on this node, primarily from kube-system and
+    # support. The amount left over is what is available for the *scheduler* to put
+    # user pods onto.
+    cpu_available = cpu_allocatable
+    mem_available = mem_allocatable
+
+    for p in pods:
+        mem_request = 0
+        cpu_request = 0
+        # Iterate through all the containers in the pod, and count the memory &
+        # cpu requests they make. We don't count initContainers' requests, as
+        # they don't overlap with the main containers' requests at any point.
+        for c in p["spec"]["containers"]:
+            mem_request += parse_quantity(
+                c.get("resources", {}).get("requests", {}).get("memory", "0")
+            )
+            cpu_request += parse_quantity(
+                c.get("resources", {}).get("requests", {}).get("cpu", "0")
+            )
+        cpu_available -= cpu_request
+        mem_available -= mem_request
+
+    return {
+        # CPU units are fractional cores, while memory units are bytes
+        "capacity": {"cpu": float(cpu_capacity), "memory": int(mem_capacity)},
+        "available": {"cpu": float(cpu_available), "memory": int(mem_available)},
+    }
+
+
+@app.command()
+def update_node_capacity_info(
+    instance_type: str = typer.Argument(
+        ..., help="Instance type to update node capacity information for"
+    ),
+):
+    try:
+        with open(HERE / "node-capacity-info.json") as f:
+            instances_info = json.load(f)
+    except FileNotFoundError:
+        instances_info = {}
+    node_capacity = get_node_capacity_info(instance_type)
+
+    instances_info[instance_type] = node_capacity
+    with open(HERE / "node-capacity-info.json", "w") as f:
+        json.dump(instances_info, f, indent=4)
+
+    print(f"Updated node-capacity-info.json for {instance_type}")
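
Usage sketch (illustrative, not part of the diff; command names assume typer's
default underscore-to-dash conversion):

    deployer update-node-capacity-info n2-highmem-4
    deployer generate-resource-allocation-choices n2-highmem-4

With the n2-highmem-4 entry recorded above (3.196 CPUs and 28975529984 bytes
available) and the default of 5 allocations, the second command should dump
YAML roughly like this, smallest choice first and marked as the default:

    mem_1:
      display_name: 1.7 GB RAM
      kubespawner_override:
        mem_guarantee: 1810970624
        mem_limit: 1810970624
        cpu_guarantee: 0.19975
        cpu_limit: 3.196
      default: true
    mem_2:
      display_name: 3.4 GB RAM
      ...
    mem_5:
      display_name: 27.0 GB RAM
      ...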