85 changes: 67 additions & 18 deletions lib/scenario_runner.py
@@ -18,6 +18,7 @@
from pathlib import Path
import random
import shutil
import math
import yaml
from collections import OrderedDict
from datetime import datetime
@@ -613,6 +614,56 @@ def _populate_image_names(self):
else:
service['image'] = f"{service_name}_{random.randint(500000,10000000)}"

def _populate_cpu_and_memory_limits(self):
services = self._usage_scenario.get('services', {})

DOCKER_AVAILABLE_MEMORY = int(subprocess.check_output(['docker', 'info', '--format', '{{.MemTotal}}'], encoding='UTF-8', errors='replace').strip())
unassigned_memory = DOCKER_AVAILABLE_MEMORY-1024**3 # we want to leave 1 GB free on the host / docker VM to avoid OOM situations

if unassigned_memory <= 0:
raise RuntimeError(f"Docker has insufficient memory available. Available: {DOCKER_AVAILABLE_MEMORY/1024**3:.2f}GB, Required: at least 1GB for GMT overhead")

SYSTEM_ASSIGNABLE_CPU_COUNT = int(subprocess.check_output(['docker', 'info', '--format', '{{.NCPU}}'], encoding='UTF-8', errors='replace').strip()) -1
if SYSTEM_ASSIGNABLE_CPU_COUNT <= 0:
raise RuntimeError(f"Cannot assign docker containers to any CPU as no CPUs are available to Docker. Available CPU count: {SYSTEM_ASSIGNABLE_CPU_COUNT}")

to_be_assigned_services = []
for service_name, service in services.items():
# wildly the docker compose spec allows deploy to be None
# ... thus we need to check and cannot .get()
if 'deploy' in service and service['deploy'] is not None and (memory := service['deploy'].get('resources', {}).get('limits', {}).get('memory', None)) is not None:
del service['deploy']['resources']['limits']['memory']
service['mem_limit'] = memory
if service.get('mem_limit', 0) == 0:
to_be_assigned_services.append(service_name)
self.__warnings.append(f"Service '{service_name}' had no memory limit set. GMT does not allow unbounded memory limits, so an automatic value was applied.")
else:
memory_bytes = utils.docker_memory_to_bytes(service['mem_limit'])
if memory_bytes > unassigned_memory:
raise ValueError(f"You are trying to assign more memory to service {service_name} than is left available on the host system after accounting for already assigned containers. Requested memory: {memory_bytes} Bytes. Unassigned memory left: {unassigned_memory} Bytes")
unassigned_memory -= memory_bytes

if 'deploy' in service and service['deploy'] is not None and (cpus := service['deploy'].get('resources', {}).get('limits', {}).get('cpus', None)) is not None:
del service['deploy']['resources']['limits']['cpus']
service['cpus'] = cpus
REQUESTED_CPUS = float(service.get('cpus', 0))
if REQUESTED_CPUS == 0:
# we do not want to auto enforce CPU limits. So we re-map the limit spec here to the host system for transparency and for comparing with other runs
service['cpus'] = SYSTEM_ASSIGNABLE_CPU_COUNT
elif REQUESTED_CPUS > SYSTEM_ASSIGNABLE_CPU_COUNT:
raise ValueError(f"You are trying to assign more CPUs to service {service_name} than are available on the host system. Requested CPUs: {REQUESTED_CPUS}. Available CPUs: {SYSTEM_ASSIGNABLE_CPU_COUNT}")



service_count = len(to_be_assigned_services)
if service_count > 0:
memory_per_service = math.floor(unassigned_memory/service_count)
if memory_per_service < 1024**3:
self.__warnings.append('Auto-assigned memory for containers was less than 1 GB per container because no more memory was available on the host. If you feel that this is too low, please set memory limits manually or upgrade to a bigger host.')
for service_name in to_be_assigned_services:
services[service_name]['mem_limit'] = memory_per_service
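
To illustrate the auto-assignment arithmetic above, here is a minimal standalone sketch with made-up numbers (Docker reporting 8 GiB, one service with an explicit 2G limit, two services without any limit). The real method reads these values from docker info and the usage scenario:

import math

DOCKER_AVAILABLE_MEMORY = 8 * 1024**3  # hypothetical value of: docker info --format '{{.MemTotal}}'
unassigned_memory = DOCKER_AVAILABLE_MEMORY - 1024**3  # keep 1 GiB free for the host / Docker VM

explicit_limits = {'db': 2 * 1024**3}  # services that already carry a mem_limit
auto_services = ['web', 'worker']      # services without a limit that receive the auto value

for name, limit in explicit_limits.items():
    if limit > unassigned_memory:
        raise ValueError(f"{name} requests more memory than is left unassigned")
    unassigned_memory -= limit

# remaining memory is split evenly across the services without an explicit limit
memory_per_service = math.floor(unassigned_memory / len(auto_services))
print(memory_per_service / 1024**3)  # 2.5 (GiB) with the numbers above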


def _remove_docker_images(self):
print(TerminalColors.HEADER, '\nRemoving all temporary GMT images', TerminalColors.ENDC)

@@ -1060,6 +1111,10 @@ def _setup_services(self):
# This use case is when you have running containers on your host and want to benchmark some code running in them
services = self._usage_scenario.get('services', {})

SYSTEM_ASSIGNABLE_CPU_COUNT = int(subprocess.check_output(['docker', 'info', '--format', '{{.NCPU}}'], encoding='UTF-8', errors='replace').strip()) -1
if SYSTEM_ASSIGNABLE_CPU_COUNT <= 0:
raise RuntimeError(f"Cannot assign docker containers to any CPU as no CPUs are available to Docker. Available CPU count: {SYSTEM_ASSIGNABLE_CPU_COUNT}")

# Check if there are service dependencies defined with 'depends_on'.
# If so, change the order of the services accordingly.
services_ordered = self._order_services(services)
@@ -1275,24 +1330,17 @@ def _setup_services(self):
if 'pause-after-phase' in service:
self.__services_to_pause_phase[service['pause-after-phase']] = self.__services_to_pause_phase.get(service['pause-after-phase'], []) + [container_name]

# wildly the docker compose spec allows deploy to be None ... thus we need to check and cannot .get()
if 'deploy' in service and service['deploy'] is not None and (memory := service['deploy'].get('resources', {}).get('limits', {}).get('memory', None)):
docker_run_string.append('--memory') # value in bytes
docker_run_string.append(str(memory))
print('Applying Memory Limit from deploy')
elif memory := service.get('mem_limit', None): # we only need to get resources or cpus. they must align anyway
docker_run_string.append('--memory')
docker_run_string.append(str(memory)) # value in bytes e.g. "10M"
print('Applying Memory Limit from services')

if 'deploy' in service and service['deploy'] is not None and (cpus := service['deploy'].get('resources', {}).get('limits', {}).get('cpus', None)):
docker_run_string.append('--cpus') # value in cores
docker_run_string.append(str(cpus))
print('Applying CPU Limit from deploy')
elif cpus := service.get('cpus', None): # we only need to get resources or cpus. they must align anyway
docker_run_string.append('--cpus')
docker_run_string.append(str(cpus)) # value in (fractional) cores
print('Applying CPU Limit from services')
# apply cpuset but keep one core for GMT and metric providers free
# This cannot be configured via user as no knowledge of machine shall be required
docker_run_string.append('--cpuset-cpus')
docker_run_string.append(','.join(map(str, range(1,SYSTEM_ASSIGNABLE_CPU_COUNT+1)))) # range inclusive as we do not assign to 0
Review comment (Contributor):

I think there is a bug. Based on my understanding, the +1 is wrong here and it should be

docker_run_string.append(','.join(map(str, range(1,SYSTEM_ASSIGNABLE_CPU_COUNT))))

If there are 4 cores in total, the variable SYSTEM_ASSIGNABLE_CPU_COUNT has the value 3. --cpuset-cpus is expected to be set to 1-3, but with the current implementation it is set to 1-4.

Reply from @ArneTR (Member, Author), Dec 3, 2025:

Did you take into account the perhaps unexpected behaviour of Python's range() function? The right boundary is exclusive, not inclusive.

>>> list(range(1,4))
[1, 2, 3]

Please follow up if I misunderstood the bug report.

docker_run_string.append('--memory-swappiness=0') # GMT should never swap as it gives hard to interpret / non-linear performance results
docker_run_string.append('--oom-score-adj=1000') # containers will be killed first so host does not OOM
docker_run_string.append(f"--memory={service['mem_limit']}")
docker_run_string.append(f"--memory-swap={service['mem_limit']}") # effectively disable swap

docker_run_string.append(f"--cpus={service['cpus']}")

if 'healthcheck' in service: # must come last
if 'disable' in service['healthcheck'] and service['healthcheck']['disable'] is True:
Expand Down Expand Up @@ -2378,6 +2426,7 @@ def run(self):
self._register_machine_id()
self._import_metric_providers()
self._populate_image_names()
self._populate_cpu_and_memory_limits()
self._prepare_docker()
self._check_running_containers_before_start()
self._remove_docker_images()
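For illustration, this is roughly what the appended limit flags from the hunk above amount to for a single service. The service values and the 4-core host are made up, and the actual command GMT assembles contains many more options than shown here:

# hypothetical service after _populate_cpu_and_memory_limits() has filled in the limits
service = {'mem_limit': '512M', 'cpus': 3}
SYSTEM_ASSIGNABLE_CPU_COUNT = 3  # e.g. a 4-core host; CPU 0 stays reserved for GMT and the metric providers

docker_run_string = ['docker', 'run']
docker_run_string.append('--cpuset-cpus')
docker_run_string.append(','.join(map(str, range(1, SYSTEM_ASSIGNABLE_CPU_COUNT + 1))))  # '1,2,3'
docker_run_string.append('--memory-swappiness=0')
docker_run_string.append('--oom-score-adj=1000')
docker_run_string.append(f"--memory={service['mem_limit']}")
docker_run_string.append(f"--memory-swap={service['mem_limit']}")  # same value, so swap is effectively disabled
docker_run_string.append(f"--cpus={service['cpus']}")

print(' '.join(docker_run_string))
# docker run --cpuset-cpus 1,2,3 --memory-swappiness=0 --oom-score-adj=1000 --memory=512M --memory-swap=512M --cpus=3
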
18 changes: 15 additions & 3 deletions lib/system_checks.py
@@ -28,7 +28,7 @@

GMT_Resources = {
'free_disk': 1024 ** 3, # 1GB in bytes
'free_memory': 1024 ** 3, # 1GB in bytes
'free_memory': 2 * 1024 ** 3, # 2GB in bytes
}

######## CHECK FUNCTIONS ########
@@ -78,6 +78,16 @@ def check_free_disk(*_, **__):
free_space_bytes = psutil.disk_usage(os.path.dirname(os.path.abspath(__file__))).free
return free_space_bytes >= GMT_Resources['free_disk']

def check_available_cpus(*_, **__):
docker_reported_cpus = int(subprocess.check_output(['docker', 'info', '--format', '{{.NCPU}}'], encoding='UTF-8', errors='replace').strip())
return os.cpu_count() >= 2 and docker_reported_cpus >= 2

def check_docker_cpu_availability(*_, **__):
if platform.system() == 'Darwin':
return True # no checks on macOS as docker runs in VM here with custom CPU configuration
docker_reported_cpus = subprocess.check_output(['docker', 'info', '--format', '{{.NCPU}}'], encoding='UTF-8', errors='replace').strip()
return str(os.cpu_count()) == docker_reported_cpus # not casting to int to not get unexpected 0 or 1

def check_free_memory(*_, **__):
return psutil.virtual_memory().available >= GMT_Resources['free_memory']

@@ -154,8 +164,10 @@ def check_steal_time(*_, **__):
(check_ntp, Status.WARN, 'ntp', 'You have NTP time syncing active. This can create noise in runs and should be deactivated.'),
(check_cpu_utilization, Status.WARN, '< 5% CPU utilization', 'Your system seems to be busy. Utilization is above 5%. Consider terminating some processes for a more stable measurement.'),
(check_largest_sampling_rate, Status.WARN, 'high sampling rate', 'You have chosen at least one provider with a sampling rate > 1000 ms. That is not recommended and might lead also to longer benchmarking times due to internal extra sleeps to adjust measurement frames.'),
(check_free_disk, Status.ERROR, '1 GiB free hdd space', 'We recommend to free up some disk space (< 1GiB available)'),
(check_free_memory, Status.ERROR, '1 GiB free memory', 'No free memory! Please kill some programs (< 1GiB available)'),
(check_available_cpus, Status.ERROR, '< 2 CPUs', 'You need at least 2 CPU cores on the system (and assigned to Docker in case of macOS) to run GMT'),
(check_docker_cpu_availability, Status.ERROR, 'Docker CPU reporting', 'Docker reports a different amount of available CPUs than the system itself - GMT cannot handle this currently'),
(check_free_disk, Status.ERROR, '1 GiB free hdd space', 'You need to free up some disk space to run GMT reliably (< 1GiB available)'),
(check_free_memory, Status.ERROR, '2 GiB free memory', 'No free memory! Please kill some programs (< 2GiB available)'),
(check_docker_daemon, Status.ERROR, 'docker daemon', 'The docker daemon could not be reached. Are you running in rootless mode or have added yourself to the docker group? See installation: [See https://docs.green-coding.io/docs/installation/]'),
(check_docker_host_env, Status.ERROR, 'docker host env', 'You seem to be running a rootless docker and in this case you must set the DOCKER_HOST environment variable so that the docker library we use can find the docker agent. Typically this should be DOCKER_HOST=unix:///$XDG_RUNTIME_DIR/docker.sock'),
(check_containers_running, Status.WARN, 'running containers', 'You have other containers running on the system. This is usually what you want in local development, but for undisturbed measurements consider going for a measurement cluster [See https://docs.green-coding.io/docs/installation/installation-cluster/].'),
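A small sketch of what the two new CPU checks boil down to, assuming Docker is installed and reachable (on macOS the equality check is skipped because Docker runs inside a VM with its own CPU configuration):

import os
import platform
import subprocess

def docker_ncpu():
    # same query the checks use: docker info --format '{{.NCPU}}'
    return subprocess.check_output(
        ['docker', 'info', '--format', '{{.NCPU}}'],
        encoding='UTF-8', errors='replace').strip()

# mirrors check_available_cpus: both the OS and Docker must expose at least 2 CPUs
print(os.cpu_count() >= 2 and int(docker_ncpu()) >= 2)

# mirrors check_docker_cpu_availability: on Linux the two numbers must agree exactly
if platform.system() != 'Darwin':
    print(str(os.cpu_count()) == docker_ncpu())
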
32 changes: 32 additions & 0 deletions lib/utils.py
@@ -226,3 +226,35 @@ def find_own_cgroup_name():
if found_cgroups != 1:
raise RuntimeError(f"Could not find GMT\'s own cgroup or found too many. Amount: {found_cgroups}")
return lines[0].split('/')[-1].strip()

def docker_memory_to_bytes(memory_value):
"""Convert a Docker memory string with units (e.g., '50M', '2G') to bytes.
Although GMT internally works with MiB, this helper converts values written in Docker syntax."""
unit_multipliers = {
'B': 1, # byte
'K': 1_024, # KiB
'M': 1_024**2, # MiB
'G': 1_024**3, # GiB
'T': 1_024**4, # TiB
}

if isinstance(memory_value, (float, int)) or memory_value[-1].isdigit():
# in case of float this will round down. but since float would be pure bytes anyway
# we must floor the value in any case as no less than a byte can be accounted
return int(memory_value)

# Although not specified in the Docker documentation, values like '10m' and also '10MB' are accepted.
# So if we see a trailing 'b'/'B' we strip it.
if memory_value[-1] == 'b' or memory_value[-1] == 'B':
memory_value = memory_value[:-1]

if memory_value[-1].isdigit():
unit = 'B'
num = memory_value
else:
num, unit = float(memory_value[:-1]), memory_value[-1].upper()

if unit in unit_multipliers:
return int(num * unit_multipliers[unit])

raise ValueError(f"Unrecognized memory unit: {unit}")
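
A few example conversions based on the implementation above (note the binary multipliers, so '10m' and '10MB' both resolve to 10 MiB); the import path assumes the repository layout shown in this diff:

from lib import utils

assert utils.docker_memory_to_bytes(500) == 500              # plain numbers are taken as bytes
assert utils.docker_memory_to_bytes('1024') == 1024
assert utils.docker_memory_to_bytes('10m') == 10 * 1024**2   # lowercase units are accepted
assert utils.docker_memory_to_bytes('10MB') == 10 * 1024**2  # trailing 'B' is stripped
assert utils.docker_memory_to_bytes('2G') == 2 * 1024**3
assert utils.docker_memory_to_bytes('1.5G') == int(1.5 * 1024**3)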
26 changes: 3 additions & 23 deletions optimization_providers/resources/utilization.py
@@ -2,6 +2,7 @@

from optimization_providers.base import Criticality, register_reporter
from lib import error_helpers
from lib import utils

REPORTER_NAME = 'utilization'
REPORTER_ICON = 'tachometer alternate'
@@ -10,33 +11,13 @@
MAX_CPU_UTIL = 90 #%
MIN_CPU_UTIL = 50 #%

def memory_to_bytes(memory_str):
"""Convert memory string with units (e.g., '50M', '2G') to bytes."""
unit_multipliers = {
'K': 1_000, # Kilobyte
'M': 1_000_000, # Megabyte
'G': 1_000_000_000, # Gigabyte
'T': 1_000_000_000, # Terabyte
}

if isinstance(memory_str, int) or memory_str[-1].isdigit():
return int(memory_str)

num, unit = float(memory_str[:-1]), memory_str[-1].upper()

if unit in unit_multipliers:
return int(num * unit_multipliers[unit])

raise ValueError(f"Unrecognized memory unit: {unit}")

# pylint: disable=unused-argument
@register_reporter('container_memory_utilization', Criticality.INFO, REPORTER_NAME, REPORTER_ICON, req_providers =['MemoryUsedCgroupContainerProvider'])
def container_memory_utilization(self, run, measurements, repo_path, network, notes, phases):

mem = {}
for s, d in run.get('usage_scenario').get('services').items():
if x := d.get('deploy', {}).get('resources', {}).get('limits', {}).get('memory', None):
mem[s] = memory_to_bytes(x)
mem[s] = utils.docker_memory_to_bytes(d['mem_limit']) # will always be there bc populated by scenario_runner

for service, measurement_stats in phases['data']['[RUNTIME]']['data']['memory_used_cgroup_container']['data'].items():
if not service in mem:
@@ -68,8 +49,7 @@ def container_cpu_utilization(self, run, measurements, repo_path, network, notes

cpus = {}
for s, d in run.get('usage_scenario').get('services').items():
if x := d.get('deploy', {}).get('resources', {}).get('limits', {}).get('cpus', None):
cpus[s] = float(x)
cpus[s] = float(d['cpus']) # will always be there bc populated by scenario_runner

for service, measurement_stats in phases['data']['[RUNTIME]']['data']['cpu_utilization_cgroup_container']['data'].items():
if not service in cpus:
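Because mem_limit and cpus are now always populated by the scenario runner, the reporters no longer need to guard against missing keys. A rough sketch of the kind of comparison they can then perform, with made-up measurement values (the reporters' actual data structures are only partially visible in this diff):

MAX_CPU_UTIL = 90  # %
MIN_CPU_UTIL = 50  # %

# hypothetical per-service limits as the scenario runner would populate them
limits = {'web': {'mem_limit_bytes': 512 * 1024**2, 'cpus': 2.0}}

# hypothetical measured means for the runtime phase
measured = {'web': {'memory_used_bytes': 480 * 1024**2, 'cpu_util_percent': 95.0}}

for service, m in measured.items():
    mem_pct = m['memory_used_bytes'] / limits[service]['mem_limit_bytes'] * 100
    print(f"{service}: memory at {mem_pct:.1f}% of its limit")
    if m['cpu_util_percent'] > MAX_CPU_UTIL:
        print(f"{service}: CPU utilization above {MAX_CPU_UTIL}%, consider raising the cpus limit")
    elif m['cpu_util_percent'] < MIN_CPU_UTIL:
        print(f"{service}: CPU utilization below {MIN_CPU_UTIL}%, the cpus limit may be oversized")
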
37 changes: 37 additions & 0 deletions tests/data/usage_scenarios/basic_stress_with_limits.yml
@@ -0,0 +1,37 @@
---
name: Test Stress
author: Dan Mateas
description: test

services:
test-container:
type: container
image: gcb_stress
mem_limit: 100MB
cpus: 2
build:
context: ../stress-application

flow:

- name: I am a hidden phase
container: test-container
hidden: true
commands:
- type: console
command: echo 1

- name: Stress
container: test-container
commands:
- type: console
command: stress-ng -c 1 -t 1 -q
note: Starting Stress

- name: I am hidden too
container: test-container
hidden: true
commands:
- type: console
command: echo 1
note: echo
5 changes: 5 additions & 0 deletions tests/data/usage_scenarios/resource_limits_good.yml
@@ -71,6 +71,11 @@ services:
image: alpine
deploy: # allowed to be None

test-container-limits-none:
type: container
image: alpine
# no deploy key at all

test-container-cpu-and-memory-in-both:
type: container
image: alpine
14 changes: 4 additions & 10 deletions tests/data/usage_scenarios/resource_limits_shm_good.yml
@@ -7,22 +7,16 @@ services:
test-container:
type: container
image: alpine
deploy:
resources:
limits:
cpus: "1.2"
memory: "10MB"
cpus: "1.2"
mem_limit: "10MB"

shm_size: "30MB"

test-container-2:
type: container
image: alpine
deploy:
resources:
limits:
cpus: "1.2"
memory: "10MB"
cpus: "1.2"
mem_limit: "10MB"

shm_size: 15728640

30 changes: 30 additions & 0 deletions tests/data/usage_scenarios/resource_limits_too_high.yml
@@ -0,0 +1,30 @@
---
name: Testing SHM
author: Arne Tarara
description: Testing Too high limits

services:
test-container:
type: container
image: alpine
cpus: 400


flow:
- name: Testing SHM
container: test-container
commands:
- type: console
command: 'echo "SHM size is: $(df -h /dev/shm)"'
shell: sh
log-stdout: True
log-stderr: True

- name: Testing SHM 2
container: test-container-2
commands:
- type: console
command: 'echo "SHM size is: $(df -h /dev/shm)"'
shell: sh
log-stdout: True
log-stderr: True
1 change: 1 addition & 0 deletions tests/test_functions.py
@@ -345,6 +345,7 @@ def run_steps(self, stop_at=None):
if stop_at == 'import_metric_providers':
return
self.__runner._populate_image_names()
self.__runner._populate_cpu_and_memory_limits()
self.__runner._prepare_docker()
self.__runner._check_running_containers_before_start()
self.__runner._remove_docker_images()