Skip to content

feat(runtime): use prlimit to limit resource usage of command to avoid OOM Runtime Kill #6338

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 43 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
7421aa1
log more mem info
xingyaoww Jan 18, 2025
501824a
simplify remote stress test a little bit
xingyaoww Jan 18, 2025
867f672
reliable way to reproduce error
xingyaoww Jan 18, 2025
c6902da
use a more reasonable tests
xingyaoww Jan 18, 2025
5c44726
Merge branch 'main' into xw/bash-perf
xingyaoww Feb 3, 2025
61b87ce
feat(runtime): add memory monitoring to prevent k8s OOM kills
openhands-agent Feb 3, 2025
54ac167
update lock
xingyaoww Feb 3, 2025
fc18e5c
update memory monitor for action execution server
xingyaoww Feb 3, 2025
feda348
monitor the entire pg
xingyaoww Feb 3, 2025
ed35d53
fix recursive call
xingyaoww Feb 3, 2025
6d6adba
Merge commit 'f24fbec165de33749500dc06c9b6e753b588dbf9' into xw/bash-…
xingyaoww Feb 3, 2025
5f33ae1
update log
xingyaoww Feb 3, 2025
4699e91
use prlimit to restrict memory usage
xingyaoww Feb 3, 2025
7fda066
fix prlimit
xingyaoww Feb 3, 2025
33737cd
also support running stress test locally
xingyaoww Feb 3, 2025
9da4550
log memory stuff in case of high system pressure
xingyaoww Feb 3, 2025
57afe20
tweak tests
xingyaoww Feb 3, 2025
6bc5ca3
combine docker stress test with remote runtime
xingyaoww Feb 3, 2025
d2d57fe
remove save perf debug
xingyaoww Feb 4, 2025
e09ac90
makes it work for both remote and docker tests
xingyaoww Feb 4, 2025
a1d200c
allow override max memory gb in action execution server; try to get…
xingyaoww Feb 4, 2025
19f025b
ok got this working with docker
xingyaoww Feb 4, 2025
6e678fd
use pss instead of rss for process mem
xingyaoww Feb 4, 2025
74d048b
Merge branch 'main' into xw/bash-perf
enyst Feb 4, 2025
507c0a9
update runtime startup command for remote runtime too
xingyaoww Feb 5, 2025
a6bbbfe
Merge commit '74d048b62341b33e32961b861e6312ed70086ac6' into xw/bash-…
xingyaoww Feb 5, 2025
9b92118
update lock
xingyaoww Feb 5, 2025
f39181c
update stresstest script
xingyaoww Feb 5, 2025
81634ea
Merge commit '5fa2634d6070b84e912bb85017cf686cd7abecdf' into xw/bash-…
xingyaoww Feb 7, 2025
34e36d4
add stress test for file editing
xingyaoww Feb 10, 2025
a9dc6d4
add a memory test that can run in CI
xingyaoww Feb 10, 2025
239ee06
revert memory monitor
xingyaoww Feb 10, 2025
b039201
Merge commit 'b12b426e3ded6934b289e2efe2dd7ad0c7d181c1' into xw/bash-…
xingyaoww Feb 10, 2025
a51c9c5
simplify dep
xingyaoww Feb 10, 2025
995a3fd
revert more changes
xingyaoww Feb 10, 2025
2a38c54
revert even more changes
xingyaoww Feb 10, 2025
0a157a3
use lock from main
xingyaoww Feb 10, 2025
bf5dcbf
update comment
xingyaoww Feb 10, 2025
ac5ee21
add another test where we use higher limit and expect the stress test…
xingyaoww Feb 10, 2025
e64f477
Update openhands/runtime/action_execution_server.py
xingyaoww Feb 11, 2025
76744ea
only enable prlimit when max_memory_mb is not None
xingyaoww Feb 11, 2025
cc5f738
shorten stress test to 30 sec
xingyaoww Feb 11, 2025
679d27a
Merge branch 'main' into xw/bash-perf
xingyaoww Feb 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions openhands/runtime/action_execution_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from pathlib import Path
from zipfile import ZipFile

import psutil
from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse, StreamingResponse
Expand Down Expand Up @@ -108,6 +109,22 @@ def __init__(
self.last_execution_time = self.start_time
self._initialized = False

if _override_max_memory_gb := os.environ.get('RUNTIME_MAX_MEMORY_GB', None):
self.max_memory_gb = int(_override_max_memory_gb)
logger.info(
f'Setting max memory to {self.max_memory_gb}GB (according to the RUNTIME_MAX_MEMORY_GB environment variable)'
)
else:
# Get available system memory
total_memory_gb = psutil.virtual_memory().total / (
1024 * 1024 * 1024
) # Convert to GB
self.max_memory_gb = int(max(0.5, total_memory_gb - 1.0))
# Reserve 1GB as head room, minimum of 0.5GB
logger.info(
f'Total memory: {total_memory_gb}GB, setting limit to {self.max_memory_gb}GB (reserved 1GB for action execution server, minimum 0.5GB)'
)

@property
def initial_cwd(self):
return self._initial_cwd
Expand All @@ -120,8 +137,10 @@ async def ainit(self):
no_change_timeout_seconds=int(
os.environ.get('NO_CHANGE_TIMEOUT_SECONDS', 30)
),
max_memory_mb=self.max_memory_gb * 1024,
)
self.bash_session.initialize()

await wait_all(
(self._init_plugin(plugin) for plugin in self.plugins_to_load),
timeout=30,
Expand Down
10 changes: 7 additions & 3 deletions openhands/runtime/impl/remote/remote_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,13 +212,17 @@ def _start_runtime(self):
plugins=self.plugins,
app_config=self.config,
)
# Only export DEBUG when debugging is requested. Previously the conditional
# expression sat inside the dict literal, so the non-debug case produced
# {'DEBUG': {}} (a dict as an env value) instead of an empty mapping.
environment = {}
if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true':
    environment['DEBUG'] = 'true'
# Sandbox startup variables are merged last, so they win on key conflicts.
environment.update(self.config.sandbox.runtime_startup_env_vars)
start_request = {
'image': self.container_image,
'command': command,
'working_dir': '/openhands/code/',
'environment': {'DEBUG': 'true'}
if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true'
else {},
'environment': environment,
'session_id': self.sid,
'resource_factor': self.config.sandbox.remote_runtime_resource_factor,
}
Expand Down
16 changes: 12 additions & 4 deletions openhands/runtime/utils/bash.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,25 +175,32 @@ def __init__(
work_dir: str,
username: str | None = None,
no_change_timeout_seconds: int = 30,
max_memory_mb: int | None = None,
):
self.NO_CHANGE_TIMEOUT_SECONDS = no_change_timeout_seconds
self.work_dir = work_dir
self.username = username
self._initialized = False
self.max_memory_mb = max_memory_mb

def initialize(self):
self.server = libtmux.Server()
window_command = '/bin/bash'
_shell_command = '/bin/bash'
if self.username in ['root', 'openhands']:
# This starts a non-login (new) shell for the given user
window_command = f'su {self.username} -'
_shell_command = f'su {self.username} -'
# otherwise, we are running as the CURRENT USER (e.g., when running LocalRuntime)
if self.max_memory_mb is not None:
window_command = (
f'prlimit --as={self.max_memory_mb * 1024 * 1024} {_shell_command}'
)
else:
window_command = _shell_command

logger.debug(f'Initializing bash session with command: {window_command}')
session_name = f'openhands-{self.username}-{uuid.uuid4()}'
self.session = self.server.new_session(
session_name=session_name,
window_name='bash',
window_command=window_command,
start_directory=self.work_dir,
kill_session=True,
x=1000,
Expand All @@ -207,6 +214,7 @@ def initialize(self):
# We need to create a new pane because the initial pane's history limit is (default) 2000
_initial_window = self.session.attached_window
self.window = self.session.new_window(
window_name='bash',
window_shell=window_command,
start_directory=self.work_dir,
)
Expand Down
113 changes: 113 additions & 0 deletions tests/runtime/test_runtime_resource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Stress tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""

from conftest import _close_test_runtime, _load_runtime

from openhands.core.logger import openhands_logger as logger
from openhands.events.action import CmdRunAction


def test_stress_docker_runtime(temp_dir, runtime_cls, repeat=1):
    """Run a general stress-ng workload inside a resource-constrained runtime.

    Installs ``stress-ng`` in the sandbox and then runs ``stress-ng --all 1``
    for 30 seconds, ``repeat`` times. Only the installation step is asserted
    on; the stress runs themselves are logged without checking their exit
    code.

    Args:
        temp_dir: Passed through to ``_load_runtime`` (presumably a pytest
            fixture providing a scratch directory -- confirm in conftest).
        runtime_cls: Runtime class handed to ``_load_runtime``.
        repeat: Number of times the 30-second stress run is executed.
    """
    runtime, _config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
        },
    )
    # Always release the runtime, even if an assertion or action fails.
    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        for _ in range(repeat):
            # Run every stress-ng stressor (one instance each) for 30 seconds;
            # the hard timeout leaves headroom for start-up and teardown.
            action = CmdRunAction(command='stress-ng --all 1 -t 30s')
            action.set_hard_timeout(120)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    finally:
        _close_test_runtime(runtime)


def test_stress_docker_runtime_hit_memory_limits(temp_dir, runtime_cls):
    """Check that a command exceeding the runtime memory cap fails cleanly.

    The runtime is started with ``RUNTIME_MAX_MEMORY_GB=3`` and the command
    asks stress-ng for a 6G VM stressor. The test expects stress-ng to report
    that it ran out of resources and to exit with status 3.

    Args:
        temp_dir: Passed through to ``_load_runtime`` (presumably a pytest
            fixture providing a scratch directory -- confirm in conftest).
        runtime_cls: Runtime class handed to ``_load_runtime``.
    """
    runtime, _config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
            'memswap_limit': '0',  # No swap
            'mem_swappiness': 0,  # Disable swapping
            'oom_kill_disable': False,  # Enable OOM killer
        },
        runtime_startup_env_vars={
            'RUNTIME_MAX_MEMORY_GB': '3',
        },
    )
    # Always release the runtime, even if an assertion or action fails.
    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        # Request twice the configured per-command memory cap.
        action = CmdRunAction(
            command='stress-ng --vm 1 --vm-bytes 6G --timeout 30s --metrics'
        )
        action.set_hard_timeout(120)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert 'aborted early, out of system resources' in obs.content
        # stress-ng reports running out of resources itself (status 3)
        # instead of the whole runtime being OOM-killed.
        assert obs.exit_code == 3
    finally:
        _close_test_runtime(runtime)


def test_stress_docker_runtime_within_memory_limits(temp_dir, runtime_cls):
    """Check that a command under the runtime memory cap completes normally.

    The runtime is started with ``RUNTIME_MAX_MEMORY_GB=7`` and the same 6G
    stress-ng VM stressor used by the over-limit test is run; here it is
    expected to finish with exit code 0.

    Args:
        temp_dir: Passed through to ``_load_runtime`` (presumably a pytest
            fixture providing a scratch directory -- confirm in conftest).
        runtime_cls: Runtime class handed to ``_load_runtime``.
    """
    runtime, _config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
            'memswap_limit': '0',  # No swap
            'mem_swappiness': 0,  # Disable swapping
            'oom_kill_disable': False,  # Enable OOM killer
        },
        runtime_startup_env_vars={
            'RUNTIME_MAX_MEMORY_GB': '7',
        },
    )
    # Always release the runtime, even if an assertion or action fails.
    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        # The 6G request is below the configured 7GB per-command cap.
        action = CmdRunAction(
            command='stress-ng --vm 1 --vm-bytes 6G --timeout 30s --metrics'
        )
        action.set_hard_timeout(120)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0
    finally:
        _close_test_runtime(runtime)
36 changes: 0 additions & 36 deletions tests/runtime/test_stress_docker_runtime.py

This file was deleted.