From 924c888b648dc5ef57c66ffcee3354fb13d04d93 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Mon, 25 Nov 2024 15:52:48 -0500 Subject: [PATCH] Fix early stop returning errcode -15 benchio.0 [message] Terminating process because it ran for longer than 1 seconds. benchio.0 [end (-15)] 'milabench/bin/voir' --config /tmp/extra/benchio/voirconf-benchio.0-384a97a8bb2d5d89323fc897d5a5d82e.json milabench/tests/yoshua-benchio/main.py --sleep 60 --start 1 --end 11 [at 2024-11-25 15:51:09.910958] benchio.0 ========= * Error codes = -15 * No traceback info about the error instead of benchio.0 [message] Terminating process because it ran for longer than 1 seconds. benchio.0 [end] 'milabench/bin/voir' --config /tmp/extra/benchio/voirconf-benchio.0-384a97a8bb2d5d89323fc897d5a5d82e.json milabench/tests/yoshua-benchio/main.py --sleep 60 --start 1 --end 11 [at 2024-11-25 16:00:30.277804] benchio.0 ========= * early stopped --- milabench/commands/executors.py | 1 + tests/config/early_stop.yaml | 20 ++++++++++++++++++++ tests/test_mock.py | 20 ++++++++++++++++++++ tests/yoshua-benchio/main.py | 7 ++++--- 4 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 tests/config/early_stop.yaml diff --git a/milabench/commands/executors.py b/milabench/commands/executors.py index 807a261e2..5d499613d 100644 --- a/milabench/commands/executors.py +++ b/milabench/commands/executors.py @@ -139,6 +139,7 @@ async def execute_command( # kill the underlying process which should force the coro to # return on next wait pack = packs[timedout] + await pack.send(event="stop", data=None) await force_terminate_now(pack, max_delay) # Grace period diff --git a/tests/config/early_stop.yaml b/tests/config/early_stop.yaml new file mode 100644 index 000000000..7a17e28b7 --- /dev/null +++ b/tests/config/early_stop.yaml @@ -0,0 +1,20 @@ +_defaults: + max_duration: 1 + voir: + options: + stop: 10 + interval: "1s" + +benchio: + inherits: _defaults + definition: ../yoshua-benchio + plan: + method: njobs + n: 1 + tags: + - monogpu + + argv: + --sleep: 60 + --start: 1 + --end: 11 diff --git a/tests/test_mock.py b/tests/test_mock.py index 2e41a8388..b0f63f4c9 100644 --- a/tests/test_mock.py +++ b/tests/test_mock.py @@ -2,11 +2,14 @@ import os import milabench.alt_async +from milabench.commands import Command import milabench.commands.executors from milabench.testing import resolved_config import pytest +TEST_FOLDER = os.path.dirname(__file__) + # benchmark that cannot be prepared because they are too big OVERSIZED_BENCHMARKS = { "llm-lora-single", @@ -125,6 +128,23 @@ def test_milabench(monkeypatch, bench, module_tmp_dir, standard_config): # shutil.rmtree(module_tmp_dir) +def test_early_stop(monkeypatch): + args= [ + "--base", "/tmp", + "--config", os.path.join(TEST_FOLDER, "config", "early_stop.yaml"), + "--use-current-env" + ] + + _execute = Command.execute + async def _wrap(self, *args, timeout_delay=None, **kwargs): + del timeout_delay + return await _execute.__call__(self, *args, timeout_delay=1, **kwargs) + + monkeypatch.setattr(Command, "execute", _wrap) + + run_cli("run", *args, "--no-report") + + ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) def cleanpath(out, tmppath): import subprocess diff --git a/tests/yoshua-benchio/main.py b/tests/yoshua-benchio/main.py index de723724d..49a339f5b 100755 --- a/tests/yoshua-benchio/main.py +++ b/tests/yoshua-benchio/main.py @@ -30,9 +30,6 @@ def main(): args = parser.parse_args() - if args.sleep is not None: - time.sleep(args.sleep) - data = [[[i]] for i in range(args.start, args.end)] if args.bad: @@ -40,6 +37,10 @@ def main(): for [[x]] in voir.iterate("train", data, True): give(loss=1 / x) + give(rate=args.end - args.start) + + if args.sleep is not None: + time.sleep(args.sleep) if __name__ == "__main__":