From 924c888b648dc5ef57c66ffcee3354fb13d04d93 Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne <satya.ortiz-gagne@mila.quebec>
Date: Mon, 25 Nov 2024 15:52:48 -0500
Subject: [PATCH] Fix early stop returning errcode -15

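A benchmark that was terminated for exceeding its time limit was reported as failed with error code -15:

benchio.0 [message] Terminating process because it ran for longer than 1 seconds.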
benchio.0 [end (-15)] 'milabench/bin/voir' --config /tmp/extra/benchio/voirconf-benchio.0-384a97a8bb2d5d89323fc897d5a5d82e.json milabench/tests/yoshua-benchio/main.py --sleep 60 --start 1 --end 11 [at 2024-11-25 15:51:09.910958]
benchio.0
=========
  * Error codes = -15
  * No traceback info about the error

instead of the expected early-stop report:

benchio.0 [message] Terminating process because it ran for longer than 1 seconds.
benchio.0 [end] 'milabench/bin/voir' --config /tmp/extra/benchio/voirconf-benchio.0-384a97a8bb2d5d89323fc897d5a5d82e.json milabench/tests/yoshua-benchio/main.py --sleep 60 --start 1 --end 11 [at 2024-11-25 16:00:30.277804]
benchio.0
=========
  * early stopped
---
 milabench/commands/executors.py |  1 +
 tests/config/early_stop.yaml    | 20 ++++++++++++++++++++
 tests/test_mock.py              | 20 ++++++++++++++++++++
 tests/yoshua-benchio/main.py    |  7 ++++---
 4 files changed, 45 insertions(+), 3 deletions(-)
 create mode 100644 tests/config/early_stop.yaml

diff --git a/milabench/commands/executors.py b/milabench/commands/executors.py
index 807a261e2..5d499613d 100644
--- a/milabench/commands/executors.py
+++ b/milabench/commands/executors.py
@@ -139,6 +139,7 @@ async def execute_command(
                     # kill the underlying process which should force the coro to 
                     # return on next wait
                     pack = packs[timedout]
+                    await pack.send(event="stop", data=None)
                     await force_terminate_now(pack, max_delay)
 
                 # Grace period
diff --git a/tests/config/early_stop.yaml b/tests/config/early_stop.yaml
new file mode 100644
index 000000000..7a17e28b7
--- /dev/null
+++ b/tests/config/early_stop.yaml
@@ -0,0 +1,20 @@
+_defaults:  # shared settings; pulled in by benchio through `inherits`
+  max_duration: 1  # NOTE(review): presumably seconds (cf. "ran for longer than 1 seconds" in the log) - confirm
+  voir:
+    options:
+      stop: 10  # NOTE(review): voir option; exact semantics not visible here - confirm
+      interval: "1s"
+
+benchio:
+  inherits: _defaults
+  definition: ../yoshua-benchio  # path is written relative to this config file's folder
+  plan:
+    method: njobs
+    n: 1
+  tags:
+    - monogpu
+
+  argv:
+    --sleep: 60  # main.py calls time.sleep(60) after its loop, so the run outlives max_duration
+    --start: 1  # main.py builds its data from range(start, end)
+    --end: 11  # with start=1 this yields 10 data points
diff --git a/tests/test_mock.py b/tests/test_mock.py
index 2e41a8388..b0f63f4c9 100644
--- a/tests/test_mock.py
+++ b/tests/test_mock.py
@@ -2,11 +2,14 @@
 import os
 
 import milabench.alt_async
+from milabench.commands import Command
 import milabench.commands.executors
 from milabench.testing import resolved_config
 
 import pytest
 
+TEST_FOLDER = os.path.dirname(__file__)
+
 # benchmark that cannot be prepared because they are too big
 OVERSIZED_BENCHMARKS = {
     "llm-lora-single",
@@ -125,6 +128,23 @@ def test_milabench(monkeypatch, bench, module_tmp_dir, standard_config):
     # shutil.rmtree(module_tmp_dir)
 
 
+def test_early_stop(monkeypatch):  # runs the CLI on config/early_stop.yaml with Command.execute patched
+    args= [
+        "--base", "/tmp",
+        "--config", os.path.join(TEST_FOLDER, "config", "early_stop.yaml"),
+        "--use-current-env"
+    ]
+
+    _execute = Command.execute  # keep the unpatched method so the wrapper can delegate to it
+    async def _wrap(self, *args, timeout_delay=None, **kwargs):  # this *args shadows the CLI list only inside _wrap
+        del timeout_delay  # the caller's value is discarded on purpose
+        return await _execute.__call__(self, *args, timeout_delay=1, **kwargs)  # always forwards timeout_delay=1
+
+    monkeypatch.setattr(Command, "execute", _wrap)  # pytest undoes this patch when the test ends
+
+    run_cli("run", *args, "--no-report")  # no explicit assert; NOTE(review): presumably run_cli fails the test on a bad exit - confirm
+
+
 ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
 def cleanpath(out, tmppath):
     import subprocess
diff --git a/tests/yoshua-benchio/main.py b/tests/yoshua-benchio/main.py
index de723724d..49a339f5b 100755
--- a/tests/yoshua-benchio/main.py
+++ b/tests/yoshua-benchio/main.py
@@ -30,9 +30,6 @@ def main():
 
     args = parser.parse_args()
 
-    if args.sleep is not None:
-        time.sleep(args.sleep)
-
     data = [[[i]] for i in range(args.start, args.end)]
 
     if args.bad:
@@ -40,6 +37,10 @@ def main():
 
     for [[x]] in voir.iterate("train", data, True):
         give(loss=1 / x)
+    give(rate=args.end - args.start)
+
+    if args.sleep is not None:
+        time.sleep(args.sleep)
 
 
 if __name__ == "__main__":