Fix tests: adapt eval.py helper calls and add multi-GPU task coverage

ngc92 · ngc92 · commit e5b7a0dc67cd · 2025-08-26T12:41:14.000+02:00
diff --git a/examples/eval.py b/examples/eval.py
@@ -211,7 +211,7 @@ def run_single_test(pool: multiprocessing.Pool, test: TestCase):
     """
     world_size = test.args.get("world_size", None)
     if world_size is None:
-        return pool.apply(_run_single_test, (test, 0, 0))
+        return pool.apply(_run_single_test, (test,))
     else:
         return run_multi_gpu_test(pool, test, world_size)
 
@@ -255,7 +255,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
     durations = []
     # generate input data once
     data = generate_input(**test.args)
-    check_copy = _clone_data(data)
+    check_copy = _clone_data(data, 0)
     #  first, one obligatory correctness check
     output = custom_kernel(data)
     good, message = wrap_check_implementation(check_copy, output)
@@ -275,7 +275,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
                 test.args["seed"] += 13
 
             data = generate_input(**test.args)
-            check_copy = _clone_data(data)
+            check_copy = _clone_data(data, 0)
         torch.cuda.synchronize()
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
@@ -509,7 +509,7 @@ def run_single_profile(test: TestCase) -> str:
     torch.cuda.synchronize()
 
     with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
-        submission_output = custom_kernel(_clone_data(data))
+        submission_output = custom_kernel(_clone_data(data, 0))
         torch.cuda.synchronize()
     return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -106,6 +106,32 @@ def bot(docker_compose, database):
   CUDA: "template.cu"
 """
 
+MULTi_GPU_TASK_YAML = """
+lang: py
+description: "Test task description"
+ranking_by: geom
+multi_gpu: true
+test_timeout: 120
+files:
+  - name: "kernel.py"
+    source: "kernel.py"
+  - name: "submission.py"
+    source: "@SUBMISSION@"
+config:
+  main: "kernel.py"
+tests:
+  - input_size: 1000
+    world_size: 4
+    dtype: "float32"
+benchmarks:
+  - input_size: 10000
+    world_size: 4
+    dtype: "float32"
+templates:
+  Python: "template.py"
+  CUDA: "template.cu"
+"""
+
 
 @pytest.fixture
 def task_directory(tmp_path):
@@ -117,6 +143,7 @@ def task_directory(tmp_path):
 
     # Create task.yml
     Path.write_text(tmp_path / "task.yml", TASK_YAML)
+    Path.write_text(tmp_path / "multi-task.yml", MULTi_GPU_TASK_YAML)
     return tmp_path
 
 
diff --git a/tests/test_task.py b/tests/test_task.py
@@ -14,6 +14,7 @@
     build_task_config,
     make_task_definition,
 )
+from libkernelbot.utils import KernelBotError
 
 
 @pytest.fixture()
@@ -148,6 +149,7 @@ def test_build_task_config_python(leaderboard_task):
             {"input_size": 5000, "dtype": "float16"},
         ],
         "mode": mode.value,
+        "multi_gpu": False,
         "test_timeout": 120,
         "benchmark_timeout": 180,
         "ranked_timeout": 180,
@@ -201,6 +203,7 @@ def test_build_task_config_cuda():
             {"input_size": 5000, "dtype": "float16"},
         ],
         "mode": mode.value,
+        "multi_gpu": False,
         "test_timeout": 120,
         "benchmark_timeout": 180,
         "ranked_timeout": 180,
@@ -234,3 +237,16 @@ def test_make_task_definition(task_directory):
     assert task.benchmarks == [{"input_size": 10000, "dtype": "float32"}]
     assert isinstance(task.config, PythonTaskData)
     assert task.config.main == "kernel.py"
+
+
+def test_multi_gpu_task(task_directory):
+    """Check that make_task_definition rejects and accepts multi-GPU tasks as expected."""
+    orig = (task_directory / "task.yml").read_text()
+    (task_directory / "task.yml").write_text(orig + "\nmulti_gpu: true")
+
+    # multi_gpu is now enabled on task.yml, but no world_size is specified => KernelBotError
+    with pytest.raises(KernelBotError, match="does not specify world_size"):
+        make_task_definition(task_directory / "task.yml")
+
+    result = make_task_definition(task_directory / "multi-task.yml")
+    assert result.task.multi_gpu is True