From f48486a2560a2f6be944c53f52e13dd141536dea Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Fri, 22 Aug 2025 01:33:55 +0200
Subject: [PATCH 1/9] utility fixture

---
 unit-tests/conftest.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/unit-tests/conftest.py b/unit-tests/conftest.py
index 170b0196..0408b82e 100644
--- a/unit-tests/conftest.py
+++ b/unit-tests/conftest.py
@@ -8,14 +8,10 @@
 
 
 @pytest.fixture(scope="module")
-def docker_compose():
-    tgt_path = Path.cwd()
-    if tgt_path.name == "unit-tests":
-        tgt_path = tgt_path.parent
-
+def docker_compose(project_root: Path):
     """Start a test database and run migrations"""
     subprocess.check_call(
-        ["docker", "compose", "-f", "docker-compose.test.yml", "up", "-d"], cwd=tgt_path
+        ["docker", "compose", "-f", "docker-compose.test.yml", "up", "-d"], cwd=project_root
     )
 
     try:
@@ -25,7 +21,7 @@ def docker_compose():
                 ["docker", "compose", "-f", "docker-compose.test.yml", "ps", "-q", "migrate-test"],
                 capture_output=True,
                 text=True,
-                cwd=tgt_path,
+                cwd=project_root,
             )
 
             if not result.stdout.strip():  # Container no longer exists
@@ -37,7 +33,7 @@ def docker_compose():
             ["docker", "compose", "-f", "docker-compose.test.yml", "logs", "migrate-test"],
             capture_output=True,
             text=True,
-            cwd=tgt_path,
+            cwd=project_root,
         )
 
         if "error" in logs.stdout.lower():
@@ -46,7 +42,7 @@ def docker_compose():
         yield
     finally:
         subprocess.run(
-            ["docker", "compose", "-f", "docker-compose.test.yml", "down", "-v"], cwd=tgt_path
+            ["docker", "compose", "-f", "docker-compose.test.yml", "down", "-v"], cwd=project_root
         )
 
 
@@ -122,3 +118,8 @@ def task_directory(tmp_path):
     # Create task.yml
     Path.write_text(tmp_path / "task.yml", TASK_YAML)
     return tmp_path
+
+
+@pytest.fixture(scope="session")
+def project_root():
+    return Path(__file__).parent.parent

From 9483faa35dd73d91842dd903a8d65db6922fe06d Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Fri, 22 Aug 2025 01:34:12 +0200
Subject: [PATCH 2/9] pin torch version

---
 src/runners/modal_runner.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/runners/modal_runner.py b/src/runners/modal_runner.py
index eb74d173..bb8d952a 100644
--- a/src/runners/modal_runner.py
+++ b/src/runners/modal_runner.py
@@ -33,10 +33,10 @@
         "PyYAML",
     )
     .pip_install(
-        "torch~=2.7",
+        "torch>=2.7.0,<2.8.0",
         "torchvision~=0.22",
-        "torchaudio~=2.7",
-        index_url="https://download.pytorch.org/whl/cu128"
+        "torchaudio>=2.7.0,<2.8.0",
+        index_url="https://download.pytorch.org/whl/cu128",
     )
     # other frameworks
     .pip_install(

From 030d51b7f782a570bb68546836e88a844cd3236a Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Fri, 22 Aug 2025 01:35:48 +0200
Subject: [PATCH 3/9] added basic testing for modal launcher

---
 unit-tests/test_modal.py | 199 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 199 insertions(+)
 create mode 100644 unit-tests/test_modal.py

diff --git a/unit-tests/test_modal.py b/unit-tests/test_modal.py
new file mode 100644
index 00000000..043b63de
--- /dev/null
+++ b/unit-tests/test_modal.py
@@ -0,0 +1,199 @@
+import os
+import subprocess
+import pytest
+from pathlib import Path
+
+from libkernelbot.launchers import ModalLauncher
+from libkernelbot.consts import SubmissionMode, GPU_TO_SM, ModalGPU
+from libkernelbot.task import make_task_definition, build_task_config
+from libkernelbot.report import RunProgressReporter
+
+
+class MockProgressReporter(RunProgressReporter):
+    """Test progress reporter that captures messages."""
+
+    def __init__(self, title: str = "Test Modal Run"):
+        super().__init__(title)
+        self.messages = []
+        self.updates = []
+
+    async def push(self, message: str):
+        self.messages.append(message)
+
+    async def update(self, message: str):
+        self.updates.append(message)
+
+
+@pytest.fixture(scope="session")
+def modal_deployment(project_root: Path):
+    """
+    Fixture that ensures Modal is deployed before running tests.
+    Runs once per test session and deploys to the specified Modal environment.
+    """
+    # Determine Modal environment (defaults to 'pytest' if PYTEST_MODAL_ENV is not set)
+    modal_env = os.getenv("PYTEST_MODAL_ENV", "pytest")
+
+    print(f"🚀 Deploying to Modal environment: {modal_env}")
+
+    # Deploy to Modal with specific environment
+    try:
+        result = subprocess.run(
+            ["modal", "deploy", "--env", modal_env, "modal_runner_archs.py"],
+            cwd=project_root / "src" / "runners",
+            capture_output=True,
+            text=True,
+            timeout=600  # 10 minute timeout in case image needs to be built
+        )
+
+        if result.returncode != 0:
+            # if it fails simply because the environment does not exist, we can fix that
+            if "No such environment" in result.stderr:
+                result = subprocess.run(
+                    ["modal", "environment", "create", modal_env],
+                    cwd=project_root / "src" / "runners",
+                    capture_output=True,
+                    text=True,
+                    timeout=30
+                )
+                if result.returncode != 0:
+                    pytest.fail(f"Modal environment `{modal_env}` not available, and failed to create: {result.stderr}")
+                else:
+                    # try again, now that the env exists.
+                    result = subprocess.run(
+                        ["modal", "deploy", "--env", modal_env, "modal_runner_archs.py"],
+                        cwd=project_root / "src" / "runners",
+                        capture_output=True,
+                        text=True,
+                        timeout=300
+                    )
+                    if result.returncode != 0:
+                        pytest.fail(f"Modal deploy failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}")
+            else:
+                pytest.fail(f"Modal deploy failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}")
+
+        print(f"✅ Modal deployment to '{modal_env}' completed successfully")
+        print(f"Deploy output: {result.stdout}")
+
+        # Set the Modal environment for the session
+        original_env = os.environ.get("MODAL_ENVIRONMENT")
+        os.environ["MODAL_ENVIRONMENT"] = modal_env
+
+        yield modal_env
+
+        # Restore original environment
+        if original_env is not None:
+            os.environ["MODAL_ENVIRONMENT"] = original_env
+        elif "MODAL_ENVIRONMENT" in os.environ:
+            del os.environ["MODAL_ENVIRONMENT"]
+
+    except subprocess.TimeoutExpired as e:
+        pytest.fail(f"Modal deploy timed out after 5 minutes:\nstdout: {e.stdout}, stderr:{e.stderr}")
+    except Exception as e:
+        pytest.fail(f"Modal deploy failed with exception: {e}")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("gpu_type", [ModalGPU.T4, ModalGPU.L4, ModalGPU.A100, ModalGPU.H100, ModalGPU.B200])
+async def test_modal_launcher_python_script(modal_deployment, project_root: Path, gpu_type: ModalGPU):
+    """
+    Test ModalLauncher with a real Python script using examples/identity_py.
+    """
+    launcher = ModalLauncher(add_include_dirs=[])
+    reporter = MockProgressReporter("progress")
+
+    # Load the real identity_py task
+    task_path = project_root / "examples" / "identity_py"
+    if not task_path.exists():
+        pytest.skip("examples/identity_py not found - skipping Modal integration test")
+
+    # Load the task definition
+    task_definition = make_task_definition(task_path)
+
+    # Use the actual working submission from the examples
+    submission_content = (task_path / "submission.py").read_text()
+
+    config = build_task_config(
+        task=task_definition.task,
+        submission_content=submission_content,
+        arch=GPU_TO_SM[gpu_type.name],
+        mode=SubmissionMode.TEST
+    )
+
+    result = await launcher.run_submission(config, gpu_type, reporter)
+
+    # Basic structure and success
+    assert result.success, f"Expected successful run, got: {result.error}"
+    assert result.error == ""
+    assert isinstance(result.runs, dict)
+
+    # System info - test actual expected values
+    assert gpu_type.name in result.system.gpu
+    assert "Linux" in result.system.platform
+    assert result.system.torch.startswith("2.7")  # update when the image changes
+
+    # Test run structure
+    assert 'test' in result.runs
+    test_run = result.runs['test']
+
+    # For Python runs, compilation is None
+    assert test_run.compilation is None
+
+    # Run needs to succeed
+    assert test_run.run.success is True
+    assert test_run.run.passed is True
+    assert test_run.run.exit_code == 0
+    assert test_run.run.stdout == ""
+    assert test_run.run.stderr == ""
+    assert test_run.run.duration > 0
+
+    # Test need to succeed
+    assert test_run.run.result['check'] == 'pass'
+    test_count = int(test_run.run.result['test-count'])
+    assert test_count == 5
+    for i in range(test_count):
+        assert test_run.run.result[f'test.{i}.status'] == 'pass'
+        assert 'size:' in test_run.run.result[f'test.{i}.spec']
+        assert 'seed:' in test_run.run.result[f'test.{i}.spec']
+
+    # sanity check for timings
+    assert test_run.start < test_run.end
+
+    # check messages
+    assert reporter.messages == ['⏳ Waiting for Modal run to finish...']
+    assert reporter.updates == ['✅ Waiting for modal run to finish... Done']
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("script", ["cheat-fd.py", "cheat-input.py", "cheat-rng.py"])
+async def test_modal_launcher_failing_script(modal_deployment, project_root: Path, script: str):
+    """
+        Test ModalLauncher with a real Python scripts that are designed to be wrong.
+        """
+    launcher = ModalLauncher(add_include_dirs=[])
+    reporter = MockProgressReporter("progress")
+    gpu_type = ModalGPU.T4
+
+    # Load the real identity_py task
+    task_path = project_root / "examples" / "identity_py"
+    if not task_path.exists():
+        pytest.skip("examples/identity_py not found - skipping Modal integration test")
+
+    # Load the task definition
+    task_definition = make_task_definition(task_path)
+
+    # Use one of the deliberately wrong ("cheat") submissions from the examples
+    submission_content = (task_path / script).read_text()
+    task_definition.task.seed = 653212
+    config = build_task_config(
+        task=task_definition.task,
+        submission_content=submission_content,
+        arch=GPU_TO_SM[gpu_type.name],
+        mode=SubmissionMode.LEADERBOARD,
+    )
+
+    result = await launcher.run_submission(config, gpu_type, reporter)
+
+    # Basic structure and success
+    assert result.success, f"Expected successful run, got: {result.error}"
+    assert result.error == ""
+    assert result.runs['test'].run.passed is False or result.runs['benchmark'].run.passed is False

From 77e290e3c61ab4b0659bba1910442012f030eab2 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Fri, 22 Aug 2025 10:19:52 +0200
Subject: [PATCH 4/9] modal token

---
 .github/workflows/testing.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
index 71fd888f..4b486964 100644
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -11,10 +11,14 @@ jobs:
   unit-tests:
     runs-on: ubuntu-latest
     timeout-minutes: 10
+    env:
+      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
     steps:
       - uses: actions/checkout@v4
       - uses: astral-sh/setup-uv@v4
       - run: uv sync --extra dev
+      - run: uv run modal token set --token-id ${MODAL_TOKEN_ID} --token-secret ${MODAL_TOKEN_SECRET}
       - run: uv run pytest --cov-report term --cov-report html --cov-report xml --cov=src/libkernelbot unit-tests -v
       - uses: actions/upload-artifact@v4
         with:

From e87acbfb97b246552452755f410ae5cefa5a5bb9 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Fri, 22 Aug 2025 11:07:00 +0200
Subject: [PATCH 5/9] use new function name

---
 src/libkernelbot/launchers/modal.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libkernelbot/launchers/modal.py b/src/libkernelbot/launchers/modal.py
index 26adfe33..6c2308ec 100644
--- a/src/libkernelbot/launchers/modal.py
+++ b/src/libkernelbot/launchers/modal.py
@@ -32,7 +32,7 @@ async def run_submission(
 
         result = await loop.run_in_executor(
             None,
-            lambda: modal.Function.lookup("discord-bot-runner", func_name).remote(config=config),
+            lambda: modal.Function.from_name("discord-bot-runner", func_name).remote(config=config),
         )
 
         await status.update("✅ Waiting for modal run to finish... Done")

From f458349db7813631b4ccef6ce9d6896d3ab549bc Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Fri, 22 Aug 2025 11:19:22 +0200
Subject: [PATCH 6/9] split into unit and integration tests

---
 .github/workflows/testing.yml | 21 +++++++++++++--------
 pyproject.toml                |  3 +++
 unit-tests/test_modal.py      |  6 ++++--
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
index 4b486964..9c101191 100644
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -11,15 +11,11 @@ jobs:
   unit-tests:
     runs-on: ubuntu-latest
     timeout-minutes: 10
-    env:
-      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
-      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
     steps:
       - uses: actions/checkout@v4
       - uses: astral-sh/setup-uv@v4
       - run: uv sync --extra dev
-      - run: uv run modal token set --token-id ${MODAL_TOKEN_ID} --token-secret ${MODAL_TOKEN_SECRET}
-      - run: uv run pytest --cov-report term --cov-report html --cov-report xml --cov=src/libkernelbot unit-tests -v
+      - run: uv run pytest --cov-report term --cov-report html --cov-report xml --cov=src/libkernelbot -m "not integration" unit-tests -v
       - uses: actions/upload-artifact@v4
         with:
           name: coverage
@@ -29,13 +25,22 @@ jobs:
         uses: py-cov-action/python-coverage-comment-action@v3
         with:
           GITHUB_TOKEN: ${{ github.token }}
-
       - name: Store Pull Request comment to be posted
         uses: actions/upload-artifact@v4
         if: steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true'
         with:
-          # If you use a different name, update COMMENT_ARTIFACT_NAME accordingly
           name: python-coverage-comment-action
-          # If you use a different name, update COMMENT_FILENAME accordingly
           path: python-coverage-comment-action.txt
 
+  integration-tests:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    env:
+      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v4
+      - run: uv sync --extra dev
+      - run: uv run modal token set --token-id ${MODAL_TOKEN_ID} --token-secret ${MODAL_TOKEN_SECRET}
+      - run: uv run pytest -m integration -v
diff --git a/pyproject.toml b/pyproject.toml
index bb72bd06..0c66e0bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,9 @@ exclude_lines = [
 [tool.pytest.ini_options]
 testpaths = ["scripts", "tests"]
 python_files = ["test_*.py", "*_test.py", "ci_test_*.py"]
+markers = [
+    "integration: integration tests that need to interact externally, e.g., with modal/github actions/etc"
+]
 
 [tool.ruff]
 line-length = 120
diff --git a/unit-tests/test_modal.py b/unit-tests/test_modal.py
index 043b63de..718b086c 100644
--- a/unit-tests/test_modal.py
+++ b/unit-tests/test_modal.py
@@ -42,7 +42,7 @@ def modal_deployment(project_root: Path):
             cwd=project_root / "src" / "runners",
             capture_output=True,
             text=True,
-            timeout=600  # 10 minute timeout in case image needs to be built
+            timeout=600  # 10 minute timeout in case image needs to be built (can be very slow)
         )
 
         if result.returncode != 0:
@@ -64,7 +64,7 @@ def modal_deployment(project_root: Path):
                         cwd=project_root / "src" / "runners",
                         capture_output=True,
                         text=True,
-                        timeout=300
+                        timeout=600
                     )
                     if result.returncode != 0:
                         pytest.fail(f"Modal deploy failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}")
@@ -92,6 +92,7 @@ def modal_deployment(project_root: Path):
         pytest.fail(f"Modal deploy failed with exception: {e}")
 
 
+@pytest.mark.integration
 @pytest.mark.asyncio
 @pytest.mark.parametrize("gpu_type", [ModalGPU.T4, ModalGPU.L4, ModalGPU.A100, ModalGPU.H100, ModalGPU.B200])
 async def test_modal_launcher_python_script(modal_deployment, project_root: Path, gpu_type: ModalGPU):
@@ -163,6 +164,7 @@ async def test_modal_launcher_python_script(modal_deployment, project_root: Path
     assert reporter.updates == ['✅ Waiting for modal run to finish... Done']
 
 
+@pytest.mark.integration
 @pytest.mark.asyncio
 @pytest.mark.parametrize("script", ["cheat-fd.py", "cheat-input.py", "cheat-rng.py"])
 async def test_modal_launcher_failing_script(modal_deployment, project_root: Path, script: str):

From 09c9e8ff93267a807a7ab9a066749f2477d70dd2 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Fri, 22 Aug 2025 11:28:26 +0200
Subject: [PATCH 7/9] rename

---
 .github/workflows/testing.yml                | 4 ++--
 {unit-tests => tests}/conftest.py            | 0
 {unit-tests => tests}/test_backend.py        | 0
 {unit-tests => tests}/test_leaderboard_db.py | 0
 {unit-tests => tests}/test_modal.py          | 0
 {unit-tests => tests}/test_report.py         | 0
 {unit-tests => tests}/test_submission.py     | 0
 {unit-tests => tests}/test_task.py           | 0
 {unit-tests => tests}/test_utils.py          | 0
 9 files changed, 2 insertions(+), 2 deletions(-)
 rename {unit-tests => tests}/conftest.py (100%)
 rename {unit-tests => tests}/test_backend.py (100%)
 rename {unit-tests => tests}/test_leaderboard_db.py (100%)
 rename {unit-tests => tests}/test_modal.py (100%)
 rename {unit-tests => tests}/test_report.py (100%)
 rename {unit-tests => tests}/test_submission.py (100%)
 rename {unit-tests => tests}/test_task.py (100%)
 rename {unit-tests => tests}/test_utils.py (100%)

diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
index 9c101191..5b71a3f7 100644
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -15,7 +15,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: astral-sh/setup-uv@v4
       - run: uv sync --extra dev
-      - run: uv run pytest --cov-report term --cov-report html --cov-report xml --cov=src/libkernelbot -m "not integration" unit-tests -v
+      - run: uv run pytest --cov-report term --cov-report html --cov-report xml --cov=src/libkernelbot -m "not integration" tests -v
       - uses: actions/upload-artifact@v4
         with:
           name: coverage
@@ -43,4 +43,4 @@ jobs:
       - uses: astral-sh/setup-uv@v4
       - run: uv sync --extra dev
       - run: uv run modal token set --token-id ${MODAL_TOKEN_ID} --token-secret ${MODAL_TOKEN_SECRET}
-      - run: uv run pytest -m integration -v
+      - run: uv run pytest -m integration tests -v
diff --git a/unit-tests/conftest.py b/tests/conftest.py
similarity index 100%
rename from unit-tests/conftest.py
rename to tests/conftest.py
diff --git a/unit-tests/test_backend.py b/tests/test_backend.py
similarity index 100%
rename from unit-tests/test_backend.py
rename to tests/test_backend.py
diff --git a/unit-tests/test_leaderboard_db.py b/tests/test_leaderboard_db.py
similarity index 100%
rename from unit-tests/test_leaderboard_db.py
rename to tests/test_leaderboard_db.py
diff --git a/unit-tests/test_modal.py b/tests/test_modal.py
similarity index 100%
rename from unit-tests/test_modal.py
rename to tests/test_modal.py
diff --git a/unit-tests/test_report.py b/tests/test_report.py
similarity index 100%
rename from unit-tests/test_report.py
rename to tests/test_report.py
diff --git a/unit-tests/test_submission.py b/tests/test_submission.py
similarity index 100%
rename from unit-tests/test_submission.py
rename to tests/test_submission.py
diff --git a/unit-tests/test_task.py b/tests/test_task.py
similarity index 100%
rename from unit-tests/test_task.py
rename to tests/test_task.py
diff --git a/unit-tests/test_utils.py b/tests/test_utils.py
similarity index 100%
rename from unit-tests/test_utils.py
rename to tests/test_utils.py

From b5abc33023e668b551ab657b92eec4919cd36586 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Fri, 22 Aug 2025 11:38:20 +0200
Subject: [PATCH 8/9] lint

---
 tests/test_modal.py | 66 ++++++++++++++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 25 deletions(-)

diff --git a/tests/test_modal.py b/tests/test_modal.py
index 718b086c..72873cf4 100644
--- a/tests/test_modal.py
+++ b/tests/test_modal.py
@@ -1,12 +1,13 @@
 import os
 import subprocess
-import pytest
 from pathlib import Path
 
+import pytest
+
+from libkernelbot.consts import GPU_TO_SM, ModalGPU, SubmissionMode
 from libkernelbot.launchers import ModalLauncher
-from libkernelbot.consts import SubmissionMode, GPU_TO_SM, ModalGPU
-from libkernelbot.task import make_task_definition, build_task_config
 from libkernelbot.report import RunProgressReporter
+from libkernelbot.task import build_task_config, make_task_definition
 
 
 class MockProgressReporter(RunProgressReporter):
@@ -42,7 +43,7 @@ def modal_deployment(project_root: Path):
             cwd=project_root / "src" / "runners",
             capture_output=True,
             text=True,
-            timeout=600  # 10 minute timeout in case image needs to be built (can be very slow)
+            timeout=600,  # 10 minute timeout in case image needs to be built (can be very slow)
         )
 
         if result.returncode != 0:
@@ -53,10 +54,13 @@ def modal_deployment(project_root: Path):
                     cwd=project_root / "src" / "runners",
                     capture_output=True,
                     text=True,
-                    timeout=30
+                    timeout=30,
                 )
                 if result.returncode != 0:
-                    pytest.fail(f"Modal environment `{modal_env}` not available, and failed to create: {result.stderr}")
+                    pytest.fail(
+                        f"Modal environment `{modal_env}` not available, "
+                        f"and failed to create: {result.stderr}"
+                    )
                 else:
                     # try again, now that the env exists.
                     result = subprocess.run(
@@ -64,12 +68,18 @@ def modal_deployment(project_root: Path):
                         cwd=project_root / "src" / "runners",
                         capture_output=True,
                         text=True,
-                        timeout=600
+                        timeout=600,
                     )
                     if result.returncode != 0:
-                        pytest.fail(f"Modal deploy failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}")
+                        pytest.fail(
+                            f"Modal deploy failed:\n"
+                            f"STDOUT:\n{result.stdout}\n"
+                            f"STDERR:\n{result.stderr}"
+                        )
             else:
-                pytest.fail(f"Modal deploy failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}")
+                pytest.fail(
+                    f"Modal deploy failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
+                )
 
         print(f"✅ Modal deployment to '{modal_env}' completed successfully")
         print(f"Deploy output: {result.stdout}")
@@ -87,15 +97,21 @@ def modal_deployment(project_root: Path):
             del os.environ["MODAL_ENVIRONMENT"]
 
     except subprocess.TimeoutExpired as e:
-        pytest.fail(f"Modal deploy timed out after 5 minutes:\nstdout: {e.stdout}, stderr:{e.stderr}")
+        pytest.fail(
+            f"Modal command timed out after {e.timeout}s:\nstdout: {e.stdout}, stderr:{e.stderr}"
+        )
     except Exception as e:
         pytest.fail(f"Modal deploy failed with exception: {e}")
 
 
 @pytest.mark.integration
 @pytest.mark.asyncio
-@pytest.mark.parametrize("gpu_type", [ModalGPU.T4, ModalGPU.L4, ModalGPU.A100, ModalGPU.H100, ModalGPU.B200])
-async def test_modal_launcher_python_script(modal_deployment, project_root: Path, gpu_type: ModalGPU):
+@pytest.mark.parametrize(
+    "gpu_type", [ModalGPU.T4, ModalGPU.L4, ModalGPU.A100, ModalGPU.H100, ModalGPU.B200]
+)
+async def test_modal_launcher_python_script(
+    modal_deployment, project_root: Path, gpu_type: ModalGPU
+):
     """
     Test ModalLauncher with a real Python script using examples/identity_py.
     """
@@ -117,7 +133,7 @@ async def test_modal_launcher_python_script(modal_deployment, project_root: Path
         task=task_definition.task,
         submission_content=submission_content,
         arch=GPU_TO_SM[gpu_type.name],
-        mode=SubmissionMode.TEST
+        mode=SubmissionMode.TEST,
     )
 
     result = await launcher.run_submission(config, gpu_type, reporter)
@@ -133,8 +149,8 @@ async def test_modal_launcher_python_script(modal_deployment, project_root: Path
     assert result.system.torch.startswith("2.7")  # update when the image changes
 
     # Test run structure
-    assert 'test' in result.runs
-    test_run = result.runs['test']
+    assert "test" in result.runs
+    test_run = result.runs["test"]
 
     # For Python runs, compilation is None
     assert test_run.compilation is None
@@ -148,20 +164,20 @@ async def test_modal_launcher_python_script(modal_deployment, project_root: Path
     assert test_run.run.duration > 0
 
     # Test need to succeed
-    assert test_run.run.result['check'] == 'pass'
-    test_count = int(test_run.run.result['test-count'])
+    assert test_run.run.result["check"] == "pass"
+    test_count = int(test_run.run.result["test-count"])
     assert test_count == 5
     for i in range(test_count):
-        assert test_run.run.result[f'test.{i}.status'] == 'pass'
-        assert 'size:' in test_run.run.result[f'test.{i}.spec']
-        assert 'seed:' in test_run.run.result[f'test.{i}.spec']
+        assert test_run.run.result[f"test.{i}.status"] == "pass"
+        assert "size:" in test_run.run.result[f"test.{i}.spec"]
+        assert "seed:" in test_run.run.result[f"test.{i}.spec"]
 
     # sanity check for timings
     assert test_run.start < test_run.end
 
     # check messages
-    assert reporter.messages == ['⏳ Waiting for Modal run to finish...']
-    assert reporter.updates == ['✅ Waiting for modal run to finish... Done']
+    assert reporter.messages == ["⏳ Waiting for Modal run to finish..."]
+    assert reporter.updates == ["✅ Waiting for modal run to finish... Done"]
 
 
 @pytest.mark.integration
@@ -169,8 +185,8 @@ async def test_modal_launcher_python_script(modal_deployment, project_root: Path
 @pytest.mark.parametrize("script", ["cheat-fd.py", "cheat-input.py", "cheat-rng.py"])
 async def test_modal_launcher_failing_script(modal_deployment, project_root: Path, script: str):
     """
-        Test ModalLauncher with a real Python scripts that are designed to be wrong.
-        """
+    Test ModalLauncher with real Python scripts that are designed to be wrong.
+    """
     launcher = ModalLauncher(add_include_dirs=[])
     reporter = MockProgressReporter("progress")
     gpu_type = ModalGPU.T4
@@ -198,4 +214,4 @@ async def test_modal_launcher_failing_script(modal_deployment, project_root: Pat
     # Basic structure and success
     assert result.success, f"Expected successful run, got: {result.error}"
     assert result.error == ""
-    assert result.runs['test'].run.passed is False or result.runs['benchmark'].run.passed is False
+    assert result.runs["test"].run.passed is False or result.runs["benchmark"].run.passed is False

From c3e57bc2e8781b6ecad338fd695b75ccdf7237aa Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 26 Aug 2025 00:03:34 +0200
Subject: [PATCH 9/9] more interesting tests

---
 .../vectoradd_py/submission_cuda_inline.py    | 57 +------------------
 tests/test_modal.py                           | 19 ++++---
 2 files changed, 14 insertions(+), 62 deletions(-)

diff --git a/examples/vectoradd_py/submission_cuda_inline.py b/examples/vectoradd_py/submission_cuda_inline.py
index d505d2a4..51841871 100644
--- a/examples/vectoradd_py/submission_cuda_inline.py
+++ b/examples/vectoradd_py/submission_cuda_inline.py
@@ -54,7 +54,6 @@
 """
 
 
-
 add_module = load_inline(
     name='add_cuda',
     cpp_sources=add_cpp_source,
@@ -63,62 +62,12 @@
     verbose=True,
 )
 
+
 def add(A, B):
     if not A.is_cuda or not B.is_cuda:
         raise RuntimeError("Both tensors must be on GPU")
     return add_module.add_cuda(A, B)
 
-def custom_kernel(data: input_t) -> output_t:
-    """
-    Custom implementation of vector addition using CUDA inline function.
-    Args:
-        inputs: List of pairs of tensors [A, B] to be added.
-    Returns:
-        List of tensors containing element-wise sums.
-    """
-    A, B = data
 
-    assert A.is_cuda and B.is_cuda, "Input tensors must be on GPU"
-    assert A.shape == B.shape, "Input tensors must have the same shape"
-    assert A.dtype == torch.float16 and B.dtype == torch.float16, "Input tensors must be float16"
-    
-    M, N = A.shape
-    C = torch.empty_like(A)
-    
-    n_threads = 256
-    n_blocks = (M * N + n_threads - 1) // n_threads
-    
-    cuda_source = """
-    extern "C" __global__ void add_kernel(
-        const half* __restrict__ A,
-        const half* __restrict__ B,
-        half* __restrict__ C,
-        const int n_elements
-    ) {
-        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-        if (idx < n_elements) {
-            C[idx] = __hadd(A[idx], B[idx]);
-        }
-    }
-    """
-    
-    module = torch.utils.cpp_extension.load_inline(
-        name=f"add_kernel_{M}_{N}",
-        cpp_sources="",
-        cuda_sources=cuda_source,
-        functions=["add_kernel"],
-        with_cuda=True,
-        extra_cuda_cflags=["-arch=sm_70"],  # Adjust based on your GPU architecture
-    )
-    
-    module.add_kernel(
-        cuda_stream=torch.cuda.current_stream(),
-        args=[
-            A.reshape(-1), B.reshape(-1), C.reshape(-1),
-            M * N,
-        ],
-        blocks=n_blocks,
-        threads=n_threads,
-    )
-    
-    return C
+def custom_kernel(data: input_t) -> output_t:
+    return add(*data)
diff --git a/tests/test_modal.py b/tests/test_modal.py
index 72873cf4..9fa1725e 100644
--- a/tests/test_modal.py
+++ b/tests/test_modal.py
@@ -1,6 +1,7 @@
 import os
 import subprocess
 from pathlib import Path
+from typing import Tuple
 
 import pytest
 
@@ -109,8 +110,15 @@ def modal_deployment(project_root: Path):
 @pytest.mark.parametrize(
     "gpu_type", [ModalGPU.T4, ModalGPU.L4, ModalGPU.A100, ModalGPU.H100, ModalGPU.B200]
 )
+@pytest.mark.parametrize(
+    "task",
+    [
+        ("vectoradd_py", "submission_cuda_inline.py"),
+        ("vectoradd_py", "submission_triton.py"),
+    ],
+)
 async def test_modal_launcher_python_script(
-    modal_deployment, project_root: Path, gpu_type: ModalGPU
+    modal_deployment, project_root: Path, gpu_type: ModalGPU, task: Tuple[str, str]
 ):
     """
     Test ModalLauncher with a real Python script using examples/identity_py.
@@ -119,7 +127,7 @@ async def test_modal_launcher_python_script(
     reporter = MockProgressReporter("progress")
 
     # Load the real identity_py task
-    task_path = project_root / "examples" / "identity_py"
+    task_path = project_root / "examples" / task[0]
     if not task_path.exists():
         pytest.skip("examples/identity_py not found - skipping Modal integration test")
 
@@ -127,7 +135,7 @@ async def test_modal_launcher_python_script(
     task_definition = make_task_definition(task_path)
 
     # Use the actual working submission from the examples
-    submission_content = (task_path / "submission.py").read_text()
+    submission_content = (task_path / task[1]).read_text()
 
     config = build_task_config(
         task=task_definition.task,
@@ -152,15 +160,10 @@ async def test_modal_launcher_python_script(
     assert "test" in result.runs
     test_run = result.runs["test"]
 
-    # For Python runs, compilation is None
-    assert test_run.compilation is None
-
     # Run needs to succeed
     assert test_run.run.success is True
     assert test_run.run.passed is True
     assert test_run.run.exit_code == 0
-    assert test_run.run.stdout == ""
-    assert test_run.run.stderr == ""
     assert test_run.run.duration > 0
 
     # Test need to succeed