diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 71fd888f..5b71a3f7 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -15,7 +15,7 @@ jobs: - uses: actions/checkout@v4 - uses: astral-sh/setup-uv@v4 - run: uv sync --extra dev - - run: uv run pytest --cov-report term --cov-report html --cov-report xml --cov=src/libkernelbot unit-tests -v + - run: uv run pytest --cov-report term --cov-report html --cov-report xml --cov=src/libkernelbot -m "not integration" tests -v - uses: actions/upload-artifact@v4 with: name: coverage @@ -25,13 +25,22 @@ jobs: uses: py-cov-action/python-coverage-comment-action@v3 with: GITHUB_TOKEN: ${{ github.token }} - - name: Store Pull Request comment to be posted uses: actions/upload-artifact@v4 if: steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true' with: - # If you use a different name, update COMMENT_ARTIFACT_NAME accordingly name: python-coverage-comment-action - # If you use a different name, update COMMENT_FILENAME accordingly path: python-coverage-comment-action.txt + integration-tests: + runs-on: ubuntu-latest + timeout-minutes: 30 + env: + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v4 + - run: uv sync --extra dev + - run: uv run modal token set --token-id ${MODAL_TOKEN_ID} --token-secret ${MODAL_TOKEN_SECRET} + - run: uv run pytest -m integration tests -v diff --git a/examples/vectoradd_py/submission_cuda_inline.py b/examples/vectoradd_py/submission_cuda_inline.py index d505d2a4..51841871 100644 --- a/examples/vectoradd_py/submission_cuda_inline.py +++ b/examples/vectoradd_py/submission_cuda_inline.py @@ -54,7 +54,6 @@ """ - add_module = load_inline( name='add_cuda', cpp_sources=add_cpp_source, @@ -63,62 +62,12 @@ verbose=True, ) + def add(A, B): if not A.is_cuda or not B.is_cuda: raise RuntimeError("Both tensors must be on GPU") return 
add_module.add_cuda(A, B) -def custom_kernel(data: input_t) -> output_t: - """ - Custom implementation of vector addition using CUDA inline function. - Args: - inputs: List of pairs of tensors [A, B] to be added. - Returns: - List of tensors containing element-wise sums. - """ - A, B = data - assert A.is_cuda and B.is_cuda, "Input tensors must be on GPU" - assert A.shape == B.shape, "Input tensors must have the same shape" - assert A.dtype == torch.float16 and B.dtype == torch.float16, "Input tensors must be float16" - - M, N = A.shape - C = torch.empty_like(A) - - n_threads = 256 - n_blocks = (M * N + n_threads - 1) // n_threads - - cuda_source = """ - extern "C" __global__ void add_kernel( - const half* __restrict__ A, - const half* __restrict__ B, - half* __restrict__ C, - const int n_elements - ) { - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n_elements) { - C[idx] = __hadd(A[idx], B[idx]); - } - } - """ - - module = torch.utils.cpp_extension.load_inline( - name=f"add_kernel_{M}_{N}", - cpp_sources="", - cuda_sources=cuda_source, - functions=["add_kernel"], - with_cuda=True, - extra_cuda_cflags=["-arch=sm_70"], # Adjust based on your GPU architecture - ) - - module.add_kernel( - cuda_stream=torch.cuda.current_stream(), - args=[ - A.reshape(-1), B.reshape(-1), C.reshape(-1), - M * N, - ], - blocks=n_blocks, - threads=n_threads, - ) - - return C +def custom_kernel(data: input_t) -> output_t: + return add(*data) diff --git a/pyproject.toml b/pyproject.toml index bb72bd06..0c66e0bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,9 @@ exclude_lines = [ [tool.pytest.ini_options] testpaths = ["scripts", "tests"] python_files = ["test_*.py", "*_test.py", "ci_test_*.py"] +markers = [ + "integration: integration tests that need to interact externally, e.g., with modal/github actions/etc" +] [tool.ruff] line-length = 120 diff --git a/src/libkernelbot/launchers/modal.py b/src/libkernelbot/launchers/modal.py index 26adfe33..6c2308ec 
100644 --- a/src/libkernelbot/launchers/modal.py +++ b/src/libkernelbot/launchers/modal.py @@ -32,7 +32,7 @@ async def run_submission( result = await loop.run_in_executor( None, - lambda: modal.Function.lookup("discord-bot-runner", func_name).remote(config=config), + lambda: modal.Function.from_name("discord-bot-runner", func_name).remote(config=config), ) await status.update("✅ Waiting for modal run to finish... Done") diff --git a/src/runners/modal_runner.py b/src/runners/modal_runner.py index eb74d173..bb8d952a 100644 --- a/src/runners/modal_runner.py +++ b/src/runners/modal_runner.py @@ -33,10 +33,10 @@ "PyYAML", ) .pip_install( - "torch~=2.7", + "torch>=2.7.0,<2.8.0", "torchvision~=0.22", - "torchaudio~=2.7", - index_url="https://download.pytorch.org/whl/cu128" + "torchaudio>=2.7.0,<2.8.0", + index_url="https://download.pytorch.org/whl/cu128", ) # other frameworks .pip_install( diff --git a/unit-tests/conftest.py b/tests/conftest.py similarity index 91% rename from unit-tests/conftest.py rename to tests/conftest.py index 170b0196..0408b82e 100644 --- a/unit-tests/conftest.py +++ b/tests/conftest.py @@ -8,14 +8,10 @@ @pytest.fixture(scope="module") -def docker_compose(): - tgt_path = Path.cwd() - if tgt_path.name == "unit-tests": - tgt_path = tgt_path.parent - +def docker_compose(project_root: Path): """Start a test database and run migrations""" subprocess.check_call( - ["docker", "compose", "-f", "docker-compose.test.yml", "up", "-d"], cwd=tgt_path + ["docker", "compose", "-f", "docker-compose.test.yml", "up", "-d"], cwd=project_root ) try: @@ -25,7 +21,7 @@ def docker_compose(): ["docker", "compose", "-f", "docker-compose.test.yml", "ps", "-q", "migrate-test"], capture_output=True, text=True, - cwd=tgt_path, + cwd=project_root, ) if not result.stdout.strip(): # Container no longer exists @@ -37,7 +33,7 @@ def docker_compose(): ["docker", "compose", "-f", "docker-compose.test.yml", "logs", "migrate-test"], capture_output=True, text=True, - cwd=tgt_path, + 
cwd=project_root, ) if "error" in logs.stdout.lower(): @@ -46,7 +42,7 @@ def docker_compose(): yield finally: subprocess.run( - ["docker", "compose", "-f", "docker-compose.test.yml", "down", "-v"], cwd=tgt_path + ["docker", "compose", "-f", "docker-compose.test.yml", "down", "-v"], cwd=project_root ) @@ -122,3 +118,8 @@ def task_directory(tmp_path): # Create task.yml Path.write_text(tmp_path / "task.yml", TASK_YAML) return tmp_path + + +@pytest.fixture(scope="session") +def project_root(): + return Path(__file__).parent.parent diff --git a/unit-tests/test_backend.py b/tests/test_backend.py similarity index 100% rename from unit-tests/test_backend.py rename to tests/test_backend.py diff --git a/unit-tests/test_leaderboard_db.py b/tests/test_leaderboard_db.py similarity index 100% rename from unit-tests/test_leaderboard_db.py rename to tests/test_leaderboard_db.py diff --git a/tests/test_modal.py b/tests/test_modal.py new file mode 100644 index 00000000..9fa1725e --- /dev/null +++ b/tests/test_modal.py @@ -0,0 +1,220 @@ +import os +import subprocess +from pathlib import Path +from typing import Tuple + +import pytest + +from libkernelbot.consts import GPU_TO_SM, ModalGPU, SubmissionMode +from libkernelbot.launchers import ModalLauncher +from libkernelbot.report import RunProgressReporter +from libkernelbot.task import build_task_config, make_task_definition + + +class MockProgressReporter(RunProgressReporter): + """Test progress reporter that captures messages.""" + + def __init__(self, title: str = "Test Modal Run"): + super().__init__(title) + self.messages = [] + self.updates = [] + + async def push(self, message: str): + self.messages.append(message) + + async def update(self, message: str): + self.updates.append(message) + + +@pytest.fixture(scope="session") +def modal_deployment(project_root: Path): + """ + Fixture that ensures Modal is deployed before running tests. + Runs once per test session and deploys to the specified Modal environment. 
# Determine Modal environment (default to 'pytest' if not specified)
+ result = subprocess.run( + ["modal", "deploy", "--env", modal_env, "modal_runner_archs.py"], + cwd=project_root / "src" / "runners", + capture_output=True, + text=True, + timeout=600, + ) + if result.returncode != 0: + pytest.fail( + f"Modal deploy failed:\n" + f"STDOUT:\n{result.stdout}\n" + f"STDERR:\n{result.stderr}" + ) + else: + pytest.fail( + f"Modal deploy failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}" + ) + + print(f"✅ Modal deployment to '{modal_env}' completed successfully") + print(f"Deploy output: {result.stdout}") + + # Set the Modal environment for the session + original_env = os.environ.get("MODAL_ENVIRONMENT") + os.environ["MODAL_ENVIRONMENT"] = modal_env + + yield modal_env + + # Restore original environment + if original_env is not None: + os.environ["MODAL_ENVIRONMENT"] = original_env + elif "MODAL_ENVIRONMENT" in os.environ: + del os.environ["MODAL_ENVIRONMENT"] + + except subprocess.TimeoutExpired as e: + pytest.fail( + f"Modal deploy timed out after 5 minutes:\nstdout: {e.stdout}, stderr:{e.stderr}" + ) + except Exception as e: + pytest.fail(f"Modal deploy failed with exception: {e}") + + +@pytest.mark.integration +@pytest.mark.asyncio +@pytest.mark.parametrize( + "gpu_type", [ModalGPU.T4, ModalGPU.L4, ModalGPU.A100, ModalGPU.H100, ModalGPU.B200] +) +@pytest.mark.parametrize( + "task", + [ + ("vectoradd_py", "submission_cuda_inline.py"), + ("vectoradd_py", "submission_triton.py"), + ], +) +async def test_modal_launcher_python_script( + modal_deployment, project_root: Path, gpu_type: ModalGPU, task: Tuple[str, str] +): + """ + Test ModalLauncher with a real Python script using examples/identity_py. 
+ """ + launcher = ModalLauncher(add_include_dirs=[]) + reporter = MockProgressReporter("progress") + + # Load the real identity_py task + task_path = project_root / "examples" / task[0] + if not task_path.exists(): + pytest.skip("examples/identity_py not found - skipping Modal integration test") + + # Load the task definition + task_definition = make_task_definition(task_path) + + # Use the actual working submission from the examples + submission_content = (task_path / task[1]).read_text() + + config = build_task_config( + task=task_definition.task, + submission_content=submission_content, + arch=GPU_TO_SM[gpu_type.name], + mode=SubmissionMode.TEST, + ) + + result = await launcher.run_submission(config, gpu_type, reporter) + + # Basic structure and success + assert result.success, f"Expected successful run, got: {result.error}" + assert result.error == "" + assert isinstance(result.runs, dict) + + # System info - test actual expected values + assert gpu_type.name in result.system.gpu + assert "Linux" in result.system.platform + assert result.system.torch.startswith("2.7") # update when the image changes + + # Test run structure + assert "test" in result.runs + test_run = result.runs["test"] + + # Run needs to succeed + assert test_run.run.success is True + assert test_run.run.passed is True + assert test_run.run.exit_code == 0 + assert test_run.run.duration > 0 + + # Test need to succeed + assert test_run.run.result["check"] == "pass" + test_count = int(test_run.run.result["test-count"]) + assert test_count == 5 + for i in range(test_count): + assert test_run.run.result[f"test.{i}.status"] == "pass" + assert "size:" in test_run.run.result[f"test.{i}.spec"] + assert "seed:" in test_run.run.result[f"test.{i}.spec"] + + # sanity check for timings + assert test_run.start < test_run.end + + # check messages + assert reporter.messages == ["⏳ Waiting for Modal run to finish..."] + assert reporter.updates == ["✅ Waiting for modal run to finish... 
Done"] + + +@pytest.mark.integration +@pytest.mark.asyncio +@pytest.mark.parametrize("script", ["cheat-fd.py", "cheat-input.py", "cheat-rng.py"]) +async def test_modal_launcher_failing_script(modal_deployment, project_root: Path, script: str): + """ + Test ModalLauncher with a real Python scripts that are designed to be wrong. + """ + launcher = ModalLauncher(add_include_dirs=[]) + reporter = MockProgressReporter("progress") + gpu_type = ModalGPU.T4 + + # Load the real identity_py task + task_path = project_root / "examples" / "identity_py" + if not task_path.exists(): + pytest.skip("examples/identity_py not found - skipping Modal integration test") + + # Load the task definition + task_definition = make_task_definition(task_path) + + # Use the actual working submission from the examples + submission_content = (task_path / script).read_text() + task_definition.task.seed = 653212 + config = build_task_config( + task=task_definition.task, + submission_content=submission_content, + arch=GPU_TO_SM[gpu_type.name], + mode=SubmissionMode.LEADERBOARD, + ) + + result = await launcher.run_submission(config, gpu_type, reporter) + + # Basic structure and success + assert result.success, f"Expected successful run, got: {result.error}" + assert result.error == "" + assert result.runs["test"].run.passed is False or result.runs["benchmark"].run.passed is False diff --git a/unit-tests/test_report.py b/tests/test_report.py similarity index 100% rename from unit-tests/test_report.py rename to tests/test_report.py diff --git a/unit-tests/test_submission.py b/tests/test_submission.py similarity index 100% rename from unit-tests/test_submission.py rename to tests/test_submission.py diff --git a/unit-tests/test_task.py b/tests/test_task.py similarity index 100% rename from unit-tests/test_task.py rename to tests/test_task.py diff --git a/unit-tests/test_utils.py b/tests/test_utils.py similarity index 100% rename from unit-tests/test_utils.py rename to tests/test_utils.py