diff --git a/.github/workflows/amd_workflow.yml b/.github/workflows/amd_workflow.yml
index 8a0014ef..39dc66c5 100644
--- a/.github/workflows/amd_workflow.yml
+++ b/.github/workflows/amd_workflow.yml
@@ -35,13 +35,13 @@ jobs:
         run: |
           # Extract the payload content without printing it
          PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH)
-
+
          # Apply mask to the extracted content
          echo "::add-mask::$PAYLOAD"
-
+
          # Now write to file (won't be logged since it's masked)
          echo "$PAYLOAD" > payload.json
-
+
      - name: Set venv directory based on runner
        run: |
          if [[ "${{ github.event.inputs.runner }}" == "amdgpu-mi250-x86-64" ]]; then
@@ -77,5 +77,12 @@ jobs:
        if: always()
        with:
          name: run-result
-          path: |
-            result.json
+          path: result.json
+
+      - name: Upload profiling artifacts
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: profile-data
+          path: profile_data/*
+          retention-days: 1
diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index e16cf4d5..b50ec044 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -42,10 +42,10 @@ jobs:
          # Extract the payload content without printing it
          apt-get update && apt-get install -y jq
          PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH)
-
+
          # Apply mask to the extracted content
          echo "::add-mask::$PAYLOAD"
-
+
          # Now write to file (won't be logged since it's masked)
          echo "$PAYLOAD" > payload.json

@@ -73,15 +73,20 @@ jobs:
        shell: bash
        run: |
          python src/runners/github-runner.py
-          cat result.json # Debug: show output

      - name: Upload training artifacts
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: run-result
-          path: |
-            result.json
+          path: result.json

+      - name: Upload profiling artifacts
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: profile-data
+          path: profile_data/*
+          retention-days: 1
    env:
      CUDA_VISIBLE_DEVICES: 0
diff --git a/scripts/ci_test_cuda.py b/scripts/ci_test_cuda.py
index ae1a6cd7..c3fa893c 100644
--- a/scripts/ci_test_cuda.py
+++ b/scripts/ci_test_cuda.py
@@ -4,7 +4,7 @@
 import pytest
 
 from libkernelbot.consts import ExitCode, SubmissionMode
-from libkernelbot.run_eval import compile_cuda_script, run_cuda_script
+from libkernelbot.run_eval import compile_cuda_script, make_system_info, run_cuda_script
 
 ref = Path("examples/identity_cuda/reference.cuh").read_text()
 task_h = Path("examples/identity_cuda/task.h").read_text()
@@ -19,6 +19,7 @@ def run_cuda_helper(sources: dict, headers: dict = None, arch=None, **kwargs):
         headers = header_files
 
     eval_result = run_cuda_script(
+        make_system_info(),
         sources,
         headers,
         arch=arch,
@@ -194,6 +195,7 @@ def test_include_dirs(tmp_path: Path):
 
     # can also use generic flags argument
     result = run_cuda_script(
+        make_system_info(),
         {"eval.cu": eval_cu, "submission.cu": sub},
         header_files,
         flags=["-I.", f"-I{tmp_path}"],
diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py
index 41ac92bd..7cc4fedd 100644
--- a/scripts/ci_test_python.py
+++ b/scripts/ci_test_python.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 
 from libkernelbot.consts import ExitCode, SubmissionMode
-from libkernelbot.run_eval import run_pytorch_script
+from libkernelbot.run_eval import make_system_info, run_pytorch_script
 
 ref = Path("examples/identity_py/reference.py").read_text()
 task = Path("examples/identity_py/task.py").read_text()
@@ -12,6 +12,7 @@
 
 def run_pytorch_helper(sources: dict, tests=None, **kwargs):
     result = run_pytorch_script(
+        make_system_info(),
         sources,
         "eval.py",
         mode=SubmissionMode.TEST.value,
diff --git a/src/kernelbot/discord_reporter.py b/src/kernelbot/discord_reporter.py
index f49e5225..3b6fd8c3 100644
--- a/src/kernelbot/discord_reporter.py
+++ b/src/kernelbot/discord_reporter.py
@@ -2,6 +2,7 @@
 from discord_utils import _send_split_log
 
 from libkernelbot.report import (
+    Link,
     Log,
     MultiProgressReporter,
     RunProgressReporter,
@@ -69,6 +70,11 @@ async def display_report(self, title: str, report: RunResultReport):
                 message += part.text
             elif isinstance(part, Log):
                 message = await _send_split_log(thread, message, part.header, part.content)
+            elif isinstance(part, Link):
+                if len(message) > 0:
+                    await thread.send(message)
+                    message = ""
+                await thread.send(f"{part.title}: [{part.text}]({part.url})")
 
         if len(message) > 0:
             await thread.send(message)
diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py
index be968429..c748e0fe 100644
--- a/src/libkernelbot/launchers/github.py
+++ b/src/libkernelbot/launchers/github.py
@@ -1,10 +1,11 @@
 import asyncio
 import base64
+import dataclasses
 import datetime
+import io
 import json
 import math
 import pprint
-import tempfile
 import uuid
 import zipfile
 import zlib
@@ -23,7 +24,14 @@
     SubmissionMode,
 )
 from libkernelbot.report import RunProgressReporter
-from libkernelbot.run_eval import CompileResult, EvalResult, FullResult, RunResult, SystemInfo
+from libkernelbot.run_eval import (
+    CompileResult,
+    EvalResult,
+    FullResult,
+    ProfileResult,
+    RunResult,
+    SystemInfo,
+)
 from libkernelbot.utils import setup_logging
 
 from .launcher import Launcher
@@ -49,7 +57,7 @@ def __init__(self, repo: str, token: str, branch: str):
         self.token = token
         self.branch = branch
 
-    async def run_submission(
+    async def run_submission(  # noqa: C901
         self, config: dict, gpu_type: GPU, status: RunProgressReporter
     ) -> FullResult:
         gpu_vendor = None
@@ -106,15 +114,17 @@ async def run_submission(
 
         await status.push("Downloading artifacts...")
         logger.info("Downloading artifacts...")
-        artifacts = await run.download_artifacts()
-        if "run-result" not in artifacts:
-            logger.error("Could not find `run-result` among artifacts: %s", artifacts.keys())
+        index = run.get_artifact_index()
+
+        if "run-result" not in index:
+            logger.error("Could not find `run-result` among artifacts: %s", index.keys())
             await status.push("Downloading artifacts... failed")
             return FullResult(
                 success=False, error="Could not download artifacts", runs={}, system=SystemInfo()
             )
 
-        logs = artifacts["run-result"]["result.json"].decode("utf-8")
+        artifact = await run.download_artifact(index["run-result"])
+        logs = artifact["result.json"].decode("utf-8")
 
         await status.update("Downloading artifacts... done")
         logger.info("Downloading artifacts... done")
@@ -123,17 +133,24 @@ async def run_submission(
         runs = {}
         # convert json back to EvalResult structures, which requires
         # special handling for datetime and our dataclasses.
+
         for k, v in data["runs"].items():
-            if "compilation" in v and v["compilation"] is not None:
-                comp = CompileResult(**v["compilation"])
-            else:
-                comp = None
-            run = RunResult(**v["run"])
+            comp_res = None if v.get("compilation") is None else CompileResult(**v["compilation"])
+            run_res = None if v.get("run") is None else RunResult(**v["run"])
+            profile_res = None if v.get("profile") is None else ProfileResult(**v["profile"])
+
+            # Update profile artifact to the actual download URL.
+            # For the GitHub launcher the profile_artifact currently just contains
+            # the name of the artifact.
+            if profile_res is not None:
+                profile_res.download_url = index["profile-data"].public_download_url
+
             res = EvalResult(
                 start=datetime.datetime.fromisoformat(v["start"]),
                 end=datetime.datetime.fromisoformat(v["end"]),
-                compilation=comp,
-                run=run,
+                compilation=comp_res,
+                run=run_res,
+                profile=profile_res,
             )
             runs[k] = res
@@ -147,6 +164,13 @@ async def wait_callback(self, run: "GitHubRun", status: RunProgressReporter):
     )
 
 
+@dataclasses.dataclass
+class GitHubArtifact:
+    name: str
+    archive_download_url: str
+    public_download_url: str
+
+
 class GitHubRun:
     def __init__(self, repo: str, token: str, branch: str, workflow_file: str):
         gh = Github(token)
@@ -323,34 +347,43 @@ async def wait_for_completion(
                 logger.error(f"Error waiting for GitHub run {self.run_id}: {e}", exc_info=e)
                 raise  # Re-raise other exceptions
 
-    async def download_artifacts(self) -> dict:
-        logger.info("Attempting to download artifacts for run %s", self.run_id)
+
+    def get_artifact_index(self) -> dict[str, GitHubArtifact]:
+        logger.info("Creating artifact index for run %s", self.run_id)
         artifacts = self.run.get_artifacts()
 
         extracted = {}
         for artifact in artifacts:
-            url = artifact.archive_download_url
-            headers = {"Authorization": f"token {self.token}"}
-            response = requests.get(url, headers=headers)
-
-            if response.status_code == 200:
-                with tempfile.NamedTemporaryFile("w+b") as temp:
-                    temp.write(response.content)
-                    temp.flush()
-
-                    with zipfile.ZipFile(temp.name) as z:
-                        artifact_dict = {}
-                        for file in z.namelist():
-                            with z.open(file) as f:
-                                artifact_dict[file] = f.read()
-
-                extracted[artifact.name] = artifact_dict
-            else:
-                raise RuntimeError(
-                    f"Failed to download artifact {artifact.name}. "
-                    f"Status code: {response.status_code}"
-                )
+            extracted[artifact.name] = GitHubArtifact(
+                name=artifact.name,
+                archive_download_url=artifact.archive_download_url,
+                # Non-machine users cannot download from the archive_download_url, and
+                # the GitHub API does not give us access to the public download URL.
+                public_download_url=f"{self.repo.html_url}/actions/runs/{self.run_id}/artifacts/{artifact.id}",
+            )
 
-        logger.info("Download artifacts for run %s: %s", self.run_id, list(extracted.keys()))
         return extracted
+
+    async def download_artifact(self, artifact: GitHubArtifact) -> dict:
+        logger.info("Attempting to download artifact '%s' for run %s", artifact.name, self.run_id)
+
+        url = artifact.archive_download_url
+        headers = {"Authorization": f"token {self.token}"}
+        response = requests.get(url, headers=headers)
+
+        if response.status_code == 200:
+            artifact_dict = {}
+            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
+                for file in z.namelist():
+                    with z.open(file) as f:
+                        artifact_dict[file] = f.read()
+
+            logger.info("Downloaded artifact '%s' for run %s", artifact.name, self.run_id)
+            return artifact_dict
+        else:
+            raise RuntimeError(
+                f"Failed to download artifact {artifact.name}. "
+                f"Status code: {response.status_code}"
+            )
diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py
index ec52e7bd..25bb27cb 100644
--- a/src/libkernelbot/report.py
+++ b/src/libkernelbot/report.py
@@ -32,9 +32,20 @@ class Log:
     content: str
 
 
+@dataclasses.dataclass
+class Link:
+    """
+    Link represents a hyperlink in a report, pointing to result data
+    that can be downloaded by clicking it.
+ """ + title: str + text: str + url: str + + class RunResultReport: def __init__(self, data=None): - self.data: List[Text | Log] = data or [] + self.data: List[Text | Log | Link] = data or [] def add_text(self, section: str): self.data.append(Text(section)) @@ -42,6 +53,9 @@ def add_text(self, section: str): def add_log(self, header: str, log: str): self.data.append(Log(header, log)) + def add_link(self, title: str, text: str, url: str): + self.data.append(Link(title, text, url)) + def __repr__(self): return f"RunResultReport(data={self.data})" @@ -267,6 +281,7 @@ def generate_system_info(system: SystemInfo): Running on: * GPU: `{system.gpu}` * CPU: `{system.cpu}` +* Runtime: `{system.runtime}` * Platform: `{system.platform}` * Torch: `{system.torch}` """ @@ -322,6 +337,13 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 make_profile_log(prof_run.run), ) + if prof_run.profile is not None and prof_run.profile.download_url is not None: + report.add_link( + f"{prof_run.profile.profiler} profiling output", + "Download from GitHub", + prof_run.profile.download_url, + ) + if "leaderboard" in runs: bench_run = runs["leaderboard"] if _handle_crash_report(report, bench_run): diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 73e7e374..e8722ba7 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -1,6 +1,7 @@ import dataclasses import datetime import functools +import json import os import shlex import subprocess @@ -13,6 +14,16 @@ from libkernelbot.consts import CUDA_FLAGS, ExitCode, Timeout +@dataclasses.dataclass +class ProfileResult: + # fmt: off + profiler: str # The profiler used to gather this data + # Public download URL of all files created by the profiler + # This may also be configured later + download_url: Optional[str] + #fmt: on + + @dataclasses.dataclass class CompileResult: # fmt: off @@ -46,6 +57,7 @@ class SystemInfo: gpu: str = '' # Model name of the GPU device_count: int = 1 # Number of GPUs cpu: str = '' # Model name of the CPU + runtime: str = '' # Whether CUDA or ROCm platform: str = '' # Platform string of the machine torch: str = '' # Torch version # fmt: on @@ -58,6 +70,7 @@ class EvalResult: end: datetime.datetime # and when did it finish compilation: CompileResult | None # results of compilation run: RunResult | None # result of actually running the executable/script + profile: ProfileResult | None # result of profiling the executable # fmt: on @@ -285,6 +298,7 @@ def run_program( def run_single_evaluation( + system: SystemInfo, call: list[str], mode: str, *, @@ -296,34 +310,32 @@ def run_single_evaluation( ranked_timeout: int = Timeout.RANKED, ranking_by: str = "last", seed: Optional[int] = None, -) -> RunResult: +) -> tuple[RunResult, Optional[ProfileResult]]: """ A single runner run, either in the context of test files, or in the context of benchmark files. 
""" - if mode == "test": - with tempfile.NamedTemporaryFile("w") as tests_file: - tests_file.write(tests) - tests_file.flush() - return run_program( - call + [mode, tests_file.name], seed=seed, timeout=test_timeout, multi_gpu=multi_gpu - ) - elif mode in ["benchmark", "profile", "leaderboard"]: - timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout - with tempfile.NamedTemporaryFile("w") as bench_file: + with tempfile.NamedTemporaryFile("w") as cases: + if mode == "test": + timeout = test_timeout + cases.write(tests) + elif mode in ["benchmark", "profile", "leaderboard"]: + timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout if ranking_by == "last": - bench_file.write(benchmarks.splitlines(keepends=True)[-1]) + cases.write(benchmarks.splitlines(keepends=True)[-1]) else: - bench_file.write(benchmarks) - bench_file.flush() - return run_program( - call + [mode, bench_file.name], seed=seed, timeout=timeout, multi_gpu=multi_gpu - ) - else: - raise ValueError(f"Invalid mode {mode}") + cases.write(benchmarks) + else: + raise ValueError(f"Invalid mode {mode}") + + cases.flush() + call += [mode, cases.name] -def make_system_info() -> SystemInfo: + return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None + + +def make_system_info() -> SystemInfo: # noqa: C901 info = SystemInfo() try: import torch @@ -334,19 +346,29 @@ def make_system_info() -> SystemInfo: if torch.cuda.is_available(): info.gpu = torch.cuda.get_device_name() info.device_count = torch.cuda.device_count() + if torch.version.hip is not None: + info.runtime = "ROCm" + elif torch.version.cuda is not None: + info.runtime = "CUDA" except ImportError: # get GPU info manually try: info.gpu = subprocess.check_output( ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], encoding="utf-8" ) + info.device_count = info.gpu.count('\n') + info.runtime = "CUDA" except subprocess.CalledProcessError: # try again for HIP - # TODO suggested by Claude, untested try: - info.gpu = subprocess.check_output( - ["rocm-smi", "--showproductname"], encoding="utf-8" - ) + rocm_info = json.loads(subprocess.check_output( + ["rocm-smi", "--showproductname", "--json"], encoding="utf-8" + )) + if len(rocm_info) > 0: + info.gpu = next(rocm_info.__iter__())["Card Series"] + + info.device_count = len(rocm_info) + info.runtime = "ROCm" except subprocess.CalledProcessError: # OK, no GPU info available pass @@ -375,6 +397,7 @@ def make_system_info() -> SystemInfo: def run_cuda_script( # # noqa: C901 + system: SystemInfo, sources: dict[str, str], headers: Optional[dict[str, str]] = None, arch: Optional[int] = None, @@ -424,6 +447,7 @@ def run_cuda_script( # # noqa: C901 end=datetime.datetime.now(), compilation=compile_result, run=None, + profile=None, ) # cleaning up all source files _before_ we let the user code run, just in @@ -434,16 +458,18 @@ def run_cuda_script( # # noqa: C901 if os.path.exists(f): os.remove(f) - run_result = run_single_evaluation(["./eval.out"], **kwargs) + run_result, profile_result = run_single_evaluation(system, ["./eval.out"], **kwargs) return EvalResult( start=start, end=datetime.datetime.now(), compilation=compile_result, run=run_result, + profile=profile_result, ) def run_pytorch_script( # noqa: C901 + system: SystemInfo, sources: dict[str, str], main: str, **kwargs, @@ -495,13 +521,14 @@ def run_pytorch_script( # noqa: C901 exit_code=e.returncode, ) - run = run_single_evaluation(["python", main], **kwargs) + run, profile = run_single_evaluation(system, ["python", main], 
**kwargs) return EvalResult( start=start, end=datetime.datetime.now(), compilation=comp, run=run, + profile=profile, ) finally: for f in sources.keys(): @@ -558,7 +585,9 @@ def build_test_string(tests: list[dict]): def run_config(config: dict): + system = make_system_info() common_args = { + "system": system, "tests": build_test_string(config.get("tests", [])), "benchmarks": build_test_string(config.get("benchmarks", [])), "seed": config.get("seed", None), @@ -591,4 +620,4 @@ def run_config(config: dict): raise ValueError(f"Invalid language {config['lang']}") results = run_evaluation(runner, config["mode"]) - return FullResult(success=True, error="", runs=results, system=make_system_info()) + return FullResult(success=True, error="", runs=results, system=system) diff --git a/tests/test_backend.py b/tests/test_backend.py index 585674cd..94cc2795 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -77,6 +77,7 @@ async def test_handle_submission(bot: backend.KernelBackend, task_directory): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -210,6 +211,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): "device_count": 1, "gpu": "NVIDIA RTX 4090", "platform": "Linux-5.15.0", + "runtime": "CUDA", "torch": "2.0.1+cu118", }, } @@ -315,6 +317,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "device_count": 1, "gpu": "NVIDIA RTX 4090", "platform": "Linux-5.15.0", + "runtime": "CUDA", "torch": "2.0.1+cu118", }, }, @@ -357,6 +360,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "device_count": 1, "gpu": "NVIDIA RTX 4090", "platform": "Linux-5.15.0", + "runtime": "CUDA", "torch": "2.0.1+cu118", }, }, diff --git a/tests/test_report.py b/tests/test_report.py index e9efc41c..a1964e62 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -16,13 +16,24 @@ make_short_report, make_test_log, ) -from libkernelbot.run_eval import CompileResult, EvalResult, FullResult, RunResult, SystemInfo +from libkernelbot.run_eval import ( + CompileResult, + EvalResult, + FullResult, + ProfileResult, + RunResult, + SystemInfo, +) # define helpers and fixtures that create mock results def sample_system_info() -> SystemInfo: return SystemInfo( - gpu="NVIDIA RTX 4090", cpu="Intel i9-12900K", platform="Linux-5.15.0", torch="2.0.1+cu118" + gpu="NVIDIA RTX 4090", + cpu="Intel i9-12900K", + runtime="CUDA", + platform="Linux-5.15.0", + torch="2.0.1+cu118", ) @@ -82,6 +93,7 @@ def create_eval_result(mode="test") -> EvalResult: end=datetime.datetime.now(), compilation=sample_compile_result(), run=sample_run_result(mode), + profile=None, ) @@ -294,6 +306,7 @@ def test_make_short_report_full_success(): stderr="", result={}, ), + profile=None, ) result = make_short_report(runs, full=True) @@ -491,6 +504,7 @@ def test_generate_report_test_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -522,6 +536,7 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -556,6 +571,7 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 
4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -591,6 +607,7 @@ def test_generate_report_leaderboard_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -616,6 +633,7 @@ def test_generate_report_leaderboard_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -644,8 +662,12 @@ def test_generate_report_profile(sample_full_result: FullResult): "benchmark.0.spec": "Benchmark", "benchmark.0.report": base64.b64encode(b"Profile report", b"+*").decode("utf-8"), } + sample_full_result.runs["profile"].profile = ProfileResult( + profiler="NSight", + download_url="https://example.com", + ) report = generate_report(sample_full_result) - from libkernelbot.report import Log, Text + from libkernelbot.report import Link, Log, Text assert report.data == [ Text( @@ -653,6 +675,7 @@ def test_generate_report_profile(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -665,6 +688,7 @@ def test_generate_report_profile(sample_full_result: FullResult): "> Division by zero", ), Log(header="Profiling", content="Benchmark\n\n Profile report\n"), + Link("NSight profiling output", "Download from GitHub", "https://example.com"), ]