Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions .github/workflows/amd_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,13 @@ jobs:
run: |
# Extract the payload content without printing it
PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH)

# Apply mask to the extracted content
echo "::add-mask::$PAYLOAD"

# Now write to file (won't be logged since it's masked)
echo "$PAYLOAD" > payload.json

- name: Set venv directory based on runner
run: |
if [[ "${{ github.event.inputs.runner }}" == "amdgpu-mi250-x86-64" ]]; then
Expand Down Expand Up @@ -77,5 +77,12 @@ jobs:
if: always()
with:
name: run-result
path: |
result.json
path: result.json

- name: Upload profiling artifacts
uses: actions/upload-artifact@v4
if: always()
with:
name: profile-data
path: profile_data/*
retention-days: 1
15 changes: 10 additions & 5 deletions .github/workflows/nvidia_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ jobs:
# Extract the payload content without printing it
apt-get update && apt-get install -y jq
PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH)

# Apply mask to the extracted content
echo "::add-mask::$PAYLOAD"

# Now write to file (won't be logged since it's masked)
echo "$PAYLOAD" > payload.json

Expand Down Expand Up @@ -73,15 +73,20 @@ jobs:
shell: bash
run: |
python src/runners/github-runner.py
cat result.json # Debug: show output

- name: Upload training artifacts
uses: actions/upload-artifact@v4
if: always()
with:
name: run-result
path: |
result.json
path: result.json

- name: Upload profiling artifacts
uses: actions/upload-artifact@v4
if: always()
with:
name: profile-data
path: profile_data/*
retention-days: 1
env:
CUDA_VISIBLE_DEVICES: 0
4 changes: 3 additions & 1 deletion scripts/ci_test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

from libkernelbot.consts import ExitCode, SubmissionMode
from libkernelbot.run_eval import compile_cuda_script, run_cuda_script
from libkernelbot.run_eval import compile_cuda_script, make_system_info, run_cuda_script

ref = Path("examples/identity_cuda/reference.cuh").read_text()
task_h = Path("examples/identity_cuda/task.h").read_text()
Expand All @@ -19,6 +19,7 @@ def run_cuda_helper(sources: dict, headers: dict = None, arch=None, **kwargs):
headers = header_files

eval_result = run_cuda_script(
make_system_info(),
sources,
headers,
arch=arch,
Expand Down Expand Up @@ -194,6 +195,7 @@ def test_include_dirs(tmp_path: Path):

# can also use generic flags argument
result = run_cuda_script(
make_system_info(),
{"eval.cu": eval_cu, "submission.cu": sub},
header_files,
flags=["-I.", f"-I{tmp_path}"],
Expand Down
3 changes: 2 additions & 1 deletion scripts/ci_test_python.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pathlib import Path

from libkernelbot.consts import ExitCode, SubmissionMode
from libkernelbot.run_eval import run_pytorch_script
from libkernelbot.run_eval import make_system_info, run_pytorch_script

ref = Path("examples/identity_py/reference.py").read_text()
task = Path("examples/identity_py/task.py").read_text()
Expand All @@ -12,6 +12,7 @@

def run_pytorch_helper(sources: dict, tests=None, **kwargs):
result = run_pytorch_script(
make_system_info(),
sources,
"eval.py",
mode=SubmissionMode.TEST.value,
Expand Down
6 changes: 6 additions & 0 deletions src/kernelbot/discord_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from discord_utils import _send_split_log

from libkernelbot.report import (
Link,
Log,
MultiProgressReporter,
RunProgressReporter,
Expand Down Expand Up @@ -69,6 +70,11 @@ async def display_report(self, title: str, report: RunResultReport):
message += part.text
elif isinstance(part, Log):
message = await _send_split_log(thread, message, part.header, part.content)
elif isinstance(part, Link):
if len(message) > 0:
await thread.send(message)
message = ""
await thread.send(f"{part.title}: [{part.text}]({part.url})")

if len(message) > 0:
await thread.send(message)
Expand Down
109 changes: 71 additions & 38 deletions src/libkernelbot/launchers/github.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import asyncio
import base64
import dataclasses
import datetime
import io
import json
import math
import pprint
import tempfile
import uuid
import zipfile
import zlib
Expand All @@ -23,7 +24,14 @@
SubmissionMode,
)
from libkernelbot.report import RunProgressReporter
from libkernelbot.run_eval import CompileResult, EvalResult, FullResult, RunResult, SystemInfo
from libkernelbot.run_eval import (
CompileResult,
EvalResult,
FullResult,
ProfileResult,
RunResult,
SystemInfo,
)
from libkernelbot.utils import setup_logging

from .launcher import Launcher
Expand All @@ -49,7 +57,7 @@ def __init__(self, repo: str, token: str, branch: str):
self.token = token
self.branch = branch

async def run_submission(
async def run_submission( # noqa: C901
self, config: dict, gpu_type: GPU, status: RunProgressReporter
) -> FullResult:
gpu_vendor = None
Expand Down Expand Up @@ -106,15 +114,17 @@ async def run_submission(
await status.push("Downloading artifacts...")
logger.info("Downloading artifacts...")

artifacts = await run.download_artifacts()
if "run-result" not in artifacts:
logger.error("Could not find `run-result` among artifacts: %s", artifacts.keys())
index = run.get_artifact_index()

if "run-result" not in index:
logger.error("Could not find `run-result` among artifacts: %s", index.keys())
await status.push("Downloading artifacts... failed")
return FullResult(
success=False, error="Could not download artifacts", runs={}, system=SystemInfo()
)

logs = artifacts["run-result"]["result.json"].decode("utf-8")
artifact = await run.download_artifact(index["run-result"])
logs = artifact["result.json"].decode("utf-8")

await status.update("Downloading artifacts... done")
logger.info("Downloading artifacts... done")
Expand All @@ -123,17 +133,24 @@ async def run_submission(
runs = {}
# convert json back to EvalResult structures, which requires
# special handling for datetime and our dataclasses.

for k, v in data["runs"].items():
if "compilation" in v and v["compilation"] is not None:
comp = CompileResult(**v["compilation"])
else:
comp = None
run = RunResult(**v["run"])
comp_res = None if v.get("compilation") is None else CompileResult(**v["compilation"])
run_res = None if v.get("run") is None else RunResult(**v["run"])
profile_res = None if v.get("profile") is None else ProfileResult(**v["profile"])

# Update profile artifact to the actual download URL.
# For the GitHub launcher the profile_artifact currently just contains
# the name of the artifact.
if profile_res is not None:
profile_res.download_url = index["profile-data"].public_download_url

res = EvalResult(
start=datetime.datetime.fromisoformat(v["start"]),
end=datetime.datetime.fromisoformat(v["end"]),
compilation=comp,
run=run,
compilation=comp_res,
run=run_res,
profile=profile_res,
)
runs[k] = res

Expand All @@ -147,6 +164,13 @@ async def wait_callback(self, run: "GitHubRun", status: RunProgressReporter):
)


@dataclasses.dataclass
class GitHubArtifact:
    """Lightweight record describing one artifact attached to a workflow run."""

    # Artifact name as reported by the GitHub API (e.g. "run-result", "profile-data").
    name: str
    # API URL returning the zipped artifact; requires token authentication.
    archive_download_url: str
    # Browser-facing URL a human can click to download the artifact.
    public_download_url: str


class GitHubRun:
def __init__(self, repo: str, token: str, branch: str, workflow_file: str):
gh = Github(token)
Expand Down Expand Up @@ -323,34 +347,43 @@ async def wait_for_completion(
logger.error(f"Error waiting for GitHub run {self.run_id}: {e}", exc_info=e)
raise # Re-raise other exceptions

async def download_artifacts(self) -> dict:
logger.info("Attempting to download artifacts for run %s", self.run_id)

def get_artifact_index(self) -> dict[str, GitHubArtifact]:
    """Build an index of this run's artifacts without downloading them.

    Returns a mapping from artifact name to a :class:`GitHubArtifact`
    carrying both the authenticated archive URL (for programmatic
    download via ``download_artifact``) and a public browser URL.

    Note: the span as displayed contained leftover deleted diff lines
    (the old per-artifact HTTP download/tempfile/zip extraction) which
    shadowed the ``extracted[artifact.name]`` assignment and performed
    dead network work; that logic now lives in ``download_artifact``.
    """
    logger.info("Creating artifact index for run %s", self.run_id)
    artifacts = self.run.get_artifacts()

    extracted = {}

    for artifact in artifacts:
        extracted[artifact.name] = GitHubArtifact(
            name=artifact.name,
            archive_download_url=artifact.archive_download_url,
            # Non-machine users cannot download from the archive_download_url and
            # the GitHub API does not give us access to the public download url.
            public_download_url=f"{self.repo.html_url}/actions/runs/{self.run_id}/artifacts/{artifact.id}",
        )

    logger.info("Download artifacts for run %s: %s", self.run_id, list(extracted.keys()))
    return extracted


async def download_artifact(self, artifact: GitHubArtifact) -> dict:
    """Download one artifact archive and return its files as ``{name: bytes}``.

    Raises:
        RuntimeError: if GitHub responds with a non-200 status code.
    """
    logger.info("Attempting to download artifact '%s' for run %s", artifact.name, self.run_id)

    response = requests.get(
        artifact.archive_download_url,
        headers={"Authorization": f"token {self.token}"},
    )

    # Fail fast on any non-success status so the caller sees a clear error.
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download artifact {artifact.name}. "
            f"Status code: {response.status_code}"
        )

    # Artifacts arrive as a zip archive; unpack fully in memory.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        contents = {member: archive.read(member) for member in archive.namelist()}

    logger.info("Downloaded artifact '%s' for run %s", artifact.name, self.run_id)
    return contents
24 changes: 23 additions & 1 deletion src/libkernelbot/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,30 @@ class Log:
content: str


@dataclasses.dataclass
class Link:
    """
    Link represents a link in the profiling report, to result data
    which can be downloaded by clicking it.
    """

    # Label shown before the hyperlink (e.g. which profiler produced the data).
    title: str
    # Visible hyperlink text (e.g. "Download from GitHub").
    text: str
    # Target URL the link points at.
    url: str


class RunResultReport:
    """Ordered collection of report parts (text sections, logs, and links).

    Parts are appended via the ``add_*`` helpers and later rendered in
    insertion order by the reporters (e.g. the Discord reporter).

    Note: the displayed span kept both the pre- and post-PR annotation
    lines, leaving a duplicated ``self.data`` assignment; only the
    ``Link``-aware version is retained here.
    """

    def __init__(self, data=None):
        # ``data or []`` deliberately allocates a fresh list when no (or an
        # empty) sequence is supplied, avoiding a shared mutable default.
        self.data: List[Text | Log | Link] = data or []

    def add_text(self, section: str):
        """Append a free-form text section."""
        self.data.append(Text(section))

    def add_log(self, header: str, log: str):
        """Append a titled log block."""
        self.data.append(Log(header, log))

    def add_link(self, title: str, text: str, url: str):
        """Append a downloadable-result hyperlink."""
        self.data.append(Link(title, text, url))

    def __repr__(self):
        return f"RunResultReport(data={self.data})"

Expand Down Expand Up @@ -267,6 +281,7 @@ def generate_system_info(system: SystemInfo):
Running on:
* GPU: `{system.gpu}`
* CPU: `{system.cpu}`
* Runtime: `{system.runtime}`
* Platform: `{system.platform}`
* Torch: `{system.torch}`
"""
Expand Down Expand Up @@ -322,6 +337,13 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901
make_profile_log(prof_run.run),
)

if prof_run.profile is not None and prof_run.profile.download_url is not None:
report.add_link(
f"{prof_run.profile.profiler} profiling output",
"Download from GitHub",
prof_run.profile.download_url,
)

if "leaderboard" in runs:
bench_run = runs["leaderboard"]
if _handle_crash_report(report, bench_run):
Expand Down
Loading
Loading