-
Notifications
You must be signed in to change notification settings - Fork 22
[WIP] Ncu profile #368
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP] Ncu profile #368
Changes from all commits
3bf6f64
5f40e36
e3ac730
c60090b
2a69a10
9a6c08d
aa2f894
fbc28ad
6437e19
6c4bde0
a3e045c
844d3bf
3275924
b19b59b
3e8eb6f
998cf42
1de31fd
d754094
394e234
0e51cf5
3e6a59c
f31e4bb
00c215a
b014b79
f328eba
eaa54f7
e83b0f4
716aca9
cb880a7
2621ca1
af80b61
2931fd4
110386e
8a4c6b2
c9786fb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,23 +19,11 @@ run-name: 'NVIDIA Job - ${{ github.event.inputs.run_id }}' | |
|
|
||
| jobs: | ||
| run: | ||
| runs-on: [gpumode-nvidia-arc] | ||
| runs-on: [nvidia-docker-b200-8-x86-64] | ||
| timeout-minutes: 10 | ||
| container: | ||
| image: nvidia/cuda:12.4.0-devel-ubuntu22.04 | ||
| steps: | ||
| - uses: actions/checkout@v3 | ||
|
|
||
| - name: Setup Python | ||
| uses: actions/setup-python@v5 | ||
| with: | ||
| python-version: '3.10' | ||
|
|
||
| - name: Install uv | ||
| uses: astral-sh/setup-uv@v3 | ||
| with: | ||
| version: "latest" | ||
|
|
||
| - name: Create input files | ||
| shell: bash | ||
|
Comment on lines
27
to
28
|
||
| run: | | ||
|
|
@@ -49,30 +37,18 @@ jobs: | |
| # Now write to file (won't be logged since it's masked) | ||
| echo "$PAYLOAD" > payload.json | ||
|
|
||
| - name: Install uv | ||
| uses: astral-sh/setup-uv@v3 | ||
| with: | ||
| version: "latest" | ||
|
|
||
| - name: Setup Python environment | ||
| - name: Setup Virtual Environment and Install Dependencies | ||
| shell: bash | ||
| run: | | ||
| uv venv .venv | ||
| echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV | ||
| echo "$PWD/.venv/bin" >> $GITHUB_PATH | ||
| pip install --upgrade pip | ||
| pip install -r "requirements.txt" | ||
| pip install -e . | ||
|
|
||
| if [[ -n "${{ github.event.inputs.requirements }}" ]]; then | ||
| cat > "requirements.txt" <<'EOL' | ||
| ${{ github.event.inputs.requirements }} | ||
| EOL | ||
| uv pip install -r "requirements.txt" | ||
| fi | ||
| uv pip install -e . | ||
|
|
||
| - name: Run script | ||
| shell: bash | ||
| run: | | ||
| python src/runners/github-runner.py | ||
| python3 src/runners/github-runner.py | ||
|
|
||
| - name: Upload training artifacts | ||
| uses: actions/upload-artifact@v4 | ||
|
|
@@ -88,5 +64,3 @@ jobs: | |
| name: profile-data | ||
| path: profile_data/* | ||
| retention-days: 1 | ||
| env: | ||
| CUDA_VISIBLE_DEVICES: 0 | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -500,9 +500,9 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l | |||||
| return 112 | ||||||
|
|
||||||
|
|
||||||
| def _run_single_profile(test: TestCase) -> str: | ||||||
| def _run_single_profile_torch(test: TestCase) -> str: | ||||||
| """ | ||||||
| Runs a single test case. Do not call directly | ||||||
| Profiles a single benchmark using the torch profiler. | ||||||
| """ | ||||||
| from submission import custom_kernel | ||||||
| from torch.profiler import profile, ProfilerActivity | ||||||
|
|
@@ -511,14 +511,36 @@ def _run_single_profile(test: TestCase) -> str: | |||||
| data = generate_input(**test.args) | ||||||
| torch.cuda.synchronize() | ||||||
|
|
||||||
| cloned = _clone_data(data, 0) | ||||||
| with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: | ||||||
| with nvtx_range("custom_kernel"): | ||||||
| submission_output = custom_kernel(_clone_data(data, 0)) | ||||||
| submission_output = custom_kernel(cloned) | ||||||
|
||||||
| torch.cuda.synchronize() | ||||||
|
|
||||||
| return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20) | ||||||
|
|
||||||
|
|
||||||
| def _run_single_profile_ncu(test: TestCase) -> str: | ||||||
| """ | ||||||
| Profiles a single benchmark using ncu. Note: this does not | ||||||
| invoke NCU; instead, it is expected that eval is launched | ||||||
| under NCU, and this function will run the kernel exactly | ||||||
|
||||||
| under NCU, and this function will rurnthe kernel excactly | |
| under NCU, and this function will run the kernel exactly |
Copilot
AI
Nov 10, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Variable submission_output is not used.
| submission_output = custom_kernel(cloned) | |
| custom_kernel(cloned) |
Copilot
AI
Nov 10, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The condition bool(os.getenv("POPCORN_NCU", "0")) will always evaluate to True because os.getenv() returns a string (either the env var value or the default "0"), and bool("0") is True. To properly check for a truthy environment variable, use:
if os.getenv("POPCORN_NCU", "0") != "0":or
if os.getenv("POPCORN_NCU", "") in ("1", "true", "True"):| if bool(os.getenv("POPCORN_NCU", "0")): | |
| if os.getenv("POPCORN_NCU", "") in ("1", "true", "True"): |
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -189,6 +189,8 @@ async def display_report(self, title: str, report: RunResultReport): | |||||||||||
| elif isinstance(part, Log): | ||||||||||||
| self.long_report += f"\n\n## {part.header}:\n" | ||||||||||||
| self.long_report += f"```\n{part.content}```" | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| # ruff: noqa: C901 | ||||||||||||
| async def to_submit_info( | ||||||||||||
| user_info: Any, | ||||||||||||
|
|
@@ -197,14 +199,12 @@ async def to_submit_info( | |||||||||||
| leaderboard_name: str, | ||||||||||||
| gpu_type: str, | ||||||||||||
| db_context: LeaderboardDB, | ||||||||||||
| ) -> tuple[SubmissionRequest, SubmissionMode]: # noqa: C901 | ||||||||||||
| ) -> tuple[SubmissionRequest, SubmissionMode]: # noqa: C901 | ||||||||||||
| user_name = user_info["user_name"] | ||||||||||||
| user_id = user_info["user_id"] | ||||||||||||
|
|
||||||||||||
| try: | ||||||||||||
| submission_mode_enum: SubmissionMode = SubmissionMode( | ||||||||||||
| submission_mode.lower() | ||||||||||||
| ) | ||||||||||||
| submission_mode_enum: SubmissionMode = SubmissionMode(submission_mode.lower()) | ||||||||||||
| except ValueError: | ||||||||||||
| raise HTTPException( | ||||||||||||
| status_code=400, | ||||||||||||
|
|
@@ -222,6 +222,11 @@ async def to_submit_info( | |||||||||||
| SubmissionMode.BENCHMARK, | ||||||||||||
| SubmissionMode.LEADERBOARD, | ||||||||||||
| ] | ||||||||||||
| if submission_mode_enum == SubmissionMode.PROFILE: | ||||||||||||
| raise HTTPException( | ||||||||||||
| status_code=400, | ||||||||||||
| detail="Profile submissions are not currently supported via API, use Discord instead.", | ||||||||||||
| ) | ||||||||||||
|
Comment on lines
+225
to
+229
|
||||||||||||
| if submission_mode_enum == SubmissionMode.PROFILE: | |
| raise HTTPException( | |
| status_code=400, | |
| detail="Profile submissions are not currently supported via API, use Discord instead.", | |
| ) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The workflow tries to import torch without installing it first. The previous version included steps to set up Python and install PyTorch, but these steps have been removed. This will cause the health check to fail. Consider adding back the installation steps or ensure torch is pre-installed in the runner environment.