@@ -297,6 +297,62 @@ def run_program(
297297 )
298298
299299
def profile_program(
    system: SystemInfo,
    call: list[str],
    seed: Optional[int],
    timeout: int,
    multi_gpu: bool,
) -> tuple[RunResult, Optional[ProfileResult]]:
    """Run `call`, wrapped in a profiler when the platform supports one.

    On ROCm systems the command is prefixed with a `rocprofv3` invocation
    that writes its traces (pftrace and csv) into the relative directory
    `profile_data`. On every other runtime the program is run unprofiled.

    Args:
        system: System description; only `system.runtime` is inspected here.
        call: Command line of the program to execute.
        seed: Forwarded unchanged to `run_program`.
        timeout: Forwarded unchanged to `run_program`.
        multi_gpu: Forwarded unchanged to `run_program`.

    Returns:
        A tuple `(run_result, profile_result)`. `profile_result` is `None`
        when no profiler is available for the runtime or when the run was
        not successful.
    """
    if system.runtime != "ROCm":
        # TODO: Implement profiling for other platforms
        return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None

    # The runner-specific configuration should implement logic
    # to fetch the data in this directory and return it as
    # ProfileResult.download_url.
    output_dir = Path("profile_data")

    # Tolerate a directory left over from an earlier profiling run instead of
    # failing with FileExistsError before the program even starts.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Wrap program in rocprof
    profiler_prefix = [
        "rocprofv3",
        "--log-level",
        "fatal",
        "--hip-trace",
        "--kernel-trace",
        "--rccl-trace",
        "--marker-trace",
        "--memory-copy-trace",
        # TODO(Robin): New? Doesn't work in the runner
        # "--memory-allocation-trace",
        "--scratch-memory-trace",
        # TODO(Robin): The HSA trace is very large. Skip for now, maybe make optional later?
        # "--hsa-trace",
        "--output-format",
        "pftrace",
        "csv",
        "-d",
        str(output_dir),
        # Just store the files as %pid%_tracename.ext instead of putting them in an
        # additional directory named after the hostname.
        "-o",
        "%pid%",
        "--",
    ]

    run_result = run_program(
        profiler_prefix + call, seed=seed, timeout=timeout, multi_gpu=multi_gpu
    )

    profile_result = None
    if run_result.success:
        # download_url is left unset here; the runner-specific layer is
        # expected to fill it in after collecting the files in output_dir.
        profile_result = ProfileResult(
            profiler="rocPROF",
            download_url=None,
        )

    return run_result, profile_result
355+
300356def run_single_evaluation (
301357 system : SystemInfo ,
302358 call : list [str ],
@@ -332,6 +388,9 @@ def run_single_evaluation(
332388
333389 call += [mode , cases .name ]
334390
391+ if mode == "profile" :
392+ return profile_program (system , call , seed = seed , timeout = timeout , multi_gpu = multi_gpu )
393+
335394 return run_program (call , seed = seed , timeout = timeout , multi_gpu = multi_gpu ), None
336395
337396
0 commit comments