@@ -211,7 +211,7 @@ def run_single_test(pool: multiprocessing.Pool, test: TestCase):
211211 """
212212 world_size = test .args .get ("world_size" , None )
213213 if world_size is None :
214- return pool .apply (_run_single_test , (test , 0 , 0 ))
214+ return pool .apply (_run_single_test , (test ,))
215215 else :
216216 return run_multi_gpu_test (pool , test , world_size )
217217
@@ -255,7 +255,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
255255 durations = []
256256 # generate input data once
257257 data = generate_input (** test .args )
258- check_copy = _clone_data (data )
258+ check_copy = _clone_data (data , 0 )
259259 # first, one obligatory correctness check
260260 output = custom_kernel (data )
261261 good , message = wrap_check_implementation (check_copy , output )
@@ -275,7 +275,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
275275 test .args ["seed" ] += 13
276276
277277 data = generate_input (** test .args )
278- check_copy = _clone_data (data )
278+ check_copy = _clone_data (data , 0 )
279279 torch .cuda .synchronize ()
280280 start_event = torch .cuda .Event (enable_timing = True )
281281 end_event = torch .cuda .Event (enable_timing = True )
@@ -509,7 +509,7 @@ def run_single_profile(test: TestCase) -> str:
509509 torch .cuda .synchronize ()
510510
511511 with profile (activities = [ProfilerActivity .CPU , ProfilerActivity .CUDA ]) as prof :
512- submission_output = custom_kernel (_clone_data (data ))
512+ submission_output = custom_kernel (_clone_data (data , 0 ))
513513 torch .cuda .synchronize ()
514514 return prof .key_averages ().table (sort_by = "self_cuda_time_total" , row_limit = 20 )
515515
0 commit comments