NVIDIA · naoyam · Nov 7, 2024 · Nov 5, 2024 · Nov 5, 2024 · Nov 5, 2024
diff --git a/benchmarks/cpp/gelu_backward.cpp b/benchmarks/cpp/gelu_backward.cpp
@@ -162,7 +162,7 @@ static void NvFuserScheduler_GeluBackward_Compile(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
   for (auto _ : benchmark_state) {
-    FusionExecutor executor;
+    KernelExecutor executor;
     executor.compileFusion(&fusion, inputs, heuristic_params->lparams);
   }
 }
@@ -187,7 +187,7 @@ static void NvFuserScheduler_GeluBackward_RunFusion(
   auto heuristic_params = SchedulerEntry::scheduleWith(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
-  FusionExecutor executor;
+  KernelExecutor executor;
   executor.compileFusion(&fusion, inputs, heuristic_params->lparams);
 
   C10_CUDA_CHECK(cudaDeviceSynchronize());
@@ -218,7 +218,7 @@ static void NvFuserScheduler_GeluBackward_RunFusion_GpuOnly(
   auto heuristic_params = SchedulerEntry::scheduleWith(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
-  FusionExecutor executor;
+  KernelExecutor executor;
   executor.compileFusion(&fusion, inputs, heuristic_params->lparams);
 
   runBenchmarkIterations(
@@ -247,7 +247,7 @@ static void NvFuserScheduler_GeluBackward_RunFusion_CpuOnly(
   auto heuristic_params = SchedulerEntry::scheduleWith(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
-  FusionExecutor executor;
+  KernelExecutor executor;
   executor.setExecuteKernelFlag(false);
   executor.compileFusion(&fusion, inputs, heuristic_params->lparams);
 

diff --git a/benchmarks/cpp/heuristic_cache.cpp b/benchmarks/cpp/heuristic_cache.cpp
@@ -26,7 +26,7 @@ using namespace nvfuser;
 
 static auto getLayerBackwardNormRuntime(
     std::unique_ptr<Fusion> fusion_ptr,
-    std::unique_ptr<FusionExecutorCache>& fec,
+    std::unique_ptr<FusionExecutorCache>& executor_cache,
     std::vector<c10::IValue>& aten_inputs,
     std::vector<int64_t>& shape,
     std::vector<int64_t>& norm_shape) {
@@ -84,12 +84,12 @@ static auto getLayerBackwardNormRuntime(
   auto aten_mean = std::get<1>(aten_results);
   auto aten_rstd = std::get<2>(aten_results);
 
-  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
+  executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
   aten_inputs = {
       aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
-  auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
+  auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs);
 
-  return fec->getMostRecentKernelRuntime();
+  return executor_cache->getMostRecentKernelRuntime();
 }
 
 static void NvFuserScheduler_LayerNormBackward_HeuristicCache(
@@ -98,14 +98,14 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicCache(
   FusionGuard fg(fusion_ptr.get());
 
   // PreAllocate
-  std::unique_ptr<FusionExecutorCache> fec;
+  std::unique_ptr<FusionExecutorCache> executor_cache;
   std::vector<c10::IValue> aten_inputs;
 
   std::vector<int64_t> shape{20, 100, 35, 67};
   std::vector<int64_t> norm_shape{67};
 
   auto runtime = getLayerBackwardNormRuntime(
-      std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
+      std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape);
 
   KernelArgumentHolder args =
       KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
@@ -120,7 +120,7 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicCache(
 
 static auto getLayerForwardNormRuntime(
     std::unique_ptr<Fusion> fusion_ptr,
-    std::unique_ptr<FusionExecutorCache>& fec,
+    std::unique_ptr<FusionExecutorCache>& executor_cache,
     std::vector<c10::IValue>& aten_inputs,
     std::vector<int64_t>& shape,
     std::vector<int64_t>& norm_shape) {
@@ -141,11 +141,11 @@ static auto getLayerForwardNormRuntime(
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor aten_input = at::randn(shape, options);
 
-  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
+  executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
   aten_inputs = {aten_input};
-  auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
+  auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs);
 
-  return fec->getMostRecentKernelRuntime();
+  return executor_cache->getMostRecentKernelRuntime();
 }
 
 static void NvFuserScheduler_LayerNormForward_HeuristicCache(
@@ -154,14 +154,14 @@ static void NvFuserScheduler_LayerNormForward_HeuristicCache(
   FusionGuard fg(fusion_ptr.get());
 
   // PreAllocate
-  std::unique_ptr<FusionExecutorCache> fec;
+  std::unique_ptr<FusionExecutorCache> executor_cache;
   std::vector<c10::IValue> aten_inputs;
 
   std::vector<int64_t> shape{20, 100, 35, 67};
   std::vector<int64_t> norm_shape{67};
 
   auto runtime = getLayerForwardNormRuntime(
-      std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
+      std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape);
 
   KernelArgumentHolder args =
       KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);

diff --git a/benchmarks/cpp/heuristic_lookup.cpp b/benchmarks/cpp/heuristic_lookup.cpp
@@ -26,7 +26,7 @@ using namespace nvfuser;
 
 static auto getLayerBackwardNormRuntime(
     std::unique_ptr<Fusion> fusion_ptr,
-    std::unique_ptr<FusionExecutorCache>& fec,
+    std::unique_ptr<FusionExecutorCache>& executor_cache,
     std::vector<c10::IValue>& aten_inputs,
     std::vector<int64_t>& shape,
     std::vector<int64_t>& norm_shape) {
@@ -86,12 +86,12 @@ static auto getLayerBackwardNormRuntime(
   auto aten_mean = std::get<1>(aten_results);
   auto aten_rstd = std::get<2>(aten_results);
 
-  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
+  executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
   aten_inputs = {
       aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
-  auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
+  auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs);
 
-  return fec->getMostRecentKernelRuntime();
+  return executor_cache->getMostRecentKernelRuntime();
 }
 
 static void NvFuserScheduler_LayerNormBackward_HeuristicLookup(
@@ -100,14 +100,14 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicLookup(
   FusionGuard fg(fusion_ptr.get());
 
   // PreAllocate
-  std::unique_ptr<FusionExecutorCache> fec;
+  std::unique_ptr<FusionExecutorCache> executor_cache;
   std::vector<c10::IValue> aten_inputs;
 
   std::vector<int64_t> shape{20, 100, 35, 67};
   std::vector<int64_t> norm_shape{67};
 
   auto runtime = getLayerBackwardNormRuntime(
-      std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
+      std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape);
 
   KernelArgumentHolder args =
       KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
@@ -122,7 +122,7 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicLookup(
 
 static auto getLayerForwardNormRuntime(
     std::unique_ptr<Fusion> fusion_ptr,
-    std::unique_ptr<FusionExecutorCache>& fec,
+    std::unique_ptr<FusionExecutorCache>& executor_cache,
     std::vector<c10::IValue>& aten_inputs,
     std::vector<int64_t>& shape,
     std::vector<int64_t>& norm_shape) {
@@ -143,11 +143,11 @@ static auto getLayerForwardNormRuntime(
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor aten_input = at::randn(shape, options);
 
-  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
+  executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
   aten_inputs = {aten_input};
-  auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
+  auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs);
 
-  return fec->getMostRecentKernelRuntime();
+  return executor_cache->getMostRecentKernelRuntime();
 }
 
 static void NvFuserScheduler_LayerNormForward_HeuristicLookup(
@@ -156,14 +156,14 @@ static void NvFuserScheduler_LayerNormForward_HeuristicLookup(
   FusionGuard fg(fusion_ptr.get());
 
   // PreAllocate
-  std::unique_ptr<FusionExecutorCache> fec;
+  std::unique_ptr<FusionExecutorCache> executor_cache;
   std::vector<c10::IValue> aten_inputs;
 
   std::vector<int64_t> shape{20, 100, 35, 67};
   std::vector<int64_t> norm_shape{67};
 
   auto runtime = getLayerForwardNormRuntime(
-      std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
+      std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape);
 
   KernelArgumentHolder args =
       KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);

diff --git a/benchmarks/cpp/indexselect.cpp b/benchmarks/cpp/indexselect.cpp
@@ -132,7 +132,7 @@ static void NvFuserScheduler_IndexSelect_Compile(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
   for (auto _ : benchmark_state) {
-    FusionExecutor executor;
+    KernelExecutor executor;
     executor.compileFusion(
         &fusion, c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
   }
@@ -155,7 +155,7 @@ static void NvFuserScheduler_IndexSelect_RunFusion(
   auto heuristic_params = SchedulerEntry::scheduleWith(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
-  FusionExecutor executor;
+  KernelExecutor executor;
   executor.compileFusion(
       &fusion, c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
 

diff --git a/benchmarks/cpp/lstm_cell.cpp b/benchmarks/cpp/lstm_cell.cpp
@@ -155,7 +155,7 @@ static void NvFuserScheduler_LstmCell_Compile(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
   for (auto _ : benchmark_state) {
-    FusionExecutor executor;
+    KernelExecutor executor;
     executor.compileFusion(&fusion, inputs);
   }
 }
@@ -182,7 +182,7 @@ static void NvFuserScheduler_LstmCell_RunFusion(
   auto heuristic_params = SchedulerEntry::scheduleWith(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
-  FusionExecutor executor;
+  KernelExecutor executor;
   executor.compileFusion(&fusion, inputs);
 
   C10_CUDA_CHECK(cudaDeviceSynchronize());
@@ -220,7 +220,7 @@ static void NvFuserScheduler_LstmCell_RunFusion_GpuOnly(
   auto heuristic_params = SchedulerEntry::scheduleWith(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
-  FusionExecutor executor;
+  KernelExecutor executor;
   executor.compileFusion(&fusion, inputs);
 
   runBenchmarkIterations(
@@ -259,7 +259,7 @@ static void NvFuserScheduler_LstmCell_RunFusion_CpuOnly(
   auto heuristic_params = SchedulerEntry::scheduleWith(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
-  FusionExecutor executor;
+  KernelExecutor executor;
   executor.setExecuteKernelFlag(false);
   executor.compileFusion(&fusion, inputs);
 

diff --git a/benchmarks/cpp/matmul.cpp b/benchmarks/cpp/matmul.cpp
@@ -175,19 +175,19 @@ static void SingleMatmulBase(
 
   // Compile kernel
   auto launch_constraints = LaunchParams();
-  FusionExecutor fe;
-  fe.compileFusion(fusion, args, launch_constraints, cparams);
+  KernelExecutor ke;
+  ke.compileFusion(fusion, args, launch_constraints, cparams);
   NVF_CHECK(
-      getBankConflictInfo(fe.kernel(), launch_constraints).empty(),
+      getBankConflictInfo(ke.kernel(), launch_constraints).empty(),
       "Shared memory bank conflict not removed.");
 
   std::vector<c10::IValue> aten_inputs({inputs.first, inputs.second});
 
   // Warm up run
-  auto outputs = fe.runFusion(aten_inputs);
+  auto outputs = ke.runFusion(aten_inputs);
   checkMatch(expected_output, outputs.at(0).to(at::kDouble), k);
 
-  runBenchmarkIterations(benchmark_state, &fe, aten_inputs);
+  runBenchmarkIterations(benchmark_state, &ke, aten_inputs);
 
   // TODO: FLOPS calculation
 }
@@ -355,19 +355,19 @@ static void SingleMatmulPartitionedK(
   cparams.index_type = computeIndexType(M, N, K);
 
   // Compile kernel
-  FusionExecutor fe;
+  KernelExecutor ke;
   auto lparams = LaunchParams();
-  fe.compileFusion(fusion, args, lparams, cparams);
+  ke.compileFusion(fusion, args, lparams, cparams);
   NVF_CHECK(
-      getBankConflictInfo(fe.kernel(), lparams).empty(),
+      getBankConflictInfo(ke.kernel(), lparams).empty(),
       "Shared memory bank conflict not removed.");
 
   // Warm up run
-  auto outputs = fe.runFusion(aten_inputs);
+  auto outputs = ke.runFusion(aten_inputs);
 
   checkMatch(expected_output, outputs.at(0).to(at::kDouble), Ki);
 
-  runBenchmarkIterations(benchmark_state, &fe, aten_inputs);
+  runBenchmarkIterations(benchmark_state, &ke, aten_inputs);
 
   // TODO: FLOPS calculation
 }
@@ -461,21 +461,21 @@ static void NvFuserScheduler_MatmulSplitKReduction(
       KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
 
   // Compile kernel
-  FusionExecutor fe;
-  fe.compileFusion(
+  KernelExecutor ke;
+  ke.compileFusion(
       fusion, args, heuristic_params->lparams, heuristic_params->cparams);
 
   NVF_CHECK(
-      getBankConflictInfo(fe.kernel(), heuristic_params->lparams).empty(),
+      getBankConflictInfo(ke.kernel(), heuristic_params->lparams).empty(),
       "Shared memory bank conflict not removed.");
 
   // Warm up run
-  auto outputs = fe.runFusion(aten_inputs, heuristic_params->lparams);
+  auto outputs = ke.runFusion(aten_inputs, heuristic_params->lparams);
 
   checkMatch(expected_output, outputs.at(0).to(at::kDouble), splitk_factor);
 
   runBenchmarkIterations(
-      benchmark_state, &fe, aten_inputs, heuristic_params->lparams);
+      benchmark_state, &ke, aten_inputs, heuristic_params->lparams);
 
   // TODO: FLOPS calculation
 }