@@ -128,7 +128,7 @@ std::vector<py::object> gemm(py::handle A, bool transa, py::handle B, bool trans
                              py::handle quantizer, std::optional<DType> out_dtype, MaybeTensor bias,
                              DType bias_type, bool gelu, MaybeTensor gelu_in, bool grad,
                              at::Tensor workspace, size_t workspaceSize, bool accumulate,
-                             bool use_split_accumulator, CommOverlapCore *comm_overlap = nullptr,
+                             bool use_split_accumulator, CommOverlapManager *comm_overlap = nullptr,
                              std::optional<CommOverlapType> comm_type = std::nullopt,
                              MaybeTensor extra_output = std::nullopt, bool bulk_overlap = false,
                              float alpha = 1.0f, std::optional<float> beta = std::nullopt);
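
Note on the first hunk: the only change for callers is the type of the overlap handle (CommOverlapManager* instead of CommOverlapCore*); every parameter from comm_overlap onward is still defaulted, so call sites that stop at use_split_accumulator keep compiling. A toy, self-contained sketch of that trailing-default pattern (stand-in types only, not the real extension signature):

    #include <optional>

    struct CommOverlapManager {};  // stand-in for the real torch::CustomClassHolder type

    // Mirrors the shape of the gemm() declaration above: overlap-related
    // parameters trail the required ones and are all defaulted.
    void gemm_like(float a, float b, CommOverlapManager *comm_overlap = nullptr,
                   std::optional<int> comm_type = std::nullopt, float alpha = 1.0f,
                   std::optional<float> beta = std::nullopt) {
      (void)a; (void)b; (void)comm_overlap; (void)comm_type; (void)alpha; (void)beta;
    }

    int main() {
      CommOverlapManager mgr;
      gemm_like(1.0f, 2.0f);        // pre-existing call sites: no overlap handle
      gemm_like(1.0f, 2.0f, &mgr);  // overlap-enabled call sites pass the manager
      return 0;
    }
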
@@ -504,48 +504,56 @@ class CommOverlapHelper : torch::CustomClassHolder {
 
   CommOverlapHelper();
 
-  CommOverlapHelper(c10d::ProcessGroup *world_group,
-                    std::optional<c10d::ProcessGroup *> intra_node_group);
+  CommOverlapHelper(c10d::ProcessGroup *tp_group);
+
+  CommOverlapHelper(c10d::ProcessGroup *world_group, c10d::ProcessGroup *intra_node_group);
 
   ~CommOverlapHelper();
 
   void ub_allgather(void *globaldata, size_t globalbytes, void *localdata, size_t localbytes,
                     ExtComm comm);
 
   void ub_barrier(ExtComm comm);
-};
-
-class CommOverlap : torch::CustomClassHolder, public transformer_engine::CommOverlapBase {
- public:
-  CommOverlap(const std::vector<size_t> &buffer_shape, at::ScalarType buffer_dtype,
-              CommOverlapHelper *helper, int tp_size, int num_splits = 3,
-              int num_max_streams = NVTE_COMM_OVERLAP_MAX_STREAMS, int comm_cga_size = 2,
-              int gemm_priority = 0, int comm_priority = 0, int num_comm_sm = 16,
-              bool set_sm_margin = true, bool atomic_gemm = false,
-              bool rs_overlap_first_gemm = false);
-
-  ~CommOverlap() {}
-
-  void copy_into_buffer(const at::Tensor &input, bool local_chunk = false);
 
-  at::Tensor get_buffer(bool local_chunk = false,
-                        std::optional<std::vector<int64_t>> shape = std::nullopt);
-
-  std::pair<at::Stream, at::Stream> get_communication_stream();
+  int64_t get_comm_ptr(std::string group = "world") { return pgs[group]->getCommPtr(); }
+};
 
-};  // CommOverlap
+class CommOverlapManager : torch::CustomClassHolder {
+ private:
+#ifndef NVTE_WITH_CUBLASMP
+  transformer_engine::CommOverlapCore *_ctx;
+#else
+  CommGemmCtx *_ctx;
+#endif
+  transformer_engine::CommOverlapMethod _method;
+  int _num_comm_sm;
+  bool _use_atomic_gemm;
 
-class CommOverlapP2P : torch::CustomClassHolder, public transformer_engine::CommOverlapP2PBase {
  public:
-  CommOverlapP2P(const std::vector<size_t> &buffer_shape, at::ScalarType buffer_dtype,
-                 CommOverlapHelper *helper, int tp_size,
-                 transformer_engine::CommOverlapType comm_type,
-                 int num_max_streams = NVTE_COMM_OVERLAP_MAX_STREAMS, int comm_cga_size = 2,
-                 int gemm_priority = 0, int comm_priority = 0, int num_comm_sm = 3,
-                 bool set_sm_margin = true, bool atomic_gemm = false, bool use_ce = true,
-                 bool aggregate = false);
-
-  ~CommOverlapP2P() {}
+  CommOverlapManager(transformer_engine::CommOverlapMethod method,
+                     transformer_engine::CommOverlapType comm_type,
+                     const std::vector<size_t> &buffer_shape, at::ScalarType buffer_dtype,
+                     CommOverlapHelper *helper, int tp_size, int num_splits = 3,
+                     int num_max_streams = NVTE_COMM_OVERLAP_MAX_STREAMS, int comm_cga_size = 2,
+                     int gemm_priority = 0, int comm_priority = 0, int num_comm_sm = 16,
+                     bool set_sm_margin = false, bool atomic_gemm = false,
+                     bool aggregate_ag = false, bool rs_overlap_first_gemm = false);
+
+  ~CommOverlapManager() {
+#ifdef NVTE_WITH_CUBLASMP
+    nvte_comm_gemm_ctx_destroy(_ctx);
+#else
+    delete _ctx;
+#endif
+  }
+
+  bool is_fp8_ubuf() {
+#ifndef NVTE_WITH_CUBLASMP
+    return _ctx->is_fp8_ubuf();
+#else
+    return false;
+#endif
+  }
 
   void copy_into_buffer(const at::Tensor &input, bool local_chunk = false);
 
@@ -554,6 +562,11 @@ class CommOverlapP2P : torch::CustomClassHolder, public transformer_engine::Comm
 
   std::pair<at::Stream, at::Stream> get_communication_stream();
 
-};  // CommOverlapP2P
+  void execute(const TensorWrapper &A, bool transa, const TensorWrapper &B, bool transb,
+               TensorWrapper &D, TensorWrapper &bias, TensorWrapper &pre_gelu_out,
+               TensorWrapper &workspace, bool grad, bool accumulate, bool use_split_accumulator,
+               transformer_engine::CommOverlapType comm_type, TensorWrapper &aux_out,
+               cudaStream_t stream);
+};  // CommOverlapManager
 
 #endif  // TRANSFORMER_ENGINE_PYTORCH_CSRC_EXTENSIONS_H_
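
The new CommOverlapManager hides whichever backend the build selects behind a single opaque context pointer. A self-contained sketch of that compile-time dispatch pattern, with stand-in types in place of transformer_engine::CommOverlapCore, CommGemmCtx, and nvte_comm_gemm_ctx_destroy (an illustration of the pattern only, not the real implementation):

    #include <cstdio>

    // In the real header the macro comes from the build system; leave it
    // undefined here to exercise the userbuffers-style path.
    // #define NVTE_WITH_CUBLASMP

    #ifndef NVTE_WITH_CUBLASMP
    struct OverlapCore {                    // stand-in for CommOverlapCore
      bool is_fp8_ubuf() const { return false; }
    };
    #else
    struct CommGemmCtxStub;                 // stand-in for the cuBLASMp context
    CommGemmCtxStub *ctx_create();
    void ctx_destroy(CommGemmCtxStub *);
    #endif

    class Manager {
    #ifndef NVTE_WITH_CUBLASMP
      OverlapCore *_ctx;
    #else
      CommGemmCtxStub *_ctx;
    #endif

     public:
    #ifndef NVTE_WITH_CUBLASMP
      Manager() : _ctx(new OverlapCore()) {}
    #else
      Manager() : _ctx(ctx_create()) {}
    #endif

      ~Manager() {
    #ifdef NVTE_WITH_CUBLASMP
        ctx_destroy(_ctx);                  // C-style teardown, as in the header above
    #else
        delete _ctx;                        // plain C++ ownership otherwise
    #endif
      }

      bool is_fp8_ubuf() const {
    #ifndef NVTE_WITH_CUBLASMP
        return _ctx->is_fp8_ubuf();
    #else
        return false;                       // cuBLASMp path reports no FP8 ubuf, as above
    #endif
      }
    };

    int main() {
      Manager m;
      std::printf("fp8 ubuf: %d\n", m.is_fp8_ubuf() ? 1 : 0);
      return 0;
    }
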