Skip to content

Commit e8aeed0

Browse files
pemeliya authored and tensorflower-gardener committed
PR tensorflow#21886: [ROCM][NFC] BlasLt interface refactoring & simplifying: part I
Imported from GitHub PR openxla/xla#21886 After this PR tensorflow#73926 is merged, we can remove unnecessary low-level DoMatmul functions from GpuBlasLt interface (which otherwise looks scary and unnecessarily complicated). Furthermore, we can also remove **ValidateInputs** function from the interface and derived classes since a high-level **ExecuteOnStream** function already handles data-types correctly. This also greatly simplifies the code. Also, I have packed the input arguments of ExecuteOnStream calls to a struct **MemoryArgs** to simplify arguments passing in derived classes and improve code readability. Finally, in the original GpuBlasLt PR: openxla/xla#5911, I made a sort of mistake by adding a reference to **blas_lt** to the MatmulPlan class [here](https://github.com/openxla/xla/blob/main/xla/stream_executor/rocm/hip_blas_lt.h#L135), thereby making MatmulPlans bound to a **particular BlasLt instance**. This resulted in some further bugfixes and, most importantly, complicated GpuBlasLt cache design in gpublas_lt_matmul_thunk.cc/.h. In this PR, I remove this reference again from MatmulPlan class and in the next NFC PR the cache mechanics can also be simplified. Unfortunately, this change also requires a tandem PR for Tensorflow: tensorflow#85835 @xla-rotation Would you please have a look Copybara import of the project: -- e96bb2fbedab3f53b31ef0e1748582c76e9fb105 by Pavel Emeliyanenko <pavel.emeliyanenko@amd.com>: blaslt interface refactoring: removing blas_lt_ref added cuda adaptions cuda-side adaptions cuda side adaptions fix fixing pointers Merging this change closes tensorflow#21886 PiperOrigin-RevId: 727898957
1 parent 1b71f5e commit e8aeed0

File tree

12 files changed

+256
-554
lines changed

12 files changed

+256
-554
lines changed

tensorflow/core/kernels/matmul_util.cc

+6-4
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2+
23
Licensed under the Apache License, Version 2.0 (the "License");
34
you may not use this file except in compliance with the License.
45
You may obtain a copy of the License at
6+
57
http://www.apache.org/licenses/LICENSE-2.0
8+
69
Unless required by applicable law or agreed to in writing, software
710
distributed under the License is distributed on an "AS IS" BASIS,
811
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -176,7 +179,7 @@ StatusOr<se::blas::ComputationType> GetBlasComputationType(
176179

177180
TF_ASSIGN_OR_RETURN(
178181
auto algorithms,
179-
plan->GetAlgorithms(*max_algorithm_count, max_scratch_size));
182+
plan->GetAlgorithms(stream, *max_algorithm_count, max_scratch_size));
180183

181184
ptr->second = {std::move(plan), std::move(algorithms)};
182185
}
@@ -201,9 +204,8 @@ Status PlanAndAlgorithms::ExecuteOnStream(
201204
se::DeviceMemoryBase{}, // c_scale_buffer
202205
se::DeviceMemoryBase{}, // d_scale_buffer
203206
se::DeviceMemoryBase{}, // d_amax_buffer
204-
algorithms[algorithm_idx],
205-
std::nullopt, // workspace
206-
&scratch_allocator, profile_result);
207+
algorithms[algorithm_idx], scratch_allocator,
208+
profile_result);
207209
}
208210

209211
} // namespace tensorflow

third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc

+8-5
Original file line numberDiff line numberDiff line change
@@ -1205,7 +1205,7 @@ CublasLtCmd::CublasLtCmd(
12051205
workspace_buffer_(workspace_buffer) {}
12061206

12071207
absl::StatusOr<se::gpu::BlasLt::MatmulPlan*> CublasLtCmd::GetMatmulPlan(
1208-
const stream_executor::Stream* stream) {
1208+
const se::Stream* stream) {
12091209
auto it = matmul_plans_cache_.find(stream);
12101210
if (it != matmul_plans_cache_.end()) return it->second.get();
12111211
TF_ASSIGN_OR_RETURN(auto plan, se::gpu::BlasLt::GetMatmulPlan(
@@ -1215,13 +1215,14 @@ absl::StatusOr<se::gpu::BlasLt::MatmulPlan*> CublasLtCmd::GetMatmulPlan(
12151215
}
12161216

12171217
absl::StatusOr<se::gpu::BlasLt::MatmulAlgorithm>
1218-
CublasLtCmd::GetMatmulAlgorithm(const se::gpu::BlasLt::MatmulPlan* plan,
1218+
CublasLtCmd::GetMatmulAlgorithm(const se::Stream* stream,
1219+
const se::gpu::BlasLt::MatmulPlan* plan,
12191220
int64_t max_workspace) {
12201221
auto it = matmul_algorithm_cache_.find(plan);
12211222
if (it != matmul_algorithm_cache_.end()) return it->second;
12221223
TF_ASSIGN_OR_RETURN(
12231224
auto algorithms,
1224-
plan->GetAlgorithms(/*max_algorithm_count*/ 128,
1225+
plan->GetAlgorithms(stream, /*max_algorithm_count*/ 128,
12251226
/*max_workspace_size*/ max_workspace));
12261227
TF_RET_CHECK(algorithm_idx_ >= 0 && algorithm_idx_ < algorithms.size());
12271228
auto [it_insert, _] =
@@ -1237,7 +1238,8 @@ absl::Status CublasLtCmd::Initialize(const Thunk::InitializeParams& params,
12371238
// Populate plan and algorithm cache;
12381239
TF_ASSIGN_OR_RETURN(auto plan, GetMatmulPlan(params.stream));
12391240
TF_RETURN_IF_ERROR(
1240-
GetMatmulAlgorithm(plan, workspace_buffer_.size()).status());
1241+
GetMatmulAlgorithm(params.stream, plan, workspace_buffer_.size())
1242+
.status());
12411243
return absl::OkStatus();
12421244
}
12431245

@@ -1246,7 +1248,8 @@ absl::Status CublasLtCmd::Record(const Thunk::ExecuteParams& execute_params,
12461248
se::CommandBuffer* command_buffer) {
12471249
TF_ASSIGN_OR_RETURN(auto plan, GetMatmulPlan(execute_params.stream));
12481250
TF_ASSIGN_OR_RETURN(auto algorithm,
1249-
GetMatmulAlgorithm(plan, workspace_buffer_.size()));
1251+
GetMatmulAlgorithm(execute_params.stream, plan,
1252+
workspace_buffer_.size()));
12501253

12511254
const BufferAllocations& allocs = *execute_params.buffer_allocations;
12521255

third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h

+4-4
Original file line numberDiff line numberDiff line change
@@ -803,13 +803,13 @@ class CublasLtCmd : public TracedCommandBufferCmd {
803803

804804
private:
805805
absl::StatusOr<se::gpu::BlasLt::MatmulPlan*> GetMatmulPlan(
806-
const stream_executor::Stream* stream);
806+
const se::Stream* stream);
807807

808808
absl::StatusOr<se::gpu::BlasLt::MatmulAlgorithm> GetMatmulAlgorithm(
809-
const se::gpu::BlasLt::MatmulPlan* plan, int64_t max_workspace);
809+
const se::Stream* stream, const se::gpu::BlasLt::MatmulPlan* plan,
810+
int64_t max_workspace);
810811

811-
absl::flat_hash_map<const stream_executor::Stream*,
812-
se::gpu::BlasLt::MatmulPlanPtr>
812+
absl::flat_hash_map<const se::Stream*, se::gpu::BlasLt::MatmulPlanPtr>
813813
matmul_plans_cache_;
814814

815815
absl::flat_hash_map<const se::gpu::BlasLt::MatmulPlan*,

third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.cc

+11-9
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,11 @@ CublasLtMatmulThunk::CublasLtMatmulThunk(
6565
absl::Status CublasLtMatmulThunk::ExecuteOnStream(const ExecuteParams& params) {
6666
TF_ASSIGN_OR_RETURN(auto plan, GetMatmulPlan(params.stream));
6767

68-
TF_ASSIGN_OR_RETURN(
69-
auto algorithm,
70-
GetMatmulAlgorithm(plan, workspace_buffer_.has_value()
71-
? workspace_buffer_.value().size()
72-
: 0));
68+
TF_ASSIGN_OR_RETURN(auto algorithm,
69+
GetMatmulAlgorithm(params.stream, plan,
70+
workspace_buffer_.has_value()
71+
? workspace_buffer_.value().size()
72+
: 0));
7373

7474
VLOG(3) << "Running cublas_lt matmul thunk";
7575
const BufferAllocations& allocs = *params.buffer_allocations;
@@ -99,7 +99,7 @@ absl::Status CublasLtMatmulThunk::ExecuteOnStream(const ExecuteParams& params) {
9999
aux = allocs.GetDeviceAddress(aux_buffer_);
100100
}
101101

102-
std::optional<se::DeviceMemoryBase> workspace;
102+
se::DeviceMemoryBase workspace;
103103
if (workspace_buffer_.has_value()) {
104104
workspace = allocs.GetDeviceAddress(workspace_buffer_.value());
105105
}
@@ -112,7 +112,7 @@ absl::Status CublasLtMatmulThunk::ExecuteOnStream(const ExecuteParams& params) {
112112
}
113113

114114
absl::StatusOr<se::gpu::BlasLt::MatmulPlan*> CublasLtMatmulThunk::GetMatmulPlan(
115-
const stream_executor::Stream* stream) {
115+
const se::Stream* stream) {
116116
{
117117
absl::MutexLock lock(&matmul_plans_cache_mutex_);
118118
auto it = matmul_plans_cache_.find(stream);
@@ -127,7 +127,8 @@ absl::StatusOr<se::gpu::BlasLt::MatmulPlan*> CublasLtMatmulThunk::GetMatmulPlan(
127127
}
128128

129129
absl::StatusOr<se::gpu::BlasLt::MatmulAlgorithm>
130-
CublasLtMatmulThunk::GetMatmulAlgorithm(const se::gpu::BlasLt::MatmulPlan* plan,
130+
CublasLtMatmulThunk::GetMatmulAlgorithm(const se::Stream* stream,
131+
const se::gpu::BlasLt::MatmulPlan* plan,
131132
int64_t max_workspace) {
132133
{
133134
absl::MutexLock lock(&matmul_algorithm_cache_mutex_);
@@ -136,7 +137,8 @@ CublasLtMatmulThunk::GetMatmulAlgorithm(const se::gpu::BlasLt::MatmulPlan* plan,
136137
}
137138
TF_ASSIGN_OR_RETURN(
138139
auto algorithms,
139-
plan->GetAlgorithms(/*max_algorithm_count*/ 128,
140+
plan->GetAlgorithms(stream,
141+
/*max_algorithm_count*/ 128,
140142
/*max_workspace_size*/ max_workspace));
141143
TF_RET_CHECK(algorithm_idx_ >= 0 && algorithm_idx_ < algorithms.size());
142144

third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ class CublasLtMatmulThunk : public Thunk {
7474
absl::StatusOr<se::gpu::BlasLt::MatmulPlan*> GetMatmulPlan(
7575
const stream_executor::Stream* stream);
7676
absl::StatusOr<se::gpu::BlasLt::MatmulAlgorithm> GetMatmulAlgorithm(
77-
const se::gpu::BlasLt::MatmulPlan* plan, int64_t max_workspace);
77+
const se::Stream* stream, const se::gpu::BlasLt::MatmulPlan* plan,
78+
int64_t max_workspace);
7879

7980
absl::Mutex matmul_plans_cache_mutex_;
8081
absl::flat_hash_map<const stream_executor::Stream*,

third_party/xla/xla/service/gpu/autotuning/gemm_algorithm_picker.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ class GemmAutotuner {
186186

187187
TF_ASSIGN_OR_RETURN(
188188
auto algorithms,
189-
plan->GetAlgorithms(/*max_algorithm_count*/ 128,
189+
plan->GetAlgorithms(stream_, /*max_algorithm_count*/ 128,
190190
/*max_workspace_size*/ workspace_buffer.size()));
191191

192192
auto tuned_func = [&](const BlasLt::MatmulAlgorithm& algorithm)

0 commit comments

Comments
 (0)