diff --git a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
index 107644d95c219..4b751da00a557 100644
--- a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
+++ b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
@@ -184,7 +184,7 @@ void mul_kernel_cuda(TensorIteratorBase& iter) {
           return a * b;
         }
       );
-      jitted_gpu_kernel(iter, mul_string);
+      opmath_jitted_gpu_kernel_with_scalars(iter, mul_string);
     #else
       using opmath_t = at::opmath_type<scalar_t>;
       opmath_gpu_kernel_with_scalars(iter, MulFunctor<opmath_t>());
diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py
index b927c17ec8709..c293cab2943a1 100644
--- a/test/test_binary_ufuncs.py
+++ b/test/test_binary_ufuncs.py
@@ -3560,6 +3560,20 @@ def test_helper(x, q):
             x = make_tensor((2, 3, 4), dtype=x_dtype, device=device)
             test_helper(x, q)
 
+    @onlyCUDA
+    @dtypes(torch.chalf,)
+    def test_mul_chalf_tensor_and_cpu_scalar(self, device, dtype):
+        # Tests that Tensor and CPU Scalar work for `mul` for chalf.
+        # Ideally, this should be covered by `test_complex_half_reference_testing`
+        # from test_ops.py by checking reference_samples from the OpInfo.
+        # But currently that doesn't work as sample generation requires support of
+        # `index_select` which is not implemented for `complex32` at the
+        # time of writing this test.
+        # TODO: Remove this test once above issue is fixed.
+        # Ref: https://github.com/pytorch/pytorch/pull/76364
+        x = make_tensor((2, 2), device=device, dtype=dtype)
+        self.assertEqual(x * 2.5, x * torch.tensor(2.5, device=device, dtype=dtype))
+
 
 tensor_binary_ops = [
     '__lt__', '__le__',