diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h
index d2fa6191e..cc8514092 100644
--- a/include/matx/core/tensor.h
+++ b/include/matx/core/tensor.h
@@ -1428,8 +1428,6 @@ class tensor_t : public detail::tensor_impl_t<T, RANK, Desc> {
    * more dimensions of a tensor. This includes completely dropping an unwanted
    * dimension, or simply taking a piece of a wanted dimension. Slice() is very
    * similar to indexing operations in both Python and MATLAB.
-   *
-   * *NOTE* Users should not call Slice() directly anymore. Use the slice() operator instead.
    *
    * @param firsts
    *   List of starting index into each dimension. Indexing is 0-based
@@ -1452,10 +1450,10 @@
    * @returns Sliced view of tensor
    *
    */
-  template <int N = RANK, typename StrideType>
+  template <int N = RANK>
   __MATX_INLINE__ auto Slice([[maybe_unused]] const cuda::std::array<typename Desc::shape_type, RANK> &firsts,
-                             [[maybe_unused]] const cuda::std::array<typename Desc::shape_type, RANK> &ends,
-                             [[maybe_unused]] StrideType strides) const
+                             [[maybe_unused]] const cuda::std::array<typename Desc::shape_type, RANK> &ends,
+                             [[maybe_unused]] const cuda::std::array<typename Desc::stride_type, RANK> &strides) const
   {
     static_assert(N <= RANK && RANK > 0, "Must slice to a rank the same or less than current rank.");
 
@@ -1466,6 +1464,7 @@ class tensor_t : public detail::tensor_impl_t<T, RANK, Desc> {
     T *data = this->ldata_;
     int d = 0;
 
+    bool def_stride = (strides[0] == -1);
     [[maybe_unused]] int end_count = 0;
     for (int i = 0; i < RANK; i++) {
@@ -1487,14 +1486,9 @@
       MATX_ASSERT_STR(first < end, matxInvalidSize, "Slice must be at least one element long");
 
-      [[maybe_unused]] typename Desc::stride_type stride_mult;
-
-      if constexpr (std::is_same_v<StrideType, detail::NoStride>) {
-        stride_mult = 1;
-      }
-      else {
-        stride_mult = (strides[i] == matxKeepStride) ? 1 : strides[i];
-      }
+      [[maybe_unused]] typename Desc::stride_type stride_mult = (def_stride || strides[i] == matxKeepStride)
+                                                                    ? 1
+                                                                    : strides[i]; // custom stride
 
       MATX_ASSERT_STR(first < end, matxInvalidParameter,
                       "Starting slice must be less than end slice");
@@ -1531,10 +1525,10 @@ class tensor_t : public detail::tensor_impl_t<T, RANK, Desc> {
     return tensor_t<T, N, Storage, decltype(new_desc)>{storage_, std::move(new_desc), data};
   }
 
-  template <int N = RANK, typename StrideType>
+  template <int N = RANK>
   __MATX_INLINE__ auto Slice(const typename Desc::shape_type (&firsts)[RANK],
-                             const typename Desc::shape_type (&ends)[RANK],
-                             StrideType strides) const
+                             const typename Desc::shape_type (&ends)[RANK],
+                             const typename Desc::stride_type (&strides)[RANK]) const
   {
     return Slice<N>(detail::to_array(firsts), detail::to_array(ends), detail::to_array(strides));
   }
@@ -1565,13 +1559,15 @@
    */
   template <int N = RANK>
   __MATX_INLINE__ auto Slice(const cuda::std::array<typename Desc::shape_type, RANK> &firsts,
-                             const cuda::std::array<typename Desc::shape_type, RANK> &ends) const
+                             const cuda::std::array<typename Desc::shape_type, RANK> &ends) const
   {
     static_assert(N <= RANK && RANK > 0, "Must slice to a rank the same or less than current rank.");
 
     MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
 
-    return Slice<N>(firsts, ends, detail::NoStride{});
+    const cuda::std::array<typename Desc::stride_type, RANK> strides = {-1};
+
+    return Slice<N>(firsts, ends, strides);
   }
 
   template <int N = RANK>
diff --git a/include/matx/core/type_utils.h b/include/matx/core/type_utils.h
index 956de9297..945ba369b 100644
--- a/include/matx/core/type_utils.h
+++ b/include/matx/core/type_utils.h
@@ -66,7 +66,6 @@ enum class MemoryLayout {
 namespace detail {
 struct NoShape{};
 struct EmptyOp{};
-struct NoStride{};
 
 template <typename T>
 struct is_noshape : std::integral_constant<bool, std::is_same_v<NoShape, T>> {};
diff --git a/include/matx/operators/slice.h b/include/matx/operators/slice.h
index 74716c00d..ff84d7f6a 100644
--- a/include/matx/operators/slice.h
+++ b/include/matx/operators/slice.h
@@ -42,21 +42,20 @@ namespace matx
  * Slices elements from an operator/tensor.
  */
 namespace detail {
-
-  template <typename T, int DIM, typename StrideType>
-  class SliceOp : public BaseOp<SliceOp<T, DIM, StrideType>>
+  template <typename T, int DIM>
+  class SliceOp : public BaseOp<SliceOp<T, DIM>>
   {
     public:
       using value_type = typename T::value_type;
       using shape_type = index_t;
-      using self_type = SliceOp<T, DIM, StrideType>;
+      using self_type = SliceOp<T, DIM>;
 
     private:
       typename base_type<T>::type op_;
       cuda::std::array<shape_type, DIM> sizes_;
       cuda::std::array<int32_t, DIM> dims_;
       cuda::std::array<shape_type, T::Rank()> starts_;
-      StrideType strides_; // Add [[no_unique_address]] in c++20
+      cuda::std::array<shape_type, T::Rank()> strides_;
 
     public:
       using matxop = bool;
@@ -69,7 +68,7 @@
 
      __MATX_INLINE__ SliceOp(T op, const cuda::std::array<shape_type, T::Rank()> &starts,
                              const cuda::std::array<shape_type, T::Rank()> &ends,
-                             StrideType strides) : op_(op) {
+                             const cuda::std::array<shape_type, T::Rank()> &strides) : op_(op) {
        int32_t d = 0;
        for(int32_t i = 0; i < T::Rank(); i++) {
          shape_type start = starts[i] < 0 ? op.Size(i) + starts[i] : starts[i];
@@ -81,10 +80,7 @@
                          "Slice end index out of range of operator");
 
          starts_[i] = start;
-
-          if constexpr (!std::is_same_v<StrideType, detail::NoStride>) {
-            strides_[i] = strides[i];
-          }
+          strides_[i] = strides[i];
 
          // compute dims and sizes
          if(end != matxDropDim) {
@@ -99,10 +95,7 @@
          }
 
          //adjust size by stride
-          if constexpr (!std::is_same_v<StrideType, detail::NoStride>) {
-            sizes_[d] = (shape_type)std::ceil(static_cast<double>(sizes_[d])/ static_cast<double>(strides_[d]));
-          }
-
+          sizes_[d] = (shape_type)std::ceil(static_cast<double>(sizes_[d])/ static_cast<double>(strides_[d]));
          d++;
        }
      }
@@ -115,7 +108,7 @@
        static_assert(sizeof...(Is)==Rank());
        static_assert((std::is_convertible_v<Is, index_t> && ... ));
 
-#if 0
+        // convert variadic type to tuple so we can read/update
        cuda::std::array<index_t, Rank()> inds{indices...};
        cuda::std::array<index_t, T::Rank()> ind{indices...};
@@ -128,29 +121,6 @@
        for(int32_t i = 0; i < Rank(); i++) {
          ind[dims_[i]] += inds[i] * strides_[i];
        }
-#else
-        // convert variadic type to tuple so we can read/update
-        cuda::std::array<index_t, T::Rank()> ind;
-        cuda::std::array<index_t, Rank()> inds{indices...};
-
-        #pragma unroll
-        for (int32_t i = 0; i < T::Rank(); i++) {
-          #pragma unroll
-          for(int32_t j = 0; j < Rank(); j++) {
-            if(dims_[j] == i) {
-              if constexpr (!std::is_same_v<StrideType, detail::NoStride>) {
-                ind[i] = starts_[j] + inds[j] * strides_[i];
-              }
-              else {
-                ind[i] = starts_[j] + inds[j];
-              }
-            }
-            else {
-              ind[i] = starts_[i];
-            }
-          }
-        }
-#endif
 
        //return op_(ind);
        return cuda::std::apply(op_, ind);
@@ -162,42 +132,19 @@
        static_assert(sizeof...(Is)==Rank());
        static_assert((std::is_convertible_v<Is, index_t> && ... ));
 
-#if 0
-        cuda::std::array<index_t, Rank()> inds{indices...};
-        cuda::std::array<index_t, T::Rank()> ind{indices...};
+        // convert variadic type to tuple so we can read/update
+        cuda::std::array<index_t, Rank()> inds{indices...};
+        cuda::std::array<index_t, T::Rank()> ind{indices...};
 
 #pragma unroll
-        for(int32_t i = 0; i < T::Rank(); i++) {
+        for(int i = 0; i < T::Rank(); i++) {
          ind[i] = starts_[i];
        }
 
 #pragma unroll
-        for(int32_t i = 0; i < Rank(); i++) {
+        for(int i = 0; i < Rank(); i++) {
          ind[dims_[i]] += inds[i] * strides_[i];
        }
-#else
-        // convert variadic type to tuple so we can read/update
-        cuda::std::array<index_t, T::Rank()> ind;
-        cuda::std::array<index_t, Rank()> inds{indices...};
-
-        #pragma unroll
-        for (int32_t i = 0; i < T::Rank(); i++) {
-          #pragma unroll
-          for(int32_t j = 0; j < Rank(); j++) {
-            if(dims_[j] == i) {
-              if constexpr (!std::is_same_v<StrideType, detail::NoStride>) {
-                ind[i] = starts_[j] + inds[j] * strides_[i];
-              }
-              else {
-                ind[i] = starts_[j] + inds[j];
-              }
-            }
-            else {
-              ind[i] = starts_[i];
-            }
-          }
-        }
-#endif
 
        //return op_(ind);
        return cuda::std::apply(op_, ind);
@@ -269,23 +216,10 @@
    if constexpr (is_tensor_view_v<OpType>) {
      return op.Slice(starts, ends, strides);
    } else {
-      return detail::SliceOp<OpType, OpType::Rank(), cuda::std::array<index_t, OpType::Rank()>>(op, starts, ends, strides);
+      return detail::SliceOp<OpType, OpType::Rank()>(op, starts, ends, strides);
    }
  }
 
-  template <typename OpType>
-  __MATX_INLINE__ auto slice( const OpType &op,
-                              const cuda::std::array<index_t, OpType::Rank()> &starts,
-                              const cuda::std::array<index_t, OpType::Rank()> &ends,
-                              detail::NoStride strides)
-  {
-    if constexpr (is_tensor_view_v<OpType>) {
-      return op.Slice(starts, ends, strides);
-    } else {
-      return detail::SliceOp<OpType, OpType::Rank()>(op, starts, ends, detail::NoStride{});
-    }
-  }
-
  template <typename OpType>
  __MATX_INLINE__ auto slice( const OpType &op,
                              const index_t (&starts)[OpType::Rank()],
@@ -316,7 +250,10 @@
                              const cuda::std::array<index_t, OpType::Rank()> &starts,
                              const cuda::std::array<index_t, OpType::Rank()> &ends)
  {
-    return slice(op, starts, ends, detail::NoStride{});
+    cuda::std::array<index_t, OpType::Rank()> strides;
+    strides.fill(1);
+
+    return slice(op, starts, ends, strides);
  }
  template <typename OpType>
  __MATX_INLINE__ auto slice( const OpType &op,
@@ -354,24 +291,10 @@
    if constexpr (is_tensor_view_v<OpType>) {
      return op.template Slice<N>(starts, ends, strides);
    } else {
-      return detail::SliceOp<OpType, N, cuda::std::array<index_t, OpType::Rank()>>(op, starts, ends, strides);
-    }
-  }
-
-  template <int N, typename OpType>
-  __MATX_INLINE__ auto slice( const OpType op,
-                              const cuda::std::array<index_t, OpType::Rank()> &starts,
-                              const cuda::std::array<index_t, OpType::Rank()> &ends,
-                              detail::NoStride no_stride)
-  {
-    if constexpr (is_tensor_view_v<OpType>) {
-      return op.template Slice<N>(starts, ends);
-    } else {
-      return detail::SliceOp<OpType, N>(op, starts, ends, detail::NoStride{});
+      return detail::SliceOp<OpType, N>(op, starts, ends, strides);
    }
  }
 
-
  template <int N, typename OpType>
  __MATX_INLINE__ auto slice( const OpType op,
                              const index_t (&starts)[OpType::Rank()],
@@ -405,7 +328,9 @@
                              const cuda::std::array<index_t, OpType::Rank()> &starts,
                              const cuda::std::array<index_t, OpType::Rank()> &ends)
  {
-    return slice<N>(opIn, starts, ends, detail::NoStride{});
+    cuda::std::array<index_t, OpType::Rank()> strides;
+    strides.fill(1);
+    return slice<N>(opIn, starts, ends, strides);
  }
 
  template <int N, typename OpType>
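
Usage sketch (not part of the diff): with detail::NoStride removed, every slice call flows through the array-based stride path. The two-argument overloads synthesize the stride array themselves: the slice() operator fills it with 1s, while the tensor_t::Slice() wrapper passes {-1}, which Slice() detects via def_stride. The tensor shape and slice bounds below are illustrative assumptions; make_tensor(), slice(), matxEnd, and matxDropDim are the existing MatX API.

    #include "matx.h"

    int main() {
      auto t = matx::make_tensor<float>({10, 10});

      // Two-argument form: strides are implicitly 1 in every dimension.
      auto a = matx::slice(t, {2, 0}, {8, matx::matxEnd});            // 6 x 10 view

      // Explicit strides: every row in dim 0, every second column in dim 1.
      // Sizes are rounded up, so dim 1 becomes ceil(10 / 2) = 5.
      auto b = matx::slice(t, {2, 0}, {8, matx::matxEnd}, {1, 2});    // 6 x 5 view

      // Rank-reducing form: drop dimension 0, keeping the row at index 3.
      auto c = matx::slice<1>(t, {3, 0}, {matx::matxDropDim, matx::matxEnd});

      return 0;
    }

Note that the {-1} initializer only sets element 0 of the stride array (the remaining elements are value-initialized to zero), which is why Slice() keys def_stride off strides[0] alone rather than inspecting every dimension.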