NVIDIA
diff --git a/docs_input/api/manipulation/basic/slice.rst b/docs_input/api/manipulation/basic/slice.rst
Lines changed: 2 additions & 2 deletions
diff --git a/docs_input/notebooks/exercises/example4_cfar.cu b/docs_input/notebooks/exercises/example4_cfar.cu
Lines changed: 1 addition & 1 deletion
diff --git a/docs_input/notebooks/exercises/example4_doppler.cu b/docs_input/notebooks/exercises/example4_doppler.cu
Lines changed: 1 addition & 1 deletion
diff --git a/docs_input/notebooks/exercises/example4_init.cu b/docs_input/notebooks/exercises/example4_init.cu
Lines changed: 2 additions & 2 deletions
diff --git a/docs_input/notebooks/exercises/example4_pc.cu b/docs_input/notebooks/exercises/example4_pc.cu
Lines changed: 2 additions & 2 deletions
diff --git a/docs_input/notebooks/exercises/example4_tpc.cu b/docs_input/notebooks/exercises/example4_tpc.cu
Lines changed: 2 additions & 2 deletions
diff --git a/docs_input/notebooks/exercises/solutions/example1_assignment1.cu b/docs_input/notebooks/exercises/solutions/example1_assignment1.cu
Lines changed: 1 addition & 1 deletion
diff --git a/examples/fft_conv.cu b/examples/fft_conv.cu
Lines changed: 1 addition & 1 deletion
diff --git a/examples/mvdr_beamformer.h b/examples/mvdr_beamformer.h
Lines changed: 1 addition & 1 deletion
diff --git a/examples/resample.cu b/examples/resample.cu
Lines changed: 2 additions & 2 deletions
diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h
Lines changed: 7 additions & 7 deletions
diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h
Lines changed: 18 additions & 14 deletions
diff --git a/include/matx/core/type_utils.h b/include/matx/core/type_utils.h
Lines changed: 1 addition & 0 deletions
diff --git a/include/matx/operators/dct.h b/include/matx/operators/dct.h
Lines changed: 1 addition & 1 deletion
@@ -9,8 +9,8 @@ to indicate starting at the end and going backward.
 
 When slicing along any given tensor dimension, the start index is treated as inclusive, and the end index as exclusive.
 
-.. doxygenfunction:: slice(const OpType opIn, const index_t (&starts)[OpType::Rank()], const index_t (&ends)[OpType::Rank()])
-.. doxygenfunction:: slice(const OpType op, const index_t (&starts)[OpType::Rank()], const index_t (&ends)[OpType::Rank()], const index_t (&strides)[OpType::Rank()])
+.. doxygenfunction:: slice(const OpType &op, const index_t (&starts)[OpType::Rank()], const index_t (&ends)[OpType::Rank()], const index_t (&strides)[OpType::Rank()])
+.. doxygenfunction:: slice(const OpType &op, const index_t (&starts)[OpType::Rank()], const index_t (&ends)[OpType::Rank()])
 
 Examples
 ~~~~~~~~
 
@@ -53,7 +53,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
     radar.CFARDetections();
 
     printf("FFT output:\n");
-    print(radar.GetTPCData()->View().Slice<1>({0, 0, 0}, {matxSliceDim, matxSliceDim, 16}));
+    print(slice<1>(radar.GetTPCData()->View(), {0, 0, 0}, {matxSliceDim, matxSliceDim, 16}));
 
     cudaStreamDestroy(stream);
 
 
@@ -53,7 +53,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
     radar.DopplerProcessing();
 
     printf("Doppler output:\n");
-    radar.GetTPCView().Slice<1>({0, 0, 0}, {matxSliceDim, matxSliceDim, 16}).rint();
+    print(slice<1>(radar.GetTPCView(), {0, 0, 0}, {matxSliceDim, matxSliceDim, 16}));
 
     cudaStreamDestroy(stream);
 
 
@@ -49,8 +49,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
     cudaEventCreate(&stop);    
 
     auto radar = RadarPipeline(numPulses, numSamples, waveformLength, numChannels, stream);
-    auto rv = radar.GetNormT().Slice<1>({0, 0, 0}, {matxSliceDim, matxSliceDim, 16});
-    rv.print();
+    auto rv = slice<1>(radar.GetNormT(), {0, 0, 0}, {matxSliceDim, matxSliceDim, 16});
+    print(rv);
     cudaStreamDestroy(stream);
 
     return 0;
 
@@ -55,8 +55,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
 
     radar.PulseCompression();
 
-    auto rv = radar.GetInputView().Slice<1>({0, 0, 0}, {matxSliceDim, matxSliceDim, 16});
-    rv.print();
+    auto rv = slice<1>(radar.GetInputView(), {0, 0, 0}, {matxSliceDim, matxSliceDim, 16});
+    print(rv);
     cudaStreamDestroy(stream);
 
     return 0;
 
@@ -53,9 +53,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
     radar.ThreePulseCanceller();
 
     printf("x input:\n");
-    radar.GetInputView().Slice<1>({0, 0, 0}, {matxSliceDim, matxSliceDim, 16}).Print();
+    print(slice<1>(radar.GetInputView(), {0, 0, 0}, {matxSliceDim, matxSliceDim, 16}));
     printf("Convolution output:\n");
-    radar.GetTPCView()->Slice<1>({0,0,0}, {matxSliceDim, matxSliceDim, 10}).Print();     
+    print(slice<1>(radar.GetTPCView(), {0,0,0}, {matxSliceDim, matxSliceDim, 10}));     
     cudaStreamDestroy(stream);
 
     return 0;
 
@@ -67,7 +67,7 @@ int main() {
    * Get a slice of the second and third rows with all columns
    * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#slicing-and-dicing
    *****************************************************************************************************/
-  auto t2s = t2.Slice({1, 0}, {3, matxEnd}); // Put code here
+  auto t2s = slice(t2, {1, 0}, {3, matxEnd}); // Put code here
   /*** End editing ***/
 
   // Verify slice is correct
 
@@ -149,7 +149,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   // Now the sig_freq view contains the full convolution result. Verify against
   // a direct convolution. The conv1d function only accepts a 1D filter, so we
   // create a sliced view here.
-  auto filt1 = filt_time.Slice<1>({0,0}, {matxDropDim, matxEnd});
+  auto filt1 = slice<1>(filt_time, {0,0}, {matxDropDim, matxEnd});
   (time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec);
 
   exec.sync();
 
@@ -108,7 +108,7 @@ class MVDRBeamformer {
 
     (cbfView = matmul(vhView, inVecView)).run(exec);
 
-    matx::copy(ivsView, inVecView.Slice({0, 0}, {matxEnd, snap_len_}), stream);
+    matx::copy(ivsView, slice(inVecView, {0, 0}, {matxEnd, snap_len_}), stream);
 
     (ivshView = hermitianT(ivsView)).run(exec);
 
 
@@ -69,7 +69,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   (sigViewComplex = fft(sigView)).run(exec);
 
   // Slice
-  auto sliceView = sigViewComplex.Slice({0}, {nyq});
+  auto sliceView = slice(sigViewComplex, {0}, {nyq});
 
   // Inverse Transform - FFT size based on output
   (resampView = ifft(sliceView)).run(exec);
@@ -81,7 +81,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
     (sigViewComplex = fft(sigView)).run(exec);
 
     // Slice
-    auto sv = sigViewComplex.Slice({0}, {nyq});
+    auto sv = slice(sigViewComplex, {0}, {nyq});
 
     // Inverse Transform - FFT size based on output
     (resampView = ifft(sv)).run(exec);
 
@@ -237,11 +237,11 @@ class RadarPipeline {
   void PulseCompression()
   {
     // reshape waveform to be waveformLength
-    auto waveformPart = waveformView.Slice({0}, {waveformLength});
+    auto waveformPart = slice(waveformView, {0}, {waveformLength});
     auto waveformT =
         waveformView.template Clone<3>({numChannels, numPulses, matxKeepDim});
 
-    auto waveformFull = waveformView.Slice({0}, {numSamplesRnd});
+    auto waveformFull = slice(waveformView, {0}, {numSamplesRnd});
 
     auto x = inputView;
 
@@ -285,9 +285,9 @@ class RadarPipeline {
    */
   void ThreePulseCanceller()
   {
-    auto x = inputView.Permute({0, 2, 1}).Slice(
+    auto x = slice(inputView.Permute({0, 2, 1}), 
         {0, 0, 0}, {numChannels, numCompressedSamples, numPulses});
-    auto xo = tpcView.Permute({0, 2, 1}).Slice(
+    auto xo = slice(tpcView.Permute({0, 2, 1}), 
         {0, 0, 0}, {numChannels, numCompressedSamples, numPulses});
     (xo = conv1d(x, cancelMask, matxConvCorrMode_t::MATX_C_MODE_SAME)).run(exec);
   }
@@ -311,7 +311,7 @@ class RadarPipeline {
     const index_t cpulses = numPulses - (cancelMask.Size(0) - 1);
 
     auto xc =
-        tpcView.Slice({0, 0, 0}, {numChannels, cpulses, numCompressedSamples});
+        slice(tpcView, {0, 0, 0}, {numChannels, cpulses, numCompressedSamples});
 
     auto xf = tpcView.Permute({0, 2, 1});
 
@@ -368,11 +368,11 @@ class RadarPipeline {
     // This can be done with a convolution of the cfarMask with
     // ones.
     // norm = conv2(ones(size(X)), mask, 'same');
-    auto normTrim = normT.Slice({0, cfarMaskY / 2, cfarMaskX / 2},
+    auto normTrim = slice(normT, {0, cfarMaskY / 2, cfarMaskX / 2},
                                  {numChannels, numPulsesRnd + cfarMaskY / 2,
                                   numCompressedSamples + cfarMaskX / 2});
 
-    auto baTrim = ba.Slice({0, cfarMaskY / 2, cfarMaskX / 2},
+    auto baTrim = slice(ba, {0, cfarMaskY / 2, cfarMaskX / 2},
                             {numChannels, numPulsesRnd + cfarMaskY / 2,
                              numCompressedSamples + cfarMaskX / 2});
     (baTrim = baTrim / normTrim).run(exec);
 
@@ -1429,6 +1429,8 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
    * more dimensions of a tensor. This includes completely dropping an unwanted
    * dimension, or simply taking a piece of a wanted dimension. Slice() is very
    * similar to indexing operations in both Python and MATLAB.
+   * 
+   * *NOTE* Users should not call Slice() directly anymore. Use the slice() operator instead.
    *
    * @param firsts
    *   List of starting index into each dimension. Indexing is 0-based
@@ -1451,10 +1453,10 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
    * @returns Sliced view of tensor
    *
    */
-  template <int N = RANK>
+  template <int N = RANK, typename StrideType>
   __MATX_INLINE__ auto Slice([[maybe_unused]] const cuda::std::array<typename Desc::shape_type, RANK> &firsts,
-                             [[maybe_unused]] const cuda::std::array<typename Desc::shape_type, RANK> &ends,
-                             [[maybe_unused]] const cuda::std::array<typename Desc::stride_type, RANK> &strides) const
+                            [[maybe_unused]] const cuda::std::array<typename Desc::shape_type, RANK> &ends,
+                            [[maybe_unused]] StrideType strides) const
   {
     static_assert(N <= RANK && RANK > 0, "Must slice to a rank the same or less than current rank.");
 
@@ -1465,7 +1467,6 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
 
     T *data = this->ldata_;
     int d = 0;
-    bool def_stride = (strides[0] == -1);
 
     [[maybe_unused]] int end_count = 0;
     for (int i = 0; i < RANK; i++) {
@@ -1487,9 +1488,14 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
 
       MATX_ASSERT_STR(first < end, matxInvalidSize, "Slice must be at least one element long");
 
-      [[maybe_unused]] typename Desc::stride_type stride_mult = (def_stride || strides[i] == matxKeepStride)
-                                ? 1
-                                : strides[i]; // custom stride
+      [[maybe_unused]] typename Desc::stride_type stride_mult;
+      
+      if constexpr (std::is_same_v<StrideType, detail::NoStride>) {
+        stride_mult = 1;
+      }
+      else {
+        stride_mult = (strides[i] == matxKeepStride) ? 1 : strides[i];
+      }
 
       MATX_ASSERT_STR(first < end, matxInvalidParameter,
                       "Starting slice must be less than end slice");
@@ -1526,10 +1532,10 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
     return tensor_t<T, N, Storage, decltype(new_desc)>{storage_, std::move(new_desc), data};
   }
 
-  template <int N = RANK>
+  template <typename StrideType, int N = RANK>
   __MATX_INLINE__ auto Slice(const typename Desc::shape_type (&firsts)[RANK],
-                             const typename Desc::shape_type (&ends)[RANK],
-                             const typename Desc::stride_type (&strides)[RANK]) const
+                            const typename Desc::shape_type (&ends)[RANK],
+                            StrideType strides) const
   {
     return Slice<N>(detail::to_array(firsts), detail::to_array(ends), detail::to_array(strides));
   }
@@ -1560,15 +1566,13 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
    */
   template <int N = RANK>
   __MATX_INLINE__ auto Slice(const cuda::std::array<typename Desc::shape_type, RANK> &firsts,
-                             const cuda::std::array<typename Desc::shape_type, RANK> &ends) const
+                            const cuda::std::array<typename Desc::shape_type, RANK> &ends) const
   {
     static_assert(N <= RANK && RANK > 0, "Must slice to a rank the same or less than current rank.");
 
     MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
 
-    const cuda::std::array<typename Desc::stride_type, RANK> strides = {-1};
-
-    return Slice<N>(firsts, ends, strides);
+    return Slice<N, detail::NoStride>(firsts, ends, detail::NoStride{});
   }
 
   template <int N = RANK>
 
@@ -66,6 +66,7 @@ enum class MemoryLayout {
 namespace detail {
 struct NoShape{};
 struct EmptyOp{};
+struct NoStride{};
 
 template <typename T>
 struct is_noshape : std::integral_constant<bool, std::is_same_v<NoShape, T>> {};
 
@@ -104,7 +104,7 @@ void dct(OutputTensor &out, const InputTensor &in,
   tensor_t<cuda::std::complex<typename OutputTensor::value_type>, 1> tmp{{N + 1}};
 
   fft_impl(tmp, in, 0, FFTNorm::BACKWARD, stream);
-  auto s = tmp.Slice({0}, {N});
+  auto s = slice(tmp, {0}, {N});
   detail::dctOp(out, s, N).run(stream);
 }
Original file line number	Diff line number	Diff line change
`@@ -104,7 +104,7 @@ void dct(OutputTensor &out, const InputTensor &in,`
`104`	`104`	`tensor_t<cuda::std::complex<typename OutputTensor::value_type>, 1> tmp{{N + 1}};`
`105`	`105`
`106`	`106`	`fft_impl(tmp, in, 0, FFTNorm::BACKWARD, stream);`
`107`		`- auto s = tmp.Slice({0}, {N});`
	`107`	`+ auto s = slice(tmp, {0}, {N});`
`108`	`108`	`detail::dctOp(out, s, N).run(stream);`
`109`	`109`	`}`
`110`	`110`