|
21 | 21 | from . import _eval_mode |
22 | 22 | from . import _invocation |
23 | 23 | import copy |
| 24 | +import nvtx |
24 | 25 |
|
25 | 26 |
|
26 | 27 | class BatchedSlice: |
@@ -154,21 +155,26 @@ def _is_external(self) -> bool: |
154 | 155 | return self._wraps_external_data |
155 | 156 |
|
@staticmethod
def broadcast(sample, batch_size: int, device: Optional[Device] = None) -> "Batch":
    """Build a Batch containing ``batch_size`` replicas of ``sample``.

    Parameters
    ----------
    sample
        The value to replicate. Must not already be a ``Batch``. If it is a
        tensor-like object (per ``_is_tensor_type``) it is wrapped in a
        ``Tensor``; otherwise it is converted via ``numpy.array``.
    batch_size : int
        Number of replicas in the resulting batch.
    device : Optional[Device]
        Target device forwarded to ``Tensor`` / ``Batch`` construction.

    Returns
    -------
    Batch
        A batch of ``batch_size`` identical samples.

    Raises
    ------
    ValueError
        If ``sample`` is already a ``Batch``.
    """
    if isinstance(sample, Batch):
        raise ValueError("Cannot broadcast a Batch")

    # Tensor-like samples are wrapped once and the wrapper is repeated.
    if _is_tensor_type(sample):
        wrapped = Tensor(sample, device=device)
        return Batch([wrapped] * batch_size)

    import numpy as np

    with nvtx.annotate("to numpy and stack", domain="batch"):
        converted = np.array(sample)
        # Narrow 64-bit dtypes to their 32-bit counterparts before handing
        # the data to the backend.
        narrowing = {
            np.dtype(np.float64): np.float32,
            np.dtype(np.int64): np.int32,
            np.dtype(np.uint64): np.uint32,
        }
        target = narrowing.get(converted.dtype)
        if target is not None:
            converted = converted.astype(target)
        # Prepend a batch axis and replicate along it.
        stacked = np.repeat(converted[np.newaxis], batch_size, axis=0)

    with nvtx.annotate("to backend", domain="batch"):
        tensor_list = _backend.TensorListCPU(stacked)
    with nvtx.annotate("create batch", domain="batch"):
        return Batch(tensor_list, device=device)
172 | 178 |
|
173 | 179 | @property |
174 | 180 | def dtype(self) -> DType: |
|
0 commit comments