
Commit 8236157 ("Try at_shifts")
Parent: e225f30

File tree: 2 files changed, +106 -38 lines changed


src/dartsort/peel/matching.py

Lines changed: 31 additions & 18 deletions
@@ -426,7 +426,8 @@ def peel_chunk(
     def templates_at_time(self, t_s):
         """Handle drift -- grab the right spatial neighborhoods."""
         pconvdb = self.pairwise_conv_db
-        pitch_shifts_a=pitch_shifts_b=None
+        pitch_shifts_a = pitch_shifts_b = None
+        pconvdb.to(self.objective_spatial_components.device, pin=True)
         if self.is_drifting:
             pitch_shifts_b, cur_spatial = template_util.templates_at_time(
                 t_s,
@@ -464,17 +465,22 @@ def templates_at_time(self, t_s):
                 fill_value=0.0,
             )
             max_channels = cur_ampvecs[:, 0, :].argmax(1)
-            # pconvdb = pconvdb.at_shifts(pitch_shifts_a, pitch_shifts_b)
+            # pitch_shifts_a = torch.as_tensor(pitch_shifts_a)
+            # pitch_shifts_b = torch.as_tensor(pitch_shifts_b)
             pitch_shifts_a = torch.as_tensor(pitch_shifts_a, device=cur_obj_spatial.device)
             pitch_shifts_b = torch.as_tensor(pitch_shifts_b, device=cur_obj_spatial.device)
+            pconvdb = pconvdb.at_shifts(pitch_shifts_a, pitch_shifts_b)
+            # pitch_shifts_a = torch.as_tensor(pitch_shifts_a, device=cur_obj_spatial.device)
+            # pitch_shifts_b = torch.as_tensor(pitch_shifts_b, device=cur_obj_spatial.device)
         else:
             cur_spatial = self.spatial_components
             cur_obj_spatial = self.objective_spatial_components
             max_channels = self.registered_template_ampvecs.argmax(1)
 
         # if not pconvdb._is_torch:
-        # #     pconvdb.to("cpu")
-        pconvdb.to(cur_obj_spatial.device)
+        #     pconvdb.to("cpu")
+        # if cur_obj_spatial.device.type == "cuda" and not pconvdb.device.type == "cuda":
+        #     pconvdb.to(cur_obj_spatial.device, pin=True)
 
         return MatchingTemplateData(
             objective_spatial_components=cur_obj_spatial,
@@ -492,8 +498,10 @@ def templates_at_time(self, t_s):
             compressed_upsampled_temporal=self.compressed_upsampled_temporal,
             max_channels=torch.as_tensor(max_channels, device=cur_obj_spatial.device),
             pairwise_conv_db=pconvdb,
-            shifts_a=pitch_shifts_a,
-            shifts_b=pitch_shifts_b,
+            shifts_a=None,
+            shifts_b=None,
+            # shifts_a=pitch_shifts_a,
+            # shifts_b=pitch_shifts_b,
         )
 
     def match_chunk(
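
Note: the hunks above move shift handling from query time to chunk time. templates_at_time now subsets the pairwise convolution database once per chunk via at_shifts, so the MatchingTemplateData it returns carries shifts_a=None and shifts_b=None. A minimal sketch of the resulting flow (illustrative only; just the keyword arguments visible in this diff are shown):

    # once per chunk: build a shiftless view of the database
    db = pconvdb.at_shifts(pitch_shifts_a, pitch_shifts_b)

    # downstream queries then omit shifts entirely and take query()'s
    # shifts_a-is-None path, which reads column 0 of the shifted template
    # indices (see the pairwise.py hunk at old line 230)
    ix_a, ix_b, pconvs = db.query(
        template_indices_a=None,
        template_indices_b=template_indices_b,
        shifts_a=None,
        shifts_b=None,
    )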
@@ -560,20 +568,20 @@ def match_chunk(
 
         # subtract them
         # old_norm = torch.linalg.norm(residual) ** 2
-        compressed_template_data.subtract_conv(
-            padded_conv,
+        compressed_template_data.subtract(
+            residual_padded,
             new_peaks.times,
             new_peaks.template_indices,
             new_peaks.upsampling_indices,
             new_peaks.scalings,
-            conv_pad_len=self.obj_pad_len,
         )
-        compressed_template_data.subtract(
-            residual_padded,
+        compressed_template_data.subtract_conv(
+            padded_conv,
             new_peaks.times,
             new_peaks.template_indices,
             new_peaks.upsampling_indices,
             new_peaks.scalings,
+            conv_pad_len=self.obj_pad_len,
         )
 
         # new_norm = torch.linalg.norm(residual) ** 2
@@ -627,7 +635,7 @@ def find_peaks(
             alpha=2.0,
             out=padded_objective[:-1],
         )
-
+
         # first step: coarse peaks. not temporally upsampled or amplitude-scaled.
         objective = (padded_objective + refrac_mask)[
             :-1, self.obj_pad_len : -self.obj_pad_len
@@ -668,7 +676,7 @@ def find_peaks(
         )
         if time_shifts is not None:
             times += time_shifts
-
+
         return MatchingPeaks(
             n_spikes=times.numel(),
             times=times,
@@ -884,12 +892,17 @@ def fine_match(
         superres_ix = superres_index[objective_template_indices]
         dup_ix, column_ix = (superres_ix < self.n_templates).nonzero(as_tuple=True)
         template_indices = superres_ix[dup_ix, column_ix]
-        convs = torch.einsum(
-            "jtc,jrc,jtr->j",
-            snips[dup_ix],
-            self.spatial_singular[template_indices],
+        convs = torch.baddbmm(
             self.temporal_components[template_indices],
-        )
+            snips[dup_ix],
+            self.spatial_singular[template_indices].mT,
+        ).sum((1, 2))
+        # convs = torch.einsum(
+        #     "jtc,jrc,jtr->j",
+        #     snips[dup_ix],
+        #     self.spatial_singular[template_indices],
+        #     self.temporal_components[template_indices],
+        # )
         norms = self.template_norms_squared[template_indices]
         objs = torch.full(superres_ix.shape, -torch.inf, device=convs.device)
         objs[dup_ix, column_ix] = 2 * convs - norms
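
Note: torch.baddbmm(input, batch1, batch2) computes input + batch1 @ batch2, while the commented-out einsum contracts all three operands together. For reference, the "jtc,jrc,jtr->j" contraction factors into a batched matmul followed by an elementwise product and a sum; a self-contained check of that identity (sizes invented for illustration):

    import torch

    j, t, c, r = 4, 21, 7, 3  # batch, time, channels, rank
    snips = torch.randn(j, t, c)
    spatial = torch.randn(j, r, c)
    temporal = torch.randn(j, t, r)

    ref = torch.einsum("jtc,jrc,jtr->j", snips, spatial, temporal)
    # same contraction: batched matmul, elementwise product, then sum
    alt = (torch.bmm(snips, spatial.mT) * temporal).sum((1, 2))
    assert torch.allclose(ref, alt, atol=1e-4)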

src/dartsort/templates/pairwise.py

Lines changed: 75 additions & 20 deletions
@@ -52,20 +52,35 @@ class CompressedPairwiseConv:
     # the 0 index is special: pconv[0] === 0.
     pconv: np.ndarray
     in_memory: bool = False
+    device: torch.device = torch.device("cpu")
 
     def __post_init__(self):
         assert self.shifts_a.ndim == self.shifts_b.ndim == 1
         assert self.shifts_a.shape == (self.shifted_template_index_a.shape[1],)
-        assert self.shifts_b.shape == (self.upsampled_shifted_template_index_b.shape[1],)
+        assert self.shifts_b.shape == (
+            self.upsampled_shifted_template_index_b.shape[1],
+        )
+
+        self.a_shift_offset, self.offset_shift_a_to_ix = _get_shift_indexer(
+            self.shifts_a
+        )
+        self.b_shift_offset, self.offset_shift_b_to_ix = _get_shift_indexer(
+            self.shifts_b
+        )
+
+    def get_shift_ix_a(self, shifts_a):
+        return self.offset_shift_a_to_ix[shifts_a.to(int) + self.a_shift_offset]
+
+    def get_shift_ix_b(self, shifts_b):
+        return self.offset_shift_b_to_ix[shifts_b.to(int) + self.b_shift_offset]
 
     @classmethod
     def from_h5(cls, hdf5_filename, in_memory=True):
-        ff = [f for f in fields(cls) if not f.name == "in_memory"]
+        ff = [f for f in fields(cls) if f.name not in ("in_memory", "device")]
         if in_memory:
             with h5py.File(hdf5_filename, "r") as h5:
                 data = {f.name: torch.from_numpy(h5[f.name][:]) for f in ff}
             return cls(**data, in_memory=in_memory)
-
         _h5 = h5py.File(hdf5_filename, "r")
         data = {}
         for f in ff:
@@ -117,7 +132,7 @@ def from_template_data(
         )
         return cls.from_h5(hdf5_filename)
 
-    def at_shifts(self, shifts_a=None, shifts_b=None):
+    def at_shifts(self, shifts_a=None, shifts_b=None, device=None):
         """Subset this database to one set of shifts.
 
         The database becomes shiftless (not in the pejorative sense).
@@ -133,8 +148,8 @@ def at_shifts(self, shifts_a=None, shifts_b=None):
         n_shifted_temps_a, n_up_shifted_temps_b = self.pconv_index.shape
 
         # active shifted and upsampled indices
-        shift_ix_a = torch.searchsorted(self.shifts_a, shifts_a)
-        shift_ix_b = torch.searchsorted(self.shifts_b, shifts_b)
+        shift_ix_a = self.get_shift_ix_a(shifts_a)
+        shift_ix_b = self.get_shift_ix_b(shifts_b)
         sub_shifted_temp_index_a = self.shifted_template_index_a[
             torch.arange(len(self.shifted_template_index_a))[:, None],
             shift_ix_a[:, None],
@@ -166,6 +181,8 @@ def at_shifts(self, shifts_a=None, shifts_b=None):
             sub_pconv = self.pconv[sub_pconv_indices.to(self.pconv.device)]
         else:
             sub_pconv = torch.from_numpy(batched_h5_read(self.pconv, sub_pconv_indices))
+        if device is not None:
+            sub_pconv = sub_pconv.to(device)
 
         # reindexing
         n_sub_shifted_temps_a = len(shifted_temp_ixs_a)
@@ -184,17 +201,30 @@
             pconv_index=sub_pconv_index,
             pconv=sub_pconv,
             in_memory=True,
+            device=self.device,
         )
 
-    def to(self, device=None, incl_pconv=False):
+    def to(self, device=None, incl_pconv=False, pin=False):
         """Become torch tensors on device."""
-        for f in fields(self):
-            if f.name == "pconv":
+        print(f"to {device=}")
+        for name in ["offset_shift_a_to_ix", "offset_shift_b_to_ix"] + [
+            f.name for f in fields(self)
+        ]:
+            if name == "pconv" and not incl_pconv:
                 continue
-            v = getattr(self, f.name)
+            v = getattr(self, name)
             if isinstance(v, np.ndarray) or torch.is_tensor(v):
-                setattr(self, f.name, torch.as_tensor(v, device=device))
+                setattr(self, name, torch.as_tensor(v, device=device))
         self.device = device
+        if pin and self.device.type == "cuda" and torch.cuda.is_available() and not self.pconv.is_pinned():
+            # self.pconv.share_memory_()
+            print("pin")
+            torch.cuda.cudart().cudaHostRegister(
+                self.pconv.data_ptr(), self.pconv.numel() * self.pconv.element_size(), 0
+            )
+            # assert x.is_shared()
+            assert self.pconv.is_pinned()
+            # self.pconv = self.pconv.pin_memory()
         return self
 
     def query(
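
Note: the new pin=True path page-locks the existing pconv allocation in place with cudaHostRegister, rather than pin_memory(), which would allocate and copy a pinned duplicate; pinned host memory is what allows later host-to-device copies to run asynchronously. A standalone sketch of the same technique (requires a CUDA runtime; illustrative only):

    import torch

    x = torch.empty(1 << 20)  # a large CPU tensor we would rather not copy
    if torch.cuda.is_available() and not x.is_pinned():
        # register the existing allocation as page-locked host memory
        torch.cuda.cudart().cudaHostRegister(
            x.data_ptr(), x.numel() * x.element_size(), 0
        )
        assert x.is_pinned()
        # now x.to("cuda", non_blocking=True) can overlap with compute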
@@ -211,9 +241,9 @@
         device=None,
     ):
         if template_indices_a is None:
-                template_indices_a = torch.arange(
-                    len(self.shifted_template_index_a), device=self.device
-                )
+            template_indices_a = torch.arange(
+                len(self.shifted_template_index_a), device=self.device
+            )
         template_indices_a = torch.atleast_1d(template_indices_a)
         template_indices_b = torch.atleast_1d(template_indices_b)

@@ -230,8 +260,8 @@
             shifted_template_index = shifted_template_index[:, 0]
             upsampled_shifted_template_index = upsampled_shifted_template_index[:, 0]
         else:
-            shift_indices_a = torch.searchsorted(self.shifts_a, shifts_a)
-            shift_indices_b = torch.searchsorted(self.shifts_b, shifts_b)
+            shift_indices_a = self.get_shift_ix_a(shifts_a)
+            shift_indices_b = self.get_shift_ix_a(shifts_b)
         a_ix = (template_indices_a, shift_indices_a)
         b_ix = (template_indices_b, shift_indices_b)

@@ -250,6 +280,9 @@
         up_shifted_temp_ix_b = upsampled_shifted_template_index[b_ix]
 
         # return convolutions between all ai,bj or just ai,bi?
+        print(f"{shifted_temp_ix_a.device=} {up_shifted_temp_ix_b.device=}")
+        print(f"{self.device=} {self.shifts_a.device=}")
+        print(f"{template_indices_a.device=} {template_indices_b.device=}")
         if grid:
             pconv_indices = self.pconv_index[
                 shifted_temp_ix_a[:, None], up_shifted_temp_ix_b[None, :]
@@ -258,9 +291,13 @@
                 template_indices_a, template_indices_b
             ).T
             if scalings_b is not None:
-                scalings_b = torch.broadcast_to(scalings_b[None], pconv_indices.shape).reshape(-1)
+                scalings_b = torch.broadcast_to(
+                    scalings_b[None], pconv_indices.shape
+                ).reshape(-1)
             if times_b is not None:
-                times_b = torch.broadcast_to(times_b[None], pconv_indices.shape).reshape(-1)
+                times_b = torch.broadcast_to(
+                    times_b[None], pconv_indices.shape
+                ).reshape(-1)
             pconv_indices = pconv_indices.view(-1)
         else:
             pconv_indices = self.pconv_index[shifted_temp_ix_a, up_shifted_temp_ix_b]
@@ -279,7 +316,9 @@
         if self.in_memory:
            pconvs = self.pconv[pconv_indices.to(self.pconv.device)]
         else:
-            pconvs = torch.from_numpy(batched_h5_read(self.pconv, pconv_indices.numpy(force=True)))
+            pconvs = torch.from_numpy(
+                batched_h5_read(self.pconv, pconv_indices.numpy(force=True))
+            )
         if device is not None:
             pconvs = pconvs.to(device)

@@ -291,6 +330,7 @@
 
         return template_indices_a, template_indices_b, pconvs
 
+
 def batched_h5_read(dataset, indices, batch_size=1000):
     if indices.size < batch_size:
         return dataset[indices]
@@ -299,4 +339,19 @@
     for bs in range(0, indices.size, batch_size):
         be = min(indices.size, bs + batch_size)
         out[bs:be] = dataset[indices[bs:be]]
-    return out
+    return out
+
+
+def _get_shift_indexer(shifts):
+    assert torch.equal(shifts, torch.sort(shifts).values)
+    shift_offset = -int(shifts[0])
+    offset_shift_to_ix = []
+    for j, shift in enumerate(shifts):
+        ix = shift + shift_offset
+        assert len(offset_shift_to_ix) <= ix
+        assert 0 <= ix < len(shifts)
+        while len(offset_shift_to_ix) < ix:
+            offset_shift_to_ix.append(len(shifts))
+        offset_shift_to_ix.append(j)
+    offset_shift_to_ix = torch.tensor(offset_shift_to_ix, device=shifts.device)
+    return shift_offset, offset_shift_to_ix
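
Note: _get_shift_indexer trades torch.searchsorted for a dense lookup table indexed by shift + offset; __post_init__ builds one table per shift axis, and get_shift_ix_a / get_shift_ix_b do the constant-time lookups. A small usage sketch (values invented; the asserts in _get_shift_indexer require the sorted shifts to cover a contiguous integer range):

    import torch

    shifts = torch.tensor([-1, 0, 1])
    offset, lut = _get_shift_indexer(shifts)
    queries = torch.tensor([1, -1, 0])
    # lut[shift + offset] recovers each queried shift's column index...
    print(lut[queries + offset])  # tensor([2, 0, 1])
    # ...agreeing with the searchsorted call it replaces
    assert torch.equal(lut[queries + offset], torch.searchsorted(shifts, queries))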
