cwindolf
diff --git a/src/dartsort/peel/matching.py b/src/dartsort/peel/matching.py
Lines changed: 13 additions & 4 deletions
diff --git a/src/dartsort/peel/peel_base.py b/src/dartsort/peel/peel_base.py
Lines changed: 4 additions & 2 deletions
diff --git a/src/dartsort/templates/get_templates.py b/src/dartsort/templates/get_templates.py
Lines changed: 21 additions & 12 deletions
diff --git a/src/dartsort/templates/pairwise.py b/src/dartsort/templates/pairwise.py
Lines changed: 2 additions & 1 deletion
diff --git a/src/dartsort/templates/pairwise_util.py b/src/dartsort/templates/pairwise_util.py
Lines changed: 26 additions & 17 deletions
@@ -218,6 +218,10 @@ def build_template_data(
         temporal_components = low_rank_templates.temporal_components.astype(dtype)
         singular_values = low_rank_templates.singular_values.astype(dtype)
         spatial_components = low_rank_templates.spatial_components.astype(dtype)
+        print(f"{template_data.templates.dtype=}")
+        print(f"{temporal_components.dtype=}")
+        print(f"{singular_values.dtype=}")
+        print(f"{spatial_components.dtype=}")
         self.register_buffer("temporal_components", torch.tensor(temporal_components))
         self.register_buffer("singular_values", torch.tensor(singular_values))
         self.register_buffer("spatial_components", torch.tensor(spatial_components))
@@ -236,16 +240,20 @@ def build_template_data(
         chunk_centers_s = self.recording._recording_segments[0].sample_index_to_time(
             chunk_centers_samples
         )
+        print(f"build_template_data {device=}")
+        print(f"{chunk_centers_s.shape=} {chunk_centers_s[:10]=}")
         self.pairwise_conv_db = CompressedPairwiseConv.from_template_data(
             save_folder / "pconv.h5",
             template_data=template_data,
             low_rank_templates=low_rank_templates,
             compressed_upsampled_temporal=compressed_upsampled_temporal,
             chunk_time_centers_s=chunk_centers_s,
-            motion_est=motion_est,
+            motion_est=self.motion_est,
             geom=self.geom,
             conv_ignore_threshold=self.conv_ignore_threshold,
             coarse_approx_error_threshold=self.coarse_approx_error_threshold,
+            device=device,
+            n_jobs=n_jobs,
         )
 
         self.fixed_output_data += [
@@ -258,7 +266,7 @@ def build_template_data(
             ),
             (
                 "compressed_upsampled_temporal",
-                compressed_upsampled_temporal.compressed_upsampled_temporal,
+                compressed_upsampled_temporal.compressed_upsampled_templates,
             ),
         ]
 
@@ -274,13 +282,14 @@ def handle_upsampling(
             ptps=ptps,
             max_upsample=temporal_upsampling_factor,
         )
+        print(f"{compressed_upsampled_temporal.compressed_upsampled_templates.dtype=}")
         self.register_buffer(
             "compressed_upsampling_map",
-            compressed_upsampled_temporal.compressed_upsampling_map,
+            torch.tensor(compressed_upsampled_temporal.compressed_upsampling_map),
         )
         self.register_buffer(
             "compressed_upsampled_temporal",
-            compressed_upsampled_temporal.compressed_upsampled_temporal,
+            torch.tensor(compressed_upsampled_temporal.compressed_upsampled_templates),
         )
         if temporal_upsampling_factor == 1:
             return compressed_upsampled_temporal
 
@@ -213,7 +213,7 @@ def peeling_needs_fit(self):
     def precompute_peeling_data(self, save_folder, n_jobs=0, device=None):
         # subclasses should override if they need to cache data for peeling
         # runs before fit_peeler_models()
-        assert not self.peeling_needs_fit()
+        pass
 
     def fit_peeler_models(self, save_folder):
         # subclasses should override if they need to fit models for peeling
@@ -324,7 +324,9 @@ def needs_fit(self):
     def fit_models(self, save_folder, n_jobs=0, device=None):
         with torch.no_grad():
             if self.peeling_needs_fit():
-                self.precompute_peeling_data()
+                self.precompute_peeling_data(
+                    save_folder=save_folder, n_jobs=n_jobs, device=device
+                )
                 self.fit_peeler_models(
                     save_folder=save_folder, n_jobs=n_jobs, device=device
                 )
 
@@ -181,6 +181,7 @@ def get_templates(
         snr_threshold=denoising_snr_threshold,
     )
     templates = weights * raw_templates + (1 - weights) * low_rank_templates
+    templates = templates.astype(recording.dtype)
 
     return dict(
         sorting=sorting,
@@ -379,13 +380,16 @@ def get_all_shifted_raw_and_low_rank_templates(
         registered_kdtree = KDTree(registered_geom)
 
     n_units = sorting.labels.max() + 1
-    raw_templates = np.zeros((n_units, spike_length_samples, n_template_channels))
+    raw_templates = np.zeros(
+        (n_units, spike_length_samples, n_template_channels), dtype=recording.dtype
+    )
     low_rank_templates = None
     if not raw:
         low_rank_templates = np.zeros(
-            (n_units, spike_length_samples, n_template_channels)
+            (n_units, spike_length_samples, n_template_channels),
+            dtype=recording.dtype,
         )
-    snrs_by_channel = np.zeros((n_units, n_template_channels))
+    snrs_by_channel = np.zeros((n_units, n_template_channels), dtype=recording.dtype)
 
     unit_id_chunks = [
         unit_ids[i : i + units_per_job] for i in range(0, n_units, units_per_job)
@@ -421,6 +425,8 @@ def get_all_shifted_raw_and_low_rank_templates(
                 unit="template",
             )
         for res in results:
+            if res is None:
+                continue
             units_chunk, raw_temps_chunk, low_rank_temps_chunk, snrs_chunk = res
             raw_templates[units_chunk] = raw_temps_chunk
             if not raw:
@@ -477,12 +483,14 @@ def __init__(
             dtype=torch.from_numpy(np.zeros(1, dtype=recording.dtype)).dtype,
         )
 
+        self.n_template_channels = self.n_channels
         if self.registered:
             self.geom = recording.get_channel_locations()
             self.match_distance = pdist(self.geom).min() / 2
             self.registered_geom = registered_kdtree.data
             self.registered_kdtree = registered_kdtree
             self.pitch_shifts = pitch_shifts
+            self.n_template_channels = len(self.registered_geom)
 
 
 _template_process_context = None
@@ -535,6 +543,8 @@ def _template_job(unit_ids):
     p = _template_process_context
 
     in_units_full = np.flatnonzero(np.isin(p.sorting.labels, unit_ids))
+    if not in_units_full.size:
+        return
     labels_full = p.sorting.labels[in_units_full]
 
     # only so many spikes per unit
@@ -564,7 +574,7 @@ def _template_job(unit_ids):
         (times >= p.trough_offset_samples) & (times < p.max_spike_time)
     )
     if not valid.size:
-        return uids, 0, 0, 0
+        return
     in_units = in_units[valid]
     labels = labels[valid]
     times = times[valid]
@@ -581,12 +591,12 @@ def _template_job(unit_ids):
     # compute raw templates and spike counts per channel
     raw_templates = []
     counts = []
+    units_chunk = []
     for u in uids:
         in_unit = np.flatnonzero(labels == u)
         if not in_unit.size:
-            raw_templates.append(np.zeros(1))
-            counts.append(0)
             continue
+        units_chunk.append(u)
         in_unit_orig = in_units[labels == u]
         if p.registered:
             raw_templates.append(
@@ -617,9 +627,10 @@ def _template_job(unit_ids):
             )
             counts.append(in_unit.size)
     snrs_by_chan = [ptp(rt, 0) * c for rt, c in zip(raw_templates, counts)]
+    raw_templates = np.array(raw_templates)
 
     if p.denoising_tsvd is None:
-        return uids, raw_templates, None, snrs_by_chan
+        return units_chunk, raw_templates, None, snrs_by_chan
 
     # apply denoising
     waveforms = waveforms.permute(0, 2, 1).reshape(n * c, t)
@@ -628,11 +639,8 @@ def _template_job(unit_ids):
 
     # get low rank templates
     low_rank_templates = []
-    for u in uids:
+    for u in units_chunk:
         in_unit = np.flatnonzero(labels == u)
-        if not in_unit.size:
-            low_rank_templates.append(0)
-            continue
         in_unit_orig = in_units[labels == u]
         if p.registered:
             low_rank_templates.append(
@@ -650,8 +658,9 @@ def _template_job(unit_ids):
             low_rank_templates.append(
                 p.reducer(waveforms[in_unit], axis=0).numpy(force=True)
             )
+    low_rank_templates = np.array(low_rank_templates)
 
-    return uids, raw_templates, low_rank_templates, snrs_by_chan
+    return units_chunk, raw_templates, low_rank_templates, snrs_by_chan
 
 
 class TorchSVDProjector(torch.nn.Module):
 
@@ -79,13 +79,14 @@ def from_template_data(
         geom: Optional[np.ndarray] = None,
         conv_ignore_threshold=0.0,
         coarse_approx_error_threshold=0.0,
-        conv_batch_size=128,
+        conv_batch_size=1024,
         units_batch_size=8,
         overwrite=False,
         device=None,
         n_jobs=0,
         show_progress=True,
     ):
+        print(f"pairwise from_template_data {device=}")
         compressed_convolve_to_h5(
             hdf5_filename,
             template_data=template_data,
 
@@ -28,7 +28,7 @@ def compressed_convolve_to_h5(
     geom: Optional[np.ndarray] = None,
     conv_ignore_threshold=0.0,
     coarse_approx_error_threshold=0.0,
-    conv_batch_size=128,
+    conv_batch_size=1024,
     units_batch_size=8,
     overwrite=False,
     device=None,
@@ -57,6 +57,7 @@ def compressed_convolve_to_h5(
     upsampled_shifted_template_index = get_upsampled_shifted_template_index(
         template_shift_index, compressed_upsampled_temporal
     )
+    print(f"compressed_convolve_to_h5 {conv_batch_size=} {units_batch_size=} {device=}")
 
     chunk_res_iterator = iterate_compressed_pairwise_convolutions(
         template_data=template_data,
@@ -148,7 +149,7 @@ def iterate_compressed_pairwise_convolutions(
     conv_ignore_threshold=0.0,
     coarse_approx_error_threshold=0.0,
     max_shift="full",
-    conv_batch_size=128,
+    conv_batch_size=1024,
     units_batch_size=8,
     device=None,
     n_jobs=0,
@@ -165,6 +166,7 @@ def iterate_compressed_pairwise_convolutions(
     process the results differently.
     """
     # construct drift-related helper data if needed
+    print(f"iterate_compressed_pairwise_convolutions {conv_batch_size=} {units_batch_size=} {device=}")
     n_shifts = template_shift_index.all_pitch_shifts.size
     do_shifting = n_shifts > 1
     geom_kdtree = reg_geom_kdtree = match_distance = None
@@ -267,7 +269,7 @@ def compressed_convolve_pairs(
     conv_ignore_threshold=0.0,
     coarse_approx_error_threshold=0.0,
     max_shift="full",
-    batch_size=128,
+    batch_size=1024,
     device=None,
 ) -> Optional[CompressedConvResult]:
     """Compute compressed pairwise convolutions between template pairs
@@ -280,9 +282,11 @@ def compressed_convolve_pairs(
     shifts, superres templates, and upsamples. Some of these may be zero or may
     be duplicates, so the return value is a sparse representation. See below.
     """
+    # print(f"compressed_convolve_pairs {device=}")
     # print(f"{units_a.shape=}")
     # print(f"{units_b.shape=}")
     # print(f"{(units_a.size * units_b.size)=}")
+    # print(f"compressed_convolve_pairs {batch_size=} {units_a.size=} {device=}")
 
     # what pairs, shifts, etc are we convolving?
     shifted_temp_ix_a, temp_ix_a, shift_a, unit_a = handle_shift_indices(
@@ -317,6 +321,9 @@ def compressed_convolve_pairs(
         match_distance=match_distance,
         device=device,
     )
+    # print(f"{low_rank_templates.spatial_components.dtype=} {low_rank_templates.singular_values.dtype=}")
+    # print(f"{compressed_upsampled_temporal.compressed_upsampled_templates.dtype=}")
+    # print(f"{spatial_singular_a.dtype=} {spatial_singular_b.dtype=}")
 
     # figure out pairs of shifted templates to convolve in a deduplicated way
     pairs_ret = shift_deduplicated_pairs(
@@ -392,27 +399,27 @@ def compressed_convolve_pairs(
     # print(f"{temporal_a[ix_a[conv_ix]].shape=}")
     # print(f"{conv_temporal_components_up_b.shape=}")
     pconv, kept = correlate_pairs_lowrank(
-        torch.as_tensor(spatial_singular_a[ix_a[conv_ix]]).to(device),
-        torch.as_tensor(spatial_singular_b[ix_b[conv_ix]]).to(device),
-        torch.as_tensor(temporal_a[ix_a[conv_ix]]).to(device),
-        torch.as_tensor(conv_temporal_components_up_b).to(device),
+        torch.as_tensor(spatial_singular_a[ix_a[conv_ix]], device=device),
+        torch.as_tensor(spatial_singular_b[ix_b[conv_ix]], device=device),
+        torch.as_tensor(temporal_a[ix_a[conv_ix]], device=device),
+        torch.as_tensor(conv_temporal_components_up_b, device=device),
         max_shift=max_shift,
         conv_ignore_threshold=conv_ignore_threshold,
         batch_size=batch_size,
     )
-    print(f"-----------")
-    print(f"after corr {pconv.shape=} {conv_ix[kept].shape=}")
+    # print(f"-----------")
+    # print(f"after corr {pconv.shape=} {conv_ix[kept].shape=}")
     conv_ix = conv_ix[kept]
     if not conv_ix.size:
         return None
     kept_pairs = np.flatnonzero(np.isin(compression_index, kept))
-    print(f"-----------")
-    print(f"kept {pconv.shape=} {conv_ix.shape=} {compression_index.shape=}")
-    print(f"{compression_index.min()=} {compression_index.max()=}")
-    print(f"{compression_index[kept_pairs].min()=} {compression_index[kept_pairs].max()=}")
-    print(f"{ix_a.shape=} {ix_b.shape=}")
-    print(f"{kept.shape=} {kept.dtype=} {kept.min()=} {kept.max()=}")
-    print(f"{kept_pairs.shape=} {kept_pairs.dtype=} {kept_pairs.min()=} {kept_pairs.max()=}")
+    # print(f"-----------")
+    # print(f"kept {pconv.shape=} {conv_ix.shape=} {compression_index.shape=}")
+    # print(f"{compression_index.min()=} {compression_index.max()=}")
+    # print(f"{compression_index[kept_pairs].min()=} {compression_index[kept_pairs].max()=}")
+    # print(f"{ix_a.shape=} {ix_b.shape=}")
+    # print(f"{kept.shape=} {kept.dtype=} {kept.min()=} {kept.max()=}")
+    # print(f"{kept_pairs.shape=} {kept_pairs.dtype=} {kept_pairs.min()=} {kept_pairs.max()=}")
     compression_index = np.searchsorted(kept, compression_index[kept_pairs])
     conv_ix = np.searchsorted(kept_pairs, conv_ix)
     ix_a = ix_a[kept_pairs]
@@ -472,7 +479,7 @@ def correlate_pairs_lowrank(
     temporal_b,
     max_shift="full",
     conv_ignore_threshold=0.0,
-    batch_size=128,
+    batch_size=1024,
 ):
     """Convolve pairs of low rank templates
 
@@ -504,6 +511,8 @@ def correlate_pairs_lowrank(
     assert n_pairs == n_pairs_
     assert t == t_
     assert rank == rank_
+    # print(f"{spatial_a.device=} {spatial_b.device=} {temporal_a.device=} {temporal_b.device=}")
+    # print(f"compressed_convolve_pairs {batch_size=} {n_pairs=} {spatial_a.device=}")
 
     if max_shift == "full":
         max_shift = t - 1