From d4d2597a5888d32ca7ae6e4e2a87994d7042a6ad Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Mon, 7 Feb 2022 16:46:03 +0100 Subject: [PATCH 01/18] Move mtscomp.py to a subfolder --- LICENSE.md | 4 ++-- mtscomp/__init__.py | 0 mtscomp.py => mtscomp/mtscomp.py | 0 setup.py | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) create mode 100644 mtscomp/__init__.py rename mtscomp.py => mtscomp/mtscomp.py (100%) diff --git a/LICENSE.md b/LICENSE.md index 59a24f6..87a59e7 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,4 +1,4 @@ -Copyright (c) 2019, Cyrille Rossant +Copyright (c) 2019-2022, Cyrille Rossant, International Brain Laboratory All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -9,4 +9,4 @@ Redistribution and use in source and binary forms, with or without modification, * Neither the name of mtscomp nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/mtscomp/__init__.py b/mtscomp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mtscomp.py b/mtscomp/mtscomp.py similarity index 100% rename from mtscomp.py rename to mtscomp/mtscomp.py diff --git a/setup.py b/setup.py index e8a1737..db82c8d 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ # Find version number from `__init__.py` without executing it. 
-filename = op.join(curdir, 'mtscomp.py') +filename = op.join(curdir, 'mtscomp', 'mtscomp.py') with open(filename, 'r') as f: version = re.search(r"__version__ = '([^']+)'", f.read()).group(1) From 43a4b79e4dc887906b791b9a198f8e48326d0a01 Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Mon, 7 Feb 2022 18:38:00 +0100 Subject: [PATCH 02/18] WIP: implementation of lossy compression --- mtscomp/lossy.py | 365 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 365 insertions(+) create mode 100644 mtscomp/lossy.py diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py new file mode 100644 index 0000000..6581b8d --- /dev/null +++ b/mtscomp/lossy.py @@ -0,0 +1,365 @@ +# -*- coding: utf-8 -*- + +"""SVD-based raw ephys data lossy compression.""" + + +#------------------------------------------------------------------------------ +# Imports +#------------------------------------------------------------------------------ + +import argparse +from functools import lru_cache +from itertools import islice +import logging +import os +import os.path as op +from pathlib import Path +import sys + +from tqdm import tqdm +import numpy as np +from numpy.lib.format import open_memmap + +from .mtscomp import Bunch, decompress, Reader + + +logger = logging.getLogger(('mtscomp')) + + +#------------------------------------------------------------------------------ +# Constants +#------------------------------------------------------------------------------ + +DOWNSAMPLE_FACTOR = 6 + +FILE_EXTENSION_LOSSY = '.lossy.npy' +FILE_EXTENSION_SVD = '.svd.npz' + + +#------------------------------------------------------------------------------ +# Util classes +#------------------------------------------------------------------------------ + +class SVD(Bunch): + def __init__( + self, U, sigma, rank=None, a=1, b=0, + sample_rate=None, downsample_factor=DOWNSAMPLE_FACTOR): + super(SVD, self).__init__() + self.U = U + self.n_channels = U.shape[0] + assert sigma.shape == (self.n_channels,) + self.Usigma_inv = np.linalg.inv(U @ np.diag(sigma)) + assert self.Usigma_inv.shape == self.U.shape + self.sigma = sigma + self.rank = rank + self.a = a + self.b = b + self.sample_rate = sample_rate + self.downsample_factor = downsample_factor + + def __repr__(self): + return f"" + + +#------------------------------------------------------------------------------ +# Util functions +#------------------------------------------------------------------------------ + +def _car(x): + assert x.ndim == 2 + # ns, nc + assert x.shape[0] > x.shape[1] + y = x.astype(np.float32) + y -= y.mean(axis=0) + return y + + +def _downsample(x, factor=1): + assert x.ndim == 2 + ns, nc = x.shape + # ns, nc + assert x.shape[0] > x.shape[1] + y = x.T.reshape((nc, (ns - ns % factor) // factor, factor)).mean(axis=2) + # nc, ns + return y + + +def _svd(x): + U, sigma, _ = np.linalg.svd(x, full_matrices=False) + return SVD(U, sigma) + + +def _uint8_coefs(x): + # k = .05 + # m, M = np.quantile(x, k), np.quantile(x, 1 - k) + m, M = x.min(), x.max() + a = 255 / (M - m) + b = m + return a, b + + +def to_uint8(x, ab=None): + a, b = ab if ab is not None else _uint8_coefs(x) + y = (x - b) * a + # x = y / a + b + # assert np.all((0 <= y) & (y <= 255)) + # return y.astype(np.uint8), (a, b) + return y, (a, b) + + +def from_uint8(y, ab): + a, b = ab + return y.astype(np.float32) / a + b + + +#------------------------------------------------------------------------------ +# Processing functions +#------------------------------------------------------------------------------ 
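+# Per chunk of raw data x of shape (n_samples, n_channels), the pipeline
+# implemented below is:
+#   1. _car(x): subtract each channel's mean along time
+#   2. _downsample(x): average non-overlapping groups of DOWNSAMPLE_FACTOR samples
+#   3. (Usigma_inv @ pp)[:rank, :]: keep the leading SVD components
+#   4. to_uint8: affine rescale y = (x - b) * a, one byte per value
+# Decompression inverts 4 and 3 up to quantization and rank truncation;
+# steps 1 and 2 are not undone.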
+ +def _preprocess(raw): + assert raw.shape[0] > raw.shape[1] + # raw is (ns, nc) + + pp = _car(raw) + # pp is (ns, nc) + assert pp.shape[0] > pp.shape[1] + + pp = _downsample(pp, factor=DOWNSAMPLE_FACTOR) + # pp is (nc, ns) + assert pp.shape[0] < pp.shape[1] + + return pp + + +def _get_excerpts(reader, kept_chunks=20): + assert reader + assert isinstance(reader, Reader) + assert kept_chunks >= 2 + + arrs = [] + n = 0 + n_chunks = reader.n_chunks + assert reader.shape[0] > reader.shape[1] + skip = n_chunks // kept_chunks + for chunk_idx, chunk_start, chunk_length in tqdm( + islice(reader.iter_chunks(), 0, n_chunks + 1, skip), + total=n_chunks // skip, desc="extracting excerpts from the raw data"): + + chunk = reader.read_chunk(chunk_idx, chunk_start, chunk_length) + # chunk is (ns, nc) + assert chunk.shape[0] > chunk.shape[1] + + pp = _preprocess(chunk) + # pp is (nc, ns) + assert chunk.shape[0] > chunk.shape[1] + + arrs.append(pp) + + excerpts = np.hstack(arrs) + # excerpts is (nc, ns) + assert excerpts.shape[0] < excerpts.shape[1] + assert excerpts.shape[0] == reader.n_channels + + return excerpts + + +def excerpt_svd(reader, rank, kept_chunks=20): + assert rank + excerpts = _get_excerpts(reader, kept_chunks=kept_chunks) + # excerpts is (nc, ns) + assert excerpts.shape[0] < excerpts.shape[1] + + # Compute the SVD of the excerpts. + svd = _svd(excerpts) + assert svd.U.shape[0] == reader.n_channels + + svd.sample_rate = reader.sample_rate + svd.downsample_factor = DOWNSAMPLE_FACTOR + svd.rank = min(rank, svd.n_channels) + svd.a, svd.b = _uint8_coefs(excerpts) + + assert svd + return svd + + +def _compress_chunk(raw, svd): + # raw is (ns, nc) + assert raw.shape[0] > raw.shape[1] + + pp = _preprocess(raw) + # pp is (nc, ns) + assert pp.shape[0] < pp.shape[1] + + rank = svd.rank + assert rank > 0 + assert svd.a != 0 + + lossy = (svd.Usigma_inv @ pp)[:rank, :] + # lossy is (nc, ns) + assert lossy.shape[0] < lossy.shape[1] + + lossy8, _ = to_uint8(lossy, (svd.a, svd.b)) + return lossy8 + + +def _decompress_chunk(lossy, svd, rank=None): + # lossy is (nc, ns) + assert lossy.shape[0] < lossy.shape[1] + + assert svd + assert isinstance(svd, SVD) + U, sigma = svd.U, svd.sigma + rank = rank or svd.rank + rank = min(rank, svd.rank) + rank = min(rank, svd.n_channels) + + lossy = from_uint8(lossy, (svd.a, svd.b)) + assert rank > 0 + + return (U[:, :rank] @ np.diag(sigma[:rank]) @ lossy[:rank, :]) + + +def compress_lossy( + path=None, cmeta=None, rank=None, max_chunks=0, downsampling_factor=None, + out_lossy=None, out_svd=None): + + # Check arguments. + assert rank, "The rank must be set" + assert path, "The raw ephys data file must be specified" + + if downsampling_factor is None: + downsampling_factor = DOWNSAMPLE_FACTOR + assert downsampling_factor >= 1 + + # Create a mtscomp Reader. + reader = decompress(path, cmeta=cmeta) + ns = reader.n_samples + nc = reader.n_channels + n_chunks = reader.n_chunks if max_chunks == 0 else max_chunks + assert n_chunks > 0 + assert rank <= nc, "The rank cannot exceed the number of channels" + + # Compute the SVD on an excerpt of the data. + svd = excerpt_svd(reader, rank) + + # Filenames. 
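+    # e.g. path "data/raw.cbin" yields out_lossy "data/raw.lossy.npy" and
+    # out_svd "data/raw.svd.npz" (with_suffix() replaces the ".cbin" part).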
+ if out_lossy is None: + out_lossy = Path(path).with_suffix(FILE_EXTENSION_LOSSY) + assert out_lossy + + if out_svd is None: + out_svd = Path(path).with_suffix(FILE_EXTENSION_SVD) + assert out_svd + + # Create a new memmapped npy file + if out_lossy.exists(): + raise IOError(f"File {out_lossy} already exists.") + shape = (ns // downsampling_factor, rank) + lossy = open_memmap(out_lossy, 'w+', dtype=np.uint8, shape=shape) + + # Compress the data. + offset = 0 + for chunk_idx, chunk_start, chunk_length in \ + tqdm(reader.iter_chunks(last_chunk=n_chunks - 1), total=n_chunks): + + # Decompress the chunk. + raw = reader.read_chunk(chunk_idx, chunk_start, chunk_length) + + # raw is (ns, nc) + assert raw.shape[0] > raw.shape[1] + nsc, _ = raw.shape + assert _ == nc + + # Process the chunk. + pp = _preprocess(raw) + # pp is (nc, ns) + assert pp.shape[0] < pp.shape[1] + + # Compress the chunk. + chunk_lossy = _compress_chunk(pp) + # chunk_lossy is (nc, ns) + assert chunk_lossy.shape[0] < chunk_lossy.shape[1] + + # Write the compressed chunk to disk. + l = chunk_lossy.shape[1] + lossy[offset:offset + l, :] = chunk_lossy.T + offset += l + + # Save the SVD info to a npz file. + np.savez(out_svd, **SVD) + + +#------------------------------------------------------------------------------ +# Decompressor +#------------------------------------------------------------------------------ + +class LossyReader: + def __init__(self): + self.path_lossy = None + self.path_svd = None + + def open(self, path_lossy=None, path_svd=None): + self.path_lossy = path_lossy + self.path_svd = path_svd + + assert self.path_lossy + assert self.path_svd + + self._lossy = open_memmap(path_lossy, 'r') + # ns, nc + assert self._lossy.shape[1] > self._lossy.shape[0] + assert self._lossy.dtype == np.uint8 + + self._svd = np.load(self.path_svd) + ds = self._svd.downsample_factor + assert ds >= 1 + + self.n_channels = self._svd.U.shape[0] + self.n_samples = self._lossy.shape[0] * ds + self.ndim = 2 + self.shape = (self.n_samples, self.n_channels) + self.size = self.n_samples * self.n_channels + self.size_bytes = self._lossy.size * self._lossy.itemsize + self.itemsize = 1 + self.dtype = np.uint8 + + def get(self, i0, i1, rank=None): + lossy = self._lossy[i0:i1] + return decompress(lossy, self._svd, rank=rank) + + def __get_item__(self, idx): + lossy = self._lossy[idx] + return decompress(lossy, self._svd) + + +def decompress_lossy(path_lossy="file.lossy.npy", path_svd="file.svd.npz"): + reader = LossyReader() + reader.open(path_lossy, path_svd=path_svd) + return reader + + +def test(): + EPHYS_DIR = Path("/home/cyrille/ephys/globus/KS023/") + path_cbin = EPHYS_DIR / "raw.cbin" + path_ch = EPHYS_DIR / "raw.ch" + + reader = decompress(path_cbin) + rank = 46 + svd = excerpt_svd(reader, rank, 5) + + raw = reader[:6000, :] + nc = raw.shape[1] + lossy = _compress_chunk(raw, svd) + # reconst = _decompress_chunk(lossy, svd, rank=50) + + import matplotlib.pyplot as plt + nrows = 2 + fix, axs = plt.subplots(nrows, 1, sharex=True) + axs[0].imshow(_preprocess(raw), cmap="gray", aspect="auto") + axs[0].set_title(f"original") + for i in range(1, nrows): + # rank = 50 * i + compression = DOWNSAMPLE_FACTOR * 2 * nc / float(rank) + axs[i].imshow(_decompress_chunk(lossy, svd, rank=rank), cmap="gray", aspect="auto") + axs[i].set_title(f"rank={rank}, compression={compression:.1f}x") + plt.show() From 6a0fb2608fb7fe88e19c0e074f7903188e03633d Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Mon, 7 Feb 2022 19:20:29 +0100 Subject: [PATCH 03/18] Fix 
uint8 casting in lossy compression algorithm --- mtscomp/lossy.py | 67 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index 6581b8d..f40c411 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -103,8 +103,8 @@ def to_uint8(x, ab=None): y = (x - b) * a # x = y / a + b # assert np.all((0 <= y) & (y <= 255)) - # return y.astype(np.uint8), (a, b) - return y, (a, b) + return y.astype(np.uint8), (a, b) + # return y, (a, b) def from_uint8(y, ab): @@ -176,7 +176,9 @@ def excerpt_svd(reader, rank, kept_chunks=20): svd.sample_rate = reader.sample_rate svd.downsample_factor = DOWNSAMPLE_FACTOR svd.rank = min(rank, svd.n_channels) - svd.a, svd.b = _uint8_coefs(excerpts) + + # NOTE: compute the uint8 scaling on the first second of data + svd.ab = _uint8_coefs(excerpts[:, :int(svd.sample_rate)]) assert svd return svd @@ -198,8 +200,10 @@ def _compress_chunk(raw, svd): # lossy is (nc, ns) assert lossy.shape[0] < lossy.shape[1] - lossy8, _ = to_uint8(lossy, (svd.a, svd.b)) - return lossy8 + # lossy8, _ = to_uint8(lossy, (svd.a, svd.b)) + # return lossy8 + + return lossy def _decompress_chunk(lossy, svd, rank=None): @@ -213,7 +217,7 @@ def _decompress_chunk(lossy, svd, rank=None): rank = min(rank, svd.rank) rank = min(rank, svd.n_channels) - lossy = from_uint8(lossy, (svd.a, svd.b)) + # lossy = from_uint8(lossy, (svd.a, svd.b)) assert rank > 0 return (U[:, :rank] @ np.diag(sigma[:rank]) @ lossy[:rank, :]) @@ -282,7 +286,7 @@ def compress_lossy( # Write the compressed chunk to disk. l = chunk_lossy.shape[1] - lossy[offset:offset + l, :] = chunk_lossy.T + lossy[offset:offset + l, :] = to_uint8(chunk_lossy.T, svd.ab) offset += l # Save the SVD info to a npz file. @@ -323,13 +327,17 @@ def open(self, path_lossy=None, path_svd=None): self.itemsize = 1 self.dtype = np.uint8 + def _decompress(self, lossy, rank=None): + lossy_float = from_uint8(lossy, self._svd.ab) + return _decompress_chunk(lossy_float, self._svd, rank=rank) + def get(self, i0, i1, rank=None): lossy = self._lossy[i0:i1] - return decompress(lossy, self._svd, rank=rank) + return self._decompress(lossy, rank=rank) def __get_item__(self, idx): lossy = self._lossy[idx] - return decompress(lossy, self._svd) + return self._decompress(lossy) def decompress_lossy(path_lossy="file.lossy.npy", path_svd="file.svd.npz"): @@ -339,27 +347,48 @@ def decompress_lossy(path_lossy="file.lossy.npy", path_svd="file.svd.npz"): def test(): + import matplotlib.pyplot as plt + import seaborn as sns + plt.rcParams["figure.dpi"] = 140 + plt.rcParams["axes.grid"] = False + sns.set_theme(style="white") + EPHYS_DIR = Path("/home/cyrille/ephys/globus/KS023/") path_cbin = EPHYS_DIR / "raw.cbin" path_ch = EPHYS_DIR / "raw.ch" reader = decompress(path_cbin) - rank = 46 - svd = excerpt_svd(reader, rank, 5) + rank = 40 + chunks_excerpts = 3 + + svd = excerpt_svd(reader, rank, chunks_excerpts) - raw = reader[:6000, :] + raw = reader[:30000, :] nc = raw.shape[1] + compression = DOWNSAMPLE_FACTOR * 2 * nc / float(rank) + lossy = _compress_chunk(raw, svd) - # reconst = _decompress_chunk(lossy, svd, rank=50) + reconst = _decompress_chunk(lossy, svd, rank=rank) + + # plt.figure() + # plt.hist(lossy.ravel(), bins=64, log=True) + + lossy8, ab = to_uint8(lossy) + lossy_ = from_uint8(lossy8, ab) + reconst8 = _decompress_chunk(lossy_, svd, rank=rank) - import matplotlib.pyplot as plt nrows = 2 fix, axs = plt.subplots(nrows, 1, sharex=True) + axs[0].imshow(_preprocess(raw), cmap="gray", 
aspect="auto") axs[0].set_title(f"original") - for i in range(1, nrows): - # rank = 50 * i - compression = DOWNSAMPLE_FACTOR * 2 * nc / float(rank) - axs[i].imshow(_decompress_chunk(lossy, svd, rank=rank), cmap="gray", aspect="auto") - axs[i].set_title(f"rank={rank}, compression={compression:.1f}x") + + axs[1].imshow(reconst8, cmap="gray", aspect="auto") + axs[1].set_title(f"rank={rank}, compression={compression:.1f}x") + + # for i in range(1, nrows): + # # rank = 50 * i + # compression = DOWNSAMPLE_FACTOR * 2 * nc / float(rank) + # axs[i].imshow(_decompress_chunk(lossy, svd, rank=rank), cmap="gray", aspect="auto") + # axs[i].set_title(f"rank={rank}, compression={compression:.1f}x") plt.show() From fcd09d12c6683f1576974c85e507db38d69e48cb Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Mon, 7 Feb 2022 21:23:05 +0100 Subject: [PATCH 04/18] Fix imports --- mtscomp/__init__.py | 3 +++ mtscomp/mtscomp.py | 2 -- tests.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mtscomp/__init__.py b/mtscomp/__init__.py index e69de29..caae008 100644 --- a/mtscomp/__init__.py +++ b/mtscomp/__init__.py @@ -0,0 +1,3 @@ +from .mtscomp import * + +__all__ = ('load_raw_data', 'Writer', 'Reader', 'compress', 'decompress') diff --git a/mtscomp/mtscomp.py b/mtscomp/mtscomp.py index 3a85b1b..9507721 100644 --- a/mtscomp/mtscomp.py +++ b/mtscomp/mtscomp.py @@ -40,8 +40,6 @@ __version__ = '1.0.2' FORMAT_VERSION = '1.0' -__all__ = ('load_raw_data', 'Writer', 'Reader', 'compress', 'decompress') - DEFAULT_CONFIG = list(dict( algorithm='zlib', # only algorithm supported currently diff --git a/tests.py b/tests.py index d231007..95d1545 100644 --- a/tests.py +++ b/tests.py @@ -22,7 +22,7 @@ from pytest import fixture, raises, mark import mtscomp as mtscomp_mod -from mtscomp import ( +from mtscomp.mtscomp import ( add_default_handler, Writer, Reader, load_raw_data, diff_along_axis, cumsum_along_axis, mtscomp_parser, mtsdecomp_parser, _args_to_config, read_config, From 502e2c67afda0efe3ef5efbefd272a63e2d48df0 Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Tue, 8 Feb 2022 11:24:58 +0100 Subject: [PATCH 05/18] Fix lossy compression file I/O --- mtscomp/lossy.py | 228 ++++++++++++++++++++++++++--------------------- 1 file changed, 128 insertions(+), 100 deletions(-) diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index f40c411..2a7e160 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -18,12 +18,14 @@ from tqdm import tqdm import numpy as np +from numpy.linalg import inv from numpy.lib.format import open_memmap from .mtscomp import Bunch, decompress, Reader -logger = logging.getLogger(('mtscomp')) +logger = logging.getLogger('mtscomp') +logger.setLevel(logging.DEBUG) #------------------------------------------------------------------------------ @@ -31,6 +33,7 @@ #------------------------------------------------------------------------------ DOWNSAMPLE_FACTOR = 6 +CHUNKS_EXCERPTS = 20 FILE_EXTENSION_LOSSY = '.lossy.npy' FILE_EXTENSION_SVD = '.svd.npz' @@ -42,23 +45,61 @@ class SVD(Bunch): def __init__( - self, U, sigma, rank=None, a=1, b=0, + self, U, sigma, rank=None, ab=None, sample_rate=None, downsample_factor=DOWNSAMPLE_FACTOR): super(SVD, self).__init__() self.U = U self.n_channels = U.shape[0] assert sigma.shape == (self.n_channels,) - self.Usigma_inv = np.linalg.inv(U @ np.diag(sigma)) + self.Usigma_inv = inv(U @ np.diag(sigma)) assert self.Usigma_inv.shape == self.U.shape self.sigma = sigma self.rank = rank - self.a = a - self.b = b + self.ab = ab self.sample_rate = sample_rate 
self.downsample_factor = downsample_factor + def save(self, path): + assert self.U is not None + assert self.Usigma_inv is not None + assert self.sigma is not None + assert self.ab is not None + assert self.n_channels >= 1 + assert self.rank >= 1 + assert self.sample_rate > 0 + assert self.downsample_factor >= 1 + + np.savez( + path, + U=self.U, + # Usigma_inv=self.Usigma_inv, + sigma=self.sigma, + ab=self.ab, + + # NOTE: need to convert to regular arrays for np.savez + rank=np.array([self.rank]), + sample_rate=np.array([self.sample_rate]), + downsample_factor=np.array([self.downsample_factor]), + ) + def __repr__(self): - return f"" + # return f"" + return super(SVD, self).__repr__() + + +def load_svd(path): + d = np.load(path) + svd = SVD( + U=d['U'], + # Usigma_inv=d['Usigma_inv'], + sigma=d['sigma'], + ab=d['ab'], + rank=int(d['rank'][0]), + sample_rate=d['sample_rate'][0], + downsample_factor=int(d['downsample_factor'][0]), + ) + assert svd.n_channels >= 1 + return svd #------------------------------------------------------------------------------ @@ -89,22 +130,36 @@ def _svd(x): return SVD(U, sigma) -def _uint8_coefs(x): +def _uint8_coefs(x, margin=.05): # k = .05 # m, M = np.quantile(x, k), np.quantile(x, 1 - k) + m, M = x.min(), x.max() - a = 255 / (M - m) + d = M - m + assert d > 0 + + m -= d * margin + M += d * margin + + a = 255 / d b = m return a, b def to_uint8(x, ab=None): a, b = ab if ab is not None else _uint8_coefs(x) + y = (x - b) * a - # x = y / a + b - # assert np.all((0 <= y) & (y <= 255)) + # inverse: x = y / a + b + + # assert np.all((0 <= y) & (y < 256)) + overshoot = np.mean((y < 0) | (y >= 256)) + if overshoot > 0: + logger.debug( + f"uint8 casting: clipping {overshoot:.1f}% of overshooting values", overshoot * 100) + y = np.clip(y, 0, 255) + return y.astype(np.uint8), (a, b) - # return y, (a, b) def from_uint8(y, ab): @@ -143,7 +198,8 @@ def _get_excerpts(reader, kept_chunks=20): skip = n_chunks // kept_chunks for chunk_idx, chunk_start, chunk_length in tqdm( islice(reader.iter_chunks(), 0, n_chunks + 1, skip), - total=n_chunks // skip, desc="extracting excerpts from the raw data"): + total=n_chunks // skip, + desc="extracting excerpts..."): chunk = reader.read_chunk(chunk_idx, chunk_start, chunk_length) # chunk is (ns, nc) @@ -177,9 +233,6 @@ def excerpt_svd(reader, rank, kept_chunks=20): svd.downsample_factor = DOWNSAMPLE_FACTOR svd.rank = min(rank, svd.n_channels) - # NOTE: compute the uint8 scaling on the first second of data - svd.ab = _uint8_coefs(excerpts[:, :int(svd.sample_rate)]) - assert svd return svd @@ -194,15 +247,11 @@ def _compress_chunk(raw, svd): rank = svd.rank assert rank > 0 - assert svd.a != 0 lossy = (svd.Usigma_inv @ pp)[:rank, :] # lossy is (nc, ns) assert lossy.shape[0] < lossy.shape[1] - # lossy8, _ = to_uint8(lossy, (svd.a, svd.b)) - # return lossy8 - return lossy @@ -217,54 +266,68 @@ def _decompress_chunk(lossy, svd, rank=None): rank = min(rank, svd.rank) rank = min(rank, svd.n_channels) - # lossy = from_uint8(lossy, (svd.a, svd.b)) assert rank > 0 return (U[:, :rank] @ np.diag(sigma[:rank]) @ lossy[:rank, :]) +#------------------------------------------------------------------------------ +# Compressor +#------------------------------------------------------------------------------ + def compress_lossy( - path=None, cmeta=None, rank=None, max_chunks=0, downsampling_factor=None, + path_cbin=None, cmeta=None, rank=None, max_chunks=0, + chunks_excerpts=CHUNKS_EXCERPTS, downsampling_factor=DOWNSAMPLE_FACTOR, + overwrite=False, 
dry_run=False, out_lossy=None, out_svd=None): # Check arguments. assert rank, "The rank must be set" - assert path, "The raw ephys data file must be specified" + assert path_cbin, "The raw ephys data file must be specified" - if downsampling_factor is None: - downsampling_factor = DOWNSAMPLE_FACTOR assert downsampling_factor >= 1 + assert chunks_excerpts >= 2 # Create a mtscomp Reader. - reader = decompress(path, cmeta=cmeta) + reader = decompress(path_cbin, cmeta=cmeta) + sr = int(reader.sample_rate) ns = reader.n_samples nc = reader.n_channels n_chunks = reader.n_chunks if max_chunks == 0 else max_chunks + assert n_chunks > 0 + assert sr > 0 + assert ns > 0 + assert nc > 0 assert rank <= nc, "The rank cannot exceed the number of channels" - # Compute the SVD on an excerpt of the data. - svd = excerpt_svd(reader, rank) - # Filenames. if out_lossy is None: - out_lossy = Path(path).with_suffix(FILE_EXTENSION_LOSSY) + out_lossy = Path(path_cbin).with_suffix(FILE_EXTENSION_LOSSY) assert out_lossy if out_svd is None: - out_svd = Path(path).with_suffix(FILE_EXTENSION_SVD) + out_svd = Path(path_cbin).with_suffix(FILE_EXTENSION_SVD) assert out_svd + if dry_run: + return out_lossy + + # Compute the SVD on an excerpt of the data. + svd = excerpt_svd(reader, rank, kept_chunks=chunks_excerpts) + # Create a new memmapped npy file - if out_lossy.exists(): + if not overwrite and out_lossy.exists(): raise IOError(f"File {out_lossy} already exists.") - shape = (ns // downsampling_factor, rank) + shape = (n_chunks * int(reader.sample_rate) // downsampling_factor, rank) lossy = open_memmap(out_lossy, 'w+', dtype=np.uint8, shape=shape) # Compress the data. offset = 0 - for chunk_idx, chunk_start, chunk_length in \ - tqdm(reader.iter_chunks(last_chunk=n_chunks - 1), total=n_chunks): + for chunk_idx, chunk_start, chunk_length in tqdm( + reader.iter_chunks(last_chunk=n_chunks - 1), + desc='compressing...', + total=n_chunks): # Decompress the chunk. raw = reader.read_chunk(chunk_idx, chunk_start, chunk_length) @@ -274,23 +337,23 @@ def compress_lossy( nsc, _ = raw.shape assert _ == nc - # Process the chunk. - pp = _preprocess(raw) - # pp is (nc, ns) - assert pp.shape[0] < pp.shape[1] - # Compress the chunk. - chunk_lossy = _compress_chunk(pp) + chunk_lossy = _compress_chunk(raw, svd) # chunk_lossy is (nc, ns) assert chunk_lossy.shape[0] < chunk_lossy.shape[1] # Write the compressed chunk to disk. l = chunk_lossy.shape[1] - lossy[offset:offset + l, :] = to_uint8(chunk_lossy.T, svd.ab) + lossy[offset:offset + l, :], ab = to_uint8(chunk_lossy.T, svd.ab) + # NOTE: keep the ab scaling factors for uint8 conversion only for the first chunk + if svd.ab is None: + svd.ab = ab offset += l # Save the SVD info to a npz file. 
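+    # NOTE: np.savez(out_svd, **SVD) unpacked the SVD *class* rather than the
+    # `svd` instance (a TypeError at runtime); svd.save() writes the instance.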
- np.savez(out_svd, **SVD) + svd.save(out_svd) + + return out_lossy #------------------------------------------------------------------------------ @@ -303,23 +366,33 @@ def __init__(self): self.path_svd = None def open(self, path_lossy=None, path_svd=None): - self.path_lossy = path_lossy - self.path_svd = path_svd + assert path_lossy + + if path_svd is None: + path_svd = Path(path_lossy).with_suffix('').with_suffix('.svd.npz') + + self.path_lossy = Path(path_lossy) + self.path_svd = Path(path_svd) assert self.path_lossy assert self.path_svd + assert self.path_lossy.exists() + assert self.path_svd.exists() self._lossy = open_memmap(path_lossy, 'r') # ns, nc - assert self._lossy.shape[1] > self._lossy.shape[0] + assert self._lossy.shape[0] > self._lossy.shape[1] assert self._lossy.dtype == np.uint8 - self._svd = np.load(self.path_svd) - ds = self._svd.downsample_factor + self._svd = load_svd(self.path_svd) + self.downsample_factor = ds = self._svd.downsample_factor + self.sample_rate = self._svd.sample_rate assert ds >= 1 + assert self._svd.ab is not None self.n_channels = self._svd.U.shape[0] self.n_samples = self._lossy.shape[0] * ds + self.duration = self.n_samples / float(self.sample_rate) self.ndim = 2 self.shape = (self.n_samples, self.n_channels) self.size = self.n_samples * self.n_channels @@ -328,67 +401,22 @@ def open(self, path_lossy=None, path_svd=None): self.dtype = np.uint8 def _decompress(self, lossy, rank=None): - lossy_float = from_uint8(lossy, self._svd.ab) - return _decompress_chunk(lossy_float, self._svd, rank=rank) + lossy_float = from_uint8(lossy, self._svd.ab).T + return _decompress_chunk(lossy_float, self._svd, rank=rank).T - def get(self, i0, i1, rank=None): + def get(self, t0, t1, rank=None): + ds = self._svd.downsample_factor + i0 = int(round(t0 * float(self.sample_rate) / ds)) + i1 = int(round(t1 * float(self.sample_rate) / ds)) lossy = self._lossy[i0:i1] return self._decompress(lossy, rank=rank) - def __get_item__(self, idx): + def __getitem__(self, idx): lossy = self._lossy[idx] return self._decompress(lossy) -def decompress_lossy(path_lossy="file.lossy.npy", path_svd="file.svd.npz"): +def decompress_lossy(path_lossy=None, path_svd=None): reader = LossyReader() reader.open(path_lossy, path_svd=path_svd) return reader - - -def test(): - import matplotlib.pyplot as plt - import seaborn as sns - plt.rcParams["figure.dpi"] = 140 - plt.rcParams["axes.grid"] = False - sns.set_theme(style="white") - - EPHYS_DIR = Path("/home/cyrille/ephys/globus/KS023/") - path_cbin = EPHYS_DIR / "raw.cbin" - path_ch = EPHYS_DIR / "raw.ch" - - reader = decompress(path_cbin) - rank = 40 - chunks_excerpts = 3 - - svd = excerpt_svd(reader, rank, chunks_excerpts) - - raw = reader[:30000, :] - nc = raw.shape[1] - compression = DOWNSAMPLE_FACTOR * 2 * nc / float(rank) - - lossy = _compress_chunk(raw, svd) - reconst = _decompress_chunk(lossy, svd, rank=rank) - - # plt.figure() - # plt.hist(lossy.ravel(), bins=64, log=True) - - lossy8, ab = to_uint8(lossy) - lossy_ = from_uint8(lossy8, ab) - reconst8 = _decompress_chunk(lossy_, svd, rank=rank) - - nrows = 2 - fix, axs = plt.subplots(nrows, 1, sharex=True) - - axs[0].imshow(_preprocess(raw), cmap="gray", aspect="auto") - axs[0].set_title(f"original") - - axs[1].imshow(reconst8, cmap="gray", aspect="auto") - axs[1].set_title(f"rank={rank}, compression={compression:.1f}x") - - # for i in range(1, nrows): - # # rank = 50 * i - # compression = DOWNSAMPLE_FACTOR * 2 * nc / float(rank) - # axs[i].imshow(_decompress_chunk(lossy, svd, rank=rank), 
cmap="gray", aspect="auto") - # axs[i].set_title(f"rank={rank}, compression={compression:.1f}x") - plt.show() From 5268b4bc4143e048d317e70dff85318154cdb5b3 Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Tue, 8 Feb 2022 11:44:39 +0100 Subject: [PATCH 06/18] WIP: lossy compression tests --- Makefile | 2 +- mtscomp/lossy.py | 18 ++-- tests/tests_lossy.py | 130 +++++++++++++++++++++++++++++ tests.py => tests/tests_mtscomp.py | 0 4 files changed, 143 insertions(+), 7 deletions(-) create mode 100644 tests/tests_lossy.py rename tests.py => tests/tests_mtscomp.py (100%) diff --git a/Makefile b/Makefile index a1b0cdf..f332d8c 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ lint: flake8 test: lint - py.test -vv --cov-report term-missing --cov=mtscomp tests.py + py.test -vv --cov-report term-missing --cov=mtscomp tests/ build: python setup.py sdist --formats=zip diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index 2a7e160..375ee02 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -37,6 +37,7 @@ FILE_EXTENSION_LOSSY = '.lossy.npy' FILE_EXTENSION_SVD = '.svd.npz' +UINT8_MARGIN = .05 #------------------------------------------------------------------------------ @@ -115,7 +116,7 @@ def _car(x): return y -def _downsample(x, factor=1): +def _downsample(x, factor=DOWNSAMPLE_FACTOR): assert x.ndim == 2 ns, nc = x.shape # ns, nc @@ -130,9 +131,8 @@ def _svd(x): return SVD(U, sigma) -def _uint8_coefs(x, margin=.05): - # k = .05 - # m, M = np.quantile(x, k), np.quantile(x, 1 - k) +def _uint8_coefs(x, margin=UINT8_MARGIN): + # m, M = np.quantile(x, UINT8_MARGIN), np.quantile(x, 1 - UINT8_MARGIN) m, M = x.min(), x.max() d = M - m @@ -186,7 +186,7 @@ def _preprocess(raw): return pp -def _get_excerpts(reader, kept_chunks=20): +def _get_excerpts(reader, kept_chunks=CHUNKS_EXCERPTS): assert reader assert isinstance(reader, Reader) assert kept_chunks >= 2 @@ -219,7 +219,7 @@ def _get_excerpts(reader, kept_chunks=20): return excerpts -def excerpt_svd(reader, rank, kept_chunks=20): +def excerpt_svd(reader, rank, kept_chunks=CHUNKS_EXCERPTS): assert rank excerpts = _get_excerpts(reader, kept_chunks=kept_chunks) # excerpts is (nc, ns) @@ -385,8 +385,11 @@ def open(self, path_lossy=None, path_svd=None): assert self._lossy.dtype == np.uint8 self._svd = load_svd(self.path_svd) + self.rank = self._svd.rank self.downsample_factor = ds = self._svd.downsample_factor self.sample_rate = self._svd.sample_rate + + assert self.rank >= 1 assert ds >= 1 assert self._svd.ab is not None @@ -400,6 +403,9 @@ def open(self, path_lossy=None, path_svd=None): self.itemsize = 1 self.dtype = np.uint8 + size_original = 2 * self.n_channels * self.n_samples + self.compression = size_original / float(self.size_bytes) + def _decompress(self, lossy, rank=None): lossy_float = from_uint8(lossy, self._svd.ab).T return _decompress_chunk(lossy_float, self._svd, rank=rank).T diff --git a/tests/tests_lossy.py b/tests/tests_lossy.py new file mode 100644 index 0000000..e7c48e7 --- /dev/null +++ b/tests/tests_lossy.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- + +"""mtscomp tests.""" + + +#------------------------------------------------------------------------------ +# Imports +#------------------------------------------------------------------------------ + +from contextlib import redirect_stdout +import io +from itertools import product +import json +import hashlib +import logging +import os +import os.path as op +from pathlib import Path +import re + +import numpy as np +from pytest import fixture, raises, mark + +from mtscomp import 
Reader, decompress, add_default_handler, lossy as ml + +logger = logging.getLogger(__name__) +add_default_handler('DEBUG') + + +#------------------------------------------------------------------------------ +# Fixtures +#------------------------------------------------------------------------------ + +n_channels = 19 +sample_rate = 1234. +duration = 5.67 +normal_std = .25 +time = np.arange(0, duration, 1. / sample_rate) +n_samples = len(time) + + +def randn(): + return np.random.normal(loc=0, scale=normal_std, size=(n_samples, n_channels)) + + +def white_sine(): + return np.sin(10 * time)[:, np.newaxis] + randn() + + +def colored_sine(): + arr = white_sine() + try: + from scipy.signal import filtfilt, butter + except ImportError: + logger.debug("Skip the filtering as SciPy is not available.") + return arr + b, a = butter(3, 0.05) + arr = filtfilt(b, a, arr, axis=0) + assert arr.shape == (n_samples, n_channels) + return arr + + +#------------------------------------------------------------------------------ +# Utils +#------------------------------------------------------------------------------ + +def show_compare(lossless, lossy, t0, t1): + assert isinstance(lossless, Reader) + assert isinstance(lossy, ml.LossyReader) + + import matplotlib.pyplot as plt + import seaborn as sns + plt.rcParams["figure.dpi"] = 140 + plt.rcParams["axes.grid"] = False + sns.set_theme(style="white") + + nrows = 2 + fix, axs = plt.subplots(nrows, 1, sharex=True) + + sr = lossless.sample_rate + i0 = int(round(t0 * sr)) + i1 = int(round(t1 * sr)) + + axs[0].imshow(ml._preprocess(lossless[i0:i1]), cmap="gray", aspect="auto") + axs[0].set_title(f"original") + + axs[1].imshow(lossy.get(t0, t1).T, cmap="gray", aspect="auto") + axs[1].set_title(f"rank={lossy.rank}, compression={lossy.compression:.1f}x") + + plt.show() + + +#------------------------------------------------------------------------------ +# Tests +#------------------------------------------------------------------------------ + +def test_lossy(): + + EPHYS_DIR = Path("/home/cyrille/ephys/globus/KS023/") + path_cbin = EPHYS_DIR / "raw.cbin" + + rank = 40 + max_chunks = 10 + + out_lossy = ml.compress_lossy( + path_cbin=path_cbin, + chunks_excerpts=3, + rank=rank, + max_chunks=max_chunks, + overwrite=True, + dry_run=True, + ) + + lossless = decompress(path_cbin) + lossy = ml.decompress_lossy(out_lossy) + + show_compare(lossless, lossy, 0, .2) + + # chunks_excerpts = 3 + # svd = excerpt_svd(reader, rank, chunks_excerpts) + + # nc = raw.shape[1] + # compression = DOWNSAMPLE_FACTOR * 2 * nc / float(rank) + + # lossy = _compress_chunk(raw, svd) + # reconst = _decompress_chunk(lossy, svd, rank=rank) + + # lossy8, ab = to_uint8(lossy) + # lossy_ = from_uint8(lossy8, ab) + # reconst8 = _decompress_chunk(lossy_, svd, rank=rank) diff --git a/tests.py b/tests/tests_mtscomp.py similarity index 100% rename from tests.py rename to tests/tests_mtscomp.py From b1aee2050ec5ccfe5a7c86f4cc9f5dc768bc2813 Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Tue, 8 Feb 2022 11:53:02 +0100 Subject: [PATCH 07/18] Update local lossy test --- .gitignore | 1 + mtscomp/lossy.py | 3 +-- tests/tests_lossy.py | 19 +++---------------- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index 808f12d..9536fb5 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ docs/_build .vscode *.code-workspace +raw.* diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index 375ee02..cba1948 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -155,8 +155,7 @@ 
def to_uint8(x, ab=None): # assert np.all((0 <= y) & (y < 256)) overshoot = np.mean((y < 0) | (y >= 256)) if overshoot > 0: - logger.debug( - f"uint8 casting: clipping {overshoot:.1f}% of overshooting values", overshoot * 100) + logger.debug(f"uint8 casting: clipping {overshoot * 100:.3f}% of overshooting values") y = np.clip(y, 0, 255) return y.astype(np.uint8), (a, b) diff --git a/tests/tests_lossy.py b/tests/tests_lossy.py index e7c48e7..fa5757b 100644 --- a/tests/tests_lossy.py +++ b/tests/tests_lossy.py @@ -94,9 +94,9 @@ def show_compare(lossless, lossy, t0, t1): # Tests #------------------------------------------------------------------------------ -def test_lossy(): +def test_lossy_local(): - EPHYS_DIR = Path("/home/cyrille/ephys/globus/KS023/") + EPHYS_DIR = Path(__file__).parent path_cbin = EPHYS_DIR / "raw.cbin" rank = 40 @@ -108,23 +108,10 @@ def test_lossy(): rank=rank, max_chunks=max_chunks, overwrite=True, - dry_run=True, + dry_run=False, ) lossless = decompress(path_cbin) lossy = ml.decompress_lossy(out_lossy) show_compare(lossless, lossy, 0, .2) - - # chunks_excerpts = 3 - # svd = excerpt_svd(reader, rank, chunks_excerpts) - - # nc = raw.shape[1] - # compression = DOWNSAMPLE_FACTOR * 2 * nc / float(rank) - - # lossy = _compress_chunk(raw, svd) - # reconst = _decompress_chunk(lossy, svd, rank=rank) - - # lossy8, ab = to_uint8(lossy) - # lossy_ = from_uint8(lossy8, ab) - # reconst8 = _decompress_chunk(lossy_, svd, rank=rank) From 245d7bc270164ad46be5ffc9fb1378fe3a9efb14 Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Tue, 8 Feb 2022 15:23:48 +0100 Subject: [PATCH 08/18] Lossy compression tests --- mtscomp/lossy.py | 35 ++++++----- mtscomp/mtscomp.py | 2 +- tests/tests_lossy.py | 138 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 143 insertions(+), 32 deletions(-) diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index cba1948..dc99fce 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -38,6 +38,8 @@ FILE_EXTENSION_LOSSY = '.lossy.npy' FILE_EXTENSION_SVD = '.svd.npz' UINT8_MARGIN = .05 +DTYPE = np.uint8 +MAX_UINT8 = 255 #------------------------------------------------------------------------------ @@ -121,7 +123,7 @@ def _downsample(x, factor=DOWNSAMPLE_FACTOR): ns, nc = x.shape # ns, nc assert x.shape[0] > x.shape[1] - y = x.T.reshape((nc, (ns - ns % factor) // factor, factor)).mean(axis=2) + y = x.T[:, :ns - ns % factor].reshape((nc, ns // factor, factor)).mean(axis=2) # nc, ns return y @@ -141,7 +143,7 @@ def _uint8_coefs(x, margin=UINT8_MARGIN): m -= d * margin M += d * margin - a = 255 / d + a = MAX_UINT8 / d b = m return a, b @@ -152,13 +154,13 @@ def to_uint8(x, ab=None): y = (x - b) * a # inverse: x = y / a + b - # assert np.all((0 <= y) & (y < 256)) - overshoot = np.mean((y < 0) | (y >= 256)) + overshoot = np.mean((y < 0) | (y > MAX_UINT8)) if overshoot > 0: - logger.debug(f"uint8 casting: clipping {overshoot * 100:.3f}% of overshooting values") - y = np.clip(y, 0, 255) + logger.debug( + f"casting to {str(DTYPE)}: clipping {overshoot * 100:.3f}% of values") + y = np.clip(y, 0, MAX_UINT8) - return y.astype(np.uint8), (a, b) + return y.astype(DTYPE), (a, b) def from_uint8(y, ab): @@ -194,11 +196,11 @@ def _get_excerpts(reader, kept_chunks=CHUNKS_EXCERPTS): n = 0 n_chunks = reader.n_chunks assert reader.shape[0] > reader.shape[1] - skip = n_chunks // kept_chunks + skip = max(1, n_chunks // kept_chunks) for chunk_idx, chunk_start, chunk_length in tqdm( islice(reader.iter_chunks(), 0, n_chunks + 1, skip), total=n_chunks // skip, - desc="extracting 
excerpts..."): + desc="Extracting excerpts..."): chunk = reader.read_chunk(chunk_idx, chunk_start, chunk_length) # chunk is (ns, nc) @@ -318,14 +320,14 @@ def compress_lossy( # Create a new memmapped npy file if not overwrite and out_lossy.exists(): raise IOError(f"File {out_lossy} already exists.") - shape = (n_chunks * int(reader.sample_rate) // downsampling_factor, rank) - lossy = open_memmap(out_lossy, 'w+', dtype=np.uint8, shape=shape) + shape = (ns // downsampling_factor, rank) + lossy = open_memmap(out_lossy, 'w+', dtype=DTYPE, shape=shape) # Compress the data. offset = 0 for chunk_idx, chunk_start, chunk_length in tqdm( reader.iter_chunks(last_chunk=n_chunks - 1), - desc='compressing...', + desc='Compressing (lossy)...', total=n_chunks): # Decompress the chunk. @@ -343,12 +345,17 @@ def compress_lossy( # Write the compressed chunk to disk. l = chunk_lossy.shape[1] + assert offset + l <= shape[0] lossy[offset:offset + l, :], ab = to_uint8(chunk_lossy.T, svd.ab) # NOTE: keep the ab scaling factors for uint8 conversion only for the first chunk if svd.ab is None: svd.ab = ab offset += l + extra = shape[0] - offset + if extra > 0: + lossy[-extra:, :] = lossy[-extra - 1, :] + # Save the SVD info to a npz file. svd.save(out_svd) @@ -381,7 +388,7 @@ def open(self, path_lossy=None, path_svd=None): self._lossy = open_memmap(path_lossy, 'r') # ns, nc assert self._lossy.shape[0] > self._lossy.shape[1] - assert self._lossy.dtype == np.uint8 + assert self._lossy.dtype == DTYPE self._svd = load_svd(self.path_svd) self.rank = self._svd.rank @@ -400,7 +407,7 @@ def open(self, path_lossy=None, path_svd=None): self.size = self.n_samples * self.n_channels self.size_bytes = self._lossy.size * self._lossy.itemsize self.itemsize = 1 - self.dtype = np.uint8 + self.dtype = DTYPE size_original = 2 * self.n_channels * self.n_samples self.compression = size_original / float(self.size_bytes) diff --git a/mtscomp/mtscomp.py b/mtscomp/mtscomp.py index 9507721..475def9 100644 --- a/mtscomp/mtscomp.py +++ b/mtscomp/mtscomp.py @@ -456,7 +456,7 @@ def write(self, out, outmeta): ts = ' on %d CPUs.' % self.n_threads if self.n_threads > 1 else '.' logger.info("Starting compression" + ts) with open(out, 'wb') as fb: - for batch in tqdm(range(self.n_batches), desc='Compressing', disable=self.quiet): + for batch in tqdm(range(self.n_batches), desc='Compressing (lossless)', disable=self.quiet): first_chunk = self.batch_size * batch # first included last_chunk = min(self.batch_size * (batch + 1), self.n_chunks) # last excluded assert 0 <= first_chunk < last_chunk <= self.n_chunks diff --git a/tests/tests_lossy.py b/tests/tests_lossy.py index fa5757b..602bfc3 100644 --- a/tests/tests_lossy.py +++ b/tests/tests_lossy.py @@ -21,22 +21,27 @@ import numpy as np from pytest import fixture, raises, mark -from mtscomp import Reader, decompress, add_default_handler, lossy as ml +from mtscomp import Reader, compress, decompress, add_default_handler, lossy as ml -logger = logging.getLogger(__name__) -add_default_handler('DEBUG') +logger = logging.getLogger('mtscomp') +# add_default_handler('DEBUG') #------------------------------------------------------------------------------ # Fixtures #------------------------------------------------------------------------------ +_INT16_MAX = 32766 +ERROR_THRESHOLD = .03 # the error is the % of values that differ by more than this percent + n_channels = 19 sample_rate = 1234. duration = 5.67 normal_std = .25 time = np.arange(0, duration, 1. 
/ sample_rate) n_samples = len(time) +dtype = np.int16 +np.random.seed(0) def randn(): @@ -60,51 +65,144 @@ def colored_sine(): return arr +def _write_arr(path, arr): + """Write an array.""" + with open(path, 'wb') as f: + arr.tofile(f) + + +def _to_int16(arr, M=None): + M = M or np.abs(arr).max() + arr = arr / M if M > 0 else arr + assert np.all(np.abs(arr) <= 1.) + arr16 = (arr * _INT16_MAX).astype(np.int16) + return arr16 + + +def _from_int16(arr, M): + return arr * float(M / _INT16_MAX) + + #------------------------------------------------------------------------------ # Utils #------------------------------------------------------------------------------ -def show_compare(lossless, lossy, t0, t1): - assert isinstance(lossless, Reader) - assert isinstance(lossy, ml.LossyReader) - +def _import_mpl(): import matplotlib.pyplot as plt import seaborn as sns plt.rcParams["figure.dpi"] = 140 plt.rcParams["axes.grid"] = False sns.set_theme(style="white") + return plt - nrows = 2 - fix, axs = plt.subplots(nrows, 1, sharex=True) + +def _show_img(ax, x, title, vmin=None, vmax=None): + ax.imshow(x, cmap="gray", aspect="auto", vmin=vmin, vmax=vmax) + ax.set_title(title) + + +def _prepare_compare(lossless, lossy, t0, t1): + assert isinstance(lossless, Reader) + assert isinstance(lossy, ml.LossyReader) sr = lossless.sample_rate i0 = int(round(t0 * sr)) i1 = int(round(t1 * sr)) - axs[0].imshow(ml._preprocess(lossless[i0:i1]), cmap="gray", aspect="auto") - axs[0].set_title(f"original") + lossless_img = ml._preprocess(lossless[i0:i1]) + lossy_img = lossy.get(t0, t1).T + vmin = min(lossless_img.min(), lossy_img.min()) + vmax = max(lossless_img.max(), lossy_img.max()) + lossless_img -= vmin + lossy_img -= vmin + v = vmax - vmin + + return lossless_img, lossy_img, v + - axs[1].imshow(lossy.get(t0, t1).T, cmap="gray", aspect="auto") - axs[1].set_title(f"rank={lossy.rank}, compression={lossy.compression:.1f}x") +def _compute_error(lossless_img, lossy_img, threshold=ERROR_THRESHOLD): + x = lossless_img - lossy_img + return (np.abs(x).ravel() > lossless_img.max() * threshold).mean() - plt.show() + +def show_compare(lossless, lossy, t0, t1, threshold=ERROR_THRESHOLD, do_show=True): + assert isinstance(lossless, Reader) + assert isinstance(lossy, ml.LossyReader) + + lossless_img, lossy_img, v = _prepare_compare(lossless, lossy, t0, t1) + + err = _compute_error(lossless_img, lossy_img, threshold=threshold) + print(f"Relative error is {err * 100:.1f}%.") + + title = f"rank={lossy.rank}, {lossy.compression:.1f}x compression, error {err * 100:.1f}%" + + nrows = 3 + plt = _import_mpl() + fix, axs = plt.subplots(nrows, 1, sharex=True) + _show_img(axs[0], lossless_img, 'original', vmin=0, vmax=v) + _show_img(axs[1], lossy_img, title, vmin=0, vmax=v) + _show_img(axs[2], lossless_img - lossy_img, 'residual', vmin=0, vmax=v) + + n_ticks = 5 + ticks = np.linspace(0, lossless_img.shape[1], n_ticks) + labels = ['%.1f ms' % (t * 1000) for t in np.linspace(t0, t1, n_ticks)] + plt.xticks(ticks, labels) + if do_show: + plt.show() + + return err #------------------------------------------------------------------------------ # Tests #------------------------------------------------------------------------------ +def test_lossy_artificial(tmp_path): + path_bin = tmp_path / 'sine.bin' + path_cbin = tmp_path / 'sine.cbin' + + # Generate an artificial binary file. + arr = colored_sine() + assert arr.shape == (n_samples, n_channels) + M = np.abs(arr).max() + _write_arr(path_bin, _to_int16(arr, M)) + + # Compress it (lossless). 
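+    # (sample_rate, n_channels and dtype are required here because the flat
+    # .bin file has no header describing its own layout)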
+ compress(path_bin, out=path_cbin, sample_rate=sample_rate, n_channels=n_channels, dtype=dtype) + assert path_cbin.exists() + + # Compress it (lossy). + rank = 19 + path_lossy = ml.compress_lossy(path_cbin=path_cbin, rank=rank) + assert path_lossy.exists() + assert np.load(path_lossy).shape == (n_samples // ml.DOWNSAMPLE_FACTOR, rank) + + # Decompress. + lossy = ml.decompress_lossy(path_lossy) + assert arr.ndim == lossy.ndim + assert arr.shape[1] == lossy.shape[1] + assert arr.shape[0] - lossy.shape[0] == arr.shape[0] % ml.DOWNSAMPLE_FACTOR + + lossless = decompress(path_cbin) + + err = show_compare(lossless, lossy, 0, duration, threshold=.1, do_show=False) + assert err < .1 + + def test_lossy_local(): - EPHYS_DIR = Path(__file__).parent + EPHYS_DIR = Path(__file__).parent.resolve() path_cbin = EPHYS_DIR / "raw.cbin" + if not path_cbin.exists(): + logger.warning(f"skip test because {path_cbin} does not exist") + return rank = 40 max_chunks = 10 out_lossy = ml.compress_lossy( path_cbin=path_cbin, - chunks_excerpts=3, + chunks_excerpts=5, rank=rank, max_chunks=max_chunks, overwrite=True, @@ -114,4 +212,10 @@ def test_lossy_local(): lossless = decompress(path_cbin) lossy = ml.decompress_lossy(out_lossy) - show_compare(lossless, lossy, 0, .2) + # plt = _import_mpl() + # x = lossless_img - lossy_img + # plt.figure() + # plt.hist(np.abs(x).ravel(), bins=100) + + err = show_compare(lossless, lossy, 0, .2, do_show=False) + assert err < .1 From 15e38800f796fd45e2e073dd7be63c281a9e35a2 Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Tue, 8 Feb 2022 16:08:29 +0100 Subject: [PATCH 09/18] Add mtsloss CLI tool --- mtscomp/lossy.py | 69 ++++++++++++++++++++++++++++++++++++++++++++---- setup.py | 1 + 2 files changed, 65 insertions(+), 5 deletions(-) diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index dc99fce..7977ad2 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -21,7 +21,7 @@ from numpy.linalg import inv from numpy.lib.format import open_memmap -from .mtscomp import Bunch, decompress, Reader +from .mtscomp import Bunch, decompress, Reader, add_default_handler logger = logging.getLogger('mtscomp') @@ -278,7 +278,7 @@ def _decompress_chunk(lossy, svd, rank=None): def compress_lossy( path_cbin=None, cmeta=None, rank=None, max_chunks=0, - chunks_excerpts=CHUNKS_EXCERPTS, downsampling_factor=DOWNSAMPLE_FACTOR, + chunks_excerpts=None, downsampling_factor=None, overwrite=False, dry_run=False, out_lossy=None, out_svd=None): @@ -286,6 +286,9 @@ def compress_lossy( assert rank, "The rank must be set" assert path_cbin, "The raw ephys data file must be specified" + chunks_excerpts = chunks_excerpts or CHUNKS_EXCERPTS + downsampling_factor = downsampling_factor or DOWNSAMPLE_FACTOR + assert downsampling_factor >= 1 assert chunks_excerpts >= 2 @@ -295,6 +298,8 @@ def compress_lossy( ns = reader.n_samples nc = reader.n_channels n_chunks = reader.n_chunks if max_chunks == 0 else max_chunks + if max_chunks: + ns = max_chunks * sr # NOTE: assume 1-second chunks assert n_chunks > 0 assert sr > 0 @@ -314,15 +319,15 @@ def compress_lossy( if dry_run: return out_lossy - # Compute the SVD on an excerpt of the data. - svd = excerpt_svd(reader, rank, kept_chunks=chunks_excerpts) - # Create a new memmapped npy file if not overwrite and out_lossy.exists(): raise IOError(f"File {out_lossy} already exists.") shape = (ns // downsampling_factor, rank) lossy = open_memmap(out_lossy, 'w+', dtype=DTYPE, shape=shape) + # Compute the SVD on an excerpt of the data. 
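+    # (the basis is estimated from roughly chunks_excerpts evenly spaced
+    # chunks, i.e. every (n_chunks // chunks_excerpts)-th one, rather than
+    # from the whole recording)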
+ svd = excerpt_svd(reader, rank, kept_chunks=chunks_excerpts) + # Compress the data. offset = 0 for chunk_idx, chunk_start, chunk_length in tqdm( @@ -432,3 +437,57 @@ def decompress_lossy(path_lossy=None, path_svd=None): reader = LossyReader() reader.open(path_lossy, path_svd=path_svd) return reader + + +#------------------------------------------------------------------------------ +# Command-line API: mtscomp +#------------------------------------------------------------------------------ + +def mtsloss_parser(): + """Command-line interface to lossy-compress a .cbin file.""" + parser = argparse.ArgumentParser(description='Lossy compression of .cbin files.') + + parser.add_argument( + 'path', type=str, help='input path of a .cbin file') + + parser.add_argument( + 'out', type=str, nargs='?', + help='output path of the lossy-compressed file (.lossy.npy)') + + parser.add_argument( + 'outsvd', type=str, nargs='?', + help='output path of the compression metadata SVD file (.svd.npz)') + + parser.add_argument( + '--rank', type=int, help='number of SVD components to keep during compression') + + parser.add_argument( + '--excerpts', type=int, help='number of chunks to use when computing the SVD') + + parser.add_argument('--max-chunks', type=int, help='maximum number of chunks to compress') + + parser.add_argument('--overwrite', action='store_true', help='overwrite existing files') + + parser.add_argument('--dry', action='store_true', help='dry run') + + parser.add_argument('-v', '--debug', action='store_true', help='verbose') + + return parser + + +def mtsloss(args=None): + """Compress a file.""" + parser = mtsloss_parser() + pargs = parser.parse_args(args) + add_default_handler('DEBUG' if pargs.debug else 'INFO') + + compress_lossy( + path_cbin=pargs.path, + out_lossy=pargs.out, + out_svd=pargs.outsvd, + chunks_excerpts=pargs.excerpts, + rank=pargs.rank, + max_chunks=pargs.max_chunks, + overwrite=pargs.overwrite, + dry_run=pargs.dry, + ) diff --git a/setup.py b/setup.py index db82c8d..d3d1150 100644 --- a/setup.py +++ b/setup.py @@ -51,6 +51,7 @@ 'mtsdecomp=mtscomp:mtsdecomp', 'mtsdesc=mtscomp:mtsdesc', 'mtschop=mtscomp:mtschop', + 'mtsloss=mtscomp.lossy:mtsloss', ], }, include_package_data=True, From d4efa3c506439e156583292537c04976bf0d72d2 Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Wed, 9 Feb 2022 10:49:54 +0100 Subject: [PATCH 10/18] Fixing lossy compression scaling --- mtscomp/lossy.py | 40 ++++++++++++++++++++++++++++++++-------- tests/tests_lossy.py | 30 ++++++++++++++---------------- 2 files changed, 46 insertions(+), 24 deletions(-) diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index 7977ad2..01fca77 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -48,7 +48,7 @@ class SVD(Bunch): def __init__( - self, U, sigma, rank=None, ab=None, + self, U, sigma, rank=None, ab=None, minmax=None, q01=None, sample_rate=None, downsample_factor=DOWNSAMPLE_FACTOR): super(SVD, self).__init__() self.U = U @@ -61,6 +61,8 @@ def __init__( self.ab = ab self.sample_rate = sample_rate self.downsample_factor = downsample_factor + self.minmax = minmax + self.q01 = q01 def save(self, path): assert self.U is not None @@ -71,6 +73,8 @@ def save(self, path): assert self.rank >= 1 assert self.sample_rate > 0 assert self.downsample_factor >= 1 + assert self.minmax is not None + assert self.q01 is not None np.savez( path, @@ -78,6 +82,8 @@ def save(self, path): # Usigma_inv=self.Usigma_inv, sigma=self.sigma, ab=self.ab, + minmax=np.array(self.minmax), + q01=np.array(self.q01), # NOTE: need to convert 
to regular arrays for np.savez rank=np.array([self.rank]), @@ -86,8 +92,8 @@ def save(self, path): ) def __repr__(self): - # return f"" - return super(SVD, self).__repr__() + return f"" + # return super(SVD, self).__repr__() def load_svd(path): @@ -100,6 +106,8 @@ def load_svd(path): rank=int(d['rank'][0]), sample_rate=d['sample_rate'][0], downsample_factor=int(d['downsample_factor'][0]), + minmax=d['minmax'], + q01=d['q01'], ) assert svd.n_channels >= 1 return svd @@ -134,8 +142,8 @@ def _svd(x): def _uint8_coefs(x, margin=UINT8_MARGIN): - # m, M = np.quantile(x, UINT8_MARGIN), np.quantile(x, 1 - UINT8_MARGIN) - + # k = .01 + # m, M = np.quantile(x, k), np.quantile(x, 1 - k) m, M = x.min(), x.max() d = M - m assert d > 0 @@ -230,6 +238,10 @@ def excerpt_svd(reader, rank, kept_chunks=CHUNKS_EXCERPTS): svd = _svd(excerpts) assert svd.U.shape[0] == reader.n_channels + svd.minmax = (excerpts.min(), excerpts.max()) + k = .01 + svd.q01 = (np.quantile(excerpts.ravel(), k), np.quantile(excerpts.ravel(), 1 - k)) + svd.sample_rate = reader.sample_rate svd.downsample_factor = DOWNSAMPLE_FACTOR svd.rank = min(rank, svd.n_channels) @@ -269,7 +281,10 @@ def _decompress_chunk(lossy, svd, rank=None): assert rank > 0 - return (U[:, :rank] @ np.diag(sigma[:rank]) @ lossy[:rank, :]) + # arr is (nc, ns) + arr = (U[:, :rank] @ np.diag(sigma[:rank]) @ lossy[:rank, :]) + + return arr #------------------------------------------------------------------------------ @@ -413,6 +428,7 @@ def open(self, path_lossy=None, path_svd=None): self.size_bytes = self._lossy.size * self._lossy.itemsize self.itemsize = 1 self.dtype = DTYPE + self.minmax = self._svd.minmax # used for cmap scaling visualization size_original = 2 * self.n_channels * self.n_samples self.compression = size_original / float(self.size_bytes) @@ -421,12 +437,20 @@ def _decompress(self, lossy, rank=None): lossy_float = from_uint8(lossy, self._svd.ab).T return _decompress_chunk(lossy_float, self._svd, rank=rank).T - def get(self, t0, t1, rank=None): + def get(self, t0, t1, rank=None, cast_to_uint8=False): ds = self._svd.downsample_factor i0 = int(round(t0 * float(self.sample_rate) / ds)) i1 = int(round(t1 * float(self.sample_rate) / ds)) lossy = self._lossy[i0:i1] - return self._decompress(lossy, rank=rank) + arr = self._decompress(lossy, rank=rank) + if cast_to_uint8: + m, M = self.minmax + d = M - m + assert d > 0 + a = MAX_UINT8 / d + b = m + arr, _ = to_uint8(arr, ab=(a, b)) + return arr def __getitem__(self, idx): lossy = self._lossy[idx] diff --git a/tests/tests_lossy.py b/tests/tests_lossy.py index 602bfc3..d731096 100644 --- a/tests/tests_lossy.py +++ b/tests/tests_lossy.py @@ -32,7 +32,7 @@ #------------------------------------------------------------------------------ _INT16_MAX = 32766 -ERROR_THRESHOLD = .03 # the error is the % of values that differ by more than this percent +ERROR_THRESHOLD = .08 # the error is the % of values that differ by more than this percent n_channels = 19 sample_rate = 1234. 
@@ -111,13 +111,9 @@ def _prepare_compare(lossless, lossy, t0, t1): lossless_img = ml._preprocess(lossless[i0:i1]) lossy_img = lossy.get(t0, t1).T - vmin = min(lossless_img.min(), lossy_img.min()) - vmax = max(lossless_img.max(), lossy_img.max()) - lossless_img -= vmin - lossy_img -= vmin - v = vmax - vmin - return lossless_img, lossy_img, v + mM = lossy._svd.minmax + return lossless_img, lossy_img, mM def _compute_error(lossless_img, lossy_img, threshold=ERROR_THRESHOLD): @@ -129,19 +125,19 @@ def show_compare(lossless, lossy, t0, t1, threshold=ERROR_THRESHOLD, do_show=Tru assert isinstance(lossless, Reader) assert isinstance(lossy, ml.LossyReader) - lossless_img, lossy_img, v = _prepare_compare(lossless, lossy, t0, t1) + lossless_img, lossy_img, (m, M) = _prepare_compare(lossless, lossy, t0, t1) err = _compute_error(lossless_img, lossy_img, threshold=threshold) print(f"Relative error is {err * 100:.1f}%.") title = f"rank={lossy.rank}, {lossy.compression:.1f}x compression, error {err * 100:.1f}%" - nrows = 3 + nrows = 2 plt = _import_mpl() fix, axs = plt.subplots(nrows, 1, sharex=True) - _show_img(axs[0], lossless_img, 'original', vmin=0, vmax=v) - _show_img(axs[1], lossy_img, title, vmin=0, vmax=v) - _show_img(axs[2], lossless_img - lossy_img, 'residual', vmin=0, vmax=v) + _show_img(axs[0], lossless_img, 'original', vmin=m, vmax=M) + _show_img(axs[1], lossy_img, title, vmin=m, vmax=M) + # _show_img(axs[2], lossless_img - lossy_img, 'residual', vmin=0, vmax=v) n_ticks = 5 ticks = np.linspace(0, lossless_img.shape[1], n_ticks) @@ -172,7 +168,7 @@ def test_lossy_artificial(tmp_path): assert path_cbin.exists() # Compress it (lossy). - rank = 19 + rank = 8 path_lossy = ml.compress_lossy(path_cbin=path_cbin, rank=rank) assert path_lossy.exists() assert np.load(path_lossy).shape == (n_samples // ml.DOWNSAMPLE_FACTOR, rank) @@ -186,7 +182,7 @@ def test_lossy_artificial(tmp_path): lossless = decompress(path_cbin) err = show_compare(lossless, lossy, 0, duration, threshold=.1, do_show=False) - assert err < .1 + assert err < 1 def test_lossy_local(): @@ -217,5 +213,7 @@ def test_lossy_local(): # plt.figure() # plt.hist(np.abs(x).ravel(), bins=100) - err = show_compare(lossless, lossy, 0, .2, do_show=False) - assert err < .1 + hw = .1 + t = hw + err = show_compare(lossless, lossy, t - hw, t + hw, do_show=False) + assert err < 1 From 4c12d247daba5d1d6c44022291eff77b0aeea723 Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Wed, 23 Feb 2022 12:23:41 +0100 Subject: [PATCH 11/18] Add docstrings in lossy compression code --- mtscomp/lossy.py | 327 +++++++++++++++++++++++++++++++++++++------ mtscomp/mtscomp.py | 4 +- tests/tests_lossy.py | 2 +- 3 files changed, 290 insertions(+), 43 deletions(-) diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index 01fca77..b774ac7 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -3,9 +3,9 @@ """SVD-based raw ephys data lossy compression.""" -#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------- # Imports -#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------- import argparse from functools import lru_cache @@ -28,9 +28,9 @@ logger.setLevel(logging.DEBUG) -#------------------------------------------------------------------------------ 
+#------------------------------------------------------------------------------------------------- # Constants -#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------- DOWNSAMPLE_FACTOR = 6 CHUNKS_EXCERPTS = 20 @@ -42,29 +42,39 @@ MAX_UINT8 = 255 -#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------- # Util classes -#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------- class SVD(Bunch): + """A special dictionary that holds information about lossy compression. + + It mostly holds the SVD matrices necessary to reconstruct the original signal, as well + as scaling factors to resample to/from uint8. + + """ + def __init__( self, U, sigma, rank=None, ab=None, minmax=None, q01=None, sample_rate=None, downsample_factor=DOWNSAMPLE_FACTOR): super(SVD, self).__init__() - self.U = U - self.n_channels = U.shape[0] + self.U = U # the "U" in the "U @ sigma @ V" SVD decomposition + self.n_channels = U.shape[0] # number of channels assert sigma.shape == (self.n_channels,) - self.Usigma_inv = inv(U @ np.diag(sigma)) + + self.Usigma_inv = inv(U @ np.diag(sigma)) # inverse of "U @ sigma" assert self.Usigma_inv.shape == self.U.shape - self.sigma = sigma - self.rank = rank - self.ab = ab - self.sample_rate = sample_rate - self.downsample_factor = downsample_factor - self.minmax = minmax - self.q01 = q01 + + self.sigma = sigma # the diagonal of the SVD decomposition + self.rank = rank # the number of SVD components to keep + self.ab = ab # the uint8 scaling factors "y = ax+b" + self.sample_rate = sample_rate # the sampling rate + self.downsample_factor = downsample_factor # the downsample factor, an integer + self.minmax = minmax # the min and max of the signal across all channels + self.q01 = q01 # the 0.01 and 0.99 quantiles of the signal (unused for now) def save(self, path): + """Save this SVD object to a .npz file.""" assert self.U is not None assert self.Usigma_inv is not None assert self.sigma is not None @@ -79,7 +89,6 @@ def save(self, path): np.savez( path, U=self.U, - # Usigma_inv=self.Usigma_inv, sigma=self.sigma, ab=self.ab, minmax=np.array(self.minmax), @@ -93,14 +102,13 @@ def save(self, path): def __repr__(self): return f"" - # return super(SVD, self).__repr__() def load_svd(path): + """Load a .npz file containing the SVD information, and return a SVD object.""" d = np.load(path) svd = SVD( U=d['U'], - # Usigma_inv=d['Usigma_inv'], sigma=d['sigma'], ab=d['ab'], rank=int(d['rank'][0]), @@ -113,11 +121,20 @@ def load_svd(path): return svd -#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------- # Util functions -#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------- def _car(x): + """Common average referencing (remove the mean along time). + + Parameters + ---------- + + x : ndarray (n_samples, n_channels) + Signal. 
+ + """ assert x.ndim == 2 # ns, nc assert x.shape[0] > x.shape[1] @@ -127,6 +144,7 @@ def _car(x): def _downsample(x, factor=DOWNSAMPLE_FACTOR): + """Hard downsampling.""" assert x.ndim == 2 ns, nc = x.shape # ns, nc @@ -137,11 +155,25 @@ def _downsample(x, factor=DOWNSAMPLE_FACTOR): def _svd(x): + """Compute the SVD of a signal. Return a SVD object. + + Parameters + ---------- + + x : ndarray (n_channels, n_samples) + Signal. + + """ + assert x.ndim == 2 + # nc, ns + assert x.shape[0] < x.shape[1] + U, sigma, _ = np.linalg.svd(x, full_matrices=False) return SVD(U, sigma) def _uint8_coefs(x, margin=UINT8_MARGIN): + """Compute the (a, b) rescaling coefficients to downsample a signal to uint8.""" # k = .01 # m, M = np.quantile(x, k), np.quantile(x, 1 - k) m, M = x.min(), x.max() @@ -157,6 +189,7 @@ def _uint8_coefs(x, margin=UINT8_MARGIN): def to_uint8(x, ab=None): + """Downsample a signal to uint8. The rescaling coefficients can be passed or recomputed.""" a, b = ab if ab is not None else _uint8_coefs(x) y = (x - b) * a @@ -172,15 +205,33 @@ def to_uint8(x, ab=None): def from_uint8(y, ab): + """Resample a uint8 signal to a float32 signal, using the rescaling coefficients.""" a, b = ab return y.astype(np.float32) / a + b -#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------- # Processing functions -#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------- + +def _preprocess_default(raw): + """Default preprocessing function: CAR and 6x downsampling. + + Note: this function transposes the array. + + Parameters + ---------- + + raw : ndarray (n_samples, n_channels) + Signal. + + Returns + ------- + + pp : ndarray (n_channels, n_samples) + + """ -def _preprocess(raw): assert raw.shape[0] > raw.shape[1] # raw is (ns, nc) @@ -195,10 +246,31 @@ def _preprocess(raw): return pp -def _get_excerpts(reader, kept_chunks=CHUNKS_EXCERPTS): +def _get_excerpts(reader, kept_chunks=CHUNKS_EXCERPTS, preprocess=None): + """Get evenly-spaced excerpts of a mtscomp-compressed file. + + Parameters + ---------- + + reader : mtscomp.Reader + The input array. + kept_chunks : int (default: 20) + The number of 1-second excerpts to keep. + preprocess : function `(n_samples, n_channels) int16 => (n_channels, n_samples) float32` + The preprocessing function to run on each 1-second excerpt. + + Returns + ------- + + excerpts : ndarray (n_channels, n_samples_excerpts) + The excerpts concatenated along the time axis. + + """ + assert reader assert isinstance(reader, Reader) assert kept_chunks >= 2 + preprocess = preprocess or _preprocess_default arrs = [] n = 0 @@ -214,7 +286,7 @@ def _get_excerpts(reader, kept_chunks=CHUNKS_EXCERPTS): # chunk is (ns, nc) assert chunk.shape[0] > chunk.shape[1] - pp = _preprocess(chunk) + pp = preprocess(chunk) # pp is (nc, ns) assert chunk.shape[0] > chunk.shape[1] @@ -228,9 +300,31 @@ def _get_excerpts(reader, kept_chunks=CHUNKS_EXCERPTS): return excerpts -def excerpt_svd(reader, rank, kept_chunks=CHUNKS_EXCERPTS): +def excerpt_svd(reader, rank, kept_chunks=CHUNKS_EXCERPTS, preprocess=None): + """Compute the SVD on evenly-spaced excerpts of a mtscomp-compressed file. + + Parameters + ---------- + + reader : mtscomp.Reader + The input array. + rank : int + The number of SVD components to keep. 
+    kept_chunks : int (default: 20)
+        The number of 1-second excerpts to keep.
+    preprocess : function `(n_samples, n_channels) int16 => (n_channels, n_samples) float32`
+        The preprocessing function to run on each 1-second excerpt.
+
+    Returns
+    -------
+
+    svd : SVD instance
+        An object containing the SVD information.
+
+    """
+
     assert rank
-    excerpts = _get_excerpts(reader, kept_chunks=kept_chunks)
+    excerpts = _get_excerpts(reader, kept_chunks=kept_chunks, preprocess=preprocess)
 
     # excerpts is (nc, ns)
     assert excerpts.shape[0] < excerpts.shape[1]
@@ -250,11 +344,32 @@ def excerpt_svd(reader, rank, kept_chunks=CHUNKS_EXCERPTS):
     return svd
 
 
-def _compress_chunk(raw, svd):
+def _compress_chunk(raw, svd, preprocess=None):
+    """Compress a chunk of data.
+
+    Parameters
+    ----------
+
+    raw : ndarray (n_samples, n_channels)
+        The input array.
+    svd : SVD instance
+        The SVD object returned by `excerpt_svd()`
+    preprocess : function `(n_samples, n_channels) int16 => (n_channels, n_samples) float32`
+        The preprocessing function to run on each chunk.
+
+    Returns
+    -------
+
+    lossy : ndarray (rank, n_samples)
+        The compressed signal.
+
+    """
+
     # raw is (ns, nc)
     assert raw.shape[0] > raw.shape[1]
+    preprocess = preprocess or _preprocess_default
 
-    pp = _preprocess(raw)
+    pp = preprocess(raw)
 
     # pp is (nc, ns)
     assert pp.shape[0] < pp.shape[1]
@@ -262,13 +377,34 @@ def _compress_chunk(raw, svd):
     assert rank > 0
 
     lossy = (svd.Usigma_inv @ pp)[:rank, :]
-    # lossy is (nc, ns)
+    # lossy is (rank, ns)
    assert lossy.shape[0] < lossy.shape[1]
 
     return lossy
 
 
 def _decompress_chunk(lossy, svd, rank=None):
+    """Decompress a chunk of data.
+
+    Parameters
+    ----------
+
+    lossy : ndarray (rank, n_samples)
+        The lossy-compressed array.
+    svd : SVD instance
+        The SVD object returned by `excerpt_svd()`
+    rank : int (default: None)
+        If set, override the SVD rank (must not exceed the SVD rank).
+        Used to simulate reconstruction with a smaller rank.
+
+    Returns
+    -------
+
+    arr : ndarray (n_channels, n_samples)
+        The reconstructed signal.
+
+    """
+
     # lossy is (nc, ns)
     assert lossy.shape[0] < lossy.shape[1]
 
@@ -287,19 +423,59 @@ def _decompress_chunk(lossy, svd, rank=None):
     return arr
 
 
-#------------------------------------------------------------------------------
+#-------------------------------------------------------------------------------------------------
 # Compressor
-#------------------------------------------------------------------------------
+#-------------------------------------------------------------------------------------------------
 
 def compress_lossy(
         path_cbin=None, cmeta=None, rank=None, max_chunks=0,
         chunks_excerpts=None, downsampling_factor=None,
-        overwrite=False, dry_run=False,
+        preprocess=None, overwrite=False, dry_run=False,
         out_lossy=None, out_svd=None):
+    """Compress a .cbin file.
+
+    Parameters
+    ----------
+
+    path_cbin : str or Path
+        Path to the compressed data binary file (typically `.cbin` file extension).
+    cmeta : str or Path (default: None)
+        Path to the compression header JSON file (typically `.ch` file extension).
+    rank : int (mandatory)
+        Number of SVD components to keep in the compressed file.
+    max_chunks : int (default: 0)
+        Maximum number of chunks to compress (use 0 to compress the entire file).
+    chunks_excerpts : int (default: 20)
+        Number of evenly-spaced 1-second chunks to extract to compute the SVD.
+    downsampling_factor : int
+        Factor by which the original signal is downsampled along the time axis.
+    preprocess : function `(n_samples, n_channels) int16 => (n_channels, n_samples) float32`
+        The preprocessing function to run on each chunk.
+    overwrite : bool (default: False)
+        Whether the lossy compressed files may be overwritten.
+    dry_run : bool (default: False)
+        If True, the lossy compressed files will not be written.
+    out_lossy : str or Path
+        Path to the output `.lossy.npy` file.
+    out_svd : str or Path
+        Path to the output `.svd.npz` SVD file.
+
+    Returns
+    -------
+
+    out_lossy : str or Path
+        Path to the output `.lossy.npy` file.
+
+    """
 
     # Check arguments.
     assert rank, "The rank must be set"
     assert path_cbin, "The raw ephys data file must be specified"
+    preprocess = preprocess or _preprocess_default
 
     chunks_excerpts = chunks_excerpts or CHUNKS_EXCERPTS
     downsampling_factor = downsampling_factor or DOWNSAMPLE_FACTOR
@@ -341,7 +517,7 @@
         lossy = open_memmap(out_lossy, 'w+', dtype=DTYPE, shape=shape)
 
     # Compute the SVD on an excerpt of the data.
-    svd = excerpt_svd(reader, rank, kept_chunks=chunks_excerpts)
+    svd = excerpt_svd(reader, rank, kept_chunks=chunks_excerpts, preprocess=preprocess)
 
     # Compress the data.
     offset = 0
@@ -359,7 +535,7 @@ def compress_lossy(
         assert _ == nc
 
         # Compress the chunk.
-        chunk_lossy = _compress_chunk(raw, svd)
+        chunk_lossy = _compress_chunk(raw, svd, preprocess=preprocess)
 
         # chunk_lossy is (nc, ns)
         assert chunk_lossy.shape[0] < chunk_lossy.shape[1]
@@ -382,16 +558,29 @@ def compress_lossy(
     return out_lossy
 
 
-#------------------------------------------------------------------------------
+#-------------------------------------------------------------------------------------------------
 # Decompressor
-#------------------------------------------------------------------------------
+#-------------------------------------------------------------------------------------------------
 
 class LossyReader:
+    """Array-like interface to the reconstruction of a lossy compressed file."""
+
     def __init__(self):
         self.path_lossy = None
         self.path_svd = None
 
     def open(self, path_lossy=None, path_svd=None):
+        """Open a .lossy.npy/.svd.npz pair of files.
+
+        Parameters
+        ----------
+
+        path_lossy : str or Path
+            Path to the lossy compressed file (typically `.lossy.npy` file extension).
+        path_svd : str or Path (default: None)
+            Path to the lossy compressed SVD file (typically `.svd.npz` file extension).
+
+        """
         assert path_lossy
 
         if path_svd is None:
@@ -434,10 +623,49 @@ def open(self, path_lossy=None, path_svd=None):
         self.compression = size_original / float(self.size_bytes)
 
     def _decompress(self, lossy, rank=None):
+        """Decompress a chunk.
+
+        Parameters
+        ----------
+
+        lossy : ndarray (n_samples, rank)
+            Compressed array.
+        rank : int (default: None)
+            If set, overrides the number of SVD components used for the reconstruction.
+
+        Returns
+        -------
+
+        arr : ndarray (n_samples, n_channels)
+            The reconstructed signal.
+
+        """
+
         lossy_float = from_uint8(lossy, self._svd.ab).T
         return _decompress_chunk(lossy_float, self._svd, rank=rank).T
 
     def get(self, t0, t1, rank=None, cast_to_uint8=False):
+        """Return the reconstructed signal between two times (in seconds).
+
+        Parameters
+        ----------
+
+        t0 : float
+            Start time.
+        t1 : float
+            End time.
+        rank : int (default: None)
+            If set, overrides the number of SVD components used for the reconstruction.
+        cast_to_uint8 : bool (default: False)
+            Whether the reconstructed signal should be rescaled to uint8 (for visualization purposes).
+
+        Returns
+        -------
+
+        arr : ndarray (n_samples, n_channels)
+            The reconstructed signal.
+
+        """
         ds = self._svd.downsample_factor
         i0 = int(round(t0 * float(self.sample_rate) / ds))
         i1 = int(round(t1 * float(self.sample_rate) / ds))
         lossy = self._lossy[i0:i1]
@@ -453,19 +681,38 @@ def get(self, t0, t1, rank=None, cast_to_uint8=False):
     def __getitem__(self, idx):
+        """Array-like interface."""
         lossy = self._lossy[idx]
         return self._decompress(lossy)
 
 
 def decompress_lossy(path_lossy=None, path_svd=None):
+    """Decompress a .lossy.npy/.svd.npz pair of lossy compressed files.
+
+    Parameters
+    ----------
+
+    path_lossy : str or Path
+        Path to the `.lossy.npy` file.
+    path_svd : str or Path (default: None)
+        Path to the `.svd.npz` SVD file.
+
+    Returns
+    -------
+
+    reader : LossyReader instance
+        An array-like interface to the reconstructed signal.
+
+    """
+
    reader = LossyReader()
    reader.open(path_lossy, path_svd=path_svd)
    return reader
 
 
-#------------------------------------------------------------------------------
+#-------------------------------------------------------------------------------------------------
 # Command-line API: mtscomp
-#------------------------------------------------------------------------------
+#-------------------------------------------------------------------------------------------------
 
 def mtsloss_parser():
     """Command-line interface to lossy-compress a .cbin file."""
diff --git a/mtscomp/mtscomp.py b/mtscomp/mtscomp.py
index 475def9..f4b330a 100644
--- a/mtscomp/mtscomp.py
+++ b/mtscomp/mtscomp.py
@@ -428,9 +428,9 @@ def write(self, out, outmeta):
         ----------
 
         out : str or Path
-            Path to the compressed data binary file (typically ̀.cbin` file extension).
+            Path to the compressed data binary file (typically `.cbin` file extension).
         outmeta : str or Path
-            Path to the compression header JSON file (typicall `.ch` file extension).
+            Path to the compression header JSON file (typically `.ch` file extension).
Returns ------- diff --git a/tests/tests_lossy.py b/tests/tests_lossy.py index d731096..cefd4a3 100644 --- a/tests/tests_lossy.py +++ b/tests/tests_lossy.py @@ -109,7 +109,7 @@ def _prepare_compare(lossless, lossy, t0, t1): i0 = int(round(t0 * sr)) i1 = int(round(t1 * sr)) - lossless_img = ml._preprocess(lossless[i0:i1]) + lossless_img = ml._preprocess_default(lossless[i0:i1]) lossy_img = lossy.get(t0, t1).T mM = lossy._svd.minmax From 9e7f2f446c87a8bcb009867df9d257d6ba24d7db Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Wed, 23 Feb 2022 14:06:10 +0100 Subject: [PATCH 12/18] ArrayReader mtscomp-like wrapper for regular NumPy arrays --- mtscomp/lossy.py | 56 +++++++++++++++++++++++++++++++------- tests/tests_lossy.py | 64 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 109 insertions(+), 11 deletions(-) diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index b774ac7..86e963e 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -268,7 +268,6 @@ def _get_excerpts(reader, kept_chunks=CHUNKS_EXCERPTS, preprocess=None): """ assert reader - assert isinstance(reader, Reader) assert kept_chunks >= 2 preprocess = preprocess or _preprocess_default @@ -423,16 +422,51 @@ def _decompress_chunk(lossy, svd, rank=None): return arr +#------------------------------------------------------------------------------------------------- +# Mock Reader classes +#------------------------------------------------------------------------------------------------- + +class ArrayReader: + """Wrapper to an array-like object, that provides the same interface as a mtscomp.Reader.""" + + def __init__(self, arr, sample_rate=None): + assert sample_rate > 0 + self.sample_rate = sample_rate + self.chunk_length = int(np.ceil(sample_rate)) + + self._arr = arr + assert arr.ndim == 2 + # arr shape is (n_samples, n_channels) + assert arr.shape[0] > arr.shape[1] + self.shape = arr.shape + self.n_samples, self.n_channels = self.shape + self.n_chunks = int(np.ceil(self.n_samples / float(sample_rate))) + + def iter_chunks(self, last_chunk=None): + offset = 0 + n = (last_chunk + 1) if last_chunk else self.n_chunks + for i in range(self.n_chunks): + # (chunk_idx, chunk_start, chunk_length) + yield (i, offset, min(self.chunk_length, self.n_samples - offset)) + offset += self.chunk_length + + def read_chunk(self, chunk_idx, chunk_start, chunk_length): + return self._arr[chunk_start:chunk_start + chunk_length, :] + + def __getitem__(self, idx): + return self._arr[idx] + + #------------------------------------------------------------------------------------------------- # Compressor #------------------------------------------------------------------------------------------------- def compress_lossy( - path_cbin=None, cmeta=None, rank=None, max_chunks=0, + path_cbin=None, cmeta=None, reader=None, rank=None, max_chunks=0, chunks_excerpts=None, downsampling_factor=None, preprocess=None, overwrite=False, dry_run=False, out_lossy=None, out_svd=None): - """Compress a .cbin file. + """Compress a .cbin file or an arbitrary signal array. Parameters ---------- @@ -441,6 +475,8 @@ def compress_lossy( Path to the compressed data binary file (typically ̀.cbin` file extension). cmeta : str or Path (default: None) Path to the compression header JSON file (typically `.ch` file extension). + reader : Reader instance + A mtscomp.Reader or ArrayReader object. rank : int (mandatory) Number of SVD components to keep in the compressed file. max_chunks : int (default: None) @@ -474,7 +510,7 @@ def compress_lossy( # Check arguments. 
assert rank, "The rank must be set" - assert path_cbin, "The raw ephys data file must be specified" + assert path_cbin or reader, "The raw ephys data file must be specified" preprocess = preprocess or _preprocess_default chunks_excerpts = chunks_excerpts or CHUNKS_EXCERPTS @@ -484,7 +520,9 @@ def compress_lossy( assert chunks_excerpts >= 2 # Create a mtscomp Reader. - reader = decompress(path_cbin, cmeta=cmeta) + if path_cbin: + reader = decompress(path_cbin, cmeta=cmeta) + assert reader sr = int(reader.sample_rate) ns = reader.n_samples nc = reader.n_channels @@ -499,13 +537,13 @@ def compress_lossy( assert rank <= nc, "The rank cannot exceed the number of channels" # Filenames. - if out_lossy is None: + if out_lossy is None and path_cbin: out_lossy = Path(path_cbin).with_suffix(FILE_EXTENSION_LOSSY) - assert out_lossy + assert out_lossy, "An output file path for the .lossy.npy file must be provided" - if out_svd is None: + if out_svd is None and path_cbin: out_svd = Path(path_cbin).with_suffix(FILE_EXTENSION_SVD) - assert out_svd + assert out_svd, "An output file path for the .svd.npz file must be provided" if dry_run: return out_lossy diff --git a/tests/tests_lossy.py b/tests/tests_lossy.py index cefd4a3..7464f07 100644 --- a/tests/tests_lossy.py +++ b/tests/tests_lossy.py @@ -102,7 +102,6 @@ def _show_img(ax, x, title, vmin=None, vmax=None): def _prepare_compare(lossless, lossy, t0, t1): - assert isinstance(lossless, Reader) assert isinstance(lossy, ml.LossyReader) sr = lossless.sample_rate @@ -122,7 +121,6 @@ def _compute_error(lossless_img, lossy_img, threshold=ERROR_THRESHOLD): def show_compare(lossless, lossy, t0, t1, threshold=ERROR_THRESHOLD, do_show=True): - assert isinstance(lossless, Reader) assert isinstance(lossy, ml.LossyReader) lossless_img, lossy_img, (m, M) = _prepare_compare(lossless, lossy, t0, t1) @@ -217,3 +215,65 @@ def test_lossy_local(): t = hw err = show_compare(lossless, lossy, t - hw, t + hw, do_show=False) assert err < 1 + + +def preprocess(raw): + """Default preprocessing function: CAR and 6x downsampling. + + Note: this function transposes the array. + + Parameters + ---------- + + raw : ndarray (n_samples, n_channels) + Signal. + + Returns + ------- + + pp : ndarray (n_channels, n_samples) + + """ + + assert raw.shape[0] > raw.shape[1] + # raw is (ns, nc) + + pp = ml._car(raw) + # pp is (ns, nc) + assert pp.shape[0] > pp.shape[1] + + pp = ml._downsample(pp, factor=ml.DOWNSAMPLE_FACTOR) + # pp is (nc, ns) + assert pp.shape[0] < pp.shape[1] + + return pp + + +def test_lossy_array(tmp_path): + path_lossy = tmp_path / 'sine.lossy.npy' + path_svd = tmp_path / 'sine.svd.npz' + + # Generate an artificial binary file. + arr = colored_sine() + assert arr.shape == (n_samples, n_channels) + M = np.abs(arr).max() + + # mtscomp-like interface for a regular NumPy array. + reader = ml.ArrayReader(arr, sample_rate=sample_rate) + + # Compress it (lossy). + rank = 8 + path_lossy = ml.compress_lossy( + reader=reader, rank=rank, out_lossy=path_lossy, out_svd=path_svd, + preprocess=preprocess) + assert path_lossy.exists() + assert np.load(path_lossy).shape == (n_samples // ml.DOWNSAMPLE_FACTOR, rank) + + # Decompress. 
+ lossy = ml.decompress_lossy(path_lossy) + assert arr.ndim == lossy.ndim + assert arr.shape[1] == lossy.shape[1] + assert arr.shape[0] - lossy.shape[0] == arr.shape[0] % ml.DOWNSAMPLE_FACTOR + + err = show_compare(reader, lossy, 0, duration, threshold=.1, do_show=False) + assert err < 1 From 7817539a74aa16b6bbf0cd026ed266c3feec389a Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Wed, 23 Feb 2022 14:24:04 +0100 Subject: [PATCH 13/18] Bug fix in lossy compression --- mtscomp/lossy.py | 6 ++++-- tests/tests_lossy.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index 86e963e..fb42b91 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -240,7 +240,7 @@ def _preprocess_default(raw): assert pp.shape[0] > pp.shape[1] pp = _downsample(pp, factor=DOWNSAMPLE_FACTOR) - # pp is (nc, ns) + # pp is (nc', ns) assert pp.shape[0] < pp.shape[1] return pp @@ -579,7 +579,9 @@ def compress_lossy( # Write the compressed chunk to disk. l = chunk_lossy.shape[1] - assert offset + l <= shape[0] + k = min(l, shape[0] - offset) + assert k <= l + chunk_lossy = chunk_lossy[:, :k] lossy[offset:offset + l, :], ab = to_uint8(chunk_lossy.T, svd.ab) # NOTE: keep the ab scaling factors for uint8 conversion only for the first chunk if svd.ab is None: diff --git a/tests/tests_lossy.py b/tests/tests_lossy.py index 7464f07..497ed07 100644 --- a/tests/tests_lossy.py +++ b/tests/tests_lossy.py @@ -243,7 +243,7 @@ def preprocess(raw): assert pp.shape[0] > pp.shape[1] pp = ml._downsample(pp, factor=ml.DOWNSAMPLE_FACTOR) - # pp is (nc, ns) + # pp is (nc', ns) assert pp.shape[0] < pp.shape[1] return pp From 30e1574c3458578685db9e9fbf2252184f273740 Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Wed, 16 Mar 2022 15:36:41 +0100 Subject: [PATCH 14/18] Use quantile for bit depth subsampling --- mtscomp/lossy.py | 42 ++++++++++++++++++++++-------------------- tests/tests_lossy.py | 4 ++-- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index fb42b91..b59de70 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -32,12 +32,13 @@ # Constants #------------------------------------------------------------------------------------------------- -DOWNSAMPLE_FACTOR = 6 +DOWNSAMPLE_FACTOR = 4 CHUNKS_EXCERPTS = 20 FILE_EXTENSION_LOSSY = '.lossy.npy' FILE_EXTENSION_SVD = '.svd.npz' -UINT8_MARGIN = .05 +DEFAULT_QUANTILE = .0025 +# UINT8_MARGIN = .05 DTYPE = np.uint8 MAX_UINT8 = 255 @@ -55,7 +56,7 @@ class SVD(Bunch): """ def __init__( - self, U, sigma, rank=None, ab=None, minmax=None, q01=None, + self, U, sigma, rank=None, ab=None, minmax=None, quantile=None, sample_rate=None, downsample_factor=DOWNSAMPLE_FACTOR): super(SVD, self).__init__() self.U = U # the "U" in the "U @ sigma @ V" SVD decomposition @@ -71,7 +72,7 @@ def __init__( self.sample_rate = sample_rate # the sampling rate self.downsample_factor = downsample_factor # the downsample factor, an integer self.minmax = minmax # the min and max of the signal across all channels - self.q01 = q01 # the 0.01 and 0.99 quantiles of the signal (unused for now) + self.quantile = quantile # the DEFAULT_QUANTILE and 1-DEFAULT_QUANTILE quantiles of the signal def save(self, path): """Save this SVD object to a .npz file.""" @@ -84,7 +85,7 @@ def save(self, path): assert self.sample_rate > 0 assert self.downsample_factor >= 1 assert self.minmax is not None - assert self.q01 is not None + assert self.quantile is not None np.savez( path, @@ -92,7 +93,7 @@ def save(self, path): 
sigma=self.sigma, ab=self.ab, minmax=np.array(self.minmax), - q01=np.array(self.q01), + quantile=np.array(self.quantile), # NOTE: need to convert to regular arrays for np.savez rank=np.array([self.rank]), @@ -115,7 +116,7 @@ def load_svd(path): sample_rate=d['sample_rate'][0], downsample_factor=int(d['downsample_factor'][0]), minmax=d['minmax'], - q01=d['q01'], + quantile=d['quantile'], ) assert svd.n_channels >= 1 return svd @@ -172,16 +173,15 @@ def _svd(x): return SVD(U, sigma) -def _uint8_coefs(x, margin=UINT8_MARGIN): +def _uint8_coefs(x, q=DEFAULT_QUANTILE): """Compute the (a, b) rescaling coefficients to downsample a signal to uint8.""" - # k = .01 - # m, M = np.quantile(x, k), np.quantile(x, 1 - k) - m, M = x.min(), x.max() + m, M = np.quantile(x, q), np.quantile(x, 1 - q) + # m, M = x.min(), x.max() d = M - m assert d > 0 - m -= d * margin - M += d * margin + # m -= d * margin + # M += d * margin a = MAX_UINT8 / d b = m @@ -332,8 +332,9 @@ def excerpt_svd(reader, rank, kept_chunks=CHUNKS_EXCERPTS, preprocess=None): assert svd.U.shape[0] == reader.n_channels svd.minmax = (excerpts.min(), excerpts.max()) - k = .01 - svd.q01 = (np.quantile(excerpts.ravel(), k), np.quantile(excerpts.ravel(), 1 - k)) + svd.quantile = ( + np.quantile(excerpts.ravel(), DEFAULT_QUANTILE), + np.quantile(excerpts.ravel(), 1 - DEFAULT_QUANTILE)) svd.sample_rate = reader.sample_rate svd.downsample_factor = DOWNSAMPLE_FACTOR @@ -526,7 +527,7 @@ def compress_lossy( sr = int(reader.sample_rate) ns = reader.n_samples nc = reader.n_channels - n_chunks = reader.n_chunks if max_chunks == 0 else max_chunks + n_chunks = reader.n_chunks if not max_chunks else max_chunks if max_chunks: ns = max_chunks * sr # NOTE: assume 1-second chunks @@ -539,10 +540,12 @@ def compress_lossy( # Filenames. 
if out_lossy is None and path_cbin: out_lossy = Path(path_cbin).with_suffix(FILE_EXTENSION_LOSSY) + out_lossy = Path(out_lossy) assert out_lossy, "An output file path for the .lossy.npy file must be provided" - if out_svd is None and path_cbin: - out_svd = Path(path_cbin).with_suffix(FILE_EXTENSION_SVD) + if out_svd is None: + out_svd = Path(out_lossy).with_suffix('').with_suffix(FILE_EXTENSION_SVD) + out_svd = Path(out_svd) assert out_svd, "An output file path for the .svd.npz file must be provided" if dry_run: @@ -657,7 +660,6 @@ def open(self, path_lossy=None, path_svd=None): self.size_bytes = self._lossy.size * self._lossy.itemsize self.itemsize = 1 self.dtype = DTYPE - self.minmax = self._svd.minmax # used for cmap scaling visualization size_original = 2 * self.n_channels * self.n_samples self.compression = size_original / float(self.size_bytes) @@ -712,7 +714,7 @@ def get(self, t0, t1, rank=None, cast_to_uint8=False): lossy = self._lossy[i0:i1] arr = self._decompress(lossy, rank=rank) if cast_to_uint8: - m, M = self.minmax + m, M = self._svd.quantile d = M - m assert d > 0 a = MAX_UINT8 / d diff --git a/tests/tests_lossy.py b/tests/tests_lossy.py index 497ed07..e559b71 100644 --- a/tests/tests_lossy.py +++ b/tests/tests_lossy.py @@ -111,7 +111,7 @@ def _prepare_compare(lossless, lossy, t0, t1): lossless_img = ml._preprocess_default(lossless[i0:i1]) lossy_img = lossy.get(t0, t1).T - mM = lossy._svd.minmax + mM = lossy._svd.quantile return lossless_img, lossy_img, mM @@ -213,7 +213,7 @@ def test_lossy_local(): hw = .1 t = hw - err = show_compare(lossless, lossy, t - hw, t + hw, do_show=False) + err = show_compare(lossless, lossy, t - hw, t + hw, do_show=True) assert err < 1 From 5cf52ca4d1a75222c54c83bebdbf25843adbcd6c Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Wed, 16 Mar 2022 15:39:47 +0100 Subject: [PATCH 15/18] Flakify --- mtscomp/__init__.py | 4 ++-- mtscomp/lossy.py | 11 +++-------- mtscomp/mtscomp.py | 3 ++- tests/tests_lossy.py | 13 ++----------- 4 files changed, 9 insertions(+), 22 deletions(-) diff --git a/mtscomp/__init__.py b/mtscomp/__init__.py index caae008..30df181 100644 --- a/mtscomp/__init__.py +++ b/mtscomp/__init__.py @@ -1,3 +1,3 @@ -from .mtscomp import * +from .mtscomp import * # noqa -__all__ = ('load_raw_data', 'Writer', 'Reader', 'compress', 'decompress') +__all__ = ('load_raw_data', 'Writer', 'Reader', 'compress', 'decompress') # noqa diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index b59de70..7af9c95 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -8,20 +8,16 @@ #------------------------------------------------------------------------------------------------- import argparse -from functools import lru_cache from itertools import islice import logging -import os -import os.path as op from pathlib import Path -import sys from tqdm import tqdm import numpy as np from numpy.linalg import inv from numpy.lib.format import open_memmap -from .mtscomp import Bunch, decompress, Reader, add_default_handler +from .mtscomp import Bunch, decompress, add_default_handler logger = logging.getLogger('mtscomp') @@ -72,7 +68,7 @@ def __init__( self.sample_rate = sample_rate # the sampling rate self.downsample_factor = downsample_factor # the downsample factor, an integer self.minmax = minmax # the min and max of the signal across all channels - self.quantile = quantile # the DEFAULT_QUANTILE and 1-DEFAULT_QUANTILE quantiles of the signal + self.quantile = quantile # the q and 1-q quantiles of the signal def save(self, path): """Save this SVD object to a 
.npz file.""" @@ -272,7 +268,6 @@ def _get_excerpts(reader, kept_chunks=CHUNKS_EXCERPTS, preprocess=None): preprocess = preprocess or _preprocess_default arrs = [] - n = 0 n_chunks = reader.n_chunks assert reader.shape[0] > reader.shape[1] skip = max(1, n_chunks // kept_chunks) @@ -446,7 +441,7 @@ def __init__(self, arr, sample_rate=None): def iter_chunks(self, last_chunk=None): offset = 0 n = (last_chunk + 1) if last_chunk else self.n_chunks - for i in range(self.n_chunks): + for i in range(n): # (chunk_idx, chunk_start, chunk_length) yield (i, offset, min(self.chunk_length, self.n_samples - offset)) offset += self.chunk_length diff --git a/mtscomp/mtscomp.py b/mtscomp/mtscomp.py index f4b330a..b75b912 100644 --- a/mtscomp/mtscomp.py +++ b/mtscomp/mtscomp.py @@ -456,7 +456,8 @@ def write(self, out, outmeta): ts = ' on %d CPUs.' % self.n_threads if self.n_threads > 1 else '.' logger.info("Starting compression" + ts) with open(out, 'wb') as fb: - for batch in tqdm(range(self.n_batches), desc='Compressing (lossless)', disable=self.quiet): + for batch in tqdm( + range(self.n_batches), desc='Compressing (lossless)', disable=self.quiet): first_chunk = self.batch_size * batch # first included last_chunk = min(self.batch_size * (batch + 1), self.n_chunks) # last excluded assert 0 <= first_chunk < last_chunk <= self.n_chunks diff --git a/tests/tests_lossy.py b/tests/tests_lossy.py index e559b71..2f0e17f 100644 --- a/tests/tests_lossy.py +++ b/tests/tests_lossy.py @@ -7,21 +7,12 @@ # Imports #------------------------------------------------------------------------------ -from contextlib import redirect_stdout -import io -from itertools import product -import json -import hashlib import logging -import os -import os.path as op from pathlib import Path -import re import numpy as np -from pytest import fixture, raises, mark -from mtscomp import Reader, compress, decompress, add_default_handler, lossy as ml +from mtscomp import compress, decompress, lossy as ml logger = logging.getLogger('mtscomp') # add_default_handler('DEBUG') @@ -256,7 +247,7 @@ def test_lossy_array(tmp_path): # Generate an artificial binary file. arr = colored_sine() assert arr.shape == (n_samples, n_channels) - M = np.abs(arr).max() + # M = np.abs(arr).max() # mtscomp-like interface for a regular NumPy array. reader = ml.ArrayReader(arr, sample_rate=sample_rate) From dddcff9a32e4c68e3f85fe4ec2f7dc8ca5d3656a Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Wed, 16 Mar 2022 16:37:54 +0100 Subject: [PATCH 16/18] Update logging --- mtscomp/lossy.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index 7af9c95..bcb81ca 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -96,6 +96,7 @@ def save(self, path): sample_rate=np.array([self.sample_rate]), downsample_factor=np.array([self.downsample_factor]), ) + logger.info(f"SVD file saved to {path}") def __repr__(self): return f"" @@ -551,6 +552,7 @@ def compress_lossy( raise IOError(f"File {out_lossy} already exists.") shape = (ns // downsampling_factor, rank) lossy = open_memmap(out_lossy, 'w+', dtype=DTYPE, shape=shape) + logger.info(f"Writing file {out_lossy}...") # Compute the SVD on an excerpt of the data. svd = excerpt_svd(reader, rank, kept_chunks=chunks_excerpts, preprocess=preprocess) @@ -584,14 +586,18 @@ def compress_lossy( # NOTE: keep the ab scaling factors for uint8 conversion only for the first chunk if svd.ab is None: svd.ab = ab + + # Save the SVD info to a npz file. 
+ svd.save(out_svd) + offset += l extra = shape[0] - offset if extra > 0: lossy[-extra:, :] = lossy[-extra - 1, :] - # Save the SVD info to a npz file. - svd.save(out_svd) + # # Save the SVD info to a npz file. + # svd.save(out_svd) return out_lossy From bab0d13aa8f5fb19ef8539b129b56a03453696c8 Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Tue, 29 Mar 2022 13:26:56 +0200 Subject: [PATCH 17/18] Add time to sample and sample to time methods in lossy reader --- mtscomp/lossy.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index bcb81ca..71a9860 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -723,6 +723,12 @@ def get(self, t0, t1, rank=None, cast_to_uint8=False): arr, _ = to_uint8(arr, ab=(a, b)) return arr + def t2s(self, t): + return np.round(t * self.sample_rate).astype(np.uint64) + + def s2t(self, s): + return s / float(self.sample_rate) + def __getitem__(self, idx): """Array-like interface.""" lossy = self._lossy[idx] From ab750a5b722eed8d72a6901546d1cb4dacde6a3b Mon Sep 17 00:00:00 2001 From: Cyrille Rossant Date: Mon, 4 Apr 2022 20:28:11 +0200 Subject: [PATCH 18/18] Add filter keyword argument to LossyReader.get() --- mtscomp/lossy.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mtscomp/lossy.py b/mtscomp/lossy.py index 71a9860..a91451b 100644 --- a/mtscomp/lossy.py +++ b/mtscomp/lossy.py @@ -687,7 +687,7 @@ def _decompress(self, lossy, rank=None): lossy_float = from_uint8(lossy, self._svd.ab).T return _decompress_chunk(lossy_float, self._svd, rank=rank).T - def get(self, t0, t1, rank=None, cast_to_uint8=False): + def get(self, t0, t1, rank=None, cast_to_uint8=False, filter=None): """Return the reconstructed signal between two times (in seconds). Parameters @@ -701,6 +701,8 @@ def get(self, t0, t1, rank=None, cast_to_uint8=False): If set, overrides the number of components to reuse for the reconstruction. cast_to_uint8 : bool (default: False) Whether the reconstructed signal should be downsampled to uint8 (for viz purposes). + filter : function + Filter to apply to the signal (before casting to uint8). Returns ------- @@ -714,6 +716,8 @@ def get(self, t0, t1, rank=None, cast_to_uint8=False): i1 = int(round(t1 * float(self.sample_rate) / ds)) lossy = self._lossy[i0:i1] arr = self._decompress(lossy, rank=rank) + if filter: + arr = filter(arr) if cast_to_uint8: m, M = self._svd.quantile d = M - m
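
For reference, a hypothetical end-to-end usage sketch of the lossy API as it stands at the end of this series. The file name `data.cbin` and the rank value are illustrative placeholders (the rank must not exceed the channel count), and it is assumed that the `.svd.npz` file is written next to the `.lossy.npy` file so that `decompress_lossy()` can locate it by default:

from mtscomp.lossy import compress_lossy, decompress_lossy

# Lossy compression: keep 40 SVD components, with the SVD estimated on
# evenly-spaced 1-second excerpts of the file.
path_lossy = compress_lossy(path_cbin='data.cbin', rank=40)

# Array-like reader over the reconstruction (note the downsampled timebase).
reader = decompress_lossy(path_lossy)
arr = reader.get(0.0, 1.0)                       # first second, shape (n_samples, n_channels)
arr_low = reader.get(0.0, 1.0, rank=10)          # simulate reconstruction with a smaller rank
img = reader.get(0.0, 1.0, cast_to_uint8=True)   # uint8 rescaling, for visualization
i = reader.t2s(0.5)                              # time (seconds) -> sample index

The same compression is exposed on the command line through the `mtsloss` entry point added earlier in this series, e.g. `mtsloss data.cbin --rank 40`.
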