From 1b66206e8ba09a76169fc4cae61113764a45f4bb Mon Sep 17 00:00:00 2001 From: Yuqiu Yang Date: Tue, 4 Jul 2023 14:18:57 -0500 Subject: [PATCH 1/6] Added hot-spring sherry adjustment This adjustment includes matching zero probabilities as well as a function that can estimate noise models from real data. --- cytomulate/cell_type_general.py | 20 ++++++++++++++-- cytomulate/emulation/cell_type.py | 6 +++-- cytomulate/utilities.py | 40 ++++++++++++++++++++++++++++--- tests/test_utilities.py | 14 +++++++++-- 4 files changed, 71 insertions(+), 9 deletions(-) diff --git a/cytomulate/cell_type_general.py b/cytomulate/cell_type_general.py index 917124f..8935b5e 100644 --- a/cytomulate/cell_type_general.py +++ b/cytomulate/cell_type_general.py @@ -34,7 +34,9 @@ def __init__(self, # cell_mean and cell_covariance are used during cell differentiation self.cell_mean = np.zeros(self.n_markers) self.cell_covariance = np.zeros((self.n_markers, self.n_markers)) - + # zero_probabilities is used for adjustment + self.zero_probabilities = np.zeros(n_markers) + def sample_cell(self, n_samples: int) -> Tuple[np.ndarray, np.ndarray]: """Draw random samples from the cell type model @@ -56,6 +58,20 @@ def sample_cell(self, """ X = np.zeros((n_samples, self.n_markers)) X[:, :], _ = self.model.sample(n_samples) - expressed_index = (X > 0) X = np.clip(X, a_min=0, a_max=None) + for m in range(self.n_markers): + n_zero_exp = int((self.zero_probabilities[m]) * n_samples) + n_zero_present = np.sum(X[:, m]<0.0001) + n_zero_needed = np.max([0, n_zero_exp-n_zero_present]) + non_zero_ind = np.where(X[:,m]>=0.0001)[0] + p = 1/(X[non_zero_ind, m]) + p /= np.sum(p) + # if n_zero_needed is 0, this should yield + # [] which when plugged into the next statement + # shall change nothing + ind_to_zero = np.random.choice(non_zero_ind, size=n_zero_needed, + replace=False, p=p) + X[ind_to_zero, m] = 0 + + expressed_index = (X > 0) return X, expressed_index diff --git a/cytomulate/emulation/cell_type.py 
b/cytomulate/emulation/cell_type.py index d3d8792..7902891 100644 --- a/cytomulate/emulation/cell_type.py +++ b/cytomulate/emulation/cell_type.py @@ -29,7 +29,6 @@ def __init__(self, """ super().__init__(label, cell_id, n_markers) - def fit(self, data: np.ndarray, max_components: int, @@ -60,7 +59,10 @@ def fit(self, self.cell_mean = np.mean(data, axis=0) self.cell_covariance = np.cov(data, rowvar=False) - + + for m in range(self.n_markers): + self.zero_probabilities[m] = np.mean(data[:, m] < 0.0001) + # We use BIC (the smaller the better) to perform model selection smallest_bic = np.Inf current_bic = 0 diff --git a/cytomulate/utilities.py b/cytomulate/utilities.py index 9d625ed..adab51b 100644 --- a/cytomulate/utilities.py +++ b/cytomulate/utilities.py @@ -1,6 +1,7 @@ # Math computation import numpy as np from numpy import random as rd +from scipy.special import erfinv # Polynomials and spline functions from numpy.polynomial import polynomial @@ -176,7 +177,7 @@ def trajectories(end_values: Optional[Union[list, np.ndarray]] = None, return trajectories_functions -def univariate_noise_model(noise_distribution: str = "normal", +def univariate_noise_model(noise_distribution: str = "uniform", **kwargs) -> Callable: """Generate a noise distribution This is mainly used to generate background noise in the cytof_data object @@ -184,7 +185,7 @@ def univariate_noise_model(noise_distribution: str = "normal", Parameters ---------- noise_distribution: str - Either "normal" or "uniform" + Either "normal", "half_normal", or "uniform" kwargs: extra parameters needed for numpy.random.normal or numpy.random.uniform @@ -193,7 +194,10 @@ def univariate_noise_model(noise_distribution: str = "normal", model: Callable A RV generator that only takes size as its input """ - if noise_distribution == "normal": + if noise_distribution == "half_normal": + def model(size): + return -np.abs(rd.normal(**kwargs, size=size)) + elif noise_distribution == "normal": def model(size): return 
rd.normal(**kwargs, size=size) elif noise_distribution == "uniform": @@ -203,3 +207,33 @@ def model(size): raise ValueError('Unknown noise distribution') return model + +def estimate_noise_model(data: np.ndarray, + noise_distribution: str = "uniform") -> Callable: + """Estimate the noise model from data + + Parameters + ---------- + data : np.ndarray + An array of expression matrix + noise_distribution : str, optional + Either "half_normal" or "uniform", by default "uniform" + + Returns + ------- + Callable + A RV generator that only takes size as its input + """ + para_dict = {"noise_distribution": noise_distribution} + if noise_distribution == "uniform": + min_val = np.min(data) + para_dict["low"] = min_val + para_dict["high"] = 0 + + if noise_distribution == "half_normal": + m = np.median(data[np.where(data<=0)]) + scale = np.abs(m/(np.sqrt(2)*erfinv(0.5))) + para_dict["loc"] = 0 + para_dict["scale"] = scale + + return univariate_noise_model(**para_dict) \ No newline at end of file diff --git a/tests/test_utilities.py b/tests/test_utilities.py index fc145a4..46178f8 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -4,7 +4,8 @@ polynomial_function, \ brownian_bridge_function, \ trajectories, \ - univariate_noise_model + univariate_noise_model, \ + estimate_noise_model @pytest.mark.parametrize("x, y, smoothing_factor, t, expected", [ @@ -58,7 +59,7 @@ def test_trajectories(end_values, coefficients, x, y, t, expected): @pytest.mark.parametrize("kwargs, size, expected", [ ({"noise_distribution":"normal", "loc":0, "scale":1}, 5, (5, )), - ({"noise_distribution":"normal", "loc":0, "scale":1}, (5, 3), (5, 3)), + ({"noise_distribution":"half_normal", "loc":0, "scale":1}, (5, 3), (5, 3)), ({"noise_distribution":"uniform", "low":0, "high":1}, 5, (5, )), ({"noise_distribution":"uniform", "low":0, "high":1}, (5, 3), (5, 3)), ({"noise_distribution":"gamma"}, (5, 3), (5, 3)), @@ -70,3 +71,12 @@ def test_univariate_noise_model(kwargs, size, expected): 
except ValueError: assert True + +@pytest.mark.parametrize("kwargs, size, expected", [ + (-np.abs(np.random.normal(size=(5,5), loc=0, scale=1)), "half_normal", 5, (5, )), + (np.random.uniform(size=(5,5), low=-1, high=0), "uniform", 5, (5, )), +]) +def test_estimate_noise_model(data, noise_distribution, size, expected): + f = estimate_noise_model(data=data, + noise_distribution=noise_distribution) + assert f(size).shape == expected From f069e4b212b3fe6bcdd8e254766d4f9c0074dd30 Mon Sep 17 00:00:00 2001 From: Yuqiu Yang Date: Tue, 4 Jul 2023 14:21:11 -0500 Subject: [PATCH 2/6] Update test_utilities.py --- tests/test_utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_utilities.py b/tests/test_utilities.py index 46178f8..dd3ee98 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -72,7 +72,7 @@ def test_univariate_noise_model(kwargs, size, expected): assert True -@pytest.mark.parametrize("kwargs, size, expected", [ +@pytest.mark.parametrize("data, noise_distribution, size, expected", [ (-np.abs(np.random.normal(size=(5,5), loc=0, scale=1)), "half_normal", 5, (5, )), (np.random.uniform(size=(5,5), low=-1, high=0), "uniform", 5, (5, )), ]) From 190d4759bae9f2c20c6b7a5d367536248841b000 Mon Sep 17 00:00:00 2001 From: Yuqiu Yang Date: Tue, 4 Jul 2023 14:31:45 -0500 Subject: [PATCH 3/6] Update cell_type_general.py --- cytomulate/cell_type_general.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cytomulate/cell_type_general.py b/cytomulate/cell_type_general.py index 8935b5e..77a7ace 100644 --- a/cytomulate/cell_type_general.py +++ b/cytomulate/cell_type_general.py @@ -63,15 +63,16 @@ def sample_cell(self, n_zero_exp = int((self.zero_probabilities[m]) * n_samples) n_zero_present = np.sum(X[:, m]<0.0001) n_zero_needed = np.max([0, n_zero_exp-n_zero_present]) - non_zero_ind = np.where(X[:,m]>=0.0001)[0] - p = 1/(X[non_zero_ind, m]) - p /= np.sum(p) - # if n_zero_needed is 0, this 
should yield - # [] which when plugged into the next statement - # shall change nothing - ind_to_zero = np.random.choice(non_zero_ind, size=n_zero_needed, - replace=False, p=p) - X[ind_to_zero, m] = 0 + if n_zero_needed > 0: + non_zero_ind = np.where(X[:,m]>=0.0001)[0] + p = 1/(X[non_zero_ind, m]) + p /= np.sum(p) + # if n_zero_needed is 0, this should yield + # [] which when plugged into the next statement + # shall change nothing + ind_to_zero = np.random.choice(non_zero_ind, size=n_zero_needed, + replace=False, p=p) + X[ind_to_zero, m] = 0 expressed_index = (X > 0) return X, expressed_index From bbacb7b23a7f9ced475a5cccb3ccc3cd83ca6031 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 12 Jul 2023 20:58:58 -0500 Subject: [PATCH 4/6] [Release Prep] v0.2.0 --- README.md | 22 +++++++++++++++------- cytomulate/__init__.py | 2 +- docs/source/change/index.rst | 17 +++++++++++++---- docs/source/change/releases.rst | 23 +++++++++++++++++++++++ docs/source/conf.py | 2 +- meta.yaml | 2 +- setup.py | 2 +- 7 files changed, 55 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index d914eb0..bc3ba15 100755 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ | Branch | Release | CI/CD | Documentation | Code Coverage | | --- | --- | --- | --- | --- | -| main | ![Badge1](https://img.shields.io/badge/Version-v0.1.1-success) | ![Tests](https://github.com/kevin931/cytomulate/actions/workflows/ci.yml/badge.svg?branch=main) | [![Documentation Status](https://readthedocs.org/projects/cytomulate/badge/?version=dev)](https://cytomulate.readthedocs.io/en/main/?badge=main) | [![codecov](https://codecov.io/gh/kevin931/cytomulate/branch/dev/graph/badge.svg?token=F5H0QTXGMR)](https://codecov.io/gh/kevin931/cytomulate) | -| dev | ![Badge1](https://img.shields.io/badge/Version-v0.1.1-success) |![Tests](https://github.com/kevin931/cytomulate/actions/workflows/ci.yml/badge.svg?branch=dev) | [![Documentation 
Status](https://readthedocs.org/projects/cytomulate/badge/?version=dev)](https://cytomulate.readthedocs.io/en/dev/?badge=dev) | [![codecov](https://codecov.io/gh/kevin931/cytomulate/branch/dev/graph/badge.svg?token=F5H0QTXGMR)](https://codecov.io/gh/kevin931/cytomulate) | +| main | ![Badge1](https://img.shields.io/badge/Version-v0.2.0-success) | ![Tests](https://github.com/kevin931/cytomulate/actions/workflows/ci.yml/badge.svg?branch=main) | [![Documentation Status](https://readthedocs.org/projects/cytomulate/badge/?version=dev)](https://cytomulate.readthedocs.io/en/main/?badge=main) | [![codecov](https://codecov.io/gh/kevin931/cytomulate/branch/dev/graph/badge.svg?token=F5H0QTXGMR)](https://codecov.io/gh/kevin931/cytomulate) | +| dev | ![Badge1](https://img.shields.io/badge/Version-v0.2.0-success) |![Tests](https://github.com/kevin931/cytomulate/actions/workflows/ci.yml/badge.svg?branch=dev) | [![Documentation Status](https://readthedocs.org/projects/cytomulate/badge/?version=dev)](https://cytomulate.readthedocs.io/en/dev/?badge=dev) | [![codecov](https://codecov.io/gh/kevin931/cytomulate/branch/dev/graph/badge.svg?token=F5H0QTXGMR)](https://codecov.io/gh/kevin931/cytomulate) | ## Installation @@ -124,17 +124,25 @@ guidelines, development guides, etc. Our documentation is built automatically on the cloud! If you wish to build locally, check our detailed guide [here](https://cytomulate.readthedocs.io/en/latest/change/build.html)! -## Latest Release: v0.1.1 +## Latest Release: v0.2.0 -This is our first maintenance update to be released to v0.1.x, -and we are packing in lots of enhancements! All changes are -regarding documentations! +Welcome to Cytomulate v0.2.0! Hooray! We are not only bringing documentation enhancements, but we +are also introducing a new feature for more accurate simulations! 
+ +### Changes and New Features +- The `utilities.univariate_noise_model()` method: + - Added `half_normal` option to the `noise_distribution` parameter + - Changed the default `noise_distribution` to `uniform` (This is a **breaking change** because of the benefits to simulated results). + - A warning is given when no user-specified `noise_distribution` is supplied to warn users of the breaking change +- Added the `utilities.estimate_noise_model()` method to estimate the noise present in the data +- Added a built-in estimation procedure to match the amount of zeroes observed in the dataset ### Improvements -- Added 4 more detailed tutorials on [our documentation website](https://cytomulate.readthedocs.io) +- Added 4 more detailed tutorials on [our documentation website](https://cytomulate.readthedocs.io) - Improved docstrings with more details on key parameters - Updated the lastest references and links + ## References If you are cytomulating in your workflow, citing [our paper](https://doi.org/10.1101/2022.06.14.496200) is appreciated: diff --git a/cytomulate/__init__.py b/cytomulate/__init__.py index 032bcc0..7bd2f54 100755 --- a/cytomulate/__init__.py +++ b/cytomulate/__init__.py @@ -4,7 +4,7 @@ A package for simulating CyTOF data """ -__version__ = "0.1.1" +__version__ = "0.2.0" __author__ = "Yuqiu Yang, Kevin Wang, Tao Wang, Sherry Wang" from cytomulate.creation.cytof_data import CreationCytofData diff --git a/docs/source/change/index.rst b/docs/source/change/index.rst index b7a7912..dc308e4 100644 --- a/docs/source/change/index.rst +++ b/docs/source/change/index.rst @@ -9,12 +9,21 @@ Latest Release --------------- ************** -v0.1.1 +v0.2.0 ************** -This is our first maintenance update to be released to v0.1.x, -and we are packing in lots of enhancements! All changes are -regarding documentations! +Welcome to Cytomulate v0.2.0! Hooray! We are not only bringing documentation enhancements, but we +are also introducing a new feature for more accurate simulations! 
+ +Changes and New Features +-------------------------- + +- The `utilities.univariate_noise_model()` method: + - Added `half_normal` option to the `noise_distribution` parameter + - Changed the default `noise_distribution` to `uniform` (This is a **breaking change** because of the benefits to simulated results). + - A warning is given when no user-specified `noise_distribution` is supplied to warn users of the breaking change +- Added the `utilities.estimate_noise_model()` method to estimate the noise present in the data +- Added a built-in estimation procedure to match the amount of zeroes observed in the dataset Improvements --------------- diff --git a/docs/source/change/releases.rst b/docs/source/change/releases.rst index 6f6633f..24419c5 100644 --- a/docs/source/change/releases.rst +++ b/docs/source/change/releases.rst @@ -6,6 +6,29 @@ This is a complete history of ``cytomulate`` releases. ------------------- +************** +v0.2.0 +************** + +Welcome to Cytomulate v0.2.0! Hooray! We are not only bringing documentation enhancements, but we +are also introducing a new feature for more accurate simulations! + +Changes and New Features +-------------------------- + +- The `utilities.univariate_noise_model()` method: + - Added `half_normal` option to the `noise_distribution` parameter + - Changed the default `noise_distribution` to `uniform` (This is a **breaking change** because of the benefits to simulated results). 
+ - A warning is given when no user-specified `noise_distribution` is supplied to warn users of the breaking change +- Added the `utilities.estimate_noise_model()` method to estimate the noise present in the data +- Added a built-in estimation procedure to match the amount of zeroes observed in the dataset + +Improvements +--------------- +- Added 4 more detailed tutorials on `our documentation website <https://cytomulate.readthedocs.io>`_ +- Improved docstrings with more details on key parameters +- Updated the latest references and links + ************** v0.1.1 ************** diff --git a/docs/source/conf.py b/docs/source/conf.py index d4824bd..ba6fee9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -5,7 +5,7 @@ project = 'cytomulate' copyright = '2022-2023, cytomulate developers' author = 'cytomulate developers' -release = '0.1.1' +release = '0.2.0' extensions = [ "sphinx_rtd_theme", diff --git a/meta.yaml b/meta.yaml index 93c8a1d..b895d02 100644 --- a/meta.yaml +++ b/meta.yaml @@ -1,5 +1,5 @@ {% set name = "cytomulate" %} -{% set version = "0.1.1" %} +{% set version = "0.2.0" %} package: name: cytomulate diff --git a/setup.py b/setup.py index 43fc8d9..ba5d11a 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import shutil import distutils.cmd -VERSION = "0.1.1" +VERSION = "0.2.0" class PypiCommand(distutils.cmd.Command): From 1b11afc35ba9c2e2a2bfd6f369557ea95399892f Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 13 Jul 2023 11:29:58 -0500 Subject: [PATCH 5/6] [Feature] Add warning for the default value The ``utilities.univariate_noise_model`` function now emits a warning when the user does not supply a ``noise_distribution``. This is intended to mitigate the breaking change introduced in v0.2.0. All code is tested and properly documented. 
--- cytomulate/utilities.py | 20 ++++++++++++++++---- docs/source/documentation/utilities.rst | 3 ++- tests/test_utilities.py | 6 ++++++ 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/cytomulate/utilities.py b/cytomulate/utilities.py index adab51b..cc0b4c1 100644 --- a/cytomulate/utilities.py +++ b/cytomulate/utilities.py @@ -8,8 +8,11 @@ from scipy.interpolate import Akima1DInterpolator from scipy.interpolate import UnivariateSpline +# Warnings +import warnings + # Typing -from typing import Union, Optional, List, Tuple, Callable +from typing import Union, Optional, List, Callable def spline_function(x: np.ndarray, @@ -177,10 +180,13 @@ def trajectories(end_values: Optional[Union[list, np.ndarray]] = None, return trajectories_functions -def univariate_noise_model(noise_distribution: str = "uniform", +def univariate_noise_model(noise_distribution: Optional[str] = None, **kwargs) -> Callable: """Generate a noise distribution - This is mainly used to generate background noise in the cytof_data object + This is mainly used to generate background noise in the cytof_data object. + + .. versionchanged:: 0.2.0 The default `noise_distribution` is changed to `uniform`. If no user-specified value is provided, a warning is given to inform users of the change. + .. versionadded:: 0.2.0 Added the `half_normal` option to the `noise_distribution` parameter. Parameters ---------- @@ -194,6 +200,10 @@ def univariate_noise_model(noise_distribution: str = "uniform", model: Callable A RV generator that only takes size as its input """ + if noise_distribution is None: + warnings.warn("The default `noise_distribution` is now changed from `normal` to `uniform` as of v0.2.0. 
Please see the release notes for details.") + noise_distribution = "uniform" + if noise_distribution == "half_normal": def model(size): return -np.abs(rd.normal(**kwargs, size=size)) @@ -210,7 +220,9 @@ def model(size): def estimate_noise_model(data: np.ndarray, noise_distribution: str = "uniform") -> Callable: - """Estimate the noise model from data + """Estimate the noise model from data + + .. versionadded:: 0.2.0 Parameters ---------- diff --git a/docs/source/documentation/utilities.rst b/docs/source/documentation/utilities.rst index 2a8a544..962a6de 100644 --- a/docs/source/documentation/utilities.rst +++ b/docs/source/documentation/utilities.rst @@ -6,4 +6,5 @@ Module: utilities .. autofunction:: polynomial_function .. autofunction:: brownian_bridge_function .. autofunction:: trajectories -.. autofunction:: univariate_noise_model \ No newline at end of file +.. autofunction:: univariate_noise_model +.. autofunction:: estimate_noise_model \ No newline at end of file diff --git a/tests/test_utilities.py b/tests/test_utilities.py index dd3ee98..006f53c 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -70,7 +70,13 @@ def test_univariate_noise_model(kwargs, size, expected): assert f(size).shape == expected except ValueError: assert True + +def test_univariate_noise_model_warning(): + with pytest.warns(UserWarning) as record: + f = univariate_noise_model() + assert str(record[0].message) == "The default `noise_distribution` is now changed from `normal` to `uniform` as of v0.2.0. Please see the release notes for details." 
+ @pytest.mark.parametrize("data, noise_distribution, size, expected", [ (-np.abs(np.random.normal(size=(5,5), loc=0, scale=1)), "half_normal", 5, (5, )), From e22ad1a5b48c92a3cf6694739c407d625c626924 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 27 Jul 2023 13:04:14 -0500 Subject: [PATCH 6/6] [Release Prep] Hash for conda installation --- meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meta.yaml b/meta.yaml index 7322844..a891b4b 100644 --- a/meta.yaml +++ b/meta.yaml @@ -7,7 +7,7 @@ package: source: url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz - sha256: d592c1d525a8ac9aefe2cdabab37fb68fece4c3f0954a76b329c0da86b6324ad + sha256: 499008618b335573641f520bf097db5e7d40899bda499952efdad9c0ae7cb9b4 build: noarch: python