Merge pull request #31 from WenjieDu/dev

Add `rdo()`
WenjieDu · May 27, 2024 · 4ec11e9 · 4ec11e9
2 parents 6017ec5 + 3c65181
commit 4ec11e9
Show file tree

Hide file tree

Showing 11 changed files with 148 additions and 12 deletions.
diff --git a/.github/workflows/testing_ci.yml b/.github/workflows/testing_ci.yml
@@ -16,8 +16,8 @@ jobs:
         strategy:
             fail-fast: false
             matrix:
-                os: [ ubuntu-latest, windows-latest, macOS-latest ]
-                python-version: [ '3.7', '3.10' ]
+                os: [ ubuntu-latest, windows-latest, macOS-13 ]
+                python-version: [ '3.7', '3.11' ]
 
         steps:
             - uses: actions/checkout@v3

diff --git a/README.md b/README.md
@@ -98,7 +98,7 @@ and we are pursuing to publish it in prestigious academic venues, e.g. JMLR (tra
 please cite PyPOTS project as below and 🌟star this repository to make others notice this library. 🤗 Thank you!
 
 <p align="center">
-<a href="https://pypots.com/ecosystem/">
+<a href="https://github.com/WenjieDu/PyPOTS">
     <img src="https://pypots.com/figs/pypots_logos/Ecosystem/PyPOTS_Ecosystem_Pipeline.png" width="95%"/>
 </a>
 </p>

diff --git a/pygrinder/__init__.py b/pygrinder/__init__.py
@@ -21,11 +21,12 @@
 #
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
-__version__ = "0.4"
+__version__ = "0.5"
 
 from .missing_at_random import mar_logistic
 from .missing_completely_at_random import mcar, mcar_little_test
 from .missing_not_at_random import mnar_x, mnar_t
+from .randomly_drop_observations import rdo
 from .utils import (
     calc_missing_rate,
     masked_fill,
@@ -41,6 +42,7 @@
     "mar_logistic",
     "mnar_x",
     "mnar_t",
+    "rdo",
     "calc_missing_rate",
     "masked_fill",
     "fill_and_get_mask",

diff --git a/pygrinder/missing_at_random/mar_logistic.py b/pygrinder/missing_at_random/mar_logistic.py
@@ -102,8 +102,8 @@ def mar_logistic(
 
     Parameters
     ----------
-    X : shape of [n_steps, n_features]
-        A time series data vector without any missing data.
+    X :
+        A time series data vector without any missing data. Shape of [n_steps, n_features].
 
     obs_rate :
         The proportion of variables without missing values that will be used for fitting the logistic masking model.
@@ -113,7 +113,7 @@ def mar_logistic(
 
     Returns
     -------
-    corrupted_X : array-like
+    corrupted_X :
         Original X with artificial missing values.
         Both originally-missing and artificially-missing values are left as NaN.
 

diff --git a/pygrinder/missing_completely_at_random/little_test.py b/pygrinder/missing_completely_at_random/little_test.py
@@ -26,7 +26,7 @@ def mcar_little_test(X: Union[pd.DataFrame, np.ndarray]) -> float:
 
     Returns
     -------
-    p_value: float
+    p_value:
         The p-value of a chi-square hypothesis test.
         Null hypothesis: the time series is missing completely at random (MCAR).
 

diff --git a/pygrinder/missing_completely_at_random/mcar.py b/pygrinder/missing_completely_at_random/mcar.py
@@ -15,6 +15,8 @@ def _mcar_numpy(
     X: np.ndarray,
     p: float,
 ) -> np.ndarray:
+    assert 0 < p < 1, f"p must be in range (0, 1), but got {p}"
+
     # clone X to ensure values of X out of this function not being affected
     X = np.copy(X)
     mcar_missing_mask = np.asarray(np.random.rand(np.prod(X.shape)) < p)
@@ -27,6 +29,8 @@ def _mcar_torch(
     X: torch.Tensor,
     p: float,
 ) -> torch.Tensor:
+    assert 0 < p < 1, f"p must be in range (0, 1), but got {p}"
+
     # clone X to ensure values of X out of this function not being affected
     X = torch.clone(X)
     mcar_missing_mask = torch.rand(X.shape) < p
@@ -45,7 +49,7 @@ def mcar(
     X :
         Data vector. If X has any missing values, they should be numpy.nan.
 
-    p : float, in (0,1),
+    p :
         The probability that values may be masked as missing completely at random.
         Note that the values are randomly selected no matter if they are originally missing or observed.
         If the selected values are originally missing, they will be kept as missing.
@@ -57,11 +61,13 @@ def mcar(
 
     Returns
     -------
-    corrupted_X : array-like
+    corrupted_X :
         Original X with artificial missing values.
         Both originally-missing and artificially-missing values are left as NaN.
 
     """
+    assert 0 < p < 1, f"p must be in range (0, 1), but got {p}"
+
     if isinstance(X, list):
         X = np.asarray(X)
 

diff --git a/pygrinder/missing_not_at_random/mnar_t.py b/pygrinder/missing_not_at_random/mnar_t.py
@@ -78,11 +78,12 @@ def mnar_t(
 
     Returns
     -------
-    corrupted_X : array-like
+    corrupted_X :
         Original X with artificial missing values.
         Both originally-missing and artificially-missing values are left as NaN.
 
     """
+
     if isinstance(X, list):
         X = np.asarray(X)
 

diff --git a/pygrinder/missing_not_at_random/mnar_x.py b/pygrinder/missing_not_at_random/mnar_x.py
@@ -83,7 +83,7 @@ def mnar_x(
 
     Returns
     -------
-    corrupted_X : array-like
+    corrupted_X :
         Original X with artificial missing values.
         Both originally-missing and artificially-missing values are left as NaN.
     """

diff --git a/pygrinder/randomly_drop_observations/__init__.py b/pygrinder/randomly_drop_observations/__init__.py
@@ -0,0 +1,12 @@
+"""
+Corrupt data by randomly dropping observations (RDO).
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+from .rdo import rdo
+
+__all__ = [
+    "rdo",
+]
diff --git a/pygrinder/randomly_drop_observations/rdo.py b/pygrinder/randomly_drop_observations/rdo.py
@@ -0,0 +1,96 @@
+"""
+Corrupt data by randomly dropping original observations.
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+from typing import Union
+
+import numpy as np
+import torch
+
+
+def _rdo_numpy(
+    X: np.ndarray,
+    p: float,
+) -> np.ndarray:
+    assert 0 < p < 1, f"p must be in range (0, 1), but got {p}"
+
+    # clone X so that the caller's X outside this function is not affected
+    X = np.copy(X)
+    ori_shape = X.shape
+    X = X.reshape(-1)
+    indices = np.where(~np.isnan(X))[0].tolist()
+    indices = np.random.choice(
+        indices,
+        round(len(indices) * p),
+        replace=False,
+    )
+    X[indices] = np.nan
+    X = X.reshape(ori_shape)
+    return X
+
+
+def _rdo_torch(
+    X: torch.Tensor,
+    p: float,
+) -> torch.Tensor:
+    assert 0 < p < 1, f"p must be in range (0, 1), but got {p}"
+
+    # clone X so that the caller's X outside this function is not affected
+    X = torch.clone(X)
+    ori_shape = X.shape
+    X = X.reshape(-1)
+    indices = torch.where(~torch.isnan(X))[0].tolist()
+    indices = np.random.choice(
+        indices,
+        round(len(indices) * p),
+        replace=False,
+    )
+    X[indices] = torch.nan
+    X = X.reshape(ori_shape)
+    return X
+
+
+def rdo(
+    X: Union[np.ndarray, torch.Tensor],
+    p: float,
+) -> Union[np.ndarray, torch.Tensor]:
+    """Create missingness in the data by randomly dropping observations.
+
+    Parameters
+    ----------
+    X :
+        Data vector. If X has any missing values, they should be numpy.nan.
+
+    p :
+        The proportion of the observed values that will be randomly masked as missing.
+        RDO (randomly drop observations) will randomly select values from the observed values to be masked as missing.
+        The number of selected observations is determined by `p` and the total number of observed values in X,
+        e.g. if `p`=0.1, and there are 1000 observed values in X, then 0.1*1000=100 values will be randomly selected
+        to be masked as missing. If the result is not an integer, the number of selected values will be rounded to
+        the nearest integer.
+
+    Returns
+    -------
+    corrupted_X :
+        Original X with artificial missing values.
+        Both originally-missing and artificially-missing values are left as NaN.
+
+    """
+    assert 0 < p < 1, f"p must be in range (0, 1), but got {p}"
+
+    if isinstance(X, list):
+        X = np.asarray(X)
+
+    if isinstance(X, np.ndarray):
+        corrupted_X = _rdo_numpy(X, p)
+    elif isinstance(X, torch.Tensor):
+        corrupted_X = _rdo_torch(X, p)
+    else:
+        raise TypeError(
+            "X must be type of list/numpy.ndarray/torch.Tensor, " f"but got {type(X)}"
+        )
+
+    return corrupted_X
diff --git a/tests/test_pygrinder.py b/tests/test_pygrinder.py
@@ -16,6 +16,7 @@
     mar_logistic,
     mnar_x,
     mnar_t,
+    rdo,
     masked_fill,
     calc_missing_rate,
     fill_and_get_mask,
@@ -119,3 +120,21 @@ def test_2_mnar(self):
         X_with_nan = mnar_t(X, cycle=20, pos=10, scale=3)
         test_pvalue = mcar_little_test(X_with_nan.numpy().reshape(128, -1))
         print(f"MCAR Little test p_value for MNAR_T_not_return_masks: {test_pvalue}")
+
+    def test_3_rdo(self):
+        X = np.random.randn(128, 10, 36)
+        X_with_missing = mcar(
+            X,
+            p=DEFAULT_MISSING_RATE,
+        )
+        n_observations = (~np.isnan(X_with_missing)).sum()
+        n_rdo = round(DEFAULT_MISSING_RATE * n_observations)
+
+        X_with_rdo = rdo(X_with_missing, p=DEFAULT_MISSING_RATE)
+        n_left_observations = (~np.isnan(X_with_rdo)).sum()
+        assert n_left_observations == n_observations - n_rdo
+
+        X_with_missing = torch.from_numpy(X_with_missing)
+        X_with_rdo = rdo(X_with_missing, p=DEFAULT_MISSING_RATE)
+        n_left_observations = (~torch.isnan(X_with_rdo)).sum()
+        assert n_left_observations == n_observations - n_rdo