Skip to content

Commit

Permalink
Merge pull request #31 from WenjieDu/dev
Browse files Browse the repository at this point in the history
Add `rdo()`
  • Loading branch information
WenjieDu authored May 27, 2024
2 parents 6017ec5 + 3c65181 commit 4ec11e9
Show file tree
Hide file tree
Showing 11 changed files with 148 additions and 12 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/testing_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, windows-latest, macOS-latest ]
python-version: [ '3.7', '3.10' ]
os: [ ubuntu-latest, windows-latest, macOS-13 ]
python-version: [ '3.7', '3.11' ]

steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ and we are pursuing to publish it in prestigious academic venues, e.g. JMLR (tra
please cite PyPOTS project as below and 🌟star this repository to make others notice this library. 🤗 Thank you!

<p align="center">
<a href="https://pypots.com/ecosystem/">
<a href="https://github.com/WenjieDu/PyPOTS">
<img src="https://pypots.com/figs/pypots_logos/Ecosystem/PyPOTS_Ecosystem_Pipeline.png" width="95%"/>
</a>
</p>
Expand Down
4 changes: 3 additions & 1 deletion pygrinder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,12 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
__version__ = "0.4"
__version__ = "0.5"

from .missing_at_random import mar_logistic
from .missing_completely_at_random import mcar, mcar_little_test
from .missing_not_at_random import mnar_x, mnar_t
from .randomly_drop_observations import rdo
from .utils import (
calc_missing_rate,
masked_fill,
Expand All @@ -41,6 +42,7 @@
"mar_logistic",
"mnar_x",
"mnar_t",
"rdo",
"calc_missing_rate",
"masked_fill",
"fill_and_get_mask",
Expand Down
6 changes: 3 additions & 3 deletions pygrinder/missing_at_random/mar_logistic.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ def mar_logistic(
Parameters
----------
X : shape of [n_steps, n_features]
A time series data vector without any missing data.
X :
A time series data vector without any missing data. Shape of [n_steps, n_features].
obs_rate :
The proportion of variables without missing values that will be used for fitting the logistic masking model.
Expand All @@ -113,7 +113,7 @@ def mar_logistic(
Returns
-------
corrupted_X : array-like
corrupted_X :
Original X with artificial missing values.
Both originally-missing and artificially-missing values are left as NaN.
Expand Down
2 changes: 1 addition & 1 deletion pygrinder/missing_completely_at_random/little_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def mcar_little_test(X: Union[pd.DataFrame, np.ndarray]) -> float:
Returns
-------
p_value: float
p_value:
The p-value of a chi-square hypothesis test.
Null hypothesis: the time series is missing completely at random (MCAR).
Expand Down
10 changes: 8 additions & 2 deletions pygrinder/missing_completely_at_random/mcar.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ def _mcar_numpy(
X: np.ndarray,
p: float,
) -> np.ndarray:
assert 0 < p < 1, f"p must be in range (0, 1), but got {p}"

# clone X to ensure values of X out of this function not being affected
X = np.copy(X)
mcar_missing_mask = np.asarray(np.random.rand(np.prod(X.shape)) < p)
Expand All @@ -27,6 +29,8 @@ def _mcar_torch(
X: torch.Tensor,
p: float,
) -> torch.Tensor:
assert 0 < p < 1, f"p must be in range (0, 1), but got {p}"

# clone X to ensure values of X out of this function not being affected
X = torch.clone(X)
mcar_missing_mask = torch.rand(X.shape) < p
Expand All @@ -45,7 +49,7 @@ def mcar(
X :
Data vector. If X has any missing values, they should be numpy.nan.
p : float, in (0,1),
p :
The probability that values may be masked as missing completely at random.
Note that the values are randomly selected no matter if they are originally missing or observed.
If the selected values are originally missing, they will be kept as missing.
Expand All @@ -57,11 +61,13 @@ def mcar(
Returns
-------
corrupted_X : array-like
corrupted_X :
Original X with artificial missing values.
Both originally-missing and artificially-missing values are left as NaN.
"""
assert 0 < p < 1, f"p must be in range (0, 1), but got {p}"

if isinstance(X, list):
X = np.asarray(X)

Expand Down
3 changes: 2 additions & 1 deletion pygrinder/missing_not_at_random/mnar_t.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,12 @@ def mnar_t(
Returns
-------
corrupted_X : array-like
corrupted_X :
Original X with artificial missing values.
Both originally-missing and artificially-missing values are left as NaN.
"""

if isinstance(X, list):
X = np.asarray(X)

Expand Down
2 changes: 1 addition & 1 deletion pygrinder/missing_not_at_random/mnar_x.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def mnar_x(
Returns
-------
corrupted_X : array-like
corrupted_X :
Original X with artificial missing values.
Both originally-missing and artificially-missing values are left as NaN.
"""
Expand Down
12 changes: 12 additions & 0 deletions pygrinder/randomly_drop_observations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""
Expose `rdo`, which corrupts data by randomly dropping observations (RDO).
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause

from .rdo import rdo

__all__ = [
"rdo",
]
96 changes: 96 additions & 0 deletions pygrinder/randomly_drop_observations/rdo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""
Corrupt data by randomly dropping original observations.
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause

from typing import Union

import numpy as np
import torch


def _rdo_numpy(
    X: np.ndarray,
    p: float,
) -> np.ndarray:
    """Randomly mask a proportion `p` of the observed (non-NaN) values of a numpy array.

    Parameters
    ----------
    X :
        Data array. Missing values, if any, must be NaN.

    p :
        The proportion of the observed values to mask, in range (0, 1).

    Returns
    -------
    corrupted_X :
        A copy of X, with the same shape, in which the selected observations are set to NaN.

    """
    assert 0 < p < 1, f"p must be in range (0, 1), but got {p}"

    # clone X to ensure values of X out of this function not being affected
    X = np.copy(X)
    ori_shape = X.shape
    X = X.reshape(-1)

    # Keep the candidates as an integer array: sampling from an empty Python list yields
    # an empty float64 array, which cannot be used as an index when X has no observations.
    observed_indices = np.flatnonzero(~np.isnan(X))
    n_to_drop = int(round(len(observed_indices) * p))
    dropped_indices = np.random.choice(
        observed_indices,
        n_to_drop,
        replace=False,
    )
    X[dropped_indices] = np.nan
    return X.reshape(ori_shape)


def _rdo_torch(
    X: torch.Tensor,
    p: float,
) -> torch.Tensor:
    """Randomly mask a proportion `p` of the observed (non-NaN) values of a torch tensor.

    Parameters
    ----------
    X :
        Data tensor. Missing values, if any, must be NaN.

    p :
        The proportion of the observed values to mask, in range (0, 1).

    Returns
    -------
    corrupted_X :
        A clone of X, with the same shape, in which the selected observations are set to NaN.

    """
    assert 0 < p < 1, f"p must be in range (0, 1), but got {p}"

    # clone X to ensure values of X out of this function not being affected
    X = torch.clone(X)
    ori_shape = X.shape
    X = X.reshape(-1)

    # The selection is drawn with numpy's global RNG, as for numpy input, so the candidate
    # indices are moved to an integer numpy array. The integer dtype keeps the sampled
    # indices valid even when there is nothing to sample from.
    observed_indices = torch.where(~torch.isnan(X))[0].cpu().numpy()
    n_to_drop = int(round(len(observed_indices) * p))
    dropped_indices = np.random.choice(
        observed_indices,
        n_to_drop,
        replace=False,
    )
    X[torch.from_numpy(dropped_indices).to(X.device)] = torch.nan
    return X.reshape(ori_shape)


def rdo(
    X: Union[list, np.ndarray, torch.Tensor],
    p: float,
) -> Union[np.ndarray, torch.Tensor]:
    """Create missingness in the data by randomly dropping observations.

    Parameters
    ----------
    X :
        Data vector. If X has any missing values, they should be numpy.nan.
        A list is converted into a numpy array first.

    p :
        The proportion of the observed values that will be randomly masked as missing.
        RDO (randomly drop observations) will randomly select values from the observed values to be masked as missing.
        The number of selected observations is determined by `p` and the total number of observed values in X,
        e.g. if `p`=0.1, and there are 1000 observed values in X, then 0.1*1000=100 values will be randomly selected
        to be masked as missing. If the result is not an integer, the number of selected values will be rounded to
        the nearest.

    Returns
    -------
    corrupted_X :
        Original X with artificial missing values.
        Both originally-missing and artificially-missing values are left as NaN.
        If X contains no observed values, an unchanged copy is returned.

    Raises
    ------
    AssertionError
        If `p` is not in range (0, 1).

    TypeError
        If X is not a list, numpy.ndarray, or torch.Tensor.

    """
    # validated with assert, in the same way as mcar() does
    assert 0 < p < 1, f"p must be in range (0, 1), but got {p}"

    if isinstance(X, list):
        X = np.asarray(X)

    if isinstance(X, np.ndarray):
        corrupted_X = _rdo_numpy(X, p)
    elif isinstance(X, torch.Tensor):
        corrupted_X = _rdo_torch(X, p)
    else:
        raise TypeError(
            "X must be type of list/numpy.ndarray/torch.Tensor, " f"but got {type(X)}"
        )

    return corrupted_X
19 changes: 19 additions & 0 deletions tests/test_pygrinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
mar_logistic,
mnar_x,
mnar_t,
rdo,
masked_fill,
calc_missing_rate,
fill_and_get_mask,
Expand Down Expand Up @@ -119,3 +120,21 @@ def test_2_mnar(self):
X_with_nan = mnar_t(X, cycle=20, pos=10, scale=3)
test_pvalue = mcar_little_test(X_with_nan.numpy().reshape(128, -1))
print(f"MCAR Little test p_value for MNAR_T_not_return_masks: {test_pvalue}")

def test_3_rdo(self):
    """Check that rdo() drops exactly round(p * n_observed) observations.

    The same partially-observed data is fed to rdo() first as a numpy array
    and then as a torch tensor; both must leave the same number of
    observed values.
    """
    rate = DEFAULT_MISSING_RATE

    # start from data that already has missing values, created by MCAR
    complete_data = np.random.randn(128, 10, 36)
    incomplete_np = mcar(
        complete_data,
        p=rate,
    )
    n_observed = np.sum(~np.isnan(incomplete_np))
    n_dropped = round(rate * n_observed)
    n_expected_left = n_observed - n_dropped

    # numpy input
    corrupted_np = rdo(incomplete_np, p=rate)
    n_left_np = np.sum(~np.isnan(corrupted_np))
    assert n_left_np == n_expected_left

    # torch input
    incomplete_torch = torch.from_numpy(incomplete_np)
    corrupted_torch = rdo(incomplete_torch, p=rate)
    n_left_torch = torch.sum(~torch.isnan(corrupted_torch))
    assert n_left_torch == n_expected_left

0 comments on commit 4ec11e9

Please sign in to comment.