diff --git a/sica/_whitening.py b/sica/_whitening.py index d6305ce..d23e03e 100644 --- a/sica/_whitening.py +++ b/sica/_whitening.py @@ -16,40 +16,40 @@ def whitening( - X: Union[np.ndarray, spmatrix], - n_components: int, - svd_solver: str, - chunked: bool, - chunk_size: Union[int, None], - zero_center: bool, - random_state: Optional[Union[int, np.random.RandomState]] = None + X: Union[np.ndarray, spmatrix], + n_components: int, + svd_solver: str, + chunked: bool, + chunk_size: Union[int, None], + zero_center: bool, + random_state: Optional[Union[int, np.random.RandomState]] = None, ) -> Tuple[np.ndarray, np.ndarray]: - """ Whiten data (i.e. transform variables into a set of new uncorrelated and unit-variance variables) and reduce + """Whiten data (i.e. transform variables into a set of new uncorrelated and unit-variance variables) and reduce dimension through a PCA-like approach. This function handles array-like formats as well as sparse matrices. - + Parameters ---------- X : 2D ndarray or spmatrix, shape (n_samples , n_features) - + n_components : int number of principal components to compute. If None, n_components = min(X.shape) - + svd_solver : str {‘auto’, ‘full’, ‘arpack’, ‘randomized’ , 'lobpcg'} solver for the different PCA methods. Please note that some solvers may not be compatible with some PCA methods. See PCA, TruncatedSVD and IncrementalPCA from sklearn.decomposition or scipy.sparse.linalg.svds. - + chunked : boolean - if True, perform an incremental PCA on segments of chunk_size. The incremental PCA automatically + if True, perform an incremental PCA on segments of chunk_size. The incremental PCA automatically zero centers and ignores settings of random_seed and svd_solver. - + chunk_size : int Number of observations to include in each chunk. Required if chunked=True was passed. - + zero_center : boolean If True, compute standard PCA from covariance matrix. If False, omit zero-centering variables (uses TruncatedSVD), which allows handling sparse input efficiently. - + random_state : int, RandomState, optional Change to use different initial states for the optimization. The default is None. @@ -106,27 +106,28 @@ def whitening( def _pca_with_sparse( - X: spmatrix, - npcs: int, - solver: Optional[str] = "arpack", - mu=None, - random_state: Optional[Union[int, np.random.RandomState]] = None) -> Tuple[np.ndarray, np.ndarray]: - """ Compute PCA decomposition with initial centering for sparse input. - + X: spmatrix, + npcs: int, + solver: Optional[str] = "arpack", + mu=None, + random_state: Optional[Union[int, np.random.RandomState]] = None, +) -> Tuple[np.ndarray, np.ndarray]: + """Compute PCA decomposition with initial centering for sparse input. + Parameters ---------- X : spmatrix, shape (n_samples, n_features) npcs : int number of PCA components. - + solver : str, optional Eigenvalue solver to use. Should be ‘arpack’ or ‘lobpcg’. See scipy.sparse.linalg.svds. The default is 'arpack'. - + mu : ndarray, optional Feature-wise means used for the implicit centering of X. If None, they are computed from X. The default is None. - + random_state : int, RandomState, optional The default is None.
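[Editor's note — not part of the patch. A minimal usage sketch for the `whitening` helper documented above; it relies only on the signature shown in this diff, and reading the second return value as the PCA projection matrix is an assumption, since the docstring does not name it.]

import numpy as np
from sica._whitening import whitening

rng = np.random.RandomState(0)
X = rng.randn(200, 50)  # (n_samples, n_features)

# Dense input with the default solver; chunk_size only matters when chunked=True.
X_w, proj = whitening(
    X,
    n_components=10,
    svd_solver="auto",
    chunked=False,
    chunk_size=None,
    zero_center=True,
    random_state=0,
)
print(X_w.shape)  # expected (200, 10): uncorrelated, unit-variance components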
diff --git a/sica/base.py b/sica/base.py index 42749a1..392d650 100644 --- a/sica/base.py +++ b/sica/base.py @@ -91,17 +91,17 @@ def _check_algorithm(algorithm: str, fun: str) -> Tuple[str, dict]: def _centrotype(X: np.ndarray, Sim: np.ndarray, cluster_labels: list) -> np.ndarray: """Compute the centrotype of the cluster of ICA components defined by cluster_labels - + centrotype : component of the cluster which is the most similar to the other components of the cluster Parameters ---------- X : 2D array, shape (n_components , n_observations) matrix of independent ICA components - + Sim : 2D array, shape (n_components , n_components) similarity matrix for ICA components (i.e. rows of X) - + cluster_labels : list of integers indexes of the cluster of components (ex:[0 , 1 , 7] refers to the rows 0, 1 and 7 of X) @@ -117,15 +117,15 @@ def _centrotype(X: np.ndarray, Sim: np.ndarray, cluster_labels: list) -> np.ndar def _stability_index(Sim: np.ndarray, cluster_labels: list) -> float: """Compute the stability index for the cluster of ICA components defined by cluster_labels. - + Please refer to https://bmcgenomics.biomedcentral.com/track/pdf/10.1186/s12864-017-4112-9 (section "Method") for the exact formula of the stability index. Parameters ---------- Sim : 2D array, shape (n_components , n_components) - similarity matrix for ICA components - + similarity matrix for ICA components + cluster_labels : list of integers indexes of the cluster of components (ex: [0 , 1 , 7] refers to the rows 0, 1 and 7 of X) @@ -150,10 +150,9 @@ def _stability_index(Sim: np.ndarray, cluster_labels: list) -> float: class StabilizedICA(BaseEstimator, TransformerMixin): - """ Implement a stabilized version of the Independent Component Analysis algorithm. - - It fits the matrix factorization model X = AS, where A is the unmixing matrix (n_mixtures, n_sources), S is the - source matrix (n_sources, n_observations) and X is the observed mixed data (n_mixtures, n_observations). + """Implement a stabilized version of the Independent Component Analysis algorithm. It fits the matrix factorization + model X = AS, where A is the mixing matrix (n_mixtures, n_sources), S is the source matrix + (n_sources, n_observations) and X is the observed mixed data (n_mixtures, n_observations). Parameters ---------- @@ -167,10 +166,14 @@ class StabilizedICA(BaseEstimator, TransformerMixin): Method for resampling the data before each run of the ICA solver. - If None, no resampling is applied. - - If 'bootstrap' the classical bootstrap method is applied to the original data matrix, the resampled matrix is whitened (using the whitening hyperparameters set for the fit method) and the ICA components are extracted. - - If 'fast_boostrap' a fast bootstrap algorithm is applied to the original data matrix and the whitening operation is performed simultaneously with SVD decomposition and then the ICA components are extracted (see References). + - If 'bootstrap', the classical bootstrap method is applied to the original data matrix, the resampled matrix is + whitened (using the whitening hyperparameters set for the fit method) and the ICA components are extracted. + - If 'fast_bootstrap', a fast bootstrap algorithm is applied to the original data matrix and the whitening + operation is performed simultaneously with SVD decomposition and then the ICA components are extracted + (see References). - Resampling could lead to quite heavy computations (whitening at each iteration), depending on the size of the input data. It should be considered with care.
The default is None. + Resampling could lead to quite heavy computations (whitening at each iteration), depending on the size of the + input data. It should be considered with care. The default is None. algorithm : str {'fastica_par' , 'fastica_def' , 'picard_fastica' , 'picard' , 'picard_ext' , 'picard_orth'}, optional. The algorithm applied for solving the ICA problem at each run. Please see the supplementary explanations @@ -231,15 +234,15 @@ class StabilizedICA(BaseEstimator, TransformerMixin): n_jobs : int, optional Number of jobs to run in parallel. -1 means using all processors. See the joblib package documentation for more explanations. The default is 1. - + verbose: int, optional Control the verbosity: the higher, the more messages. The default is 0. - + Attributes ---------- S_: 2D array, shape (n_components , n_observations) Array of sources/metagenes, each line corresponds to a stabilized ICA component (i.e. the centrotype of - a cluster of components). + a cluster of components). stability_indexes_ : 1D array, shape (n_components) Stability indexes for the stabilized ICA components. @@ -255,20 +258,22 @@ class StabilizedICA(BaseEstimator, TransformerMixin): correspond to the observations. The user should keep in mind that, at the end, they will obtain `n_components` vectors of dimension `n_observations`, independent from each other (as finite samples of latent independent distributions). The user guide and the definition of the ICA framework should be helpful. - - - For a data set of discretized sound signals registered by 10 microphones at 100 time points, if we want to retrieve 5 ICA sources we need to set `n_mixtures = 10`, `n_observations = 100` and `n_components = 5`. - - For a gene expression data set with 100 samples and 10000 genes, if we want to retrieve 10 ICA sources **in the gene space** we need to set `n_mixtures = 100`, `n_observations = 10000` and `n_components = 10`. - + + - For a data set of discretized sound signals recorded by 10 microphones at 100 time points, if we want to + retrieve 5 ICA sources we need to set `n_mixtures = 10`, `n_observations = 100` and `n_components = 5`. + - For a gene expression data set with 100 samples and 10000 genes, if we want to retrieve 10 ICA sources **in + the gene space** we need to set `n_mixtures = 100`, `n_observations = 10000` and `n_components = 10`. + References ---------- Fast bootstrap algorithm: Fisher A, Caffo B, Schwartz B, Zipunnikov V. Fast, Exact Bootstrap Principal Component Analysis for p > 1 million. J Am Stat Assoc. 2016;111(514):846-860. doi: 10.1080/01621459.2015.1062383. Epub 2016 Aug 18. PMID: 27616801; PMCID: PMC5014451. - + ICASSO method : - J. Himberg and A. Hyvarinen, "Icasso: software for investigating the reliability of ICA estimates by clustering and visualization," + J. Himberg and A. Hyvarinen, "Icasso: software for investigating the reliability of ICA estimates by clustering and visualization," 2003 IEEE XIII Workshop on Neural Networks for Signal Processing (IEEE Cat. No.03TH8718), 2003, pp. 259-268, doi: 10.1109/NNSP.2003.1318025 - (see https://www.cs.helsinki.fi/u/ahyvarin/papers/Himberg03.pdf). + (see https://www.cs.helsinki.fi/u/ahyvarin/papers/Himberg03.pdf).
Picard algorithm and extensions: Pierre Ablin, Jean-Francois Cardoso, Alexandre Gramfort, "Faster independent component analysis by @@ -280,36 +285,36 @@ class StabilizedICA(BaseEstimator, TransformerMixin): UMAP: For more details about the UMAP (Uniform Manifold Approximation and Projection), see https://pypi.org/project/umap-learn/. - + Examples -------- >>> import pandas as pd - >>> from sica.base import StabilizedICA + >>> from sica.base import StabilizedICA >>> df = pd.read_csv("data.csv" , index_col = 0) >>> sICA = StabilizedICA(n_components = 45 , n_runs = 30, plot = True, n_jobs = -1) >>> sICA.fit(df) >>> Sources = pd.DataFrame(sICA.S_ , columns = df.index , index = ['source ' + str(i) for i in range(sICA.S_.shape[0])]) - >>> Sources.head() + >>> Sources.head() """ def __init__( - self, - n_components: int, - n_runs: int, - resampling: Optional[Union[str, None]] = None, - algorithm: Optional[str] = "fastica_par", - fun: Optional[str] = "logcosh", - whiten: Optional[bool] = True, - max_iter: Optional[int] = 2000, - plot: Optional[bool] = False, - normalize: Optional[bool] = True, - reorientation: Optional[bool] = True, - pca_solver: Optional[str] = "auto", - chunked: Optional[bool] = False, - chunk_size: Optional[Union[int, None]] = None, - zero_center: Optional[bool] = True, - n_jobs: Optional[int] = 1, - verbose: Optional[int] = 0, + self, + n_components: int, + n_runs: int, + resampling: Optional[Union[str, None]] = None, + algorithm: Optional[str] = "fastica_par", + fun: Optional[str] = "logcosh", + whiten: Optional[bool] = True, + max_iter: Optional[int] = 2000, + plot: Optional[bool] = False, + normalize: Optional[bool] = True, + reorientation: Optional[bool] = True, + pca_solver: Optional[str] = "auto", + chunked: Optional[bool] = False, + chunk_size: Optional[Union[int, None]] = None, + zero_center: Optional[bool] = True, + n_jobs: Optional[int] = 1, + verbose: Optional[int] = 0, ) -> NoReturn: super().__init__() self.n_components = n_components @@ -334,26 +339,26 @@ def __init__( self.stability_indexes_ = None def fit(self, X: np.ndarray, y: Optional[Any] = None) -> object: - """ Fit the ICA model with X (use stabilization). - + """Fit the ICA model with X (using stabilization). + 1. Compute the ICA components of X ``n_runs`` times. - - 2. Cluster all the ``n_components*n_runs`` components with agglomerative + + 2. Cluster all the ``n_components*n_runs`` components with agglomerative hierarchical clustering (average linkage) into ``n_components`` clusters. - + 3. For each cluster compute its stability index and return its centrotype as the - final ICA component. - + final ICA component. + Parameters ---------- X : 2D array-like, shape (n_mixtures, n_observations) or (n_components, n_observations) if whiten is False. - Training data + Training data y : Ignored Ignored. Returns - ------- self : object Returns the instance itself.
""" @@ -397,7 +402,7 @@ def fit(self, X: np.ndarray, y: Optional[Any] = None) -> object: "X_w": X_w, "method": self._method, "max_iter": self.max_iter, - "solver_params": self._solver_params + "solver_params": self._solver_params, }, ) @@ -429,7 +434,7 @@ def fit(self, X: np.ndarray, y: Optional[Any] = None) -> object: "method": self._method, "max_iter": self.max_iter, "solver_params": self._solver_params, - "n_components": self.n_components + "n_components": self.n_components, }, ) @@ -462,7 +467,7 @@ def fit(self, X: np.ndarray, y: Optional[Any] = None) -> object: "method": self._method, "max_iter": self.max_iter, "solver_params": self._solver_params, - "n_components": self.n_components + "n_components": self.n_components, }, ) @@ -498,8 +503,8 @@ def fit(self, X: np.ndarray, y: Optional[Any] = None) -> object: # Re-oriente the stabilized ICA components towards positive heaviest tails if self.reorientation: self.S_ = ( - np.where(stats.skew(Centrotypes, axis=1) >= 0, 1, -1).reshape(-1, 1) - ) * Centrotypes + np.where(stats.skew(Centrotypes, axis=1) >= 0, 1, -1).reshape(-1, 1) + ) * Centrotypes else: self.S_ = Centrotypes @@ -525,20 +530,18 @@ def fit(self, X: np.ndarray, y: Optional[Any] = None) -> object: return self - def _parallel_decomposition(self, - parallel: Parallel, - func: Callable[..., np.ndarray], - kwargs: dict - ) -> List[np.ndarray]: - """ Compute in parallel the n_runs runs of the ICA solver. If the solver comes from sklearn.FastICA, + def _parallel_decomposition( + self, parallel: Parallel, func: Callable[..., np.ndarray], kwargs: dict + ) -> List[np.ndarray]: + """Compute in parallel the n_runs runs of the ICA solver. If the solver comes from sklearn.FastICA, some potential convergence errors ar handled through multiple retryings. - + Parameters ---------- parallel : joblib.Parallel Object to use workers to compute in parallel the n_runs application of the function func to solve the ICA problem. - + func : callable Function to perform the ICA decomposition for a single run. It should return an array of ICA components of shape (n_components , n_observations) @@ -559,23 +562,31 @@ def _parallel_decomposition(self, decomposition = None while (attempt <= maxtrials) and (not success): try: - decomposition = parallel(delayed(func)(**kwargs) for _ in range(self.n_runs)) + decomposition = parallel( + delayed(func)(**kwargs) for _ in range(self.n_runs) + ) success = True except ValueError: - print("FastICA from sklearn did not converge due to numerical instabilities - Retrying...") + print( + "FastICA from sklearn did not converge due to numerical instabilities - Retrying..." + ) attempt += 1 if not success: raise ValueError("Too many attempts: FastICA did not converge !") else: - decomposition = parallel(delayed(func)(**kwargs) for _ in range(self.n_runs)) + decomposition = parallel( + delayed(func)(**kwargs) for _ in range(self.n_runs) + ) return decomposition @staticmethod - def _ICA_decomposition(X_w: np.ndarray, method: str, max_iter: int, solver_params: dict) -> np.ndarray: - """ Apply FastICA or picard (picard package) algorithm to the whitened matrix X_w to solve the ICA problem. - + def _ICA_decomposition( + X_w: np.ndarray, method: str, max_iter: int, solver_params: dict + ) -> np.ndarray: + """Apply FastICA or picard (picard package) algorithm to the whitened matrix X_w to solve the ICA problem. 
+ Parameters ---------- X_w : 2D array, shape (n_observations , n_components) @@ -589,11 +600,7 @@ def _ICA_decomposition(X_w: np.ndarray, method: str, max_iter: int, solver_param if method == "picard": _, _, S = picard( - X_w.T, - max_iter=max_iter, - whiten=False, - centering=False, - **solver_params + X_w.T, max_iter=max_iter, whiten=False, centering=False, **solver_params ) else: ica = FastICA(max_iter=max_iter, whiten=False, **solver_params) @@ -602,20 +609,21 @@ def _ICA_decomposition(X_w: np.ndarray, method: str, max_iter: int, solver_param @staticmethod def _ICA_decomposition_bootstrap( - X: np.ndarray, - whitening_params: dict, - method: str, - max_iter: int, - solver_params: dict, - n_components: int) -> np.ndarray: - """ Draw a bootstrap sample from the original data matrix X, whiten it and apply FastICA or picard + X: np.ndarray, + whitening_params: dict, + method: str, + max_iter: int, + solver_params: dict, + n_components: int, + ) -> np.ndarray: + """Draw a bootstrap sample from the original data matrix X, whiten it and apply FastICA or picard (picard package) algorithm to solve the ICA problem. - + Parameters ---------- X : 2D array, shape (n_observations , n_mixtures) Original data matrix. - + whitening_params : dict A dictionary containing the arguments to pass to the whitening function to whiten the bootstrap matrix. @@ -645,19 +653,20 @@ def _ICA_decomposition_bootstrap( @staticmethod def _ICA_decomposition_fast_bootstrap( - U: np.ndarray, - SVt: np.ndarray, - method: str, - max_iter: int, - solver_params: dict, - n_components: int) -> np.ndarray: - """ Draw a boostrap whitened sample from the original matrix X (svd decomposition of X = USVt) [1], and apply + U: np.ndarray, + SVt: np.ndarray, + method: str, + max_iter: int, + solver_params: dict, + n_components: int, + ) -> np.ndarray: + """Draw a bootstrap whitened sample from the original matrix X (SVD decomposition of X = USVt) [1], and apply FastICA or picard (picard package) algorithm to solve the ICA problem. - + Parameters ---------- U : 2D array, shape (n_observations , n_mixtures) - + SVt : 2D array, shape (n_mixtures , n_mixtures) Returns ------- S : 2D array, shape (n_components , n_observations) Array of sources obtained from a single run of an ICA solver and a bootstrap sample of the original matrix X. Each line corresponds to an ICA component. - + References ---------- [1] : Fisher A, Caffo B, Schwartz B, Zipunnikov V. Fast, Exact Bootstrap Principal Component Analysis for p > 1 million. @@ -676,7 +685,7 @@ def _ICA_decomposition_fast_bootstrap( Ab, Sb, Rbt = linalg.svd(SVt[:, np.random.choice(range(n), size=n)]) Ub = np.dot(U, Ab) Ub, Rbt = svd_flip(Ub, Rbt) - Xb_w = Ub[:, : n_components] * np.sqrt(p - 1) + Xb_w = Ub[:, :n_components] * np.sqrt(p - 1) if method == "picard": _, _, S = picard( @@ -692,7 +701,7 @@ def _ICA_decomposition_fast_bootstrap( return S def transform(self, X: np.ndarray, copy: Optional[bool] = True) -> np.ndarray: - """ Apply dimensionality reduction to X (i.e. recover the mixing matrix applying the pseudo-inverse + """Apply dimensionality reduction to X (i.e. recover the mixing matrix by applying the pseudo-inverse of the sources).
Parameters ---------- X : 2D array-like, shape (n_mixtures , n_observations) @@ -718,8 +727,10 @@ def transform(self, X: np.ndarray, copy: Optional[bool] = True) -> np.ndarray: A = X.T.dot(np.linalg.pinv(self.S_)) return A - def inverse_transform(self, X: np.ndarray, copy: Optional[bool] = True) -> np.ndarray: - """ Transform the mixing matrix back to the mixed data (applying the sources). + def inverse_transform( + self, X: np.ndarray, copy: Optional[bool] = True + ) -> np.ndarray: + """Transform the mixing matrix back to the mixed data (applying the sources). Parameters ---------- @@ -742,31 +753,31 @@ def inverse_transform(self, X: np.ndarray, copy: Optional[bool] = True) -> np.nd return X_new def projection( - self, - method: Optional[str] = "mds", - ax: Optional[Union[matplotlib.axes.Axes, None]] = None + self, + method: Optional[str] = "mds", + ax: Optional[Union[matplotlib.axes.Axes, None]] = None, ) -> None: """Plot the ``n_components*n_runs`` ICA components computed during fit() in 2D. Approximate the original dissimilarities between components by Euclidean distance. Each cluster is represented with a different color. - + Parameters - ---------- + ---------- method : string, optional Name of the dimensionality reduction method (e.g. "tsne" , "mds" or "umap"). The default is "mds". - + ax : matplotlib.axes, optional The default is None. - + Returns ------- None. - + Notes ----- - We use the dissimilarity measure ``sqrt(1 - |rho_ij|)`` (rho the Pearson correlation) instead of ``1 - |rho_ij|`` to reduce overlapping. - + - Please note that multidimensional scaling (MDS) is more computationally demanding than t-SNE or UMAP. However, it takes into account the global structure of the data set while the others don't. For t-SNE or UMAP one cannot really interpret the inter-cluster distances. """ @@ -787,7 +798,9 @@ def projection( elif method == "umap": embedding = umap.UMAP(n_components=2, metric="precomputed") else: - raise ValueError("method parameter value can only be 'tsne', 'mds' or 'umap'") + raise ValueError( + "method parameter value can only be 'tsne', 'mds' or 'umap'" + ) P = embedding.fit_transform(np.sqrt(1 - self._Sim)) @@ -795,71 +808,72 @@ def projection( return -def MSTD(X: np.ndarray, - m: int, - M: int, - step: int, - n_runs: int, - fun: Optional[str] = "logcosh", - algorithm: Optional[str] = "fastica_par", - whiten: Optional[bool] = True, - max_iter: Optional[int] = 2000, - n_jobs: Optional[int] = -1, - ax: Optional[Union[matplotlib.axes.Axes, None]] = None - ) -> None: +def MSTD( + X: np.ndarray, + m: int, + M: int, + step: int, + n_runs: int, + fun: Optional[str] = "logcosh", + algorithm: Optional[str] = "fastica_par", + whiten: Optional[bool] = True, + max_iter: Optional[int] = 2000, + n_jobs: Optional[int] = -1, + ax: Optional[Union[matplotlib.axes.Axes, None]] = None, +) -> None: """Plot "MSTD graphs" to help choose an optimal dimension for ICA decomposition. - + Run the stabilized ICA algorithm for several dimensions in [m , M] and compute the stability distribution of the components each time. - + Parameters ---------- X : 2D array, shape (n_mixtures, n_observations) Training data - + m : int Minimal dimension for ICA decomposition. - + M : int > m Maximal dimension for ICA decomposition. - + step : int > 0 Step between two dimensions (ex: if ``step = 2`` the function will test the dimensions m, m+2, m+4, ... , M). - + n_runs : int Number of times we run the FastICA algorithm (see fit method of class StabilizedICA) - + fun : str {'cube' , 'exp' , 'logcosh' , 'tanh'} or function, optional. The default is 'logcosh'.
See the fit method of StabilizedICA for more details. - + algorithm : str {'fastica_par' , 'fastica_def' , 'picard_fastica' , 'picard' , 'picard_ext' , 'picard_orth'}, optional. The algorithm applied for solving the ICA problem at each run. Please see the supplementary explanations for more details. The default is 'fastica_par', i.e. FastICA from sklearn with parallel implementation. - + whiten : bool, optional If True, X is whitened only once as an initial step, with an SVD solver and M components. If False, X must be already whitened, with M components. The default is True. - + max_iter : int, optional Parameter for _ICA_decomposition. The default is 2000. - + n_jobs : int Number of jobs to run in parallel for each stabilized ICA step. The default is -1. - + ax : array of matplotlib.axes objects, optional The default is None. - + Returns ------- None. - + References ---------- Kairov U, Cantini L, Greco A, Molkenov A, Czerwinska U, Barillot E, Zinovyev A. Determining the optimal number of independent components for reproducible transcriptomic data analysis. BMC Genomics. 2017 Sep 11;18(1):712. doi: 10.1186/s12864-017-4112-9. PMID: 28893186; PMCID: PMC5594474. (see https://bmcgenomics.biomedcentral.com/track/pdf/10.1186/s12864-017-4112-9 ). - + Examples -------- >>> import pandas as pd @@ -905,8 +919,15 @@ def MSTD(X: np.ndarray, # for i in range(m , M+step , step): #uncomment if you don't want to use tqdm (and comment the line below!) for i in tqdm(range(m, M + step, step)): - s = StabilizedICA(n_components=i, n_runs=n_runs, algorithm=algorithm, fun=fun, whiten=False, - max_iter=max_iter, n_jobs=n_jobs) + s = StabilizedICA( + n_components=i, + n_runs=n_runs, + algorithm=algorithm, + fun=fun, + whiten=False, + max_iter=max_iter, + n_jobs=n_jobs, + ) s.fit(X_w[:, :i].T) mean.append(np.mean(s.stability_indexes_)) ax[0].plot(range(1, len(s.stability_indexes_) + 1), s.stability_indexes_, "k") diff --git a/sica/mutualknn.py b/sica/mutualknn.py index 1ce3d1f..4795f42 100644 --- a/sica/mutualknn.py +++ b/sica/mutualknn.py @@ -17,20 +17,20 @@ class MNN(object): """Given two arrays X and Y or a precomputed distance matrix, computes the undirected adjacency matrix using the Mutual Nearest Neighbors method. - + Parameters ---------- X : 2D array of shape (n_components_1 , n_features_1), 2D array of shape (n_components_1 , n_components_2) if metric = "precomputed". - + Y : 2D array of shape (n_components_2 , n_features_2), optional The default is None. - + k : int > 0 Parameter for the Mutual Nearest Neighbor method (i.e. number of neighbors that we consider). - + metric : string - Metric for the computation of the adjacency matrix (e.g "pearson" , "spearman" + Metric for the computation of the adjacency matrix (e.g "pearson" , "spearman" or any metric accepted by ``scipy.spatial.distance.cdist``). Attributes ---------- distance : 2D array of shape (n_components_1 , n_components_2) maximal non-null distance between two different components. Notes - ----- - In the case where the distance matrix is not precomputed, we compute the distance between each row of X and Y. - + - In the case X and Y are dataframes, we consider only the common columns of X and Y. Otherwise, we assume that the columns are the same for X and Y.
- + """ def __init__( - self, - k: int, - metric: str, - X: Union[np.ndarray, pd.DataFrame], - Y: Optional[Union[np.ndarray, pd.DataFrame]] = None + self, + k: int, + metric: str, + X: Union[np.ndarray, pd.DataFrame], + Y: Optional[Union[np.ndarray, pd.DataFrame]] = None, ) -> NoReturn: self.X = X self.Y = Y @@ -70,12 +70,12 @@ def __init__( @staticmethod def compute_distance( - X: Union[np.ndarray, pd.DataFrame], - Y: Union[np.ndarray, pd.DataFrame], - metric: str + X: Union[np.ndarray, pd.DataFrame], + Y: Union[np.ndarray, pd.DataFrame], + metric: str, ) -> np.ndarray: """Compute the distance between each pair of rows of X and Y - + Parameters ---------- X : 2D array of shape (n_components_1 , n_features_1) @@ -119,12 +119,12 @@ def adjacency_matrix(self, weighted: bool) -> Union[np.ndarray, None]: """ bool_mask = ( - self.distance - <= np.sort(self.distance, axis=1)[:, self.k - 1].reshape(-1, 1) - ) * ( - self.distance - <= np.sort(self.distance, axis=0)[self.k - 1, :].reshape(1, -1) - ) + self.distance + <= np.sort(self.distance, axis=1)[:, self.k - 1].reshape(-1, 1) + ) * ( + self.distance + <= np.sort(self.distance, axis=0)[self.k - 1, :].reshape(1, -1) + ) if weighted: adjacency = bool_mask * self.distance @@ -134,28 +134,28 @@ def adjacency_matrix(self, weighted: bool) -> Union[np.ndarray, None]: class MNNgraph(object): - """ Given a list of data sets, draws the MNN graph with a networkx object (compatible with the software Cytoscape) - + """Given a list of data sets, draws the MNN graph with a networkx object (compatible with the software Cytoscape) + Parameters - ---------- + ---------- data : list of 2D data sets of shape (n_components_i , n_features_i) - + names : list of strings Names of the data sets. - + k : int > 0 Parameter for the Mutual Nearest Neighbors method. - + metric : string, optional Metric for the computation of the adjacency matrices. The default is "pearson". - + weighted : bool If True each coefficient of the adjacency matrix is weighted by the associated ``distance``, otherwise the coefficients are 0 or 1. Attributes - ---------- + ---------- graph_ : networkx object Each edge of the graph is associated with the following attributes: @@ -166,27 +166,27 @@ class MNNgraph(object): corresponds to the correlation coefficient itself while for other metrics it corresponds to 1 - minmax scaled distance (min and max are computed over all the between-nodes distances, including those that are not associated to an edge in the graph). For an unweighted graph, it equals 1 every time. - + Notes ----- If the elements of data are not dataframes, we assume that they all share the same features. 
- + Examples -------- >>> from sica.mutualknn import MNNgraph >>> cg = MNNgraph(data = [df1 , df2 , df3] , names=['dataframe1' , 'dataframe2' , 'dataframe3'] , k=1) - >>> cg.draw(colors = ['r', 'g' , 'b'] , spacing = 2) + >>> cg.draw(colors = ['r', 'g' , 'b'] , spacing = 2) >>> cg.export_json("example.json") - + """ def __init__( - self, - data: List[np.ndarray], - names: List[str], - k: int, - metric: Optional[str] = "pearson", - weighted: Optional[bool] = True + self, + data: List[np.ndarray], + names: List[str], + k: int, + metric: Optional[str] = "pearson", + weighted: Optional[bool] = True, ) -> NoReturn: self.data = data self.names = names @@ -196,28 +196,28 @@ def __init__( @staticmethod def create_graph( - data: Union[np.array, List[np.ndarray]], - names: List[str], - k: int, - metric: str, - weighted: bool + data: Union[np.array, List[np.ndarray]], + names: List[str], + k: int, + metric: str, + weighted: bool, ) -> nx.Graph: - """Create the MNN graph associated to the list of data sets. Two situations are + """Create the MNN graph associated with the list of data sets. Two situations are distinguished: one with only two data sets and another with more than two data sets. - + Parameters ---------- - data : list of 2D arrays of shape (n_components_i , n_features_i) - + data : list of 2D arrays of shape (n_components_i , n_features_i) + names : list of strings Names of the data sets. - + k : integer >= 1 Parameter for the Mutual Nearest Neighbors Method. - + metric : string Metric for the computation of the adjacency matrices. - + weighted : bool If True, each coefficient of the adjacency matrix is weighted by the associated ``distance``, otherwise the coefficients are 0 or 1. @@ -244,7 +244,10 @@ def create_graph( for u in range(h_w.shape[0]): for v in range(h_w.shape[1]): if h_w[u, v] > 0: - n1, n2 = (names[0] + " " + str(u + 1), names[1] + " " + str(v + 1),) + n1, n2 = ( + names[0] + " " + str(u + 1), + names[1] + " " + str(v + 1), + ) G.add_node( n1, weight=1, @@ -263,17 +266,27 @@ def create_graph( count += -5 if weighted: - if metric in ["pearson", "spearman", "kendall", "correlation", "cosine"]: - update = {e[:2]: {"weight": 1 - e[2], - "label": str(np.round(1 - e[2], 2)) - } - for e in G.edges.data("distance") - } + if metric in [ + "pearson", + "spearman", + "kendall", + "correlation", + "cosine", + ]: + update = { + e[:2]: {"weight": 1 - e[2], "label": str(np.round(1 - e[2], 2))} + for e in G.edges.data("distance") + } else: update = {} for e in G.edges.data("distance"): - temp = 1 - (e[2] - mutualnn.min_dist_) / (mutualnn.max_dist_ - mutualnn.min_dist_) - update[e[:2]] = {"weight": temp, "label": str(np.round(temp, 2))} + temp = 1 - (e[2] - mutualnn.min_dist_) / ( + mutualnn.max_dist_ - mutualnn.min_dist_ + ) + update[e[:2]] = { + "weight": temp, + "label": str(np.round(temp, 2)), + } nx.set_edge_attributes(G, update) else: @@ -292,44 +305,57 @@ def create_graph( for u in range(h_w.shape[0]): for v in range(h_w.shape[1]): if h_w[u, v] > 0: - n1, n2 = (L[i][0] + " " + str(u + 1), L[i][1] + " " + str(v + 1)) + n1, n2 = ( + L[i][0] + " " + str(u + 1), + L[i][1] + " " + str(v + 1), + ) G.add_node(n1, weight=1, data_set=L[i][0]) G.add_node(n2, weight=1, data_set=L[i][1]) G.add_edge(n1, n2, distance=h_w[u, v], weight=h_w[u, v]) if weighted: - if metric in ["pearson", "spearman", "kendall", "correlation", "cosine"]: - update = {e[:2]: {"weight": 1 - e[2]} for e in G.edges.data("distance")} + if metric in [ + "pearson", + "spearman", + "kendall", + "correlation", + "cosine", + ]: + 
update = { + e[:2]: {"weight": 1 - e[2]} for e in G.edges.data("distance") + } else: maxweight = max(collect_max) minweight = min(collect_min) update = {} for e in G.edges.data("distance"): - update[e[:2]] = {"weight": 1 - (e[2] - minweight) / (maxweight - minweight)} + update[e[:2]] = { + "weight": 1 - (e[2] - minweight) / (maxweight - minweight) + } nx.set_edge_attributes(G, update) return G def draw( - self, - bipartite_graph: Optional[bool] = False, - ax: Optional[matplotlib.axes.Axes] = None, - colors: Optional[list] = None, - spacing: Optional[float] = 1 + self, + bipartite_graph: Optional[bool] = False, + ax: Optional[matplotlib.axes.Axes] = None, + colors: Optional[list] = None, + spacing: Optional[float] = 1, ) -> None: """Draw the MNN graph. - + Parameters - ---------- + ---------- bipartite_graph : boolean, optional If True, a custom bipartite layout is used (only with two data sets). The default is False. - + ax : matplotlib.axes, optional The default is None. - + colors : list of matplotlib.colors, optional List of colors you want each data set to be associated with. The default is None. - + spacing : float >= 1, optional Controls the space between nodes. Increase this value to move nodes farther apart. @@ -445,16 +471,16 @@ def draw( def export_json(self, file_name: str) -> None: """Save the graph in a JSON file adapted to the Cytoscape format. - + Parameters ---------- file_name : string Name of the JSON file. - + Returns ------- None. - + """ dic = nx.readwrite.json_graph.cytoscape_data(self.graph_) with open(file_name, "w") as fp: diff --git a/tests/test_StabilizedICA.py b/tests/test_StabilizedICA.py index 51859d5..268833c 100644 --- a/tests/test_StabilizedICA.py +++ b/tests/test_StabilizedICA.py @@ -27,13 +27,13 @@ def center_and_norm(x: np.ndarray, axis: Optional[int] = -1) -> None: def create_data_superGauss( - add_noise: bool, - seed: int, - N: Optional[int] = 3, - T: Optional[int] = 1000, - M: Optional[Union[int, None]] = None + add_noise: bool, + seed: int, + N: Optional[int] = 3, + T: Optional[int] = 1000, + M: Optional[Union[int, None]] = None, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.random.RandomState]: - """ Create a simple data set with superGaussian-distributed sources (Laplace distribution). + """Create a simple data set with superGaussian-distributed sources (Laplace distribution). Parameters ---------- add_noise : bool @@ -74,14 +74,13 @@ def create_data_superGauss( def create_data_subGauss( - add_noise: int, - seed: int, - N: Optional[int] = 3, - T: Optional[int] = 1000, - M: Optional[Union[int, None]] = None + add_noise: bool, + seed: int, + N: Optional[int] = 3, + T: Optional[int] = 1000, + M: Optional[Union[int, None]] = None, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.random.RandomState]: - """ Create a simple data set with subGaussian-distributed sources (uniform distribution).
- """ + """Create a simple data set with subGaussian-distributed sources (uniform distribution).""" rng = np.random.RandomState(seed) S = rng.uniform(low=-1, high=1, size=(N, T)) center_and_norm(S) @@ -101,19 +100,19 @@ def create_data_subGauss( def create_data_mix( - add_noise: int, - seed: int, - N: Optional[int] = 3, - T: Optional[int] = 1000, - M: Optional[Union[int, None]] = None + add_noise: int, + seed: int, + N: Optional[int] = 3, + T: Optional[int] = 1000, + M: Optional[Union[int, None]] = None, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.random.RandomState]: - """ Create a simple data set with a mixture of superGaussian-distributed and - subGaussian-distributed sources (Laplace and uniform distributions). + """Create a simple data set with a mixture of superGaussian-distributed and + subGaussian-distributed sources (Laplace and uniform distributions). """ rng = np.random.RandomState(seed) S = np.zeros((3, 1000)) S[: int(N // 2), :] = rng.uniform(low=-1, high=1, size=(int(N // 2), T)) - S[int(N // 2):, :] = rng.laplace(size=(N - int(N // 2), T)) + S[int(N // 2) :, :] = rng.laplace(size=(N - int(N // 2), T)) center_and_norm(S) if M is None: A = rng.randn(N, N) @@ -131,18 +130,18 @@ def create_data_mix( def base_test_simple( - strategy: Tuple[str, str], - data: Tuple[np.ndarray, np.ndarray, np.ndarray, np.random.RandomState], - noise: bool + strategy: Tuple[str, str], + data: Tuple[np.ndarray, np.ndarray, np.ndarray, np.random.RandomState], + noise: bool, ) -> None: - """ Simple test function for stabilized ICA without resampling. + """Simple test function for stabilized ICA without resampling. Parameters ---------- strategy : (string , string) Pair of algorithm and function which characterizes the solver of the ICA problem (ex : ('fastica_par' , 'logcosh')). - data : array X, array A, array S, randomstate + data : array X, array A, array S, randomstate Simulated data. 
noise : bool @@ -160,19 +159,20 @@ def base_test_simple( normalizing = [True, False] reorienting = [True, False] for whiten, norm, reorient in itertools.product( - whitening, normalizing, reorienting + whitening, normalizing, reorienting ): - sica = StabilizedICA(n_components=S.shape[0], - n_runs=10, - fun=strategy[1], - algorithm=strategy[0], - whiten=whiten, - normalize=norm, - reorientation=reorient, - resampling=None, - n_jobs=1, - verbose=0 - ) + sica = StabilizedICA( + n_components=S.shape[0], + n_runs=10, + fun=strategy[1], + algorithm=strategy[0], + whiten=whiten, + normalize=norm, + reorientation=reorient, + resampling=None, + n_jobs=1, + verbose=0, + ) if whiten: test_A = sica.fit_transform(X) else: @@ -187,10 +187,14 @@ def base_test_simple( if not noise: if whiten: assert test_A.shape == A.shape - assert_almost_equal(X, np.dot(test_A, sica.S_) + sica.mean_.reshape(-1, 1)) + assert_almost_equal( + X, np.dot(test_A, sica.S_) + sica.mean_.reshape(-1, 1) + ) else: assert test_Aw.shape == (S.shape[0], S.shape[0]) - assert_almost_equal(X, pca.inverse_transform(np.dot(test_Aw, sica.S_).T).T) + assert_almost_equal( + X, pca.inverse_transform(np.dot(test_Aw, sica.S_).T).T + ) center_and_norm(sica.S_) s1_, s2_, s3_ = sica.S_ @@ -219,12 +223,12 @@ def base_test_simple( def base_test_with_bootstrap( - resampling: str, - strategy: Tuple[str, str], - data: Tuple[np.ndarray, np.ndarray, np.ndarray, np.random.RandomState], - noise: bool + resampling: str, + strategy: Tuple[str, str], + data: Tuple[np.ndarray, np.ndarray, np.ndarray, np.random.RandomState], + noise: bool, ) -> None: - """ Simple test function for StabilizedICA with resampling. + """Simple test function for StabilizedICA with resampling. Parameters ---------- resampling : {'bootstrap' , 'fast_bootstrap'} @@ -232,7 +236,7 @@ def base_test_with_bootstrap( strategy : (string , string) Pair of algorithm and function which characterizes the solver of the ICA problem (ex : ('fastica_par' , 'logcosh')). - data : array X, array A, array S, randomstate + data : array X, array A, array S, randomstate Simulated data. 
noise : bool @@ -318,7 +322,9 @@ def base_test_with_bootstrap( ("picard_ext", "tanh"), ], ) -def test_StabilizedICA_supGaussian(add_noise: bool, seed: bool, strategy: Tuple[str, str]): +def test_StabilizedICA_supGaussian( + add_noise: bool, seed: int, strategy: Tuple[str, str] +): data = create_data_superGauss(add_noise, seed, M=100) base_test_simple(strategy=strategy, data=data, noise=add_noise) @@ -336,7 +342,9 @@ def test_StabilizedICA_supGaussian(add_noise: bool, seed: bool, strategy: Tuple[ ("picard_ext", "tanh"), ], ) -def test_StabilizedICA_subGaussian(add_noise: bool, seed: bool, strategy: Tuple[str, str]): +def test_StabilizedICA_subGaussian( + add_noise: bool, seed: int, strategy: Tuple[str, str] +): data = create_data_subGauss(add_noise, seed, M=10) base_test_simple(strategy=strategy, data=data, noise=add_noise) @@ -381,9 +389,13 @@ def test_StabilizedICA_mix(add_noise: bool, seed: bool, strategy: Tuple[str, str ) @pytest.mark.parametrize("resampling", ["bootstrap", "fast_bootstrap"]) @pytest.mark.slow -def test_StabilizedICA_bootstrap_supGaussian(add_noise: bool, seed: bool, strategy: Tuple[str, str], resampling: str): +def test_StabilizedICA_bootstrap_supGaussian( + add_noise: bool, seed: int, strategy: Tuple[str, str], resampling: str +): data = create_data_superGauss(add_noise, seed, M=100) - base_test_with_bootstrap(resampling=resampling, strategy=strategy, data=data, noise=add_noise) + base_test_with_bootstrap( + resampling=resampling, strategy=strategy, data=data, noise=add_noise + ) @pytest.mark.parametrize("add_noise", [True, False]) @@ -401,9 +413,13 @@ def test_StabilizedICA_bootstrap_supGaussian(add_noise: bool, seed: bool, strate ) @pytest.mark.parametrize("resampling", ["bootstrap", "fast_bootstrap"]) @pytest.mark.slow -def test_StabilizedICA_bootstrap_subGaussian(add_noise: bool, seed: bool, strategy: Tuple[str, str], resampling: str): +def test_StabilizedICA_bootstrap_subGaussian( + add_noise: bool, seed: int, strategy: Tuple[str, str], resampling: str +): data = create_data_subGauss(add_noise, seed, M=100) - base_test_with_bootstrap(resampling=resampling, strategy=strategy, data=data, noise=add_noise) + base_test_with_bootstrap( + resampling=resampling, strategy=strategy, data=data, noise=add_noise + ) @pytest.mark.parametrize("add_noise", [True, False]) @@ -423,15 +439,24 @@ def test_StabilizedICA_bootstrap_subGaussian(add_noise: bool, seed: bool, strate ) @pytest.mark.parametrize("resampling", ["bootstrap", "fast_bootstrap"]) @pytest.mark.slow -def test_StabilizedICA_bootstrap_mix(add_noise: bool, seed: bool, strategy: Tuple[str, str], resampling: str): +def test_StabilizedICA_bootstrap_mix( + add_noise: bool, seed: int, strategy: Tuple[str, str], resampling: str +): data = create_data_mix(add_noise, seed, M=100) - base_test_with_bootstrap(resampling=resampling, strategy=strategy, data=data, noise=add_noise) + base_test_with_bootstrap( + resampling=resampling, strategy=strategy, data=data, noise=add_noise + ) def test_reorientation(): X, A, S, rng = create_data_superGauss(add_noise=False, seed=42) sica = StabilizedICA( - n_components=3, n_runs=10, reorientation=True, resampling=None, n_jobs=1, verbose=0 + n_components=3, + n_runs=10, + reorientation=True, + resampling=None, + n_jobs=1, + verbose=0, ) sica.fit(X)
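[Editor's note — not part of the patch. The `_centrotype` and `_stability_index` docstrings in sica/base.py defer to the cited references for the exact formulas. A self-contained sketch of those two quantities is given below; the helper names are hypothetical, and only the formulas follow the references (Himberg & Hyvarinen 2003; Kairov et al. 2017, section "Method").]

import numpy as np

def centrotype(X, Sim, cluster_labels):
    # Cluster member with the largest total similarity to the other members.
    idx = np.asarray(cluster_labels)
    scores = Sim[np.ix_(idx, idx)].sum(axis=1)
    return X[idx[np.argmax(scores)]]

def stability_index(Sim, cluster_labels):
    # Average intra-cluster similarity minus the average similarity of the
    # cluster's members to every component outside the cluster.
    idx = np.asarray(cluster_labels)
    outside = np.setdiff1d(np.arange(Sim.shape[0]), idx)
    intra = Sim[np.ix_(idx, idx)].mean()
    extra = Sim[np.ix_(idx, outside)].mean() if outside.size else 0.0
    return intra - extra

An index close to 1 flags a tight, well-separated (i.e. reproducible) cluster, which is why MSTD tracks how the distribution of these indexes degrades as the ICA dimension grows.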
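[Editor's note — not part of the patch. The boolean mask in MNN.adjacency_matrix (sica/mutualknn.py) encodes the mutual nearest neighbors rule: an entry (i, j) survives only if j is among the k nearest columns of row i AND i is among the k nearest rows of column j. A toy trace of the same row/column thresholding, with illustrative variable names:]

import numpy as np

D = np.array([[0.1, 0.9, 0.8],
              [0.9, 0.2, 0.3],
              [0.7, 0.4, 0.6]])  # toy distances between two component sets
k = 1
row_kth = np.sort(D, axis=1)[:, k - 1].reshape(-1, 1)  # k-th smallest per row
col_kth = np.sort(D, axis=0)[k - 1, :].reshape(1, -1)  # k-th smallest per column
mutual = (D <= row_kth) & (D <= col_kth)  # keep (i, j) only if the match is mutual
print(mutual.astype(int))
# [[1 0 0]
#  [0 1 0]
#  [0 0 0]]  -> pair (2, 1) is dropped: row 2 prefers column 1, but column 1 prefers row 1.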
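[Editor's note — not part of the patch. projection() feeds the dissimilarity sqrt(1 - |rho|) quoted in its Notes to an embedding with a precomputed metric. A minimal standalone equivalent using scikit-learn's MDS on toy data; the class stores |rho| in self._Sim, mirrored here by `sim`:]

import numpy as np
from sklearn.manifold import MDS

rng = np.random.RandomState(0)
components = rng.randn(20, 100)        # toy stack of ICA components
sim = np.abs(np.corrcoef(components))  # |rho|, as in self._Sim
dissim = np.sqrt(1 - sim)              # the dissimilarity measure from the Notes
P = MDS(n_components=2, dissimilarity="precomputed", random_state=0).fit_transform(dissim)
print(P.shape)  # (20, 2): one 2D point per component, ready to scatter-plot per cluster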