From 266d3618abe7fe4b4b7ef90fe5c2ab2e0d26ba40 Mon Sep 17 00:00:00 2001 From: d-schindler <60650591+d-schindler@users.noreply.github.com> Date: Fri, 26 Apr 2024 12:32:44 +0200 Subject: [PATCH] write proper documentation --- README.md | 2 +- docs/index_readme.md | 2 +- examples/Example_7_data_clustering.ipynb | 13 +- src/pygenstability/data_clustering.py | 173 ++++++++++++++++++++--- 4 files changed, 165 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 3064f6b..53cc885 100755 --- a/README.md +++ b/README.md @@ -254,7 +254,7 @@ If you are interested in trying our other packages, see the below list: [9] Z. Liu and M. Barahona, 'Graph-based data clustering via multiscale community detection', *Applied Network Science*, vol. 5, no. 1, p. 3, Dec. 2020, doi: 10.1007/s41109-019-0248-7. -[10] T. Berry and T. Suaer, 'Consistent manifold representation for topological data analysis', *Foundations of Data Science*, vol. 1, no. 1, p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001. +[10] T. Berry and T. Sauer, 'Consistent manifold representation for topological data analysis', *Foundations of Data Science*, vol. 1, no. 1, p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001. ## Licence diff --git a/docs/index_readme.md b/docs/index_readme.md index da189b8..b327c1c 100644 --- a/docs/index_readme.md +++ b/docs/index_readme.md @@ -239,7 +239,7 @@ If you are interested in trying our other packages, see the below list: [9] Z. Liu and M. Barahona, 'Graph-based data clustering via multiscale community detection', *Applied Network Science*, vol. 5, no. 1, p. 3, Dec. 2020, doi: 10.1007/s41109-019-0248-7. -[10] T. Berry and T. Suaer, 'Consistent manifold representation for topological data analysis', *Foundations of Data Science*, vol. 1, no. 1, p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001. +[10] T. Berry and T. Sauer, 'Consistent manifold representation for topological data analysis', *Foundations of Data Science*, vol. 1, no. 1, p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001. 
## Licence diff --git a/examples/Example_7_data_clustering.ipynb b/examples/Example_7_data_clustering.ipynb index 68cd033..420f978 100644 --- a/examples/Example_7_data_clustering.ipynb +++ b/examples/Example_7_data_clustering.ipynb @@ -137,9 +137,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 50/50 [00:00<00:00, 432.72it/s]\n", - "100%|██████████| 50/50 [00:39<00:00, 1.28it/s]\n", - "100%|██████████| 50/50 [00:00<00:00, 92.64it/s]\n" + "100%|██████████| 50/50 [00:00<00:00, 952.91it/s]\n", + "100%|██████████| 50/50 [00:13<00:00, 3.68it/s]\n", + "100%|██████████| 50/50 [00:00<00:00, 277.15it/s]\n" ] } ], @@ -147,15 +147,14 @@ "# apply multiscale data clustering to synthetic data\n", "clustering = pgs.DataClustering(\n", " metric=\"euclidean\",\n", - " graph_method=\"cknn\",\n", + " graph_method=\"cknn-mst\",\n", " k=5,\n", " delta=1.0,\n", " constructor=\"linearized\",\n", " min_scale=-3.0,\n", " max_scale=0.0,\n", " n_scale=50,\n", - " with_spectral_gap=True)\n", - "results = clustering.fit(X)" + " with_spectral_gap=True).fit(X)" ] }, { @@ -178,7 +177,7 @@ ], "source": [ "# identify optimal scales and plot scan\n", - "clustering.scale_selection(kernel_size=0.2)\n", + "labels = clustering.scale_selection(kernel_size=0.2)\n", "clustering.plot_scan()" ] }, diff --git a/src/pygenstability/data_clustering.py b/src/pygenstability/data_clustering.py index 3ea94ac..a749b2d 100644 --- a/src/pygenstability/data_clustering.py +++ b/src/pygenstability/data_clustering.py @@ -14,7 +14,7 @@ from pygenstability.contrib.sankey import plot_sankey as pgs_plot_sankey -def compute_CkNN(D, k=5, delta=1): +def _compute_CkNN(D, k=5, delta=1): """Computes CkNN graph.""" # obtain rescaled distance matrix, see CkNN paper darray_n_nbrs = np.partition(D, k)[:, [k]] @@ -24,13 +24,13 @@ def compute_CkNN(D, k=5, delta=1): return A -class GraphConstruction: +class _GraphConstruction: """Graph construction.""" def __init__( self, metric="euclidean", - method="cknn", + 
method="cknn-mst", k=5, delta=1.0, distance_threshold=np.inf, @@ -63,10 +63,10 @@ def get_graph(self, X): S = 1 - D_norm # sparsify distance matrix with CkNN or kNN method - if self.method == "cknn": - sparse = compute_CkNN(D_norm, self.k, self.delta) + if self.method == "cknn-mst": + sparse = _compute_CkNN(D_norm, self.k, self.delta) - elif self.method == "knn": + elif self.method == "knn-mst": sparse = kneighbors_graph( D_norm, n_neighbors=self.k, metric="precomputed" ).toarray() @@ -84,13 +84,75 @@ def get_graph(self, X): return self.adjacency_ -class DataClustering(GraphConstruction): - """Data clustering.""" +class DataClustering(_GraphConstruction): + """Class for multiscale graph-based data clustering. + + This class provides an interface for multiscale graph-based data clustering [1]_ + with PyGenStability. + + Parameters + ---------- + metric : str or function, default='euclidean' + The distance metric to use. The distance function can be ‘braycurtis’, ‘canberra’, + ‘chebyshev’, ‘cityblock’, ‘correlation’, ‘cosine’, ‘dice’, ‘euclidean’, ‘hamming’, + ‘jaccard’, ‘jensenshannon’, ‘kulczynski1’, ‘mahalanobis’, ‘matching’, ‘minkowski’, + ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, + ‘sqeuclidean’, ‘yule’. + + graph_method : {'knn-mst', 'cknn-mst', 'precomputed'}, default='cknn-mst' + Method to construct graph from sample-by-feature matrix: + + - 'knn-mst' will use k-Nearest Neighbor graph combined with Minimum Spanning Tree. + - 'cknn-mst' will use Continuous k-Nearest Neighbor graph [2]_ combined with + Minimum Spanning Tree. + - 'precomputed' assumes that data is already provided as adjacency matrix of a + sparse graph. + + k : int, default=5 + Number of neighbors considered in graph construction. This parameter is expected + to be positive. + + delta : float, default=1.0 + Density parameter for Continuous k-Nearest Neighbor graph. This parameter is + expected to be positive. 
+ + distance_threshold : float, optional + Optional thresholding of distance matrix. + + **pgs_kwargs : dict, optional + Parameters for PyGenStability, see documentation. Some possible arguments: + - constructor (str/function): name of the generalized Markov Stability constructor, + or custom constructor function. It must have two arguments, graph and scale. + - min_scale (float): minimum Markov scale + - max_scale (float): maximum Markov scale + - n_scale (int): number of scale steps + - with_spectral_gap (bool): normalise scale by spectral gap + + Attributes: + ----------- + adjacency_ : sparse matrix of shape (n_samples,n_samples) + Sparse adjacency matrix of constructed graph. + + results_ : dict + PyGenStability results dictionary, see documentation for all arguments. + + labels_ : list of ndarray + List of robust partitions identified with optimal scale selection. + + References: + ----------- + .. [1] Z. Liu and M. Barahona, 'Graph-based data clustering via multiscale + community detection', *Applied Network Science*, vol. 5, no. 1, p. 3, + Dec. 2020, doi: 10.1007/s41109-019-0248-7. + .. [2] T. Berry and T. Sauer, 'Consistent manifold representation for + topological data analysis', *Foundations of Data Science*, vol. 1, no. 1, + p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001. + """ def __init__( self, metric="euclidean", - graph_method="cknn", + graph_method="cknn-mst", k=5, delta=1.0, distance_threshold=np.inf, @@ -132,19 +194,58 @@ def labels_(self): return labels def fit(self, X): - """Construct graph and run PyGenStability for multiscale data clustering.""" + """Fit multiscale graph-based data clustering with PyGenStability from data. + + Parameters: + ----------- + X : {array-like, sparse matrix} of shape (n_samples,n_features) or \ + (n_samples,n_samples) if graph_method='precomputed' + + Returns: + ------- + self : DataClustering + The fitted multiscale graph-based data clustering. 
+ """ # construct graph self.adjacency_ = csr_matrix(self.get_graph(X)) # run PyGenStability self.results_ = pgs_run(self.adjacency_, **self.pgs_kwargs) - return self.results_ + return self def scale_selection( self, kernel_size=0.1, window_size=0.1, max_nvi=1, basin_radius=0.01 ): - """Identify optimal scales.""" + """Identify optimal scales [3]_. + + Parameters: + ----------- + kernel_size : int or float, default=0.1 + Size of kernel for average-pooling of the NVI(t,t') matrix. If float smaller + than one it's the relative number of scales. + + window_size : int or float, default=0.1 + Size of window for moving mean, to smooth the pooled diagonal. If float smaller + than one it's the relative number of scales. + + max_nvi : float, default=1 + Threshold for local minima of the pooled diagonal. + + basin_radius : int or float, default=0.01 + Radius of basin around local minima of the pooled diagonal. If float smaller + than one it's the relative number of scales. + + Returns: + -------- + labels_ : list of ndarray + List of robust partitions identified with optimal scale selection. + + References: + ----------- + .. [3] D. J. Schindler, J. Clarke, and M. Barahona, 'Multiscale Mobility Patterns and + the Restriction of Human Movement', *arXiv:2201.06323*, 2023 + """ # transform relative values to absolute values if kernel_size < 1: kernel_size = int(kernel_size * self.results_["run_params"]["n_scale"]) @@ -164,17 +265,37 @@ def scale_selection( basin_radius=basin_radius, ) + return self.labels_ + def plot_scan(self): - """Plot PyGenStability scan.""" + """Plot summary figure for PyGenStability scan.""" if self.results_ is None: return pgs_plot_scan(self.results_) def plot_robust_partitions( - self, x_coord, y_coord, edge_width=1, node_size=20, cmap="tab20" + self, x_coord, y_coord, edge_width=1.0, node_size=20.0, cmap="tab20" ): - """Plot robust partitions.""" + """Plot robust partitions with graph layout. 
+ + Parameters: + ---------- + x_coord : ndarray of shape (n_samples,) + X-coordinates provided for samples. + + y_coord : ndarray of shape (n_samples,) + Y-coordinates provided for samples. + + edge_width : float, default=1.0 + Edge width of graph. This parameter is expected to be positive. + + node_size : float, default=20.0 + Node size in graph. This parameter is expected to be positive. + + cmap : str, default='tab20' + Color map for cluster colors. + """ for m, partition in enumerate(self.labels_): # plot @@ -210,7 +331,27 @@ def plot_sankey( filename="communities_sankey.html", scale_index=None, ): - """Plot Sankey diagram.""" + """Plot Sankey diagram. + + Parameters: + ----------- + optimal_scales : bool, default=True + If True, plot Sankey diagram of robust partitions only. + + live : bool, default=False + If True, interactive figure will appear in browser. + + filename : str, default="communities_sankey.html" + Filename to save the plot. + + scale_index : bool + Plot Sankey diagram for provided scale indices. + + Returns: + -------- + fig : plotly figure + Sankey diagram figure. + """ # plot non-trivial optimal scales only if optimal_scales: n_partitions = len(self.labels_)