write proper documentation

barahona-research-group · Apr 26, 2024 · 266d361 · 266d361
1 parent 41ab488
commit 266d361
Show file tree

Hide file tree

Showing 4 changed files with 165 additions and 25 deletions.
diff --git a/README.md b/README.md
@@ -254,7 +254,7 @@ If you are interested in trying our other packages, see the below list:
 
 [9] Z. Liu and M. Barahona, 'Graph-based data clustering via multiscale community detection', *Applied Network Science*, vol. 5, no. 1, p. 3, Dec. 2020, doi: 10.1007/s41109-019-0248-7.
 
-[10] T. Berry and T. Suaer, 'Consistent manifold representation for topological data analysis', *Foundations of Data Science*, vol. 1, no. 1, p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001.
+[10] T. Berry and T. Sauer, 'Consistent manifold representation for topological data analysis', *Foundations of Data Science*, vol. 1, no. 1, p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001.
 
 ## Licence
 

diff --git a/docs/index_readme.md b/docs/index_readme.md
@@ -239,7 +239,7 @@ If you are interested in trying our other packages, see the below list:
 
 [9] Z. Liu and M. Barahona, 'Graph-based data clustering via multiscale community detection', *Applied Network Science*, vol. 5, no. 1, p. 3, Dec. 2020, doi: 10.1007/s41109-019-0248-7.
 
-[10] T. Berry and T. Suaer, 'Consistent manifold representation for topological data analysis', *Foundations of Data Science*, vol. 1, no. 1, p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001.
+[10] T. Berry and T. Sauer, 'Consistent manifold representation for topological data analysis', *Foundations of Data Science*, vol. 1, no. 1, p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001.
 
 ## Licence
 

diff --git a/examples/Example_7_data_clustering.ipynb b/examples/Example_7_data_clustering.ipynb
@@ -137,25 +137,24 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 50/50 [00:00<00:00, 432.72it/s]\n",
-      "100%|██████████| 50/50 [00:39<00:00,  1.28it/s]\n",
-      "100%|██████████| 50/50 [00:00<00:00, 92.64it/s]\n"
+      "100%|██████████| 50/50 [00:00<00:00, 952.91it/s]\n",
+      "100%|██████████| 50/50 [00:13<00:00,  3.68it/s]\n",
+      "100%|██████████| 50/50 [00:00<00:00, 277.15it/s]\n"
      ]
     }
    ],
    "source": [
     "# apply multiscale data clustering to synthetic data\n",
     "clustering = pgs.DataClustering(\n",
     "    metric=\"euclidean\",\n",
-    "    graph_method=\"cknn\",\n",
+    "    graph_method=\"cknn-mst\",\n",
     "    k=5,\n",
     "    delta=1.0,\n",
     "    constructor=\"linearized\",\n",
     "    min_scale=-3.0,\n",
     "    max_scale=0.0,\n",
     "    n_scale=50,\n",
-    "    with_spectral_gap=True)\n",
-    "results = clustering.fit(X)"
+    "    with_spectral_gap=True).fit(X)"
    ]
   },
   {
@@ -178,7 +177,7 @@
    ],
    "source": [
     "# identify optimal scales and plot scan\n",
-    "clustering.scale_selection(kernel_size=0.2)\n",
+    "labels = clustering.scale_selection(kernel_size=0.2)\n",
     "clustering.plot_scan()"
    ]
   },

diff --git a/src/pygenstability/data_clustering.py b/src/pygenstability/data_clustering.py
@@ -14,7 +14,7 @@
 from pygenstability.contrib.sankey import plot_sankey as pgs_plot_sankey
 
 
-def compute_CkNN(D, k=5, delta=1):
+def _compute_CkNN(D, k=5, delta=1):
     """Computes CkNN graph."""
     # obtain rescaled distance matrix, see CkNN paper
     darray_n_nbrs = np.partition(D, k)[:, [k]]
@@ -24,13 +24,13 @@ def compute_CkNN(D, k=5, delta=1):
     return A
 
 
-class GraphConstruction:
+class _GraphConstruction:
     """Graph construction."""
 
     def __init__(
         self,
         metric="euclidean",
-        method="cknn",
+        method="cknn-mst",
         k=5,
         delta=1.0,
         distance_threshold=np.inf,
@@ -63,10 +63,10 @@ def get_graph(self, X):
         S = 1 - D_norm
 
         # sparsify distance matrix with CkNN or kNN method
-        if self.method == "cknn":
-            sparse = compute_CkNN(D_norm, self.k, self.delta)
+        if self.method == "cknn-mst":
+            sparse = _compute_CkNN(D_norm, self.k, self.delta)
 
-        elif self.method == "knn":
+        elif self.method == "knn-mst":
             sparse = kneighbors_graph(
                 D_norm, n_neighbors=self.k, metric="precomputed"
             ).toarray()
@@ -84,13 +84,75 @@ def get_graph(self, X):
         return self.adjacency_
 
 
-class DataClustering(GraphConstruction):
-    """Data clustering."""
+class DataClustering(_GraphConstruction):
+    """Class for multiscale graph-based data clustering.
+
+    This class provides an interface for multiscale graph-based data clustering [1]_
+    with PyGenStability.
+
+    Parameters
+    ----------
+    metric : str or function, default='euclidean'
+        The distance metric to use. The distance function can be ‘braycurtis’, ‘canberra’,
+        ‘chebyshev’, ‘cityblock’, ‘correlation’, ‘cosine’, ‘dice’, ‘euclidean’, ‘hamming’,
+        ‘jaccard’, ‘jensenshannon’, ‘kulczynski1’, ‘mahalanobis’, ‘matching’, ‘minkowski’,
+        ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’,
+        ‘sqeuclidean’, ‘yule’.
+
+    graph_method : {'knn-mst', 'cknn-mst', 'precomputed'}, default='cknn-mst'
+        Method to construct graph from sample-by-feature matrix:
+
+        - 'knn-mst' will use k-Nearest Neighbor graph combined with Minimum Spanning Tree.
+        - 'cknn-mst' will use Continuous k-Nearest Neighbor graph [2]_ combined with
+          Minimum Spanning Tree.
+        - 'precomputed' assumes that data is already provided as adjacency matrix of a
+          sparse graph.
+
+    k : int, default=5
+        Number of neighbors considered in graph construction. This parameter is expected
+        to be positive.
+
+    delta : float, default=1.0
+        Density parameter for Continuous k-Nearest Neighbor graph. This parameter is
+        expected to be positive.
+
+    distance_threshold : float, optional
+        Optional thresholding of distance matrix.
+
+    **pgs_kwargs : dict, optional
+        Parameters for PyGenStability, see documentation. Some possible arguments:
+        - constructor (str/function): name of the generalized Markov Stability constructor,
+        or custom constructor function. It must have two arguments, graph and scale.
+        - min_scale (float): minimum Markov scale
+        - max_scale (float): maximum Markov scale
+        - n_scale (int): number of scale steps
+        - with_spectral_gap (bool): normalise scale by spectral gap
+
+    Attributes
+    ----------
+    adjacency_ : sparse matrix of shape (n_samples,n_samples)
+        Sparse adjacency matrix of constructed graph.
+
+    results_ : dict
+        PyGenStability results dictionary, see documentation for all arguments.
+
+    labels_ : list of ndarray
+        List of robust partitions identified with optimal scale selection.
+
+    References
+    ----------
+    .. [1] Z. Liu and M. Barahona, 'Graph-based data clustering via multiscale
+       community detection', *Applied Network Science*, vol. 5, no. 1, p. 3,
+       Dec. 2020, doi: 10.1007/s41109-019-0248-7.
+    .. [2] T. Berry and T. Sauer, 'Consistent manifold representation for
+       topological data analysis', *Foundations of Data Science*, vol. 1, no. 1,
+       p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001.
+    """
 
     def __init__(
         self,
         metric="euclidean",
-        graph_method="cknn",
+        graph_method="cknn-mst",
         k=5,
         delta=1.0,
         distance_threshold=np.inf,
@@ -132,19 +194,58 @@ def labels_(self):
         return labels
 
     def fit(self, X):
-        """Construct graph and run PyGenStability for multiscale data clustering."""
+        """Fit multiscale graph-based data clustering with PyGenStability from data.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples,n_features) or \
+            (n_samples,n_samples) if graph_method='precomputed'
+
+        Returns
+        -------
+        self : DataClustering
+            The fitted multiscale graph-based data clustering.
+        """
         # construct graph
         self.adjacency_ = csr_matrix(self.get_graph(X))
 
         # run PyGenStability
         self.results_ = pgs_run(self.adjacency_, **self.pgs_kwargs)
 
-        return self.results_
+        return self
 
     def scale_selection(
         self, kernel_size=0.1, window_size=0.1, max_nvi=1, basin_radius=0.01
     ):
-        """Identify optimal scales."""
+        """Identify optimal scales [3]_.
+
+        Parameters
+        ----------
+        kernel_size : int or float, default=0.1
+            Size of kernel for average-pooling of the NVI(t,t') matrix. If float smaller
+            than one it's the relative number of scales.
+
+        window_size : int or float, default=0.1
+            Size of window for moving mean, to smooth the pooled diagonal. If float smaller
+            than one it's the relative number of scales.
+
+        max_nvi : float, default=1
+            Threshold for local minima of the pooled diagonal.
+
+        basin_radius : int or float, default=0.01
+            Radius of basin around local minima of the pooled diagonal. If float smaller
+            than one it's the relative number of scales.
+
+        Returns
+        -------
+        labels_ : list of ndarray
+            List of robust partitions identified with optimal scale selection.
+
+        References
+        ----------
+        .. [3] D. J. Schindler, J. Clarke, and M. Barahona, 'Multiscale Mobility Patterns and
+               the Restriction of Human Movement', *arXiv:2201.06323*, 2023
+        """
         # transform relative values to absolute values
         if kernel_size < 1:
             kernel_size = int(kernel_size * self.results_["run_params"]["n_scale"])
@@ -164,17 +265,37 @@ def scale_selection(
             basin_radius=basin_radius,
         )
 
+        return self.labels_
+
     def plot_scan(self):
-        """Plot PyGenStability scan."""
+        """Plot summary figure for PyGenStability scan."""
         if self.results_ is None:
             return
 
         pgs_plot_scan(self.results_)
 
     def plot_robust_partitions(
-        self, x_coord, y_coord, edge_width=1, node_size=20, cmap="tab20"
+        self, x_coord, y_coord, edge_width=1.0, node_size=20.0, cmap="tab20"
     ):
-        """Plot robust partitions."""
+        """Plot robust partitions with graph layout.
+
+        Parameters
+        ----------
+        x_coord : ndarray of shape (n_samples,)
+            X-coordinates provided for samples.
+
+        y_coord : ndarray of shape (n_samples,)
+            Y-coordinates provided for samples.
+
+        edge_width : float, default=1.0
+            Edge width of graph. This parameter is expected to be positive.
+
+        node_size : float, default=20.0
+            Node size in graph. This parameter is expected to be positive.
+
+        cmap : str, default='tab20'
+            Color map for cluster colors.
+        """
         for m, partition in enumerate(self.labels_):
 
             # plot
@@ -210,7 +331,27 @@ def plot_sankey(
         filename="communities_sankey.html",
         scale_index=None,
     ):
-        """Plot Sankey diagram."""
+        """Plot Sankey diagram.
+
+        Parameters
+        ----------
+        optimal_scales : bool, default=True
+            If True, plot the Sankey diagram of robust partitions only.
+
+        live : bool, default=False
+            If True, interactive figure will appear in browser.
+
+        filename : str, default="communities_sankey.html"
+            Filename to save the plot.
+
+        scale_index : array-like of int, default=None
+            Plot Sankey diagram for provided scale indices.
+
+        Returns
+        -------
+        fig : plotly figure
+            Sankey diagram figure.
+        """
         # plot non-trivial optimal scales only
         if optimal_scales:
             n_partitions = len(self.labels_)