Skip to content

Commit

Permalink
write proper documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
d-schindler committed Apr 26, 2024
1 parent 41ab488 commit 266d361
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 25 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ If you are interested in trying our other packages, see the below list:

[9] Z. Liu and M. Barahona, 'Graph-based data clustering via multiscale community detection', *Applied Network Science*, vol. 5, no. 1, p. 3, Dec. 2020, doi: 10.1007/s41109-019-0248-7.

[10] T. Berry and T. Suaer, 'Consistent manifold representation for topological data analysis', *Foundations of Data Science*, vol. 1, no. 1, p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001.
[10] T. Berry and T. Sauer, 'Consistent manifold representation for topological data analysis', *Foundations of Data Science*, vol. 1, no. 1, p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001.

## Licence

Expand Down
2 changes: 1 addition & 1 deletion docs/index_readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ If you are interested in trying our other packages, see the below list:

[9] Z. Liu and M. Barahona, 'Graph-based data clustering via multiscale community detection', *Applied Network Science*, vol. 5, no. 1, p. 3, Dec. 2020, doi: 10.1007/s41109-019-0248-7.

[10] T. Berry and T. Suaer, 'Consistent manifold representation for topological data analysis', *Foundations of Data Science*, vol. 1, no. 1, p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001.
[10] T. Berry and T. Sauer, 'Consistent manifold representation for topological data analysis', *Foundations of Data Science*, vol. 1, no. 1, p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001.

## Licence

Expand Down
13 changes: 6 additions & 7 deletions examples/Example_7_data_clustering.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -137,25 +137,24 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 50/50 [00:00<00:00, 432.72it/s]\n",
"100%|██████████| 50/50 [00:39<00:00, 1.28it/s]\n",
"100%|██████████| 50/50 [00:00<00:00, 92.64it/s]\n"
"100%|██████████| 50/50 [00:00<00:00, 952.91it/s]\n",
"100%|██████████| 50/50 [00:13<00:00, 3.68it/s]\n",
"100%|██████████| 50/50 [00:00<00:00, 277.15it/s]\n"
]
}
],
"source": [
"# apply multiscale data clustering to synthetic data\n",
"clustering = pgs.DataClustering(\n",
" metric=\"euclidean\",\n",
" graph_method=\"cknn\",\n",
" graph_method=\"cknn-mst\",\n",
" k=5,\n",
" delta=1.0,\n",
" constructor=\"linearized\",\n",
" min_scale=-3.0,\n",
" max_scale=0.0,\n",
" n_scale=50,\n",
" with_spectral_gap=True)\n",
"results = clustering.fit(X)"
" with_spectral_gap=True).fit(X)"
]
},
{
Expand All @@ -178,7 +177,7 @@
],
"source": [
"# identify optimal scales and plot scan\n",
"clustering.scale_selection(kernel_size=0.2)\n",
"labels = clustering.scale_selection(kernel_size=0.2)\n",
"clustering.plot_scan()"
]
},
Expand Down
173 changes: 157 additions & 16 deletions src/pygenstability/data_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pygenstability.contrib.sankey import plot_sankey as pgs_plot_sankey


def compute_CkNN(D, k=5, delta=1):
def _compute_CkNN(D, k=5, delta=1):
"""Computes CkNN graph."""
# obtain rescaled distance matrix, see CkNN paper
darray_n_nbrs = np.partition(D, k)[:, [k]]
Expand All @@ -24,13 +24,13 @@ def compute_CkNN(D, k=5, delta=1):
return A


class GraphConstruction:
class _GraphConstruction:
"""Graph construction."""

def __init__(
self,
metric="euclidean",
method="cknn",
method="cknn-mst",
k=5,
delta=1.0,
distance_threshold=np.inf,
Expand Down Expand Up @@ -63,10 +63,10 @@ def get_graph(self, X):
S = 1 - D_norm

# sparsify distance matrix with CkNN or kNN method
if self.method == "cknn":
sparse = compute_CkNN(D_norm, self.k, self.delta)
if self.method == "cknn-mst":
sparse = _compute_CkNN(D_norm, self.k, self.delta)

elif self.method == "knn":
elif self.method == "knn-mst":
sparse = kneighbors_graph(
D_norm, n_neighbors=self.k, metric="precomputed"
).toarray()
Expand All @@ -84,13 +84,75 @@ def get_graph(self, X):
return self.adjacency_


class DataClustering(GraphConstruction):
"""Data clustering."""
class DataClustering(_GraphConstruction):
"""Class for multiscale graph-based data clustering.
This class provides an interface for multiscale graph-based data clustering [1]_
with PyGenStability.
Parameters
----------
metric : str or function, default='euclidean'
The distance metric to use. The distance function can be ‘braycurtis’, ‘canberra’,
‘chebyshev’, ‘cityblock’, ‘correlation’, ‘cosine’, ‘dice’, ‘euclidean’, ‘hamming’,
‘jaccard’, ‘jensenshannon’, ‘kulczynski1’, ‘mahalanobis’, ‘matching’, ‘minkowski’,
‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’,
‘sqeuclidean’, ‘yule’.
graph_method : {'knn-mst', 'cknn-mst', 'precomputed'}, default='cknn-mst'
Method to construct graph from sample-by-feature matrix:
- 'knn-mst' will use k-Nearest Neighbor graph combined with Minimum Spanning Tree.
- 'cknn-mst' will use Continuous k-Nearest Neighbor graph [2]_ combined with
Minimum Spanning Tree.
- 'precomputed' assumes that data is already provided as adjacency matrix of a
sparse graph.
k : int, default=5
Number of neighbors considered in graph construction. This parameter is expected
to be positive.
delta : float, default=1.0
Density parameter for Continuous k-Nearest Neighbor graph. This parameter is
expected to be positive.
distance_threshold : float, optional
Optional thresholding of distance matrix.
**pgs_kwargs : dict, optional
Parameters for PyGenStability, see documentation. Some possible arguments:
- constructor (str/function): name of the generalized Markov Stability constructor,
or custom constructor function. It must have two arguments, graph and scale.
- min_scale (float): minimum Markov scale
- max_scale (float): maximum Markov scale
- n_scale (int): number of scale steps
- with_spectral_gap (bool): normalise scale by spectral gap
Attributes
-----------
adjacency_ : sparse matrix of shape (n_samples,n_samples)
Sparse adjacency matrix of constructed graph.
results_ : dict
PyGenStability results dictionary, see documentation for all arguments.
labels_ : list of ndarray
List of robust partitions identified with optimal scale selection.
References
-----------
.. [1] Z. Liu and M. Barahona, 'Graph-based data clustering via multiscale
community detection', *Applied Network Science*, vol. 5, no. 1, p. 3,
Dec. 2020, doi: 10.1007/s41109-019-0248-7.
.. [2] T. Berry and T. Sauer, 'Consistent manifold representation for
topological data analysis', *Foundations of Data Science*, vol. 1, no. 1,
p. 1-38, Feb. 2019, doi: 10.3934/fods.2019001.
"""

def __init__(
self,
metric="euclidean",
graph_method="cknn",
graph_method="cknn-mst",
k=5,
delta=1.0,
distance_threshold=np.inf,
Expand Down Expand Up @@ -132,19 +194,58 @@ def labels_(self):
return labels

def fit(self, X):
"""Construct graph and run PyGenStability for multiscale data clustering."""
"""Fit multiscale graph-based data clustering with PyGenStability from data.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples,n_features) or \
(n_samples,n_samples) if graph_method='precomputed'
Returns
-------
self : DataClustering
The fitted multiscale graph-based data clustering.
"""
# construct graph
self.adjacency_ = csr_matrix(self.get_graph(X))

# run PyGenStability
self.results_ = pgs_run(self.adjacency_, **self.pgs_kwargs)

return self.results_
return self

def scale_selection(
self, kernel_size=0.1, window_size=0.1, max_nvi=1, basin_radius=0.01
):
"""Identify optimal scales."""
"""Identify optimal scales [3].
Parameters
----------
kernel_size : int or float, default=0.1
Size of kernel for average-pooling of the NVI(t,t') matrix. If float smaller
than one it's the relative number of scales.
window_size : int or float, default=0.1
Size of window for moving mean, to smooth the pooled diagonal. If float smaller
than one it's the relative number of scales.
max_nvi : float, default=1
Threshold for local minima of the pooled diagonal.
basin_radius : int or float, default=0.01
Radius of basin around local minima of the pooled diagonal. If float smaller
than one it's the relative number of scales.
Returns
-------
labels_ : list of ndarray
List of robust partitions identified with optimal scale selection.
References
----------
.. [3] D. J. Schindler, J. Clarke, and M. Barahona, 'Multiscale Mobility Patterns and
the Restriction of Human Movement', *arXiv:2201.06323*, 2023
"""
# transform relative values to absolute values
if kernel_size < 1:
kernel_size = int(kernel_size * self.results_["run_params"]["n_scale"])
Expand All @@ -164,17 +265,37 @@ def scale_selection(
basin_radius=basin_radius,
)

return self.labels_

def plot_scan(self):
"""Plot PyGenStability scan."""
"""Plot summary figure for PyGenStability scan."""
if self.results_ is None:
return

pgs_plot_scan(self.results_)

def plot_robust_partitions(
self, x_coord, y_coord, edge_width=1, node_size=20, cmap="tab20"
self, x_coord, y_coord, edge_width=1.0, node_size=20.0, cmap="tab20"
):
"""Plot robust partitions."""
"""Plot robust partitions with graph layout.
Parameters
----------
x_coord : ndarray of shape (n_samples,)
X-coordinates provided for samples.
y_coord : ndarray of shape (n_samples,)
Y-coordinates provided for samples.
edge_width : float, default=1.0
Edge width of graph. This parameter is expected to be positive.
node_size : float, default=20.0
Node size in graph. This parameter is expected to be positive.
cmap : str, default='tab20'
Color map for cluster colors.
"""
for m, partition in enumerate(self.labels_):

# plot
Expand Down Expand Up @@ -210,7 +331,27 @@ def plot_sankey(
filename="communities_sankey.html",
scale_index=None,
):
"""Plot Sankey diagram."""
"""Plot Sankey diagram.
Parameters
----------
optimal_scales : bool, default=True
If True, plot the Sankey diagram for robust partitions only.
live : bool, default=False
If True, interactive figure will appear in browser.
filename : str, default="communities_sankey.html"
Filename to save the plot.
scale_index : array-like of int, optional
Plot Sankey diagram for provided scale indices.
Returns
-------
fig : plotly figure
Sankey diagram figure.
"""
# plot non-trivial optimal scales only
if optimal_scales:
n_partitions = len(self.labels_)
Expand Down

0 comments on commit 266d361

Please sign in to comment.