generated from sharif-ml-lab/IMDb-IR-System
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclustering_metrics.py
69 lines (55 loc) · 2.25 KB
/
clustering_metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import numpy as np
from typing import List
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import confusion_matrix
class ClusteringMetrics:
    """Collection of evaluation metrics for clustering results.

    The class holds no state; every method is a pure function of its
    arguments.
    """

    def __init__(self):
        """Create a metrics helper (there is nothing to configure)."""

    def silhouette_score(self, embeddings: List, cluster_labels: List) -> float:
        """
        Calculate the average silhouette score for the given cluster assignments.

        Parameters
        ----------
        embeddings: List
            A list of vectors representing the data points.
        cluster_labels: List
            A list of cluster assignments for each data point.

        Returns
        -------
        float
            The average silhouette score, ranging from -1 to 1, where a higher
            value indicates better clustering.
        """
        # Method bodies do not search the class namespace, so this bare name
        # resolves to the module-level scikit-learn import rather than
        # recursing into this method.
        return silhouette_score(embeddings, cluster_labels)

    def purity_score(self, true_labels: List, cluster_labels: List) -> float:
        """
        Calculate the purity score for the given cluster assignments and ground truth labels.

        Each cluster is credited with the size of its most frequent
        ground-truth class; purity is the sum of those counts divided by the
        number of samples.

        Parameters
        ----------
        true_labels: List
            A list of ground truth labels for each data point (Genres).
        cluster_labels: List
            A list of cluster assignments for each data point.

        Returns
        -------
        float
            The purity score, ranging from 0 to 1, where a higher value
            indicates better clustering.

        Raises
        ------
        ValueError
            If the two label sequences differ in length or are empty.
        """
        classes = np.asarray(true_labels).ravel()
        clusters = np.asarray(cluster_labels).ravel()

        if classes.size != clusters.size:
            raise ValueError(
                "true_labels and cluster_labels must have the same length "
                f"(got {classes.size} and {clusters.size})"
            )
        if classes.size == 0:
            raise ValueError("purity_score requires at least one sample")

        # Encode the two label sets independently. Class labels (e.g. genre
        # strings) and cluster ids (e.g. integers) live in unrelated label
        # spaces, so they must not be merged into a single label set.
        _, class_idx = np.unique(classes, return_inverse=True)
        _, cluster_idx = np.unique(clusters, return_inverse=True)

        # contingency[i, j] = number of samples of class i placed in cluster j.
        contingency = np.zeros(
            (class_idx.max() + 1, cluster_idx.max() + 1), dtype=np.int64
        )
        # np.add.at accumulates correctly when the same (i, j) pair repeats.
        np.add.at(contingency, (class_idx, cluster_idx), 1)

        majority_total = contingency.max(axis=0).sum()
        return float(majority_total / classes.size)

    def adjusted_rand_score(self, true_labels: List, cluster_labels: List) -> float:
        """
        Calculate the adjusted Rand index for the given cluster assignments and ground truth labels.

        Parameters
        ----------
        true_labels: List
            A list of ground truth labels for each data point (Genres).
        cluster_labels: List
            A list of cluster assignments for each data point.

        Returns
        -------
        float
            The adjusted Rand index. 1.0 means the two labelings agree
            perfectly, values near 0 correspond to chance-level agreement,
            and the score can be negative for worse-than-chance agreement.
        """
        # As in silhouette_score, the bare name is the module-level
        # scikit-learn function, not this method.
        return adjusted_rand_score(true_labels, cluster_labels)