Merge pull request #9 from thisis-nkul/RoboticsClubIITJ-master

update fork
RoboticsClubIITJ · Mar 13, 2021 · 66aa5fe · 66aa5fe
2 parents cd6a034 + db05722
commit 66aa5fe
Show file tree

Hide file tree

Showing 4 changed files with 194 additions and 8 deletions.
diff --git a/Examples/agglomerative_clustering_example.py b/Examples/agglomerative_clustering_example.py
@@ -0,0 +1,9 @@
+from pathlib import Path
+
+import numpy as np
+
+from MLlib.models import Agglomerative_clustering
+
+# Resolve the dataset relative to this script rather than the current
+# working directory, so the example also runs when it is launched from
+# somewhere other than the Examples/ folder.
+DATASET = (
+    Path(__file__).resolve().parent
+    / 'datasets'
+    / 'agglomerative_clustering.txt'
+)
+
+
+def main():
+    """Cluster the sample points into 4 groups and show the dendrogram."""
+    X = np.genfromtxt(DATASET)
+
+    model = Agglomerative_clustering()
+    model.work(X, 4)
+    model.plot(X)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/Examples/datasets/agglomerative_clustering.txt b/Examples/datasets/agglomerative_clustering.txt
@@ -0,0 +1,8 @@
+0.40 0.53
+0.22 0.32
+0.35 0.32
+0.26 0.19
+0.08 0.41
+0.35 0.30
+0.80 0.98
+0.28 0.33
diff --git a/MLlib/models.py b/MLlib/models.py
@@ -11,6 +11,7 @@
 from MLlib.utils.pca_utils import PCA_utils, infer_dimension
 import MLlib.nn as nn
 from collections import Counter, OrderedDict
+from MLlib.utils.agglomerative_clustering_utils import compute_distance
 import numpy as np
 from numpy.random import random
 from scipy.stats import norm
@@ -20,6 +21,7 @@
 import matplotlib.pyplot as plt
 from datetime import datetime
 import math
+import scipy.cluster.hierarchy as shc
 
 DATE_FORMAT = '%d-%m-%Y_%H-%M-%S'
 
@@ -267,6 +269,7 @@ class PolynomialRegression():
         Model in rob format , in Local
         disk.
     """
+
     def __init__(self, degree):
         self.degree = degree
         self.weights = 0
@@ -280,7 +283,7 @@ def fit(
             epochs=200,
             zeros=False,
             save_best=True
-            ):
+    ):
         """
         Train the Polynomial Regression Model
         by fitting its associated weights,
@@ -446,8 +449,7 @@ def plot(
             epochs=60,
             zeros=False,
             save_best=False
-            ):
-
+    ):
         """
         Plot the graph of Loss vs Epochs
         Plot the graph of line Of Polynomial Regression
@@ -503,7 +505,7 @@ def plot(
         P = np.hstack((
             P,
             X[:, 1:2]
-            ))
+        ))
 
         X = P
         m = []
@@ -652,7 +654,6 @@ def Plot(self,
              epochs=25,
              zeros=False
              ):
-
         """
         Plots for Logistic Regression.
 
@@ -1209,15 +1210,15 @@ def fit(self, x, y):
         count_for_sample = x.shape[0]
         self.class_log = [np.log(len(i)/count_for_sample) for i in separate]
         count = self.alpha + np.array([np.array(i).sum(axis=0) for i in
-                                      separate])
+                                       separate])
         smoothing = 2 * self.alpha
         doc = np.array([smoothing + len(i) for i in separate])
         self.log_prob = count / doc[np.newaxis].T
         return self
 
     def predict_log(self, x):
         return [(np.log(self.log_prob) * i + np.log(1 - self.log_prob) *
-                np.abs(i - 1)).sum(axis=1) + self.class_log for i in x]
+                 np.abs(i - 1)).sum(axis=1) + self.class_log for i in x]
 
     def predict(self, x):
         return np.argmax(self.predict_log(x), axis=1)
@@ -1387,6 +1388,7 @@ class PCA(PCA_utils):
     data to project it to a lower dimensional space. The input data is centered
     but not scaled for each feature before applying the SVD.
     """
+
     def __init__(self, n_components=None, whiten=False, svd_solver='auto'):
         self.n_components = n_components
         self.whiten = whiten
@@ -1508,7 +1510,6 @@ def get_percentile(c, percentile_rank):
         return d[index]
 
     def get_outliers(x):
-
         """ get_outliers Function
          PARAMETER
            =========
@@ -1563,3 +1564,91 @@ def forward(self, x):
         for layer in self._submodules.values():
             x = layer(x)
         return x
+
+
+class Agglomerative_clustering():
+    """
+    One of the models used for Unsupervised
+    learning, by making finite number of clusters
+    from Dataset points.
+
+    Clusters are built bottom-up: every point starts
+    as its own cluster and the two closest clusters
+    are merged until the requested number remains.
+
+    ATTRIBUTES
+    ==========
+
+    None
+
+    METHODS
+    =======
+
+    work(X, num_clusters):
+        Give details about cluster arrangements
+        from Dataset's Points
+
+    plot(X):
+        Show the single-linkage dendrogram of
+        the Dataset's Points
+    """
+
+    def work(self, X, num_clusters):
+        """
+        Show the arrangement of clusters , provided with
+        number of clusters and Input Dataset
+        Matrix. Progress of every merge is printed
+        to standard output.
+
+        PARAMETERS
+        ==========
+
+        X: ndarray(ndim=2)
+            Dataset Matrix with finite number
+            of points, one row per point holding
+            that point's coordinates.
+
+        num_clusters: int
+            Number of Clusters to be made from
+            the provided Dataset's points. Must be
+            at least 1. When it is greater than or
+            equal to X.shape[0] no merge is performed.
+
+        RETURNS
+        =======
+
+        None
+
+        RAISES
+        ======
+
+        ValueError
+            If num_clusters is smaller than 1.
+        """
+
+        if num_clusters < 1:
+            raise ValueError(
+                "num_clusters must be at least 1, got {}".format(
+                    num_clusters))
+
+        # every point starts out as its own single-element cluster;
+        # a cluster is always kept as a flat list of points
+        samples = [[list(point)] for point in X]
+        m = len(samples)
+        print("Samples before clustering : {}".format(samples))
+        print("=============================================")
+        while m > num_clusters:
+            # adjacency matrix of the distances between current clusters
+            Distance_mat = compute_distance(samples)
+            # a cluster must never be picked as its own nearest
+            # neighbour, whatever value the helper puts on the diagonal
+            np.fill_diagonal(Distance_mat, np.inf)
+            # argmin yields exactly one (row, column) pair, so tied
+            # minima can no longer mix the indexes of different pairs
+            row, col = np.unravel_index(
+                np.argmin(Distance_mat), Distance_mat.shape)
+            keep, drop = sorted((int(row), int(col)))
+            sample_ind_needed = np.array([keep, drop])
+            print("Sample size before clustering   : ", m)
+            print("Samples indexes to be merged: {}".format(sample_ind_needed))
+            print("Samples before clustering: {}".format(samples))
+            # drop > keep, so popping it does not shift samples[keep]
+            value_to_add = samples.pop(drop)
+            # extend (not append) keeps the merged cluster flat
+            samples[keep].extend(value_to_add)
+            print("Samples after clustering: {}".format(samples))
+            m = len(samples)
+            print("Sample size after clustering   : ", m)
+            print("=============================================")
+        print("Number of clusters formed are : {}".format(m))
+        print("Clusters formed are  : {}".format(samples))
+
+    def plot(self, X):
+        """
+        Plot the dendrogram of the Dataset's points,
+        built with SciPy's single-linkage clustering.
+
+        PARAMETERS
+        ==========
+
+        X: ndarray(ndim=2)
+            Dataset Matrix, one row per point.
+
+        RETURNS
+        =======
+
+        None
+        """
+        plt.figure(figsize=(10, 7))
+        plt.title("Dendrograms")
+        shc.dendrogram(shc.linkage(X, method='single'))
+        plt.show()
diff --git a/MLlib/utils/agglomerative_clustering_utils.py b/MLlib/utils/agglomerative_clustering_utils.py
@@ -0,0 +1,80 @@
+import numpy as np
+
+
+def compute_distance(samples):
+    """
+    Creates a matrix of distances between individual samples and clusters
+    attained at a particular step
+
+    PARAMETERS
+    ==========
+
+    samples: list
+        Current clusters; entry [i, j] of the result is
+        distance_calculate(samples[i], samples[j]).
+
+    RETURNS
+    =======
+
+    distance_mat: ndarray(dtype=float, ndim=2)
+        Square symmetric matrix of shape (len(samples), len(samples)).
+        The diagonal holds infinity so that a cluster is never the
+        minimum of the matrix, whatever the scale of the data.
+    """
+    count = len(samples)
+    # a finite sentinel on the diagonal could be smaller than a genuine
+    # distance; infinity cannot
+    distance_mat = np.full((count, count), np.inf)
+    for i in range(count):
+        # the distance is symmetric, so each pair is computed only once
+        for j in range(i + 1, count):
+            gap = float(distance_calculate(samples[i], samples[j]))
+            distance_mat[i, j] = gap
+            distance_mat[j, i] = gap
+    return distance_mat
+
+
+# types that are treated as "a sequence of things" while walking a cluster
+_SEQUENCE_TYPES = (list, tuple, np.ndarray)
+
+
+def _leaf_points(node):
+    """
+    Return every point stored in `node` as a list of 1-D float arrays.
+
+    `node` may be a single point (a sequence of coordinates), a flat
+    cluster (a sequence of points) or clusters nested to any depth.
+    Empty sequences contribute no point.
+    """
+    if not any(isinstance(item, _SEQUENCE_TYPES) for item in node):
+        # nothing nested below: the node itself is one point (or is empty)
+        return [np.asarray(node, dtype=float)] if len(node) else []
+    points = []
+    for item in node:
+        if not isinstance(item, _SEQUENCE_TYPES):
+            raise ValueError(
+                "a cluster cannot mix coordinates with nested points")
+        points.extend(_leaf_points(item))
+    return points
+
+
+def _closest_gap(first, second):
+    """
+    Smallest Euclidean distance between any point of `first` and any
+    point of `second`.
+    """
+    points_a = _leaf_points(first)
+    points_b = _leaf_points(second)
+    if not points_a or not points_b:
+        raise ValueError("cannot measure the distance to an empty cluster")
+    return min(np.linalg.norm(p - q) for p in points_a for q in points_b)
+
+
+def distance_calculate(sample1, sample2):
+    """
+    Distance calculated between two samples.
+    Each argument may be a single point, a cluster of points or a
+    cluster nested to any depth; the result is the smallest Euclidean
+    distance between a point of `sample1` and a point of `sample2`.
+
+    Raises ValueError if either argument holds no point.
+    """
+    return _closest_gap(sample1, sample2)
+
+
+def intersampledist(s1, s2):
+    """
+    To be used in case we have one sample and one cluster.
+    Either argument may be the sample or the cluster (nested to any
+    depth); the result is the smallest Euclidean distance between the
+    points they hold.
+
+    Raises ValueError if either argument holds no point.
+    """
+    return _closest_gap(s1, s2)
+
+
+def interclusterdist(cluster, sample):
+    """
+    Smallest Euclidean distance between the points of `cluster` and
+    `sample`, where `sample` may be a single point or itself a cluster.
+
+    Raises ValueError if either argument holds no point.
+    """
+    return _closest_gap(cluster, sample)