Skip to content

Commit

Permalink
Merge pull request #9 from thisis-nkul/RoboticsClubIITJ-master
Browse files Browse the repository at this point in the history
update fork
  • Loading branch information
0xnakul authored Mar 13, 2021
2 parents cd6a034 + db05722 commit 66aa5fe
Show file tree
Hide file tree
Showing 4 changed files with 194 additions and 8 deletions.
9 changes: 9 additions & 0 deletions Examples/agglomerative_clustering_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import numpy as np

from MLlib.models import Agglomerative_clustering

# Every row of the dataset file holds one point as two
# whitespace-separated coordinates.
points = np.genfromtxt('datasets/agglomerative_clustering.txt')

# Merge the points until 4 clusters remain (progress is printed by
# work()), then show the dendrogram built from the same points.
clusterer = Agglomerative_clustering()
clusterer.work(points, 4)
clusterer.plot(points)
8 changes: 8 additions & 0 deletions Examples/datasets/agglomerative_clustering.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
0.40 0.53
0.22 0.32
0.35 0.32
0.26 0.19
0.08 0.41
0.35 0.30
0.80 0.98
0.28 0.33
105 changes: 97 additions & 8 deletions MLlib/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from MLlib.utils.pca_utils import PCA_utils, infer_dimension
import MLlib.nn as nn
from collections import Counter, OrderedDict
from MLlib.utils.agglomerative_clustering_utils import compute_distance
import numpy as np
from numpy.random import random
from scipy.stats import norm
Expand All @@ -20,6 +21,7 @@
import matplotlib.pyplot as plt
from datetime import datetime
import math
import scipy.cluster.hierarchy as shc

DATE_FORMAT = '%d-%m-%Y_%H-%M-%S'

Expand Down Expand Up @@ -267,6 +269,7 @@ class PolynomialRegression():
Model in rob format , in Local
disk.
"""

def __init__(self, degree):
self.degree = degree
self.weights = 0
Expand All @@ -280,7 +283,7 @@ def fit(
epochs=200,
zeros=False,
save_best=True
):
):
"""
Train the Polynomial Regression Model
by fitting its associated weights,
Expand Down Expand Up @@ -446,8 +449,7 @@ def plot(
epochs=60,
zeros=False,
save_best=False
):

):
"""
Plot the graph of Loss vs Epochs
Plot the graph of line Of Polynomial Regression
Expand Down Expand Up @@ -503,7 +505,7 @@ def plot(
P = np.hstack((
P,
X[:, 1:2]
))
))

X = P
m = []
Expand Down Expand Up @@ -652,7 +654,6 @@ def Plot(self,
epochs=25,
zeros=False
):

"""
Plots for Logistic Regression.
Expand Down Expand Up @@ -1209,15 +1210,15 @@ def fit(self, x, y):
count_for_sample = x.shape[0]
self.class_log = [np.log(len(i)/count_for_sample) for i in separate]
count = self.alpha + np.array([np.array(i).sum(axis=0) for i in
separate])
separate])
smoothing = 2 * self.alpha
doc = np.array([smoothing + len(i) for i in separate])
self.log_prob = count / doc[np.newaxis].T
return self

def predict_log(self, x):
return [(np.log(self.log_prob) * i + np.log(1 - self.log_prob) *
np.abs(i - 1)).sum(axis=1) + self.class_log for i in x]
np.abs(i - 1)).sum(axis=1) + self.class_log for i in x]

def predict(self, x):
return np.argmax(self.predict_log(x), axis=1)
Expand Down Expand Up @@ -1387,6 +1388,7 @@ class PCA(PCA_utils):
data to project it to a lower dimensional space. The input data is centered
but not scaled for each feature before applying the SVD.
"""

def __init__(self, n_components=None, whiten=False, svd_solver='auto'):
self.n_components = n_components
self.whiten = whiten
Expand Down Expand Up @@ -1508,7 +1510,6 @@ def get_percentile(c, percentile_rank):
return d[index]

def get_outliers(x):

""" get_outliers Function
PARAMETER
=========
Expand Down Expand Up @@ -1563,3 +1564,91 @@ def forward(self, x):
for layer in self._submodules.values():
x = layer(x)
return x


class Agglomerative_clustering():
    """
    One of the models used for Unsupervised
    learning, by making finite number of clusters
    from Dataset points.

    ATTRIBUTES
    ==========
    None

    METHODS
    =======
    work(X, num_clusters):
        Give details about cluster arrangements
        from Dataset's Points
    plot(X):
        Draw the single-linkage dendrogram of the
        Dataset's Points
    """

    def work(self, X, num_clusters):
        """
        Show the arrangement of clusters, provided with
        number of clusters and Input Dataset Matrix.

        Starting from one cluster per point, the two closest
        clusters are merged repeatedly until only num_clusters
        clusters are left. Every step is reported with print().

        PARAMETERS
        ==========
        X: ndarray(ndim=2)
            Dataset Matrix with finite number
            of points, one point per row.
        num_clusters: int
            Number of Clusters to be made from
            the provided Dataset's points. When it is greater
            than or equal to X.shape[0] no merge is performed.

        RETURNS
        =======
        None

        RAISES
        ======
        ValueError
            If num_clusters is smaller than 1.
        """
        if num_clusters < 1:
            raise ValueError("num_clusters must be at least 1")

        # Every cluster is a flat list of points; initially each point
        # forms its own cluster.
        samples = [[list(point)] for point in X]
        m = len(samples)
        print("Samples before clustering : {}".format(samples))
        print("=============================================")
        while m > num_clusters:
            # Adjacency matrix of the distances between current clusters.
            Distance_mat = compute_distance(samples)
            # A cluster must never be selected as its own nearest
            # neighbour, whatever value the helper put on the diagonal.
            np.fill_diagonal(Distance_mat, np.inf)
            # argmin returns exactly one (row, column) pair, so tied
            # minima cannot produce a wrong or duplicated index.
            row, col = np.unravel_index(
                np.argmin(Distance_mat), Distance_mat.shape)
            # Remove the higher index so the lower one stays valid.
            keep, drop = sorted((int(row), int(col)))
            print("Sample size before clustering : ", m)
            print("Samples indexes to be merged: {}".format(
                np.array([keep, drop])))
            print("Samples before clustering: {}".format(samples))
            merged = samples.pop(drop)
            # extend (not append) keeps the cluster a flat list of points
            # instead of nesting one level deeper on every merge.
            samples[keep].extend(merged)
            print("Samples after clustering: {}".format(samples))
            m = len(samples)
            print("Sample size after clustering : ", m)
            print("=============================================")
        print("Number of clusters formed are : {}".format(m))
        print("Clusters formed are : {}".format(samples))

    def plot(self, X):
        """
        Plot the dendrogram of the Dataset's points.

        The linkage is computed by scipy with method='single'
        and shown in a new matplotlib figure.

        PARAMETERS
        ==========
        X: ndarray(ndim=2)
            Dataset Matrix, one point per row.

        RETURNS
        =======
        None
        """
        plt.figure(figsize=(10, 7))
        plt.title("Dendrograms")
        shc.dendrogram(shc.linkage(X, method='single'))
        plt.show()
80 changes: 80 additions & 0 deletions MLlib/utils/agglomerative_clustering_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import numpy as np


def compute_distance(samples):
    """
    Creates a matrix of distances between individual samples and clusters
    attained at a particular step

    PARAMETERS
    ==========
    samples: list
        Current clusters; samples[i] is passed unchanged to
        distance_calculate.

    RETURNS
    =======
    distance_mat: ndarray(dtype=float,ndim=2)
        Square matrix of shape (len(samples), len(samples)) where
        entry [i, j] is the distance between samples[i] and
        samples[j]. The diagonal holds np.inf so that a sample is
        never reported as its own nearest neighbour.
    """
    count = len(samples)
    # np.inf instead of a fixed large number: a finite sentinel becomes
    # the matrix minimum as soon as real distances exceed it.
    distance_mat = np.full((count, count), np.inf)
    for i in range(count):
        # The distance is symmetric, so each pair is evaluated once and
        # mirrored across the diagonal.
        for j in range(i + 1, count):
            dist = float(distance_calculate(samples[i], samples[j]))
            distance_mat[i, j] = dist
            distance_mat[j, i] = dist
    return distance_mat


def distance_calculate(sample1, sample2):
    """
    Distance calculated between two samples/clusters.

    Each argument is a sequence of points; an entry may itself be a
    (possibly nested) group of points left over from earlier merges.
    Both arguments are flattened to their individual points and the
    smallest Euclidean distance between a point of sample1 and a point
    of sample2 is returned.

    PARAMETERS
    ==========
    sample1, sample2: sequence
        Sequences of points, arbitrarily nested.

    RETURNS
    =======
    Smallest point-to-point distance between the two arguments.

    RAISES
    ======
    ValueError
        If either argument contains no point.
    """
    def _is_sequence(value):
        return isinstance(value, (list, tuple)) or (
            isinstance(value, np.ndarray) and value.ndim > 0)

    def _points(entry):
        # Yield the individual points stored in entry. A sequence whose
        # first item is not itself a sequence is one point.
        if _is_sequence(entry):
            if len(entry) == 0:
                return
            if _is_sequence(entry[0]):
                for item in entry:
                    yield from _points(item)
                return
        yield np.asarray(entry)

    # Flattening first avoids building ragged arrays, which numpy
    # refuses to create from nested clusters.
    points1 = [p for entry in sample1 for p in _points(entry)]
    points2 = [p for entry in sample2 for p in _points(entry)]
    if not points1 or not points2:
        raise ValueError("both samples must contain at least one point")
    return min(np.linalg.norm(p - q) for p in points1 for q in points2)


def intersampledist(s1, s2):
    """
    To be used in case we have one sample and one cluster.

    Either argument may be a single point (a flat sequence of
    numbers) or a cluster (a sequence of points, possibly nested by
    earlier merges). Both are flattened to their individual points
    and the smallest Euclidean distance between a point of s1 and a
    point of s2 is returned.

    PARAMETERS
    ==========
    s1, s2: sequence
        A point or an arbitrarily nested cluster of points.

    RETURNS
    =======
    Smallest point-to-point distance between s1 and s2.

    RAISES
    ======
    ValueError
        If either argument contains no point.
    """
    def _is_sequence(value):
        return isinstance(value, (list, tuple)) or (
            isinstance(value, np.ndarray) and value.ndim > 0)

    def _points(entry):
        # Yield the individual points stored in entry. A sequence whose
        # first item is not itself a sequence is one point.
        if _is_sequence(entry):
            if len(entry) == 0:
                return
            if _is_sequence(entry[0]):
                for item in entry:
                    yield from _points(item)
                return
        yield np.asarray(entry)

    # isinstance-based flattening replaces the comparison of type names
    # as strings, which was always true and never reached its fallback.
    points1 = list(_points(s1))
    points2 = list(_points(s2))
    if not points1 or not points2:
        raise ValueError("both arguments must contain at least one point")
    return min(np.linalg.norm(p - q) for p in points1 for q in points2)


def interclusterdist(cluster, sample):
    """
    Smallest Euclidean distance between a cluster and a sample.

    Either argument may be a single point (a flat sequence of
    numbers) or a sequence of points, possibly nested. Both are
    flattened to their individual points before comparing.

    PARAMETERS
    ==========
    cluster: sequence
        A point or an arbitrarily nested sequence of points.
    sample: sequence
        A point or an arbitrarily nested sequence of points.

    RETURNS
    =======
    Smallest point-to-point distance between cluster and sample.

    RAISES
    ======
    ValueError
        If either argument contains no point.
    """
    def _is_sequence(value):
        return isinstance(value, (list, tuple)) or (
            isinstance(value, np.ndarray) and value.ndim > 0)

    def _points(entry):
        # Yield the individual points stored in entry. A sequence whose
        # first item is not itself a sequence is one point.
        if _is_sequence(entry):
            if len(entry) == 0:
                return
            if _is_sequence(entry[0]):
                for item in entry:
                    yield from _points(item)
                return
        yield np.asarray(entry)

    # A real type check decides whether an argument is one point or
    # several; comparing an element with a type-name string never did.
    cluster_points = list(_points(cluster))
    sample_points = list(_points(sample))
    if not cluster_points or not sample_points:
        raise ValueError("both arguments must contain at least one point")
    return min(np.linalg.norm(p - q)
               for p in cluster_points for q in sample_points)

0 comments on commit 66aa5fe

Please sign in to comment.