Project1_Q2(b)_Alishbah_Fahad_1001924185.py
#!/usr/bin/env python
# coding: utf-8
# # Q2(b)
# ### Importing Libraries and Data
# In[1]:
import numpy as np
from math import sqrt, exp, pi
# Strip parentheses and spaces from a raw line and split it into fields
def clean_data(line):
    return line.replace('(', '').replace(')', '').replace(' ', '').strip().split(',')

# Read a dataset file and return a list of cleaned rows
def fetch_data(filename):
    with open(filename, 'r') as f:
        input_data = f.readlines()
    # the 'with' block closes the file automatically, so no explicit f.close() is needed
    clean_input = list(map(clean_data, input_data))
    return clean_input

# Load a dataset file into a NumPy array of strings
def readFile(dataset_path):
    input_data = fetch_data(dataset_path)
    input_np = np.array(input_data)
    return input_np
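# For illustration only (the exact file layout is an assumption): a raw line such as
# "(1.70, 70, 25, W)" would be cleaned to the row ['1.70', '70', '25', 'W'],
# so readFile returns a 2-D NumPy array of strings with one such row per input line.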
training = r"C:\Users\alish\OneDrive\Documents\Alishbah\CSE6363_Machine Learning\Project-1\axf4185_project_1\dataset\Training_Data.txt"
test = r"C:\Users\alish\OneDrive\Documents\Alishbah\CSE6363_Machine Learning\Project-1\axf4185_project_1\dataset\Test Data.txt"
Training_Data = readFile(training)
Test_Data = readFile(test)
print("Training Data:")
print(Training_Data)
print()
print("Test Data:")
print(Test_Data)
# ### Implementing Gaussian Naïve Bayes Classifier
# In[2]:
# Encoding the class labels: 'W' becomes '1' and 'M' becomes '0'
for i in Training_Data:
    if i[3] == 'W':
        i[3] = i[3].replace('W', '1')
    else:
        i[3] = i[3].replace('M', '0')
# With the labels numeric, the whole string array can be converted to floats
Training_Data = Training_Data.astype(float)
# Split Training data by class
def separate_by_class(Trainingdata):
    separated = dict()
    for i in range(len(Trainingdata)):
        vector = Trainingdata[i]
        class_value = vector[-1]
        if class_value not in separated:
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated
splitted_data = separate_by_class(Training_Data)
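# splitted_data maps each class label to its rows, i.e. {1.0: rows labelled 'W', 0.0: rows labelled 'M'}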
# Calculating mean
def mean(numbers):
    return sum(numbers)/float(len(numbers))

# Calculating the sample standard deviation (n-1 in the denominator)
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
    return sqrt(variance)
# Calculating mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    # drop the statistics of the last column (the class label itself)
    del(summaries[-1])
    return summaries
# Split dataset by class then calculate statistics for each Feature
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries
summary = summarize_by_class(Training_Data)
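# For each class, summary holds one (mean, stdev, count) tuple per feature column,
# e.g. {1.0: [(mean, stdev, n) for each feature], 0.0: [(mean, stdev, n) for each feature]}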
# Calculating Gaussian probability distribution function
def calculate_probability(x, mean, stdev):
    exponent = exp(-((x-mean)**2 / (2 * stdev**2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent
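# This implements the Gaussian density f(x) = 1/(stdev*sqrt(2*pi)) * exp(-(x-mean)^2/(2*stdev^2)).
# A quick sanity check (not part of the original script): at x == mean the exponent is 0,
# so calculate_probability(1.0, 1.0, 1.0) should return 1/sqrt(2*pi) ≈ 0.3989.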
# Calculating probabilities of predicting each class for given Test Data
def calculate_class_probabilities(summaries, row):
    # prior of each class = (rows of that class) / (total rows), taken from the stored counts
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
        # multiply the prior by the Gaussian likelihood of every feature value
        for i in range(len(class_summaries)):
            mu, sigma, _ = class_summaries[i]
            # float() lets this work for test rows that are still stored as strings
            probabilities[class_value] *= calculate_probability(float(row[i]), mu, sigma)
    return probabilities
probabilities = calculate_class_probabilities(summary, Training_Data[0])
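# probabilities now holds one unnormalised score per class for the first training row;
# the class with the largest score is the one the classifier would predict for it.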
# Predict the class for given Test Data
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label
# Naive Bayes Algorithm
def naive_bayes(train, test):
    summarize = summarize_by_class(train)
    predictions = list()
    for row in test:
        output = predict(summarize, row)
        predictions.append(output)
    return predictions
# The predictions are printed in their numeric encoding (1.0 = 'W', 0.0 = 'M')
print(naive_bayes(Training_Data, Test_Data), "--->", "[' W' ' W' ' W' ' W']")