Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions KNN self-learning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 1 20:31:05 2019

@author: z
"""
###KNN code self-leaerning(self-made KNN algorithm)
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random

def k_nearest_neighbours(data, predict, k=5):
if len(data) >= k:
warnings.warn('K is set to a value less than the total number of all voting groups!')
distances = [] ##Here we use Euclidean distances for convenience reasons
for group in data:
for features in data[group]:
euclidean_distance = np.linalg.norm(np.array(features)-np.array(predict))
distances.append([euclidean_distance, group])
votes = [i[1] for i in sorted(distances)[:k]]
vote_result = Counter(votes).most_common(1)[0][0]
confidence = Counter(votes).most_common(1)[0][1] / k

return vote_result, confidence

accuracies = []
for i in range(25):
df = pd.read_csv("breast-cancer-wisconsin.data.txt")
df.replace('?', -999999, inplace=True) ###try to make '?' as an outlier
df.drop(['id'], 1, inplace=True)
full_data = df.astype(float).values.tolist()
random.shuffle(full_data)

test_size = 0.2
train_set = {2:[], 4:[]}
test_set = {2:[], 4:[]}
train_data = full_data[:-int(test_size*len(full_data))]
test_data = full_data[-int(test_size)*len(full_data):]

for i in train_data:
train_set[i[-1]].append(i[:-1])
for i in test_data:
test_set[i[-1]].append(i[:-1])

correct = 0
total = 0

for group in test_set:
for data in test_set[group]:
vote,Confidence = k_nearest_neighbours(train_set, data, k=5)
if group == vote:
correct += 1
else:
total += 1

print('Accuracy of self-made KNN algorithm is:', correct/total)
accuracies.append(correct/total)

print(sum(accuracies)/len(accuracies))



40 changes: 40 additions & 0 deletions KNN using scikit-learn(traditional version).py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 1 21:44:33 2019

@author: z
"""

import numpy as np
from sklearn import preprocessing, neighbors
from sklearn.model_selection import train_test_split
import pandas as pd

accuracies = []
for i in range(25):
df = pd.read_csv("breast-cancer-wisconsin.data.txt")
df.replace('?', -999999, inplace=True) ###try to make '?' as an outlier
df.drop(['id'], 1, inplace=True)

X = np.array(df.drop(['class'], 1))
y = np.array(df['class'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = neighbors.KNeighborsClassifier()
clf.fit(X_train, y_train)

accuracy = clf.score(X_test, y_test)

###just a small example:
example = np.array([[4,2,1,1,1,2,3,2,1], [4,2,1,2,2,2,3,2,1]])
example_reshape = example.reshape(len(example), -1)
prediction = clf.predict(example_reshape)

accuracies.append(accuracy)

print(sum(accuracies)/len(accuracies))