-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathK means.py
72 lines (44 loc) · 1.55 KB
/
K means.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# coding: utf-8
# In[22]:
import pickle
import numpy
import sys
import matplotlib.pyplot as plt
sys.path.append(".../tools/")
from feature_format import featureFormat, targetFeatureSplit
# In[23]:
def Draw(pred, features, poi, mark_poi=False, name='image.png', f1_name='feature 1', f2_name='feature 2'):
    """Scatter-plot two-feature points coloured by cluster label, then save and show the figure.

    Args:
        pred: Sequence of integer cluster labels, one per point; selects the colour.
        features: Sequence of points; only element [0] (x) and [1] (y) of each point are plotted.
        poi: Sequence of truthy/falsy flags, one per point; read only when ``mark_poi`` is true.
        mark_poi: When true, draw a red star over every point whose flag is truthy.
        name: Path handed to ``plt.savefig``.
        f1_name: Label for the x axis.
        f2_name: Label for the y axis.

    ``pred``, ``features`` and ``poi`` are expected to have the same length;
    iteration stops at the shortest of the sequences being paired.
    """
    colors = ["b", "c", "k", "m", "g"]
    for label, point in zip(pred, features):
        # Cycle through the palette so labels >= len(colors) reuse a colour
        # instead of raising IndexError. Labels 0-4 (and negative labels)
        # map to the same colour as plain indexing did.
        plt.scatter(point[0], point[1], color=colors[label % len(colors)])
    if mark_poi:
        # Second pass so the stars are drawn on top of the cluster-coloured points.
        for point, is_poi in zip(features, poi):
            if is_poi:
                plt.scatter(point[0], point[1], color="r", marker="*")
    plt.xlabel(f1_name)
    plt.ylabel(f2_name)
    # Save before show(): the figure is written to disk first, then displayed.
    plt.savefig(name)
    plt.show()
# In[24]:
# Load the pickled dataset. The context manager closes the file handle as soon
# as loading finishes (the previous bare open() call never closed it).
# NOTE(review): pickle can execute arbitrary code while loading -- only point
# this at a dataset file you trust.
with open("C:/Users/Geekquad/final_project_dataset_modified_unix.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)
# Remove the "TOTAL" entry if it exists; the default argument keeps pop() from
# raising KeyError when the key is absent (assumes data_dict is a dict).
data_dict.pop("TOTAL", 0)
# In[25]:
# The two features that are plotted and clustered below.
feature_1, feature_2 = 'salary', "exercised_stock_options"

# Name of the label column; it goes first in the feature list.
poi = "poi"
features_list = [poi, feature_1, feature_2]

# Convert the dataset into the structure produced by the project helper
# featureFormat for the selected columns.
data = featureFormat(data_dict, features_list)

# NOTE: `poi` is rebound here -- from the column name above to the first value
# returned by targetFeatureSplit (presumably the per-row labels; confirm
# against the helper in tools/feature_format).
poi, finance_features = targetFeatureSplit(data)
# In[26]:
# Quick visual check of the points before clustering: one scatter call per
# two-value entry of finance_features, then display the figure.
for x_value, y_value in finance_features:
    plt.scatter(x_value, y_value)
plt.show()
# ## Clustering using K-means:
# In[27]:
from sklearn.cluster import KMeans

# Rebuild the label/feature split. The label column is spelled out as the
# literal "poi" because the name `poi` no longer holds the column name after
# the earlier targetFeatureSplit call rebound it.
features_list = ["poi", feature_1, feature_2]
data2 = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data2)

# Fit a two-cluster K-means model on the unscaled features and keep the
# cluster label assigned to each point.
clf = KMeans(n_clusters=2)
pred = clf.fit_predict(finance_features)

# Plot the clusters; mark_poi is left at its default, so `poi` is passed but
# no points are starred.
Draw(
    pred,
    finance_features,
    poi,
    name="clusters_before_scaling.pdf",
    f1_name=feature_1,
    f2_name=feature_2,
)