-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathknn_untuned_&_dataset_analysis.py
119 lines (85 loc) · 3.34 KB
/
knn_untuned_&_dataset_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# -*- coding: utf-8 -*-
"""ChicagoDataSet.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/19k4PwC6rHtiwLTwy9Btt1RnQQ4eL1JBy
## Code Setup
"""
#Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
#Import Graph Libraries
from matplotlib import style
from collections import Counter
#Import Machine Learning Methods
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
# Load the dataset.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is its replacement and keeps the same behaviour:
# malformed rows are skipped and a warning is emitted for each one.
df = pd.read_csv(
    'drive/My Drive/CAP5610/Group Project/Data/total_df.csv',
    on_bad_lines='warn',
)
print(df.columns.values)

# Remove the 'Unnamed: 0' column (presumably an index column saved by an
# earlier to_csv call -- confirm against the script that wrote the file).
df = df.drop(['Unnamed: 0'], axis=1)
print(df.columns.values)

# Features are every column except the target; the target is 'Arrest'.
X = df.drop(['Arrest'], axis=1)
Y = df['Arrest']

# value_counts is a method: it has to be called. Printing the bare attribute
# only shows the bound method's repr instead of the actual counts.
print(X.value_counts())
print(Y.value_counts())
# 5-fold stratified cross-validation.
# A fixed random_state makes the shuffled splits, and therefore the averaged
# scores printed below, reproducible from one run to the next.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold scores collected during cross-validation.
accuracy_scores_KNN = []
precision_scores_KNN = []
recall_scores_KNN = []
f1_scores_KNN = []

for train_index, test_index in folds.split(X, Y):
    # split() yields positional indices, hence .iloc rather than .loc.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # K-nearest neighbours with the library's default hyper-parameters.
    knn = KNeighborsClassifier()
    knn.fit(X_train, Y_train)
    predKNN = knn.predict(X_test)

    accuracy_scores_KNN.append(accuracy_score(Y_test, predKNN))
    precision_scores_KNN.append(precision_score(Y_test, predKNN))
    recall_scores_KNN.append(recall_score(Y_test, predKNN))
    f1_scores_KNN.append(f1_score(Y_test, predKNN))

# Print the scores averaged over all folds.
print('---- Average Scores ----')
print('Accuracy Score: ', np.average(accuracy_scores_KNN))
print('Precision Score: ', np.average(precision_scores_KNN))
print('Recall Score: ', np.average(recall_scores_KNN))
print('F1 Score: ', np.average(f1_scores_KNN))
print()
"""### Just Messing around with the locations and arrests"""
# Column-wise extremes of the frame; the coordinate extremes are read off them.
max_vals = df.max()
min_vals = df.min()
max_lat, min_lat = max_vals['Latitude'], min_vals['Latitude']
max_log, min_log = max_vals['Longitude'], min_vals['Longitude']

# The plot bounds are hard-coded instead of using the extremes above, because
# a few records have locations that are far outside the rest of the data.
# The first two values are used as the x (longitude) limits and the last two
# as the y (latitude) limits.
BBox = (-87.9846, -87.5050, 41.6165, 42.0538)
ch_map = plt.imread(
    'drive/My Drive/CAP5610/Group Project/Data/Chicago_map.png'
)

fig, ax = plt.subplots(figsize=(10, 9))

# Blue where Arrest == 1, red for every other row.
colors = np.where(df["Arrest"] == 1, 'b', 'r')
ax.scatter(
    df['Longitude'],
    df['Latitude'],
    zorder=1,
    alpha=0.2,
    c=colors,
    s=5,
)
ax.set_title('Plotting Spatial Data on Chicago Crime Map')
ax.set_xlim(*BBox[:2])
ax.set_ylim(*BBox[2:])

# The map image is drawn underneath the points (zorder 0 vs. 1).
ax.imshow(ch_map, zorder=0, extent=BBox, aspect='equal')
# One figure per 'Year' value 0..15; the titles run from 2001 to 2016.
for year_code in range(16):
    year_rows = df[df['Year'] == year_code]

    fig, ax = plt.subplots(figsize=(10, 9))

    # Blue where Arrest == 1, red for every other row.
    colors = np.where(year_rows["Arrest"] == 1, 'b', 'r')
    ax.scatter(
        year_rows['Longitude'],
        year_rows['Latitude'],
        zorder=1,
        alpha=0.2,
        c=colors,
        s=5,
    )

    # Code 0 is labelled "2001", code 15 is labelled "2016"; zero-padding to
    # two digits gives the same text as the separate "200"/"20" prefixes.
    year_label = f"20{year_code + 1:02d}"
    ax.set_title('Plotting Spatial Data on Chicago Crime Map Year - ' + year_label)
    ax.set_xlim(*BBox[:2])
    ax.set_ylim(*BBox[2:])
    ax.imshow(ch_map, zorder=0, extent=BBox, aspect='equal')