-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdeliverable4_savemodel.py
142 lines (109 loc) · 5.35 KB
/
deliverable4_savemodel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding: utf-8 -*-
"""Deliverable4_SaveModel.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1vHFr7dmMmQNrITDBdlhL0TUvRFN1fmVk
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import random
import string
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pickle
#READ IN EACH YEAR'S FUNDAMENTALS CSV DIRECTLY FROM THE PROJECT'S GITHUB REPO
datapath_2014 = "https://raw.githubusercontent.com/omarw99/MAIS202Project_StockPredictor/master/Dataset/2014_Financial_Data.csv"
datapath_2015 = "https://raw.githubusercontent.com/omarw99/MAIS202Project_StockPredictor/master/Dataset/2015_Financial_Data.csv"
datapath_2016 = "https://raw.githubusercontent.com/omarw99/MAIS202Project_StockPredictor/master/Dataset/2016_Financial_Data.csv"
datapath_2017 = "https://raw.githubusercontent.com/omarw99/MAIS202Project_StockPredictor/master/Dataset/2017_Financial_Data.csv"
datapath_2018 = "https://raw.githubusercontent.com/omarw99/MAIS202Project_StockPredictor/master/Dataset/2018_Financial_Data.csv"
df_2014, df_2015, df_2016, df_2017, df_2018 = (
    pd.read_csv(path)
    for path in (datapath_2014, datapath_2015, datapath_2016, datapath_2017, datapath_2018)
)
#EXTRACT EACH YEAR'S NEXT-YEAR PRICE-CHANGE COLUMN UNDER ONE COMMON LABEL
#(the label must match across years so the frames can be stacked later)
returns_2014 = df_2014['2015 PRICE VAR [%]'].rename('Next Year Stock Return')
returns_2015 = df_2015['2016 PRICE VAR [%]'].rename('Next Year Stock Return')
returns_2016 = df_2016['2017 PRICE VAR [%]'].rename('Next Year Stock Return')
returns_2017 = df_2017['2018 PRICE VAR [%]'].rename('Next Year Stock Return')
returns_2018 = df_2018['2019 PRICE VAR [%]'].rename('Next Year Stock Return')
#KEEP ONLY THE RELEVANT FINANCIAL-RATIO COLUMNS, THEN RE-ATTACH THE RETURN SERIES
columnsToKeep = ['Revenue Growth', 'EPS', 'Dividend per Share', 'EBITDA Margin', 'Net Profit Margin', 'priceToSalesRatio', 'priceToFreeCashFlowsRatio',
                 'dividendYield', 'grossProfitMargin', 'returnOnEquity', 'currentRatio', 'quickRatio', 'cashRatio', 'debtRatio', 'debtEquityRatio',
                 'interestCoverage', 'PE ratio', 'Receivables Turnover', 'Payables Turnover', 'Inventory Turnover', 'Sector']
#Selection and concat are fused: each year's frame becomes ratios + next-year return.
df_2014 = pd.concat([df_2014[columnsToKeep], returns_2014], axis = 1)
df_2015 = pd.concat([df_2015[columnsToKeep], returns_2015], axis = 1)
df_2016 = pd.concat([df_2016[columnsToKeep], returns_2016], axis = 1)
df_2017 = pd.concat([df_2017[columnsToKeep], returns_2017], axis = 1)
df_2018 = pd.concat([df_2018[columnsToKeep], returns_2018], axis = 1)
#STACK ALL YEAR-SPECIFIC FRAMES, DROP INCOMPLETE ROWS, AND REINDEX FROM 0
individual_dataframes = [df_2014, df_2015, df_2016, df_2017, df_2018]
initial_combined_df = (
    pd.concat(individual_dataframes)
      .dropna()
      .reset_index(drop = True)
)
#ONE HOT ENCODE THE SECTOR COLUMN (the only categorical feature kept)
combined_df = pd.get_dummies(initial_combined_df, columns = ['Sector'])
def remove_outlier(df_in, col_name):
    """Return the rows of df_in whose col_name value lies strictly inside the
    Tukey fences, i.e. within 1.5 interquartile ranges of the quartiles."""
    lower_q = df_in[col_name].quantile(0.25)
    upper_q = df_in[col_name].quantile(0.75)
    spread = upper_q - lower_q  # interquartile range
    low_fence = lower_q - 1.5 * spread
    high_fence = upper_q + 1.5 * spread
    # Strict inequalities: values sitting exactly on a fence are dropped too.
    inside = (df_in[col_name] > low_fence) & (df_in[col_name] < high_fence)
    return df_in.loc[inside]
#DROP OUTLIER ROWS FOR EVERY COLUMN WHOSE STANDARD DEVIATION EXCEEDS 1000
#Index describe() by label, not position: .iloc[2] silently breaks if the
#summary layout changes, while .loc['std'] always picks the std row.
column_stds = combined_df.describe().loc['std']
sortedSTD = column_stds.sort_values(ascending=False)
#Count of high-variance columns (vectorized; replaces a manual counting loop
#whose loop variable shadowed the std series it iterated over).
x = int((sortedSTD > 1000).sum())
colNamesWithHighestSTD = sortedSTD.index
#Apply the IQR filter once per high-variance column; each pass shrinks the frame.
for i in range(x):
    combined_df = remove_outlier(combined_df, colNamesWithHighestSTD[i])
combined_df = combined_df.reset_index(drop = True)
#SPLIT THE COMBINED_DF INTO TRAIN AND TEST SETS, THEN RESET THEIR INDICES
#random_state=0 makes the 80/20 split reproducible and consistent with the
#model's random_state=0 seed used elsewhere in this script.
train, test = train_test_split(combined_df, test_size=0.2, random_state=0)
train.reset_index(inplace = True, drop = True)
test.reset_index(inplace = True, drop = True)
#CREATE THE X_TRAIN, X_TEST, Y_TRAIN, Y_TEST ARRAYS
#Target is the next-year return; features are every remaining column.
y_train = np.asarray(train['Next Year Stock Return'])
X_train = np.asarray(train.drop(columns = ['Next Year Stock Return']))
y_test = np.asarray(test['Next Year Stock Return'])
X_test = np.asarray(test.drop(columns = ['Next Year Stock Return']))
"""RANDOM FOREST REGRESSION MODEL"""
model = RandomForestRegressor(random_state = 0)
model.fit(X_train, y_train)
y_train_predict = model.predict(X_train)
plt.scatter(y_train, y_train_predict)
plt.show()
#MSE OF THE TRAINING SET
mse_train = np.mean(np.square(np.subtract(y_train, y_train_predict)))
print("Training set Mean Squared Error: {}".format(mse_train))
y_test_predict = model.predict(X_test)
plt.scatter(y_test, y_test_predict)
plt.show()
#MSE OF THE TESTING SET
mse_test = np.mean(np.square(np.subtract(y_test, y_test_predict)))
print("Testing set Mean Squared Error: {}".format(mse_test))
"""SAVING MINI DF AND MODEL"""
#SAVING THE FIRST 5 ROWS OF COMBINED_DF TO ACCESS LATER WHEN VECTORIZING
dfToSave = combined_df.head()
dfToSave.to_csv('df.csv', index=False)
#SAVE THE MODEL TO DISK
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))