/ LogisticRegressionJaneStreet.q
\c 100 100
\cd C:\q\w32\
/ Import Python libraries
\l p.q
/import several ml libraries
\l quantQ\lib\quantQjupyterq.q
\l mlnotebooks\utils\graphics.q
\l automl\automl.q
\l ml\ml.q
\l mlnotebooks\utils\graphics.q
\l mlnotebooks\utils\util.q
/Fun Q ml library
\l funq\funqJQ.q
/graphing
\l embedPy\examples\importmatplotlib.q
plt:.matplotlib.pyplot[]
//A True/false table is given for our 130 features
features:("SSSSSSSSSSSSSSSSSSSSSSSSSSSSSS";enlist",") 0: `:C:/MLProjects/JaneStreetMarketPrediction/features.csv
features:(select feature from features) ,'(flip 1_flip features = `TRUE)
`feature xkey `features
features:"f"$features
show 10#features
//1 for true, 0 for false
//load 500 days of trade data
t:("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF";enlist",") 0: `:C:/MLProjects/JaneStreetMarketPrediction/train.csv
//key the table based on date
`date xkey `t
/apply sorted attribute to table t (reassign, otherwise the attribute is discarded)
t:`s#t;
count select from t where resp_4 >0, resp<0
count select from t where resp_4 >0, resp>0
count select from t where resp_4 <0, resp<0
count select from t where resp_4 <0, resp>0
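//Added check (not in the original script): the four counts above summarized as the
//fraction of rows where resp and resp_4 share the same sign
exec avg (resp>0)=resp_4>0 from t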
/
There is a high likelihood that resp and resp_4 share the same sign.
Rule 1: Optimize for resp_4
Rule 2: Don't take trades with weight above X
Rule 3: End model uses online learning
Rule 4: Be prepared for regime change (identify the change itself, warning signs, and effects)
Rule 5: Feature-engineer lagging indicators (see the sketch after this comment block)
\
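//Sketch for Rule 5 (illustrative only, not used by the model below; the new column
//names feature_0_lag1 and feature_0_ma5 are not part of the dataset): a 1-row lag
//and a 5-row moving average of feature_0, computed within each trading day
show 5#update feature_0_lag1:prev feature_0, feature_0_ma5:mavg[5;feature_0] by date from 0!t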
update sumresp:sums resp_1 from `t
plt.xlabel"Date";
plt.ylabel"Price change";
plt.title"Asset Price (cumulative resp_1)";
plt.grid 1b;
plt.scatter[exec date from t; exec sumresp from t]  //x: date, y: cumulative resp_1
plt.show[];
//we see the asset appreciate over time, making most trades profitable
//we will likely see very similar trading days. We will try to cluster trading days together later on
delete sumresp from `t;
//We analyze the data on a per day basis.
day0t:select from t where date=0
day0t:0!day0t //unkey
delete date, weight, ts_id from `day0t;
update trade:0 from `day0t;
update trade: 1 from `day0t where resp>0;
day0t:`trade xcols day0t
//Partition data into training and testing (3:1); til keeps row order, so there is no time series data leakage
d:.ut.part[`train`test!3 1;til] "f"$day0t
//fill nulls
d.train:0f^d.train
d.test:0f^d.test
delete resp_1, resp_2, resp_3, resp_4, resp from `d.train
delete resp_1, resp_2, resp_3, resp_4, resp from `d.test
//Split each flipped table into Y (the trade label, first column) and X (the remaining columns as a feature matrix)
y:first get first `Y`X set' 0 1 cut value flip d`train
yt:first get first `Yt`Xt set' 0 1 cut value flip d`test
//Standardize features: fit z-score parameters on the training set, apply the same transform to the test set
zsf:.ml.zscoref each X
X:zsf @' X
Xt:zsf @' Xt
//unregularized
show THETA:enlist theta:(1+count X)#0f
THETA:1#.fmincg.fmincg[2000;.ml.logcostgrad[();Y;X];THETA 0]
avg yt=first "i"$.ml.plog[Xt] THETA
//We will now regularize
//f[n;costgrad] trains for n fmincg iterations and returns the (unregularized) test-set log cost
f:.ml.logcost[();Yt;Xt]1#.fmincg.fmincg[;;THETA 0]::
l2:.ut.sseq[.05;0;.55]
//Test-set cost for each candidate l2 penalty
e:(f[1000] .ml.logcostgrad[;Y;X] .ml.l2@) each l2
//Find optimal l2 regularization
.ml.imin l2!e
//0.55 is the optimal l2 regularization parameter
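//Note that 0.55 is the last value in the grid searched above; a wider sweep could be
//checked the same way (illustrative only, l22 and e2 are new names)
e2:(f[1000] .ml.logcostgrad[;Y;X] .ml.l2@) each l22:.ut.sseq[.05;0;1.5]
.ml.imin l22!e2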
THETA:1#.fmincg.fmincg[1000;.ml.logcostgrad[.ml.l2[0.55];Y;X]; THETA 0]
.ut.rnd[0.01] p0:first .ml.plog[Xt] THETA
"i"$p0
avg yt="i"$p0
// 53.75% accuracy on test data is not good.
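//Added context (not part of the original run): break the test predictions down by
//(actual;predicted) pair rather than a single accuracy number
show count each group flip (yt;"f"$p0>0.5)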
testTable0:1397#t
update test0:p0>0.5 from `testTable0
select avg resp from testTable0 where test0=1
select avg resp from testTable0
select avg resp from testTable0 where test0=0
(exec avg resp from testTable0 where test0=0)%(exec avg resp from testTable0)
//A 270% increase in return over random selection is great, though we have to reverse the intent of the algo (take the trades it labels 0)
//Instead of predicting trades using features, we will use the change in features
delta0t:day0t - prev day0t
delta0:0f^delta0t
update trade:(exec trade from day0t) from `delta0t  //restore the original trade labels, which the row differencing destroyed
delta0t:1_delta0t; //drop first row
delta0:.ut.part[`train`test!3 1;til] "f"$delta0t
delete resp_1, resp_2, resp_3, resp_4, resp from `delta0.train;
delete resp_1, resp_2, resp_3, resp_4, resp from `delta0.test;
delta0.train:0f^delta0.train
delta0.test:0f^delta0.test
y:first get first `Y`X set' 0 1 cut value flip delta0.train
yt:first get first `Yt`Xt set' 0 1 cut value flip delta0.test
zsf:.ml.zscoref each X
X:zsf @' X
Xt:zsf @' Xt
show THETA:enlist theta:(1+count X)#0f
f:.ml.logcost[();Yt;Xt]1#.fmincg.fmincg[;;THETA 0]::
l2:.ut.sseq[.05;0;.55]
e:(f[1000] .ml.logcostgrad[;Y;X] .ml.l2@) each l2
//Find optimal l2 regularization
show l2reg:.ml.imin l2!e
//0.55 is still the optimal l2 regularization parameter
THETA:1#.fmincg.fmincg[1000;.ml.logcostgrad[.ml.l2[l2reg];Y;X]; THETA 0]
.ut.rnd[0.01] p:first .ml.plog[Xt] THETA
"i"$p
avg yt="i"$p
//60% accuracy is not a bad score as a trade selector
//We can still see whether the predicted trades have a better average resp than the overall population
p:0,p //prepend a 0 for the first row dropped when differencing, so p aligns with the raw rows
sum p>.5  //number of predicted trades
testTable:1398#t
update test:p>0.5 from `testTable
select avg resp from testTable where test=1
select avg resp from testTable
select avg resp from testTable where test=0
(exec avg resp from testTable where test=0)%(exec avg resp from testTable)
//Interestingly, when we invert the intent of the algo we do quite well, 4x better in fact.
//Selection via feature deltas is significantly better than using the raw features
//Below is the best possible score (taking only trades with resp>0)
select avg resp from testTable where resp>0
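//Added comparison (not in the original script): the strategy's average resp as a fraction of this best-possible average
(exec avg resp from testTable where test=0)%(exec avg resp from testTable where resp>0)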
//These are just the THETAs for a single day. We make the assumption that each day has different trading dynamics
//We now must test our THETAs over a period of dates
//Perhaps there are similar trading days which we can cluster
//create day1t table
day1t:select from t where date=1
day1t:0!day1t //unkey
delete date, weight, ts_id from `day1t;
update trade:0 from `day1t;
update trade: 1 from `day1t where resp>0;
day1t:`trade xcols day1t
//create delta1t table
delta1t:day1t - prev day1t
update trade:(exec trade from day1t) from `delta1t
delta1t:1_delta1t; //drop first row
delta1:0f^delta1t
delete resp_1, resp_2, resp_3, resp_4, resp from `delta1;
delta1:0f^delta1
y:first get first `Y`X set' 0 1 cut value flip delta1
zsf:.ml.zscoref each X
X:zsf @' X
.ut.rnd[0.01] p1:first .ml.plog[X] THETA
"i"$p1
avg y="i"$p1
p1:0,p1 //prepend a 0 for the dropped first row so p1 aligns with the day's rows
count p1
count select from t where date=1
testTable1:select from t where date =1
update test:p1>0.5 from `testTable1
select avg resp from testTable1 where test=1
select avg resp from testTable1
select avg resp from testTable1 where test=0
abs((exec avg resp from testTable1 where test=0)%(exec avg resp from testTable1))
//Even on a different day, we still have positive results (108% increase), though the THETAs likely would need to be adjusted.
//We can either find new THETAs across a larger set of dates and apply to the table, or find new THETAs every so often
//It is unlikely there is some static set of THETAs which will give us what we want, but it doesn't hurt to try
//We will start out by applying the current THETAs to the first 30 days to see how alpha deteriorates over time
/create table
t30:select from t where date<31
t30:0!t30 //unkey
delete date, weight, ts_id from `t30;
update trade:0 from `t30;
update trade: 1 from `t30 where resp>0;
t30:`trade xcols t30
/create delta table
delta30t:t30 - prev t30
update trade:(exec trade from t30) from `delta30t
delta30t:1_delta30t; //drop first row
delta30:0f^delta30t
delete resp_1, resp_2, resp_3, resp_4, resp from `delta30;
delta30:0f^delta30
y:first get first `Y`X set' 0 1 cut value flip delta30
zsf:.ml.zscoref each X
X:zsf @' X
.ut.rnd[0.01] p30:first .ml.plog[X] THETA
"i"$p30
avg y="i"$p30
p30:0,p30 //prepend a 0 for the dropped first row so p30 aligns with the raw rows
count p30
count select from t30
update test:p30>0.5 from `t30
update date:(count t30)#(exec date from t) from `t30; //restore the date column for the per-day breakdown below
select avg resp by date from t30 where test=0
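//Added visualization sketch (reuses the matplotlib handle from above; alpha is a new
//illustrative name): plot the per-date average resp of the selected trades to see how the edge decays
alpha:0!select avg resp by date from t30 where test=0
plt.xlabel"Date";
plt.ylabel"Avg resp of selected trades";
plt.title"Alpha decay over the first 30 days";
plt.scatter[exec date from alpha; exec resp from alpha]
plt.show[];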