
Commit

Add files via upload

raidastauras authored Jan 19, 2018
1 parent 7fd903c commit dbe546a
Showing 2 changed files with 229 additions and 19 deletions.
130 changes: 121 additions & 9 deletions helpers/get_features.py
@@ -5,10 +5,22 @@
https://cryptotrader.org/talib
"""

import pandas as pd
import numpy as np
from datetime import datetime
from talib.abstract import *
import talib
from helpers.utils import extract_timeseries_from_oanda_data
import matplotlib.pyplot as plt


def scale_data(input_data_no_dummies, split):
    """Scale non-dummy data given a (train, test, cv) split."""
    from sklearn.preprocessing import MinMaxScaler
    train_split = int(len(input_data_no_dummies) * split[0])
    scaler = MinMaxScaler()
    scaler.fit(input_data_no_dummies[:train_split])  # fit on the train split only to avoid look-ahead bias
    return scaler.transform(input_data_no_dummies)
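
# Usage sketch (illustrative only, not part of this commit; reuses the data
# file referenced further below):
# oanda_data = np.load('data\\AUD_JPY_H1.npy')
# all_indicators, all_dummies = get_features(oanda_data)
# indicators_scaled = scale_data(all_indicators, split=(0.5, 0.3, 0.2))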


def prep_data_for_feature_gen(data):
@@ -109,17 +121,116 @@ def get_features(oanda_data):

all_dummies = np.array([ht_trend, mrkt_london, mrkt_ny, mrkt_sydney, mrkt_tokyo])

# min max parameters for scaling
# _, length = all_indicators.shape
# length = int(length * 0.7)
#
# min_max_parameters = np.array([np.nanmax(all_indicators[:, :length], axis=1),
# np.nanmin(all_indicators[:, :length], axis=1)])

return all_indicators.T, all_dummies.T # transpose to get (data_points, features)


# min-max scaling params (for now created manually; better to use scikit-learn's MinMaxScaler)
def get_features_v2(oanda_data, time_periods, return_numpy):
"""Returns all (mostly) indicators from ta-lib library for given time periods"""

# load primary data
inputs = prep_data_for_feature_gen(oanda_data)

# get name of all the functions
function_groups = ['Cycle Indicators',
'Momentum Indicators',
'Overlap Studies',
'Volume Indicators',
'Volatility Indicators',
'Statistic Functions']
function_list = [talib.get_function_groups()[group] for group in function_groups]
function_list = [item for sublist in function_list for item in sublist] # flatten the list
function_list.remove('MAVP')

# price and volume
price, volume = extract_timeseries_from_oanda_data(oanda_data, ['closeMid', 'volume'])
price_change = np.array([float(i) / float(j) - 1 for i, j in zip(price[1:], price)])
volume_change = np.array([float(i) / float(j) - 1 for i, j in zip(volume[1:], volume)])
price_change = np.concatenate([[0], price_change], axis=0)
volume_change = np.concatenate([[0], volume_change], axis=0)

# get all indicators
df_indicators = pd.DataFrame()
df_indicators['price'] = price.ravel()
df_indicators['price_delta'] = price_change
df_indicators['volume_change'] = volume_change
for func in function_list:
if 'timeperiod' in getattr(talib.abstract, func).info['parameters']:
for time_period in time_periods:
indicator = getattr(talib.abstract, func)(inputs, timeperiod=time_period)
if any(isinstance(item, np.ndarray) for item in indicator): # if indicator returns > 1 time-series
indicator_id = 0
for x in indicator:
df_indicators[func + '_' + str(indicator_id) + '_tp_' + str(time_period)] = x
indicator_id += 1
else: # if indicator returns 1 time-series
df_indicators[func + '_tp_' + str(time_period)] = indicator
else:
indicator = getattr(talib.abstract, func)(inputs)
if any(isinstance(item, np.ndarray) for item in indicator):
indicator_id = 0
for x in indicator:
df_indicators[func + str(indicator_id)] = x
indicator_id += 1
else:
df_indicators[func] = indicator

# manual handling of features
df_indicators['AD'] = df_indicators['AD'].pct_change()
df_indicators['OBV'] = df_indicators['OBV'].pct_change()
    df_indicators['HT_DCPERIOD'] = (df_indicators['HT_DCPERIOD'] > df_indicators['HT_DCPERIOD'].rolling(50).mean()).astype(float)
    df_indicators['HT_DCPHASE'] = (df_indicators['HT_DCPHASE'] > df_indicators['HT_DCPHASE'].rolling(10).mean()).astype(float)
    df_indicators['ADX_tp_10'] = (df_indicators['ADX_tp_10'] > df_indicators['ADX_tp_10'].rolling(10).mean()).astype(float)
    df_indicators['MACD0'] = df_indicators['MACD0'] - df_indicators['MACD1']
    df_indicators['MINUS_DI_tp_10'] = (df_indicators['MINUS_DI_tp_10'] > df_indicators['MINUS_DI_tp_10'].rolling(20).mean()).astype(float)
    df_indicators['RSI_tp_10'] = (df_indicators['RSI_tp_10'] > df_indicators['RSI_tp_10'].rolling(15).mean()).astype(float)
    df_indicators['ULTOSC'] = (df_indicators['ULTOSC'] > df_indicators['ULTOSC'].rolling(15).mean()).astype(float)
df_indicators['BBANDS_0_tp_10'] = df_indicators['BBANDS_0_tp_10'] - df_indicators['price']
df_indicators['BBANDS_1_tp_10'] = df_indicators['BBANDS_1_tp_10'] - df_indicators['price']
df_indicators['BBANDS_2_tp_10'] = df_indicators['BBANDS_2_tp_10'] - df_indicators['price']
df_indicators['DEMA_tp_10'] = df_indicators['DEMA_tp_10'] - df_indicators['price']
df_indicators['EMA_tp_10'] = df_indicators['EMA_tp_10'] - df_indicators['price']
df_indicators['HT_TRENDLINE'] = df_indicators['HT_TRENDLINE'] - df_indicators['price']
df_indicators['KAMA_tp_10'] = df_indicators['KAMA_tp_10'] - df_indicators['price']
df_indicators['MAMA0'] = df_indicators['MAMA0'] - df_indicators['price']
df_indicators['MAMA1'] = df_indicators['MAMA1'] - df_indicators['price']
df_indicators['MIDPOINT_tp_10'] = df_indicators['MIDPOINT_tp_10'] - df_indicators['price']
df_indicators['MIDPRICE_tp_10'] = df_indicators['MIDPRICE_tp_10'] - df_indicators['price']
df_indicators['SMA_tp_10'] = df_indicators['SMA_tp_10'] - df_indicators['price']
df_indicators['T3_tp_10'] = df_indicators['T3_tp_10'] - df_indicators['price']
df_indicators['TEMA_tp_10'] = df_indicators['TEMA_tp_10'] - df_indicators['price']
df_indicators['TRIMA_tp_10'] = df_indicators['TRIMA_tp_10'] - df_indicators['price']
df_indicators['WMA_tp_10'] = df_indicators['WMA_tp_10'] - df_indicators['price']
df_indicators['SAR'] = df_indicators['SAR'] - df_indicators['price']
df_indicators['LINEARREG_tp_10'] = df_indicators['LINEARREG_tp_10'] - df_indicators['price']
df_indicators['LINEARREG_INTERCEPT_tp_10'] = df_indicators['LINEARREG_INTERCEPT_tp_10'] - df_indicators['price']
df_indicators['TSF_tp_10'] = df_indicators['TSF_tp_10'] - df_indicators['price']

# markets dummies
time = np.array([datetime.strptime(x['time'], '%Y-%m-%dT%H:%M:%S.000000Z') for x in oanda_data])
df_indicators['mrkt_london'] = np.array([3 <= x.hour <= 11 for x in time]).astype(int)
df_indicators['mrkt_ny'] = np.array([8 <= x.hour <= 16 for x in time]).astype(int)
    df_indicators['mrkt_sydney'] = np.array([x.hour >= 17 or x.hour <= 1 for x in time]).astype(int)
    df_indicators['mrkt_tokyo'] = np.array([x.hour >= 19 or x.hour <= 3 for x in time]).astype(int)

print('Features shape: {}'.format(df_indicators.shape))

    return df_indicators.values if return_numpy else df_indicators
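
# Usage sketch (illustrative; time_periods must include 10 because the manual
# feature handling above references the *_tp_10 columns):
# oanda_data = np.load('data\\AUD_JPY_H1.npy')[-50000:]
# features_df = get_features_v2(oanda_data, time_periods=[10], return_numpy=False)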


# # min-max scaling params (for now created manually; better to use scikit-learn's MinMaxScaler)
# import pandas as pd
# oanda_data = np.load('data\\AUD_JPY_H1.npy')[-50000:]
# all_indicators, all_dummies = get_features(oanda_data)
# length = int(len(all_indicators) * 0.5)
# all_indicators = pd.DataFrame(all_indicators[:length, ])
# all_indicators_pd = all_indicators[all_indicators.apply(lambda x: np.abs(x - x.median()) / x.std() < 3).all(axis=1)]
# all_indicators_np = all_indicators_pd.values
#
# min_max_parameters = np.array([np.nanmax(all_indicators_np[:, :length].T, axis=1),
# np.nanmin(all_indicators_np[:, :length].T, axis=1)])

# EUR/USD min-max scaling parameters
min_max_scaling = np.array([[1.86410584e-03, 2.01841085e+00, 1.19412800e+00,
1.19447352e+00, 1.19295244e+00, 2.70961491e-03,
1.32700000e-03, 4.05070743e-03, 9.86577181e-01,
@@ -138,3 +249,4 @@ def get_features(oanda_data):
-2.96059473e-15, -1.71810219e+03, -4.40536468e-01,
-9.46300629e-01, 1.65643780e+02, 1.18376500e+00,
6.95891066e-02]])

118 changes: 108 additions & 10 deletions helpers/utils.py
@@ -3,6 +3,8 @@
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


def remove_nan_rows(items):
@@ -27,29 +29,37 @@ def extract_timeseries_from_oanda_data(oanda_data, items):
return output if len(output) > 1 else output[0]


def price_to_binary_target(oanda_data, delta=0.0001):
"""Quick and dirty way of constructing output where:
[1, 0, 0] rise in price
[0, 1, 0] price drop
[0, 0, 1] no change (flat)
"""
price = extract_timeseries_from_oanda_data(oanda_data, ['closeMid'])
price_change = np.array([x1 / x2 - 1 for x1, x2 in zip(price[1:], price)])
price_change = np.concatenate([[[0]], price_change])
binary_price = np.zeros(shape=(len(price), 3))
binary_price[-1] = np.nan
    for data_point in range(len(price_change) - 1):
        if price_change[data_point+1] > 0 and price_change[data_point+1] - delta > 0:  # price will rise
            column = 0
        elif price_change[data_point+1] < 0 and price_change[data_point+1] + delta < 0:  # price will drop
            column = 1
        else:  # price will not change
            column = 2
        binary_price[data_point][column] = 1

# print target label distribution
data_points = len(binary_price[:-1])
print('Rise: {:.2f}, Drop: {:.2f}, Flat: {:.2f}'.format(np.sum(binary_price[:-1, 0]) / data_points,
np.sum(binary_price[:-1, 1]) / data_points,
np.sum(binary_price[:-1, 2]) / data_points))

    # print the first rows to check that no look-ahead bias is introduced
print(pd.DataFrame(np.concatenate([np.around(price, 5),
np.around(price_change, 4),
binary_price.astype(int)], axis=1)[:10, :]))

return binary_price
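
# Worked example (illustrative): with delta=0.0001, a next-period price change
# of +0.0005 clears the threshold, so the current row is labelled [1, 0, 0]
# (rise); -0.0005 gives [0, 1, 0] (drop); +0.00005 falls within +/-delta and
# gives [0, 0, 1] (flat). The last row stays np.nan since its next change is
# unknown.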


@@ -82,14 +92,19 @@ def get_signal(softmax_output):
return signal


def portfolio_value(price_change, signal, trans_costs=0.000):
    """Return portfolio value over time.
    IMPORTANT!
    signal comes from the last fully formed candle;
    price_change is the percentage price change between the last fully formed
    candle and the previous period"""
    signal_percent = signal[:-1] * price_change[1:]
    transaction_costs = np.zeros_like(signal_percent)
    for i in range(len(signal) - 1):
        # charge costs only when the signal switches into a non-zero position
        transaction_costs[i] = trans_costs if signal[i] != signal[i+1] and signal[i+1] != 0 else 0
    value = np.cumsum(signal_percent - transaction_costs) + 1
    return value
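
# Worked example (illustrative, assuming a long/flat/short signal in {1, 0, -1}):
# price_change = np.array([[0.0], [0.01], [-0.02]])
# signal = np.array([[0], [1], [-1]])
# signal_percent -> [[0.0], [-0.02]]; costs are charged at both steps because
# the signal switches into a non-zero position each time, and value is the
# cumulative sum of net returns plus 1.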


@@ -122,4 +137,87 @@ def get_cnn_input_output(x, y, time_steps=12):
for i in range(data_points - time_steps):
x_batch_reshaped.append(x[i:i+time_steps, :])
x_batch_reshaped = np.transpose(np.array([x_batch_reshaped]), axes=(1, 3, 2, 0))
    return np.array(x_batch_reshaped), y[time_steps:]
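
# Shape sketch: with x of shape (data_points, features) and time_steps=12,
# each sample becomes a (features, 12, 1) window, the returned x has shape
# (data_points - 12, features, 12, 1), and y is trimmed to y[12:] so each
# window is paired with the label that follows it.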


def plot_roc_curve(y_pred_prob, y_target):
"""Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html"""
from sklearn.metrics import roc_curve, auc
# roc curve
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(3):
fpr[i], tpr[i], _ = roc_curve(y_target[:, i], y_pred_prob[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_target.ravel(), y_pred_prob.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

return roc_auc["micro"], fpr["micro"], tpr["micro"]


def y_trans(y, to_1d):
    """Transform y data from [1, -1] to [[1, 0, 0], [0, 1, 0]] and vice versa"""
    if to_1d:
        y_flat = y.argmax(axis=1)
        mapping = {0: 1, 1: -1, 2: 0}  # renamed to avoid shadowing the built-in 'map'
        y_new = np.copy(y_flat)
        for k, v in mapping.items():
            y_new[y_flat == k] = v
        return y_new.reshape(-1, 1)
    else:
        y_new = np.zeros(shape=(len(y), 3))
        for i in range(len(y_new)):
            index = 0 if y[i] == 1 else 1 if y[i] == -1 else 2
            y_new[i, index] = 1
        return y_new
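
# Example: y_trans(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), to_1d=True)
# returns [[1], [-1], [0]]; feeding that back with to_1d=False recovers the
# one-hot rows.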


def min_max_scale(input_train, input_test, input_cv, std_dev_threshold=2.1):
    from sklearn.preprocessing import MinMaxScaler

    # get rid of outliers before fitting the scaler
    input_train_df = pd.DataFrame(input_train)
    input_train_no_outliers = input_train_df[input_train_df.apply(
        lambda x: np.abs(x - x.median()) / x.std() < std_dev_threshold).all(axis=1)].values

    # fit on (outlier-free) training data only; transform every split with the
    # same scaler to avoid look-ahead bias
    scaler = MinMaxScaler()
    scaler.fit(input_train_no_outliers)

    input_train_scaled = scaler.transform(input_train)
    input_test_scaled = scaler.transform(input_test)
    input_cv_scaled = scaler.transform(input_cv)

    return input_train_scaled, input_test_scaled, input_cv_scaled
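
# Usage sketch (illustrative, assuming a chronological 60/20/20 split so the
# scaler never sees future data):
# n = len(features)
# train, test, cv = features[:int(n*0.6)], features[int(n*0.6):int(n*0.8)], features[int(n*0.8):]
# train_s, test_s, cv_s = min_max_scale(train, test, cv)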


def get_pca(input_train, input_test, input_cv, threshold=0.01):
    from sklearn.decomposition import PCA
    # fit PCA on training data only, then apply the same projection everywhere
    pca = PCA()
    pca.fit(input_train)
    plt.plot(pca.explained_variance_ratio_)
    nr_features = np.sum(pca.explained_variance_ratio_ > threshold)

    input_train_pca = pca.transform(input_train)
    input_test_pca = pca.transform(input_test)
    input_cv_pca = pca.transform(input_cv)

    input_train_pca = input_train_pca[:, :nr_features]
    input_test_pca = input_test_pca[:, :nr_features]
    input_cv_pca = input_cv_pca[:, :nr_features]

    return input_train_pca, input_test_pca, input_cv_pca


def get_poloynomials(input_train, input_test, input_cv, degree=2):
    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(degree=degree)
    poly.fit(input_train)

    input_train_poly = poly.transform(input_train)
    input_test_poly = poly.transform(input_test)
    input_cv_poly = poly.transform(input_cv)

    return input_train_poly, input_test_poly, input_cv_poly
