From dbe546a930f4c96ff4a4106a49a21e8c5fb1a3b6 Mon Sep 17 00:00:00 2001
From: Raidas Grisk <30408368+RaidasGrisk@users.noreply.github.com>
Date: Fri, 19 Jan 2018 18:56:04 +0200
Subject: [PATCH] Add files via upload

---
 helpers/get_features.py | 130 +++++++++++++++++++++++++++++++++++++---
 helpers/utils.py        | 118 ++++++++++++++++++++++++++++++++----
 2 files changed, 229 insertions(+), 19 deletions(-)

diff --git a/helpers/get_features.py b/helpers/get_features.py
index b28bae9..4d665df 100644
--- a/helpers/get_features.py
+++ b/helpers/get_features.py
@@ -5,10 +5,22 @@
 https://cryptotrader.org/talib
 """
 
+import pandas as pd
 import numpy as np
-from datetime import datetime, timedelta
+from datetime import datetime
 from talib.abstract import *
+import talib
 from helpers.utils import extract_timeseries_from_oanda_data
+import matplotlib.pyplot as plt
+
+
+def scale_data(input_data_no_dummies, split):
+    """Min-max scale non-dummy data; the scaler is fit on the train part of the split only."""
+    from sklearn.preprocessing import MinMaxScaler
+    train_split = int(len(input_data_no_dummies) * split[0])
+    scaler = MinMaxScaler()
+    scaler.fit(input_data_no_dummies[:train_split])
+    return scaler.transform(input_data_no_dummies)
 
 
 def prep_data_for_feature_gen(data):
@@ -109,17 +121,116 @@ def get_features(oanda_data):
 
     all_dummies = np.array([ht_trend, mrkt_london, mrkt_ny, mrkt_sydney, mrkt_tokyo])
 
-    # min max parameters for scaling
-    # _, length = all_indicators.shape
-    # length = int(length * 0.7)
-    #
-    # min_max_parameters = np.array([np.nanmax(all_indicators[:, :length], axis=1),
-    #                                np.nanmin(all_indicators[:, :length], axis=1)])
-
     return all_indicators.T, all_dummies.T  # transpose to get (data_points, features)
 
 
-# min max scaling params (needs to be created manually (for now) or better use scilearn min_max scaler
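
get_features_v2 below leans on ta-lib's abstract interface: every indicator advertises
its parameters through .info, which is how indicators that take a timeperiod (and are
swept over time_periods) are told apart from parameter-free ones. A minimal probe of
that interface, with 'SMA' chosen arbitrarily:

    import talib.abstract

    sma = talib.abstract.Function('SMA')
    print(sma.info['parameters'])                  # OrderedDict([('timeperiod', 30)])
    print('timeperiod' in sma.info['parameters'])  # True, so SMA gets one column per time period
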
+def get_features_v2(oanda_data, time_periods, return_numpy):
+    """Return (almost) all indicators from the ta-lib library for the given time periods."""
+
+    # load primary data
+    inputs = prep_data_for_feature_gen(oanda_data)
+
+    # collect the names of all relevant ta-lib functions
+    function_groups = ['Cycle Indicators',
+                       'Momentum Indicators',
+                       'Overlap Studies',
+                       'Volume Indicators',
+                       'Volatility Indicators',
+                       'Statistic Functions']
+    function_list = [talib.get_function_groups()[group] for group in function_groups]
+    function_list = [item for sublist in function_list for item in sublist]  # flatten the list
+    function_list.remove('MAVP')
+
+    # price and volume percentage changes (first observation gets 0, it has no predecessor)
+    price, volume = extract_timeseries_from_oanda_data(oanda_data, ['closeMid', 'volume'])
+    price_change = np.array([float(i) / float(j) - 1 for i, j in zip(price[1:], price)])
+    volume_change = np.array([float(i) / float(j) - 1 for i, j in zip(volume[1:], volume)])
+    price_change = np.concatenate([[0], price_change], axis=0)
+    volume_change = np.concatenate([[0], volume_change], axis=0)
+
+    # get all indicators
+    df_indicators = pd.DataFrame()
+    df_indicators['price'] = price.ravel()
+    df_indicators['price_delta'] = price_change
+    df_indicators['volume_change'] = volume_change
+    for func in function_list:
+        if 'timeperiod' in getattr(talib.abstract, func).info['parameters']:
+            for time_period in time_periods:
+                indicator = getattr(talib.abstract, func)(inputs, timeperiod=time_period)
+                if any(isinstance(item, np.ndarray) for item in indicator):  # indicator returns > 1 time-series
+                    for indicator_id, x in enumerate(indicator):
+                        df_indicators[func + '_' + str(indicator_id) + '_tp_' + str(time_period)] = x
+                else:  # indicator returns a single time-series
+                    df_indicators[func + '_tp_' + str(time_period)] = indicator
+        else:
+            indicator = getattr(talib.abstract, func)(inputs)
+            if any(isinstance(item, np.ndarray) for item in indicator):  # > 1 time-series
+                for indicator_id, x in enumerate(indicator):
+                    df_indicators[func + str(indicator_id)] = x
+            else:
+                df_indicators[func] = indicator
+
+    # manual handling of features: unbounded series become percentage changes, oscillators
+    # become above-own-rolling-mean flags, moving-average-like overlays become distances
+    # from price (the *_tp_10 names assume 10 is among the requested time_periods)
+    df_indicators['AD'] = df_indicators['AD'].pct_change()
+    df_indicators['OBV'] = df_indicators['OBV'].pct_change()
+    df_indicators['HT_DCPERIOD'] = (df_indicators['HT_DCPERIOD'] > df_indicators['HT_DCPERIOD'].rolling(50).mean()).astype(float)
+    df_indicators['HT_DCPHASE'] = (df_indicators['HT_DCPHASE'] > df_indicators['HT_DCPHASE'].rolling(10).mean()).astype(float)
+    df_indicators['ADX_tp_10'] = (df_indicators['ADX_tp_10'] > df_indicators['ADX_tp_10'].rolling(10).mean()).astype(float)
+    df_indicators['MACD0'] = df_indicators['MACD0'] - df_indicators['MACD1']
+    df_indicators['MINUS_DI_tp_10'] = (df_indicators['MINUS_DI_tp_10'] > df_indicators['MINUS_DI_tp_10'].rolling(20).mean()).astype(float)
+    df_indicators['RSI_tp_10'] = (df_indicators['RSI_tp_10'] > df_indicators['RSI_tp_10'].rolling(15).mean()).astype(float)
+    df_indicators['ULTOSC'] = (df_indicators['ULTOSC'] > df_indicators['ULTOSC'].rolling(15).mean()).astype(float)
+    for col in ['BBANDS_0_tp_10', 'BBANDS_1_tp_10', 'BBANDS_2_tp_10', 'DEMA_tp_10', 'EMA_tp_10',
+                'HT_TRENDLINE', 'KAMA_tp_10', 'MAMA0', 'MAMA1', 'MIDPOINT_tp_10', 'MIDPRICE_tp_10',
+                'SMA_tp_10', 'T3_tp_10', 'TEMA_tp_10', 'TRIMA_tp_10', 'WMA_tp_10', 'SAR',
+                'LINEARREG_tp_10', 'LINEARREG_INTERCEPT_tp_10', 'TSF_tp_10']:
+        df_indicators[col] = df_indicators[col] - df_indicators['price']  # distance from price
+
+    # market session dummies, based on candle hour
+    time = np.array([datetime.strptime(x['time'], '%Y-%m-%dT%H:%M:%S.000000Z') for x in oanda_data])
+    df_indicators['mrkt_london'] = np.array([3 <= x.hour <= 11 for x in time]).astype(int)
+    df_indicators['mrkt_ny'] = np.array([8 <= x.hour <= 16 for x in time]).astype(int)
+    df_indicators['mrkt_sydney'] = np.array([x.hour >= 17 or x.hour <= 1 for x in time]).astype(int)
+    df_indicators['mrkt_tokyo'] = np.array([x.hour >= 19 or x.hour <= 3 for x in time]).astype(int)
+
+    print('Features shape: {}'.format(df_indicators.shape))
+
+    return df_indicators.values if return_numpy else df_indicators
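
Most ta-lib indicators need a warm-up window, so the first rows returned by
get_features_v2 come back as NaN and should be dropped before training. A short
usage sketch, assuming oanda_data is an OANDA candle array like the one loaded
in the commented-out block below:

    features = get_features_v2(oanda_data, time_periods=[10], return_numpy=False)
    print(features.isnull().sum().max())  # length of the longest indicator warm-up
    features = features.dropna()          # or use remove_nan_rows from helpers.utils
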
+
+
+# # min max scaling params (needs to be created manually (for now), or better, use the sklearn min-max scaler)
+# import pandas as pd
+# oanda_data = np.load('data\\AUD_JPY_H1.npy')[-50000:]
+# all_indicators, all_dummies = get_features(oanda_data)
+# length = int(len(all_indicators) * 0.5)
+# all_indicators = pd.DataFrame(all_indicators[:length, ])
+# all_indicators_pd = all_indicators[all_indicators.apply(lambda x: np.abs(x - x.median()) / x.std() < 3).all(axis=1)]
+# all_indicators_np = all_indicators_pd.values
+#
+# min_max_parameters = np.array([np.nanmax(all_indicators_np[:, :length].T, axis=1),
+#                                np.nanmin(all_indicators_np[:, :length].T, axis=1)])
+
+# eur usd
 min_max_scaling = np.array([[1.86410584e-03, 2.01841085e+00, 1.19412800e+00, 1.19447352e+00,
                              1.19295244e+00, 2.70961491e-03, 1.32700000e-03, 4.05070743e-03,
                              9.86577181e-01,
@@ -138,3 +249,4 @@ def get_features(oanda_data):
                              -2.96059473e-15, -1.71810219e+03, -4.40536468e-01, -9.46300629e-01,
                              1.65643780e+02, 1.18376500e+00, 6.95891066e-02]])
+
diff --git a/helpers/utils.py b/helpers/utils.py
index 554765b..6f1ab3d 100644
--- a/helpers/utils.py
+++ b/helpers/utils.py
@@ -3,6 +3,8 @@
 """
 
 import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
 
 
 def remove_nan_rows(items):
@@ -27,7 +29,7 @@ def extract_timeseries_from_oanda_data(oanda_data, items):
     return output if len(output) > 1 else output[0]
 
 
-def price_to_binary_target(oanda_data, delta=0.001):
+def price_to_binary_target(oanda_data, delta=0.0001):
     """Quick and dirty way of constructing output where:
        [1, 0, 0] rise in price
        [0, 1, 0] price drop
@@ -35,21 +37,29 @@
+       [0, 0, 1] no change
     """
     price = extract_timeseries_from_oanda_data(oanda_data, ['closeMid'])
     price_change = np.array([x1 / x2 - 1 for x1, x2 in zip(price[1:], price)])
+    price_change = np.concatenate([[[0]], price_change])
     binary_price = np.zeros(shape=(len(price), 3))
     binary_price[-1] = np.nan
-    for data_point in range(len(price_change)):
-        if price_change[data_point] > 0 and price_change[data_point] - delta > 0:  # price will drop
+    for data_point in range(len(price_change) - 1):
+        if price_change[data_point+1] > 0 and price_change[data_point+1] - delta > 0:  # price will rise
             column = 0
-        elif price_change[data_point] < 0 and price_change[data_point] + delta < 0:  # price will rise
+        elif price_change[data_point+1] < 0 and price_change[data_point+1] + delta < 0:  # price will drop
             column = 1
         else:  # price will not change
             column = 2
         binary_price[data_point][column] = 1
 
+    # print target label distribution
     data_points = len(binary_price[:-1])
     print('Rise: {:.2f}, Drop: {:.2f}, Flat: {:.2f}'.format(np.sum(binary_price[:-1, 0]) / data_points,
                                                             np.sum(binary_price[:-1, 1]) / data_points,
                                                             np.sum(binary_price[:-1, 2]) / data_points))
+
+    # print the first rows to check that no look-ahead bias is introduced
+    print(pd.DataFrame(np.concatenate([np.around(price, 5),
+                                       np.around(price_change, 4),
+                                       binary_price.astype(int)], axis=1)[:10, :]))
+
     return binary_price
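
The labelling loop above is easy to get wrong around the one-period look-ahead, so it
helps to cross-check it against a vectorized equivalent. A sketch under the same
conventions (row t is labelled by the move from t to t+1, the last row gets NaN);
the helper name is hypothetical:

    import numpy as np

    def binary_target_check(price, delta=0.0001):
        price = np.asarray(price).ravel()
        change = price[1:] / price[:-1] - 1  # return from candle t to t+1, stored at row t
        rise = change > delta
        drop = change < -delta
        flat = ~(rise | drop)
        target = np.column_stack([rise, drop, flat]).astype(float)
        return np.vstack([target, np.full((1, 3), np.nan)])  # last candle has no look-ahead return
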
@@ -82,14 +92,19 @@ def get_signal(softmax_output):
     return signal
 
 
-def portfolio_value(price, signal, trans_costs=0.0001):
-    """Return portfolio value given price of an instrument, it's transaction costs and signal values"""
-    price_change = np.array([i / j - 1 for i, j in zip(price[1:], price)])
-    signal_percent = signal[:-1] * price_change.reshape(len(price_change), 1)
+def portfolio_value(price_change, signal, trans_costs=0.000):
+    """Return portfolio value given percentage price changes, signals and transaction costs.
+
+    IMPORTANT: each signal comes from the last fully formed candle and is paired with
+    the percentage price change from that candle to the next period.
+    """
+    signal_percent = signal[:-1] * price_change[1:]
     transaction_costs = np.zeros_like(signal_percent)
     for i in range(len(signal)-1):
-        transaction_costs[i] = [trans_costs * price[i] if signal[i] != signal[i+1] else 0]
+        # costs apply only when the position changes into a non-flat signal
+        transaction_costs[i] = trans_costs if signal[i] != signal[i+1] and signal[i+1] != 0 else 0
     value = np.cumsum(signal_percent - transaction_costs) + 1
     return value
@@ -122,4 +137,87 @@ def get_cnn_input_output(x, y, time_steps=12):
     for i in range(data_points - time_steps):
         x_batch_reshaped.append(x[i:i+time_steps, :])
     x_batch_reshaped = np.transpose(np.array([x_batch_reshaped]), axes=(1, 3, 2, 0))
-    return np.array(x_batch_reshaped), y[time_steps:]
\ No newline at end of file
+    return np.array(x_batch_reshaped), y[time_steps:]
+
+
+def plot_roc_curve(y_pred_prob, y_target):
+    """Compute micro-averaged ROC AUC for the 3-class one-hot targets.
+    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html"""
+    from sklearn.metrics import roc_curve, auc
+    # per-class roc curves
+    fpr = dict()
+    tpr = dict()
+    roc_auc = dict()
+    for i in range(3):
+        fpr[i], tpr[i], _ = roc_curve(y_target[:, i], y_pred_prob[:, i])
+        roc_auc[i] = auc(fpr[i], tpr[i])
+
+    # compute micro-average ROC curve and ROC area
+    fpr["micro"], tpr["micro"], _ = roc_curve(y_target.ravel(), y_pred_prob.ravel())
+    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
+
+    return roc_auc["micro"], fpr["micro"], tpr["micro"]
+
+
+def y_trans(y, to_1d):
+    """Transform y between one-hot ([[1, 0, 0], [0, 1, 0], ...]) and 1-d ([1, -1, ...]) form"""
+    if to_1d:
+        y_flat = y.argmax(axis=1)
+        mapping = {0: 1, 1: -1, 2: 0}
+        y_new = np.copy(y_flat)
+        for k, v in mapping.items():
+            y_new[y_flat == k] = v
+        return y_new.reshape(-1, 1)
+    else:
+        y_new = np.zeros(shape=(len(y), 3))
+        for i in range(len(y_new)):
+            index = 0 if y[i] == 1 else 1 if y[i] == -1 else 2
+            y_new[i, index] = 1
+        return y_new
+
+
+def min_max_scale(input_train, input_test, input_cv, std_dev_threshold=2.1):
+    """Min-max scale train, test and cv data with a scaler fit on outlier-free train data only."""
+    from sklearn.preprocessing import MinMaxScaler
+
+    # get rid of outliers before fitting the scaler
+    input_train_df = pd.DataFrame(input_train)
+    input_train_no_outliers = input_train_df[input_train_df.apply(
+        lambda x: np.abs(x - x.median()) / x.std() < std_dev_threshold).all(axis=1)].values
+
+    scaler = MinMaxScaler()
+    scaler.fit(input_train_no_outliers)
+
+    # transform (do not refit) every data set with the train-fitted scaler
+    input_train_scaled = scaler.transform(input_train)
+    input_test_scaled = scaler.transform(input_test)
+    input_cv_scaled = scaler.transform(input_cv)
+
+    return input_train_scaled, input_test_scaled, input_cv_scaled
+
+
+def get_pca(input_train, input_test, input_cv, threshold=0.01):
+    """Fit PCA on train data and keep the components whose explained variance exceeds threshold."""
+    from sklearn.decomposition import PCA
+    pca = PCA()
+    pca.fit(input_train)
+    plt.plot(pca.explained_variance_ratio_)
+    nr_features = np.sum(pca.explained_variance_ratio_ > threshold)
+
+    # transform (do not refit) every data set with the train-fitted pca
+    input_train_pca = pca.transform(input_train)[:, :nr_features]
+    input_test_pca = pca.transform(input_test)[:, :nr_features]
+    input_cv_pca = pca.transform(input_cv)[:, :nr_features]
+
+    return input_train_pca, input_test_pca, input_cv_pca
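
min_max_scale and get_pca share one pattern worth calling out: statistics are fitted on
training data only and then merely applied to test and cv, which keeps test-set
information from leaking into preprocessing. Condensed (x_train and x_test are
placeholders):

    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler().fit(x_train)        # learn min/max from training data only
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)    # reuse the train statistics, never refit
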
+
+
+def get_poloynomials(input_train, input_test, input_cv, degree=2):
+    """Polynomial feature expansion of the given degree, fitted on train data."""
+    from sklearn.preprocessing import PolynomialFeatures
+    poly = PolynomialFeatures(degree=degree)
+    poly.fit(input_train)
+
+    input_train_poly = poly.transform(input_train)
+    input_test_poly = poly.transform(input_test)
+    input_cv_poly = poly.transform(input_cv)
+
+    return input_train_poly, input_test_poly, input_cv_poly
\ No newline at end of file
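
For orientation, a hypothetical end-to-end use of the helpers this patch adds; the file
path, time periods and split ratios are placeholders, and remove_nan_rows is assumed to
accept a list of equally long arrays:

    import numpy as np
    from helpers.get_features import get_features_v2
    from helpers.utils import price_to_binary_target, remove_nan_rows, min_max_scale

    oanda_data = np.load('data/EUR_USD_H1.npy')                # placeholder path
    y = price_to_binary_target(oanda_data, delta=0.0001)       # [rise, drop, flat] targets
    x = get_features_v2(oanda_data, time_periods=[10], return_numpy=True)
    x, y = remove_nan_rows([x, y])                             # drop indicator warm-up rows
    train, test = int(len(x) * 0.7), int(len(x) * 0.85)        # 70/15/15 split
    x_train, x_test, x_cv = x[:train], x[train:test], x[test:]
    x_train, x_test, x_cv = min_max_scale(x_train, x_test, x_cv)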