Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions Beta_0.1/Data_Processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import numpy as np
import pandas as pd
import math
from sklearn import preprocessing

class Stock_Data(object):
    """Per-stock training data container.

    Reads one stock's historical trading CSV and turns it into the
    train/test arrays (inputs and targets) the LSTM model consumes.
    """
    def __init__(self, stock_code, input_size, num_steps, train_ratio, interval, future):
        '''
        One Stock_Data object corresponds to one stock's history.

        :param stock_code: stock ticker; locates 'Data\\交易数据\\<code>.csv'
        :param input_size: number of days averaged to form each target
                           (e.g. input_size=5 targets a 5-day mean change)
        :param num_steps: sequence length fed into the LSTM
        :param train_ratio: fraction of samples used for training
        :param interval: window length (days) used to convert close prices
                         into change rates relative to the day before the window
        :param future: prediction horizon in days (how far ahead the target lies)
        '''
        self.stock_code = stock_code
        self.input_size = input_size
        self.num_steps = num_steps
        self.train_ratio = train_ratio
        self.interval = interval
        self.future = future
        self.data_x = []
        self.data_y = []

        df = pd.read_csv('Data\\交易数据\\' + stock_code + '.csv')

        # Convert close prices to change rates, one `interval`-day window
        # at a time.  The base price is the first day of the series for the
        # very first window, otherwise the last day BEFORE the window.
        close = df['close']
        n_close = len(close)
        self.close_changed = []
        for i in range(int(math.ceil(n_close / interval))):
            start = i * interval
            base = close[start] if i == 0 else close[start - 1]
            # BUG FIX: the original tested `> interval` and `< interval`,
            # silently skipping a trailing window whose remaining length was
            # exactly `interval`.  min() covers full and partial windows.
            for j in range(min(interval, n_close - start)):
                self.close_changed += [(close[start + j] - base) / base]

        # Min-max normalization of change rates and daily volume.
        # BUG FIX: the original subtracted a scalar from a plain Python
        # list, which raises TypeError; go through a numpy array instead.
        changed = np.array(self.close_changed)
        self.trans_close = (changed - changed.min()) / (changed.max() - changed.min())
        vol = df['vol'].values
        self.trans_vol = (vol - vol.min()) / (vol.max() - vol.min())

        # Targets are built from the RAW (un-normalized) change rates:
        # data_y[i] = mean change over `input_size` days, `future` days
        # after the end of the i-th input window.
        for i in range(len(self.close_changed) - num_steps - self.input_size - self.future):
            change_y = 0
            for j in range(input_size):
                change_y += self.close_changed[i + num_steps + self.future - j]
            self.data_y += [[change_y / self.input_size]]
        self.data_y = np.array(self.data_y)

        self.data = [[self.trans_close[i], self.trans_vol[i]] for i in range(len(self.trans_close))]
        self.data = preprocessing.MinMaxScaler().fit_transform(self.data)  # scale features into [0, 1]
        # BUG FIX: the original sized data_x with `self.interval` while
        # data_y is sized with `self.future`, so x and y fell out of
        # alignment whenever interval != future.  Use `future` for both.
        self.data_x = np.array([np.array(self.data[i:i + num_steps])
                                for i in range(len(self.data) - num_steps - self.input_size - self.future)])

        # Chronological train/test split: first train_ratio of samples train.
        self.train_x, self.test_x = np.split(self.data_x, [int(len(self.data_x) * self.train_ratio)])
        self.train_y, self.test_y = np.split(self.data_y, [int(len(self.data_y) * self.train_ratio)])

    def get_fluctuation(self):
        """Return the spread (max - min) of this stock's historical change rates."""
        return max(self.close_changed) - min(self.close_changed)


def get_stocks(input_size, num_steps, train_ratio, interval, future):
    '''
    Build a Stock_Data object for every downloaded stock and merge their
    individual test samples into one unified test set.

    :return: (stock codes, list of Stock_Data, mean fluctuation across all
              stocks, merged test inputs, merged test targets)
    '''
    # BUG FIX: pd.DataFrame.from_csv was deprecated and removed from
    # pandas; read_csv with index_col=0 is the drop-in replacement.
    stocks_df = pd.read_csv('Data/交易数据/download_stocks_symbol.csv', index_col=0)
    stock_name = stocks_df['code'].values.tolist()
    # One Stock_Data object per stock code.
    stock_data = [
        Stock_Data(stock_code=code, input_size=input_size, num_steps=num_steps,
                   train_ratio=train_ratio, interval=interval, future=future)
        for code in stock_name
    ]

    # Merge every stock's test samples into a single test set.
    merge_test_x = []
    merge_test_y = []
    for test_data in stock_data:
        merge_test_x += list(test_data.test_x)
        merge_test_y += list(test_data.test_y)

    # Mean fluctuation across all stocks.
    # BUG FIX: the original re-constructed every Stock_Data object here
    # (re-reading and re-processing each CSV) just to query its
    # fluctuation; reuse the objects built above instead.
    mean_fluctuation = sum(d.get_fluctuation() for d in stock_data) / len(stock_name)

    return stock_name, stock_data, mean_fluctuation, merge_test_x, merge_test_y
55 changes: 55 additions & 0 deletions Beta_0.1/Data_fetcher_tushare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import tushare as ts
import pandas as pd
import random,time

'''Credential required to authenticate against the tushare API.'''
# SECURITY NOTE(review): hard-coded API token committed to source control —
# prefer loading it from an environment variable or a config file kept
# outside the repository.
tushare_token='9d318f4f16fc290756e756ddca50bdaabda9d3d98698345f4e505768'

def get_stock_symbol():
    '''Fetch basic info of every currently listed stock, persist it to
    Data\\stocks_info.csv, and return it as a DataFrame.'''
    api = ts.pro_api(tushare_token)
    listed = api.query(
        'stock_basic',
        exchange='',
        list_status='L',
        fields='ts_code,symbol,name,area,industry,list_date',
    )
    listed.to_csv('Data\\stocks_info.csv')
    return listed


def random_download_stocks(stock_count):
    '''
    Randomly download daily bars for `stock_count` stocks, storing each
    history under Data\\交易数据\\<ts_code>.csv and the accepted codes under
    Data\\交易数据\\download_stocks_symbol.csv.

    Only stocks that already traded three years ago are accepted, so every
    downloaded history spans the full training window.

    :param stock_count: number of stocks to download
    '''
    ts.set_token(tushare_token)
    # BUG FIX: pd.DataFrame.from_csv was removed from pandas; use read_csv.
    Stocks = pd.read_csv('Data\\stocks_info.csv', index_col=0)
    length = len(Stocks)
    print('共有', length, '只股票,现在随机下载', stock_count, '只!')

    # Start date = same calendar day three years ago, formatted YYYYMMDD.
    # (Replaces the original hand-rolled zero padding of month/day.)
    now = time.localtime()
    year = now.tm_year - 3
    start = '%04d%02d%02d' % (year, now.tm_mon, now.tm_mday)

    # Download loop: keep resampling random stocks until each slot is filled.
    success_stocks = pd.DataFrame(columns=['code', 'name'])
    for i in range(stock_count):
        df = pd.DataFrame()
        while df.empty:
            index = random.randint(1, 1000000) % length
            ts_code, name = Stocks.iloc[index][['ts_code', 'name']]
            # qfq = forward-adjusted prices: removes ex-dividend/ex-rights
            # jumps so the price series stays continuous for the network.
            df = ts.pro_bar(ts_code=ts_code, start_date=start, adj='qfq')
            # BUG FIX: pro_bar returns None on failure; the original would
            # crash on `df.empty` / the date parse below.
            if df is None or df.empty:
                df = pd.DataFrame()
                continue
            # pro_bar returns newest-first, so the last row is the oldest
            # trade; verify the stock actually traded ~three years ago.
            lastday = time.strptime(df.iloc[[df.shape[0] - 1]]['trade_date'].values[0], '%Y%m%d')
            if not (lastday.tm_year == year and
                    (lastday.tm_mon == now.tm_mon or lastday.tm_mon == now.tm_mon + 1)):
                df = pd.DataFrame()  # reject this stock and resample
        df.to_csv('Data\\交易数据\\' + ts_code + '.csv')
        success_stocks.loc[success_stocks.shape[0] + 1] = {'code': ts_code, 'name': name}
    success_stocks.to_csv('Data\\交易数据\\download_stocks_symbol.csv')
1 change: 1 addition & 0 deletions Beta_0.1/README
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
测试版0.1
188 changes: 188 additions & 0 deletions Beta_0.1/model_lstm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
import pandas as pd
import Data_Processor

class LSTM_net():
    """LSTM regression model (TF1 graph mode) predicting a stock's future
    mean change rate from sequences of (change rate, volume) features."""

    def __init__(self,
                 sess,
                 stock_count,
                 lstm_shape,
                 num_steps,
                 input_size,
                 train_ratio,
                 logs_dir='',
                 plots_dir=''):
        """
        :param sess: TensorFlow session the graph runs in
        :param stock_count: number of stocks (kept for bookkeeping)
        :param lstm_shape: list of layer sizes, one entry per LSTM layer
        :param num_steps: sequence length fed into the LSTM
        :param input_size: days averaged to form each target
        :param train_ratio: fraction of samples used for training
        :param logs_dir: TensorBoard log directory
        :param plots_dir: output directory for plots
        """
        self.sess = sess
        self.stock_count = stock_count
        self.lstm_shape = lstm_shape
        self.num_steps = num_steps
        self.input_size = input_size
        self.logs_dir = logs_dir
        self.plots_dir = plots_dir
        self.train_ratio = train_ratio
        # Build the dataflow graph up front, in the constructor.
        self.graph()

    def graph(self):
        '''Build the dataflow graph: placeholders, dropout-wrapped LSTM
        stack, linear output layer, MSE loss, optimizer and summaries.'''
        with tf.name_scope('inputs'):
            # 2 features per time step: normalized change rate and volume.
            self.inputs = tf.placeholder(tf.float32, [None, self.num_steps, 2], name='inputs')
            self.targets = tf.placeholder(tf.float32, [None, 1], name='targets')
        with tf.name_scope('parameter'):
            self.learning_rate = tf.placeholder(tf.float32, None, name='LearningRate')
            # Probability each neuron is KEPT under dropout.
            self.keep_prob = tf.placeholder(tf.float32, None, name='KeepProb')

        def _one_lstm_layer(lstm_size):
            # One dropout-wrapped LSTM layer of `lstm_size` units.
            # BUG FIX: the original referenced the non-existent attribute
            # self.lstm_size here; use the layer's own size argument.
            return tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(lstm_size, state_is_tuple=True),
                output_keep_prob=self.keep_prob
            )

        with tf.name_scope('hidden_layer'):
            if len(self.lstm_shape) > 1:
                lstm_layers = tf.nn.rnn_cell.MultiRNNCell(
                    [_one_lstm_layer(size) for size in self.lstm_shape],
                    state_is_tuple=True)
            else:
                # BUG FIX: the original called the undefined name
                # one_lstm_layer (missing leading underscore).
                lstm_layers = _one_lstm_layer(self.lstm_shape[0])

        # Run the stack over the input sequences.
        lstm_output, _ = tf.nn.dynamic_rnn(lstm_layers, self.inputs, dtype=tf.float32)

        # (batch, num_steps, lstm_size) -> (num_steps, batch, lstm_size)
        # so the final time step can be gathered along axis 0.
        lstm_output = tf.transpose(lstm_output, [1, 0, 2], name='hidden_layer_output')

        with tf.name_scope('linear_layer'):
            last = tf.gather(lstm_output, int(lstm_output.get_shape()[0]) - 1, name='last_lstm_output')
            # BUG FIX: the weight must match the size of the LAST LSTM
            # layer; self.lstm_size does not exist.
            weight = tf.Variable(tf.truncated_normal([self.lstm_shape[-1], 1]), name='weight')
            bias = tf.Variable(tf.constant(0.1, shape=[1]), name='bias')
            self.prediction = tf.matmul(last, weight) + bias
            tf.summary.histogram('prediction', self.prediction)

        with tf.name_scope('loss'):
            # Mean squared error between prediction and target.
            self.loss = tf.reduce_mean(tf.square(self.prediction - self.targets))
            tf.summary.scalar('loss', self.loss)

        self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate).minimize(self.loss)

        # TensorBoard: merged summaries plus train/test writers.
        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(self.logs_dir + '/train', self.sess.graph)
        self.test_writer = tf.summary.FileWriter(self.logs_dir + '/test')

    def train(self, max_epoch, init_learning_rate, decay_rate, decay_epoch, batch_ratio, keep_prob, interval, future):
        '''Load the data, then run max_epoch training epochs over every
        stock, evaluating on the merged test set after each epoch.'''

        # Load data and initialize model parameters.
        stock_name, stock_data, mean_fluctuation, merge_test_x, merge_test_y = Data_Processor.get_stocks(
            input_size=self.input_size,
            num_steps=self.num_steps,
            train_ratio=self.train_ratio,
            interval=interval,
            future=future)

        # Build ONE input pipeline (placeholders, dataset, iterator,
        # get_next op) per stock, outside the epoch loop.
        # BUG FIX: the original re-created these graph ops inside the epoch
        # loop, growing the graph on every epoch.
        pipelines = []
        for data in stock_data:
            train_x = tf.placeholder(data.train_x.dtype, data.train_x.shape)
            train_y = tf.placeholder(data.train_y.dtype, data.train_y.shape)
            batch_size = int(len(data.train_x) * batch_ratio)
            dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y)).batch(batch_size)
            iterator = dataset.make_initializable_iterator()
            pipelines.append((data, train_x, train_y, iterator, iterator.get_next()))

        tf.global_variables_initializer().run()

        def _train_feed(batch_x, batch_y, learning_rate):
            # Feed for one optimizer step.
            return {
                self.learning_rate: learning_rate,
                self.keep_prob: keep_prob,
                self.inputs: batch_x,
                self.targets: batch_y,
            }

        def _test_feed():
            # Evaluation feed: no learning, no dropout, full merged test set.
            return {
                self.learning_rate: 0.0,
                self.keep_prob: 1.0,
                self.inputs: merge_test_x,
                self.targets: merge_test_y,
            }

        for epoch in range(max_epoch):
            # Exponential learning-rate decay once past decay_epoch.
            learning_rate = init_learning_rate * (
                decay_rate ** max(float(epoch + 1 - decay_epoch), 0.0)
            )

            # Train stock by stock.
            for data, train_x, train_y, iterator, next_batch in pipelines:
                self.sess.run(iterator.initializer,
                              feed_dict={train_x: data.train_x, train_y: data.train_y})
                # BUG FIX: the original fetched a single batch per stock per
                # epoch and ignored the rest of the training data; consume
                # the iterator until it is exhausted.
                summary = None
                while True:
                    try:
                        batch_x, batch_y = self.sess.run(next_batch)
                    except tf.errors.OutOfRangeError:
                        break
                    summary, _ = self.sess.run(
                        [self.merged, self.optimizer],
                        _train_feed(batch_x, batch_y, learning_rate)
                    )
                if summary is not None:
                    self.train_writer.add_summary(summary, epoch)

            # Evaluate on the merged test set after every epoch.
            test_pred, test_loss = self.sess.run([self.prediction, self.loss], _test_feed())
            print('After epoch', epoch, 'the test_loss: ', test_loss)

        # Final evaluation.
        final_pred, final_loss = self.sess.run([self.prediction, self.loss], _test_feed())
        print('Final,the test_loss: ', final_loss)

        # Mean prediction error across all test samples.
        # NOTE(review): errors are SIGNED, so positive and negative errors
        # cancel; consider abs() if a magnitude metric is intended.
        sum_error = 0
        for i in range(final_pred.shape[0]):
            print('final_pred:', final_pred[i][0], ' and the target:', merge_test_y[i][0])
            sum_error += (final_pred[i][0] - merge_test_y[i][0])
        mean_error = sum_error / final_pred.shape[0]

        # The mean fluctuation across stocks gives the error a scale to be
        # judged against.
        print('所有股票涨幅的平均波动为:', mean_fluctuation)
        print('在测试集上对于涨跌趋势的预测的平均误差为:', mean_error)

        self.train_writer.close()
        self.test_writer.close()


def main():
    """Entry point: build the LSTM model in a fresh session and train it."""
    with tf.Session() as sess:
        lstm_model = LSTM_net(
            sess,
            stock_count=5,
            lstm_shape=[128],   # single LSTM layer of 128 units
            num_steps=250,      # one trading year (~250 sessions) per sample
            input_size=10,
            train_ratio=0.9,
            logs_dir='./logs',
            plots_dir='./plots'
        )
        lstm_model.train(max_epoch=30,
                         init_learning_rate=0.001,
                         decay_rate=0.98,
                         decay_epoch=10,
                         batch_ratio=0.8,
                         keep_prob=0.8,
                         interval=30,
                         future=30
                         )


# BUG FIX: guard the entry point so importing this module does not
# immediately start a full training run.
if __name__ == '__main__':
    main()
Loading