Skip to content

Commit

Permalink
add train and test python3 script and modify readme
Browse files Browse the repository at this point in the history
  • Loading branch information
nl8590687 committed May 11, 2018
1 parent fb89aab commit 2685bbe
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 21 deletions.
22 changes: 20 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,28 @@

This project is implemented with Keras and TensorFlow, based on LSTM, CNN and CTC.

[查看本项目的Wiki页面](https://github.com/nl8590687/ASRT_SpeechRecognition/wiki) (正在完善中)

本项目目前已经可以正常进行训练了。

通过git克隆仓库以后,需要将datalist目录下的文件全部拷贝到dataset目录下,也就是将其跟数据集放在一起。
```shell
$ cp -rf datalist/* dataset/
```

目前可用的模型有:模型22(speech_model22)

本项目运行请执行:
本项目开始训练请执行:
```shell
$ python3 train_mspeech.py
```
本项目开始测试请执行:
```shell
$ python3 SpeechModel22.py
$ python3 test_mspeech.py
```
测试之前,请确保代码中填写的模型文件路径存在。

如果程序运行期间有什么问题,可以及时在issue中提出来,我将尽快做出答复。

## Model 模型

Expand All @@ -28,6 +40,12 @@ CNN + LSTM/GRU + CTC

基于概率图的马尔可夫模型

## About Accuracy 关于准确率

当前,speech_model22 在GPU上训练了120+小时(大约50个epoch)后,在测试集上基本能达到70%+的汉语拼音正确率

不过由于目前国际和国内的部分团队能做到97%,所以正确率仍有待于进一步提高

## Python Import
Python的依赖库

Expand Down
48 changes: 30 additions & 18 deletions SpeechModel22.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,17 @@ def __init__(self, datapath):

def CreateModel(self):
'''
定义CNN/LSTM/CTC模型,使用函数式模型
输入层:39维的特征值序列,一条语音数据的最大长度设为1500(大约15s)
隐藏层一:1024个神经元的卷积层
隐藏层二:池化层,池化窗口大小为2
隐藏层三:Dropout层,需要断开的神经元的比例为0.2,防止过拟合
隐藏层四:循环层、LSTM层
隐藏层五:Dropout层,需要断开的神经元的比例为0.2,防止过拟合
定义CNN/LSTM/CTC模型,使用函数式模型,暂时不完全与这里的注释内容相同
输入层:200维的特征值序列,一条语音数据的最大长度设为1600(大约16s)
隐藏层一:32卷积核,尺寸3*3的卷积层
隐藏层二:32卷积核,尺寸3*3的卷积层
隐藏层三:最大池化层,池化窗口大小为2
隐藏层四:64卷积核,尺寸3*3的卷积层
隐藏层五:64卷积核,尺寸3*3的卷积层
隐藏层六:最大池化层,池化窗口大小为2
隐藏层七:全连接层,神经元数量为256,使用reLu作为激活函数,
隐藏层六:全连接层,神经元数量为self.MS_OUTPUT_SIZE,使用softmax作为激活函数,
输出层:自定义层,即CTC层,使用CTC的loss作为损失函数,实现连接性时序多输出
输出层:自定义层,即CTC层,使用CTC的loss作为损失函数,CTC_loss越小,神经网络拟合的越好
'''
# 每一帧使用13维mfcc特征及其13维一阶差分和13维二阶差分表示,最大信号序列长度为1500
input_data = Input(name='the_input', shape=(self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH, 1))
Expand Down Expand Up @@ -178,8 +179,9 @@ def TrainModel(self, datapath, epoch = 2, save_step = 1000, batch_size = 32, fil
训练模型
参数:
datapath: 数据保存的路径
epoch: 迭代轮数
epoch: 迭代轮数,暂时没有用处
save_step: 每多少步保存一次模型
batch_size:每一个训练批次的数据量的大小
filename: 默认保存文件名,不含文件后缀名
'''
data=DataSpeech(datapath, 'train')
Expand All @@ -204,8 +206,8 @@ def TrainModel(self, datapath, epoch = 2, save_step = 1000, batch_size = 32, fil
break

self.SaveModel(comment='_e_'+str(epoch)+'_step_'+str(n_step * save_step))
self.TestModel(self.datapath, str_dataset='train', data_count = 4)
self.TestModel(self.datapath, str_dataset='dev', data_count = 4)
self.TestModel(self.datapath, str_dataset='train', data_count = 4, show_ratio = False)
self.TestModel(self.datapath, str_dataset='dev', data_count = 4, show_ratio = False)

def LoadModel(self,filename='model_speech/speech_model22.model'):
'''
Expand All @@ -224,9 +226,14 @@ def SaveModel(self,filename='model_speech/speech_model22',comment=''):
f.write(filename+comment)
f.close()

def TestModel(self, datapath='', str_dataset='dev', data_count = 32, out_report = False):
def TestModel(self, datapath='', str_dataset='dev', data_count = 32, out_report = False, show_ratio = True):
'''
测试检验模型效果
datapath:数据集的路径,暂时不用
str_dataset:使用何种数据集进行测试,可选的有train、dev和test,分别代表训练集、开发集和测试集
data_count:用于测试的数据数量,数字越大随机波动越小,如果是个人电脑一般可以设为128
out_report:是否生成测试报告,可在项目根目录下看到
show_ratio:测试时是否显示当前测试进度比例,避免因为时间太长以为发生了死循环
'''
data=DataSpeech(self.datapath, str_dataset)
#data.LoadDataList(str_dataset)
Expand Down Expand Up @@ -257,7 +264,7 @@ def TestModel(self, datapath='', str_dataset='dev', data_count = 32, out_report
else: # 否则肯定是增加了一堆乱七八糟的奇奇怪怪的字
word_error_num += words_n # 就直接加句子本来的总字数就好了

if(i % 10 == 0):
if(i % 10 == 0 and show_ratio == True):
print('测试进度:',i,'/',data_count)

txt = ''
Expand All @@ -281,6 +288,8 @@ def Predict(self, data_input, input_len):
'''
预测结果
返回语音识别后的拼音符号列表
data_input:(1, timestep, 200)的numpy array,与GetData中的data_input是相同的
input_len:输入的语音标签序列的长度,这个与模型的卷积层和池化层的配置有关
'''

batch_size = 1
Expand Down Expand Up @@ -332,7 +341,8 @@ def Predict(self, data_input, input_len):
def RecognizeSpeech(self, wavsignal, fs):
'''
最终做语音识别用的函数,识别一个wav序列的语音
不过这里现在还有bug
wavsignal:wav声音的原始语音序列
fs:wav的采样频率,需要使用16kHz的wav采样频率才行
'''

#data = self.data
Expand All @@ -342,6 +352,7 @@ def RecognizeSpeech(self, wavsignal, fs):
#data_input = GetMfccFeature(wavsignal, fs)
#t0=time.time()
data_input = GetFrequencyFeature2(wavsignal, fs)
#data_input = GetFrequencyFeature3(wavsignal, fs)
#t1=time.time()
#print('time cost:',t1-t0)

Expand All @@ -368,6 +379,7 @@ def RecognizeSpeech(self, wavsignal, fs):
def RecognizeSpeech_FromFile(self, filename):
'''
最终做语音识别用的函数,识别指定文件名的语音
filename:wav文件名
'''

wavsignal,fs = read_wav_data(filename)
Expand All @@ -385,7 +397,7 @@ def model(self):
'''
返回keras model
'''
return self._model
return self._model, self.base_model


if(__name__=='__main__'):
Expand Down Expand Up @@ -422,8 +434,8 @@ def model(self):
ms = ModelSpeech(datapath)

#ms.LoadModel(modelpath + 'm22_2\\1\\speech_model22_e_0_step_327500.model')
ms.LoadModel(modelpath + 'm22_2/1/speech_model22_e_0_step_327500.model')
#ms.TrainModel(datapath, epoch = 50, batch_size = 4, save_step = 500)
#ms.LoadModel(modelpath + 'm22_2/1/speech_model22_e_0_step_327500.model')
ms.TrainModel(datapath, epoch = 50, batch_size = 4, save_step = 500)
#ms.TestModel(datapath, str_dataset='train', data_count = 128, out_report = True)
#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00241I0053.wav')
#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00020I0087.wav')
Expand Down
3 changes: 2 additions & 1 deletion readdata22_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,8 @@ def GetData(self,n_start,n_amount=1):

# 获取输入特征
data_input = GetFrequencyFeature2(wavsignal,fs)
#data_input = np.array(data_input)
#data_input = GetFrequencyFeature3(wavsignal,fs)

data_input = data_input.reshape(data_input.shape[0],data_input.shape[1],1)
#arr_zero = np.zeros((1, 39), dtype=np.int16) #一个全是0的行向量

Expand Down
57 changes: 57 additions & 0 deletions test_mspeech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: nl8590687
Evaluation script for the speech-recognition acoustic model.

Loads a previously trained SpeechModel22 checkpoint and runs it over the
training subset, writing an accuracy report (out_report=True).
"""
import platform as plat
import os

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session


from SpeechModel22 import ModelSpeech


os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Cap TensorFlow's GPU memory allocation instead of letting it grab the whole
# card.  NOTE(review): the original comment said 70%, but the fraction below
# is 0.9 — confirm which value is intended.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.9
#config.gpu_options.allow_growth=True  # alternative: allocate on demand
set_session(tf.Session(config=config))


datapath = ''
modelpath = 'model_speech'


# Make sure the model directory exists so later loads/saves do not blow up.
if not os.path.exists(modelpath):
    os.makedirs(modelpath)

# Dataset location and path separator differ per operating system.
system_type = plat.system()
if system_type == 'Windows':
    datapath = 'E:\\语音数据集'
    modelpath = modelpath + '\\'
elif system_type == 'Linux':
    datapath = 'dataset'
    modelpath = modelpath + '/'
else:
    # Unrecognized OS: warn, then fall back to the Linux-style layout.
    print('*[Message] Unknown System\n')
    datapath = 'dataset'
    modelpath = modelpath + '/'

ms = ModelSpeech(datapath)

ms.LoadModel(modelpath + 'speech_model22_e_0_step_327500.model')

ms.TestModel(datapath, str_dataset='train', data_count = 128, out_report = True)

#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00241I0053.wav')
#print('*[提示] 语音识别结果:\n',r)


49 changes: 49 additions & 0 deletions train_mspeech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: nl8590687
Training script for the speech-recognition acoustic model.

Builds a SpeechModel22 and runs TrainModel over the dataset, saving a
checkpoint every `save_step` steps.
"""
import platform as plat
import os

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session


from SpeechModel22 import ModelSpeech

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Cap TensorFlow's GPU memory allocation instead of letting it grab the whole
# card.  NOTE(review): the original comment said 70%, but the fraction below
# is 0.9 — confirm which value is intended.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.9
#config.gpu_options.allow_growth=True  # alternative: allocate on demand
set_session(tf.Session(config=config))


datapath = ''
modelpath = 'model_speech'


# Make sure the model directory exists so checkpoint saves do not blow up.
if not os.path.exists(modelpath):
    os.makedirs(modelpath)

# Dataset location and path separator differ per operating system.
system_type = plat.system()
if system_type == 'Windows':
    datapath = 'E:\\语音数据集'
    modelpath = modelpath + '\\'
elif system_type == 'Linux':
    datapath = 'dataset'
    modelpath = modelpath + '/'
else:
    # Unrecognized OS: warn, then fall back to the Linux-style layout.
    print('*[Message] Unknown System\n')
    datapath = 'dataset'
    modelpath = modelpath + '/'

ms = ModelSpeech(datapath)

#ms.LoadModel(modelpath + 'speech_model22_e_0_step_327500.model')  # resume from a checkpoint
ms.TrainModel(datapath, epoch = 50, batch_size = 4, save_step = 500)


0 comments on commit 2685bbe

Please sign in to comment.