ctc_model.py
import numpy as np
import matplotlib.pyplot as plt
import librosa
import os
import soundfile as sf
from scipy.io import wavfile  # for audio processing
import random

import tensorflow as tf
print(tf.__version__)

import keras
from keras.models import Model, Sequential, model_from_json
from keras.utils import Sequence, plot_model
from keras.layers import *
from keras import backend as K
# Keras loss functions only receive (y_true, y_pred), so the actual CTC cost
# is computed inside a Lambda layer via the backend helper.
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
class CTC():
    def __init__(self,
                 input_size=None,
                 output_size=None,
                 initializer='glorot_uniform'):
        self.input_size = input_size
        self.output_size = output_size
        self.initializer = initializer
        self.m = None   # training model (outputs the CTC loss)
        self.tm = None  # test/prediction model (outputs softmax probabilities)

    def build(self,
              conv_filters=200,
              conv2d_filters=13,
              conv_size=5,
              conv2d_strides=1,
              conv_strides=1,
              act='relu',
              rnn_layers=2,
              LSTM_units=128,
              drop_out=0.8):
        input_data = Input(shape=self.input_size, name='the_inputs')

        # Two 1D convolution blocks over the spectrogram frames
        x = Conv1D(conv_filters,
                   conv_size,
                   strides=conv_strides,
                   padding="same",
                   name='conv1d1')(input_data)
        x = BatchNormalization()(x)
        x = Activation(act)(x)
        x = Conv1D(conv_filters,
                   conv_size,
                   strides=conv_strides,
                   padding="same",
                   name='conv1d2')(x)
        x = BatchNormalization()(x)
        x = Activation(act)(x)

        # Stacked bidirectional LSTM layers
        for _ in range(rnn_layers):
            x = Bidirectional(LSTM(LSTM_units,
                                   return_sequences=True))(x)
            x = Dropout(drop_out)(x)
            x = BatchNormalization()(x)

        # Per-timestep character distribution
        y_pred = TimeDistributed(Dense(self.output_size,
                                       activation='softmax'))(x)

        # CTC inputs
        labels = Input(name='the_labels', shape=[None], dtype='int32')
        input_length = Input(name='input_length', shape=[1], dtype='int32')
        label_length = Input(name='label_length', shape=[1], dtype='int32')

        loss_out = Lambda(ctc_lambda_func,
                          output_shape=(1,),
                          name='ctc')([y_pred,
                                       labels,
                                       input_length,
                                       label_length])

        self.tm = Model(inputs=input_data,
                        outputs=y_pred)
        self.m = Model(inputs=[input_data,
                               labels,
                               input_length,
                               label_length],
                       outputs=loss_out)
        return self.m, self.tm


# Dummy loss: the 'ctc' Lambda layer already outputs the CTC loss,
# so the loss function simply passes it through.
def ctc(y_true, y_pred):
    return y_pred
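# --- Hedged usage sketch (not part of the original file) ---
# The training model `m` returned by CTC.build() outputs the CTC loss itself,
# so it would typically be compiled with the pass-through `ctc` loss above.
# The optimizer and the fit() arguments below are illustrative assumptions only:
#
#   m, tm = CTC(input_size=(101, 594), output_size=29).build()
#   m.compile(loss={'ctc': ctc}, optimizer='adam')
#   # m.fit([features, labels, input_length, label_length], dummy_targets, ...)
#   # tm shares weights with m and is the model used for prediction/decoding.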
# Build the CTC architecture, then load a previously trained model from disk
# (architecture in model.json, weights in model.h5).
model_ctc = CTC((101, 594), 29)
model_ctc.build()

with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")
def graph_spectrogram(wav_file):
    """Compute a spectrogram of a wav file and pad it to a fixed shape."""
    rate, data = get_wav_info(wav_file)
    nfft = 200       # Length of each window segment
    fs = 8000        # Sampling frequency
    noverlap = 120   # Overlap between windows
    nchannels = data.ndim
    if nchannels == 1:
        pxx, freqs, bins, im = plt.specgram(data, NFFT=nfft, Fs=fs, noverlap=noverlap)
    elif nchannels == 2:
        # Use the first channel of a stereo file
        pxx, freqs, bins, im = plt.specgram(data[:, 0], NFFT=nfft, Fs=fs, noverlap=noverlap)
    return modify_spectrogram_shape(pxx)
# Load a wav file
def get_wav_info(wav_file):
    rate, data = wavfile.read(wav_file)
    return rate, data
def modify_spectrogram_shape(sample, shape=(101, 198)):
    """Zero-pad the spectrogram along the time axis to a fixed shape."""
    a = np.zeros(shape)
    a[:, :sample.shape[1]] = sample
    return a  # return the padded array, not the original sample
if __name__ == '__main__':
    file_path = input("file_path: ")
    inp = graph_spectrogram(file_path)
    # Four inputs expected: features (with a batch dim), labels, input_length, label_length
    predictions = loaded_model.predict([np.array([inp]), np.array([0]),
                                        np.array([101]), np.array([40])])
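# --- Hedged decoding sketch (assumption, not from the original file) ---
# If the prediction model `tm` (softmax output) were used instead of the
# loss-output model, the character sequence could be recovered with the
# backend's greedy CTC decoder:
#
#   y_pred = model_ctc.tm.predict(np.array([inp]))   # shape (1, T, 29)
#   seq_len = np.array([y_pred.shape[1]])            # frames per sample
#   decoded, log_prob = K.ctc_decode(y_pred, seq_len, greedy=True)
#   print(K.eval(decoded[0]))                        # integer label indices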