-
Notifications
You must be signed in to change notification settings - Fork 0
/
MFCC.py
116 lines (94 loc) · 4.06 KB
/
MFCC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Mel Frequency Cepstrum Coefficients class
This moves low and high frequencies to different sides
of the spectrum thus giving us a non-linear output
for human auditory perception
@author: alejandrogonzales
"""
import numpy as np
from scipy.fftpack import dct
# Mel Frequency Cepstrum Coefficients Class
# TODO remove all refs to self and replace with local vars
class MFCC():
# Initialize input constructor vars
Afcc = 1125 # amplitude of MFCC calculation (default see Mel scale equation)
#Afcc = 2595 # amplitude of MFCC calculation (default see Mel scale equation)
Bfcc = 700 # amplitude for exponential calculation (see Mel scale equation)
Nfft = 0 # number of FFT pts
Nfcc = 0 # number of MFCC filterbanks
Fs = 0 # sampling frequency
lower_freq = 0
upper_freq = 0
mel_pts = None # init mel scale array
freq_pts = None # init frequency array
bin_pts = None # frequency bin array
# Constructor
def __init__(self, Nfft, Nfcc, Fs, lower_freq, upper_freq):
self.Nfft = Nfft
#self.Nfcc = Nfcc+2 # add two to account for indep. endpoints
self.Nfcc = Nfcc
self.Fs = Fs
print("Nfft = " + str(Nfft))
print("Nffc = " + str(Nfcc))
print("Fs = " + str(Fs))
print("\n")
self.lower_freq = lower_freq
self.upper_freq = upper_freq
# Initialize the mel frequency scale
mel_pts = np.zeros(Nfcc+2)
freq_pts = np.zeros(Nfcc+2)
# Fit the mel frequency coefficients
def fit(self):
# TODO Consider create a function to calculate all mel coeffs
# Fit the mel scale, frequency pts and frequency bin pts (filterbank frames)
mfccMap = {}
mel_lower = self.Afcc*np.log(1 + self.lower_freq/self.Bfcc)
mel_upper = self.Afcc*np.log(1 + self.upper_freq/self.Bfcc)
#mel_lower = self.Afcc*np.log10(1 + self.lower_freq/self.Bfcc)
#mel_upper = self.Afcc*np.log10(1 + self.upper_freq/self.Bfcc)
self.mel_pts = np.linspace(mel_lower, mel_upper, self.Nfcc+2)
self.freq_pts = self.Bfcc*(np.exp(self.mel_pts/self.Afcc) - 1)
#self.freq_pts = self.Bfcc*(10**(self.mel_pts/self.Afcc) - 1)
self.bin_pts = np.floor((self.Nfft)*self.freq_pts/self.Fs)
mfccMap['mel'] = self.mel_pts
mfccMap['freq'] = self.freq_pts
mfccMap['bin'] = self.bin_pts
return mfccMap
# Calculate the filterbanks (i.e. triangular filters)
def calc_filter_banks(self):
# initalize filterbank matrix and other dep. vars
Nfcc = self.Nfcc
bin_pts = self.bin_pts
# Matrix of filterbanks (rowlen = num_filteranks, colen = numfft)
fbank = np.zeros((Nfcc, int(np.floor((self.Nfft/2+1)))))
# Loop through the rows of the filterbank matrix (one row per bank)
for m in np.arange(1, Nfcc+1):
f_m_minus = int(bin_pts[m-1]) # left triangle pt
f_m = int(bin_pts[m]) # center triangle pt
f_m_plus = int(bin_pts[m+1]) # right triangle pt
# Loop through left triangular pts (see Hm(k) function)
for k in np.arange(f_m_minus, f_m):
fbank[m-1, k] = (k - bin_pts[m-1])/(bin_pts[m] - bin_pts[m-1])
# Loop through right triangular pts (see Hm(k) function)
for k in np.arange(f_m, f_m_plus):
fbank[m-1, k] = (bin_pts[m+1] - k)/(bin_pts[m+1] - bin_pts[m])
return fbank
# Calculate the Mel Frequency Cepstrum Coefficients (ie acoustic vector)
# Each utterance is transformed into sequence of acoustic vectors
def calc_mfcc_dct(self, filter_banks, num_mfcc):
mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')
mfcc = mfcc[:, 1:(num_mfcc+1)] # pull the 2nd to the num_mfcc+1 entries
return mfcc
# Apply sin lifter to the MFCC matrix
def calc_mfcc_lift(self, mfcc, num_lift):
# apply liftering to remove high quefrencies from the MFCCs
[nframes, ncoeffs] = mfcc.shape # get the coefficients
n = np.arange(ncoeffs) # indices for mfcc coefficients
slift = 1 + (num_lift/2)*np.sin((np.pi*n)/num_lift) # sin lifter for high quefrencies
mfcc_lift = mfcc*slift
# TODO To improve the MFCCs, that is to reduce the dependency on higher and
# and lower frequencies in the vocal tract we need to apply liftering, that
# is filtering in the quefrency domain (ie cepstrum domain)
return mfcc_lift