Merge pull request #76 from erikma/dev/erikma/autoNfft

jameslyons · web-flow · commit 40c590269b57 · 2018-12-20T17:00:31.000+10:00
Fix "WARNING:root:frame length (X) is greater than FFT size"
diff --git a/python_speech_features/base.py b/python_speech_features/base.py
@@ -5,18 +5,35 @@
 from python_speech_features import sigproc
 from scipy.fftpack import dct
 
+def calculate_nfft(samplerate, winlen):
+    """Calculates the FFT size as the smallest power of two greater than or
+    equal to the number of samples in a single analysis window.
+
+    An FFT shorter than the window loses precision by dropping many of the
+    samples; an FFT longer than the window zero-pads the FFT buffer, which
+    is neutral in terms of frequency-domain conversion.
+
+    :param samplerate: The sample rate of the signal we are working with, in Hz.
+    :param winlen: The length of the analysis window in seconds.
+    """
+    window_length_samples = winlen * samplerate
+    nfft = 1
+    while nfft < window_length_samples:
+        nfft *= 2
+    return nfft
+
 def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
-         nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True,
+         nfilt=26,nfft=None,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True,
          winfunc=lambda x:numpy.ones((x,))):
     """Compute MFCC features from an audio signal.
 
     :param signal: the audio signal from which to compute features. Should be an N*1 array
-    :param samplerate: the samplerate of the signal we are working with.
+    :param samplerate: the sample rate of the signal we are working with, in Hz.
     :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
     :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
     :param numcep: the number of cepstrum to return, default 13
     :param nfilt: the number of filters in the filterbank, default 26.
-    :param nfft: the FFT size. Default is 512.
+    :param nfft: the FFT size. Default is None, which uses the calculate_nfft function to choose the smallest power-of-two size that does not drop sample data.
     :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
     :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
     :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
@@ -25,6 +42,7 @@ def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
     :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
     :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
     """
+    nfft = nfft or calculate_nfft(samplerate, winlen)
     feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc)
     feat = numpy.log(feat)
     feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
@@ -38,7 +56,7 @@ def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
     """Compute Mel-filterbank energy features from an audio signal.
 
     :param signal: the audio signal from which to compute features. Should be an N*1 array
-    :param samplerate: the samplerate of the signal we are working with.
+    :param samplerate: the sample rate of the signal we are working with, in Hz.
     :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
     :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
     :param nfilt: the number of filters in the filterbank, default 26.
@@ -69,7 +87,7 @@ def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
     """Compute log Mel-filterbank energy features from an audio signal.
 
     :param signal: the audio signal from which to compute features. Should be an N*1 array
-    :param samplerate: the samplerate of the signal we are working with.
+    :param samplerate: the sample rate of the signal we are working with, in Hz.
     :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
     :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
     :param nfilt: the number of filters in the filterbank, default 26.
@@ -89,7 +107,7 @@ def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
     """Compute Spectral Subband Centroid features from an audio signal.
 
     :param signal: the audio signal from which to compute features. Should be an N*1 array
-    :param samplerate: the samplerate of the signal we are working with.
+    :param samplerate: the sample rate of the signal we are working with, in Hz.
     :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
     :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
     :param nfilt: the number of filters in the filterbank, default 26.
@@ -134,7 +152,7 @@ def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
 
     :param nfilt: the number of filters in the filterbank, default 20.
     :param nfft: the FFT size. Default is 512.
-    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
+    :param samplerate: the sample rate of the signal we are working with, in Hz. Affects mel spacing.
     :param lowfreq: lowest band edge of mel filters, default 0 Hz
     :param highfreq: highest band edge of mel filters, default samplerate/2
     :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.