Merge pull request #47 from tbfly/master

jameslyons · web-flow · commit a350d7a01b4b · 2017-08-10T18:45:44.000+10:00
*): Adjust for coding style.
diff --git a/python_speech_features/base.py b/python_speech_features/base.py
@@ -19,12 +19,12 @@ def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
     :param nfft: the FFT size. Default is 512.
     :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
     :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
-    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
-    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22. 
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
     :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
-    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming 
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
     :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
-    """            
+    """
     feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc)
     feat = numpy.log(feat)
     feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
@@ -39,7 +39,7 @@ def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
 
     :param signal: the audio signal from which to compute features. Should be an N*1 array
     :param samplerate: the samplerate of the signal we are working with.
-    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
     :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
     :param nfilt: the number of filters in the filterbank, default 26.
     :param nfft: the FFT size. Default is 512.
@@ -49,18 +49,18 @@ def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
     :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
     :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
         second return value is the energy in each frame (total energy, unwindowed)
-    """          
+    """
     highfreq= highfreq or samplerate/2
     signal = sigproc.preemphasis(signal,preemph)
     frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
     pspec = sigproc.powspec(frames,nfft)
     energy = numpy.sum(pspec,1) # this stores the total energy in each frame
     energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
-        
+
     fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
     feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
     feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
-    
+
     return feat,energy
 
 def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
@@ -69,15 +69,15 @@ def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
 
     :param signal: the audio signal from which to compute features. Should be an N*1 array
     :param samplerate: the samplerate of the signal we are working with.
-    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
-    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)    
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
     :param nfilt: the number of filters in the filterbank, default 26.
     :param nfft: the FFT size. Default is 512.
     :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
     :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
-    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
-    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 
-    """          
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
     feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
     return numpy.log(feat)
 
@@ -88,36 +88,36 @@ def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
 
     :param signal: the audio signal from which to compute features. Should be an N*1 array
     :param samplerate: the samplerate of the signal we are working with.
-    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
-    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)    
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
     :param nfilt: the number of filters in the filterbank, default 26.
     :param nfft: the FFT size. Default is 512.
     :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
     :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
     :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
     :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
-    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 
-    """          
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
     highfreq= highfreq or samplerate/2
     signal = sigproc.preemphasis(signal,preemph)
     frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
     pspec = sigproc.powspec(frames,nfft)
     pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems
-    
+
     fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
     feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
     R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))
-    
+
     return numpy.dot(pspec*R,fb.T) / feat
-    
+
 def hz2mel(hz):
     """Convert a value in Hertz to Mels
 
     :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
     :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
     """
     return 2595 * numpy.log10(1+hz/700.)
-    
+
 def mel2hz(mel):
     """Convert a value in Mels to Hertz
 
@@ -139,7 +139,7 @@ def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
     """
     highfreq= highfreq or samplerate/2
     assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
-    
+
     # compute points evenly spaced in mels
     lowmel = hz2mel(lowfreq)
     highmel = hz2mel(highfreq)
@@ -154,12 +154,12 @@ def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
             fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j])
         for i in range(int(bin[j+1]), int(bin[j+2])):
             fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1])
-    return fbank                 
-    
+    return fbank
+
 def lifter(cepstra, L=22):
     """Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
     magnitude of the high frequency DCT coeffs.
-    
+
     :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
     :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
     """
diff --git a/python_speech_features/sigproc.py b/python_speech_features/sigproc.py
@@ -53,68 +53,68 @@ def deframesig(frames,siglen,frame_len,frame_step,winfunc=lambda x:numpy.ones((x
     frame_step = round_half_up(frame_step)
     numframes = numpy.shape(frames)[0]
     assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
- 
+
     indices = numpy.tile(numpy.arange(0,frame_len),(numframes,1)) + numpy.tile(numpy.arange(0,numframes*frame_step,frame_step),(frame_len,1)).T
     indices = numpy.array(indices,dtype=numpy.int32)
-    padlen = (numframes-1)*frame_step + frame_len   
-    
+    padlen = (numframes-1)*frame_step + frame_len
+
     if siglen <= 0: siglen = padlen
-    
+
     rec_signal = numpy.zeros((padlen,))
     window_correction = numpy.zeros((padlen,))
     win = winfunc(frame_len)
-    
+
     for i in range(0,numframes):
         window_correction[indices[i,:]] = window_correction[indices[i,:]] + win + 1e-15 #add a little bit so it is never zero
         rec_signal[indices[i,:]] = rec_signal[indices[i,:]] + frames[i,:]
-        
+
     rec_signal = rec_signal/window_correction
     return rec_signal[0:siglen]
-    
+
 def magspec(frames,NFFT):
-    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). 
+    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
 
     :param frames: the array of frames. Each row is a frame.
-    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. 
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
     :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
-    """    
-    if numpy.shape(frames)[1] > NFFT: 
+    """
+    if numpy.shape(frames)[1] > NFFT:
         logging.warn('frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.', numpy.shape(frames)[1], NFFT)
     complex_spec = numpy.fft.rfft(frames,NFFT)
     return numpy.absolute(complex_spec)
-          
+
 def powspec(frames,NFFT):
-    """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). 
+    """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
 
     :param frames: the array of frames. Each row is a frame.
-    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. 
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
     :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
-    """    
+    """
     return 1.0/NFFT * numpy.square(magspec(frames,NFFT))
-    
+
 def logpowspec(frames,NFFT,norm=1):
-    """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). 
+    """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
 
     :param frames: the array of frames. Each row is a frame.
-    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. 
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
     :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
     :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
-    """    
+    """
     ps = powspec(frames,NFFT);
     ps[ps<=1e-30] = 1e-30
     lps = 10*numpy.log10(ps)
     if norm:
         return lps - numpy.max(lps)
     else:
         return lps
-    
+
 def preemphasis(signal,coeff=0.95):
     """perform preemphasis on the input signal.
-    
+
     :param signal: The signal to filter.
     :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
     :returns: the filtered signal.
-    """    
+    """
     return numpy.append(signal[0],signal[1:]-coeff*signal[:-1])