MFCC参数提取
原理
代码过程
import numpy as np
from scipy.io import wavfile
from scipy.signal import lfilter
import matplotlib.pyplot as plt
def enframe(x, win, inc=None):
    """Split a 1-D signal into (optionally windowed) frames.

    Args:
        x: Input signal, a 1-D sequence of samples.
        win: Either an integer frame length, or a window given as a
            list/tuple/ndarray. In the latter case the frame length is
            ``len(win)`` and every frame is multiplied by the window.
        inc: Hop size in samples between the starts of consecutive frames.
            Defaults to the frame length (frames do not overlap).

    Returns:
        A float array of shape ``(n_frames, frame_len)`` with
        ``n_frames = (len(x) - frame_len + inc) // inc``, clamped at 0.
        Trailing samples that do not fill a whole frame are dropped.

    Raises:
        ValueError: If ``inc`` is not positive.
    """
    has_window = isinstance(win, (list, tuple, np.ndarray))
    frame_len = len(win) if has_window else win
    if inc is None:
        inc = frame_len
    if inc <= 0:
        raise ValueError("inc must be a positive number of samples")

    samples = np.asarray(x)
    # A signal shorter than one frame yields zero frames; without the clamp
    # the count goes negative and the allocation below would fail.
    n_frames = max((len(samples) - frame_len + inc) // inc, 0)

    # Row i of the index matrix is [i*inc, i*inc + 1, ..., i*inc + frame_len - 1].
    starts = inc * np.arange(n_frames)
    index = starts[:, None] + np.arange(frame_len)[None, :]
    frames = samples[index].astype(float)

    if has_window:
        frames = frames * np.asarray(win)
    return frames
def melbankm(p, NFFT, fs, fl, fh):
    """Build a bank of triangular filters spaced evenly on the Mel scale.

    Args:
        p: Number of filters.
        NFFT: FFT length; the bank covers the ``int(NFFT / 2 + 1)``
            non-negative frequency bins.
        fs: Sampling rate in Hz.
        fl: Lowest frequency covered by the bank, in Hz.
        fh: Highest frequency covered by the bank, in Hz.

    Returns:
        Array of shape ``(p, int(NFFT / 2 + 1))``. Row ``k`` holds the
        weight of filter ``k`` at every FFT bin: it rises linearly from 0 at
        the filter's lower edge bin to 1 at its centre bin and falls back to
        0 at its upper edge bin. Bin 0 is never assigned and stays 0.
    """
    mel_low = 1125 * np.log(1 + fl / 700)   # Hz -> Mel
    mel_high = 1125 * np.log(1 + fh / 700)
    # p filters need p + 2 edge points. They must span [mel_low, mel_high];
    # starting the grid at 0 would shift the whole bank whenever fl != 0.
    mel_edges = np.linspace(mel_low, mel_high, p + 2)
    hz_edges = 700 * (np.exp(mel_edges / 1125) - 1)  # Mel -> Hz

    n_bins = int(NFFT / 2 + 1)
    df = fs / NFFT  # width of one FFT bin in Hz
    bin_edges = np.floor(hz_edges / df)
    bins = np.arange(n_bins)

    bank = np.zeros((p, n_bins))
    for k in range(p):
        lo, mid, hi = bin_edges[k], bin_edges[k + 1], bin_edges[k + 2]

        rising = (bins >= 1) & (bins >= lo) & (bins <= mid)
        if mid > lo:
            bank[k, rising] = (bins[rising] - lo) / (mid - lo)
        else:
            # Lower edge and centre share one bin: that bin is the peak.
            # Evaluating the slope here would be 0 / 0.
            bank[k, rising] = 1.0

        falling = (bins > mid) & (bins <= hi)
        if falling.any():  # non-empty implies hi > mid, so no zero division
            bank[k, falling] = (hi - bins[falling]) / (hi - mid)
    return bank
def mfcc(data, fs, p, frameSize, inc, nfft=512, n_dct=12):
    """Compute Mel-frequency cepstral coefficients for a 1-D signal.

    Pipeline: pre-emphasis -> framing -> Hann window -> power spectrum ->
    Mel filter bank -> log -> DCT-II.

    Args:
        data: Input signal samples (1-D).
        fs: Sampling rate in Hz.
        p: Number of Mel filters.
        frameSize: Frame length in samples.
        inc: Hop size in samples between consecutive frames.
        nfft: FFT length passed to ``np.fft.rfft``.
        n_dct: Number of cepstral coefficients to return (DCT indices
            ``0 .. n_dct - 1``).

    Returns:
        Array of shape ``(n_frames, n_dct)``; column ``n`` holds the DCT
        coefficient of index ``n`` for every frame.
    """
    # Pre-emphasis: y[t] = x[t] - 0.94 * x[t - 1].
    emphasized = lfilter([1, -0.94], [1], data)

    frames = enframe(emphasized, frameSize, inc)
    windowed = frames * np.hanning(frameSize)

    spectrum = np.fft.rfft(windowed, nfft)
    power = np.abs(spectrum) ** 2 / nfft

    # Energy collected by each Mel filter, over the band 0 .. fs / 2.
    bank = melbankm(p, nfft, fs, 0, 0.5 * fs)
    band_energy = np.matmul(power, bank.T)

    # Floor the energies so silent frames or empty bands give a finite log
    # instead of -inf.
    log_energy = np.log(np.maximum(band_energy, np.finfo(float).eps))

    # DCT-II over the filter axis. With a 0-based filter index m the basis is
    # cos(pi * n * (2m + 1) / (2M)); (2m - 1) is only right for 1-based m.
    n_filters = bank.shape[0]
    filter_idx = np.arange(n_filters)
    coeff_idx = np.arange(n_dct)
    basis = np.cos(np.pi * np.outer(coeff_idx, 2 * filter_idx + 1) / (2 * n_filters))
    return np.sqrt(2 / n_filters) * np.matmul(log_energy, basis.T)
# Load the speech recording: wavfile.read returns (sample_rate, samples).
fs, data = wavfile.read('fft.wav')
if data.ndim > 1:
    # Multi-channel files come back as (n_samples, n_channels); the MFCC
    # pipeline works on a 1-D signal, so keep the first channel only.
    data = data[:, 0]

# NOTE(review): frame length and hop are derived from fs // 2, which gives
# 12.5 ms frames and a 5 ms hop -- confirm this is intended rather than
# the more common fs * 0.025 / fs * 0.01.
frame_len = int(fs // 2 * 0.025)
hop = int(fs // 2 * 0.01)
coeffs = mfcc(data, fs, 16, frame_len, hop)

# Top panel: trajectory of coefficient 1 over frames; bottom: all coefficients.
plt.subplot(2, 1, 1)
plt.plot(coeffs[:, 1])
plt.subplot(2, 1, 2)
plt.plot(coeffs)
plt.savefig('MFCC.png')
plt.show()
结果图