import os import wavio import numpy as np import math from matplotlib import pyplot as plt from scipy.fftpack import dct读取语音数据及主函数
# Main script: read each recording and extract spectrogram, Fbank and MFCC
# features. `wavs` and `data_dir` are expected to be defined by the caller.
for wav in wavs:
    wav_dir = os.path.join(data_dir, wav)
    wav_data = wavio.read(wav_dir)
    data = wav_data.data
    sample_rate = wav_data.rate  # 16k per the original note
    sampwidth = wav_data.sampwidth

    # Peak normalisation. Converting to float first avoids integer overflow
    # in abs() for the most negative sample; a silent file is left unscaled
    # instead of dividing by zero.
    samples = np.asarray(data, dtype=float)
    peak = np.max(np.abs(samples))
    norm_data = samples / peak if peak > 0 else samples

    # Split into overlapping frames.
    frames = frames_crop(norm_data, sample_rate)

    # The window must be as long as one frame (160 samples at 16 kHz), so
    # derive it from the frames instead of hard-coding it.
    win = frames.shape[1]
    windows = choose_windows(name="Hamming", N=win)

    # Parameters
    N = 2048  # NFFT
    M = 40  # number of mel filters
    num_ceps = 24  # number of cepstral coefficients kept
    half = N // 2

    spe_freqs = np.zeros((frames.shape[0], half))  # log spectrogram
    fbank_feature = np.zeros((frames.shape[0], M))  # Fbank
    fbank_feature_2 = np.zeros((frames.shape[0], M))  # Fbank, second version
    mfcc_dct = np.zeros((frames.shape[0], num_ceps))  # MFCC

    for i in range(frames.shape[0]):
        frames_fft = np.fft.fft(windows * frames[i], N)
        # One-sided magnitude spectrum, computed once and reused below.
        magnitude = np.abs(frames_fft[:half])
        spe_freqs[i][:] = log_data(magnitude)

        filter_banks = mel_filters(sample_rate=sample_rate, NFFT=N,
                                   pow_frames=magnitude, nfilt=M)
        fbank_feature[i][:] = log_data(filter_banks)

        filter_banks_2, w2 = mel_filters_2(M=M, N=N, fs=sample_rate, l=0, h=0.5,
                                           pow_frames=magnitude)
        fbank_feature_2[i][:] = log_data(filter_banks_2)

        # MFCCs are the DCT of the *log* filter-bank energies; coefficient 0
        # is dropped and the next num_ceps coefficients are kept.
        D = dct(fbank_feature_2[i], type=2, norm='ortho')[1:(num_ceps + 1)]
        mfcc_dct[i][:] = D
        # NOTE(review): this break stops after the first frame, so every other
        # row of the feature matrices stays zero. It looks like a debugging
        # leftover -- confirm before removing.
        break

    # print(mfcc_dct.shape)
    plt.pcolor(mfcc_dct.T, cmap='jet')
    plt.show()
    break  # only the first recording is processed

# Splitting the speech data into frames
def frames_crop(x1, sample_rate, frame_size=0.01, frame_stride=0.0075):
    """Split a signal into overlapping, zero-padded frames.

    Args:
        x1: Signal samples (array-like); it is flattened before framing.
        sample_rate: Sampling rate in Hz.
        frame_size: Frame length in seconds (default 0.01).
        frame_stride: Hop between consecutive frame starts in seconds
            (default 0.0075).

    Returns:
        2-D array of shape (num_frames, frame_length). The tail of the last
        frame is zero-padded when the signal does not fill it completely.

    Raises:
        ValueError: If the frame length or the frame step rounds to a value
            that is not positive.
    """
    frame_length = int(round(frame_size * sample_rate))
    frame_step = int(round(frame_stride * sample_rate))
    if frame_length <= 0 or frame_step <= 0:
        raise ValueError("frame length and frame step must be positive")

    signal_length = len(x1)
    # A signal shorter than one frame still yields exactly one padded frame.
    # Taking the absolute value of the (negative) difference here would
    # invent an extra frame made only of padding zeros.
    remainder = max(signal_length - frame_length, 0)
    num_frames = int(np.ceil(remainder / frame_step)) + 1

    # Pad so that the last frame is complete.
    pad_signal_length = (num_frames - 1) * frame_step + frame_length
    pad_signal = np.append(x1, np.zeros(pad_signal_length - signal_length))

    # indices[r, c] = r * frame_step + c
    starts = np.arange(num_frames) * frame_step
    indices = starts.reshape(-1, 1) + np.arange(frame_length).reshape(1, -1)
    return pad_signal[indices]
选择窗函数
def choose_windows(name, N):
    """Return an N-point analysis window.

    Args:
        name: One of 'Rect', 'Hanning' or 'Hamming'.
        N: Window length in samples.

    Returns:
        1-D array of length N holding the window coefficients.

    Raises:
        ValueError: If `name` is not a supported window type.
    """
    if name == 'Rect':
        return np.ones(N)

    # Generalised cosine windows: w[n] = a0 - a1 * cos(2*pi*n / (N - 1)).
    coefficients = {'Hamming': (0.54, 0.46), 'Hanning': (0.5, 0.5)}
    if name not in coefficients:
        raise ValueError(
            "unknown window %r; expected 'Rect', 'Hanning' or 'Hamming'" % (name,)
        )
    a0, a1 = coefficients[name]

    if N == 1:
        # The formula divides by N - 1; a one-point window is just [1.0].
        return np.ones(1)
    n = np.arange(N)
    return a0 - a1 * np.cos(2 * np.pi * n / (N - 1))
对数据进行求Log
def log_data(arr):
    """Convert non-negative magnitudes to decibels: 20 * log10(arr).

    Zero entries are replaced by machine epsilon before the logarithm so
    that silent bins give a large negative value instead of an error.

    Args:
        arr: Array-like of non-negative values.

    Returns:
        Float array with the same shape as `arr`.

    Raises:
        ValueError: If any entry is negative.
    """
    values = np.asarray(arr, dtype=float)
    if np.any(values < 0):
        raise ValueError("log_data expects non-negative values")
    floored = np.where(values == 0, np.finfo(float).eps, values)
    return 20 * np.log10(floored)


# Two mel filter-bank implementations follow. The first one:
def mel_filters(sample_rate, NFFT, pow_frames, nfilt=40):
    """Apply a bank of triangular mel filters to a spectrum.

    Args:
        sample_rate: Sampling rate in Hz.
        NFFT: FFT size.
        pow_frames: Spectrum values whose last axis has floor(NFFT / 2) bins.
        nfilt: Number of mel filters.

    Returns:
        Filter-bank outputs (last axis of length nfilt); exact zeros are
        replaced by machine epsilon for numerical stability.
    """
    # Filter edges are equally spaced on the mel scale from 0 to Nyquist.
    mel_low = 0
    mel_high = (2595 * np.log10(1 + (sample_rate / 2) / 700))  # Hz -> mel
    mel_grid = np.linspace(mel_low, mel_high, nfilt + 2)
    hz_grid = (700 * (10**(mel_grid / 2595) - 1))  # mel -> Hz
    edges = np.floor((NFFT + 1) * hz_grid / sample_rate)  # FFT bin of each edge

    weights = np.zeros((nfilt, int(np.floor(NFFT / 2))))
    # Each filter spans three consecutive edges: left, centre, right.
    for row, (lo, mid, hi) in enumerate(zip(edges[:-2], edges[1:-1], edges[2:])):
        for k in range(int(lo), int(mid)):  # rising slope
            weights[row, k] = (k - lo) / (mid - lo)
        for k in range(int(mid), int(hi)):  # falling slope
            weights[row, k] = (hi - k) / (hi - mid)

    energies = np.dot(pow_frames, weights.T)
    return np.where(energies == 0, np.finfo(float).eps, energies)


# The second implementation:
def mel_filters_2(M, N, fs, l, h, pow_frames):
    """Apply a bank of M triangular mel filters to a spectrum.

    The filters cover the band [fs * l, fs * h] and are equally spaced on
    the mel scale.

    Args:
        M (int): Number of filters.
        N (int): Number of FFT points.
        fs (int): Sampling frequency in Hz.
        l (float): Lower band edge as a fraction of fs.
        h (float): Upper band edge as a fraction of fs.
        pow_frames: Spectrum values whose last axis has int(N / 2) bins.

    Returns:
        Tuple (filter_banks, w2): the filter-bank outputs (last axis of
        length M, exact zeros replaced by machine epsilon) and the number of
        frequency bins, w2 = int(N / 2).
    """
    fl = fs * l  # lowest frequency of the filter range
    fh = fs * h  # highest frequency of the filter range
    bl = 1125 * np.log(1 + fl / 700)  # Hz -> mel
    bh = 1125 * np.log(1 + fh / 700)

    # Space the edges equally between bl and bh. Starting the grid at 0
    # instead of bl would shift every filter down whenever l > 0.
    mel_points = np.linspace(bl, bh, M + 2)
    Fb = 700 * (np.exp(mel_points / 1125) - 1)  # mel -> Hz

    w2 = int(N / 2)
    df = fs / N  # frequency resolution of one FFT bin
    edges = np.floor(Fb / df).astype(int)  # FFT bin of every filter edge
    bins = np.arange(w2)

    melbank = np.zeros((M, w2))
    for k in range(M):
        lo, mid, hi = edges[k], edges[k + 1], edges[k + 2]

        # Rising slope on [lo, mid). Bin 0 is never assigned, as before.
        start, stop = max(lo, 1), min(mid, w2)
        if stop > start:
            melbank[k, start:stop] = (bins[start:stop] - lo) / (mid - lo)

        # The peak is set explicitly so that a zero-width slope (two edges
        # in the same bin) cannot produce 0/0 = NaN at the centre.
        if 1 <= mid < w2:
            melbank[k, mid] = 1.0

        # Falling slope on (mid, hi].
        start, stop = max(mid + 1, 1), min(hi + 1, w2)
        if stop > start:
            melbank[k, start:stop] = (hi - bins[start:stop]) / (hi - mid)

    filter_banks = np.dot(pow_frames, melbank.T)
    # Numerical stability: avoid exact zeros before a later logarithm.
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
    return filter_banks, w2
可将以上特征保存为 txt 格式。接下来将使用 openSMILE 提取语音数据特征。



