栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

语音特征提取(语谱图Spectrogram,Fbank, MFCC)——python代码

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

语音特征提取(语谱图Spectrogram,Fbank, MFCC)——python代码

导入相关包
import os
import wavio
import numpy as np
import math
from matplotlib import pyplot as plt
from scipy.fftpack import dct
读取语音数据及主函数
# Main script: for each wav file, compute a log spectrogram, two mel filter-bank
# (Fbank) variants and DCT-based cepstral coefficients frame by frame, then plot
# the cepstral matrix.
# NOTE(review): `wavs` and `data_dir` are not defined in the visible source; they
# are presumably a list of file names and their directory - confirm.
for wav in wavs:
	wav_dir = os.path.join(data_dir, wav)
	wav_data = wavio.read(wav_dir)
	data = wav_data.data
	sample_rate = wav_data.rate	# 16 kHz according to the original author's note
	sampwidth = wav_data.sampwidth	# read but not used anywhere in this loop
	
	# Peak normalization: divide by the largest absolute sample value.
	# NOTE(review): an all-zero recording would divide by zero here.
	norm_data = data/max(abs(data))
	
	# Split the normalized signal into overlapping frames (one frame per row).
	frames = frames_crop(norm_data,sample_rate)
	
	# Build the analysis window.
	# NOTE(review): `win` must equal the frame length produced by frames_crop
	# (0.01 s * sample_rate, i.e. 160 only at 16 kHz), otherwise
	# `windows * frames[i]` below cannot broadcast.
	win = 160
	windows = choose_windows(name = "Hamming",N = win)
	
	# Analysis parameters.
	N = 2048 # FFT size (NFFT)
	M = 40 # number of mel filters
	num_ceps = 24	# number of cepstral coefficients kept per frame
	
	# Per-frame output buffers, one row per frame.
	spe_freqs = np.zeros((frames.shape[0],int(N/2)))	# log spectrogram
	fbank_feature = np.zeros((frames.shape[0],M))	# Fbank from mel_filters
	fbank_feature_2 = np.zeros((frames.shape[0],M))	# Fbank from mel_filters_2
	
	mfcc_dct = np.zeros((frames.shape[0],num_ceps))	# cepstral (MFCC) coefficients from the DCT
	
	
	for i in range(frames.shape[0]):
		# Windowed frame, zero-padded to N points by the FFT; only the first N/2 bins are used below.
		frames_fft = np.fft.fft(windows * frames[i],N)
		spe_freqs[i][:] = log_data(np.abs(frames_fft[:int(N/2)]))
		
		# NOTE(review): the `pow_frames` argument receives the magnitude spectrum
		# (np.abs), not a power spectrum - confirm this is intended.
		filter_banks = mel_filters(sample_rate = sample_rate, NFFT = N, pow_frames = np.abs(frames_fft[:int(N/2)]), nfilt = M)
		fbank_feature[i][:] = log_data(filter_banks)
		
		filter_banks_2,w2 = mel_filters_2(M = M,N = N,fs = sample_rate,l = 0,h = 0.5, pow_frames = np.abs(frames_fft[:int(N/2)]))
		fbank_feature_2[i][:] = log_data(filter_banks_2)
		
		
		# Type-2 orthonormal DCT; coefficient 0 is dropped and the next num_ceps are kept.
		# NOTE(review): the DCT is applied to the linear filter-bank energies
		# (filter_banks_2), not to the log energies stored in fbank_feature_2[i];
		# MFCCs are conventionally computed from log energies - confirm.
		D = dct(filter_banks_2,type = 2,norm = 'ortho')[1:(num_ceps+1)]
		mfcc_dct[i][:] = D
		# NOTE(review): this break stops after the first frame, so every other row
		# of the buffers stays zero - looks like a debugging leftover.
		break
	#print(mfcc_dct.shape)
	# Plot the cepstral matrix with coefficients on the vertical axis.
	plt.pcolor(mfcc_dct.T,cmap = 'jet')
	plt.show()
	# NOTE(review): this break stops after the first wav file.
	break
对语音数据进行分帧
def frames_crop(x1, sample_rate, frame_size=0.01, frame_stride=0.0075):
    """Split a signal into overlapping frames, zero-padding the tail.

    Args:
        x1: Sequence or array of samples. It is flattened by ``np.append``
            while padding; the frame count is derived from ``len(x1)``.
        sample_rate: Sampling rate in Hz, used to convert the durations
            below into sample counts.
        frame_size: Frame length in seconds.
        frame_stride: Hop between consecutive frame starts in seconds.

    Returns:
        A 2-D array of shape ``(num_frames, frame_length)`` where row ``i``
        holds the samples starting at ``i * frame_step``.

    Raises:
        ValueError: If the frame length or the hop rounds to less than one
            sample.
    """
    signal = x1
    frame_length = int(round(frame_size * sample_rate))
    frame_step = int(round(frame_stride * sample_rate))
    if frame_length < 1 or frame_step < 1:
        raise ValueError(
            "frame_size and frame_stride must each cover at least one sample"
        )

    signal_length = len(signal)
    # A signal shorter than one frame must still give exactly one padded
    # frame. Taking abs() of the (negative) difference instead would invent
    # extra frames made only of padding zeros.
    overhang = max(signal_length - frame_length, 0)
    num_frames = int(np.ceil(overhang / frame_step)) + 1

    # Pad with zeros so that the last frame is complete.
    pad_signal_length = (num_frames - 1) * frame_step + frame_length
    pad_signal = np.append(signal, np.zeros(pad_signal_length - signal_length))

    # Row i of `indices` is [i*step, i*step + 1, ..., i*step + frame_length - 1].
    offsets = np.arange(frame_length).reshape(1, -1)
    starts = np.arange(0, num_frames * frame_step, frame_step).reshape(-1, 1)
    indices = offsets + starts

    return pad_signal[indices]
选择窗函数
def choose_windows(name, N):
    """Return an ``N``-point analysis window.

    Args:
        name: Window type, one of ``'Rect'``, ``'Hanning'`` or ``'Hamming'``.
        N: Number of samples in the window.

    Returns:
        A 1-D float array of length ``N``. The cosine windows follow
        ``a - b * cos(2 * pi * n / (N - 1))`` for ``n = 0 .. N - 1`` with
        ``(a, b) = (0.54, 0.46)`` for Hamming and ``(0.5, 0.5)`` for Hanning.

    Raises:
        ValueError: If ``name`` is not a supported window type.
    """
    cosine_coeffs = {'Hamming': (0.54, 0.46), 'Hanning': (0.5, 0.5)}

    if name == 'Rect':
        return np.ones(N)
    if name not in cosine_coeffs:
        raise ValueError(
            "unknown window %r; expected 'Rect', 'Hanning' or 'Hamming'" % (name,)
        )
    if N == 1:
        # The formula divides by N - 1; a one-point window is simply [1.0].
        return np.ones(1)

    a, b = cosine_coeffs[name]
    n = np.arange(N)
    return a - b * np.cos(2 * np.pi * n / (N - 1))
对数据进行求Log
def log_data(arr):
    """Convert non-negative magnitudes to decibels, ``20 * log10(x)``.

    Exact zeros are replaced by machine epsilon before taking the logarithm
    so that silent bins or frames give a large negative value instead of
    raising a math domain error.

    Args:
        arr: Array-like of non-negative values, of any shape.

    Returns:
        A float array with the same shape as ``arr``.

    Raises:
        ValueError: If any value is negative.
    """
    values = np.asarray(arr, dtype=float)
    if np.any(values < 0):
        raise ValueError("log_data expects non-negative values")
    safe = np.where(values == 0, np.finfo(float).eps, values)
    return 20 * np.log10(safe)
两种梅尔滤波器 第一种
def mel_filters(sample_rate, NFFT, pow_frames, nfilt=40):
    """Apply a bank of triangular mel-spaced filters to spectrum values.

    Filter edges are equally spaced on the mel scale
    (``2595 * log10(1 + f / 700)``) between 0 Hz and ``sample_rate / 2`` and
    mapped to FFT bin numbers with ``floor((NFFT + 1) * f / sample_rate)``.

    Args:
        sample_rate: Sampling rate in Hz.
        NFFT: FFT size; the filter matrix has ``floor(NFFT / 2)`` columns.
        pow_frames: Spectrum values whose last axis has ``floor(NFFT / 2)``
            entries (a single frame or a stack of frames).
        nfilt: Number of triangular filters.

    Returns:
        ``np.dot(pow_frames, filters.T)`` with exact zeros replaced by
        machine epsilon so that a later logarithm stays finite.
    """
    # Hz -> mel at the upper band edge, equal mel spacing, then mel -> Hz.
    top_mel = 2595 * np.log10(1 + (sample_rate / 2) / 700)
    mel_edges = np.linspace(0, top_mel, nfilt + 2)
    hz_edges = 700 * (10 ** (mel_edges / 2595) - 1)

    edge_bins = np.floor((NFFT + 1) * hz_edges / sample_rate)

    weights = np.zeros((nfilt, int(np.floor(NFFT / 2))))
    triples = zip(edge_bins[:-2], edge_bins[1:-1], edge_bins[2:])
    for row, (left, centre, right) in zip(weights, triples):
        # Rising slope on [left, centre), falling slope on [centre, right).
        rising = np.arange(int(left), int(centre))
        row[rising] = (rising - left) / (centre - left)
        falling = np.arange(int(centre), int(right))
        row[falling] = (right - falling) / (right - centre)

    energies = np.dot(pow_frames, weights.T)
    tiny = np.finfo(float).eps  # numerical stability for later log()
    return np.where(energies == 0, tiny, energies)
第二种
def mel_filters_2(M, N, fs, l, h, pow_frames):
    """Apply a bank of triangular mel filters limited to ``[fs*l, fs*h]``.

    Filter edges are equally spaced on the mel scale
    (``1125 * ln(1 + f / 700)``) between the band limits and mapped to FFT
    bin numbers with ``floor(f / (fs / N))``.

    Args:
        M (int): Number of filters.
        N (int): Number of FFT points.
        fs (int): Sampling frequency in Hz.
        l (float): Lower band limit as a fraction of ``fs``.
        h (float): Upper band limit as a fraction of ``fs``.
        pow_frames: Spectrum values whose last axis has ``int(N / 2)``
            entries.

    Returns:
        tuple: ``(filter_banks, w2)`` where ``filter_banks`` is
        ``np.dot(pow_frames, melbank.T)`` with exact zeros replaced by
        machine epsilon, and ``w2`` is ``int(N / 2)``.
    """
    fl = fs * l  # lowest frequency covered by the bank
    fh = fs * h  # highest frequency covered by the bank
    bl = 1125 * np.log(1 + fl / 700)  # Hz -> mel
    bh = 1125 * np.log(1 + fh / 700)

    # Equal spacing on the mel scale from bl to bh. Starting the grid at bl
    # (rather than at 0) keeps the filters inside [fl, fh] when l > 0.
    mel_edges = np.linspace(bl, bh, M + 2)
    Fb = 700 * (np.exp(mel_edges / 1125) - 1)  # mel -> Hz

    w2 = int(N / 2)
    df = fs / N  # width of one FFT bin in Hz
    edge_bins = np.floor(Fb / df)

    melbank = np.zeros((M, w2))
    bins = np.arange(w2)
    usable = bins >= 1  # bin 0 (DC) is never assigned a weight
    for k in range(M):
        n1, n0, n2 = edge_bins[k], edge_bins[k + 1], edge_bins[k + 2]
        row = melbank[k]
        # Each slope is only built when it spans at least one bin; a
        # zero-width slope would otherwise evaluate 0/0 and poison the
        # filter output with NaN.
        if n0 > n1:
            rising = usable & (bins >= n1) & (bins <= n0)
            row[rising] = (bins[rising] - n1) / (n0 - n1)
        if n2 > n0:
            falling = usable & (bins >= n0) & (bins <= n2)
            row[falling] = (n2 - bins[falling]) / (n2 - n0)
        elif 1 <= n0 < w2:
            # No falling slope: the filter still peaks at its centre bin.
            row[int(n0)] = 1.0

    filter_banks = np.dot(pow_frames, melbank.T)
    # Numerical stability for a later log().
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
    return filter_banks, w2

可将以上特征保存为 txt 格式。接下来将使用 openSMILE 提取语音数据特征。

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/499292.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号