机器学习-语音识别

语音识别:声音(.wav) -> 文本(字符串)
语音合成:文本(字符串) -> 声音(.wav)

音频识别

声音的时域和频域表示

时域:位移=f(时间)
频域:(振幅, 相位)=f(频率)

# Visualize one WAV recording in both the time domain and the frequency domain.
import numpy as np
import numpy.fft as nf
import scipy.io.wavfile as wf
import matplotlib.pyplot as mp

# Load the samples and scale 16-bit PCM down to the [-1, 1) range.
rate, signal = wf.read('../data/freq.wav')
signal = signal / 2 ** 15
# Timestamp of every sample, in seconds.
t = np.arange(len(signal)) / rate
# Frequency-bin centers plus the FFT magnitude spectrum.
freq_axis = nf.fftfreq(signal.size, 1 / rate)
spectrum = nf.fft(signal)
power = np.abs(spectrum)

# Time-domain view: displacement as a function of time.
mp.figure('Time Domain', facecolor='lightgray')
mp.title('Time Domain', fontsize=20)
mp.xlabel('Time', fontsize=14)
mp.ylabel('Signal', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(t, signal, c='dodgerblue', label='Signal=f(Time)')
mp.legend()

# Frequency-domain view: only the non-negative half of the spectrum.
mp.figure('Frequency Domain', facecolor='lightgray')
mp.title('Frequency Domain', fontsize=20)
mp.xlabel('Frequency', fontsize=14)
mp.ylabel('Power', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
nonneg = freq_axis >= 0
mp.plot(freq_axis[nonneg], power[nonneg],
        c='orangered', label='Power=F(Frequency)')
mp.legend()
mp.show()

梅尔频率倒谱系数(MFCC)矩阵

将一段音频样本划分成若干片段,其中每一个片段对应MFCC矩阵中的一行,构成一个子样本。将每个子样本做傅里叶变换得到频率谱线,从中选择与音频内容关系最为紧密的13个特征频率,形成一个特征向量。将多个子样本的特征向量组合成矩阵,即MFCC矩阵。

# Compute and display the MFCC feature matrix of one speech sample.
import numpy as np
import scipy.io.wavfile as wf
import python_speech_features as sf
import matplotlib.pyplot as mp

# Read the raw samples, then derive one MFCC feature vector per frame.
rate, samples = wf.read(
    'D:/ML/data/speeches/training/banana/banana01.wav')
features = sf.mfcc(samples, rate)

# Transpose so rows are feature coefficients and columns are frames.
mp.matshow(features.T, cmap='gist_rainbow', fignum='MFCC')
mp.title('MFCC', fontsize=20)
mp.xlabel('Sample', fontsize=14)
mp.ylabel('Feature', fontsize=14)
mp.tick_params(which='both', top=False,
               labeltop=False, labelbottom=True,
               labelsize=10)
mp.show()

语音识别

# Speech recognition: train one Gaussian HMM per word on MFCC features.
import os
import warnings
import numpy as np
import scipy.io.wavfile as wf
import python_speech_features as sf
import hmmlearn.hmm as hl
# Silence DeprecationWarnings (presumably raised during HMM fitting -- confirm).
warnings.filterwarnings('ignore',
category=DeprecationWarning)
# Suppress numpy floating-point warnings (e.g. underflow in likelihoods).
np.seterr(all='ignore')
def search_speeches(directory, speeches):
    """Recursively collect .wav files under *directory*, grouped by label.

    A file's label is the name of the folder that directly contains it,
    so ``.../training/banana/banana01.wav`` is filed under ``banana``.
    Results accumulate into *speeches* in place:
    ``speeches[label] -> list of file paths``.

    Raises IOError when *directory* does not exist.
    """
    directory = os.path.normpath(directory)
    if not os.path.isdir(directory):
        raise IOError("The directory '" +
                      directory + "' doesn't exist!")
    # The label is the last path component; it is constant for the whole
    # directory, so compute it once instead of once per entry (original
    # recomputed it inside the loop).
    label = os.path.basename(directory)
    for entry in os.listdir(directory):
        path = os.path.join(directory, entry)
        if os.path.isdir(path):
            # Descend; files there get the subfolder's name as label.
            search_speeches(path, speeches)
        elif os.path.isfile(path) and path.endswith('.wav'):
            speeches.setdefault(label, []).append(path)
def _collect_mfccs(directory):
    """Load every labeled .wav under *directory* into MFCC matrices.

    Returns (x, y) where x[i] is the stacked MFCC matrix (one row per
    audio frame) of all files sharing label y[i].
    """
    speeches = {}
    search_speeches(directory, speeches)
    x, y = [], []
    for label, filenames in speeches.items():
        per_file = []
        for filename in filenames:
            sample_rate, sigs = wf.read(filename)
            per_file.append(sf.mfcc(sigs, sample_rate))
        # Stack all files of one label at once instead of growing an
        # array with np.append in a loop (which copies quadratically).
        x.append(np.vstack(per_file))
        y.append(label)
    return x, y


# Train one Gaussian HMM per word label on its pooled MFCC frames.
train_x, train_y = _collect_mfccs('../data/speeches/training')
models = {}  # fixed: original misspelled this dict 'modles'
for mfccs, label in zip(train_x, train_y):
    model = hl.GaussianHMM(
        n_components=4, covariance_type='diag',
        n_iter=1000)
    models[label] = model.fit(mfccs)

# Score every test sample under each model; best log-likelihood wins.
test_x, test_y = _collect_mfccs('../data/speeches/testing')
pred_test_y = []
for mfccs in test_x:
    best_score, best_label = None, None
    for label, model in models.items():
        score = model.score(mfccs)
        if best_score is None or best_score < score:
            best_score, best_label = score, label
    pred_test_y.append(best_label)
print(test_y)
print(pred_test_y)

声音合成

根据需求获取某个声音的模型频域数据,根据业务需要可以修改模型数据,逆向生成时域数据,完成声音的合成。

案例:

# Synthesize a short melody from pure sine tones and save it as a WAV file.
import json
import numpy as np
import scipy.io.wavfile as wf

# Note-name -> frequency table (presumably Hz, e.g. "A4": 440 -- confirm
# against ../data/12.json).
with open('../data/12.json', 'r') as f:
    freqs = json.loads(f.read())

# The melody as (note name, duration in seconds) pairs.
tones = [
    ('G5', 1.5),
    ('A5', 0.5),
    ('G5', 1.5),
    ('E5', 0.5),
    ('D5', 0.5),
    ('E5', 0.25),
    ('D5', 0.25),
    ('C5', 0.5),
    ('A4', 0.5),
    ('C5', 0.75)]
sample_rate = 44100

# Render each note as a sine wave, then join the pieces once at the end.
# Fixes vs. the original:
#  * np.linspace requires an integer sample count -- duration * sample_rate
#    is a float and raises TypeError on modern NumPy, so cast with int().
#  * np.empty(shape=1) prepended one *uninitialized* garbage sample to the
#    output; starting from an empty list removes it.
#  * repeated np.append copies quadratically; np.concatenate is one copy.
pieces = []
for tone, duration in tones:
    n_samples = int(duration * sample_rate)
    times = np.linspace(0, duration, n_samples)
    pieces.append(np.sin(2 * np.pi * freqs[tone] * times))
music = np.concatenate(pieces)

# Scale to 16-bit PCM range and write the result.
music *= 2 ** 15
music = music.astype(np.int16)
wf.write('../data/music.wav', sample_rate, music)