-
Notifications
You must be signed in to change notification settings - Fork 4
/
drawAudio.py
108 lines (86 loc) · 3.39 KB
/
drawAudio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from re import L
import librosa
import numpy as np
import os
import sklearn
#获得当前文件所在路径
path = os.path.dirname(os.path.abspath(__file__))
# 获得音频文件路径
audio_path = os.path.join(path, 'data/audio/car_horn/7389-1-2-3.wav')
assert os.path.exists(audio_path) # 断言,文件是否存在
x, sr = librosa.load(audio_path) # 读取音频文件
# print(type(x), type(sr))
# print(x.shape, sr)
# 可视化音频
import matplotlib.pyplot as plt
import librosa.display
plt.figure(figsize=(14, 5))
librosa.display.waveshow(x, sr=sr)
plt.title('amplitude envelope')
plt.savefig('picture/wave.png')
# 声谱图(spectrogram)是声音或其他信号的频率随时间变化时的频谱(spectrum)的一种直观表示。
# 在二维数组中,第一个轴是频率,第二个轴是时间。
X = librosa.stft(x)
Xdb = librosa.amplitude_to_db(abs(X))
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
plt.colorbar()
plt.title("spectrogram")
plt.savefig('picture/spectrogram.png')
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='log')
plt.colorbar()
plt.title("spectrogram (log)")
plt.savefig('picture/spectrogram_log.png')
# 特征提取
# 过零率 Zero Crossing Rate 是一个信号符号变化的比率,即,在每帧中,语音信号从正变为负或从负变为正的次数。
# Zooming in
n0 = 9000
n1 = 9100
plt.figure(figsize=(14, 5))
plt.plot(x[n0:n1])
plt.title('Zero Crossing Rate')
plt.grid()
plt.savefig('picture/zero_crossing_rate.png')
zero_crossings = librosa.zero_crossings(x[n0:n1], pad=False)
print(sum(zero_crossings))
spectral_centroids = librosa.feature.spectral_centroid(x, sr=sr)[0]
# print(spectral_centroids.shape)
# (2647,)
# Computing the time variable for visualization
frames = range(len(spectral_centroids))
t = librosa.frames_to_time(frames)
# Normalising the spectral centroid for visualisation
def normalize(x, axis=0):
return sklearn.preprocessing.minmax_scale(x, axis=axis)
#Plotting the Spectral Centroid along the waveform
plt.figure(figsize=(14, 5))
spectral_rolloff = librosa.feature.spectral_rolloff(x+0.01, sr=sr)[0]
librosa.display.waveshow(x, sr=sr, alpha=0.4)
plt.plot(t, normalize(spectral_rolloff), color='r')
plt.title('Spectral Centroid')
plt.savefig('picture/spectral_centroid.png')
# 信号的Mel频率倒谱系数(MFCC)是一小组特征(通常约10-20),其简明地描述了频谱包络的整体形状,它模拟了人声的特征。
mfccs = librosa.feature.mfcc(x, sr=sr)
# print(mfccs.shape)
# (20, 173)
#Displaying the MFCCs:
plt.figure(figsize=(14, 5))
librosa.display.specshow(mfccs, sr=sr, x_axis='time')
plt.title('MFCC')
plt.savefig('picture/mfcc.png')
# mfcc计算了超过173帧的20个MFCC。我们还可以执行特征缩放,使得每个系数维度具有零均值和单位方差:
mfccs = sklearn.preprocessing.scale(mfccs, axis=1)
# print(mfccs.mean(axis=1))
# print(mfccs.var(axis=1))
plt.figure(figsize=(14, 5))
librosa.display.specshow(mfccs, sr=sr, x_axis='time')
plt.title('MFCC (scaled)')
plt.savefig('picture/mfcc_scaled.png')
# 色度频率 Chroma Frequencies
hop_length = 512
chromagram = librosa.feature.chroma_stft(x, sr=sr, hop_length=hop_length)
plt.figure(figsize=(14, 5))
librosa.display.specshow(chromagram, x_axis='time', y_axis='chroma', hop_length=hop_length, cmap='coolwarm')
plt.title('Chroma')
plt.savefig('picture/chroma.png')