-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.m
200 lines (164 loc) · 5.04 KB
/
main.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
clc, clearvars, close all
%% Read the voice signal
[y, Fs] = audioread('male.wav');
% Get time vector
t = (0:length(y)-1)*1/Fs;
% Plot the waveform
figure('name', 'Voice sample')
plot(t, y)
grid on
xlabel('Time (s)')
ylabel('Amplitude')
title('Speech Signal')
%% Compute spectrogram using hamming window
% Time in seconds
Tw = [0.010, 0.100];
Ts = Tw(1)/2;
% Window size in samples
windowSize = [Tw(1)*Fs, Tw(2)*Fs];
windowOverlap = ceil(Ts * Fs);
nfft = [2^nextpow2(windowSize(1)), 2^nextpow2(windowSize(2))];
% Generate hamming window
wideWindow = hamming(windowSize(1));
narrowWindow = hamming(windowSize(2));
% Plot spectograms
figure('name', 'Spectrogram for wide window')
spectrogram(y, wideWindow, windowOverlap, nfft(1), Fs, 'yaxis')
title(['Using ', num2str(nfft(1)), ' DFT points'])
figure('name', 'Spectrogram for narrow window')
spectrogram(y, narrowWindow, windowOverlap, nfft(2), Fs, 'yaxis')
title(['Using ', num2str(nfft(2)), ' DFT points'])
%% Voiced, unvoiced and silence detection
% Here, wide window parameters are being used
window = ceil(0.010 * Fs);
overlap = ceil(0.005 * Fs);
% Buffer speech signal into matrix of data frames
dataframes = buffer(y, window, overlap, 'nodelay')';
num_df = size(dataframes, 1);
% Initialize vectors
E = zeros(1,num_df);
ZCR = zeros(1,num_df);
decision = zeros(1,num_df);
pitch = zeros(1,num_df);
voiced = 0;
for k = 1:num_df
% Extract one frame
x = dataframes(k,:);
% Compute its energy
E(k) = sum(x.^2);
% Compute its zero-crossing rate
ZCR(k) = sum(abs(diff(x>0)))/length(x);
% Check if the frame is voice, no voice or, silence
decision(k) = Detect(E(k), ZCR(k));
% Compute pitch and number of voiced frames
if decision(k) == 2 % i.e., voice
voiced = voiced + 1;
pitch(k) = Pitch(x, Fs);
end
end
% Plot Energy and Zero-Crossing Rate
figure('name', 'Energy and Zero-Crossing Rate')
subplot(3,1,1)
plot(t, y), grid on
title('Input Signal'), xlabel('Time (s)')
subplot(3,1,2)
plot(E, '-'), grid on
title('Energy'), xlabel('Frames')
subplot(3,1,3)
plot(ZCR, '-'), grid on
title('Zero-Crossing Rate'), xlabel('Frames')
%% Linear predictive coding - LPC Analysis
voicedFrame = zeros(window, 1);
unvoicedFrame = zeros(window, 1);
% Isolate a voiced frame
t_voiced = (0:length(voicedFrame)-1)*1/Fs;
for k = 1:num_df
if decision(k) == 2
voicedFrame = dataframes(k, :);
break;
end
end
% Isolate an unvoiced frame
t_unvoiced = (0:length(unvoicedFrame)-1)*1/Fs;
for k = 1:num_df
if decision(k) == 1
unvoicedFrame = dataframes(k,:);
break;
end
end
% LPC on voiced part
[lpc8v, error8v, Hv_8] = LPC(voicedFrame, 8);
[lpc12v, error12v, Hv_12] = LPC(voicedFrame, 12);
[lpc16v, error16v, Hv_16] = LPC(voicedFrame, 16);
% LPC on unvoiced part
[lpc8u, error8u, Hu_8] = LPC(unvoicedFrame, 8);
[lpc12u, error12u, Hu_12] = LPC(unvoicedFrame, 12);
[lpc16u, error16u, Hu_16] = LPC(unvoicedFrame, 16);
% Plot LPC of voiced frames
figure('name', 'LPC - Voiced')
subplot(3,1,1)
plot(t_voiced, voicedFrame)
title('Voiced frame')
xlabel('Time (s)')
ylabel('Amplitude')
subplot(3,1,2)
DFT(voicedFrame, Fs);
subplot(3,1,3)
plot(t_voiced, error8v, t_voiced, error12v, t_voiced, error16v, t_voiced, abs(Hv_8),t_voiced, abs(Hv_12),t_voiced, abs(Hv_16))
title('Prediction Error')
xlabel('Time (s)')
ylabel('Amplitude')
legend('8','12','16','allpole8','allpole12','allpole16','Location', 'EastOutside')
% Plot LPC of unvoiced frames
figure('name', 'LPC - Unvoiced')
subplot(3,1,1)
plot(t_unvoiced,unvoicedFrame)
title('Unvoiced frame')
xlabel('Time (s)')
ylabel('Amplitude')
subplot(3,1,2)
DFT(unvoicedFrame, Fs);
subplot(3,1,3)
plot(t_unvoiced, error8u, t_unvoiced, error12u, t_unvoiced, error16u,t_unvoiced,abs(Hu_8),t_unvoiced, abs(Hu_12),t_unvoiced, abs(Hu_16));
title('Prediction Error')
xlabel('Time (s)')
ylabel('Amplitude')
legend('8','12','16', 'allpole8','allpole12','allpole16', 'Location', 'EastOutside')
%% Extract voice, unvoiced and silence frames
% Initialize vectors
voiced = zeros(1, numel(decision));
unvoiced = zeros(1, numel(decision));
silence = zeros(1, numel(decision));
for k = 1:numel(decision)
if decision(k) == 0 % silence
voiced(k) = NaN;
unvoiced(k) = NaN;
elseif decision(k) == 1 % unvoiced
unvoiced(k) = 1;
voiced(k) = NaN;
silence(k) = NaN;
else % voiced
voiced(k) = 2;
unvoiced(k) = NaN;
silence(k) = NaN;
end
end
% Plot voice activity detection
figure('name', 'Voice Activity Detection')
subplot(2,1,1), plot(t,y), grid on
title('Input Signal'), xlabel('Time (s)')
subplot(2,1,2)
plot(1:num_df, silence, 'b', ...
1:num_df, unvoiced, 'y+', ...
1:num_df, voiced, 'r*')
grid on, ylim([-1 3])
title('Voiced-Unvoiced-Silence Detection'), xlabel('Frames'), ylabel('Classification')
legend('silence', 'unvoiced', 'voiced')
%% Pitch Estimation
for k = 1:length(pitch)
if pitch(k) == 0
pitch(k) = NaN;
end
end
disp('Estimated pitch in voice activity:')
fprintf('%.1f Hz\n', unique(pitch(pitch>0)))