# whisper_online_full_options.py
# Forked from ufal/whisper_streaming.

import argparse
import gc
import json
import logging
import os
import sys
import time

import numpy as np
import torch
from tqdm import tqdm

import whisper_online
from linastt.utils.monitoring import vram_peak

logging.basicConfig(filename="log.txt", filemode="a")
logger = logging.getLogger(__name__)

def export_processing_times(args, processing_times):
    """Dump per-segment timing statistics as JSON and as a human-readable summary."""
    os.makedirs(args.output_path, exist_ok=True)
    with open(os.path.join(args.output_path, "result.json"), 'w') as fp:
        json.dump(processing_times, fp, indent=4)
    with open(os.path.join(args.output_path, "result.txt"), "w") as f:
        f.write("Processing time statistics\n")
        f.write("Global statistics:\n")
        f.write(f"Number of files: {len(processing_times)}\n\n")
        all_processing_times = []
        f.write("All segments statistics:\n")
        for i in processing_times:
            all_processing_times += processing_times[i]['segment_processing_time']
        f.write(f"\tNumber of segments: {len(all_processing_times)}\n")
        f.write(f"\tTotal time: {np.sum(all_processing_times):.2f}\n")
        f.write(f"\tMean: {np.mean(all_processing_times):.2f}\n")
        f.write(f"\tMax: {np.max(all_processing_times):.2f}\n")
        f.write(f"\tMin: {np.min(all_processing_times):.2f}\n")
        f.write(f"\tStd: {np.std(all_processing_times):.2f}\n")
        f.write(f"\tMedian: {np.median(all_processing_times):.2f}\n\n")
        f.write("Processing time statistics per file:\n")
        for i in processing_times:
            f.write(f"\t{i}: {len(processing_times[i]['segment_duration'])} processing time values\n")
            f.write(f"\t\tTotal time: {np.sum(processing_times[i]['segment_processing_time']):.2f}\n")
            f.write(f"\t\tMean: {np.mean(processing_times[i]['segment_processing_time']):.2f}\n")
            f.write(f"\t\tMax: {np.max(processing_times[i]['segment_processing_time']):.2f}\n")
            f.write(f"\t\tMin: {np.min(processing_times[i]['segment_processing_time']):.2f}\n")
            f.write(f"\t\tStd: {np.std(processing_times[i]['segment_processing_time']):.2f}\n")
            f.write(f"\t\tMedian: {np.median(processing_times[i]['segment_processing_time']):.2f}\n")

def export_params(args):
    """Write the run parameters next to the results for reproducibility."""
    with open(os.path.join(args.output_path, "params.txt"), "w") as f:
        f.write("Parameters\n")
        f.write(f"Audio path: {args.audio_path}\n")
        f.write(f"Model: {args.model}\n")
        f.write(f"Language: {args.lan}\n")
        f.write(f"Backend: {args.backend}\n")
        f.write(f"Task: {args.task}\n")
        f.write(f"Device: {args.device}\n")
        if args.device == "cuda":
            f.write(f"GPU: {torch.cuda.get_device_name()}\n")
        else:
            f.write(f"CPU threads: {args.cpu_threads}\n")
        f.write(f"Offline: {args.offline}\n")
        f.write(f"Comp unaware: {args.comp_unaware}\n")
        f.write(f"Buffer trimming: {args.buffer_trimming}\n")
        f.write(f"Buffer trimming sec: {args.buffer_trimming_sec}\n")
        f.write(f"Min chunk size: {args.min_chunk_size}\n")
        f.write(f"Output path: {args.output_path}\n")
        f.write(f"VAD: {args.vad}\n")
        f.write(f"Method: {args.method}\n")
        f.write(f"Previous text: {args.previous_text}\n")
        f.write(f"Compute type: {args.compute_type}\n")
        f.write(f"Verbose: {args.verbose}\n")

def export_transcript(transcripts, file=None):
    """Write committed (start, end, text) segments, one per line.
    `file` may be a path, an open file object, or None (stdout)."""
    f = open(file, "w") if isinstance(file, str) else (file if file is not None else sys.stdout)
    for i in transcripts:
        if i[0] is not None:
            f.write(f"{i[0]:1.3f} {i[1]:1.3f} {i[2]}\n")
    if isinstance(file, str):
        f.close()

def output_streaming(committed, newbuffer):
    """Rewrite streaming.txt with the confirmed transcript plus the current
    (still unconfirmed) buffer, breaking the text after each sentence."""
    text = committed
    text += newbuffer[2] if newbuffer[0] is not None else ""
    text = text.replace(".", ".\n")
    with open("streaming.txt", "w") as f:
        f.write(text)
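
# streaming.txt is rewritten in full on every update, so the live transcript
# can be followed from another terminal with, for example:
#   watch -n 0.5 cat streaming.txt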

def output_timed(out, out_time=0, commit=False, buffered_time=None):
    """Print one transcript update. `out` is a (start, end, text) tuple;
    committed segments get their own line, buffer previews overwrite in place."""
    if out[0] is not None:
        if commit:
            print(f"{buffered_time:6.2f} {out_time:6.2f} {out[0]:6.2f} {out[1]:6.2f} (slat={buffered_time-out[0]:.2f}s) {out[2]:<120}", flush=True)
        else:
            print(f"{out_time:6.2f} {0:6.2f} {out[0]:6.2f} {out[1]:6.2f} (slat={out_time-out[0]:.2f}s) {out[2]:<100}", end="\r", flush=True)
    else:
        if commit:
            print(f"{out_time:6.2f} {'':<120}", flush=True)
        else:
            print(f"{out_time:6.2f}", end="\r", flush=True)

def process_file(audio_path, args, online, processing_times):
    """Transcribe one file with the given OnlineASRProcessor and record timing
    statistics into `processing_times`. Three processing branches exist:
    offline (whole file at once), computationally unaware simulation, and
    online (real-time) simulation. The MODE constant below selects how output
    is rendered: "benchmark" (progress bar only), "timed" (timestamped lines),
    or "streaming" (live text in streaming.txt)."""
    min_chunk = args.min_chunk_size
    SAMPLING_RATE = 16000
    MODE = "streaming"  # one of: "benchmark", "timed", "streaming"
    if MODE == "streaming":
        confirmed_transcription = ""
    duration = len(whisper_online.load_audio(audio_path)) / SAMPLING_RATE
    logger.info("")
    logger.info(f"Processing {audio_path} (duration is {duration:.2f}s)")
    beg = args.start_at
    start = time.time() - beg
    os.makedirs(os.path.join(args.output_path, "transcripts"), exist_ok=True)
    processing_times[audio_path] = {'max_vram': -1, 'segment_duration': [], 'segment_timestamps': [], 'segment_processing_time': []}
    transcripts = []
    if args.offline:  # offline mode processing (for testing/debugging)
        start_time = time.time()
        a = whisper_online.load_audio(audio_path)
        online.insert_audio_chunk(a)
        try:
            _, o = online.process_iter()
            end_time = time.time()
        except AssertionError as e:
            logger.info(f"assertion error on {audio_path}")
            print(e)
            # discard the partial measurements and skip this file, otherwise
            # the code below would hit a KeyError on the deleted entry
            del processing_times[audio_path]
            return processing_times
        else:
            if MODE != "benchmark":
                whisper_online.output_transcript(o, start)
            processing_times[audio_path]['segment_duration'].append(duration)
            processing_times[audio_path]['segment_timestamps'].append((0, duration))
            processing_times[audio_path]['segment_processing_time'].append(end_time - start_time)
            logger.info(f"Finished processing {audio_path} in {end_time-start_time:.2f}s")
        now = None
    elif args.comp_unaware:  # computationally unaware simulation
        end = beg + min_chunk
        if MODE == "benchmark":
            pbar = tqdm(total=round(duration, 3))
        while True:
            start_time = time.time()
            a = whisper_online.load_audio_chunk(audio_path, beg, end)
            online.insert_audio_chunk(a)
            try:
                _, o = online.process_iter()
                end_time = time.time()
            except AssertionError:
                logger.info("assertion error")
                pass
            else:
                if MODE != "benchmark":
                    whisper_online.output_transcript(o, start, now=end)
                logger.debug(f"## last processed {end:.2f}s")
                processing_times[audio_path]['segment_duration'].append(end - beg)
                processing_times[audio_path]['segment_timestamps'].append((beg, end))
                processing_times[audio_path]['segment_processing_time'].append(end_time - start_time)
            if MODE == "benchmark":  # the progress bar only exists in benchmark mode
                pbar.n = round(end, 3)
                pbar.refresh()
            if end >= duration:
                break
            beg = end
            if end + min_chunk > duration:
                end = duration
            else:
                end += min_chunk
        now = duration
    else:  # online = simultaneous (real-time) mode
        processing_times[audio_path]['segment_latency'] = []
        processing_times[audio_path]['segment_start_latency'] = []
        processing_times[audio_path]['segment_start_buffer_latency'] = []
        processing_times[audio_path]['segment_buffer_latency'] = []
        end = 0
        buffered_time = 0
        from playsound import playsound  # play the audio alongside the simulation
        playsound(os.path.abspath(audio_path), False)
        if MODE == "benchmark":
            pbar = tqdm(total=round(duration, 3))
        while True:
            now = time.time() - start
            if now < end + min_chunk:
                # wait until at least min_chunk seconds of new audio are available
                time.sleep(min_chunk + end - now)
            end = time.time() - start
            logger.debug(f"Processing {beg:.2f} to {end:.2f}")
            start_time = time.time()
            a = whisper_online.load_audio_chunk(audio_path, beg, end)
            online.insert_audio_chunk(a)
            processing_times[audio_path]['segment_duration'].append(len(online.audio_buffer) / online.SAMPLING_RATE)
            processing_times[audio_path]['segment_timestamps'].append((online.buffer_time_offset, online.buffer_time_offset + len(online.audio_buffer) / online.SAMPLING_RATE))
            try:
                committed, buffer = online.process_iter()
                end_time = time.time()
            except AssertionError:
                logger.info("assertion error")
                pass
            else:
                if MODE != "benchmark":
                    if MODE == "streaming":
                        if committed[0] is not None:
                            confirmed_transcription += committed[2]
                        output_streaming(confirmed_transcription, buffer)
                    else:
                        if committed[0] is not None:
                            output_timed(committed, out_time=end_time - start, commit=True, buffered_time=buffered_time)
                        output_timed(buffer, out_time=end_time - start)
                        buffered_time = end_time - start
                transcripts.append(committed)
                now = time.time() - start
                processing_times[audio_path]['segment_processing_time'].append(end_time - start_time)
                if committed[0] is not None:
                    processing_times[audio_path]['segment_latency'].append(now - committed[1])
                    processing_times[audio_path]['segment_start_latency'].append(now - committed[0])
                if buffer[0] is not None:
                    processing_times[audio_path]['segment_start_buffer_latency'].append(now - buffer[0])
                    processing_times[audio_path]['segment_buffer_latency'].append(now - buffer[1])
                logger.debug(f"The latency is {now-end:.2f}s and output is '{committed[2]}'")
            if MODE == "benchmark":
                pbar.n = min(round(end, 3), pbar.total)
                pbar.refresh()
            beg = end
            if end >= duration:
                break
        now = None
if args.device == "cuda":
processing_times[audio_path]['max_vram'] = vram_peak()
try:
logger.info(f'Number of GPUS: {os.environ["CUDA_VISIBLE_DEVICES"]}')
except KeyError:
pass
logger.info(f"GPU used: {torch.cuda.get_device_name()}")
o = online.finish()
transcripts.append(o)
# logging.getLogger(__name__).setLevel(level=logging.INFO)
if MODE!="benchmark" and not args.offline and not args.comp_unaware:
if MODE=="streaming":
output_streaming(confirmed_transcription, o)
else:
output_timed(o, out_time=end_time-start, commit=True, buffered_time=buffered_time)
export_transcipt(transcripts, os.path.join(args.output_path,"transcripts",os.path.basename(audio_path).replace(".mp3",".txt").replace(".wav",".txt").replace(".flac",".txt")))
return processing_times

def init_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('audio_path', type=str, help="Filename (or folder) of 16kHz mono channel wav files, on which live streaming is simulated.")
    whisper_online.add_shared_args(parser)
    parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
    parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
    parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
    parser.add_argument('--device', type=str, default="cuda", choices=["cuda", "cpu"], help='Device used.')
    parser.add_argument('--compute_type', type=str, default="int8", choices=["int8", "float16", "float32", "int8_float16"], help='Computation type (int8, float16...).')
    parser.add_argument('--output_path', type=str, default="./", help='Output folder of the script.')
    parser.add_argument('--method', type=str, default="greedy", choices=["beam-search", "greedy"], help='Greedy or beam search decoding.')
    parser.add_argument('--verbose', type=int, default=1, help='Verbosity level (2=DEBUG, 1=INFO, 0=ERROR).')
    parser.add_argument('--cpu_threads', type=int, default=4, help='When running on CPU, number of threads to use.')
    parser.add_argument('--previous_text', action="store_true", default=False, help='Condition on previous text (default False).')
    parser.add_argument('--subfolders', action="store_true", default=False, help='Search for audios in subfolders (default False).')
    args = parser.parse_args()
    # without type=int above, "--verbose 2" would be parsed as the string "2"
    # and the comparisons below would silently fail
    if args.verbose == 2:
        logging.getLogger(__name__).setLevel(level=logging.DEBUG)
    elif args.verbose == 1:
        logging.getLogger(__name__).setLevel(level=logging.INFO)
    else:
        logging.getLogger(__name__).setLevel(level=logging.ERROR)
    if args.offline and args.comp_unaware:
        logger.error("--offline and --comp_unaware are mutually exclusive; use at most one. Exiting.")
        sys.exit(1)
    return args

def init_processor(args):
    """Load the ASR backend and wrap it in a fresh OnlineASRProcessor."""
    size = args.model
    language = args.lan
    t = time.time()
    logger.info(f"Loading Whisper {size} model for {language}...")
    model_kwargs = {'device': args.device, 'cpu_threads': int(args.cpu_threads), 'compute_type': args.compute_type}
    if args.backend == "faster-whisper":
        asr_cls = whisper_online.FasterWhisperASR
    else:
        asr_cls = whisper_online.WhisperTimestampedASR
        # the 'backend' kwarg only applies to whisper_timestamped, which can
        # run on two underlying implementations
        model_kwargs['backend'] = "transformers" if args.backend == "whisper_timestamped-transformers" else "openai-whisper"
    asr = asr_cls(modelsize=size, lan=language, model_kwargs=model_kwargs)
    if args.method != "greedy":
        asr.transcribe_kargs['beam_size'] = 5
        asr.transcribe_kargs['best_of'] = 5
        asr.transcribe_kargs["temperature"] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
    if args.task == "translate":
        asr.set_translate_task()
        tgt_language = "en"  # Whisper translates into English
    else:
        tgt_language = language  # Whisper transcribes in this language
    e = time.time()
    logger.info(f"Loading finished. It took {e-t:.2f} seconds.")
    if args.vad:
        logger.info(f"Setting VAD filter {args.vad}")
        asr.use_vad(None if args.vad is True else args.vad)
    if args.buffer_trimming == "sentence":
        tokenizer = whisper_online.create_tokenizer(tgt_language)
    else:
        tokenizer = None
    online_processor = whisper_online.OnlineASRProcessor(asr, tokenizer, logfile=logger, buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
    return online_processor
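
# Minimal sketch of how the returned processor is driven (names follow the
# whisper_online API as used throughout this script; the file path is
# illustrative):
#   online = init_processor(args)
#   online.insert_audio_chunk(whisper_online.load_audio("example.wav"))
#   committed, buffer = online.process_iter()  # (start, end, text) tuples
#   final = online.finish()                    # flush the remaining buffer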

def get_file_list(args):
    """Return the audio files to process: either the single file given, or all
    .wav/.mp3/.flac files in the given folder (and, with --subfolders, in its
    immediate subfolders)."""
    if os.path.isdir(args.audio_path):
        paths = [os.path.join(args.audio_path, f) for f in os.listdir(args.audio_path)]
        audios_path = paths
        if args.subfolders:
            sub_folders = [f for f in paths if os.path.isdir(f)]
            for sub_folder in sub_folders:
                audios_path += [os.path.join(sub_folder, f) for f in os.listdir(sub_folder)]
        audios_path.sort()
        audios_path = [f for f in audios_path if f.endswith((".wav", ".mp3", ".flac"))]
        logger.info(f"Processing files in {args.audio_path} ({len(audios_path)} files)")
    else:
        audios_path = [args.audio_path]
    return audios_path

if __name__ == "__main__":
    args = init_args()
    audios_path = get_file_list(args)
    processing_times = {}
    for audio_path in tqdm(audios_path, total=len(audios_path)):
        online_processor = init_processor(args)
        # load the current audio into the LRU cache before we start the timer
        a = whisper_online.load_audio_chunk(audio_path, 0, 1)
        # warm up the ASR, because the very first transcribe takes much more
        # time than the others
        online_processor.asr.transcribe(a)
        processing_times = process_file(audio_path, args, online_processor, processing_times)
        # release the model between files
        online_processor = None
        gc.collect()
    export_processing_times(args, processing_times)
    export_params(args)
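
# Example invocations (paths and model names are illustrative; flags such as
# --model, --lan, --backend and the min-chunk-size option come from
# whisper_online.add_shared_args, so check that module for their exact
# spelling and choices):
#   python whisper_online_full_options.py audio/ --model large-v2 --lan fr \
#       --backend faster-whisper --device cuda --output_path ./results
#   python whisper_online_full_options.py speech.wav --offline --device cpu \
#       --cpu_threads 8 --compute_type int8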