From 681f4709eb86106a5d8c4f81d004bd40627f6f90 Mon Sep 17 00:00:00 2001 From: Carlos Fonseca <32177100+carlfm01@users.noreply.github.com> Date: Wed, 5 Jun 2019 04:27:46 +0000 Subject: [PATCH 01/25] Remove comments check from alphabet --- util/text.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/util/text.py b/util/text.py index 7ae6ef3e59..fed193e7e6 100644 --- a/util/text.py +++ b/util/text.py @@ -15,10 +15,6 @@ def __init__(self, config_file): self._size = 0 with codecs.open(config_file, 'r', 'utf-8') as fin: for line in fin: - if line[0:2] == '\\#': - line = '#\n' - elif line[0] == '#': - continue self._label_to_str += line[:-1] # remove the line ending self._str_to_label[line[:-1]] = self._size self._size += 1 From 421243d2841a86c7a3f0fbea5d4f4d49ab82f706 Mon Sep 17 00:00:00 2001 From: Carlos Fonseca <32177100+carlfm01@users.noreply.github.com> Date: Wed, 5 Jun 2019 04:29:02 +0000 Subject: [PATCH 02/25] Remove sort from feeding --- util/feeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/feeding.py b/util/feeding.py index a88f366030..66022a7c13 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -66,7 +66,7 @@ def to_sparse_tuple(sequence): def create_dataset(csvs, batch_size, cache_path=''): df = read_csvs(csvs) - df.sort_values(by='wav_filesize', inplace=True) + #df.sort_values(by='wav_filesize', inplace=True) # Convert to character index arrays df['transcript'] = df['transcript'].apply(partial(text_to_char_array, alphabet=Config.alphabet)) From d08efad480d160a8bd5ff94568a26abb57fbc760 Mon Sep 17 00:00:00 2001 From: Carlos Fonseca <32177100+carlfm01@users.noreply.github.com> Date: Wed, 5 Jun 2019 04:56:34 +0000 Subject: [PATCH 03/25] Remove sort from evaluate tools --- util/evaluate_tools.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/util/evaluate_tools.py b/util/evaluate_tools.py index 1ad91f46ea..46fea32437 100644 --- a/util/evaluate_tools.py +++ b/util/evaluate_tools.py @@ -64,9 +64,7 @@ def calculate_report(labels, decodings, losses): samples_wer, samples_cer = wer_cer_batch(samples) # Order the remaining items by their loss (lowest loss on top) - samples.sort(key=lambda s: s.loss) # Then order by WER (highest WER on top) - samples.sort(key=lambda s: s.wer, reverse=True) return samples_wer, samples_cer, samples From ba1a58763e1b18720b2a8602d92c823327e0dd09 Mon Sep 17 00:00:00 2001 From: Carlos Fonseca Date: Sat, 29 Jun 2019 03:29:15 +0000 Subject: [PATCH 04/25] Remove TF dependency --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2958643ba0..fdf8e6ba1d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ # Main training requirements -tensorflow == 1.13.1 numpy == 1.15.4 progressbar2 pandas From aebd08df4f80e05a53400f503c1fe533a107ca23 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Mon, 30 Dec 2019 15:54:27 +0800 Subject: [PATCH 05/25] [ADD] mix noise audio --- util/feeding.py | 18 +++++++++++++++++- util/flags.py | 7 ++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/util/feeding.py b/util/feeding.py index 16d0e3128d..3084dd9436 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -16,6 +16,7 @@ from util.flags import FLAGS from util.spectrogram_augmentations import augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up from util.audio import read_frames_from_file, vad_split, DEFAULT_FORMAT +from util.decoded_augmentation import augment_noise def read_csvs(csv_files): @@ -67,8 +68,23 @@ def 
samples_to_mfccs(samples, sample_rate, train_phase=False): def audiofile_to_features(wav_filename, train_phase=False): samples = tf.io.read_file(wav_filename) decoded = contrib_audio.decode_wav(samples, desired_channels=1) - features, features_len = samples_to_mfccs(decoded.audio, decoded.sample_rate, train_phase=train_phase) + audio = decoded.audio + # augment decoded + if train_phase and FLAGS.decoded_aug_mix_noise_walk_dirs: + audio = augment_noise( + audio, + FLAGS.decoded_aug_mix_noise_walk_dirs.split(','), + change_audio_db_max=FLAGS.decoded_aug_mix_noise_max_audio_db, + change_audio_db_min=FLAGS.decoded_aug_mix_noise_min_audio_db, + change_noise_db_max=FLAGS.decoded_aug_mix_noise_max_noise_db, + change_noise_db_min=FLAGS.decoded_aug_mix_noise_min_noise_db, + ) + + + features, features_len = samples_to_mfccs(audio, decoded.sample_rate, train_phase=train_phase) + + # augment features if train_phase: if FLAGS.data_aug_features_multiplicative > 0: features = features*tf.random.normal(mean=1, stddev=FLAGS.data_aug_features_multiplicative, shape=tf.shape(features)) diff --git a/util/flags.py b/util/flags.py index d8a2656c6a..2d01a3a2b9 100644 --- a/util/flags.py +++ b/util/flags.py @@ -24,6 +24,12 @@ def create_flags(): # Data Augmentation # ================ + f.DEFINE_string('decoded_aug_mix_noise_walk_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') + f.DEFINE_float('decoded_aug_mix_noise_max_noise_db', -25, 'limit noise max volume') + f.DEFINE_float('decoded_aug_mix_noise_min_noise_db', -50, 'limit noise min volume') + f.DEFINE_float('decoded_aug_mix_noise_max_audio_db', 0, 'limit noise max volume') + f.DEFINE_float('decoded_aug_mix_noise_min_audio_db', -10, 'limit noise min volume') + f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise') f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise') @@ -42,7 +48,6 @@ def create_flags(): f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_pitch', 1.2, 'max value of pitch scaling') f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_tempo', 1.2, 'max vlaue of tempo scaling') - # Global Constants # ================ From d255c3f952036fd6153ac6ee20a4c526ba98370a Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Mon, 30 Dec 2019 16:10:00 +0800 Subject: [PATCH 06/25] [FIX] add missing file decoded_augmentation.py --- util/decoded_augmentation.py | 67 ++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 util/decoded_augmentation.py diff --git a/util/decoded_augmentation.py b/util/decoded_augmentation.py new file mode 100644 index 0000000000..f74b5f1d1f --- /dev/null +++ b/util/decoded_augmentation.py @@ -0,0 +1,67 @@ +import tensorflow as tf +import tensorflow.compat.v1 as tfv1 +from tensorflow.python.ops import gen_audio_ops as contrib_audio +import os + + +def augment_noise(audio, + walk_dirs, + change_audio_db_max=0, + change_audio_db_min=-10, + change_noise_db_max=-25, + change_noise_db_min=-50 + ): + noise_filenames = [] + for d in walk_dirs: + for dirpath, _, filenames in os.walk(d): + for filename in filenames: + if filename.endswith('.wav'): + noise_filenames.append(os.path.join(dirpath, filename)) + print('Collect {} noise filenames for augmentation'.format(len(noise_filenames))) + noise_filenames = tf.convert_to_tensor(noise_filenames, dtype=tf.string) + + rand_int = tfv1.random_uniform( + [], dtype=tf.int32, minval=0, maxval=tf.shape(noise_filenames)[0]) + noise_filename = 
noise_filenames[rand_int] + noise_samples = tf.io.read_file(noise_filename) + noise_decoded = contrib_audio.decode_wav(noise_samples, desired_channels=1) + noise_audio = noise_decoded.audio + + decoded_audio_len = tf.shape(audio)[0] + noise_decoded_audio_len = tf.shape(noise_audio)[0] + + multiply = tf.math.floordiv(decoded_audio_len, noise_decoded_audio_len) + 1 + noise_audio = tf.tile(noise_audio, [multiply, 1]) + + # now noise_decoded_len must > decoded_len + noise_decoded_audio_len = tf.shape(noise_audio)[0] + + mix_decoded_start_end_points = tfv1.random_uniform( + [2], minval=0, maxval=decoded_audio_len-1, dtype=tf.int32) + mix_decoded_start_point = tf.math.reduce_min(mix_decoded_start_end_points) + mix_decoded_end_point = tf.math.reduce_max( + mix_decoded_start_end_points) + 1 + mix_decoded_width = mix_decoded_end_point - mix_decoded_start_point + + left_zeros = tf.zeros(shape=[mix_decoded_start_point, 1]) + + mix_noise_decoded_start_point = tfv1.random_uniform( + [], minval=0, maxval=noise_decoded_audio_len - mix_decoded_width, dtype=tf.int32) + mix_noise_decoded_end_point = mix_noise_decoded_start_point + mix_decoded_width + extract_noise_decoded = noise_audio[mix_noise_decoded_start_point:mix_noise_decoded_end_point, :] + + right_zeros = tf.zeros( + shape=[decoded_audio_len - mix_decoded_end_point, 1]) + + mixed_noise = tf.concat( + [left_zeros, extract_noise_decoded, right_zeros], axis=0) + + choosen_audio_db = tfv1.random_uniform( + [], minval=change_audio_db_min, maxval=change_audio_db_max) + audio_ratio = tf.math.exp(choosen_audio_db / 10) + + choosen_noise_db = tfv1.random_uniform( + [], minval=change_noise_db_min, maxval=change_noise_db_max) + noise_ratio = tf.math.exp(choosen_noise_db / 10) + + return tf.multiply(audio, audio_ratio) + tf.multiply(mixed_noise, noise_ratio) From ec251367bd012fce533c2ab30f9c8fce8f81ca80 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Tue, 31 Dec 2019 11:20:58 +0800 Subject: [PATCH 07/25] mix noise works, but performance is bad --- util/decoded_augmentation.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/util/decoded_augmentation.py b/util/decoded_augmentation.py index f74b5f1d1f..449e772a8f 100644 --- a/util/decoded_augmentation.py +++ b/util/decoded_augmentation.py @@ -8,9 +8,9 @@ def augment_noise(audio, walk_dirs, change_audio_db_max=0, change_audio_db_min=-10, - change_noise_db_max=-25, - change_noise_db_min=-50 - ): + change_noise_db_max=-15, + change_noise_db_min=-25): + assert isinstance(walk_dirs, list) noise_filenames = [] for d in walk_dirs: for dirpath, _, filenames in os.walk(d): @@ -31,10 +31,10 @@ def augment_noise(audio, noise_decoded_audio_len = tf.shape(noise_audio)[0] multiply = tf.math.floordiv(decoded_audio_len, noise_decoded_audio_len) + 1 - noise_audio = tf.tile(noise_audio, [multiply, 1]) + noise_audio_tile = tf.tile(noise_audio, [multiply, 1]) # now noise_decoded_len must > decoded_len - noise_decoded_audio_len = tf.shape(noise_audio)[0] + noise_decoded_audio_len = tf.shape(noise_audio_tile)[0] mix_decoded_start_end_points = tfv1.random_uniform( [2], minval=0, maxval=decoded_audio_len-1, dtype=tf.int32) @@ -48,7 +48,7 @@ def augment_noise(audio, mix_noise_decoded_start_point = tfv1.random_uniform( [], minval=0, maxval=noise_decoded_audio_len - mix_decoded_width, dtype=tf.int32) mix_noise_decoded_end_point = mix_noise_decoded_start_point + mix_decoded_width - extract_noise_decoded = noise_audio[mix_noise_decoded_start_point:mix_noise_decoded_end_point, :] + extract_noise_decoded 
= noise_audio_tile[mix_noise_decoded_start_point:mix_noise_decoded_end_point, :] right_zeros = tf.zeros( shape=[decoded_audio_len - mix_decoded_end_point, 1]) @@ -58,10 +58,11 @@ def augment_noise(audio, choosen_audio_db = tfv1.random_uniform( [], minval=change_audio_db_min, maxval=change_audio_db_max) - audio_ratio = tf.math.exp(choosen_audio_db / 10) + audio_ratio = tf.math.pow(10.0, choosen_audio_db / 10) choosen_noise_db = tfv1.random_uniform( [], minval=change_noise_db_min, maxval=change_noise_db_max) - noise_ratio = tf.math.exp(choosen_noise_db / 10) - + # choosen_noise_db = tf.random.normal( + # [], mean=change_noise_db_max, stddev=change_noise_db_min) + noise_ratio = tf.math.pow(10.0, choosen_noise_db / 10) return tf.multiply(audio, audio_ratio) + tf.multiply(mixed_noise, noise_ratio) From 484134eb9e303dea22358a15930277675c0a2f7c Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Tue, 31 Dec 2019 15:57:34 +0800 Subject: [PATCH 08/25] [MOD] use tf.Dataset to cache noise audio --- ..._augmentation.py => audio_augmentation.py} | 33 ++++++++----------- util/feeding.py | 17 ++++++++-- util/flags.py | 9 ++--- 3 files changed, 33 insertions(+), 26 deletions(-) rename util/{decoded_augmentation.py => audio_augmentation.py} (75%) diff --git a/util/decoded_augmentation.py b/util/audio_augmentation.py similarity index 75% rename from util/decoded_augmentation.py rename to util/audio_augmentation.py index 449e772a8f..cb668f696b 100644 --- a/util/decoded_augmentation.py +++ b/util/audio_augmentation.py @@ -3,29 +3,26 @@ from tensorflow.python.ops import gen_audio_ops as contrib_audio import os - -def augment_noise(audio, - walk_dirs, - change_audio_db_max=0, - change_audio_db_min=-10, - change_noise_db_max=-15, - change_noise_db_min=-25): +def collect_noise_filenames(walk_dirs): assert isinstance(walk_dirs, list) - noise_filenames = [] + for d in walk_dirs: for dirpath, _, filenames in os.walk(d): for filename in filenames: if filename.endswith('.wav'): - noise_filenames.append(os.path.join(dirpath, filename)) - print('Collect {} noise filenames for augmentation'.format(len(noise_filenames))) - noise_filenames = tf.convert_to_tensor(noise_filenames, dtype=tf.string) + yield os.path.join(dirpath, filename) - rand_int = tfv1.random_uniform( - [], dtype=tf.int32, minval=0, maxval=tf.shape(noise_filenames)[0]) - noise_filename = noise_filenames[rand_int] - noise_samples = tf.io.read_file(noise_filename) - noise_decoded = contrib_audio.decode_wav(noise_samples, desired_channels=1) - noise_audio = noise_decoded.audio +def noise_file_to_audio(noise_file): + samples = tf.io.read_file(noise_file) + decoded = contrib_audio.decode_wav(samples, desired_channels=1) + return decoded.audio + +def augment_noise(audio, + noise_audio, + change_audio_db_max=0, + change_audio_db_min=-10, + change_noise_db_max=-15, + change_noise_db_min=-25): decoded_audio_len = tf.shape(audio)[0] noise_decoded_audio_len = tf.shape(noise_audio)[0] @@ -62,7 +59,5 @@ def augment_noise(audio, choosen_noise_db = tfv1.random_uniform( [], minval=change_noise_db_min, maxval=change_noise_db_max) - # choosen_noise_db = tf.random.normal( - # [], mean=change_noise_db_max, stddev=change_noise_db_min) noise_ratio = tf.math.pow(10.0, choosen_noise_db / 10) return tf.multiply(audio, audio_ratio) + tf.multiply(mixed_noise, noise_ratio) diff --git a/util/feeding.py b/util/feeding.py index 3084dd9436..d8ad88a20b 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -16,7 +16,7 @@ from util.flags import FLAGS from util.spectrogram_augmentations import 
augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up from util.audio import read_frames_from_file, vad_split, DEFAULT_FORMAT -from util.decoded_augmentation import augment_noise +from util.audio_augmentation import augment_noise, noise_file_to_audio, collect_noise_filenames def read_csvs(csv_files): @@ -70,11 +70,22 @@ def audiofile_to_features(wav_filename, train_phase=False): decoded = contrib_audio.decode_wav(samples, desired_channels=1) audio = decoded.audio - # augment decoded + # augment audio if train_phase and FLAGS.decoded_aug_mix_noise_walk_dirs: + # because we have to determine the shuffle size, so we could not use generator + noise_filenames = tf.convert_to_tensor( + list(collect_noise_filenames(FLAGS.decoded_aug_mix_noise_walk_dirs.split(','))), + dtype=tf.string) + print(">>> Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) + noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) + .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .shuffle(noise_filenames.shape[0]) + .cache(FLAGS.decoded_aug_mix_noise_cache) + .repeat()) + iterator = tf.compat.v1.data.make_one_shot_iterator(noise_dataset) audio = augment_noise( audio, - FLAGS.decoded_aug_mix_noise_walk_dirs.split(','), + iterator.get_next(), change_audio_db_max=FLAGS.decoded_aug_mix_noise_max_audio_db, change_audio_db_min=FLAGS.decoded_aug_mix_noise_min_audio_db, change_noise_db_max=FLAGS.decoded_aug_mix_noise_max_noise_db, diff --git a/util/flags.py b/util/flags.py index 2d01a3a2b9..c63a969f36 100644 --- a/util/flags.py +++ b/util/flags.py @@ -25,10 +25,11 @@ def create_flags(): # ================ f.DEFINE_string('decoded_aug_mix_noise_walk_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_float('decoded_aug_mix_noise_max_noise_db', -25, 'limit noise max volume') - f.DEFINE_float('decoded_aug_mix_noise_min_noise_db', -50, 'limit noise min volume') - f.DEFINE_float('decoded_aug_mix_noise_max_audio_db', 0, 'limit noise max volume') - f.DEFINE_float('decoded_aug_mix_noise_min_audio_db', -10, 'limit noise min volume') + f.DEFINE_string('decoded_aug_mix_noise_cache', '', 'must cache noise audio data, or it will read audio file every training step') + f.DEFINE_float('decoded_aug_mix_noise_max_noise_db', -25, 'to limit noise max volume') + f.DEFINE_float('decoded_aug_mix_noise_min_noise_db', -50, 'to limit noise min volume') + f.DEFINE_float('decoded_aug_mix_noise_max_audio_db', 0, 'to limit audio max volume') + f.DEFINE_float('decoded_aug_mix_noise_min_audio_db', -10, 'to limit audio min volume') f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise') f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise') From 4f24f08f09611226d37347216acbd0af2d485f4a Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Tue, 31 Dec 2019 17:19:00 +0800 Subject: [PATCH 09/25] rename decoded -> audio --- util/feeding.py | 14 +++++++------- util/flags.py | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/util/feeding.py b/util/feeding.py index d8ad88a20b..8067bf5c58 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -71,25 +71,25 @@ def audiofile_to_features(wav_filename, train_phase=False): audio = decoded.audio # augment audio - if train_phase and FLAGS.decoded_aug_mix_noise_walk_dirs: + if train_phase and FLAGS.audio_aug_mix_noise_walk_dirs: # because we have to determine the shuffle size, so we 
could not use generator noise_filenames = tf.convert_to_tensor( - list(collect_noise_filenames(FLAGS.decoded_aug_mix_noise_walk_dirs.split(','))), + list(collect_noise_filenames(FLAGS.audio_aug_mix_noise_walk_dirs.split(','))), dtype=tf.string) print(">>> Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) .shuffle(noise_filenames.shape[0]) - .cache(FLAGS.decoded_aug_mix_noise_cache) + .cache(FLAGS.audio_aug_mix_noise_cache) .repeat()) iterator = tf.compat.v1.data.make_one_shot_iterator(noise_dataset) audio = augment_noise( audio, iterator.get_next(), - change_audio_db_max=FLAGS.decoded_aug_mix_noise_max_audio_db, - change_audio_db_min=FLAGS.decoded_aug_mix_noise_min_audio_db, - change_noise_db_max=FLAGS.decoded_aug_mix_noise_max_noise_db, - change_noise_db_min=FLAGS.decoded_aug_mix_noise_min_noise_db, + change_audio_db_max=FLAGS.audio_aug_mix_noise_max_audio_db, + change_audio_db_min=FLAGS.audio_aug_mix_noise_min_audio_db, + change_noise_db_max=FLAGS.audio_aug_mix_noise_max_noise_db, + change_noise_db_min=FLAGS.audio_aug_mix_noise_min_noise_db, ) diff --git a/util/flags.py b/util/flags.py index c63a969f36..8a3852ecf5 100644 --- a/util/flags.py +++ b/util/flags.py @@ -24,12 +24,12 @@ def create_flags(): # Data Augmentation # ================ - f.DEFINE_string('decoded_aug_mix_noise_walk_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_string('decoded_aug_mix_noise_cache', '', 'must cache noise audio data, or it will read audio file every training step') - f.DEFINE_float('decoded_aug_mix_noise_max_noise_db', -25, 'to limit noise max volume') - f.DEFINE_float('decoded_aug_mix_noise_min_noise_db', -50, 'to limit noise min volume') - f.DEFINE_float('decoded_aug_mix_noise_max_audio_db', 0, 'to limit audio max volume') - f.DEFINE_float('decoded_aug_mix_noise_min_audio_db', -10, 'to limit audio min volume') + f.DEFINE_string('audio_aug_mix_noise_walk_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') + f.DEFINE_string('audio_aug_mix_noise_cache', '', 'must cache noise audio data, or it will read audio file every training step') + f.DEFINE_float('audio_aug_mix_noise_max_noise_db', -25, 'to limit noise max volume') + f.DEFINE_float('audio_aug_mix_noise_min_noise_db', -50, 'to limit noise min volume') + f.DEFINE_float('audio_aug_mix_noise_max_audio_db', 0, 'to limit audio max volume') + f.DEFINE_float('audio_aug_mix_noise_min_audio_db', -10, 'to limit audio min volume') f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise') f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise') From 1f57ece8ba65ea7b13c52dce577d95478416b12b Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Thu, 2 Jan 2020 16:21:37 +0800 Subject: [PATCH 10/25] [FIX] don't create tf.Dataset in other tf.Dataset's pipeline --- util/feeding.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/util/feeding.py b/util/feeding.py index 8067bf5c58..ea7987a6f1 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -65,27 +65,16 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False): return mfccs, tf.shape(input=mfccs)[0] -def audiofile_to_features(wav_filename, train_phase=False): +def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None): 
samples = tf.io.read_file(wav_filename) decoded = contrib_audio.decode_wav(samples, desired_channels=1) audio = decoded.audio # augment audio - if train_phase and FLAGS.audio_aug_mix_noise_walk_dirs: - # because we have to determine the shuffle size, so we could not use generator - noise_filenames = tf.convert_to_tensor( - list(collect_noise_filenames(FLAGS.audio_aug_mix_noise_walk_dirs.split(','))), - dtype=tf.string) - print(">>> Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) - noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) - .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) - .shuffle(noise_filenames.shape[0]) - .cache(FLAGS.audio_aug_mix_noise_cache) - .repeat()) - iterator = tf.compat.v1.data.make_one_shot_iterator(noise_dataset) + if train_phase and noise_iterator: audio = augment_noise( audio, - iterator.get_next(), + noise_iterator.get_next(), change_audio_db_max=FLAGS.audio_aug_mix_noise_max_audio_db, change_audio_db_min=FLAGS.audio_aug_mix_noise_min_audio_db, change_noise_db_max=FLAGS.audio_aug_mix_noise_max_noise_db, @@ -106,9 +95,9 @@ def audiofile_to_features(wav_filename, train_phase=False): return features, features_len -def entry_to_features(wav_filename, transcript, train_phase): +def entry_to_features(wav_filename, transcript, train_phase, noise_iterator=None): # https://bugs.python.org/issue32117 - features, features_len = audiofile_to_features(wav_filename, train_phase=train_phase) + features, features_len = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator) return wav_filename, features, features_len, tf.SparseTensor(*transcript) @@ -147,7 +136,22 @@ def batch_fn(wav_filenames, features, features_len, transcripts): return tf.data.Dataset.zip((wav_filenames, features, transcripts)) num_gpus = len(Config.available_devices) - process_fn = partial(entry_to_features, train_phase=train_phase) + + if train_phase and FLAGS.audio_aug_mix_noise_walk_dirs: + # because we have to determine the shuffle size, so we could not use generator + noise_filenames = tf.convert_to_tensor( + list(collect_noise_filenames(FLAGS.audio_aug_mix_noise_walk_dirs.split(','))), + dtype=tf.string) + print(">>> Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) + noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) + .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .shuffle(noise_filenames.shape[0]) + .cache(FLAGS.audio_aug_mix_noise_cache) + .repeat()) + noise_iterator = tf.compat.v1.data.make_one_shot_iterator(noise_dataset) + else: + noise_iterator = None + process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator) dataset = (tf.data.Dataset.from_generator(generate_values, output_types=(tf.string, (tf.int64, tf.int32, tf.int64))) From 66cc7c48c9e3d07d3ab742b6dc235e5b706349af Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Mon, 13 Jan 2020 22:38:55 +0800 Subject: [PATCH 11/25] limit audio signal between +-1.0 --- util/audio_augmentation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util/audio_augmentation.py b/util/audio_augmentation.py index cb668f696b..283a54ed90 100644 --- a/util/audio_augmentation.py +++ b/util/audio_augmentation.py @@ -60,4 +60,5 @@ def augment_noise(audio, choosen_noise_db = tfv1.random_uniform( [], minval=change_noise_db_min, maxval=change_noise_db_max) noise_ratio = tf.math.pow(10.0, choosen_noise_db / 10) - return 
tf.multiply(audio, audio_ratio) + tf.multiply(mixed_noise, noise_ratio) + mixed_audio = tf.multiply(audio, audio_ratio) + tf.multiply(mixed_noise, noise_ratio) + return tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0) From b7eb0f4d4c6a9d070a4d718727fdb22d83146100 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Tue, 11 Feb 2020 14:15:35 +0800 Subject: [PATCH 12/25] [FIX] switch shuffle/map for memory cost, replace cache with prefetch for memory cost [MOD] deprecate FLAGS.audio_aug_mix_noise_cache --- util/feeding.py | 8 +++++--- util/flags.py | 1 - 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/util/feeding.py b/util/feeding.py index ea7987a6f1..35b7d3be92 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -17,6 +17,7 @@ from util.spectrogram_augmentations import augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up from util.audio import read_frames_from_file, vad_split, DEFAULT_FORMAT from util.audio_augmentation import augment_noise, noise_file_to_audio, collect_noise_filenames +from util.logging import log_info def read_csvs(csv_files): @@ -139,14 +140,15 @@ def batch_fn(wav_filenames, features, features_len, transcripts): if train_phase and FLAGS.audio_aug_mix_noise_walk_dirs: # because we have to determine the shuffle size, so we could not use generator + log_info("Enable Mixing Noise Augmentation") noise_filenames = tf.convert_to_tensor( list(collect_noise_filenames(FLAGS.audio_aug_mix_noise_walk_dirs.split(','))), dtype=tf.string) - print(">>> Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) + log_info("Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) - .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) .shuffle(noise_filenames.shape[0]) - .cache(FLAGS.audio_aug_mix_noise_cache) + .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .prefetch(tf.compat.v1.data.experimental.AUTOTUNE) .repeat()) noise_iterator = tf.compat.v1.data.make_one_shot_iterator(noise_dataset) else: diff --git a/util/flags.py b/util/flags.py index 8a3852ecf5..ecfa90b8d2 100644 --- a/util/flags.py +++ b/util/flags.py @@ -25,7 +25,6 @@ def create_flags(): # ================ f.DEFINE_string('audio_aug_mix_noise_walk_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_string('audio_aug_mix_noise_cache', '', 'must cache noise audio data, or it will read audio file every training step') f.DEFINE_float('audio_aug_mix_noise_max_noise_db', -25, 'to limit noise max volume') f.DEFINE_float('audio_aug_mix_noise_min_noise_db', -50, 'to limit noise min volume') f.DEFINE_float('audio_aug_mix_noise_max_audio_db', 0, 'to limit audio max volume') From ccae7cc93ef717f6045d3128c1aebdb6d8f5e858 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Mon, 17 Feb 2020 15:38:30 +0800 Subject: [PATCH 13/25] [MOD] limit the buffer size of .shuffle() to protect memory usage --- util/feeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/feeding.py b/util/feeding.py index 35b7d3be92..e9c2fb9ad7 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -146,7 +146,7 @@ def batch_fn(wav_filenames, features, features_len, transcripts): dtype=tf.string) log_info("Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) - .shuffle(noise_filenames.shape[0]) + .shuffle(min(noise_filenames.shape[0], 
102400)) .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) .prefetch(tf.compat.v1.data.experimental.AUTOTUNE) .repeat()) From 8cc95f9ee814d63a4a9c472921cca65662d3cbdd Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Wed, 19 Feb 2020 10:21:02 +0800 Subject: [PATCH 14/25] [ADD] bin/normalize_noise_audio.py --- bin/normalize_noise_audio.py | 172 +++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 bin/normalize_noise_audio.py diff --git a/bin/normalize_noise_audio.py b/bin/normalize_noise_audio.py new file mode 100644 index 0000000000..b487ece01b --- /dev/null +++ b/bin/normalize_noise_audio.py @@ -0,0 +1,172 @@ +from __future__ import absolute_import, division, print_function + +# Make sure we can import stuff from util/ +# This script needs to be run from the root of the DeepSpeech repository + +from util.feeding import secs_to_hours +from librosa import get_duration +from multiprocessing import Pool +from functools import partial +import math +import argparse +import sys +import os +sys.path.insert(1, os.path.join(sys.path[0], '..')) + +try: + import tqdm +except ImportError as err: + print('[ImportError] try `pip install tqdm`') + raise err + +try: + from pydub import AudioSegment +except ImportError as err: + print('[ImportError] try `sudo apt-get install ffmpeg && pip install pydub`') + raise err + + +def detect_silence(sound: AudioSegment, silence_threshold=-50.0, chunk_size=10): + start_trim = 0 # ms + sound_size = len(sound) + assert chunk_size > 0 # to avoid infinite loop + while sound[start_trim:(start_trim + chunk_size)].dBFS < silence_threshold and start_trim < sound_size: + start_trim += chunk_size + + end_trim = sound_size + while sound[(end_trim - chunk_size):end_trim].dBFS < silence_threshold and end_trim > 0: + end_trim -= chunk_size + + start_trim = min(sound_size, start_trim) + end_trim = max(0, end_trim) + + return min([start_trim, end_trim]), max([start_trim, end_trim]) + + +def trim_silence_audio(sound: AudioSegment, silence_threshold=-50.0, chunk_size=10): + start_trim, end_trim = detect_silence(sound, silence_threshold, chunk_size) + return sound[start_trim:end_trim] + + +def convert(filename, dst_dirpath, dirpath, normalize, trim_silence, + min_duration_seconds, max_duration_seconds): + if not filename.endswith(('.wav', '.raw')): + return + + filepath = os.path.join(dirpath, filename) + if filename.endswith('.wav'): + sound: AudioSegment = AudioSegment.from_file(filepath) + else: + try: + sound: AudioSegment = AudioSegment.from_raw(filepath, + sample_width=2, + frame_rate=44100, + channels=1) + except Exception as err: # pylint: disable=broad-except + print('Retrying conversion: {}'.format(err)) + try: + sound: AudioSegment = AudioSegment.from_raw(filepath, + sample_width=2, + frame_rate=48000, + channels=1) + except Exception as err: # pylint: disable=broad-except + print('Skipping file {}, got error: {}'.format(filepath, err)) + return + try: + sound = sound.set_frame_rate(16000) + except Exception as err: # pylint: disable=broad-except + print('Skipping {}'.format(err)) + return + + n_splits = max(1, math.ceil(sound.duration_seconds / max_duration_seconds)) + chunk_duration_ms = math.ceil(len(sound) / n_splits) + chunks = [] + + for i in range(n_splits): + end_ms = min((i + 1) * chunk_duration_ms, len(sound)) + chunk = sound[(i * chunk_duration_ms):end_ms] + chunks.append(chunk) + + for i, chunk in enumerate(chunks): + dst_path = os.path.join(dst_dirpath, str(i) + '_' + filename) + if 
dst_path.endswith('.raw'): + dst_path = dst_path[:-4] + '.wav' + + if os.path.exists(dst_path): + print('Audio already exists: {}'.format(dst_path)) + return + + if normalize: + chunk = chunk.normalize() + if chunk.dBFS < -30.0: + chunk = chunk.compress_dynamic_range().normalize() + if chunk.dBFS < -30.0: + chunk = chunk.compress_dynamic_range().normalize() + if trim_silence: + chunk = trim_silence_audio(chunk) + + if chunk.duration_seconds < min_duration_seconds: + return + chunk.export(dst_path, format='wav') + + +def get_noise_duration(dst_dir): + duration = 0.0 + file_num = 0 + for dirpath, _, filenames in os.walk(dst_dir): + for f in filenames: + if not f.endswith('.wav'): + continue + duration += get_duration(filename=os.path.join(dirpath, f)) + file_num += 1 + return duration, file_num + + +def main(src_dir, + dst_dir, + min_duration_seconds, + max_duration_seconds, + normalize=True, + trim_silence=True): + assert os.path.exists(src_dir) + if not os.path.exists(dst_dir): + os.makedirs(dst_dir, exist_ok=False) + src_dir = os.path.abspath(src_dir) + dst_dir = os.path.abspath(dst_dir) + + for dirpath, _, filenames in os.walk(src_dir): + dirpath = os.path.abspath(dirpath) + dst_dirpath = os.path.join( + dst_dir, dirpath.replace(src_dir, '').lstrip('/')) + + print('Converting directory: {} -> {}'.format(dirpath, dst_dirpath)) + if not os.path.exists(dst_dirpath): + os.makedirs(dst_dirpath, exist_ok=False) + + convert_func = partial(convert, + dst_dirpath=dst_dirpath, + dirpath=dirpath, + normalize=normalize, + trim_silence=trim_silence, + min_duration_seconds=min_duration_seconds, + max_duration_seconds=max_duration_seconds) + + pool = Pool(processes=None) + for _ in tqdm.tqdm(pool.imap_unordered(convert_func, filenames), total=len(filenames)): + pass + + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser(description='Optimize noise files') + PARSER.add_argument('--from_dir', help='Convert wav from directory', type=str) + PARSER.add_argument('--to_dir', help='save wav to directory', type=str) + PARSER.add_argument('--min_sec', help='min duration seconds of saved file', type=float, default=1.0) + PARSER.add_argument('--max_sec', help='max duration seconds of saved file', type=float, default=30.0) + PARSER.add_argument('--normalize', action='store_true', help='Normalize sound range, default is true', default=True) + PARSER.add_argument('--trim', action='store_true', help='Trim silence, default is true', default=True) + PARAMS = PARSER.parse_args() + + main(PARAMS.from_dir, PARAMS.to_dir, PARAMS.min_sec, PARAMS.max_sec, PARAMS.normalize, PARAMS.trim) + + DURATION, FILE_NUM = get_noise_duration(PARAMS.to_dir) + print("Your noise dataset has {} files and a duration of {}\n".format(FILE_NUM, secs_to_hours(DURATION))) From 9e2648a47c04010e8ba28121fab9e0dd8f2bd010 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Sat, 22 Feb 2020 00:40:09 +0800 Subject: [PATCH 15/25] [MOD] mix noise into complete audio --- util/audio_augmentation.py | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/util/audio_augmentation.py b/util/audio_augmentation.py index 283a54ed90..d23fe207d4 100644 --- a/util/audio_augmentation.py +++ b/util/audio_augmentation.py @@ -25,33 +25,18 @@ def augment_noise(audio, change_noise_db_min=-25): decoded_audio_len = tf.shape(audio)[0] - noise_decoded_audio_len = tf.shape(noise_audio)[0] + decoded_noise_len = tf.shape(noise_audio)[0] - multiply = tf.math.floordiv(decoded_audio_len, noise_decoded_audio_len) + 1 + multiply = 
tf.math.floordiv(decoded_audio_len, decoded_noise_len) + 1 noise_audio_tile = tf.tile(noise_audio, [multiply, 1]) - # now noise_decoded_len must > decoded_len - noise_decoded_audio_len = tf.shape(noise_audio_tile)[0] + # Now, decoded_noise_len must > decoded_audio_len + decoded_noise_len = tf.shape(noise_audio_tile)[0] - mix_decoded_start_end_points = tfv1.random_uniform( - [2], minval=0, maxval=decoded_audio_len-1, dtype=tf.int32) - mix_decoded_start_point = tf.math.reduce_min(mix_decoded_start_end_points) - mix_decoded_end_point = tf.math.reduce_max( - mix_decoded_start_end_points) + 1 - mix_decoded_width = mix_decoded_end_point - mix_decoded_start_point - - left_zeros = tf.zeros(shape=[mix_decoded_start_point, 1]) - - mix_noise_decoded_start_point = tfv1.random_uniform( - [], minval=0, maxval=noise_decoded_audio_len - mix_decoded_width, dtype=tf.int32) - mix_noise_decoded_end_point = mix_noise_decoded_start_point + mix_decoded_width - extract_noise_decoded = noise_audio_tile[mix_noise_decoded_start_point:mix_noise_decoded_end_point, :] - - right_zeros = tf.zeros( - shape=[decoded_audio_len - mix_decoded_end_point, 1]) - - mixed_noise = tf.concat( - [left_zeros, extract_noise_decoded, right_zeros], axis=0) + mix_decoded_start_point = tfv1.random_uniform( + [], minval=0, maxval=decoded_noise_len-decoded_audio_len, dtype=tf.int32) + mix_decoded_end_point = mix_decoded_start_point + decoded_audio_len + extract_noise_decoded = noise_audio_tile[mix_decoded_start_point:mix_decoded_end_point, :] choosen_audio_db = tfv1.random_uniform( [], minval=change_audio_db_min, maxval=change_audio_db_max) @@ -60,5 +45,5 @@ def augment_noise(audio, choosen_noise_db = tfv1.random_uniform( [], minval=change_noise_db_min, maxval=change_noise_db_max) noise_ratio = tf.math.pow(10.0, choosen_noise_db / 10) - mixed_audio = tf.multiply(audio, audio_ratio) + tf.multiply(mixed_noise, noise_ratio) + mixed_audio = tf.multiply(audio, audio_ratio) + tf.multiply(extract_noise_decoded, noise_ratio) return tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0) From 2269514a9ef676100b46f0c99c0e6a7150feb4dd Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Fri, 6 Mar 2020 16:19:09 +0800 Subject: [PATCH 16/25] [ADD] dev/test dataset can also mix noise [MOD] use SNR to balance noise/speech volume, refactor the called functions to accept noise arguments --- DeepSpeech.py | 9 ++-- evaluate.py | 4 +- util/audio_augmentation.py | 85 +++++++++++++++++++++++++++++--------- util/feeding.py | 58 +++++++++++++------------- util/flags.py | 12 +++--- 5 files changed, 109 insertions(+), 59 deletions(-) diff --git a/DeepSpeech.py b/DeepSpeech.py index 3f62050236..6186fc6723 100755 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -436,7 +436,8 @@ def train(): batch_size=FLAGS.train_batch_size, enable_cache=FLAGS.feature_cache and do_cache_dataset, cache_path=FLAGS.feature_cache, - train_phase=True) + train_phase=True, + noise_dirs=FLAGS.audio_aug_mix_noise_walk_train_dirs) iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set), tfv1.data.get_output_shapes(train_set), @@ -447,7 +448,7 @@ def train(): if FLAGS.dev_files: dev_csvs = FLAGS.dev_files.split(',') - dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False) for csv in dev_csvs] + dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_dirs=FLAGS.audio_aug_mix_noise_walk_dev_dirs) for csv in dev_csvs] dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets] # Dropout @@ -673,7 +674,7 @@ def 
__call__(self, progress, data, **kwargs): def test(): - samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading) + samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_dirs=FLAGS.audio_aug_mix_noise_walk_test_dirs) if FLAGS.test_output_file: # Save decoded tuples as JSON, converting NumPy floats to Python floats json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float) @@ -896,7 +897,7 @@ def do_single_file_inference(input_file_path): print('Could not load checkpoint from {}'.format(FLAGS.checkpoint_dir)) sys.exit(1) - features, features_len = audiofile_to_features(input_file_path) + features, features_len = audiofile_to_features(input_file_path, 0.0) previous_state_c = np.zeros([1, Config.n_cell_dim]) previous_state_h = np.zeros([1, Config.n_cell_dim]) diff --git a/evaluate.py b/evaluate.py index 435a6be8a6..dd655a36cf 100755 --- a/evaluate.py +++ b/evaluate.py @@ -41,7 +41,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet): return [alphabet.decode(res) for res in results] -def evaluate(test_csvs, create_model, try_loading): +def evaluate(test_csvs, create_model, try_loading, noise_dirs=None): if FLAGS.lm_binary_path: scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path, FLAGS.lm_trie_path, @@ -50,7 +50,7 @@ def evaluate(test_csvs, create_model, try_loading): scorer = None test_csvs = FLAGS.test_files.split(',') - test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False) for csv in test_csvs] + test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_dirs=noise_dirs) for csv in test_csvs] iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]), tfv1.data.get_output_shapes(test_sets[0]), output_classes=tfv1.data.get_output_classes(test_sets[0])) diff --git a/util/audio_augmentation.py b/util/audio_augmentation.py index d23fe207d4..c65dc5e9a4 100644 --- a/util/audio_augmentation.py +++ b/util/audio_augmentation.py @@ -1,7 +1,46 @@ +from __future__ import absolute_import, division, print_function + import tensorflow as tf import tensorflow.compat.v1 as tfv1 +import numpy as np from tensorflow.python.ops import gen_audio_ops as contrib_audio import os +from util.logging import log_info + +DBFS_COEF = 20.0 / np.log(10.0) + + +def get_dbfs(wav_filename): + samples = tf.io.read_file(wav_filename) + decoded = contrib_audio.decode_wav(samples, desired_channels=1) + rms = tf.sqrt(tf.reduce_mean(tf.square(decoded.audio))) + dbfs = DBFS_COEF * tf.math.log(rms) + return dbfs + + +def create_noise_iterator(noise_dirs): + """noise_dirs: `str` or `list`""" + if isinstance(noise_dirs, str): + noise_dirs = noise_dirs.split(',') + + noise_filenames = tf.convert_to_tensor( + list(collect_noise_filenames(noise_dirs)), + dtype=tf.string) + log_info("Collect {} noise files for mixing audio".format( + noise_filenames.shape[0])) + + def extract_dbfs(wav_filename): + return wav_filename, get_dbfs(wav_filename) + noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) + .map(extract_dbfs, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .cache() + .shuffle(min(noise_filenames.shape[0], 102400)) + .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .prefetch(tfv1.data.experimental.AUTOTUNE) + .repeat()) + noise_iterator = tfv1.data.make_one_shot_iterator(noise_dataset) + return noise_iterator + def collect_noise_filenames(walk_dirs): assert isinstance(walk_dirs, list) @@ -12,38 +51,44 @@ def 
collect_noise_filenames(walk_dirs): if filename.endswith('.wav'): yield os.path.join(dirpath, filename) -def noise_file_to_audio(noise_file): + +def noise_file_to_audio(noise_file, noise_dbfs): samples = tf.io.read_file(noise_file) decoded = contrib_audio.decode_wav(samples, desired_channels=1) - return decoded.audio + return decoded.audio, noise_dbfs -def augment_noise(audio, - noise_audio, - change_audio_db_max=0, - change_audio_db_min=-10, - change_noise_db_max=-15, - change_noise_db_min=-25): +def augment_noise(audio, + audio_dbfs, + noise, + noise_dbfs, + max_audio_gain_db=5, + min_audio_gain_db=-10, + max_snr_db=30, + min_snr_db=5): decoded_audio_len = tf.shape(audio)[0] - decoded_noise_len = tf.shape(noise_audio)[0] + decoded_noise_len = tf.shape(noise)[0] multiply = tf.math.floordiv(decoded_audio_len, decoded_noise_len) + 1 - noise_audio_tile = tf.tile(noise_audio, [multiply, 1]) + noise_audio_tile = tf.tile(noise, [multiply, 1]) # Now, decoded_noise_len must > decoded_audio_len decoded_noise_len = tf.shape(noise_audio_tile)[0] - mix_decoded_start_point = tfv1.random_uniform( - [], minval=0, maxval=decoded_noise_len-decoded_audio_len, dtype=tf.int32) + mix_decoded_start_point = tfv1.random_uniform([], minval=0, maxval=decoded_noise_len-decoded_audio_len, dtype=tf.int32) mix_decoded_end_point = mix_decoded_start_point + decoded_audio_len extract_noise_decoded = noise_audio_tile[mix_decoded_start_point:mix_decoded_end_point, :] - choosen_audio_db = tfv1.random_uniform( - [], minval=change_audio_db_min, maxval=change_audio_db_max) - audio_ratio = tf.math.pow(10.0, choosen_audio_db / 10) + audio_gain_db = tfv1.random_uniform([], minval=min_audio_gain_db, maxval=max_audio_gain_db) + target_audio_dbfs = audio_dbfs + audio_gain_db + audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 10) + + # target_snr_db := target_audio_dbfs - target_noise_dbfs + target_snr_db = tfv1.random_uniform([], minval=min_snr_db, maxval=max_snr_db) - choosen_noise_db = tfv1.random_uniform( - [], minval=change_noise_db_min, maxval=change_noise_db_max) - noise_ratio = tf.math.pow(10.0, choosen_noise_db / 10) - mixed_audio = tf.multiply(audio, audio_ratio) + tf.multiply(extract_noise_decoded, noise_ratio) - return tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0) + target_noise_dbfs = target_audio_dbfs - target_snr_db + noise_gain_db = target_noise_dbfs - noise_dbfs + noise_gain_ratio = tf.math.pow(10.0, noise_gain_db / 10) + mixed_audio = tf.multiply(audio, audio_gain_ratio) + tf.multiply(extract_noise_decoded, noise_gain_ratio) + mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0) + return mixed_audio diff --git a/util/feeding.py b/util/feeding.py index e9c2fb9ad7..f8e8a7aebc 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -16,7 +16,7 @@ from util.flags import FLAGS from util.spectrogram_augmentations import augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up from util.audio import read_frames_from_file, vad_split, DEFAULT_FORMAT -from util.audio_augmentation import augment_noise, noise_file_to_audio, collect_noise_filenames +from util.audio_augmentation import augment_noise, create_noise_iterator, get_dbfs from util.logging import log_info @@ -66,23 +66,25 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False): return mfccs, tf.shape(input=mfccs)[0] -def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None): +def audiofile_to_features(wav_filename, audio_dbfs, train_phase=False, noise_iterator=None): samples = 
tf.io.read_file(wav_filename) decoded = contrib_audio.decode_wav(samples, desired_channels=1) audio = decoded.audio # augment audio - if train_phase and noise_iterator: + if noise_iterator: + noise, noise_dbfs = noise_iterator.get_next() audio = augment_noise( audio, - noise_iterator.get_next(), - change_audio_db_max=FLAGS.audio_aug_mix_noise_max_audio_db, - change_audio_db_min=FLAGS.audio_aug_mix_noise_min_audio_db, - change_noise_db_max=FLAGS.audio_aug_mix_noise_max_noise_db, - change_noise_db_min=FLAGS.audio_aug_mix_noise_min_noise_db, + audio_dbfs, + noise, + noise_dbfs, + max_audio_gain_db=FLAGS.audio_aug_mix_noise_max_audio_gain_db, + min_audio_gain_db=FLAGS.audio_aug_mix_noise_min_audio_gain_db, + max_snr_db=FLAGS.audio_aug_mix_noise_max_snr_db, + min_snr_db=FLAGS.audio_aug_mix_noise_min_snr_db, ) - features, features_len = samples_to_mfccs(audio, decoded.sample_rate, train_phase=train_phase) # augment features @@ -96,9 +98,9 @@ def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None): return features, features_len -def entry_to_features(wav_filename, transcript, train_phase, noise_iterator=None): +def entry_to_features(wav_filename, transcript, audio_dbfs, train_phase, noise_iterator): # https://bugs.python.org/issue32117 - features, features_len = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator) + features, features_len = audiofile_to_features(wav_filename, audio_dbfs, train_phase=train_phase, noise_iterator=noise_iterator) return wav_filename, features, features_len, tf.SparseTensor(*transcript) @@ -111,7 +113,7 @@ def to_sparse_tuple(sequence): return indices, sequence, shape -def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False): +def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_dirs=None): df = read_csvs(csvs) df.sort_values(by='wav_filesize', inplace=True) @@ -138,26 +140,26 @@ def batch_fn(wav_filenames, features, features_len, transcripts): num_gpus = len(Config.available_devices) - if train_phase and FLAGS.audio_aug_mix_noise_walk_dirs: - # because we have to determine the shuffle size, so we could not use generator - log_info("Enable Mixing Noise Augmentation") - noise_filenames = tf.convert_to_tensor( - list(collect_noise_filenames(FLAGS.audio_aug_mix_noise_walk_dirs.split(','))), - dtype=tf.string) - log_info("Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) - noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) - .shuffle(min(noise_filenames.shape[0], 102400)) - .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) - .prefetch(tf.compat.v1.data.experimental.AUTOTUNE) - .repeat()) - noise_iterator = tf.compat.v1.data.make_one_shot_iterator(noise_dataset) + if noise_dirs: + noise_iterator = create_noise_iterator(noise_dirs) else: noise_iterator = None + process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator) - dataset = (tf.data.Dataset.from_generator(generate_values, - output_types=(tf.string, (tf.int64, tf.int32, tf.int64))) - .map(process_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)) + dataset = tf.data.Dataset.from_generator(generate_values, + output_types=(tf.string, (tf.int64, tf.int32, tf.int64))) + + if noise_dirs: + dataset = (dataset.map(lambda wav_filename, transcript: (wav_filename, transcript, get_dbfs(wav_filename)), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + .cache()) + else: + 
dataset = dataset.map(lambda wav_filename, transcript: (wav_filename, transcript, 0.0), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + dataset = dataset.map( + process_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) if enable_cache: dataset = dataset.cache(cache_path) diff --git a/util/flags.py b/util/flags.py index ecfa90b8d2..69fb81182f 100644 --- a/util/flags.py +++ b/util/flags.py @@ -24,11 +24,13 @@ def create_flags(): # Data Augmentation # ================ - f.DEFINE_string('audio_aug_mix_noise_walk_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_float('audio_aug_mix_noise_max_noise_db', -25, 'to limit noise max volume') - f.DEFINE_float('audio_aug_mix_noise_min_noise_db', -50, 'to limit noise min volume') - f.DEFINE_float('audio_aug_mix_noise_max_audio_db', 0, 'to limit audio max volume') - f.DEFINE_float('audio_aug_mix_noise_min_audio_db', -10, 'to limit audio min volume') + f.DEFINE_string('audio_aug_mix_noise_walk_train_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') + f.DEFINE_string('audio_aug_mix_noise_walk_dev_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') + f.DEFINE_string('audio_aug_mix_noise_walk_test_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') + f.DEFINE_float('audio_aug_mix_noise_max_snr_db', 20, 'to limit noise max volume', lower_bound=0.0) + f.DEFINE_float('audio_aug_mix_noise_min_snr_db', 3, 'to limit noise min volume', lower_bound=0.0) + f.DEFINE_float('audio_aug_mix_noise_max_audio_gain_db', 5, 'to limit audio max volume') + f.DEFINE_float('audio_aug_mix_noise_min_audio_gain_db', -10, 'to limit audio min volume') f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise') f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise') From 0b8147ce8c4a1906de80f1db793b8aa63dc15045 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Mon, 16 Mar 2020 17:07:29 +0800 Subject: [PATCH 17/25] [ADD] use dbfs and SNR to determine the balance of audio/noise, add option to dump audio into tensorboard [FIX] correct gain db formula --- DeepSpeech.py | 33 ++++-- evaluate.py | 6 +- util/audio_augmentation.py | 220 ++++++++++++++++++++++++++++--------- util/feeding.py | 66 ++++++----- util/flags.py | 18 +-- 5 files changed, 244 insertions(+), 99 deletions(-) diff --git a/DeepSpeech.py b/DeepSpeech.py index 6186fc6723..28042fa506 100755 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -218,7 +218,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse): the decoded result and the batch's original Y. 
''' # Obtain the next batch of data - batch_filenames, (batch_x, batch_seq_len), batch_y = iterator.get_next() + batch_filenames, (batch_x, batch_seq_len), batch_y, review_audio = iterator.get_next() if FLAGS.use_cudnn_rnn: rnn_impl = rnn_impl_cudnn_rnn @@ -238,7 +238,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse): avg_loss = tf.reduce_mean(input_tensor=total_loss) # Finally we return the average loss - return avg_loss, non_finite_files + return avg_loss, non_finite_files, review_audio # Adam Optimization @@ -299,7 +299,7 @@ def get_tower_results(iterator, optimizer, dropout_rates): with tf.name_scope('tower_%d' % i): # Calculate the avg_loss and mean_edit_distance and retrieve the decoded # batch along with the original batch's labels (Y) of this tower - avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0) + avg_loss, non_finite_files, review_audio = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0) # Allow for variables to be re-used by the next tower tfv1.get_variable_scope().reuse_variables() @@ -316,6 +316,8 @@ def get_tower_results(iterator, optimizer, dropout_rates): tower_non_finite_files.append(non_finite_files) avg_loss_across_towers = tf.reduce_mean(input_tensor=tower_avg_losses, axis=0) + if FLAGS.augmentation_review_audio_steps: + tfv1.summary.audio(name='step_audio', tensor=review_audio, sample_rate=16000, collections=['step_audio_summaries']) tfv1.summary.scalar(name='step_loss', tensor=avg_loss_across_towers, collections=['step_summaries']) all_non_finite_files = tf.concat(tower_non_finite_files, axis=0) @@ -437,7 +439,7 @@ def train(): enable_cache=FLAGS.feature_cache and do_cache_dataset, cache_path=FLAGS.feature_cache, train_phase=True, - noise_dirs=FLAGS.audio_aug_mix_noise_walk_train_dirs) + noise_dirs_or_files=FLAGS.audio_aug_mix_noise_train_dirs_or_files) iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set), tfv1.data.get_output_shapes(train_set), @@ -448,7 +450,7 @@ def train(): if FLAGS.dev_files: dev_csvs = FLAGS.dev_files.split(',') - dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_dirs=FLAGS.audio_aug_mix_noise_walk_dev_dirs) for csv in dev_csvs] + dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_dirs_or_files=FLAGS.audio_aug_mix_noise_dev_dirs_or_files) for csv in dev_csvs] dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets] # Dropout @@ -484,6 +486,7 @@ def train(): apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step) # Summaries + step_audio_summaries_op = tfv1.summary.merge_all('step_audio_summaries') step_summaries_op = tfv1.summary.merge_all('step_summaries') step_summary_writers = { 'train': tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120), @@ -594,11 +597,20 @@ def __call__(self, progress, data, **kwargs): session.run(init_op) # Batch loop + + i_audio_steps = 0 while True: try: - _, current_step, batch_loss, problem_files, step_summary = \ - session.run([train_op, global_step, loss, non_finite_files, step_summaries_op], - feed_dict=feed_dict) + step_audio_summary = None + if i_audio_steps < FLAGS.augmentation_review_audio_steps and epoch == 0: + _, current_step, batch_loss, problem_files, step_summary, step_audio_summary = \ + session.run([train_op, global_step, loss, non_finite_files, step_summaries_op, step_audio_summaries_op], + 
feed_dict=feed_dict)
+                    i_audio_steps += 1
+                else:
+                    _, current_step, batch_loss, problem_files, step_summary = \
+                        session.run([train_op, global_step, loss, non_finite_files, step_summaries_op],
+                                    feed_dict=feed_dict)
             except tf.errors.OutOfRangeError:
                 break
@@ -612,6 +624,9 @@ def __call__(self, progress, data, **kwargs):
 
                 pbar.update(step_count)
 
+                if step_audio_summary is not None:
+                    step_summary_writer.add_summary(step_audio_summary, current_step)
+
                 step_summary_writer.add_summary(step_summary, current_step)
 
                 if is_train and FLAGS.checkpoint_secs > 0 and time.time() - checkpoint_time > FLAGS.checkpoint_secs:
@@ -674,7 +689,7 @@ def __call__(self, progress, data, **kwargs):
 
 
 def test():
-    samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_dirs=FLAGS.audio_aug_mix_noise_walk_test_dirs)
+    samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_dirs_or_files=FLAGS.audio_aug_mix_noise_test_dirs_or_files)
     if FLAGS.test_output_file:
         # Save decoded tuples as JSON, converting NumPy floats to Python floats
         json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float)
diff --git a/evaluate.py b/evaluate.py
index dd655a36cf..94348e088c 100755
--- a/evaluate.py
+++ b/evaluate.py
@@ -41,7 +41,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet):
     return [alphabet.decode(res) for res in results]
 
 
-def evaluate(test_csvs, create_model, try_loading, noise_dirs=None):
+def evaluate(test_csvs, create_model, try_loading, noise_dirs_or_files=None):
     if FLAGS.lm_binary_path:
         scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                         FLAGS.lm_binary_path, FLAGS.lm_trie_path,
@@ -50,13 +50,13 @@ def evaluate(test_csvs, create_model, try_loading, noise_dirs=None):
         scorer = None
 
     test_csvs = FLAGS.test_files.split(',')
-    test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_dirs=noise_dirs) for csv in test_csvs]
+    test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_dirs_or_files=noise_dirs_or_files) for csv in test_csvs]
     iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]),
                                                  tfv1.data.get_output_shapes(test_sets[0]),
                                                  output_classes=tfv1.data.get_output_classes(test_sets[0]))
     test_init_ops = [iterator.make_initializer(test_set) for test_set in test_sets]
 
-    batch_wav_filename, (batch_x, batch_x_len), batch_y = iterator.get_next()
+    batch_wav_filename, (batch_x, batch_x_len), batch_y, _ = iterator.get_next()
 
     # One rate per layer
     no_dropout = [None] * 6
diff --git a/util/audio_augmentation.py b/util/audio_augmentation.py
index c65dc5e9a4..7f7705e825 100644
--- a/util/audio_augmentation.py
+++ b/util/audio_augmentation.py
@@ -6,89 +6,207 @@
 from tensorflow.python.ops import gen_audio_ops as contrib_audio
 import os
 from util.logging import log_info
+from util.config import Config
 
 DBFS_COEF = 10.0 / np.log(10.0)
 
 
-def get_dbfs(wav_filename):
+def filename_to_audio(wav_filename):
+    r"""Decode `wav_filename` and return the audio
+
+    Args:
+        wav_filename: A str, the path of the wav file
+
+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1].
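+
+    Example (a minimal sketch; the wav path and the running `session` are assumptions):
+        >>> audio = filename_to_audio('noise/office.wav')  # 2-D float32 Tensor
+        >>> session.run(audio).shape                       # e.g. (52480, 1), values in [-1.0, 1.0]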
+ """ samples = tf.io.read_file(wav_filename) decoded = contrib_audio.decode_wav(samples, desired_channels=1) - rms = tf.sqrt(tf.reduce_mean(tf.square(decoded.audio))) - dbfs = DBFS_COEF * tf.math.log(rms) - return dbfs + return decoded.audio + +def audio_to_dbfs(audio, sample_rate=16000, chunk_ms=100, reduce_funcs=tf.reduce_mean): + r"""Separately measure the chunks dbfs of `audio`, then return the statistics values through `reduce_funcs + + Args: + audio: A 2-D Tensor with shape [`time-steps`, 1]. + sample_rate: An integer, specifying the audio sample rate to determining the chunk size for dbfs measurement. + chunk_ms: An integer in milliseconds unit, specifying each chunk size for separately measuring dbfs, default is `100ms` + reduce_funcs: A function or A list of function, specifying the statistics method to chunks, default is tf.reduce_mean + + Returns: + A float or A list of float, depends on reduce_funcs is function or list of function + """ + assert chunk_ms % 10 == 0, 'chunk_ms must be a multiple of 10' + + audio_len = tf.shape(audio)[0] + chunk_len = tf.math.floordiv(sample_rate, tf.math.floordiv(1000, chunk_ms)) # default: 1600 + n_chunks = tf.math.floordiv(audio_len, chunk_len) + trim_audio_len = tf.multiply(n_chunks, chunk_len) + audio = audio[:trim_audio_len] + splits = tf.reshape(audio, shape=[n_chunks, -1]) + + squares = tf.square(splits) + means = tf.reduce_mean(squares, axis=1) + # the statistics functions must execute before tf.log(), or the gain db would be wrong + if not isinstance(reduce_funcs, list): + reduces = reduce_funcs(means) + return DBFS_COEF * tf.math.log(reduces + 1e-8) -def create_noise_iterator(noise_dirs): - """noise_dirs: `str` or `list`""" - if isinstance(noise_dirs, str): - noise_dirs = noise_dirs.split(',') + reduces = [reduce_func(means) for reduce_func in reduce_funcs] + return [DBFS_COEF * tf.math.log(reduce + 1e-8) for reduce in reduces] - noise_filenames = tf.convert_to_tensor( - list(collect_noise_filenames(noise_dirs)), - dtype=tf.string) - log_info("Collect {} noise files for mixing audio".format( - noise_filenames.shape[0])) - def extract_dbfs(wav_filename): - return wav_filename, get_dbfs(wav_filename) +def create_noise_iterator(noise_dirs_or_files, read_csvs_func): + r"""Create an iterator to yield audio + + Args: + noise_dirs_or_files: A list/tuple of str, the collection source of wav filenames. + read_csvs_func: A function, please specify the `read_csvs()` function from `util/feeding.py`, which is to prevent recursive import error. + + Returns: + An one shot iterator of audio with 2-D Tensor of shape [`time-step`, 1], use `.get_next()` to get the Tensor. 
+ """ + if isinstance(noise_dirs_or_files, str): + noise_dirs_or_files = noise_dirs_or_files.split(',') + + noise_filenames = tf.convert_to_tensor(list(collect_noise_filenames(noise_dirs_or_files, read_csvs_func)), dtype=tf.string) + log_info("Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) + noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) - .map(extract_dbfs, num_parallel_calls=tf.data.experimental.AUTOTUNE) - .cache() .shuffle(min(noise_filenames.shape[0], 102400)) - .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .map(filename_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) .prefetch(tfv1.data.experimental.AUTOTUNE) .repeat()) noise_iterator = tfv1.data.make_one_shot_iterator(noise_dataset) return noise_iterator -def collect_noise_filenames(walk_dirs): - assert isinstance(walk_dirs, list) +def collect_noise_filenames(dirs_or_files, read_csvs_func): + r"""Collect wav filenames from directories or csv files - for d in walk_dirs: - for dirpath, _, filenames in os.walk(d): - for filename in filenames: - if filename.endswith('.wav'): - yield os.path.join(dirpath, filename) + Args: + dirs_or_files: A list/tuple of str, the collection source of wav filenames. + read_csvs_func: A function, please specify the `read_csvs()` function from `util/feeding.py`, which is to prevent recursive import error. + Returns: + An iterator of str, yield every filename suffix with `.wav` or under `wav_filename` column of DataFrame + """ -def noise_file_to_audio(noise_file, noise_dbfs): - samples = tf.io.read_file(noise_file) - decoded = contrib_audio.decode_wav(samples, desired_channels=1) - return decoded.audio, noise_dbfs + assert isinstance(dirs_or_files, (list, tuple)) + + for dir_or_file in dirs_or_files: + assert os.path.exists(dir_or_file) + if os.path.isdir(dir_or_file): + for dirpath, _, filenames in os.walk(dir_or_file): + for filename in filenames: + if filename.endswith('.wav'): + yield os.path.join(dirpath, filename) + elif os.path.isfile(dir_or_file): + df = read_csvs_func([dir_or_file]) + for filename in df['wav_filename']: + yield filename def augment_noise(audio, - audio_dbfs, noise, - noise_dbfs, - max_audio_gain_db=5, - min_audio_gain_db=-10, - max_snr_db=30, - min_snr_db=5): - decoded_audio_len = tf.shape(audio)[0] - decoded_noise_len = tf.shape(noise)[0] + min_audio_dbfs=0.0, + max_audio_dbfs=-35.0, + min_snr_db=3.0, + max_snr_db=30.0, + limit_audio_peak_dbfs=7.0, + limit_noise_peak_dbfs=3.0, + sample_rate=16000): + r"""Mix audio Tensor with noise Tensor + + If the noise length is shorter than audio, the process will automaticaly repeat the noise file to over audio length, + The process randomly choose a duration of the noise to complete coverage the audio, + i.e. the shapes between the choosen duration of noise and audio are equal. - multiply = tf.math.floordiv(decoded_audio_len, decoded_noise_len) + 1 - noise_audio_tile = tf.tile(noise, [multiply, 1]) + Args: + audio: A 2-D Tensor with shape [`time-steps`, 1]. + noise: A 2-D Tensor with shape [`time-steps`, 1]. + min_audio_dbfs: A float in dbfs unit, specifying the `minimum` volume of audio during gaining audio. + max_audio_dbfs: A float in dbfs unit, specifying the `maximum` volume of audio during gaining audio. + min_snr_db: A float in db unit, specifying the minimum signal-to-noise ratio during gaining audio and noise. + max_snr_db: A float in db unit, specifying the maximum signal-to-noise ratio during gaining audio and noise. 
 def augment_noise(audio,
-                  audio_dbfs,
                   noise,
-                  noise_dbfs,
-                  max_audio_gain_db=5,
-                  min_audio_gain_db=-10,
-                  max_snr_db=30,
-                  min_snr_db=5):
-    decoded_audio_len = tf.shape(audio)[0]
-    decoded_noise_len = tf.shape(noise)[0]
+                  min_audio_dbfs=0.0,
+                  max_audio_dbfs=-35.0,
+                  min_snr_db=3.0,
+                  max_snr_db=30.0,
+                  limit_audio_peak_dbfs=7.0,
+                  limit_noise_peak_dbfs=3.0,
+                  sample_rate=16000):
+    r"""Mix audio Tensor with noise Tensor
+
+    If the noise is shorter than the audio, it is automatically repeated until it is longer than the audio.
+    The process then randomly chooses a noise segment that completely covers the audio,
+    i.e. the chosen noise segment and the audio have equal shapes.

-    multiply = tf.math.floordiv(decoded_audio_len, decoded_noise_len) + 1
-    noise_audio_tile = tf.tile(noise, [multiply, 1])
+    Args:
+        audio: A 2-D Tensor with shape [`time-steps`, 1].
+        noise: A 2-D Tensor with shape [`time-steps`, 1].
+        min_audio_dbfs: A float in dbfs unit, specifying the `minimum` volume of audio during gaining audio.
+        max_audio_dbfs: A float in dbfs unit, specifying the `maximum` volume of audio during gaining audio.
+        min_snr_db: A float in db unit, specifying the minimum signal-to-noise ratio during gaining audio and noise.
+        max_snr_db: A float in db unit, specifying the maximum signal-to-noise ratio during gaining audio and noise.
+        limit_audio_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the audio; the audio volume will not be gained above this value.
+        limit_noise_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the noise; the noise volume will not be gained above this value.
+        sample_rate: An integer, specifying the audio sample rate used to determine the chunk size for the dbfs measurement.

+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1]. Has the same type and shape as `audio`.
+    """

+    audio_len = tf.shape(audio)[0]
+    noise_len = tf.shape(noise)[0]
+
+    audio_mean_dbfs, audio_max_dbfs = audio_to_dbfs(audio, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+
+    multiply = tf.math.floordiv(audio_len, noise_len) + 1
+    noise_tile = tf.tile(noise, [multiply, 1])
+
-    # Now, decoded_noise_len must > decoded_audio_len
-    decoded_noise_len = tf.shape(noise_audio_tile)[0]
+    # Now, noise_len must > audio_len
+    noise_tile_len = tf.shape(noise_tile)[0]

-    mix_decoded_start_point = tfv1.random_uniform([], minval=0, maxval=decoded_noise_len-decoded_audio_len, dtype=tf.int32)
-    mix_decoded_end_point = mix_decoded_start_point + decoded_audio_len
-    extract_noise_decoded = noise_audio_tile[mix_decoded_start_point:mix_decoded_end_point, :]
+    mix_decoded_start_point = tfv1.random_uniform([], minval=0, maxval=noise_tile_len-audio_len, dtype=tf.int32)
+    mix_decoded_end_point = mix_decoded_start_point + audio_len
+    extract_noise = noise_tile[mix_decoded_start_point:mix_decoded_end_point, :]
+
+    extract_noise_mean_dbfs, extract_noise_max_dbfs = audio_to_dbfs(extract_noise, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+
+    target_audio_dbfs = tfv1.random_uniform([], minval=min_audio_dbfs, maxval=max_audio_dbfs)
+
+    audio_gain_db = target_audio_dbfs - audio_mean_dbfs
+
+    # limit audio peak
+    audio_gain_db = tf.minimum(limit_audio_peak_dbfs - audio_max_dbfs, audio_gain_db)
+    target_audio_dbfs = audio_mean_dbfs + audio_gain_db
+
+    audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 20.0)

-    audio_gain_db = tfv1.random_uniform([], minval=min_audio_gain_db, maxval=max_audio_gain_db)
-    target_audio_dbfs = audio_dbfs + audio_gain_db
-    audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 10)
     # target_snr_db := target_audio_dbfs - target_noise_dbfs
     target_snr_db = tfv1.random_uniform([], minval=min_snr_db, maxval=max_snr_db)

     target_noise_dbfs = target_audio_dbfs - target_snr_db
-    noise_gain_db = target_noise_dbfs - noise_dbfs
-    noise_gain_ratio = tf.math.pow(10.0, noise_gain_db / 10)
-    mixed_audio = tf.multiply(audio, audio_gain_ratio) + tf.multiply(extract_noise_decoded, noise_gain_ratio)
+    noise_gain_db = target_noise_dbfs - extract_noise_mean_dbfs
+
+    # limit noise peak
+    noise_gain_db = tf.minimum(limit_noise_peak_dbfs - extract_noise_max_dbfs, noise_gain_db)
+    noise_gain_ratio = tf.math.pow(10.0, noise_gain_db / 20.0)
+
+    mixed_audio = tf.multiply(audio, audio_gain_ratio) + tf.multiply(extract_noise, noise_gain_ratio)
+
+    mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0)
+
+    return mixed_audio
+
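The gain staging above works in the log domain and converts each dB change to a linear amplitude factor with 10**(dB/20) (amplitude gains use /20, while the dbfs measurement itself is a power quantity, hence the 10/ln(10) coefficient). A worked example in plain Python, with invented measurement values:

    # Assumed measurements (hypothetical values for illustration):
    audio_mean_dbfs = -30.0           # measured speech level
    target_audio_dbfs = -20.0         # sampled from [min_audio_dbfs, max_audio_dbfs]
    audio_gain_db = target_audio_dbfs - audio_mean_dbfs    # +10 dB
    audio_gain_ratio = 10.0 ** (audio_gain_db / 20.0)      # ~3.16x amplitude

    noise_mean_dbfs = -15.0
    target_snr_db = 12.0              # sampled from [min_snr_db, max_snr_db]
    target_noise_dbfs = target_audio_dbfs - target_snr_db  # -32 dbfs
    noise_gain_db = target_noise_dbfs - noise_mean_dbfs    # -17 dB
    noise_gain_ratio = 10.0 ** (noise_gain_db / 20.0)      # ~0.141x amplitude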
+ """ + frame_length = int(Config.audio_window_samples) + frame_step = int(Config.audio_step_samples) + fft_length = 512 + spectrogram = tf.reshape(spectrogram, shape=[1, -1, 257]) + abs_spectrogram = tf.abs(spectrogram) + + def reconstruct_phases(prev_phases): + xi = tf.complex(abs_spectrogram, 0.0) * prev_phases + audio = tf.signal.inverse_stft(xi, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length) + next_xi = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length) + next_phases = tf.math.exp(tf.complex(0.0, tf.angle(next_xi))) + return next_phases + + rands = tfv1.random_uniform(tf.shape(spectrogram), dtype=tf.float32) + phases = tf.math.exp(tf.complex(0.0, 2.0 * np.pi * rands)) + + reconstructed_phases = tf.while_loop(lambda _: True, reconstruct_phases, [phases], maximum_iterations=10) + xi = tf.complex(abs_spectrogram, 0.0) * reconstructed_phases + audio = tf.signal.inverse_stft(xi, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length) + return tf.transpose(audio) diff --git a/util/feeding.py b/util/feeding.py index f8e8a7aebc..eb46ecdd72 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -16,8 +16,7 @@ from util.flags import FLAGS from util.spectrogram_augmentations import augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up from util.audio import read_frames_from_file, vad_split, DEFAULT_FORMAT -from util.audio_augmentation import augment_noise, create_noise_iterator, get_dbfs -from util.logging import log_info +from util.audio_augmentation import augment_noise, create_noise_iterator, gla def read_csvs(csv_files): @@ -63,29 +62,38 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False): mfccs = contrib_audio.mfcc(spectrogram, sample_rate, dct_coefficient_count=Config.n_input) mfccs = tf.reshape(mfccs, [-1, Config.n_input]) - return mfccs, tf.shape(input=mfccs)[0] + review_audio = samples + if FLAGS.augmentation_review_audio_steps and train_phase and any([ + FLAGS.augmentation_spec_dropout_keeprate < 1, + FLAGS.augmentation_freq_and_time_masking, + FLAGS.augmentation_pitch_and_tempo_scaling, + FLAGS.augmentation_speed_up_std > 0]): + review_audio = gla(spectrogram) + return mfccs, tf.shape(input=mfccs)[0], review_audio -def audiofile_to_features(wav_filename, audio_dbfs, train_phase=False, noise_iterator=None): + +def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None): samples = tf.io.read_file(wav_filename) decoded = contrib_audio.decode_wav(samples, desired_channels=1) audio = decoded.audio # augment audio if noise_iterator: - noise, noise_dbfs = noise_iterator.get_next() + noise = noise_iterator.get_next() audio = augment_noise( audio, - audio_dbfs, noise, - noise_dbfs, - max_audio_gain_db=FLAGS.audio_aug_mix_noise_max_audio_gain_db, - min_audio_gain_db=FLAGS.audio_aug_mix_noise_min_audio_gain_db, - max_snr_db=FLAGS.audio_aug_mix_noise_max_snr_db, + min_audio_dbfs=FLAGS.audio_aug_mix_noise_min_audio_dbfs, + max_audio_dbfs=FLAGS.audio_aug_mix_noise_max_audio_dbfs, min_snr_db=FLAGS.audio_aug_mix_noise_min_snr_db, + max_snr_db=FLAGS.audio_aug_mix_noise_max_snr_db, + limit_audio_peak_dbfs=FLAGS.audio_aug_mix_noise_limit_audio_peak_dbfs, + limit_noise_peak_dbfs=FLAGS.audio_aug_mix_noise_limit_noise_peak_dbfs, + sample_rate=FLAGS.audio_sample_rate, ) - features, features_len = samples_to_mfccs(audio, decoded.sample_rate, train_phase=train_phase) + features, features_len, review_audio = samples_to_mfccs(audio, decoded.sample_rate, 
train_phase=train_phase) # augment features if train_phase: @@ -95,13 +103,13 @@ def audiofile_to_features(wav_filename, audio_dbfs, train_phase=False, noise_ite if FLAGS.data_aug_features_additive > 0: features = features+tf.random.normal(mean=0.0, stddev=FLAGS.data_aug_features_additive, shape=tf.shape(features)) - return features, features_len + return features, features_len, review_audio -def entry_to_features(wav_filename, transcript, audio_dbfs, train_phase, noise_iterator): +def entry_to_features(wav_filename, transcript, train_phase, noise_iterator): # https://bugs.python.org/issue32117 - features, features_len = audiofile_to_features(wav_filename, audio_dbfs, train_phase=train_phase, noise_iterator=noise_iterator) - return wav_filename, features, features_len, tf.SparseTensor(*transcript) + features, features_len, review_audio = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator) + return wav_filename, features, features_len, tf.SparseTensor(*transcript), review_audio def to_sparse_tuple(sequence): @@ -113,7 +121,7 @@ def to_sparse_tuple(sequence): return indices, sequence, shape -def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_dirs=None): +def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_dirs_or_files=None): df = read_csvs(csvs) df.sort_values(by='wav_filesize', inplace=True) @@ -130,18 +138,26 @@ def sparse_reshape(sparse): shape = sparse.dense_shape return tf.sparse.reshape(sparse, [shape[0], shape[2]]) - def batch_fn(wav_filenames, features, features_len, transcripts): + def batch_fn(wav_filenames, features, features_len, transcripts, review_audios): features = tf.data.Dataset.zip((features, features_len)) features = features.padded_batch(batch_size, padded_shapes=([None, Config.n_input], [])) transcripts = transcripts.batch(batch_size).map(sparse_reshape) wav_filenames = wav_filenames.batch(batch_size) - return tf.data.Dataset.zip((wav_filenames, features, transcripts)) + + # In order not to waste too much prefetch performance, randomly extract only `one` audio for each step + if FLAGS.augmentation_review_audio_steps and batch_size > 1: + skip_size = tf.random.uniform(shape=[], minval=0, maxval=batch_size - 1, dtype=tf.int64) + review_audio = review_audios.skip(skip_size).batch(1) + else: + review_audio = review_audios.batch(1) + + return tf.data.Dataset.zip((wav_filenames, features, transcripts, review_audio)) num_gpus = len(Config.available_devices) - if noise_dirs: - noise_iterator = create_noise_iterator(noise_dirs) + if noise_dirs_or_files: + noise_iterator = create_noise_iterator(noise_dirs_or_files, read_csvs) else: noise_iterator = None @@ -150,14 +166,6 @@ def batch_fn(wav_filenames, features, features_len, transcripts): dataset = tf.data.Dataset.from_generator(generate_values, output_types=(tf.string, (tf.int64, tf.int32, tf.int64))) - if noise_dirs: - dataset = (dataset.map(lambda wav_filename, transcript: (wav_filename, transcript, get_dbfs(wav_filename)), - num_parallel_calls=tf.data.experimental.AUTOTUNE) - .cache()) - else: - dataset = dataset.map(lambda wav_filename, transcript: (wav_filename, transcript, 0.0), - num_parallel_calls=tf.data.experimental.AUTOTUNE) - dataset = dataset.map( process_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) @@ -190,7 +198,7 @@ def generate_values(): yield time_start, time_end, samples def to_mfccs(time_start, time_end, samples): - features, features_len = samples_to_mfccs(samples, 
sample_rate) + features, features_len, _ = samples_to_mfccs(samples, sample_rate) return time_start, time_end, features, features_len def create_batch_set(bs, criteria): diff --git a/util/flags.py b/util/flags.py index 69fb81182f..176ee53c05 100644 --- a/util/flags.py +++ b/util/flags.py @@ -24,13 +24,15 @@ def create_flags(): # Data Augmentation # ================ - f.DEFINE_string('audio_aug_mix_noise_walk_train_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_string('audio_aug_mix_noise_walk_dev_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_string('audio_aug_mix_noise_walk_test_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_float('audio_aug_mix_noise_max_snr_db', 20, 'to limit noise max volume', lower_bound=0.0) - f.DEFINE_float('audio_aug_mix_noise_min_snr_db', 3, 'to limit noise min volume', lower_bound=0.0) - f.DEFINE_float('audio_aug_mix_noise_max_audio_gain_db', 5, 'to limit audio max volume') - f.DEFINE_float('audio_aug_mix_noise_min_audio_gain_db', -10, 'to limit audio min volume') + f.DEFINE_string('audio_aug_mix_noise_train_dirs_or_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into train dataset, if empty, no mix will be run in train phase') + f.DEFINE_string('audio_aug_mix_noise_dev_dirs_or_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into dev dataset, if empty, no mix will be run in dev phase') + f.DEFINE_string('audio_aug_mix_noise_test_dirs_or_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into test dataset, if empty, no mix will be run in test phase') + f.DEFINE_float('audio_aug_mix_noise_min_audio_dbfs', 0, 'min value of dbfs to specify the min volume of audio during gaining audio') + f.DEFINE_float('audio_aug_mix_noise_max_audio_dbfs', -35, 'max value of dbfs to specify the max volume of audio during gaining audio') + f.DEFINE_float('audio_aug_mix_noise_min_snr_db', 3, 'min value of db to specify the min signal-to-noise ratio during gaining audio and noise') + f.DEFINE_float('audio_aug_mix_noise_max_snr_db', 30, 'max value of db to specify the max signal-to-noise ratio during gaining audio and noise') + f.DEFINE_float('audio_aug_mix_noise_limit_audio_peak_dbfs', 7.0, 'max value of dbfs to specify the limitation of max audio dbfs of chunks, the audio volume will not gain over than the specified value') + f.DEFINE_float('audio_aug_mix_noise_limit_noise_peak_dbfs', 3.0, 'max value of dbfs to specify the limitation of max noise dbfs of chunks, the noise volume will not gain over than the specified value') f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise') f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise') @@ -50,6 +52,8 @@ def create_flags(): f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_pitch', 1.2, 'max value of pitch scaling') f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_tempo', 1.2, 'max vlaue of tempo scaling') + f.DEFINE_integer('augmentation_review_audio_steps', 0, 'number of audio, push the audio into summary directory (if 0, no audio will be dumped)') + # Global Constants # ================ From 42bc45b198cdc3a3eff41fe369bf88d16b325163 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Thu, 19 Mar 2020 15:04:09 +0800 Subject: [PATCH 18/25] [FIX] audiofile_to_features & 
samples_to_mfccs return 3 values now, add FLAGS.train_augmentation_files as condition to judge cache dataset or not, change constant to FLAGS [MOD] rename variables --- DeepSpeech.py | 23 ++++++++++++----------- evaluate.py | 4 ++-- util/audio_augmentation.py | 24 ++++++++++++------------ util/feeding.py | 22 +++++++++++----------- util/flags.py | 20 ++++++++++---------- 5 files changed, 47 insertions(+), 46 deletions(-) diff --git a/DeepSpeech.py b/DeepSpeech.py index 28042fa506..c0897cf4a8 100755 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -316,8 +316,8 @@ def get_tower_results(iterator, optimizer, dropout_rates): tower_non_finite_files.append(non_finite_files) avg_loss_across_towers = tf.reduce_mean(input_tensor=tower_avg_losses, axis=0) - if FLAGS.augmentation_review_audio_steps: - tfv1.summary.audio(name='step_audio', tensor=review_audio, sample_rate=16000, collections=['step_audio_summaries']) + if FLAGS.review_audio_steps: + tfv1.summary.audio(name='step_audio', tensor=review_audio, sample_rate=FLAGS.audio_sample_rate, collections=['step_audio_summaries']) tfv1.summary.scalar(name='step_loss', tensor=avg_loss_across_towers, collections=['step_summaries']) all_non_finite_files = tf.concat(tower_non_finite_files, axis=0) @@ -430,7 +430,8 @@ def train(): FLAGS.augmentation_spec_dropout_keeprate < 1 or FLAGS.augmentation_freq_and_time_masking or FLAGS.augmentation_pitch_and_tempo_scaling or - FLAGS.augmentation_speed_up_std > 0): + FLAGS.augmentation_speed_up_std > 0 or + FLAGS.train_augmentation_files): do_cache_dataset = False # Create training and validation datasets @@ -439,7 +440,7 @@ def train(): enable_cache=FLAGS.feature_cache and do_cache_dataset, cache_path=FLAGS.feature_cache, train_phase=True, - noise_dirs_or_files=FLAGS.audio_aug_mix_noise_train_dirs_or_files) + noise_sources=FLAGS.train_augmentation_files) iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set), tfv1.data.get_output_shapes(train_set), @@ -450,7 +451,7 @@ def train(): if FLAGS.dev_files: dev_csvs = FLAGS.dev_files.split(',') - dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_dirs_or_files=FLAGS.audio_aug_mix_noise_dev_dirs_or_files) for csv in dev_csvs] + dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_sources=FLAGS.dev_augmentation_files) for csv in dev_csvs] dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets] # Dropout @@ -598,15 +599,15 @@ def __call__(self, progress, data, **kwargs): # Batch loop - i_audio_steps = 0 + audio_summary_steps = 0 while True: try: step_audio_summary = None - if i_audio_steps < FLAGS.augmentation_review_audio_steps and epoch == 0: + if audio_summary_steps < FLAGS.review_audio_steps and epoch == 0: _, current_step, batch_loss, problem_files, step_summary, step_audio_summary = \ session.run([train_op, global_step, loss, non_finite_files, step_summaries_op, step_audio_summaries_op], feed_dict=feed_dict) - i_audio_steps += 1 + audio_summary_steps += 1 else: _, current_step, batch_loss, problem_files, step_summary = \ session.run([train_op, global_step, loss, non_finite_files, step_summaries_op], @@ -689,7 +690,7 @@ def __call__(self, progress, data, **kwargs): def test(): - samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_dirs_or_files=FLAGS.audio_aug_mix_noise_test_dirs_or_files) + samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_sources=FLAGS.test_augmentation_files) if 
FLAGS.test_output_file: # Save decoded tuples as JSON, converting NumPy floats to Python floats json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float) @@ -701,7 +702,7 @@ def create_inference_graph(batch_size=1, n_steps=16, tflite=False): # Create feature computation graph input_samples = tfv1.placeholder(tf.float32, [Config.audio_window_samples], 'input_samples') samples = tf.expand_dims(input_samples, -1) - mfccs, _ = samples_to_mfccs(samples, FLAGS.audio_sample_rate) + mfccs, _, _ = samples_to_mfccs(samples, FLAGS.audio_sample_rate) mfccs = tf.identity(mfccs, name='mfccs') # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input] @@ -912,7 +913,7 @@ def do_single_file_inference(input_file_path): print('Could not load checkpoint from {}'.format(FLAGS.checkpoint_dir)) sys.exit(1) - features, features_len = audiofile_to_features(input_file_path, 0.0) + features, features_len, _ = audiofile_to_features(input_file_path) previous_state_c = np.zeros([1, Config.n_cell_dim]) previous_state_h = np.zeros([1, Config.n_cell_dim]) diff --git a/evaluate.py b/evaluate.py index 94348e088c..d8a3ec853d 100755 --- a/evaluate.py +++ b/evaluate.py @@ -41,7 +41,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet): return [alphabet.decode(res) for res in results] -def evaluate(test_csvs, create_model, try_loading, noise_dirs_or_files=None): +def evaluate(test_csvs, create_model, try_loading, noise_sources=None): if FLAGS.lm_binary_path: scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path, FLAGS.lm_trie_path, @@ -50,7 +50,7 @@ def evaluate(test_csvs, create_model, try_loading, noise_dirs_or_files=None): scorer = None test_csvs = FLAGS.test_files.split(',') - test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_dirs_or_files=noise_dirs_or_files) for csv in test_csvs] + test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources) for csv in test_csvs] iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]), tfv1.data.get_output_shapes(test_sets[0]), output_classes=tfv1.data.get_output_classes(test_sets[0])) diff --git a/util/audio_augmentation.py b/util/audio_augmentation.py index 7f7705e825..fa3b39a528 100644 --- a/util/audio_augmentation.py +++ b/util/audio_augmentation.py @@ -57,7 +57,7 @@ def audio_to_dbfs(audio, sample_rate=16000, chunk_ms=100, reduce_funcs=tf.reduce return [DBFS_COEF * tf.math.log(reduce + 1e-8) for reduce in reduces] -def create_noise_iterator(noise_dirs_or_files, read_csvs_func): +def create_noise_iterator(noise_sources, read_csvs_func): r"""Create an iterator to yield audio Args: @@ -67,10 +67,10 @@ def create_noise_iterator(noise_dirs_or_files, read_csvs_func): Returns: An one shot iterator of audio with 2-D Tensor of shape [`time-step`, 1], use `.get_next()` to get the Tensor. 
""" - if isinstance(noise_dirs_or_files, str): - noise_dirs_or_files = noise_dirs_or_files.split(',') + if isinstance(noise_sources, str): + noise_sources = noise_sources.split(',') - noise_filenames = tf.convert_to_tensor(list(collect_noise_filenames(noise_dirs_or_files, read_csvs_func)), dtype=tf.string) + noise_filenames = tf.convert_to_tensor(list(collect_noise_filenames(noise_sources, read_csvs_func)), dtype=tf.string) log_info("Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) @@ -82,7 +82,7 @@ def create_noise_iterator(noise_dirs_or_files, read_csvs_func): return noise_iterator -def collect_noise_filenames(dirs_or_files, read_csvs_func): +def collect_noise_filenames(sources, read_csvs_func): r"""Collect wav filenames from directories or csv files Args: @@ -93,17 +93,17 @@ def collect_noise_filenames(dirs_or_files, read_csvs_func): An iterator of str, yield every filename suffix with `.wav` or under `wav_filename` column of DataFrame """ - assert isinstance(dirs_or_files, (list, tuple)) + assert isinstance(sources, (list, tuple)) - for dir_or_file in dirs_or_files: - assert os.path.exists(dir_or_file) - if os.path.isdir(dir_or_file): - for dirpath, _, filenames in os.walk(dir_or_file): + for source in sources: + assert os.path.exists(source) + if os.path.isdir(source): + for dirpath, _, filenames in os.walk(source): for filename in filenames: if filename.endswith('.wav'): yield os.path.join(dirpath, filename) - elif os.path.isfile(dir_or_file): - df = read_csvs_func([dir_or_file]) + elif os.path.isfile(source): + df = read_csvs_func([source]) for filename in df['wav_filename']: yield filename diff --git a/util/feeding.py b/util/feeding.py index eb46ecdd72..24fd59f0ed 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -63,7 +63,7 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False): mfccs = tf.reshape(mfccs, [-1, Config.n_input]) review_audio = samples - if FLAGS.augmentation_review_audio_steps and train_phase and any([ + if FLAGS.review_audio_steps and train_phase and any([ FLAGS.augmentation_spec_dropout_keeprate < 1, FLAGS.augmentation_freq_and_time_masking, FLAGS.augmentation_pitch_and_tempo_scaling, @@ -84,12 +84,12 @@ def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None): audio = augment_noise( audio, noise, - min_audio_dbfs=FLAGS.audio_aug_mix_noise_min_audio_dbfs, - max_audio_dbfs=FLAGS.audio_aug_mix_noise_max_audio_dbfs, - min_snr_db=FLAGS.audio_aug_mix_noise_min_snr_db, - max_snr_db=FLAGS.audio_aug_mix_noise_max_snr_db, - limit_audio_peak_dbfs=FLAGS.audio_aug_mix_noise_limit_audio_peak_dbfs, - limit_noise_peak_dbfs=FLAGS.audio_aug_mix_noise_limit_noise_peak_dbfs, + min_audio_dbfs=FLAGS.audio_aug_min_audio_dbfs, + max_audio_dbfs=FLAGS.audio_aug_max_audio_dbfs, + min_snr_db=FLAGS.audio_aug_min_snr_db, + max_snr_db=FLAGS.audio_aug_max_snr_db, + limit_audio_peak_dbfs=FLAGS.audio_aug_limit_audio_peak_dbfs, + limit_noise_peak_dbfs=FLAGS.audio_aug_limit_noise_peak_dbfs, sample_rate=FLAGS.audio_sample_rate, ) @@ -121,7 +121,7 @@ def to_sparse_tuple(sequence): return indices, sequence, shape -def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_dirs_or_files=None): +def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_sources=None): df = read_csvs(csvs) df.sort_values(by='wav_filesize', inplace=True) @@ -146,7 +146,7 @@ def batch_fn(wav_filenames, features, 
features_len, transcripts, review_audios): wav_filenames = wav_filenames.batch(batch_size) # In order not to waste too much prefetch performance, randomly extract only `one` audio for each step - if FLAGS.augmentation_review_audio_steps and batch_size > 1: + if FLAGS.review_audio_steps and batch_size > 1: skip_size = tf.random.uniform(shape=[], minval=0, maxval=batch_size - 1, dtype=tf.int64) review_audio = review_audios.skip(skip_size).batch(1) else: @@ -156,8 +156,8 @@ def batch_fn(wav_filenames, features, features_len, transcripts, review_audios): num_gpus = len(Config.available_devices) - if noise_dirs_or_files: - noise_iterator = create_noise_iterator(noise_dirs_or_files, read_csvs) + if noise_sources: + noise_iterator = create_noise_iterator(noise_sources, read_csvs) else: noise_iterator = None diff --git a/util/flags.py b/util/flags.py index 176ee53c05..740894edd5 100644 --- a/util/flags.py +++ b/util/flags.py @@ -24,15 +24,15 @@ def create_flags(): # Data Augmentation # ================ - f.DEFINE_string('audio_aug_mix_noise_train_dirs_or_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into train dataset, if empty, no mix will be run in train phase') - f.DEFINE_string('audio_aug_mix_noise_dev_dirs_or_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into dev dataset, if empty, no mix will be run in dev phase') - f.DEFINE_string('audio_aug_mix_noise_test_dirs_or_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into test dataset, if empty, no mix will be run in test phase') - f.DEFINE_float('audio_aug_mix_noise_min_audio_dbfs', 0, 'min value of dbfs to specify the min volume of audio during gaining audio') - f.DEFINE_float('audio_aug_mix_noise_max_audio_dbfs', -35, 'max value of dbfs to specify the max volume of audio during gaining audio') - f.DEFINE_float('audio_aug_mix_noise_min_snr_db', 3, 'min value of db to specify the min signal-to-noise ratio during gaining audio and noise') - f.DEFINE_float('audio_aug_mix_noise_max_snr_db', 30, 'max value of db to specify the max signal-to-noise ratio during gaining audio and noise') - f.DEFINE_float('audio_aug_mix_noise_limit_audio_peak_dbfs', 7.0, 'max value of dbfs to specify the limitation of max audio dbfs of chunks, the audio volume will not gain over than the specified value') - f.DEFINE_float('audio_aug_mix_noise_limit_noise_peak_dbfs', 3.0, 'max value of dbfs to specify the limitation of max noise dbfs of chunks, the noise volume will not gain over than the specified value') + f.DEFINE_string('train_augmentation_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into train dataset, if empty, no mix will be run in train phase') + f.DEFINE_string('dev_augmentation_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into dev dataset, if empty, no mix will be run in dev phase') + f.DEFINE_string('test_augmentation_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into test dataset, if empty, no mix will be run in test phase') + f.DEFINE_float('audio_aug_min_audio_dbfs', 0, 'min value of dbfs to specify the min volume of audio during gaining audio') + f.DEFINE_float('audio_aug_max_audio_dbfs', -35, 'max value of dbfs to specify the max volume of audio during gaining audio') + f.DEFINE_float('audio_aug_min_snr_db', 3, 'min value of db to specify the min 
signal-to-noise ratio during gaining audio and noise') + f.DEFINE_float('audio_aug_max_snr_db', 30, 'max value of db to specify the max signal-to-noise ratio during gaining audio and noise') + f.DEFINE_float('audio_aug_limit_audio_peak_dbfs', 7.0, 'max value of dbfs to specify the limitation of max audio dbfs of chunks, the audio volume will not gain over than the specified value') + f.DEFINE_float('audio_aug_limit_noise_peak_dbfs', 3.0, 'max value of dbfs to specify the limitation of max noise dbfs of chunks, the noise volume will not gain over than the specified value') f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise') f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise') @@ -52,7 +52,7 @@ def create_flags(): f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_pitch', 1.2, 'max value of pitch scaling') f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_tempo', 1.2, 'max vlaue of tempo scaling') - f.DEFINE_integer('augmentation_review_audio_steps', 0, 'number of audio, push the audio into summary directory (if 0, no audio will be dumped)') + f.DEFINE_integer('review_audio_steps', 0, 'number of audio, push the audio into summary directory (if 0, no audio will be dumped)') # Global Constants # ================ From 289722dc2ec81f4f446e152027b9dfdd263cb8ac Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 29 Mar 2020 12:49:40 +0200 Subject: [PATCH 19/25] Fix issues. --- DeepSpeech.py | 8 ++++++-- evaluate.py | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/DeepSpeech.py b/DeepSpeech.py index 92404e07a6..74fcc62b5e 100644 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -239,7 +239,10 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse): logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl) # Compute the CTC loss using TensorFlow's `ctc_loss` - total_loss = tfv1.nn.ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len) + total_loss = tfv1.nn.ctc_loss(labels=batch_y, + inputs=logits, + sequence_length=batch_seq_len, + ignore_longer_outputs_than_inputs=True) # Check if any files lead to non finite loss non_finite_files = tf.gather(batch_filenames, tfv1.where(~tf.math.is_finite(total_loss))) @@ -556,7 +559,8 @@ def __call__(self, progress, data, **kwargs): log_info("Ignoring sparse warp error: {}".format(err)) continue else: - raise + print("Ignoring error:", err) + continue except tf.errors.OutOfRangeError: exception_box.raise_if_set() break diff --git a/evaluate.py b/evaluate.py index d0ce3231c2..94a6b6f027 100755 --- a/evaluate.py +++ b/evaluate.py @@ -70,8 +70,9 @@ def evaluate(test_csvs, create_model): transposed = tf.nn.softmax(tf.transpose(a=logits, perm=[1, 0, 2])) loss = tfv1.nn.ctc_loss(labels=batch_y, - inputs=logits, - sequence_length=batch_x_len) + inputs=logits, + sequence_length=batch_x_len, + ignore_longer_outputs_than_inputs=True) tfv1.train.get_or_create_global_step() @@ -110,6 +111,9 @@ def run_test(init_op, dataset): session.run([batch_wav_filename, transposed, loss, batch_x_len, batch_y]) except tf.errors.OutOfRangeError: break + except tf.errors.InvalidArgumentError as e: + print("Ignoring error:", e) + continue decoded = ctc_beam_search_decoder_batch(batch_logits, batch_lengths, Config.alphabet, FLAGS.beam_width, num_processes=num_processes, scorer=scorer, From 9334e79f2888fc2c48c3f0815b56ab15205369c7 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 29 Mar 2020 12:53:02 +0200 
Subject: [PATCH 20/25] Save invalid files. --- DeepSpeech.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/DeepSpeech.py b/DeepSpeech.py index 74fcc62b5e..efaae80a4c 100644 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -570,6 +570,12 @@ def __call__(self, progress, data, **kwargs): log_error('The following files caused an infinite (or NaN) ' 'loss: {}'.format(','.join(problem_files))) + # Save invalid files + sys.path.append("/DeepSpeech/deepspeech-german/training/") + from filter_invalid_files import add_files_to_excluded + add_files_to_excluded(problem_files) + sys.exit(1) + total_loss += batch_loss step_count += 1 From 40b431b1aefb98d1b7163c373b2a9a4f3a813623 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 29 Mar 2020 19:26:06 +0200 Subject: [PATCH 21/25] Fix merging errors. --- util/feeding.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/util/feeding.py b/util/feeding.py index 3e926b862a..dea2669049 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -2,8 +2,10 @@ from __future__ import absolute_import, division, print_function from functools import partial +import os import numpy as np +import pandas import tensorflow as tf from tensorflow.python.ops import gen_audio_ops as contrib_audio @@ -18,6 +20,18 @@ from util.audio_augmentation import augment_noise, create_noise_iterator, gla +def read_csvs(csv_files): + sets = [] + for csv in csv_files: + file = pandas.read_csv(csv, encoding='utf-8', na_filter=False) + #FIXME: not cross-platform + csv_dir = os.path.dirname(os.path.abspath(csv)) + file['wav_filename'] = file['wav_filename'].str.replace(r'(^[^/])', lambda m: os.path.join(csv_dir, m.group(1))) # pylint: disable=cell-var-from-loop + sets.append(file) + # Concat all sets, drop any extra columns, re-index the final result as 0..N + return pandas.concat(sets, join='inner', ignore_index=True) + + def samples_to_mfccs(samples, sample_rate, train_phase=False, sample_id=None): if train_phase: # We need the lambdas to make TensorFlow happy. 
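The regex in the re-added read_csvs prefixes relative wav paths with the CSV file's own directory while leaving absolute paths alone; as the FIXME notes, testing for a leading '/' is POSIX-only. A pure-Python sketch of the same resolution (hypothetical helper, illustration only):

    import os

    def resolve_wav_path(csv_path, wav_filename):
        # The regex r'(^[^/])' only matches when the first character is not a
        # slash, so absolute paths pass through untouched while relative
        # entries are joined onto the CSV's directory.
        if wav_filename.startswith('/'):
            return wav_filename
        return os.path.join(os.path.dirname(os.path.abspath(csv_path)), wav_filename)

    print(resolve_wav_path('/data/train.csv', 'clips/sample-0001.wav'))
    # -> /data/clips/sample-0001.wav
    print(resolve_wav_path('/data/train.csv', '/abs/path/sample.wav'))
    # -> /abs/path/sample.wav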
@@ -116,9 +130,9 @@ def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None):
     return audio_to_features(decoded.audio, decoded.sample_rate, train_phase=train_phase, sample_id=wav_filename, noise_iterator=noise_iterator)


-def entry_to_features(sample_id, audio, sample_rate, transcript, train_phase=False):
+def entry_to_features(sample_id, audio, sample_rate, transcript, train_phase=False, noise_iterator=None):
     # https://bugs.python.org/issue32117
-    features, features_len, review_audio = audio_to_features(audio, sample_rate, train_phase=train_phase, sample_id=sample_id)
+    features, features_len, review_audio = audio_to_features(audio, sample_rate, train_phase=train_phase, sample_id=sample_id, noise_iterator=noise_iterator)
     sparse_transcript = tf.SparseTensor(*transcript)
     return sample_id, features, features_len, sparse_transcript, review_audio

From f7d1279d2c4bd56d782db083a9d3d355f4239424 Mon Sep 17 00:00:00 2001
From: Yi-Hua Chiu
Date: Tue, 31 Mar 2020 17:15:13 +0800
Subject: [PATCH 22/25] [FIX] replace tqdm with progressbar [ADD] separate
 speech/noise mixing, add option to mix multiple noises into one audio [MOD]
 rename FLAGS, make the number of gla iterations optional

---
 DeepSpeech.py                |  10 +--
 bin/normalize_noise_audio.py |  14 ++--
 evaluate.py                  |   4 +-
 util/audio_augmentation.py   | 121 ++++++++++++++++++++++++-----------
 util/feeding.py              |  36 ++++++-----
 util/flags.py                |  28 +++++---
 6 files changed, 140 insertions(+), 73 deletions(-)

diff --git a/DeepSpeech.py b/DeepSpeech.py
index c0897cf4a8..5ea8a6318f 100755
--- a/DeepSpeech.py
+++ b/DeepSpeech.py
@@ -431,7 +431,8 @@ def train():
         FLAGS.augmentation_freq_and_time_masking or
         FLAGS.augmentation_pitch_and_tempo_scaling or
         FLAGS.augmentation_speed_up_std > 0 or
-        FLAGS.train_augmentation_files):
+        FLAGS.train_augmentation_noise_files or
+        FLAGS.train_augmentation_speech_files):
         do_cache_dataset = False

     # Create training and validation datasets
@@ -440,7 +441,8 @@ def train():
         enable_cache=FLAGS.feature_cache and do_cache_dataset,
         cache_path=FLAGS.feature_cache,
         train_phase=True,
-        noise_sources=FLAGS.train_augmentation_files)
+        noise_sources=FLAGS.train_augmentation_noise_files,
+        speech_sources=FLAGS.train_augmentation_speech_files)

     iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set),
                                                  tfv1.data.get_output_shapes(train_set),
@@ -451,7 +453,7 @@ def train():

     if FLAGS.dev_files:
         dev_csvs = FLAGS.dev_files.split(',')
-        dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_sources=FLAGS.dev_augmentation_files) for csv in dev_csvs]
+        dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_sources=FLAGS.dev_augmentation_noise_files, speech_sources=FLAGS.dev_augmentation_speech_files) for csv in dev_csvs]
         dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets]

     # Dropout
@@ -690,7 +692,7 @@ def __call__(self, progress, data, **kwargs):


 def test():
-    samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_sources=FLAGS.test_augmentation_files)
+    samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_sources=FLAGS.test_augmentation_noise_files, speech_sources=FLAGS.test_augmentation_speech_files)
     if FLAGS.test_output_file:
         # Save decoded tuples as JSON, converting NumPy floats to Python floats
         json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float)
diff --git a/bin/normalize_noise_audio.py b/bin/normalize_noise_audio.py
index b487ece01b..2a15fad562 100644
--- 
a/bin/normalize_noise_audio.py +++ b/bin/normalize_noise_audio.py @@ -3,7 +3,6 @@ # Make sure we can import stuff from util/ # This script needs to be run from the root of the DeepSpeech repository -from util.feeding import secs_to_hours from librosa import get_duration from multiprocessing import Pool from functools import partial @@ -11,13 +10,10 @@ import argparse import sys import os +import progressbar sys.path.insert(1, os.path.join(sys.path[0], '..')) -try: - import tqdm -except ImportError as err: - print('[ImportError] try `pip install tqdm`') - raise err +from util.feeding import secs_to_hours try: from pydub import AudioSegment @@ -152,8 +148,10 @@ def main(src_dir, max_duration_seconds=max_duration_seconds) pool = Pool(processes=None) - for _ in tqdm.tqdm(pool.imap_unordered(convert_func, filenames), total=len(filenames)): - pass + pbar = progressbar.ProgressBar(prefix='Preparing Noise Dataset', max_value=len(filenames)).start() + for i, _ in enumerate(pool.imap_unordered(convert_func, filenames)): + pbar.update(i) + pbar.finish() if __name__ == "__main__": diff --git a/evaluate.py b/evaluate.py index d8a3ec853d..113a748835 100755 --- a/evaluate.py +++ b/evaluate.py @@ -41,7 +41,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet): return [alphabet.decode(res) for res in results] -def evaluate(test_csvs, create_model, try_loading, noise_sources=None): +def evaluate(test_csvs, create_model, try_loading, noise_sources=None, speech_sources=None): if FLAGS.lm_binary_path: scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path, FLAGS.lm_trie_path, @@ -50,7 +50,7 @@ def evaluate(test_csvs, create_model, try_loading, noise_sources=None): scorer = None test_csvs = FLAGS.test_files.split(',') - test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources) for csv in test_csvs] + test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources, speech_sources=speech_sources) for csv in test_csvs] iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]), tfv1.data.get_output_shapes(test_sets[0]), output_classes=tfv1.data.get_output_classes(test_sets[0])) diff --git a/util/audio_augmentation.py b/util/audio_augmentation.py index fa3b39a528..6a98f887f4 100644 --- a/util/audio_augmentation.py +++ b/util/audio_augmentation.py @@ -109,13 +109,21 @@ def collect_noise_filenames(sources, read_csvs_func): def augment_noise(audio, - noise, - min_audio_dbfs=0.0, - max_audio_dbfs=-35.0, - min_snr_db=3.0, - max_snr_db=30.0, + noise_iterator=None, + speech_iterator=None, + min_n_noises=0, + max_n_noises=1, + min_n_speakers=0, + max_n_speakers=1, + min_audio_dbfs=-35.0, + max_audio_dbfs=0.0, + min_noise_snr_db=3.0, + max_noise_snr_db=30.0, + min_speech_snr_db=3.0, + max_speech_snr_db=30.0, limit_audio_peak_dbfs=7.0, limit_noise_peak_dbfs=3.0, + limit_speech_peak_dbfs=7.0, sample_rate=16000): r"""Mix audio Tensor with noise Tensor @@ -125,13 +133,21 @@ def augment_noise(audio, Args: audio: A 2-D Tensor with shape [`time-steps`, 1]. - noise: A 2-D Tensor with shape [`time-steps`, 1]. + noise_iterator: A one shot iterator for noise file, the yield item shape is [`time-steps`, 1]. + speech_iterator: A one shot iterator for speech file, the yield item shape is [`time-steps`, 1]. 
+        min_n_noises: An int, the min number of noise audios mixed into each audio
+        max_n_noises: An int, the max number of noise audios mixed into each audio
+        min_n_speakers: An int, the min number of speakers mixed into each audio
+        max_n_speakers: An int, the max number of speakers mixed into each audio
         min_audio_dbfs: A float in dbfs unit, specifying the `minimum` volume of audio during gaining audio.
         max_audio_dbfs: A float in dbfs unit, specifying the `maximum` volume of audio during gaining audio.
-        min_snr_db: A float in db unit, specifying the minimum signal-to-noise ratio during gaining audio and noise.
-        max_snr_db: A float in db unit, specifying the maximum signal-to-noise ratio during gaining audio and noise.
+        min_noise_snr_db: A float in db unit, specifying the `minimum` signal-to-noise ratio during gaining noise.
+        max_noise_snr_db: A float in db unit, specifying the `maximum` signal-to-noise ratio during gaining noise.
+        min_speech_snr_db: A float in db unit, specifying the `minimum` signal-to-noise ratio during gaining speech.
+        max_speech_snr_db: A float in db unit, specifying the `maximum` signal-to-noise ratio during gaining speech.
-        limit_audio_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the audio; the audio volume will not be gained above this value.
-        limit_noise_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the noise; the noise volume will not be gained above this value.
+        limit_audio_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the `audio`; the audio volume will not be gained above this value.
+        limit_noise_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the `noise`; the noise volume will not be gained above this value.
+        limit_speech_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the `speech`; the speech volume will not be gained above this value.
         sample_rate: An integer, specifying the audio sample rate used to determine the chunk size for the dbfs measurement. 
    Returns:
@@ -139,51 +155,84 @@
     """

     audio_len = tf.shape(audio)[0]
-    noise_len = tf.shape(noise)[0]

     audio_mean_dbfs, audio_max_dbfs = audio_to_dbfs(audio, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+    target_audio_dbfs = tfv1.random_uniform([], minval=min_audio_dbfs, maxval=max_audio_dbfs)
+    audio_gain_db = target_audio_dbfs - audio_mean_dbfs
+
+    # limit audio peak
+    audio_gain_db = tf.minimum(limit_audio_peak_dbfs - audio_max_dbfs, audio_gain_db)
+    target_audio_dbfs = audio_mean_dbfs + audio_gain_db
+    audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 20.0)
+    mixed_audio = tf.multiply(audio, audio_gain_ratio)
+
+    if noise_iterator:
+        n_noise = tfv1.random_uniform([], minval=min_n_noises, maxval=max_n_noises, dtype=tf.int32) if min_n_noises != max_n_noises else min_n_noises
+        def mix_noise_func(au):
+            noise = noise_iterator.get_next()
+            noise, noise_mean_dbfs, noise_max_dbfs = extract_noise(noise, audio_len, sample_rate)
+            return mix(au, target_audio_dbfs, noise, noise_mean_dbfs, noise_max_dbfs, min_noise_snr_db, max_noise_snr_db, limit_noise_peak_dbfs)
+        mixed_audio = tf.while_loop(lambda _: True, mix_noise_func, [mixed_audio], maximum_iterations=n_noise)
+
+    if speech_iterator:
+        n_speakers = tfv1.random_uniform([], minval=min_n_speakers, maxval=max_n_speakers, dtype=tf.int32) if min_n_speakers != max_n_speakers else min_n_speakers
+        def mix_speech_func(au):
+            speech = speech_iterator.get_next()
+            speech, speech_mean_dbfs, speech_max_dbfs = extract_noise(speech, audio_len, sample_rate)
+            return mix(au, target_audio_dbfs, speech, speech_mean_dbfs, speech_max_dbfs, min_speech_snr_db, max_speech_snr_db, limit_speech_peak_dbfs)
+        mixed_audio = tf.while_loop(lambda _: True, mix_speech_func, [mixed_audio], maximum_iterations=n_speakers)
+
+    mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0)
+
+    return mixed_audio
+
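Both tf.while_loop calls above use a condition that is always True, so maximum_iterations alone bounds how many noise or speech segments get mixed in, and the loop variable threads the partially mixed audio through each iteration. A standalone sketch of the idiom (assuming a TF 1.x runtime, as in this codebase; illustration only):

    import tensorflow as tf
    import tensorflow.compat.v1 as tfv1

    x = tf.zeros([4, 1])
    n = tfv1.random_uniform([], minval=1, maxval=3, dtype=tf.int32)  # like n_noise

    # The condition never terminates the loop on its own; `maximum_iterations`
    # bounds it instead, and `x` plays the role of the partially mixed audio.
    y = tf.while_loop(lambda _: True, lambda au: au + 1.0, [x], maximum_iterations=n)

    with tfv1.Session() as sess:
        print(sess.run(y))  # x incremented n times (here: once or twice)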
+def extract_noise(noise, audio_len, sample_rate=16000):
+    r"""Prepare a noise segment that can be mixed into the audio
+
+    Args:
+        noise: A 2-D Tensor with shape [`time-steps`, 1]
+        audio_len: A tf.int32 scalar, the audio length
+        sample_rate: An integer, specifying the audio sample rate used to determine the chunk size for the dbfs measurement.
+
+    Returns:
+        A 2-D Tensor with shape [`audio_len`, 1].
+        A float, the extracted noise mean dbfs
+        A float, the extracted noise max dbfs
+    """
+    noise_len = tf.shape(noise)[0]
     multiply = tf.math.floordiv(audio_len, noise_len) + 1
     noise_tile = tf.tile(noise, [multiply, 1])

-    # Now, noise_len must > audio_len
     noise_tile_len = tf.shape(noise_tile)[0]

     mix_decoded_start_point = tfv1.random_uniform([], minval=0, maxval=noise_tile_len-audio_len, dtype=tf.int32)
     mix_decoded_end_point = mix_decoded_start_point + audio_len
-    extract_noise = noise_tile[mix_decoded_start_point:mix_decoded_end_point, :]
-
-    extract_noise_mean_dbfs, extract_noise_max_dbfs = audio_to_dbfs(extract_noise, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
-
-    target_audio_dbfs = tfv1.random_uniform([], minval=min_audio_dbfs, maxval=max_audio_dbfs)
+    extracted_noise = noise_tile[mix_decoded_start_point:mix_decoded_end_point, :]
+    extracted_noise_mean_dbfs, extracted_noise_max_dbfs = audio_to_dbfs(extracted_noise, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+    return extracted_noise, extracted_noise_mean_dbfs, extracted_noise_max_dbfs

-    audio_gain_db = target_audio_dbfs - audio_mean_dbfs
+def mix(audio, audio_dbfs, noise, noise_mean_dbfs, noise_max_dbfs, min_noise_snr_db, max_noise_snr_db, limit_noise_peak_dbfs):
+    r"""Mix a gained `noise` into `audio`; the input audio length must equal the noise length

-    # limit audio peak
-    audio_gain_db = tf.minimum(limit_audio_peak_dbfs - audio_max_dbfs, audio_gain_db)
-    target_audio_dbfs = audio_mean_dbfs + audio_gain_db
-
-    audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 20.0)
+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1]. Has the same type and shape as `audio`.
+    """

     # target_snr_db := target_audio_dbfs - target_noise_dbfs
-    target_snr_db = tfv1.random_uniform([], minval=min_snr_db, maxval=max_snr_db)
+    target_snr_db = tfv1.random_uniform([], minval=min_noise_snr_db, maxval=max_noise_snr_db)

-    target_noise_dbfs = target_audio_dbfs - target_snr_db
-    noise_gain_db = target_noise_dbfs - extract_noise_mean_dbfs
+    target_noise_dbfs = audio_dbfs - target_snr_db
+    noise_gain_db = target_noise_dbfs - noise_mean_dbfs

     # limit noise peak
-    noise_gain_db = tf.minimum(limit_noise_peak_dbfs - extract_noise_max_dbfs, noise_gain_db)
+    noise_gain_db = tf.minimum(limit_noise_peak_dbfs - noise_max_dbfs, noise_gain_db)
     noise_gain_ratio = tf.math.pow(10.0, noise_gain_db / 20.0)

-    mixed_audio = tf.multiply(audio, audio_gain_ratio) + tf.multiply(extract_noise, noise_gain_ratio)
-
-    mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0)
-
-    return mixed_audio
+    audio += tf.multiply(noise, noise_gain_ratio)
+    return audio

-def gla(spectrogram):
-    r"""Use the Griffin-Lim algorithm to reconstruct audio; iterations are fixed at 10 so prefetch does not waste too much performance
+def gla(spectrogram, n_iter=10):
+    r"""Use the Griffin-Lim algorithm to reconstruct audio

     Args:
         spectrogram: A 3-D Tensor with shape [1, `time-steps`, `features`]. 
@@ -206,7 +255,7 @@ def reconstruct_phases(prev_phases):
     rands = tfv1.random_uniform(tf.shape(spectrogram), dtype=tf.float32)
     phases = tf.math.exp(tf.complex(0.0, 2.0 * np.pi * rands))

-    reconstructed_phases = tf.while_loop(lambda _: True, reconstruct_phases, [phases], maximum_iterations=10)
+    reconstructed_phases = tf.while_loop(lambda _: True, reconstruct_phases, [phases], maximum_iterations=n_iter)
     xi = tf.complex(abs_spectrogram, 0.0) * reconstructed_phases
     audio = tf.signal.inverse_stft(xi, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
     return tf.transpose(audio)
diff --git a/util/feeding.py b/util/feeding.py
index 24fd59f0ed..00ff40e316 100644
--- a/util/feeding.py
+++ b/util/feeding.py
@@ -68,28 +68,35 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False):
             FLAGS.augmentation_freq_and_time_masking,
             FLAGS.augmentation_pitch_and_tempo_scaling,
             FLAGS.augmentation_speed_up_std > 0]):
-        review_audio = gla(spectrogram)
+        review_audio = gla(spectrogram, FLAGS.review_audio_gla_iterations)

     return mfccs, tf.shape(input=mfccs)[0], review_audio


-def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None):
+def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None, speech_iterator=None):
     samples = tf.io.read_file(wav_filename)
     decoded = contrib_audio.decode_wav(samples, desired_channels=1)
     audio = decoded.audio

     # augment audio
-    if noise_iterator:
-        noise = noise_iterator.get_next()
+    if noise_iterator or speech_iterator:
         audio = augment_noise(
             audio,
-            noise,
+            noise_iterator,
+            speech_iterator,
+            min_n_noises=FLAGS.audio_aug_min_n_noises,
+            max_n_noises=FLAGS.audio_aug_max_n_noises,
+            min_n_speakers=FLAGS.audio_aug_min_n_speakers,
+            max_n_speakers=FLAGS.audio_aug_max_n_speakers,
             min_audio_dbfs=FLAGS.audio_aug_min_audio_dbfs,
             max_audio_dbfs=FLAGS.audio_aug_max_audio_dbfs,
-            min_snr_db=FLAGS.audio_aug_min_snr_db,
-            max_snr_db=FLAGS.audio_aug_max_snr_db,
+            min_noise_snr_db=FLAGS.audio_aug_min_noise_snr_db,
+            max_noise_snr_db=FLAGS.audio_aug_max_noise_snr_db,
+            min_speech_snr_db=FLAGS.audio_aug_min_speech_snr_db,
+            max_speech_snr_db=FLAGS.audio_aug_max_speech_snr_db,
             limit_audio_peak_dbfs=FLAGS.audio_aug_limit_audio_peak_dbfs,
             limit_noise_peak_dbfs=FLAGS.audio_aug_limit_noise_peak_dbfs,
+            limit_speech_peak_dbfs=FLAGS.audio_aug_limit_speech_peak_dbfs,
             sample_rate=FLAGS.audio_sample_rate,
         )

@@ -106,9 +113,9 @@ def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None):
     return features, features_len, review_audio


-def entry_to_features(wav_filename, transcript, train_phase, noise_iterator):
+def entry_to_features(wav_filename, transcript, train_phase, noise_iterator, speech_iterator):
     # https://bugs.python.org/issue32117
-    features, features_len, review_audio = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator)
+    features, features_len, review_audio = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator, speech_iterator=speech_iterator)
     return wav_filename, features, features_len, tf.SparseTensor(*transcript), review_audio

@@ -121,7 +128,7 @@ def to_sparse_tuple(sequence):
     return indices, sequence, shape


-def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_sources=None):
+def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_sources=None, speech_sources=None):
     df = read_csvs(csvs)
     df.sort_values(by='wav_filesize', inplace=True)
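A hypothetical call that wires the new noise_sources/speech_sources arguments together (paths and values invented for illustration):

    train_set = create_dataset(FLAGS.train_files.split(','),
                               batch_size=FLAGS.train_batch_size,
                               train_phase=True,
                               noise_sources='/data/noise_wavs,/data/noise_index.csv',
                               speech_sources='/data/other_speakers.csv')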
@@ -156,12 +163,11 @@ def batch_fn(wav_filenames, features, features_len, transcripts, review_audios):

     num_gpus = len(Config.available_devices)

-    if noise_sources:
-        noise_iterator = create_noise_iterator(noise_sources, read_csvs)
-    else:
-        noise_iterator = None
+    noise_iterator = create_noise_iterator(noise_sources, read_csvs) if noise_sources else None
+    speech_iterator = create_noise_iterator(speech_sources, read_csvs) if speech_sources else None

-    process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator)
+
+    process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator, speech_iterator=speech_iterator)

     dataset = tf.data.Dataset.from_generator(generate_values,
                                              output_types=(tf.string, (tf.int64, tf.int32, tf.int64)))
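Note that create_noise_iterator is called here but not defined anywhere in this series; it is assumed to expose the noise/speech sources as an endlessly repeating stream of decoded clips so that get_next() can be consumed once per training sample. A rough, hypothetical sketch of such a helper (the name, CSV column, and shuffle policy are assumptions, not the actual implementation):

    import tensorflow as tf
    from tensorflow.python.ops import gen_audio_ops as contrib_audio

    def create_noise_iterator_sketch(noise_sources, read_csvs_fn):
        # Hypothetical stand-in: decode each wav, reshuffle on every pass, repeat forever.
        wav_files = read_csvs_fn(noise_sources.split(','))['wav_filename'].tolist()

        def decode(wav_filename):
            samples = tf.io.read_file(wav_filename)
            return contrib_audio.decode_wav(samples, desired_channels=1).audio

        dataset = (tf.data.Dataset.from_tensor_slices(tf.constant(wav_files))
                   .shuffle(buffer_size=len(wav_files))
                   .map(decode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
                   .repeat())
        return tf.compat.v1.data.make_one_shot_iterator(dataset)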
diff --git a/util/flags.py b/util/flags.py
index 740894edd5..1b1b61c6e3 100644
--- a/util/flags.py
+++ b/util/flags.py
@@ -24,15 +24,26 @@ def create_flags():
     # Data Augmentation
     # ================

-    f.DEFINE_string('train_augmentation_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into train dataset, if empty, no mix will be run in train phase')
-    f.DEFINE_string('dev_augmentation_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into dev dataset, if empty, no mix will be run in dev phase')
-    f.DEFINE_string('test_augmentation_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into test dataset, if empty, no mix will be run in test phase')
-    f.DEFINE_float('audio_aug_min_audio_dbfs', 0, 'min value of dbfs to specify the min volume of audio during gaining audio')
-    f.DEFINE_float('audio_aug_max_audio_dbfs', -35, 'max value of dbfs to specify the max volume of audio during gaining audio')
-    f.DEFINE_float('audio_aug_min_snr_db', 3, 'min value of db to specify the min signal-to-noise ratio during gaining audio and noise')
-    f.DEFINE_float('audio_aug_max_snr_db', 30, 'max value of db to specify the max signal-to-noise ratio during gaining audio and noise')
+    f.DEFINE_string('train_augmentation_noise_files', '', 'comma separated list of files or dirs, specifying the noise dataset used for mixing audio into train dataset, if empty, no mix will be run in train phase')
+    f.DEFINE_string('dev_augmentation_noise_files', '', 'comma separated list of files or dirs, specifying the noise dataset used for mixing audio into dev dataset, if empty, no mix will be run in dev phase')
+    f.DEFINE_string('test_augmentation_noise_files', '', 'comma separated list of files or dirs, specifying the noise dataset used for mixing audio into test dataset, if empty, no mix will be run in test phase')
+    f.DEFINE_string('train_augmentation_speech_files', '', 'comma separated list of files or dirs, specifying the speech dataset used for mixing audio into train dataset, if empty, no mix will be run in train phase')
+    f.DEFINE_string('dev_augmentation_speech_files', '', 'comma separated list of files or dirs, specifying the speech dataset used for mixing audio into dev dataset, if empty, no mix will be run in dev phase')
+    f.DEFINE_string('test_augmentation_speech_files', '', 'comma separated list of files or dirs, specifying the speech dataset used for mixing audio into test dataset, if empty, no mix will be run in test phase')
+    f.DEFINE_float('audio_aug_min_audio_dbfs', -35, 'min value of dbfs to specify the min volume of audio during gaining audio')
+    f.DEFINE_float('audio_aug_max_audio_dbfs', 0, 'max value of dbfs to specify the max volume of audio during gaining audio')
+    f.DEFINE_float('audio_aug_min_noise_snr_db', 3, 'min value of db to specify the min signal-to-noise ratio during gaining noise')
+    f.DEFINE_float('audio_aug_max_noise_snr_db', 30, 'max value of db to specify the max signal-to-noise ratio during gaining noise')
+    f.DEFINE_float('audio_aug_min_speech_snr_db', 10, 'min value of db to specify the min signal-to-noise ratio during gaining speech')
+    f.DEFINE_float('audio_aug_max_speech_snr_db', 50, 'max value of db to specify the max signal-to-noise ratio during gaining speech')
     f.DEFINE_float('audio_aug_limit_audio_peak_dbfs', 7.0, 'max value of dbfs to specify the limitation of max audio dbfs of chunks, the audio volume will not gain over than the specified value')
     f.DEFINE_float('audio_aug_limit_noise_peak_dbfs', 3.0, 'max value of dbfs to specify the limitation of max noise dbfs of chunks, the noise volume will not gain over than the specified value')
+    f.DEFINE_float('audio_aug_limit_speech_peak_dbfs', 3.0, 'max value of dbfs to specify the limitation of max speech dbfs of chunks, the speech volume will not gain over than the specified value')
+    f.DEFINE_integer('audio_aug_min_n_noises', 0, 'min number of the noises per audio mixing')
+    f.DEFINE_integer('audio_aug_max_n_noises', 1, 'max number of the noises per audio mixing')
+    f.DEFINE_integer('audio_aug_min_n_speakers', 0, 'min number of the speakers per audio mixing')
+    f.DEFINE_integer('audio_aug_max_n_speakers', 1, 'max number of the speakers per audio mixing')
+
     f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise')
     f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise')
@@ -52,7 +63,8 @@ def create_flags():
     f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_pitch', 1.2, 'max value of pitch scaling')
     f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_tempo', 1.2, 'max vlaue of tempo scaling')

-    f.DEFINE_integer('review_audio_steps', 0, 'number of audio, push the audio into summary directory (if 0, no audio will be dumped)')
+    f.DEFINE_integer('review_audio_steps', 0, 'number of audio, push the audio into summary directory (if 0, no audio will be dumped), one file per step is saved until the given count is reached')
+    f.DEFINE_integer('review_audio_gla_iterations', 10, 'number of iterations to reconstruct audio from features, using Griffin-Lim Algorithm')

     # Global Constants
     # ================
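Taken together, the new flags give every training clip a target loudness drawn from [audio_aug_min_audio_dbfs, audio_aug_max_audio_dbfs], then mix in up to audio_aug_max_n_noises noise clips and audio_aug_max_n_speakers competing-speech clips at SNRs drawn from the respective ranges. One illustrative parameter draw in plain Python, using the defaults above (the real pipeline samples these per clip inside the TF graph):

    import random

    draw = {
        'n_noises':      random.randint(0, 1),     # audio_aug_{min,max}_n_noises
        'n_speakers':    random.randint(0, 1),     # audio_aug_{min,max}_n_speakers
        'audio_dbfs':    random.uniform(-35, 0),   # audio_aug_{min,max}_audio_dbfs
        'noise_snr_db':  random.uniform(3, 30),    # audio_aug_{min,max}_noise_snr_db
        'speech_snr_db': random.uniform(10, 50),   # audio_aug_{min,max}_speech_snr_db
    }
    print(draw)

Since a higher SNR means quieter interference, the higher speech range keeps competing speakers further below the target speaker than background noise.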
From c089b7fdf100861d6b2d12bfa4153b98a1730121 Mon Sep 17 00:00:00 2001
From: Daniel
Date: Fri, 17 Apr 2020 20:35:15 +0200
Subject: [PATCH 23/25] Fix merge not detecting moved scripts.

---
 training/deepspeech_training/evaluate.py | 12 ++--
 training/deepspeech_training/train.py    | 61 ++++++++++++++-----
 .../util}/audio_augmentation.py           |  4 +-
 3 files changed, 57 insertions(+), 20 deletions(-)
 rename {util => training/deepspeech_training/util}/audio_augmentation.py (99%)

diff --git a/training/deepspeech_training/evaluate.py b/training/deepspeech_training/evaluate.py
index 5877b618ad..0d9f02c9e9 100755
--- a/training/deepspeech_training/evaluate.py
+++ b/training/deepspeech_training/evaluate.py
@@ -43,7 +43,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet):
     return [alphabet.decode(res) for res in results]


-def evaluate(test_csvs, create_model):
+def evaluate(test_csvs, create_model, noise_sources=None, speech_sources=None):
     if FLAGS.scorer_path:
         scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                         FLAGS.scorer_path, Config.alphabet)
@@ -51,13 +51,13 @@ def evaluate(test_csvs, create_model):
         scorer = None

     test_csvs = FLAGS.test_files.split(',')
-    test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False) for csv in test_csvs]
+    test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources, speech_sources=speech_sources) for csv in test_csvs]
     iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]),
                                                  tfv1.data.get_output_shapes(test_sets[0]),
                                                  output_classes=tfv1.data.get_output_classes(test_sets[0]))
     test_init_ops = [iterator.make_initializer(test_set) for test_set in test_sets]

-    batch_wav_filename, (batch_x, batch_x_len), batch_y = iterator.get_next()
+    batch_wav_filename, (batch_x, batch_x_len), batch_y, _ = iterator.get_next()

     # One rate per layer
     no_dropout = [None] * 6
@@ -71,7 +71,8 @@ def evaluate(test_csvs, create_model):

     loss = tfv1.nn.ctc_loss(labels=batch_y,
                             inputs=logits,
-                            sequence_length=batch_x_len)
+                            sequence_length=batch_x_len,
+                            ignore_longer_outputs_than_inputs=True)

     tfv1.train.get_or_create_global_step()

@@ -106,6 +107,9 @@ def run_test(init_op, dataset):
                 session.run([batch_wav_filename, transposed, loss, batch_x_len, batch_y])
             except tf.errors.OutOfRangeError:
                 break
+            except tf.errors.InvalidArgumentError as e:
+                print("Ignoring error:", e)
+                continue

         decoded = ctc_beam_search_decoder_batch(batch_logits, batch_lengths, Config.alphabet, FLAGS.beam_width,
                                                 num_processes=num_processes, scorer=scorer,
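The ignore_longer_outputs_than_inputs=True argument (and the matching InvalidArgumentError guard above) is there because ctc_loss otherwise aborts on any sample whose transcript cannot be aligned to the available output frames. A small sketch of the constraint being relaxed:

    # CTC needs at least one output frame per label, plus one extra frame for
    # every directly repeated character (a blank must separate the repeats).
    def transcript_fits(num_feature_frames, transcript):
        repeats = sum(1 for a, b in zip(transcript, transcript[1:]) if a == b)
        return num_feature_frames >= len(transcript) + repeats

    assert transcript_fits(6, 'hello')      # 'll' needs a separating blank: 5 + 1 frames
    assert not transcript_fits(5, 'hello')  # too few frames, ctc_loss would raise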
diff --git a/training/deepspeech_training/train.py b/training/deepspeech_training/train.py
index eed8cd9eb2..600dff3560 100644
--- a/training/deepspeech_training/train.py
+++ b/training/deepspeech_training/train.py
@@ -228,7 +228,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse):
         the decoded result and the batch's original Y.
     '''
     # Obtain the next batch of data
-    batch_filenames, (batch_x, batch_seq_len), batch_y = iterator.get_next()
+    batch_filenames, (batch_x, batch_seq_len), batch_y, review_audio = iterator.get_next()

     if FLAGS.train_cudnn:
         rnn_impl = rnn_impl_cudnn_rnn
@@ -239,7 +239,10 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse):
     logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl)

     # Compute the CTC loss using TensorFlow's `ctc_loss`
-    total_loss = tfv1.nn.ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len)
+    total_loss = tfv1.nn.ctc_loss(labels=batch_y,
+                                  inputs=logits,
+                                  sequence_length=batch_seq_len,
+                                  ignore_longer_outputs_than_inputs=True)

     # Check if any files lead to non finite loss
     non_finite_files = tf.gather(batch_filenames, tfv1.where(~tf.math.is_finite(total_loss)))
@@ -248,7 +251,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse):
     avg_loss = tf.reduce_mean(input_tensor=total_loss)

     # Finally we return the average loss
-    return avg_loss, non_finite_files
+    return avg_loss, non_finite_files, review_audio


 # Adam Optimization
@@ -309,7 +312,7 @@ def get_tower_results(iterator, optimizer, dropout_rates):
             with tf.name_scope('tower_%d' % i):
                 # Calculate the avg_loss and mean_edit_distance and retrieve the decoded
                 # batch along with the original batch's labels (Y) of this tower
-                avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0)
+                avg_loss, non_finite_files, review_audio = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0)

                 # Allow for variables to be re-used by the next tower
                 tfv1.get_variable_scope().reuse_variables()
@@ -326,6 +329,8 @@ def get_tower_results(iterator, optimizer, dropout_rates):
             tower_non_finite_files.append(non_finite_files)

     avg_loss_across_towers = tf.reduce_mean(input_tensor=tower_avg_losses, axis=0)
+    if FLAGS.review_audio_steps:
+        tfv1.summary.audio(name='step_audio', tensor=review_audio, sample_rate=FLAGS.audio_sample_rate, collections=['step_audio_summaries'])
     tfv1.summary.scalar(name='step_loss', tensor=avg_loss_across_towers, collections=['step_summaries'])

     all_non_finite_files = tf.concat(tower_non_finite_files, axis=0)
@@ -415,7 +420,9 @@ def train():
             FLAGS.augmentation_freq_and_time_masking or
             FLAGS.augmentation_pitch_and_tempo_scaling or
             FLAGS.augmentation_speed_up_std > 0 or
-            FLAGS.augmentation_sparse_warp):
+            FLAGS.augmentation_sparse_warp or
+            FLAGS.train_augmentation_noise_files or
+            FLAGS.train_augmentation_speech_files):
         do_cache_dataset = False

     exception_box = ExceptionBox()
@@ -428,7 +435,9 @@ def train():
                                train_phase=True,
                                exception_box=exception_box,
                                process_ahead=len(Config.available_devices) * FLAGS.train_batch_size * 2,
-                               buffering=FLAGS.read_buffer)
+                               buffering=FLAGS.read_buffer,
+                               noise_sources=FLAGS.train_augmentation_noise_files,
+                               speech_sources=FLAGS.train_augmentation_speech_files)

     iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set),
                                                  tfv1.data.get_output_shapes(train_set),
@@ -444,7 +453,9 @@ def train():
                                  train_phase=False,
                                  exception_box=exception_box,
                                  process_ahead=len(Config.available_devices) * FLAGS.dev_batch_size * 2,
-                                 buffering=FLAGS.read_buffer) for source in dev_sources]
+                                 buffering=FLAGS.read_buffer,
+                                 noise_sources=FLAGS.dev_augmentation_noise_files,
+                                 speech_sources=FLAGS.dev_augmentation_speech_files) for source in dev_sources]
         dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets]

     # Dropout
@@ -482,6 +493,7 @@ def train():
         apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step)

     # Summaries
+    step_audio_summaries_op = tfv1.summary.merge_all('step_audio_summaries')
     step_summaries_op = tfv1.summary.merge_all('step_summaries')
     step_summary_writers = {
         'train': tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120),
@@ -541,17 +553,29 @@ def __call__(self, progress, data, **kwargs):
             session.run(init_op)

             # Batch loop
+
+            audio_summary_steps = 0
             while True:
                 try:
-                    _, current_step, batch_loss, problem_files, step_summary = \
-                        session.run([train_op, global_step, loss, non_finite_files, step_summaries_op],
-                                    feed_dict=feed_dict)
+                    step_audio_summary = None
+                    if audio_summary_steps < FLAGS.review_audio_steps and epoch == 0:
+                        _, current_step, batch_loss, problem_files, step_summary, step_audio_summary = \
+                            session.run([train_op, global_step, loss, non_finite_files, step_summaries_op, step_audio_summaries_op],
+                                        feed_dict=feed_dict)
+                        audio_summary_steps += 1
+                    else:
+                        _, current_step, batch_loss, problem_files, step_summary = \
+                            session.run([train_op, global_step, loss, non_finite_files, step_summaries_op],
+                                        feed_dict=feed_dict)
+
                     exception_box.raise_if_set()
                 except tf.errors.InvalidArgumentError as err:
                     if FLAGS.augmentation_sparse_warp:
                         log_info("Ignoring sparse warp error: {}".format(err))
                         continue
-                    raise
+                    else:
+                        print("Ignoring error:", err)
+                        continue
                 except tf.errors.OutOfRangeError:
                     exception_box.raise_if_set()
                     break
@@ -561,11 +585,20 @@ def __call__(self, progress, data, **kwargs):
                     log_error('The following files caused an infinite (or NaN) '
                               'loss: {}'.format(','.join(problem_files)))

+                    # Save invalid files
+                    sys.path.append("/DeepSpeech/deepspeech-german/training/")
+                    from filter_invalid_files import add_files_to_excluded
+                    add_files_to_excluded(problem_files)
+                    sys.exit(1)
+
                 total_loss += batch_loss
                 step_count += 1

                 pbar.update(step_count)

+                if step_audio_summary is not None:
+                    step_summary_writer.add_summary(step_audio_summary, current_step)
+
                 step_summary_writer.add_summary(step_summary, current_step)

                 if is_train and FLAGS.checkpoint_secs > 0 and time.time() - checkpoint_time > FLAGS.checkpoint_secs:
@@ -639,7 +672,7 @@ def __call__(self, progress, data, **kwargs):


 def test():
-    samples = evaluate(FLAGS.test_files.split(','), create_model)
+    samples = evaluate(FLAGS.test_files.split(','), create_model, noise_sources=FLAGS.test_augmentation_noise_files, speech_sources=FLAGS.test_augmentation_speech_files)
     if FLAGS.test_output_file:
         # Save decoded tuples as JSON, converting NumPy floats to Python floats
         json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float)
@@ -651,7 +684,7 @@ def create_inference_graph(batch_size=1, n_steps=16, tflite=False):
     # Create feature computation graph
     input_samples = tfv1.placeholder(tf.float32, [Config.audio_window_samples], 'input_samples')
     samples = tf.expand_dims(input_samples, -1)
-    mfccs, _ = samples_to_mfccs(samples, FLAGS.audio_sample_rate)
+    mfccs, _, _ = samples_to_mfccs(samples, FLAGS.audio_sample_rate)
     mfccs = tf.identity(mfccs, name='mfccs')

     # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input]
@@ -851,7 +884,7 @@ def do_single_file_inference(input_file_path):
         # Restore variables from training checkpoint
         load_graph_for_evaluation(session)

-        features, features_len = audiofile_to_features(input_file_path)
+        features, features_len, _ = audiofile_to_features(input_file_path)

         previous_state_c = np.zeros([1, Config.n_cell_dim])
         previous_state_h = np.zeros([1, Config.n_cell_dim])
diff --git a/util/audio_augmentation.py b/training/deepspeech_training/util/audio_augmentation.py
similarity index 99%
rename from util/audio_augmentation.py
rename to training/deepspeech_training/util/audio_augmentation.py
index 6a98f887f4..55b8957178 100644
--- a/util/audio_augmentation.py
+++ b/training/deepspeech_training/util/audio_augmentation.py
@@ -5,8 +5,8 @@
 import numpy as np
 from tensorflow.python.ops import gen_audio_ops as contrib_audio
 import os
-from util.logging import log_info
-from util.config import Config
+from .logging import log_info
+from .config import Config

 DBFS_COEF = 10.0 / np.log(10.0)
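The review-audio path exists so augmented inputs can be audited by ear in TensorBoard: for up to review_audio_steps batches of the first epoch, gla() reconstructs a waveform from the augmented spectrogram and writes it as an audio summary. In NumPy terms, the Griffin-Lim loop that gla() runs in-graph amounts to the following sketch (stft and istft are assumed to be a mutually consistent transform pair supplied by the caller):

    import numpy as np

    def griffin_lim_sketch(magnitudes, stft, istft, n_iter=10):
        # Start from random phases and keep the known magnitudes fixed.
        phases = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
        for _ in range(n_iter):
            audio = istft(magnitudes * phases)           # back to the time domain
            phases = np.exp(1j * np.angle(stft(audio)))  # keep only the new phase estimate
        return istft(magnitudes * phases)

More iterations trade prefetch throughput for cleaner-sounding review audio, which is exactly the trade-off the review_audio_gla_iterations flag exposes.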
From 491a4b06f9393b338346b6dd58c7dccef6cca4b2 Mon Sep 17 00:00:00 2001
From: Daniel
Date: Fri, 17 Apr 2020 20:50:11 +0200
Subject: [PATCH 24/25] Undo personal changes.

---
 training/deepspeech_training/evaluate.py |  6 +-----
 training/deepspeech_training/train.py    | 13 ++-----------
 2 files changed, 3 insertions(+), 16 deletions(-)

diff --git a/training/deepspeech_training/evaluate.py b/training/deepspeech_training/evaluate.py
index 0d9f02c9e9..10043213b4 100755
--- a/training/deepspeech_training/evaluate.py
+++ b/training/deepspeech_training/evaluate.py
@@ -71,8 +71,7 @@ def evaluate(test_csvs, create_model, noise_sources=None, speech_sources=None):

     loss = tfv1.nn.ctc_loss(labels=batch_y,
                             inputs=logits,
-                            sequence_length=batch_x_len,
-                            ignore_longer_outputs_than_inputs=True)
+                            sequence_length=batch_x_len)

     tfv1.train.get_or_create_global_step()

@@ -107,9 +106,6 @@ def run_test(init_op, dataset):
                 session.run([batch_wav_filename, transposed, loss, batch_x_len, batch_y])
             except tf.errors.OutOfRangeError:
                 break
-            except tf.errors.InvalidArgumentError as e:
-                print("Ignoring error:", e)
-                continue

         decoded = ctc_beam_search_decoder_batch(batch_logits, batch_lengths, Config.alphabet, FLAGS.beam_width,
                                                 num_processes=num_processes, scorer=scorer,
diff --git a/training/deepspeech_training/train.py b/training/deepspeech_training/train.py
index 600dff3560..dc48cef3f7 100644
--- a/training/deepspeech_training/train.py
+++ b/training/deepspeech_training/train.py
@@ -241,8 +241,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse):
     # Compute the CTC loss using TensorFlow's `ctc_loss`
     total_loss = tfv1.nn.ctc_loss(labels=batch_y,
                                   inputs=logits,
-                                  sequence_length=batch_seq_len,
-                                  ignore_longer_outputs_than_inputs=True)
+                                  sequence_length=batch_seq_len)

     # Check if any files lead to non finite loss
     non_finite_files = tf.gather(batch_filenames, tfv1.where(~tf.math.is_finite(total_loss)))
@@ -573,9 +572,7 @@ def __call__(self, progress, data, **kwargs):
                     if FLAGS.augmentation_sparse_warp:
                         log_info("Ignoring sparse warp error: {}".format(err))
                         continue
-                    else:
-                        print("Ignoring error:", err)
-                        continue
+                    raise
                 except tf.errors.OutOfRangeError:
                     exception_box.raise_if_set()
                     break
@@ -585,12 +582,6 @@ def __call__(self, progress, data, **kwargs):
                     log_error('The following files caused an infinite (or NaN) '
                               'loss: {}'.format(','.join(problem_files)))

-                    # Save invalid files
-                    sys.path.append("/DeepSpeech/deepspeech-german/training/")
-                    from filter_invalid_files import add_files_to_excluded
-                    add_files_to_excluded(problem_files)
-                    sys.exit(1)
-
                 total_loss += batch_loss
                 step_count += 1

From 2fa91e8c871c99f3f1950aa2801f9c39b7b93add Mon Sep 17 00:00:00 2001
From: Yi-Hua Chiu
Date: Tue, 12 May 2020 16:56:53 +0800
Subject: [PATCH 25/25] To recover the incorrect merge

Revert "Merge branch 'no-sort' into more-augment-options"

This reverts commit 77922262464c5c0fb9d2c5c90e9046f59769bb9d, reversing
changes made to f7d1279d2c4bd56d782db083a9d3d355f4239424.
---
 requirements.txt       |  2 +-
 util/evaluate_tools.py |  7 ++++++-
 util/feeding.py        |  2 +-
 util/text.py           | 19 ++++++++++++-------
 4 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index e15e2a9f5b..d399ac4e8f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Main training requirements
-
+tensorflow == 1.14.0
 numpy == 1.15.4
 progressbar2
 pandas
diff --git a/util/evaluate_tools.py b/util/evaluate_tools.py
index 59fb542be2..7f6a8ffb78 100644
--- a/util/evaluate_tools.py
+++ b/util/evaluate_tools.py
@@ -66,7 +66,12 @@ def calculate_report(wav_filenames, labels, decodings, losses):
     samples_wer, samples_cer = wer_cer_batch(samples)

     # Order the remaining items by their loss (lowest loss on top)
+    samples.sort(key=lambda s: s.loss)
-
+    # Then order by descending WER/CER
+    if FLAGS.utf8:
+        samples.sort(key=lambda s: s.cer, reverse=True)
+    else:
+        samples.sort(key=lambda s: s.wer, reverse=True)

     return samples_wer, samples_cer, samples
diff --git a/util/feeding.py b/util/feeding.py
index 76d8953d97..00ff40e316 100644
--- a/util/feeding.py
+++ b/util/feeding.py
@@ -130,7 +130,7 @@ def to_sparse_tuple(sequence):

 def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_sources=None, speech_sources=None):
     df = read_csvs(csvs)
-    #df.sort_values(by='wav_filesize', inplace=True)
+    df.sort_values(by='wav_filesize', inplace=True)

     df['transcript'] = df.apply(text_to_char_array, alphabet=Config.alphabet, result_type='reduce', axis=1)

diff --git a/util/text.py b/util/text.py
index c2d450c3a9..d3be7eb88a 100644
--- a/util/text.py
+++ b/util/text.py
@@ -14,13 +14,18 @@ def __init__(self, config_file):
         self._label_to_str = {}
         self._str_to_label = {}
         self._size = 0
-        with codecs.open(config_file, 'r', 'utf-8') as fin:
-            for line in fin:
-                self._label_to_str += line[:-1] # remove the line ending
-                self._str_to_label[line[:-1]] = self._size
-                self._size += 1
-
-    def string_from_label(self, label):
+        if config_file:
+            with codecs.open(config_file, 'r', 'utf-8') as fin:
+                for line in fin:
+                    if line[0:2] == '\\#':
+                        line = '#\n'
+                    elif line[0] == '#':
+                        continue
+                    self._label_to_str[self._size] = line[:-1] # remove the line ending
+                    self._str_to_label[line[:-1]] = self._size
+                    self._size += 1
+
+    def _string_from_label(self, label):
         return self._label_to_str[label]

     def _label_from_string(self, string):
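The restored Alphabet parsing treats lines beginning with '#' as comments and accepts the escaped line '\#' for a literal '#' label. A self-contained illustration of that convention (the sample lines are made up):

    # Mirrors the restored parsing loop above for an in-memory alphabet file.
    lines = ['# this is a comment\n', 'a\n', 'b\n', '\\#\n']
    labels = []
    for line in lines:
        if line[0:2] == '\\#':
            line = '#\n'   # escaped literal '#'
        elif line[0] == '#':
            continue       # comment line, skipped
        labels.append(line[:-1])  # strip the line ending

    assert labels == ['a', 'b', '#']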