From 681f4709eb86106a5d8c4f81d004bd40627f6f90 Mon Sep 17 00:00:00 2001 From: Carlos Fonseca <32177100+carlfm01@users.noreply.github.com> Date: Wed, 5 Jun 2019 04:27:46 +0000 Subject: [PATCH 01/25] Remove comments check from alphabet --- util/text.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/util/text.py b/util/text.py index 7ae6ef3e59..fed193e7e6 100644 --- a/util/text.py +++ b/util/text.py @@ -15,10 +15,6 @@ def __init__(self, config_file): self._size = 0 with codecs.open(config_file, 'r', 'utf-8') as fin: for line in fin: - if line[0:2] == '\\#': - line = '#\n' - elif line[0] == '#': - continue self._label_to_str += line[:-1] # remove the line ending self._str_to_label[line[:-1]] = self._size self._size += 1 From 421243d2841a86c7a3f0fbea5d4f4d49ab82f706 Mon Sep 17 00:00:00 2001 From: Carlos Fonseca <32177100+carlfm01@users.noreply.github.com> Date: Wed, 5 Jun 2019 04:29:02 +0000 Subject: [PATCH 02/25] Remove sort from feeding --- util/feeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/feeding.py b/util/feeding.py index a88f366030..66022a7c13 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -66,7 +66,7 @@ def to_sparse_tuple(sequence): def create_dataset(csvs, batch_size, cache_path=''): df = read_csvs(csvs) - df.sort_values(by='wav_filesize', inplace=True) + #df.sort_values(by='wav_filesize', inplace=True) # Convert to character index arrays df['transcript'] = df['transcript'].apply(partial(text_to_char_array, alphabet=Config.alphabet)) From d08efad480d160a8bd5ff94568a26abb57fbc760 Mon Sep 17 00:00:00 2001 From: Carlos Fonseca <32177100+carlfm01@users.noreply.github.com> Date: Wed, 5 Jun 2019 04:56:34 +0000 Subject: [PATCH 03/25] Remove sort from evaluate tools --- util/evaluate_tools.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/util/evaluate_tools.py b/util/evaluate_tools.py index 1ad91f46ea..46fea32437 100644 --- a/util/evaluate_tools.py +++ b/util/evaluate_tools.py @@ -64,9 +64,7 @@ def calculate_report(labels, decodings, losses): samples_wer, samples_cer = wer_cer_batch(samples) # Order the remaining items by their loss (lowest loss on top) - samples.sort(key=lambda s: s.loss) # Then order by WER (highest WER on top) - samples.sort(key=lambda s: s.wer, reverse=True) return samples_wer, samples_cer, samples From ba1a58763e1b18720b2a8602d92c823327e0dd09 Mon Sep 17 00:00:00 2001 From: Carlos Fonseca Date: Sat, 29 Jun 2019 03:29:15 +0000 Subject: [PATCH 04/25] Remove TF dependency --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2958643ba0..fdf8e6ba1d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ # Main training requirements -tensorflow == 1.13.1 numpy == 1.15.4 progressbar2 pandas From aebd08df4f80e05a53400f503c1fe533a107ca23 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Mon, 30 Dec 2019 15:54:27 +0800 Subject: [PATCH 05/25] [ADD] mix noise audio --- util/feeding.py | 18 +++++++++++++++++- util/flags.py | 7 ++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/util/feeding.py b/util/feeding.py index 16d0e3128d..3084dd9436 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -16,6 +16,7 @@ from util.flags import FLAGS from util.spectrogram_augmentations import augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up from util.audio import read_frames_from_file, vad_split, DEFAULT_FORMAT +from util.decoded_augmentation import augment_noise def read_csvs(csv_files): @@ -67,8 +68,23 @@ def 
samples_to_mfccs(samples, sample_rate, train_phase=False): def audiofile_to_features(wav_filename, train_phase=False): samples = tf.io.read_file(wav_filename) decoded = contrib_audio.decode_wav(samples, desired_channels=1) - features, features_len = samples_to_mfccs(decoded.audio, decoded.sample_rate, train_phase=train_phase) + audio = decoded.audio + # augment decoded + if train_phase and FLAGS.decoded_aug_mix_noise_walk_dirs: + audio = augment_noise( + audio, + FLAGS.decoded_aug_mix_noise_walk_dirs.split(','), + change_audio_db_max=FLAGS.decoded_aug_mix_noise_max_audio_db, + change_audio_db_min=FLAGS.decoded_aug_mix_noise_min_audio_db, + change_noise_db_max=FLAGS.decoded_aug_mix_noise_max_noise_db, + change_noise_db_min=FLAGS.decoded_aug_mix_noise_min_noise_db, + ) + + + features, features_len = samples_to_mfccs(audio, decoded.sample_rate, train_phase=train_phase) + + # augment features if train_phase: if FLAGS.data_aug_features_multiplicative > 0: features = features*tf.random.normal(mean=1, stddev=FLAGS.data_aug_features_multiplicative, shape=tf.shape(features)) diff --git a/util/flags.py b/util/flags.py index d8a2656c6a..2d01a3a2b9 100644 --- a/util/flags.py +++ b/util/flags.py @@ -24,6 +24,12 @@ def create_flags(): # Data Augmentation # ================ + f.DEFINE_string('decoded_aug_mix_noise_walk_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') + f.DEFINE_float('decoded_aug_mix_noise_max_noise_db', -25, 'limit noise max volume') + f.DEFINE_float('decoded_aug_mix_noise_min_noise_db', -50, 'limit noise min volume') + f.DEFINE_float('decoded_aug_mix_noise_max_audio_db', 0, 'limit noise max volume') + f.DEFINE_float('decoded_aug_mix_noise_min_audio_db', -10, 'limit noise min volume') + f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise') f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise') @@ -42,7 +48,6 @@ def create_flags(): f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_pitch', 1.2, 'max value of pitch scaling') f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_tempo', 1.2, 'max vlaue of tempo scaling') - # Global Constants # ================ From d255c3f952036fd6153ac6ee20a4c526ba98370a Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Mon, 30 Dec 2019 16:10:00 +0800 Subject: [PATCH 06/25] [FIX] add missing file decoded_augmentation.py --- util/decoded_augmentation.py | 67 ++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 util/decoded_augmentation.py diff --git a/util/decoded_augmentation.py b/util/decoded_augmentation.py new file mode 100644 index 0000000000..f74b5f1d1f --- /dev/null +++ b/util/decoded_augmentation.py @@ -0,0 +1,67 @@ +import tensorflow as tf +import tensorflow.compat.v1 as tfv1 +from tensorflow.python.ops import gen_audio_ops as contrib_audio +import os + + +def augment_noise(audio, + walk_dirs, + change_audio_db_max=0, + change_audio_db_min=-10, + change_noise_db_max=-25, + change_noise_db_min=-50 + ): + noise_filenames = [] + for d in walk_dirs: + for dirpath, _, filenames in os.walk(d): + for filename in filenames: + if filename.endswith('.wav'): + noise_filenames.append(os.path.join(dirpath, filename)) + print('Collect {} noise filenames for augmentation'.format(len(noise_filenames))) + noise_filenames = tf.convert_to_tensor(noise_filenames, dtype=tf.string) + + rand_int = tfv1.random_uniform( + [], dtype=tf.int32, minval=0, maxval=tf.shape(noise_filenames)[0]) + noise_filename = 
noise_filenames[rand_int] + noise_samples = tf.io.read_file(noise_filename) + noise_decoded = contrib_audio.decode_wav(noise_samples, desired_channels=1) + noise_audio = noise_decoded.audio + + decoded_audio_len = tf.shape(audio)[0] + noise_decoded_audio_len = tf.shape(noise_audio)[0] + + multiply = tf.math.floordiv(decoded_audio_len, noise_decoded_audio_len) + 1 + noise_audio = tf.tile(noise_audio, [multiply, 1]) + + # now noise_decoded_len must > decoded_len + noise_decoded_audio_len = tf.shape(noise_audio)[0] + + mix_decoded_start_end_points = tfv1.random_uniform( + [2], minval=0, maxval=decoded_audio_len-1, dtype=tf.int32) + mix_decoded_start_point = tf.math.reduce_min(mix_decoded_start_end_points) + mix_decoded_end_point = tf.math.reduce_max( + mix_decoded_start_end_points) + 1 + mix_decoded_width = mix_decoded_end_point - mix_decoded_start_point + + left_zeros = tf.zeros(shape=[mix_decoded_start_point, 1]) + + mix_noise_decoded_start_point = tfv1.random_uniform( + [], minval=0, maxval=noise_decoded_audio_len - mix_decoded_width, dtype=tf.int32) + mix_noise_decoded_end_point = mix_noise_decoded_start_point + mix_decoded_width + extract_noise_decoded = noise_audio[mix_noise_decoded_start_point:mix_noise_decoded_end_point, :] + + right_zeros = tf.zeros( + shape=[decoded_audio_len - mix_decoded_end_point, 1]) + + mixed_noise = tf.concat( + [left_zeros, extract_noise_decoded, right_zeros], axis=0) + + choosen_audio_db = tfv1.random_uniform( + [], minval=change_audio_db_min, maxval=change_audio_db_max) + audio_ratio = tf.math.exp(choosen_audio_db / 10) + + choosen_noise_db = tfv1.random_uniform( + [], minval=change_noise_db_min, maxval=change_noise_db_max) + noise_ratio = tf.math.exp(choosen_noise_db / 10) + + return tf.multiply(audio, audio_ratio) + tf.multiply(mixed_noise, noise_ratio) From ec251367bd012fce533c2ab30f9c8fce8f81ca80 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Tue, 31 Dec 2019 11:20:58 +0800 Subject: [PATCH 07/25] mix noise works, but performance is bad --- util/decoded_augmentation.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/util/decoded_augmentation.py b/util/decoded_augmentation.py index f74b5f1d1f..449e772a8f 100644 --- a/util/decoded_augmentation.py +++ b/util/decoded_augmentation.py @@ -8,9 +8,9 @@ def augment_noise(audio, walk_dirs, change_audio_db_max=0, change_audio_db_min=-10, - change_noise_db_max=-25, - change_noise_db_min=-50 - ): + change_noise_db_max=-15, + change_noise_db_min=-25): + assert isinstance(walk_dirs, list) noise_filenames = [] for d in walk_dirs: for dirpath, _, filenames in os.walk(d): @@ -31,10 +31,10 @@ def augment_noise(audio, noise_decoded_audio_len = tf.shape(noise_audio)[0] multiply = tf.math.floordiv(decoded_audio_len, noise_decoded_audio_len) + 1 - noise_audio = tf.tile(noise_audio, [multiply, 1]) + noise_audio_tile = tf.tile(noise_audio, [multiply, 1]) # now noise_decoded_len must > decoded_len - noise_decoded_audio_len = tf.shape(noise_audio)[0] + noise_decoded_audio_len = tf.shape(noise_audio_tile)[0] mix_decoded_start_end_points = tfv1.random_uniform( [2], minval=0, maxval=decoded_audio_len-1, dtype=tf.int32) @@ -48,7 +48,7 @@ def augment_noise(audio, mix_noise_decoded_start_point = tfv1.random_uniform( [], minval=0, maxval=noise_decoded_audio_len - mix_decoded_width, dtype=tf.int32) mix_noise_decoded_end_point = mix_noise_decoded_start_point + mix_decoded_width - extract_noise_decoded = noise_audio[mix_noise_decoded_start_point:mix_noise_decoded_end_point, :] + extract_noise_decoded 
= noise_audio_tile[mix_noise_decoded_start_point:mix_noise_decoded_end_point, :] right_zeros = tf.zeros( shape=[decoded_audio_len - mix_decoded_end_point, 1]) @@ -58,10 +58,11 @@ def augment_noise(audio, choosen_audio_db = tfv1.random_uniform( [], minval=change_audio_db_min, maxval=change_audio_db_max) - audio_ratio = tf.math.exp(choosen_audio_db / 10) + audio_ratio = tf.math.pow(10.0, choosen_audio_db / 10) choosen_noise_db = tfv1.random_uniform( [], minval=change_noise_db_min, maxval=change_noise_db_max) - noise_ratio = tf.math.exp(choosen_noise_db / 10) - + # choosen_noise_db = tf.random.normal( + # [], mean=change_noise_db_max, stddev=change_noise_db_min) + noise_ratio = tf.math.pow(10.0, choosen_noise_db / 10) return tf.multiply(audio, audio_ratio) + tf.multiply(mixed_noise, noise_ratio) From 484134eb9e303dea22358a15930277675c0a2f7c Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Tue, 31 Dec 2019 15:57:34 +0800 Subject: [PATCH 08/25] [MOD] use tf.Dataset to cache noise audio --- ..._augmentation.py => audio_augmentation.py} | 33 ++++++++----------- util/feeding.py | 17 ++++++++-- util/flags.py | 9 ++--- 3 files changed, 33 insertions(+), 26 deletions(-) rename util/{decoded_augmentation.py => audio_augmentation.py} (75%) diff --git a/util/decoded_augmentation.py b/util/audio_augmentation.py similarity index 75% rename from util/decoded_augmentation.py rename to util/audio_augmentation.py index 449e772a8f..cb668f696b 100644 --- a/util/decoded_augmentation.py +++ b/util/audio_augmentation.py @@ -3,29 +3,26 @@ from tensorflow.python.ops import gen_audio_ops as contrib_audio import os - -def augment_noise(audio, - walk_dirs, - change_audio_db_max=0, - change_audio_db_min=-10, - change_noise_db_max=-15, - change_noise_db_min=-25): +def collect_noise_filenames(walk_dirs): assert isinstance(walk_dirs, list) - noise_filenames = [] + for d in walk_dirs: for dirpath, _, filenames in os.walk(d): for filename in filenames: if filename.endswith('.wav'): - noise_filenames.append(os.path.join(dirpath, filename)) - print('Collect {} noise filenames for augmentation'.format(len(noise_filenames))) - noise_filenames = tf.convert_to_tensor(noise_filenames, dtype=tf.string) + yield os.path.join(dirpath, filename) - rand_int = tfv1.random_uniform( - [], dtype=tf.int32, minval=0, maxval=tf.shape(noise_filenames)[0]) - noise_filename = noise_filenames[rand_int] - noise_samples = tf.io.read_file(noise_filename) - noise_decoded = contrib_audio.decode_wav(noise_samples, desired_channels=1) - noise_audio = noise_decoded.audio +def noise_file_to_audio(noise_file): + samples = tf.io.read_file(noise_file) + decoded = contrib_audio.decode_wav(samples, desired_channels=1) + return decoded.audio + +def augment_noise(audio, + noise_audio, + change_audio_db_max=0, + change_audio_db_min=-10, + change_noise_db_max=-15, + change_noise_db_min=-25): decoded_audio_len = tf.shape(audio)[0] noise_decoded_audio_len = tf.shape(noise_audio)[0] @@ -62,7 +59,5 @@ def augment_noise(audio, choosen_noise_db = tfv1.random_uniform( [], minval=change_noise_db_min, maxval=change_noise_db_max) - # choosen_noise_db = tf.random.normal( - # [], mean=change_noise_db_max, stddev=change_noise_db_min) noise_ratio = tf.math.pow(10.0, choosen_noise_db / 10) return tf.multiply(audio, audio_ratio) + tf.multiply(mixed_noise, noise_ratio) diff --git a/util/feeding.py b/util/feeding.py index 3084dd9436..d8ad88a20b 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -16,7 +16,7 @@ from util.flags import FLAGS from util.spectrogram_augmentations import 
augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up from util.audio import read_frames_from_file, vad_split, DEFAULT_FORMAT -from util.decoded_augmentation import augment_noise +from util.audio_augmentation import augment_noise, noise_file_to_audio, collect_noise_filenames def read_csvs(csv_files): @@ -70,11 +70,22 @@ def audiofile_to_features(wav_filename, train_phase=False): decoded = contrib_audio.decode_wav(samples, desired_channels=1) audio = decoded.audio - # augment decoded + # augment audio if train_phase and FLAGS.decoded_aug_mix_noise_walk_dirs: + # because we have to determine the shuffle size, so we could not use generator + noise_filenames = tf.convert_to_tensor( + list(collect_noise_filenames(FLAGS.decoded_aug_mix_noise_walk_dirs.split(','))), + dtype=tf.string) + print(">>> Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) + noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) + .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .shuffle(noise_filenames.shape[0]) + .cache(FLAGS.decoded_aug_mix_noise_cache) + .repeat()) + iterator = tf.compat.v1.data.make_one_shot_iterator(noise_dataset) audio = augment_noise( audio, - FLAGS.decoded_aug_mix_noise_walk_dirs.split(','), + iterator.get_next(), change_audio_db_max=FLAGS.decoded_aug_mix_noise_max_audio_db, change_audio_db_min=FLAGS.decoded_aug_mix_noise_min_audio_db, change_noise_db_max=FLAGS.decoded_aug_mix_noise_max_noise_db, diff --git a/util/flags.py b/util/flags.py index 2d01a3a2b9..c63a969f36 100644 --- a/util/flags.py +++ b/util/flags.py @@ -25,10 +25,11 @@ def create_flags(): # ================ f.DEFINE_string('decoded_aug_mix_noise_walk_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_float('decoded_aug_mix_noise_max_noise_db', -25, 'limit noise max volume') - f.DEFINE_float('decoded_aug_mix_noise_min_noise_db', -50, 'limit noise min volume') - f.DEFINE_float('decoded_aug_mix_noise_max_audio_db', 0, 'limit noise max volume') - f.DEFINE_float('decoded_aug_mix_noise_min_audio_db', -10, 'limit noise min volume') + f.DEFINE_string('decoded_aug_mix_noise_cache', '', 'must cache noise audio data, or it will read audio file every training step') + f.DEFINE_float('decoded_aug_mix_noise_max_noise_db', -25, 'to limit noise max volume') + f.DEFINE_float('decoded_aug_mix_noise_min_noise_db', -50, 'to limit noise min volume') + f.DEFINE_float('decoded_aug_mix_noise_max_audio_db', 0, 'to limit audio max volume') + f.DEFINE_float('decoded_aug_mix_noise_min_audio_db', -10, 'to limit audio min volume') f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise') f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise') From 4f24f08f09611226d37347216acbd0af2d485f4a Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Tue, 31 Dec 2019 17:19:00 +0800 Subject: [PATCH 09/25] rename decoded -> audio --- util/feeding.py | 14 +++++++------- util/flags.py | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/util/feeding.py b/util/feeding.py index d8ad88a20b..8067bf5c58 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -71,25 +71,25 @@ def audiofile_to_features(wav_filename, train_phase=False): audio = decoded.audio # augment audio - if train_phase and FLAGS.decoded_aug_mix_noise_walk_dirs: + if train_phase and FLAGS.audio_aug_mix_noise_walk_dirs: # because we have to determine the shuffle size, so we 
could not use generator noise_filenames = tf.convert_to_tensor( - list(collect_noise_filenames(FLAGS.decoded_aug_mix_noise_walk_dirs.split(','))), + list(collect_noise_filenames(FLAGS.audio_aug_mix_noise_walk_dirs.split(','))), dtype=tf.string) print(">>> Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) .shuffle(noise_filenames.shape[0]) - .cache(FLAGS.decoded_aug_mix_noise_cache) + .cache(FLAGS.audio_aug_mix_noise_cache) .repeat()) iterator = tf.compat.v1.data.make_one_shot_iterator(noise_dataset) audio = augment_noise( audio, iterator.get_next(), - change_audio_db_max=FLAGS.decoded_aug_mix_noise_max_audio_db, - change_audio_db_min=FLAGS.decoded_aug_mix_noise_min_audio_db, - change_noise_db_max=FLAGS.decoded_aug_mix_noise_max_noise_db, - change_noise_db_min=FLAGS.decoded_aug_mix_noise_min_noise_db, + change_audio_db_max=FLAGS.audio_aug_mix_noise_max_audio_db, + change_audio_db_min=FLAGS.audio_aug_mix_noise_min_audio_db, + change_noise_db_max=FLAGS.audio_aug_mix_noise_max_noise_db, + change_noise_db_min=FLAGS.audio_aug_mix_noise_min_noise_db, ) diff --git a/util/flags.py b/util/flags.py index c63a969f36..8a3852ecf5 100644 --- a/util/flags.py +++ b/util/flags.py @@ -24,12 +24,12 @@ def create_flags(): # Data Augmentation # ================ - f.DEFINE_string('decoded_aug_mix_noise_walk_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_string('decoded_aug_mix_noise_cache', '', 'must cache noise audio data, or it will read audio file every training step') - f.DEFINE_float('decoded_aug_mix_noise_max_noise_db', -25, 'to limit noise max volume') - f.DEFINE_float('decoded_aug_mix_noise_min_noise_db', -50, 'to limit noise min volume') - f.DEFINE_float('decoded_aug_mix_noise_max_audio_db', 0, 'to limit audio max volume') - f.DEFINE_float('decoded_aug_mix_noise_min_audio_db', -10, 'to limit audio min volume') + f.DEFINE_string('audio_aug_mix_noise_walk_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') + f.DEFINE_string('audio_aug_mix_noise_cache', '', 'must cache noise audio data, or it will read audio file every training step') + f.DEFINE_float('audio_aug_mix_noise_max_noise_db', -25, 'to limit noise max volume') + f.DEFINE_float('audio_aug_mix_noise_min_noise_db', -50, 'to limit noise min volume') + f.DEFINE_float('audio_aug_mix_noise_max_audio_db', 0, 'to limit audio max volume') + f.DEFINE_float('audio_aug_mix_noise_min_audio_db', -10, 'to limit audio min volume') f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise') f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise') From 1f57ece8ba65ea7b13c52dce577d95478416b12b Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Thu, 2 Jan 2020 16:21:37 +0800 Subject: [PATCH 10/25] [FIX] don't create tf.Dataset in other tf.Dataset's pipeline --- util/feeding.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/util/feeding.py b/util/feeding.py index 8067bf5c58..ea7987a6f1 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -65,27 +65,16 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False): return mfccs, tf.shape(input=mfccs)[0] -def audiofile_to_features(wav_filename, train_phase=False): +def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None): 
samples = tf.io.read_file(wav_filename) decoded = contrib_audio.decode_wav(samples, desired_channels=1) audio = decoded.audio # augment audio - if train_phase and FLAGS.audio_aug_mix_noise_walk_dirs: - # because we have to determine the shuffle size, so we could not use generator - noise_filenames = tf.convert_to_tensor( - list(collect_noise_filenames(FLAGS.audio_aug_mix_noise_walk_dirs.split(','))), - dtype=tf.string) - print(">>> Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) - noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) - .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) - .shuffle(noise_filenames.shape[0]) - .cache(FLAGS.audio_aug_mix_noise_cache) - .repeat()) - iterator = tf.compat.v1.data.make_one_shot_iterator(noise_dataset) + if train_phase and noise_iterator: audio = augment_noise( audio, - iterator.get_next(), + noise_iterator.get_next(), change_audio_db_max=FLAGS.audio_aug_mix_noise_max_audio_db, change_audio_db_min=FLAGS.audio_aug_mix_noise_min_audio_db, change_noise_db_max=FLAGS.audio_aug_mix_noise_max_noise_db, @@ -106,9 +95,9 @@ def audiofile_to_features(wav_filename, train_phase=False): return features, features_len -def entry_to_features(wav_filename, transcript, train_phase): +def entry_to_features(wav_filename, transcript, train_phase, noise_iterator=None): # https://bugs.python.org/issue32117 - features, features_len = audiofile_to_features(wav_filename, train_phase=train_phase) + features, features_len = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator) return wav_filename, features, features_len, tf.SparseTensor(*transcript) @@ -147,7 +136,22 @@ def batch_fn(wav_filenames, features, features_len, transcripts): return tf.data.Dataset.zip((wav_filenames, features, transcripts)) num_gpus = len(Config.available_devices) - process_fn = partial(entry_to_features, train_phase=train_phase) + + if train_phase and FLAGS.audio_aug_mix_noise_walk_dirs: + # because we have to determine the shuffle size, so we could not use generator + noise_filenames = tf.convert_to_tensor( + list(collect_noise_filenames(FLAGS.audio_aug_mix_noise_walk_dirs.split(','))), + dtype=tf.string) + print(">>> Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) + noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) + .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .shuffle(noise_filenames.shape[0]) + .cache(FLAGS.audio_aug_mix_noise_cache) + .repeat()) + noise_iterator = tf.compat.v1.data.make_one_shot_iterator(noise_dataset) + else: + noise_iterator = None + process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator) dataset = (tf.data.Dataset.from_generator(generate_values, output_types=(tf.string, (tf.int64, tf.int32, tf.int64))) From 66cc7c48c9e3d07d3ab742b6dc235e5b706349af Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Mon, 13 Jan 2020 22:38:55 +0800 Subject: [PATCH 11/25] limit audio signal between +-1.0 --- util/audio_augmentation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util/audio_augmentation.py b/util/audio_augmentation.py index cb668f696b..283a54ed90 100644 --- a/util/audio_augmentation.py +++ b/util/audio_augmentation.py @@ -60,4 +60,5 @@ def augment_noise(audio, choosen_noise_db = tfv1.random_uniform( [], minval=change_noise_db_min, maxval=change_noise_db_max) noise_ratio = tf.math.pow(10.0, choosen_noise_db / 10) - return 
tf.multiply(audio, audio_ratio) + tf.multiply(mixed_noise, noise_ratio) + mixed_audio = tf.multiply(audio, audio_ratio) + tf.multiply(mixed_noise, noise_ratio) + return tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0) From b7eb0f4d4c6a9d070a4d718727fdb22d83146100 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Tue, 11 Feb 2020 14:15:35 +0800 Subject: [PATCH 12/25] [FIX] switch shuffle/map for memory cost, replace cache with prefetch for memory cost [MOD] deprecate FLAGS.audio_aug_mix_noise_cache --- util/feeding.py | 8 +++++--- util/flags.py | 1 - 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/util/feeding.py b/util/feeding.py index ea7987a6f1..35b7d3be92 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -17,6 +17,7 @@ from util.spectrogram_augmentations import augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up from util.audio import read_frames_from_file, vad_split, DEFAULT_FORMAT from util.audio_augmentation import augment_noise, noise_file_to_audio, collect_noise_filenames +from util.logging import log_info def read_csvs(csv_files): @@ -139,14 +140,15 @@ def batch_fn(wav_filenames, features, features_len, transcripts): if train_phase and FLAGS.audio_aug_mix_noise_walk_dirs: # because we have to determine the shuffle size, so we could not use generator + log_info("Enable Mixing Noise Augmentation") noise_filenames = tf.convert_to_tensor( list(collect_noise_filenames(FLAGS.audio_aug_mix_noise_walk_dirs.split(','))), dtype=tf.string) - print(">>> Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) + log_info("Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) - .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) .shuffle(noise_filenames.shape[0]) - .cache(FLAGS.audio_aug_mix_noise_cache) + .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .prefetch(tf.compat.v1.data.experimental.AUTOTUNE) .repeat()) noise_iterator = tf.compat.v1.data.make_one_shot_iterator(noise_dataset) else: diff --git a/util/flags.py b/util/flags.py index 8a3852ecf5..ecfa90b8d2 100644 --- a/util/flags.py +++ b/util/flags.py @@ -25,7 +25,6 @@ def create_flags(): # ================ f.DEFINE_string('audio_aug_mix_noise_walk_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_string('audio_aug_mix_noise_cache', '', 'must cache noise audio data, or it will read audio file every training step') f.DEFINE_float('audio_aug_mix_noise_max_noise_db', -25, 'to limit noise max volume') f.DEFINE_float('audio_aug_mix_noise_min_noise_db', -50, 'to limit noise min volume') f.DEFINE_float('audio_aug_mix_noise_max_audio_db', 0, 'to limit audio max volume') From ccae7cc93ef717f6045d3128c1aebdb6d8f5e858 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Mon, 17 Feb 2020 15:38:30 +0800 Subject: [PATCH 13/25] [MOD] limit the buffer size of .shuffle() to protect memory usage --- util/feeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/feeding.py b/util/feeding.py index 35b7d3be92..e9c2fb9ad7 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -146,7 +146,7 @@ def batch_fn(wav_filenames, features, features_len, transcripts): dtype=tf.string) log_info("Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) - .shuffle(noise_filenames.shape[0]) + .shuffle(min(noise_filenames.shape[0], 
102400)) .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) .prefetch(tf.compat.v1.data.experimental.AUTOTUNE) .repeat()) From 8cc95f9ee814d63a4a9c472921cca65662d3cbdd Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Wed, 19 Feb 2020 10:21:02 +0800 Subject: [PATCH 14/25] [ADD] bin/normalize_noise_audio.py --- bin/normalize_noise_audio.py | 172 +++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 bin/normalize_noise_audio.py diff --git a/bin/normalize_noise_audio.py b/bin/normalize_noise_audio.py new file mode 100644 index 0000000000..b487ece01b --- /dev/null +++ b/bin/normalize_noise_audio.py @@ -0,0 +1,172 @@ +from __future__ import absolute_import, division, print_function + +# Make sure we can import stuff from util/ +# This script needs to be run from the root of the DeepSpeech repository + +from util.feeding import secs_to_hours +from librosa import get_duration +from multiprocessing import Pool +from functools import partial +import math +import argparse +import sys +import os +sys.path.insert(1, os.path.join(sys.path[0], '..')) + +try: + import tqdm +except ImportError as err: + print('[ImportError] try `pip install tqdm`') + raise err + +try: + from pydub import AudioSegment +except ImportError as err: + print('[ImportError] try `sudo apt-get install ffmpeg && pip install pydub`') + raise err + + +def detect_silence(sound: AudioSegment, silence_threshold=-50.0, chunk_size=10): + start_trim = 0 # ms + sound_size = len(sound) + assert chunk_size > 0 # to avoid infinite loop + while sound[start_trim:(start_trim + chunk_size)].dBFS < silence_threshold and start_trim < sound_size: + start_trim += chunk_size + + end_trim = sound_size + while sound[(end_trim - chunk_size):end_trim].dBFS < silence_threshold and end_trim > 0: + end_trim -= chunk_size + + start_trim = min(sound_size, start_trim) + end_trim = max(0, end_trim) + + return min([start_trim, end_trim]), max([start_trim, end_trim]) + + +def trim_silence_audio(sound: AudioSegment, silence_threshold=-50.0, chunk_size=10): + start_trim, end_trim = detect_silence(sound, silence_threshold, chunk_size) + return sound[start_trim:end_trim] + + +def convert(filename, dst_dirpath, dirpath, normalize, trim_silence, + min_duration_seconds, max_duration_seconds): + if not filename.endswith(('.wav', '.raw')): + return + + filepath = os.path.join(dirpath, filename) + if filename.endswith('.wav'): + sound: AudioSegment = AudioSegment.from_file(filepath) + else: + try: + sound: AudioSegment = AudioSegment.from_raw(filepath, + sample_width=2, + frame_rate=44100, + channels=1) + except Exception as err: # pylint: disable=broad-except + print('Retrying conversion: {}'.format(err)) + try: + sound: AudioSegment = AudioSegment.from_raw(filepath, + sample_width=2, + frame_rate=48000, + channels=1) + except Exception as err: # pylint: disable=broad-except + print('Skipping file {}, got error: {}'.format(filepath, err)) + return + try: + sound = sound.set_frame_rate(16000) + except Exception as err: # pylint: disable=broad-except + print('Skipping {}'.format(err)) + return + + n_splits = max(1, math.ceil(sound.duration_seconds / max_duration_seconds)) + chunk_duration_ms = math.ceil(len(sound) / n_splits) + chunks = [] + + for i in range(n_splits): + end_ms = min((i + 1) * chunk_duration_ms, len(sound)) + chunk = sound[(i * chunk_duration_ms):end_ms] + chunks.append(chunk) + + for i, chunk in enumerate(chunks): + dst_path = os.path.join(dst_dirpath, str(i) + '_' + filename) + if 
dst_path.endswith('.raw'): + dst_path = dst_path[:-4] + '.wav' + + if os.path.exists(dst_path): + print('Audio already exists: {}'.format(dst_path)) + return + + if normalize: + chunk = chunk.normalize() + if chunk.dBFS < -30.0: + chunk = chunk.compress_dynamic_range().normalize() + if chunk.dBFS < -30.0: + chunk = chunk.compress_dynamic_range().normalize() + if trim_silence: + chunk = trim_silence_audio(chunk) + + if chunk.duration_seconds < min_duration_seconds: + return + chunk.export(dst_path, format='wav') + + +def get_noise_duration(dst_dir): + duration = 0.0 + file_num = 0 + for dirpath, _, filenames in os.walk(dst_dir): + for f in filenames: + if not f.endswith('.wav'): + continue + duration += get_duration(filename=os.path.join(dirpath, f)) + file_num += 1 + return duration, file_num + + +def main(src_dir, + dst_dir, + min_duration_seconds, + max_duration_seconds, + normalize=True, + trim_silence=True): + assert os.path.exists(src_dir) + if not os.path.exists(dst_dir): + os.makedirs(dst_dir, exist_ok=False) + src_dir = os.path.abspath(src_dir) + dst_dir = os.path.abspath(dst_dir) + + for dirpath, _, filenames in os.walk(src_dir): + dirpath = os.path.abspath(dirpath) + dst_dirpath = os.path.join( + dst_dir, dirpath.replace(src_dir, '').lstrip('/')) + + print('Converting directory: {} -> {}'.format(dirpath, dst_dirpath)) + if not os.path.exists(dst_dirpath): + os.makedirs(dst_dirpath, exist_ok=False) + + convert_func = partial(convert, + dst_dirpath=dst_dirpath, + dirpath=dirpath, + normalize=normalize, + trim_silence=trim_silence, + min_duration_seconds=min_duration_seconds, + max_duration_seconds=max_duration_seconds) + + pool = Pool(processes=None) + for _ in tqdm.tqdm(pool.imap_unordered(convert_func, filenames), total=len(filenames)): + pass + + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser(description='Optimize noise files') + PARSER.add_argument('--from_dir', help='Convert wav from directory', type=str) + PARSER.add_argument('--to_dir', help='save wav to directory', type=str) + PARSER.add_argument('--min_sec', help='min duration seconds of saved file', type=float, default=1.0) + PARSER.add_argument('--max_sec', help='max duration seconds of saved file', type=float, default=30.0) + PARSER.add_argument('--normalize', action='store_true', help='Normalize sound range, default is true', default=True) + PARSER.add_argument('--trim', action='store_true', help='Trim silence, default is true', default=True) + PARAMS = PARSER.parse_args() + + main(PARAMS.from_dir, PARAMS.to_dir, PARAMS.min_sec, PARAMS.max_sec, PARAMS.normalize, PARAMS.trim) + + DURATION, FILE_NUM = get_noise_duration(PARAMS.to_dir) + print("Your noise dataset has {} files and a duration of {}\n".format(FILE_NUM, secs_to_hours(DURATION))) From 9e2648a47c04010e8ba28121fab9e0dd8f2bd010 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Sat, 22 Feb 2020 00:40:09 +0800 Subject: [PATCH 15/25] [MOD] mix noise into complete audio --- util/audio_augmentation.py | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/util/audio_augmentation.py b/util/audio_augmentation.py index 283a54ed90..d23fe207d4 100644 --- a/util/audio_augmentation.py +++ b/util/audio_augmentation.py @@ -25,33 +25,18 @@ def augment_noise(audio, change_noise_db_min=-25): decoded_audio_len = tf.shape(audio)[0] - noise_decoded_audio_len = tf.shape(noise_audio)[0] + decoded_noise_len = tf.shape(noise_audio)[0] - multiply = tf.math.floordiv(decoded_audio_len, noise_decoded_audio_len) + 1 + multiply = 
tf.math.floordiv(decoded_audio_len, decoded_noise_len) + 1 noise_audio_tile = tf.tile(noise_audio, [multiply, 1]) - # now noise_decoded_len must > decoded_len - noise_decoded_audio_len = tf.shape(noise_audio_tile)[0] + # Now, decoded_noise_len must > decoded_audio_len + decoded_noise_len = tf.shape(noise_audio_tile)[0] - mix_decoded_start_end_points = tfv1.random_uniform( - [2], minval=0, maxval=decoded_audio_len-1, dtype=tf.int32) - mix_decoded_start_point = tf.math.reduce_min(mix_decoded_start_end_points) - mix_decoded_end_point = tf.math.reduce_max( - mix_decoded_start_end_points) + 1 - mix_decoded_width = mix_decoded_end_point - mix_decoded_start_point - - left_zeros = tf.zeros(shape=[mix_decoded_start_point, 1]) - - mix_noise_decoded_start_point = tfv1.random_uniform( - [], minval=0, maxval=noise_decoded_audio_len - mix_decoded_width, dtype=tf.int32) - mix_noise_decoded_end_point = mix_noise_decoded_start_point + mix_decoded_width - extract_noise_decoded = noise_audio_tile[mix_noise_decoded_start_point:mix_noise_decoded_end_point, :] - - right_zeros = tf.zeros( - shape=[decoded_audio_len - mix_decoded_end_point, 1]) - - mixed_noise = tf.concat( - [left_zeros, extract_noise_decoded, right_zeros], axis=0) + mix_decoded_start_point = tfv1.random_uniform( + [], minval=0, maxval=decoded_noise_len-decoded_audio_len, dtype=tf.int32) + mix_decoded_end_point = mix_decoded_start_point + decoded_audio_len + extract_noise_decoded = noise_audio_tile[mix_decoded_start_point:mix_decoded_end_point, :] choosen_audio_db = tfv1.random_uniform( [], minval=change_audio_db_min, maxval=change_audio_db_max) @@ -60,5 +45,5 @@ def augment_noise(audio, choosen_noise_db = tfv1.random_uniform( [], minval=change_noise_db_min, maxval=change_noise_db_max) noise_ratio = tf.math.pow(10.0, choosen_noise_db / 10) - mixed_audio = tf.multiply(audio, audio_ratio) + tf.multiply(mixed_noise, noise_ratio) + mixed_audio = tf.multiply(audio, audio_ratio) + tf.multiply(extract_noise_decoded, noise_ratio) return tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0) From 2269514a9ef676100b46f0c99c0e6a7150feb4dd Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Fri, 6 Mar 2020 16:19:09 +0800 Subject: [PATCH 16/25] [ADD] dev/test dataset can also mix noise [MOD] use SNR to balance noise/speech volume, refactor the called functions to accept noise arguments --- DeepSpeech.py | 9 ++-- evaluate.py | 4 +- util/audio_augmentation.py | 85 +++++++++++++++++++++++++++++--------- util/feeding.py | 58 +++++++++++++------------- util/flags.py | 12 +++--- 5 files changed, 109 insertions(+), 59 deletions(-) diff --git a/DeepSpeech.py b/DeepSpeech.py index 3f62050236..6186fc6723 100755 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -436,7 +436,8 @@ def train(): batch_size=FLAGS.train_batch_size, enable_cache=FLAGS.feature_cache and do_cache_dataset, cache_path=FLAGS.feature_cache, - train_phase=True) + train_phase=True, + noise_dirs=FLAGS.audio_aug_mix_noise_walk_train_dirs) iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set), tfv1.data.get_output_shapes(train_set), @@ -447,7 +448,7 @@ def train(): if FLAGS.dev_files: dev_csvs = FLAGS.dev_files.split(',') - dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False) for csv in dev_csvs] + dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_dirs=FLAGS.audio_aug_mix_noise_walk_dev_dirs) for csv in dev_csvs] dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets] # Dropout @@ -673,7 +674,7 @@ def 
__call__(self, progress, data, **kwargs): def test(): - samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading) + samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_dirs=FLAGS.audio_aug_mix_noise_walk_test_dirs) if FLAGS.test_output_file: # Save decoded tuples as JSON, converting NumPy floats to Python floats json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float) @@ -896,7 +897,7 @@ def do_single_file_inference(input_file_path): print('Could not load checkpoint from {}'.format(FLAGS.checkpoint_dir)) sys.exit(1) - features, features_len = audiofile_to_features(input_file_path) + features, features_len = audiofile_to_features(input_file_path, 0.0) previous_state_c = np.zeros([1, Config.n_cell_dim]) previous_state_h = np.zeros([1, Config.n_cell_dim]) diff --git a/evaluate.py b/evaluate.py index 435a6be8a6..dd655a36cf 100755 --- a/evaluate.py +++ b/evaluate.py @@ -41,7 +41,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet): return [alphabet.decode(res) for res in results] -def evaluate(test_csvs, create_model, try_loading): +def evaluate(test_csvs, create_model, try_loading, noise_dirs=None): if FLAGS.lm_binary_path: scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path, FLAGS.lm_trie_path, @@ -50,7 +50,7 @@ def evaluate(test_csvs, create_model, try_loading): scorer = None test_csvs = FLAGS.test_files.split(',') - test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False) for csv in test_csvs] + test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_dirs=noise_dirs) for csv in test_csvs] iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]), tfv1.data.get_output_shapes(test_sets[0]), output_classes=tfv1.data.get_output_classes(test_sets[0])) diff --git a/util/audio_augmentation.py b/util/audio_augmentation.py index d23fe207d4..c65dc5e9a4 100644 --- a/util/audio_augmentation.py +++ b/util/audio_augmentation.py @@ -1,7 +1,46 @@ +from __future__ import absolute_import, division, print_function + import tensorflow as tf import tensorflow.compat.v1 as tfv1 +import numpy as np from tensorflow.python.ops import gen_audio_ops as contrib_audio import os +from util.logging import log_info + +DBFS_COEF = 20.0 / np.log(10.0) + + +def get_dbfs(wav_filename): + samples = tf.io.read_file(wav_filename) + decoded = contrib_audio.decode_wav(samples, desired_channels=1) + rms = tf.sqrt(tf.reduce_mean(tf.square(decoded.audio))) + dbfs = DBFS_COEF * tf.math.log(rms) + return dbfs + + +def create_noise_iterator(noise_dirs): + """noise_dirs: `str` or `list`""" + if isinstance(noise_dirs, str): + noise_dirs = noise_dirs.split(',') + + noise_filenames = tf.convert_to_tensor( + list(collect_noise_filenames(noise_dirs)), + dtype=tf.string) + log_info("Collect {} noise files for mixing audio".format( + noise_filenames.shape[0])) + + def extract_dbfs(wav_filename): + return wav_filename, get_dbfs(wav_filename) + noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) + .map(extract_dbfs, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .cache() + .shuffle(min(noise_filenames.shape[0], 102400)) + .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .prefetch(tfv1.data.experimental.AUTOTUNE) + .repeat()) + noise_iterator = tfv1.data.make_one_shot_iterator(noise_dataset) + return noise_iterator + def collect_noise_filenames(walk_dirs): assert isinstance(walk_dirs, list) @@ -12,38 +51,44 @@ def 
collect_noise_filenames(walk_dirs): if filename.endswith('.wav'): yield os.path.join(dirpath, filename) -def noise_file_to_audio(noise_file): + +def noise_file_to_audio(noise_file, noise_dbfs): samples = tf.io.read_file(noise_file) decoded = contrib_audio.decode_wav(samples, desired_channels=1) - return decoded.audio + return decoded.audio, noise_dbfs -def augment_noise(audio, - noise_audio, - change_audio_db_max=0, - change_audio_db_min=-10, - change_noise_db_max=-15, - change_noise_db_min=-25): +def augment_noise(audio, + audio_dbfs, + noise, + noise_dbfs, + max_audio_gain_db=5, + min_audio_gain_db=-10, + max_snr_db=30, + min_snr_db=5): decoded_audio_len = tf.shape(audio)[0] - decoded_noise_len = tf.shape(noise_audio)[0] + decoded_noise_len = tf.shape(noise)[0] multiply = tf.math.floordiv(decoded_audio_len, decoded_noise_len) + 1 - noise_audio_tile = tf.tile(noise_audio, [multiply, 1]) + noise_audio_tile = tf.tile(noise, [multiply, 1]) # Now, decoded_noise_len must > decoded_audio_len decoded_noise_len = tf.shape(noise_audio_tile)[0] - mix_decoded_start_point = tfv1.random_uniform( - [], minval=0, maxval=decoded_noise_len-decoded_audio_len, dtype=tf.int32) + mix_decoded_start_point = tfv1.random_uniform([], minval=0, maxval=decoded_noise_len-decoded_audio_len, dtype=tf.int32) mix_decoded_end_point = mix_decoded_start_point + decoded_audio_len extract_noise_decoded = noise_audio_tile[mix_decoded_start_point:mix_decoded_end_point, :] - choosen_audio_db = tfv1.random_uniform( - [], minval=change_audio_db_min, maxval=change_audio_db_max) - audio_ratio = tf.math.pow(10.0, choosen_audio_db / 10) + audio_gain_db = tfv1.random_uniform([], minval=min_audio_gain_db, maxval=max_audio_gain_db) + target_audio_dbfs = audio_dbfs + audio_gain_db + audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 10) + + # target_snr_db := target_audio_dbfs - target_noise_dbfs + target_snr_db = tfv1.random_uniform([], minval=min_snr_db, maxval=max_snr_db) - choosen_noise_db = tfv1.random_uniform( - [], minval=change_noise_db_min, maxval=change_noise_db_max) - noise_ratio = tf.math.pow(10.0, choosen_noise_db / 10) - mixed_audio = tf.multiply(audio, audio_ratio) + tf.multiply(extract_noise_decoded, noise_ratio) - return tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0) + target_noise_dbfs = target_audio_dbfs - target_snr_db + noise_gain_db = target_noise_dbfs - noise_dbfs + noise_gain_ratio = tf.math.pow(10.0, noise_gain_db / 10) + mixed_audio = tf.multiply(audio, audio_gain_ratio) + tf.multiply(extract_noise_decoded, noise_gain_ratio) + mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0) + return mixed_audio diff --git a/util/feeding.py b/util/feeding.py index e9c2fb9ad7..f8e8a7aebc 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -16,7 +16,7 @@ from util.flags import FLAGS from util.spectrogram_augmentations import augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up from util.audio import read_frames_from_file, vad_split, DEFAULT_FORMAT -from util.audio_augmentation import augment_noise, noise_file_to_audio, collect_noise_filenames +from util.audio_augmentation import augment_noise, create_noise_iterator, get_dbfs from util.logging import log_info @@ -66,23 +66,25 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False): return mfccs, tf.shape(input=mfccs)[0] -def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None): +def audiofile_to_features(wav_filename, audio_dbfs, train_phase=False, noise_iterator=None): samples = 
tf.io.read_file(wav_filename) decoded = contrib_audio.decode_wav(samples, desired_channels=1) audio = decoded.audio # augment audio - if train_phase and noise_iterator: + if noise_iterator: + noise, noise_dbfs = noise_iterator.get_next() audio = augment_noise( audio, - noise_iterator.get_next(), - change_audio_db_max=FLAGS.audio_aug_mix_noise_max_audio_db, - change_audio_db_min=FLAGS.audio_aug_mix_noise_min_audio_db, - change_noise_db_max=FLAGS.audio_aug_mix_noise_max_noise_db, - change_noise_db_min=FLAGS.audio_aug_mix_noise_min_noise_db, + audio_dbfs, + noise, + noise_dbfs, + max_audio_gain_db=FLAGS.audio_aug_mix_noise_max_audio_gain_db, + min_audio_gain_db=FLAGS.audio_aug_mix_noise_min_audio_gain_db, + max_snr_db=FLAGS.audio_aug_mix_noise_max_snr_db, + min_snr_db=FLAGS.audio_aug_mix_noise_min_snr_db, ) - features, features_len = samples_to_mfccs(audio, decoded.sample_rate, train_phase=train_phase) # augment features @@ -96,9 +98,9 @@ def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None): return features, features_len -def entry_to_features(wav_filename, transcript, train_phase, noise_iterator=None): +def entry_to_features(wav_filename, transcript, audio_dbfs, train_phase, noise_iterator): # https://bugs.python.org/issue32117 - features, features_len = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator) + features, features_len = audiofile_to_features(wav_filename, audio_dbfs, train_phase=train_phase, noise_iterator=noise_iterator) return wav_filename, features, features_len, tf.SparseTensor(*transcript) @@ -111,7 +113,7 @@ def to_sparse_tuple(sequence): return indices, sequence, shape -def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False): +def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_dirs=None): df = read_csvs(csvs) df.sort_values(by='wav_filesize', inplace=True) @@ -138,26 +140,26 @@ def batch_fn(wav_filenames, features, features_len, transcripts): num_gpus = len(Config.available_devices) - if train_phase and FLAGS.audio_aug_mix_noise_walk_dirs: - # because we have to determine the shuffle size, so we could not use generator - log_info("Enable Mixing Noise Augmentation") - noise_filenames = tf.convert_to_tensor( - list(collect_noise_filenames(FLAGS.audio_aug_mix_noise_walk_dirs.split(','))), - dtype=tf.string) - log_info("Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) - noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) - .shuffle(min(noise_filenames.shape[0], 102400)) - .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) - .prefetch(tf.compat.v1.data.experimental.AUTOTUNE) - .repeat()) - noise_iterator = tf.compat.v1.data.make_one_shot_iterator(noise_dataset) + if noise_dirs: + noise_iterator = create_noise_iterator(noise_dirs) else: noise_iterator = None + process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator) - dataset = (tf.data.Dataset.from_generator(generate_values, - output_types=(tf.string, (tf.int64, tf.int32, tf.int64))) - .map(process_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)) + dataset = tf.data.Dataset.from_generator(generate_values, + output_types=(tf.string, (tf.int64, tf.int32, tf.int64))) + + if noise_dirs: + dataset = (dataset.map(lambda wav_filename, transcript: (wav_filename, transcript, get_dbfs(wav_filename)), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + .cache()) + else: + 
dataset = dataset.map(lambda wav_filename, transcript: (wav_filename, transcript, 0.0), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + dataset = dataset.map( + process_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) if enable_cache: dataset = dataset.cache(cache_path) diff --git a/util/flags.py b/util/flags.py index ecfa90b8d2..69fb81182f 100644 --- a/util/flags.py +++ b/util/flags.py @@ -24,11 +24,13 @@ def create_flags(): # Data Augmentation # ================ - f.DEFINE_string('audio_aug_mix_noise_walk_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_float('audio_aug_mix_noise_max_noise_db', -25, 'to limit noise max volume') - f.DEFINE_float('audio_aug_mix_noise_min_noise_db', -50, 'to limit noise min volume') - f.DEFINE_float('audio_aug_mix_noise_max_audio_db', 0, 'to limit audio max volume') - f.DEFINE_float('audio_aug_mix_noise_min_audio_db', -10, 'to limit audio min volume') + f.DEFINE_string('audio_aug_mix_noise_walk_train_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') + f.DEFINE_string('audio_aug_mix_noise_walk_dev_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') + f.DEFINE_string('audio_aug_mix_noise_walk_test_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') + f.DEFINE_float('audio_aug_mix_noise_max_snr_db', 20, 'to limit noise max volume', lower_bound=0.0) + f.DEFINE_float('audio_aug_mix_noise_min_snr_db', 3, 'to limit noise min volume', lower_bound=0.0) + f.DEFINE_float('audio_aug_mix_noise_max_audio_gain_db', 5, 'to limit audio max volume') + f.DEFINE_float('audio_aug_mix_noise_min_audio_gain_db', -10, 'to limit audio min volume') f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise') f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise') From 0b8147ce8c4a1906de80f1db793b8aa63dc15045 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Mon, 16 Mar 2020 17:07:29 +0800 Subject: [PATCH 17/25] [ADD] use dbfs and SNR to determine the balance of audio/noise, add option to dump audio into tensorboard [FIX] correct gain db formula --- DeepSpeech.py | 33 ++++-- evaluate.py | 6 +- util/audio_augmentation.py | 220 ++++++++++++++++++++++++++++--------- util/feeding.py | 66 ++++++----- util/flags.py | 18 +-- 5 files changed, 244 insertions(+), 99 deletions(-) diff --git a/DeepSpeech.py b/DeepSpeech.py index 6186fc6723..28042fa506 100755 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -218,7 +218,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse): the decoded result and the batch's original Y. 
''' # Obtain the next batch of data - batch_filenames, (batch_x, batch_seq_len), batch_y = iterator.get_next() + batch_filenames, (batch_x, batch_seq_len), batch_y, review_audio = iterator.get_next() if FLAGS.use_cudnn_rnn: rnn_impl = rnn_impl_cudnn_rnn @@ -238,7 +238,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse): avg_loss = tf.reduce_mean(input_tensor=total_loss) # Finally we return the average loss - return avg_loss, non_finite_files + return avg_loss, non_finite_files, review_audio # Adam Optimization @@ -299,7 +299,7 @@ def get_tower_results(iterator, optimizer, dropout_rates): with tf.name_scope('tower_%d' % i): # Calculate the avg_loss and mean_edit_distance and retrieve the decoded # batch along with the original batch's labels (Y) of this tower - avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0) + avg_loss, non_finite_files, review_audio = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0) # Allow for variables to be re-used by the next tower tfv1.get_variable_scope().reuse_variables() @@ -316,6 +316,8 @@ def get_tower_results(iterator, optimizer, dropout_rates): tower_non_finite_files.append(non_finite_files) avg_loss_across_towers = tf.reduce_mean(input_tensor=tower_avg_losses, axis=0) + if FLAGS.augmentation_review_audio_steps: + tfv1.summary.audio(name='step_audio', tensor=review_audio, sample_rate=16000, collections=['step_audio_summaries']) tfv1.summary.scalar(name='step_loss', tensor=avg_loss_across_towers, collections=['step_summaries']) all_non_finite_files = tf.concat(tower_non_finite_files, axis=0) @@ -437,7 +439,7 @@ def train(): enable_cache=FLAGS.feature_cache and do_cache_dataset, cache_path=FLAGS.feature_cache, train_phase=True, - noise_dirs=FLAGS.audio_aug_mix_noise_walk_train_dirs) + noise_dirs_or_files=FLAGS.audio_aug_mix_noise_train_dirs_or_files) iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set), tfv1.data.get_output_shapes(train_set), @@ -448,7 +450,7 @@ def train(): if FLAGS.dev_files: dev_csvs = FLAGS.dev_files.split(',') - dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_dirs=FLAGS.audio_aug_mix_noise_walk_dev_dirs) for csv in dev_csvs] + dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_dirs_or_files=FLAGS.audio_aug_mix_noise_dev_dirs_or_files) for csv in dev_csvs] dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets] # Dropout @@ -484,6 +486,7 @@ def train(): apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step) # Summaries + step_audio_summaries_op = tfv1.summary.merge_all('step_audio_summaries') step_summaries_op = tfv1.summary.merge_all('step_summaries') step_summary_writers = { 'train': tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120), @@ -594,11 +597,20 @@ def __call__(self, progress, data, **kwargs): session.run(init_op) # Batch loop + + i_audio_steps = 0 while True: try: - _, current_step, batch_loss, problem_files, step_summary = \ - session.run([train_op, global_step, loss, non_finite_files, step_summaries_op], - feed_dict=feed_dict) + step_audio_summary = None + if i_audio_steps < FLAGS.augmentation_review_audio_steps and epoch == 0: + _, current_step, batch_loss, problem_files, step_summary, step_audio_summary = \ + session.run([train_op, global_step, loss, non_finite_files, step_summaries_op, step_audio_summaries_op], + 
feed_dict=feed_dict)
+                    i_audio_steps += 1
+                else:
+                    _, current_step, batch_loss, problem_files, step_summary = \
+                        session.run([train_op, global_step, loss, non_finite_files, step_summaries_op],
+                                    feed_dict=feed_dict)
             except tf.errors.OutOfRangeError:
                 break
@@ -612,6 +624,9 @@ def __call__(self, progress, data, **kwargs):
 
                 pbar.update(step_count)
 
+                if step_audio_summary is not None:
+                    step_summary_writer.add_summary(step_audio_summary, current_step)
+
                 step_summary_writer.add_summary(step_summary, current_step)
 
                 if is_train and FLAGS.checkpoint_secs > 0 and time.time() - checkpoint_time > FLAGS.checkpoint_secs:
@@ -674,7 +689,7 @@ def __call__(self, progress, data, **kwargs):
 
 
 def test():
-    samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_dirs=FLAGS.audio_aug_mix_noise_walk_test_dirs)
+    samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_dirs_or_files=FLAGS.audio_aug_mix_noise_test_dirs_or_files)
     if FLAGS.test_output_file:
         # Save decoded tuples as JSON, converting NumPy floats to Python floats
         json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float)
diff --git a/evaluate.py b/evaluate.py
index dd655a36cf..94348e088c 100755
--- a/evaluate.py
+++ b/evaluate.py
@@ -41,7 +41,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet):
     return [alphabet.decode(res) for res in results]
 
 
-def evaluate(test_csvs, create_model, try_loading, noise_dirs=None):
+def evaluate(test_csvs, create_model, try_loading, noise_dirs_or_files=None):
     if FLAGS.lm_binary_path:
         scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                         FLAGS.lm_binary_path, FLAGS.lm_trie_path,
@@ -50,13 +50,13 @@ def evaluate(test_csvs, create_model, try_loading, noise_dirs=None):
         scorer = None
 
     test_csvs = FLAGS.test_files.split(',')
-    test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_dirs=noise_dirs) for csv in test_csvs]
+    test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_dirs_or_files=noise_dirs_or_files) for csv in test_csvs]
     iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]),
                                                  tfv1.data.get_output_shapes(test_sets[0]),
                                                  output_classes=tfv1.data.get_output_classes(test_sets[0]))
     test_init_ops = [iterator.make_initializer(test_set) for test_set in test_sets]
 
-    batch_wav_filename, (batch_x, batch_x_len), batch_y = iterator.get_next()
+    batch_wav_filename, (batch_x, batch_x_len), batch_y, _ = iterator.get_next()
 
     # One rate per layer
     no_dropout = [None] * 6
diff --git a/util/audio_augmentation.py b/util/audio_augmentation.py
index c65dc5e9a4..7f7705e825 100644
--- a/util/audio_augmentation.py
+++ b/util/audio_augmentation.py
@@ -6,89 +6,207 @@
 from tensorflow.python.ops import gen_audio_ops as contrib_audio
 import os
 from util.logging import log_info
+from util.config import Config
 
 DBFS_COEF = 10.0 / np.log(10.0)
 
 
-def get_dbfs(wav_filename):
+def filename_to_audio(wav_filename):
+    r"""Decode `wav_filename` and return the audio
+
+    Args:
+        wav_filename: A str, the path of the wav file
+
+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1].
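+
+    Example (a minimal sketch; the wav path and the running `session` are assumptions):
+        >>> audio = filename_to_audio('noise/office.wav')  # 2-D float32 Tensor
+        >>> session.run(audio).shape                       # e.g. (52480, 1), values in [-1.0, 1.0]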
+ """ samples = tf.io.read_file(wav_filename) decoded = contrib_audio.decode_wav(samples, desired_channels=1) - rms = tf.sqrt(tf.reduce_mean(tf.square(decoded.audio))) - dbfs = DBFS_COEF * tf.math.log(rms) - return dbfs + return decoded.audio + +def audio_to_dbfs(audio, sample_rate=16000, chunk_ms=100, reduce_funcs=tf.reduce_mean): + r"""Separately measure the chunks dbfs of `audio`, then return the statistics values through `reduce_funcs + + Args: + audio: A 2-D Tensor with shape [`time-steps`, 1]. + sample_rate: An integer, specifying the audio sample rate to determining the chunk size for dbfs measurement. + chunk_ms: An integer in milliseconds unit, specifying each chunk size for separately measuring dbfs, default is `100ms` + reduce_funcs: A function or A list of function, specifying the statistics method to chunks, default is tf.reduce_mean + + Returns: + A float or A list of float, depends on reduce_funcs is function or list of function + """ + assert chunk_ms % 10 == 0, 'chunk_ms must be a multiple of 10' + + audio_len = tf.shape(audio)[0] + chunk_len = tf.math.floordiv(sample_rate, tf.math.floordiv(1000, chunk_ms)) # default: 1600 + n_chunks = tf.math.floordiv(audio_len, chunk_len) + trim_audio_len = tf.multiply(n_chunks, chunk_len) + audio = audio[:trim_audio_len] + splits = tf.reshape(audio, shape=[n_chunks, -1]) + + squares = tf.square(splits) + means = tf.reduce_mean(squares, axis=1) + # the statistics functions must execute before tf.log(), or the gain db would be wrong + if not isinstance(reduce_funcs, list): + reduces = reduce_funcs(means) + return DBFS_COEF * tf.math.log(reduces + 1e-8) -def create_noise_iterator(noise_dirs): - """noise_dirs: `str` or `list`""" - if isinstance(noise_dirs, str): - noise_dirs = noise_dirs.split(',') + reduces = [reduce_func(means) for reduce_func in reduce_funcs] + return [DBFS_COEF * tf.math.log(reduce + 1e-8) for reduce in reduces] - noise_filenames = tf.convert_to_tensor( - list(collect_noise_filenames(noise_dirs)), - dtype=tf.string) - log_info("Collect {} noise files for mixing audio".format( - noise_filenames.shape[0])) - def extract_dbfs(wav_filename): - return wav_filename, get_dbfs(wav_filename) +def create_noise_iterator(noise_dirs_or_files, read_csvs_func): + r"""Create an iterator to yield audio + + Args: + noise_dirs_or_files: A list/tuple of str, the collection source of wav filenames. + read_csvs_func: A function, please specify the `read_csvs()` function from `util/feeding.py`, which is to prevent recursive import error. + + Returns: + An one shot iterator of audio with 2-D Tensor of shape [`time-step`, 1], use `.get_next()` to get the Tensor. 
+ """ + if isinstance(noise_dirs_or_files, str): + noise_dirs_or_files = noise_dirs_or_files.split(',') + + noise_filenames = tf.convert_to_tensor(list(collect_noise_filenames(noise_dirs_or_files, read_csvs_func)), dtype=tf.string) + log_info("Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) + noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) - .map(extract_dbfs, num_parallel_calls=tf.data.experimental.AUTOTUNE) - .cache() .shuffle(min(noise_filenames.shape[0], 102400)) - .map(noise_file_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) + .map(filename_to_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE) .prefetch(tfv1.data.experimental.AUTOTUNE) .repeat()) noise_iterator = tfv1.data.make_one_shot_iterator(noise_dataset) return noise_iterator -def collect_noise_filenames(walk_dirs): - assert isinstance(walk_dirs, list) +def collect_noise_filenames(dirs_or_files, read_csvs_func): + r"""Collect wav filenames from directories or csv files - for d in walk_dirs: - for dirpath, _, filenames in os.walk(d): - for filename in filenames: - if filename.endswith('.wav'): - yield os.path.join(dirpath, filename) + Args: + dirs_or_files: A list/tuple of str, the collection source of wav filenames. + read_csvs_func: A function, please specify the `read_csvs()` function from `util/feeding.py`, which is to prevent recursive import error. + Returns: + An iterator of str, yield every filename suffix with `.wav` or under `wav_filename` column of DataFrame + """ -def noise_file_to_audio(noise_file, noise_dbfs): - samples = tf.io.read_file(noise_file) - decoded = contrib_audio.decode_wav(samples, desired_channels=1) - return decoded.audio, noise_dbfs + assert isinstance(dirs_or_files, (list, tuple)) + + for dir_or_file in dirs_or_files: + assert os.path.exists(dir_or_file) + if os.path.isdir(dir_or_file): + for dirpath, _, filenames in os.walk(dir_or_file): + for filename in filenames: + if filename.endswith('.wav'): + yield os.path.join(dirpath, filename) + elif os.path.isfile(dir_or_file): + df = read_csvs_func([dir_or_file]) + for filename in df['wav_filename']: + yield filename def augment_noise(audio, - audio_dbfs, noise, - noise_dbfs, - max_audio_gain_db=5, - min_audio_gain_db=-10, - max_snr_db=30, - min_snr_db=5): - decoded_audio_len = tf.shape(audio)[0] - decoded_noise_len = tf.shape(noise)[0] + min_audio_dbfs=0.0, + max_audio_dbfs=-35.0, + min_snr_db=3.0, + max_snr_db=30.0, + limit_audio_peak_dbfs=7.0, + limit_noise_peak_dbfs=3.0, + sample_rate=16000): + r"""Mix audio Tensor with noise Tensor + + If the noise length is shorter than audio, the process will automaticaly repeat the noise file to over audio length, + The process randomly choose a duration of the noise to complete coverage the audio, + i.e. the shapes between the choosen duration of noise and audio are equal. - multiply = tf.math.floordiv(decoded_audio_len, decoded_noise_len) + 1 - noise_audio_tile = tf.tile(noise, [multiply, 1]) + Args: + audio: A 2-D Tensor with shape [`time-steps`, 1]. + noise: A 2-D Tensor with shape [`time-steps`, 1]. + min_audio_dbfs: A float in dbfs unit, specifying the `minimum` volume of audio during gaining audio. + max_audio_dbfs: A float in dbfs unit, specifying the `maximum` volume of audio during gaining audio. + min_snr_db: A float in db unit, specifying the minimum signal-to-noise ratio during gaining audio and noise. + max_snr_db: A float in db unit, specifying the maximum signal-to-noise ratio during gaining audio and noise. 
 def augment_noise(audio,
-                  audio_dbfs,
                   noise,
-                  noise_dbfs,
-                  max_audio_gain_db=5,
-                  min_audio_gain_db=-10,
-                  max_snr_db=30,
-                  min_snr_db=5):
-    decoded_audio_len = tf.shape(audio)[0]
-    decoded_noise_len = tf.shape(noise)[0]
+                  min_audio_dbfs=0.0,
+                  max_audio_dbfs=-35.0,
+                  min_snr_db=3.0,
+                  max_snr_db=30.0,
+                  limit_audio_peak_dbfs=7.0,
+                  limit_noise_peak_dbfs=3.0,
+                  sample_rate=16000):
+    r"""Mix audio Tensor with noise Tensor
+
+    If the noise is shorter than the audio, it is automatically repeated until it is longer than the audio.
+    The process then randomly chooses a noise segment that completely covers the audio,
+    i.e. the chosen noise segment and the audio have equal shapes.

-    multiply = tf.math.floordiv(decoded_audio_len, decoded_noise_len) + 1
-    noise_audio_tile = tf.tile(noise, [multiply, 1])
+    Args:
+        audio: A 2-D Tensor with shape [`time-steps`, 1].
+        noise: A 2-D Tensor with shape [`time-steps`, 1].
+        min_audio_dbfs: A float in dbfs unit, specifying the `minimum` volume of audio during gaining audio.
+        max_audio_dbfs: A float in dbfs unit, specifying the `maximum` volume of audio during gaining audio.
+        min_snr_db: A float in db unit, specifying the minimum signal-to-noise ratio during gaining audio and noise.
+        max_snr_db: A float in db unit, specifying the maximum signal-to-noise ratio during gaining audio and noise.
+        limit_audio_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the audio; the audio volume will not be gained above this value.
+        limit_noise_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the noise; the noise volume will not be gained above this value.
+        sample_rate: An integer, specifying the audio sample rate used to determine the chunk size for the dbfs measurement.

+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1]. Has the same type and shape as `audio`.
+    """

+    audio_len = tf.shape(audio)[0]
+    noise_len = tf.shape(noise)[0]
+
+    audio_mean_dbfs, audio_max_dbfs = audio_to_dbfs(audio, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+
+    multiply = tf.math.floordiv(audio_len, noise_len) + 1
+    noise_tile = tf.tile(noise, [multiply, 1])
+
-    # Now, decoded_noise_len must > decoded_audio_len
-    decoded_noise_len = tf.shape(noise_audio_tile)[0]
+    # Now, noise_len must > audio_len
+    noise_tile_len = tf.shape(noise_tile)[0]

-    mix_decoded_start_point = tfv1.random_uniform([], minval=0, maxval=decoded_noise_len-decoded_audio_len, dtype=tf.int32)
-    mix_decoded_end_point = mix_decoded_start_point + decoded_audio_len
-    extract_noise_decoded = noise_audio_tile[mix_decoded_start_point:mix_decoded_end_point, :]
+    mix_decoded_start_point = tfv1.random_uniform([], minval=0, maxval=noise_tile_len-audio_len, dtype=tf.int32)
+    mix_decoded_end_point = mix_decoded_start_point + audio_len
+    extract_noise = noise_tile[mix_decoded_start_point:mix_decoded_end_point, :]
+
+    extract_noise_mean_dbfs, extract_noise_max_dbfs = audio_to_dbfs(extract_noise, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+
+    target_audio_dbfs = tfv1.random_uniform([], minval=min_audio_dbfs, maxval=max_audio_dbfs)
+
+    audio_gain_db = target_audio_dbfs - audio_mean_dbfs
+
+    # limit audio peak
+    audio_gain_db = tf.minimum(limit_audio_peak_dbfs - audio_max_dbfs, audio_gain_db)
+    target_audio_dbfs = audio_mean_dbfs + audio_gain_db
+
+    audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 20.0)

-    audio_gain_db = tfv1.random_uniform([], minval=min_audio_gain_db, maxval=max_audio_gain_db)
-    target_audio_dbfs = audio_dbfs + audio_gain_db
-    audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 10)
     # target_snr_db := target_audio_dbfs - target_noise_dbfs
     target_snr_db = tfv1.random_uniform([], minval=min_snr_db, maxval=max_snr_db)

     target_noise_dbfs = target_audio_dbfs - target_snr_db
-    noise_gain_db = target_noise_dbfs - noise_dbfs
-    noise_gain_ratio = tf.math.pow(10.0, noise_gain_db / 10)
-    mixed_audio = tf.multiply(audio, audio_gain_ratio) + tf.multiply(extract_noise_decoded, noise_gain_ratio)
+    noise_gain_db = target_noise_dbfs - extract_noise_mean_dbfs
+
+    # limit noise peak
+    noise_gain_db = tf.minimum(limit_noise_peak_dbfs - extract_noise_max_dbfs, noise_gain_db)
+    noise_gain_ratio = tf.math.pow(10.0, noise_gain_db / 20.0)
+
+    mixed_audio = tf.multiply(audio, audio_gain_ratio) + tf.multiply(extract_noise, noise_gain_ratio)
+
+    mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0)
+
+    return mixed_audio
+
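The gain staging above works in the log domain and converts each dB change to a linear amplitude factor with 10**(dB/20) (amplitude gains use /20, while the dbfs measurement itself is a power quantity, hence the 10/ln(10) coefficient). A worked example in plain Python, with invented measurement values:

    # Assumed measurements (hypothetical values for illustration):
    audio_mean_dbfs = -30.0           # measured speech level
    target_audio_dbfs = -20.0         # sampled from [min_audio_dbfs, max_audio_dbfs]
    audio_gain_db = target_audio_dbfs - audio_mean_dbfs    # +10 dB
    audio_gain_ratio = 10.0 ** (audio_gain_db / 20.0)      # ~3.16x amplitude

    noise_mean_dbfs = -15.0
    target_snr_db = 12.0              # sampled from [min_snr_db, max_snr_db]
    target_noise_dbfs = target_audio_dbfs - target_snr_db  # -32 dbfs
    noise_gain_db = target_noise_dbfs - noise_mean_dbfs    # -17 dB
    noise_gain_ratio = 10.0 ** (noise_gain_db / 20.0)      # ~0.141x amplitude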
+ """ + frame_length = int(Config.audio_window_samples) + frame_step = int(Config.audio_step_samples) + fft_length = 512 + spectrogram = tf.reshape(spectrogram, shape=[1, -1, 257]) + abs_spectrogram = tf.abs(spectrogram) + + def reconstruct_phases(prev_phases): + xi = tf.complex(abs_spectrogram, 0.0) * prev_phases + audio = tf.signal.inverse_stft(xi, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length) + next_xi = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length) + next_phases = tf.math.exp(tf.complex(0.0, tf.angle(next_xi))) + return next_phases + + rands = tfv1.random_uniform(tf.shape(spectrogram), dtype=tf.float32) + phases = tf.math.exp(tf.complex(0.0, 2.0 * np.pi * rands)) + + reconstructed_phases = tf.while_loop(lambda _: True, reconstruct_phases, [phases], maximum_iterations=10) + xi = tf.complex(abs_spectrogram, 0.0) * reconstructed_phases + audio = tf.signal.inverse_stft(xi, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length) + return tf.transpose(audio) diff --git a/util/feeding.py b/util/feeding.py index f8e8a7aebc..eb46ecdd72 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -16,8 +16,7 @@ from util.flags import FLAGS from util.spectrogram_augmentations import augment_freq_time_mask, augment_dropout, augment_pitch_and_tempo, augment_speed_up from util.audio import read_frames_from_file, vad_split, DEFAULT_FORMAT -from util.audio_augmentation import augment_noise, create_noise_iterator, get_dbfs -from util.logging import log_info +from util.audio_augmentation import augment_noise, create_noise_iterator, gla def read_csvs(csv_files): @@ -63,29 +62,38 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False): mfccs = contrib_audio.mfcc(spectrogram, sample_rate, dct_coefficient_count=Config.n_input) mfccs = tf.reshape(mfccs, [-1, Config.n_input]) - return mfccs, tf.shape(input=mfccs)[0] + review_audio = samples + if FLAGS.augmentation_review_audio_steps and train_phase and any([ + FLAGS.augmentation_spec_dropout_keeprate < 1, + FLAGS.augmentation_freq_and_time_masking, + FLAGS.augmentation_pitch_and_tempo_scaling, + FLAGS.augmentation_speed_up_std > 0]): + review_audio = gla(spectrogram) + return mfccs, tf.shape(input=mfccs)[0], review_audio -def audiofile_to_features(wav_filename, audio_dbfs, train_phase=False, noise_iterator=None): + +def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None): samples = tf.io.read_file(wav_filename) decoded = contrib_audio.decode_wav(samples, desired_channels=1) audio = decoded.audio # augment audio if noise_iterator: - noise, noise_dbfs = noise_iterator.get_next() + noise = noise_iterator.get_next() audio = augment_noise( audio, - audio_dbfs, noise, - noise_dbfs, - max_audio_gain_db=FLAGS.audio_aug_mix_noise_max_audio_gain_db, - min_audio_gain_db=FLAGS.audio_aug_mix_noise_min_audio_gain_db, - max_snr_db=FLAGS.audio_aug_mix_noise_max_snr_db, + min_audio_dbfs=FLAGS.audio_aug_mix_noise_min_audio_dbfs, + max_audio_dbfs=FLAGS.audio_aug_mix_noise_max_audio_dbfs, min_snr_db=FLAGS.audio_aug_mix_noise_min_snr_db, + max_snr_db=FLAGS.audio_aug_mix_noise_max_snr_db, + limit_audio_peak_dbfs=FLAGS.audio_aug_mix_noise_limit_audio_peak_dbfs, + limit_noise_peak_dbfs=FLAGS.audio_aug_mix_noise_limit_noise_peak_dbfs, + sample_rate=FLAGS.audio_sample_rate, ) - features, features_len = samples_to_mfccs(audio, decoded.sample_rate, train_phase=train_phase) + features, features_len, review_audio = samples_to_mfccs(audio, decoded.sample_rate, 
train_phase=train_phase) # augment features if train_phase: @@ -95,13 +103,13 @@ def audiofile_to_features(wav_filename, audio_dbfs, train_phase=False, noise_ite if FLAGS.data_aug_features_additive > 0: features = features+tf.random.normal(mean=0.0, stddev=FLAGS.data_aug_features_additive, shape=tf.shape(features)) - return features, features_len + return features, features_len, review_audio -def entry_to_features(wav_filename, transcript, audio_dbfs, train_phase, noise_iterator): +def entry_to_features(wav_filename, transcript, train_phase, noise_iterator): # https://bugs.python.org/issue32117 - features, features_len = audiofile_to_features(wav_filename, audio_dbfs, train_phase=train_phase, noise_iterator=noise_iterator) - return wav_filename, features, features_len, tf.SparseTensor(*transcript) + features, features_len, review_audio = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator) + return wav_filename, features, features_len, tf.SparseTensor(*transcript), review_audio def to_sparse_tuple(sequence): @@ -113,7 +121,7 @@ def to_sparse_tuple(sequence): return indices, sequence, shape -def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_dirs=None): +def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_dirs_or_files=None): df = read_csvs(csvs) df.sort_values(by='wav_filesize', inplace=True) @@ -130,18 +138,26 @@ def sparse_reshape(sparse): shape = sparse.dense_shape return tf.sparse.reshape(sparse, [shape[0], shape[2]]) - def batch_fn(wav_filenames, features, features_len, transcripts): + def batch_fn(wav_filenames, features, features_len, transcripts, review_audios): features = tf.data.Dataset.zip((features, features_len)) features = features.padded_batch(batch_size, padded_shapes=([None, Config.n_input], [])) transcripts = transcripts.batch(batch_size).map(sparse_reshape) wav_filenames = wav_filenames.batch(batch_size) - return tf.data.Dataset.zip((wav_filenames, features, transcripts)) + + # In order not to waste too much prefetch performance, randomly extract only `one` audio for each step + if FLAGS.augmentation_review_audio_steps and batch_size > 1: + skip_size = tf.random.uniform(shape=[], minval=0, maxval=batch_size - 1, dtype=tf.int64) + review_audio = review_audios.skip(skip_size).batch(1) + else: + review_audio = review_audios.batch(1) + + return tf.data.Dataset.zip((wav_filenames, features, transcripts, review_audio)) num_gpus = len(Config.available_devices) - if noise_dirs: - noise_iterator = create_noise_iterator(noise_dirs) + if noise_dirs_or_files: + noise_iterator = create_noise_iterator(noise_dirs_or_files, read_csvs) else: noise_iterator = None @@ -150,14 +166,6 @@ def batch_fn(wav_filenames, features, features_len, transcripts): dataset = tf.data.Dataset.from_generator(generate_values, output_types=(tf.string, (tf.int64, tf.int32, tf.int64))) - if noise_dirs: - dataset = (dataset.map(lambda wav_filename, transcript: (wav_filename, transcript, get_dbfs(wav_filename)), - num_parallel_calls=tf.data.experimental.AUTOTUNE) - .cache()) - else: - dataset = dataset.map(lambda wav_filename, transcript: (wav_filename, transcript, 0.0), - num_parallel_calls=tf.data.experimental.AUTOTUNE) - dataset = dataset.map( process_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) @@ -190,7 +198,7 @@ def generate_values(): yield time_start, time_end, samples def to_mfccs(time_start, time_end, samples): - features, features_len = samples_to_mfccs(samples, 
sample_rate) + features, features_len, _ = samples_to_mfccs(samples, sample_rate) return time_start, time_end, features, features_len def create_batch_set(bs, criteria): diff --git a/util/flags.py b/util/flags.py index 69fb81182f..176ee53c05 100644 --- a/util/flags.py +++ b/util/flags.py @@ -24,13 +24,15 @@ def create_flags(): # Data Augmentation # ================ - f.DEFINE_string('audio_aug_mix_noise_walk_train_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_string('audio_aug_mix_noise_walk_dev_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_string('audio_aug_mix_noise_walk_test_dirs', '', 'walk through wav dir, then mix noise wav into decoded audio') - f.DEFINE_float('audio_aug_mix_noise_max_snr_db', 20, 'to limit noise max volume', lower_bound=0.0) - f.DEFINE_float('audio_aug_mix_noise_min_snr_db', 3, 'to limit noise min volume', lower_bound=0.0) - f.DEFINE_float('audio_aug_mix_noise_max_audio_gain_db', 5, 'to limit audio max volume') - f.DEFINE_float('audio_aug_mix_noise_min_audio_gain_db', -10, 'to limit audio min volume') + f.DEFINE_string('audio_aug_mix_noise_train_dirs_or_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into train dataset, if empty, no mix will be run in train phase') + f.DEFINE_string('audio_aug_mix_noise_dev_dirs_or_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into dev dataset, if empty, no mix will be run in dev phase') + f.DEFINE_string('audio_aug_mix_noise_test_dirs_or_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into test dataset, if empty, no mix will be run in test phase') + f.DEFINE_float('audio_aug_mix_noise_min_audio_dbfs', 0, 'min value of dbfs to specify the min volume of audio during gaining audio') + f.DEFINE_float('audio_aug_mix_noise_max_audio_dbfs', -35, 'max value of dbfs to specify the max volume of audio during gaining audio') + f.DEFINE_float('audio_aug_mix_noise_min_snr_db', 3, 'min value of db to specify the min signal-to-noise ratio during gaining audio and noise') + f.DEFINE_float('audio_aug_mix_noise_max_snr_db', 30, 'max value of db to specify the max signal-to-noise ratio during gaining audio and noise') + f.DEFINE_float('audio_aug_mix_noise_limit_audio_peak_dbfs', 7.0, 'max value of dbfs to specify the limitation of max audio dbfs of chunks, the audio volume will not gain over than the specified value') + f.DEFINE_float('audio_aug_mix_noise_limit_noise_peak_dbfs', 3.0, 'max value of dbfs to specify the limitation of max noise dbfs of chunks, the noise volume will not gain over than the specified value') f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise') f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise') @@ -50,6 +52,8 @@ def create_flags(): f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_pitch', 1.2, 'max value of pitch scaling') f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_tempo', 1.2, 'max vlaue of tempo scaling') + f.DEFINE_integer('augmentation_review_audio_steps', 0, 'number of audio, push the audio into summary directory (if 0, no audio will be dumped)') + # Global Constants # ================ From 42bc45b198cdc3a3eff41fe369bf88d16b325163 Mon Sep 17 00:00:00 2001 From: Yi-Hua Chiu Date: Thu, 19 Mar 2020 15:04:09 +0800 Subject: [PATCH 18/25] [FIX] audiofile_to_features & 
samples_to_mfccs return 3 values now, add FLAGS.train_augmentation_files as condition to judge cache dataset or not, change constant to FLAGS [MOD] rename variables --- DeepSpeech.py | 23 ++++++++++++----------- evaluate.py | 4 ++-- util/audio_augmentation.py | 24 ++++++++++++------------ util/feeding.py | 22 +++++++++++----------- util/flags.py | 20 ++++++++++---------- 5 files changed, 47 insertions(+), 46 deletions(-) diff --git a/DeepSpeech.py b/DeepSpeech.py index 28042fa506..c0897cf4a8 100755 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -316,8 +316,8 @@ def get_tower_results(iterator, optimizer, dropout_rates): tower_non_finite_files.append(non_finite_files) avg_loss_across_towers = tf.reduce_mean(input_tensor=tower_avg_losses, axis=0) - if FLAGS.augmentation_review_audio_steps: - tfv1.summary.audio(name='step_audio', tensor=review_audio, sample_rate=16000, collections=['step_audio_summaries']) + if FLAGS.review_audio_steps: + tfv1.summary.audio(name='step_audio', tensor=review_audio, sample_rate=FLAGS.audio_sample_rate, collections=['step_audio_summaries']) tfv1.summary.scalar(name='step_loss', tensor=avg_loss_across_towers, collections=['step_summaries']) all_non_finite_files = tf.concat(tower_non_finite_files, axis=0) @@ -430,7 +430,8 @@ def train(): FLAGS.augmentation_spec_dropout_keeprate < 1 or FLAGS.augmentation_freq_and_time_masking or FLAGS.augmentation_pitch_and_tempo_scaling or - FLAGS.augmentation_speed_up_std > 0): + FLAGS.augmentation_speed_up_std > 0 or + FLAGS.train_augmentation_files): do_cache_dataset = False # Create training and validation datasets @@ -439,7 +440,7 @@ def train(): enable_cache=FLAGS.feature_cache and do_cache_dataset, cache_path=FLAGS.feature_cache, train_phase=True, - noise_dirs_or_files=FLAGS.audio_aug_mix_noise_train_dirs_or_files) + noise_sources=FLAGS.train_augmentation_files) iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set), tfv1.data.get_output_shapes(train_set), @@ -450,7 +451,7 @@ def train(): if FLAGS.dev_files: dev_csvs = FLAGS.dev_files.split(',') - dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_dirs_or_files=FLAGS.audio_aug_mix_noise_dev_dirs_or_files) for csv in dev_csvs] + dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_sources=FLAGS.dev_augmentation_files) for csv in dev_csvs] dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets] # Dropout @@ -598,15 +599,15 @@ def __call__(self, progress, data, **kwargs): # Batch loop - i_audio_steps = 0 + audio_summary_steps = 0 while True: try: step_audio_summary = None - if i_audio_steps < FLAGS.augmentation_review_audio_steps and epoch == 0: + if audio_summary_steps < FLAGS.review_audio_steps and epoch == 0: _, current_step, batch_loss, problem_files, step_summary, step_audio_summary = \ session.run([train_op, global_step, loss, non_finite_files, step_summaries_op, step_audio_summaries_op], feed_dict=feed_dict) - i_audio_steps += 1 + audio_summary_steps += 1 else: _, current_step, batch_loss, problem_files, step_summary = \ session.run([train_op, global_step, loss, non_finite_files, step_summaries_op], @@ -689,7 +690,7 @@ def __call__(self, progress, data, **kwargs): def test(): - samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_dirs_or_files=FLAGS.audio_aug_mix_noise_test_dirs_or_files) + samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_sources=FLAGS.test_augmentation_files) if 
FLAGS.test_output_file: # Save decoded tuples as JSON, converting NumPy floats to Python floats json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float) @@ -701,7 +702,7 @@ def create_inference_graph(batch_size=1, n_steps=16, tflite=False): # Create feature computation graph input_samples = tfv1.placeholder(tf.float32, [Config.audio_window_samples], 'input_samples') samples = tf.expand_dims(input_samples, -1) - mfccs, _ = samples_to_mfccs(samples, FLAGS.audio_sample_rate) + mfccs, _, _ = samples_to_mfccs(samples, FLAGS.audio_sample_rate) mfccs = tf.identity(mfccs, name='mfccs') # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input] @@ -912,7 +913,7 @@ def do_single_file_inference(input_file_path): print('Could not load checkpoint from {}'.format(FLAGS.checkpoint_dir)) sys.exit(1) - features, features_len = audiofile_to_features(input_file_path, 0.0) + features, features_len, _ = audiofile_to_features(input_file_path) previous_state_c = np.zeros([1, Config.n_cell_dim]) previous_state_h = np.zeros([1, Config.n_cell_dim]) diff --git a/evaluate.py b/evaluate.py index 94348e088c..d8a3ec853d 100755 --- a/evaluate.py +++ b/evaluate.py @@ -41,7 +41,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet): return [alphabet.decode(res) for res in results] -def evaluate(test_csvs, create_model, try_loading, noise_dirs_or_files=None): +def evaluate(test_csvs, create_model, try_loading, noise_sources=None): if FLAGS.lm_binary_path: scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path, FLAGS.lm_trie_path, @@ -50,7 +50,7 @@ def evaluate(test_csvs, create_model, try_loading, noise_dirs_or_files=None): scorer = None test_csvs = FLAGS.test_files.split(',') - test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_dirs_or_files=noise_dirs_or_files) for csv in test_csvs] + test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources) for csv in test_csvs] iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]), tfv1.data.get_output_shapes(test_sets[0]), output_classes=tfv1.data.get_output_classes(test_sets[0])) diff --git a/util/audio_augmentation.py b/util/audio_augmentation.py index 7f7705e825..fa3b39a528 100644 --- a/util/audio_augmentation.py +++ b/util/audio_augmentation.py @@ -57,7 +57,7 @@ def audio_to_dbfs(audio, sample_rate=16000, chunk_ms=100, reduce_funcs=tf.reduce return [DBFS_COEF * tf.math.log(reduce + 1e-8) for reduce in reduces] -def create_noise_iterator(noise_dirs_or_files, read_csvs_func): +def create_noise_iterator(noise_sources, read_csvs_func): r"""Create an iterator to yield audio Args: @@ -67,10 +67,10 @@ def create_noise_iterator(noise_dirs_or_files, read_csvs_func): Returns: An one shot iterator of audio with 2-D Tensor of shape [`time-step`, 1], use `.get_next()` to get the Tensor. 
""" - if isinstance(noise_dirs_or_files, str): - noise_dirs_or_files = noise_dirs_or_files.split(',') + if isinstance(noise_sources, str): + noise_sources = noise_sources.split(',') - noise_filenames = tf.convert_to_tensor(list(collect_noise_filenames(noise_dirs_or_files, read_csvs_func)), dtype=tf.string) + noise_filenames = tf.convert_to_tensor(list(collect_noise_filenames(noise_sources, read_csvs_func)), dtype=tf.string) log_info("Collect {} noise files for mixing audio".format(noise_filenames.shape[0])) noise_dataset = (tf.data.Dataset.from_tensor_slices(noise_filenames) @@ -82,7 +82,7 @@ def create_noise_iterator(noise_dirs_or_files, read_csvs_func): return noise_iterator -def collect_noise_filenames(dirs_or_files, read_csvs_func): +def collect_noise_filenames(sources, read_csvs_func): r"""Collect wav filenames from directories or csv files Args: @@ -93,17 +93,17 @@ def collect_noise_filenames(dirs_or_files, read_csvs_func): An iterator of str, yield every filename suffix with `.wav` or under `wav_filename` column of DataFrame """ - assert isinstance(dirs_or_files, (list, tuple)) + assert isinstance(sources, (list, tuple)) - for dir_or_file in dirs_or_files: - assert os.path.exists(dir_or_file) - if os.path.isdir(dir_or_file): - for dirpath, _, filenames in os.walk(dir_or_file): + for source in sources: + assert os.path.exists(source) + if os.path.isdir(source): + for dirpath, _, filenames in os.walk(source): for filename in filenames: if filename.endswith('.wav'): yield os.path.join(dirpath, filename) - elif os.path.isfile(dir_or_file): - df = read_csvs_func([dir_or_file]) + elif os.path.isfile(source): + df = read_csvs_func([source]) for filename in df['wav_filename']: yield filename diff --git a/util/feeding.py b/util/feeding.py index eb46ecdd72..24fd59f0ed 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -63,7 +63,7 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False): mfccs = tf.reshape(mfccs, [-1, Config.n_input]) review_audio = samples - if FLAGS.augmentation_review_audio_steps and train_phase and any([ + if FLAGS.review_audio_steps and train_phase and any([ FLAGS.augmentation_spec_dropout_keeprate < 1, FLAGS.augmentation_freq_and_time_masking, FLAGS.augmentation_pitch_and_tempo_scaling, @@ -84,12 +84,12 @@ def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None): audio = augment_noise( audio, noise, - min_audio_dbfs=FLAGS.audio_aug_mix_noise_min_audio_dbfs, - max_audio_dbfs=FLAGS.audio_aug_mix_noise_max_audio_dbfs, - min_snr_db=FLAGS.audio_aug_mix_noise_min_snr_db, - max_snr_db=FLAGS.audio_aug_mix_noise_max_snr_db, - limit_audio_peak_dbfs=FLAGS.audio_aug_mix_noise_limit_audio_peak_dbfs, - limit_noise_peak_dbfs=FLAGS.audio_aug_mix_noise_limit_noise_peak_dbfs, + min_audio_dbfs=FLAGS.audio_aug_min_audio_dbfs, + max_audio_dbfs=FLAGS.audio_aug_max_audio_dbfs, + min_snr_db=FLAGS.audio_aug_min_snr_db, + max_snr_db=FLAGS.audio_aug_max_snr_db, + limit_audio_peak_dbfs=FLAGS.audio_aug_limit_audio_peak_dbfs, + limit_noise_peak_dbfs=FLAGS.audio_aug_limit_noise_peak_dbfs, sample_rate=FLAGS.audio_sample_rate, ) @@ -121,7 +121,7 @@ def to_sparse_tuple(sequence): return indices, sequence, shape -def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_dirs_or_files=None): +def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_sources=None): df = read_csvs(csvs) df.sort_values(by='wav_filesize', inplace=True) @@ -146,7 +146,7 @@ def batch_fn(wav_filenames, features, 
features_len, transcripts, review_audios): wav_filenames = wav_filenames.batch(batch_size) # In order not to waste too much prefetch performance, randomly extract only `one` audio for each step - if FLAGS.augmentation_review_audio_steps and batch_size > 1: + if FLAGS.review_audio_steps and batch_size > 1: skip_size = tf.random.uniform(shape=[], minval=0, maxval=batch_size - 1, dtype=tf.int64) review_audio = review_audios.skip(skip_size).batch(1) else: @@ -156,8 +156,8 @@ def batch_fn(wav_filenames, features, features_len, transcripts, review_audios): num_gpus = len(Config.available_devices) - if noise_dirs_or_files: - noise_iterator = create_noise_iterator(noise_dirs_or_files, read_csvs) + if noise_sources: + noise_iterator = create_noise_iterator(noise_sources, read_csvs) else: noise_iterator = None diff --git a/util/flags.py b/util/flags.py index 176ee53c05..740894edd5 100644 --- a/util/flags.py +++ b/util/flags.py @@ -24,15 +24,15 @@ def create_flags(): # Data Augmentation # ================ - f.DEFINE_string('audio_aug_mix_noise_train_dirs_or_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into train dataset, if empty, no mix will be run in train phase') - f.DEFINE_string('audio_aug_mix_noise_dev_dirs_or_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into dev dataset, if empty, no mix will be run in dev phase') - f.DEFINE_string('audio_aug_mix_noise_test_dirs_or_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into test dataset, if empty, no mix will be run in test phase') - f.DEFINE_float('audio_aug_mix_noise_min_audio_dbfs', 0, 'min value of dbfs to specify the min volume of audio during gaining audio') - f.DEFINE_float('audio_aug_mix_noise_max_audio_dbfs', -35, 'max value of dbfs to specify the max volume of audio during gaining audio') - f.DEFINE_float('audio_aug_mix_noise_min_snr_db', 3, 'min value of db to specify the min signal-to-noise ratio during gaining audio and noise') - f.DEFINE_float('audio_aug_mix_noise_max_snr_db', 30, 'max value of db to specify the max signal-to-noise ratio during gaining audio and noise') - f.DEFINE_float('audio_aug_mix_noise_limit_audio_peak_dbfs', 7.0, 'max value of dbfs to specify the limitation of max audio dbfs of chunks, the audio volume will not gain over than the specified value') - f.DEFINE_float('audio_aug_mix_noise_limit_noise_peak_dbfs', 3.0, 'max value of dbfs to specify the limitation of max noise dbfs of chunks, the noise volume will not gain over than the specified value') + f.DEFINE_string('train_augmentation_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into train dataset, if empty, no mix will be run in train phase') + f.DEFINE_string('dev_augmentation_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into dev dataset, if empty, no mix will be run in dev phase') + f.DEFINE_string('test_augmentation_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into test dataset, if empty, no mix will be run in test phase') + f.DEFINE_float('audio_aug_min_audio_dbfs', 0, 'min value of dbfs to specify the min volume of audio during gaining audio') + f.DEFINE_float('audio_aug_max_audio_dbfs', -35, 'max value of dbfs to specify the max volume of audio during gaining audio') + f.DEFINE_float('audio_aug_min_snr_db', 3, 'min value of db to specify the min 
signal-to-noise ratio during gaining audio and noise') + f.DEFINE_float('audio_aug_max_snr_db', 30, 'max value of db to specify the max signal-to-noise ratio during gaining audio and noise') + f.DEFINE_float('audio_aug_limit_audio_peak_dbfs', 7.0, 'max value of dbfs to specify the limitation of max audio dbfs of chunks, the audio volume will not gain over than the specified value') + f.DEFINE_float('audio_aug_limit_noise_peak_dbfs', 3.0, 'max value of dbfs to specify the limitation of max noise dbfs of chunks, the noise volume will not gain over than the specified value') f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise') f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise') @@ -52,7 +52,7 @@ def create_flags(): f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_pitch', 1.2, 'max value of pitch scaling') f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_tempo', 1.2, 'max vlaue of tempo scaling') - f.DEFINE_integer('augmentation_review_audio_steps', 0, 'number of audio, push the audio into summary directory (if 0, no audio will be dumped)') + f.DEFINE_integer('review_audio_steps', 0, 'number of audio, push the audio into summary directory (if 0, no audio will be dumped)') # Global Constants # ================ From 289722dc2ec81f4f446e152027b9dfdd263cb8ac Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 29 Mar 2020 12:49:40 +0200 Subject: [PATCH 19/25] Fix issues. --- DeepSpeech.py | 8 ++++++-- evaluate.py | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/DeepSpeech.py b/DeepSpeech.py index 92404e07a6..74fcc62b5e 100644 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -239,7 +239,10 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse): logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl) # Compute the CTC loss using TensorFlow's `ctc_loss` - total_loss = tfv1.nn.ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len) + total_loss = tfv1.nn.ctc_loss(labels=batch_y, + inputs=logits, + sequence_length=batch_seq_len, + ignore_longer_outputs_than_inputs=True) # Check if any files lead to non finite loss non_finite_files = tf.gather(batch_filenames, tfv1.where(~tf.math.is_finite(total_loss))) @@ -556,7 +559,8 @@ def __call__(self, progress, data, **kwargs): log_info("Ignoring sparse warp error: {}".format(err)) continue else: - raise + print("Ignoring error:", err) + continue except tf.errors.OutOfRangeError: exception_box.raise_if_set() break diff --git a/evaluate.py b/evaluate.py index d0ce3231c2..94a6b6f027 100755 --- a/evaluate.py +++ b/evaluate.py @@ -70,8 +70,9 @@ def evaluate(test_csvs, create_model): transposed = tf.nn.softmax(tf.transpose(a=logits, perm=[1, 0, 2])) loss = tfv1.nn.ctc_loss(labels=batch_y, - inputs=logits, - sequence_length=batch_x_len) + inputs=logits, + sequence_length=batch_x_len, + ignore_longer_outputs_than_inputs=True) tfv1.train.get_or_create_global_step() @@ -110,6 +111,9 @@ def run_test(init_op, dataset): session.run([batch_wav_filename, transposed, loss, batch_x_len, batch_y]) except tf.errors.OutOfRangeError: break + except tf.errors.InvalidArgumentError as e: + print("Ignoring error:", e) + continue decoded = ctc_beam_search_decoder_batch(batch_logits, batch_lengths, Config.alphabet, FLAGS.beam_width, num_processes=num_processes, scorer=scorer, From 9334e79f2888fc2c48c3f0815b56ab15205369c7 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 29 Mar 2020 12:53:02 +0200 
Subject: [PATCH 20/25] Save invalid files. --- DeepSpeech.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/DeepSpeech.py b/DeepSpeech.py index 74fcc62b5e..efaae80a4c 100644 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -570,6 +570,12 @@ def __call__(self, progress, data, **kwargs): log_error('The following files caused an infinite (or NaN) ' 'loss: {}'.format(','.join(problem_files))) + # Save invalid files + sys.path.append("/DeepSpeech/deepspeech-german/training/") + from filter_invalid_files import add_files_to_excluded + add_files_to_excluded(problem_files) + sys.exit(1) + total_loss += batch_loss step_count += 1 From 40b431b1aefb98d1b7163c373b2a9a4f3a813623 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 29 Mar 2020 19:26:06 +0200 Subject: [PATCH 21/25] Fix merging errors. --- util/feeding.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/util/feeding.py b/util/feeding.py index 3e926b862a..dea2669049 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -2,8 +2,10 @@ from __future__ import absolute_import, division, print_function from functools import partial +import os import numpy as np +import pandas import tensorflow as tf from tensorflow.python.ops import gen_audio_ops as contrib_audio @@ -18,6 +20,18 @@ from util.audio_augmentation import augment_noise, create_noise_iterator, gla +def read_csvs(csv_files): + sets = [] + for csv in csv_files: + file = pandas.read_csv(csv, encoding='utf-8', na_filter=False) + #FIXME: not cross-platform + csv_dir = os.path.dirname(os.path.abspath(csv)) + file['wav_filename'] = file['wav_filename'].str.replace(r'(^[^/])', lambda m: os.path.join(csv_dir, m.group(1))) # pylint: disable=cell-var-from-loop + sets.append(file) + # Concat all sets, drop any extra columns, re-index the final result as 0..N + return pandas.concat(sets, join='inner', ignore_index=True) + + def samples_to_mfccs(samples, sample_rate, train_phase=False, sample_id=None): if train_phase: # We need the lambdas to make TensorFlow happy. 
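The regex in the re-added read_csvs prefixes relative wav paths with the CSV file's own directory while leaving absolute paths alone; as the FIXME notes, testing for a leading '/' is POSIX-only. A pure-Python sketch of the same resolution (hypothetical helper, illustration only):

    import os

    def resolve_wav_path(csv_path, wav_filename):
        # The regex r'(^[^/])' only matches when the first character is not a
        # slash, so absolute paths pass through untouched while relative
        # entries are joined onto the CSV's directory.
        if wav_filename.startswith('/'):
            return wav_filename
        return os.path.join(os.path.dirname(os.path.abspath(csv_path)), wav_filename)

    print(resolve_wav_path('/data/train.csv', 'clips/sample-0001.wav'))
    # -> /data/clips/sample-0001.wav
    print(resolve_wav_path('/data/train.csv', '/abs/path/sample.wav'))
    # -> /abs/path/sample.wav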
@@ -116,9 +130,9 @@ def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None):
     return audio_to_features(decoded.audio, decoded.sample_rate, train_phase=train_phase, sample_id=wav_filename, noise_iterator=noise_iterator)


-def entry_to_features(sample_id, audio, sample_rate, transcript, train_phase=False):
+def entry_to_features(sample_id, audio, sample_rate, transcript, train_phase=False, noise_iterator=None):
     # https://bugs.python.org/issue32117
-    features, features_len, review_audio = audio_to_features(audio, sample_rate, train_phase=train_phase, sample_id=sample_id)
+    features, features_len, review_audio = audio_to_features(audio, sample_rate, train_phase=train_phase, sample_id=sample_id, noise_iterator=noise_iterator)
     sparse_transcript = tf.SparseTensor(*transcript)
     return sample_id, features, features_len, sparse_transcript, review_audio

From f7d1279d2c4bd56d782db083a9d3d355f4239424 Mon Sep 17 00:00:00 2001
From: Yi-Hua Chiu
Date: Tue, 31 Mar 2020 17:15:13 +0800
Subject: [PATCH 22/25] [FIX] replace tqdm with progressbar [ADD] separate
 speech/noise mixing, add option to mix multiple noises into one audio [MOD]
 rename FLAGS, make the number of gla iterations optional

---
 DeepSpeech.py                |  10 +--
 bin/normalize_noise_audio.py |  14 ++--
 evaluate.py                  |   4 +-
 util/audio_augmentation.py   | 121 ++++++++++++++++++++++++-----------
 util/feeding.py              |  36 ++++++-----
 util/flags.py                |  28 +++++---
 6 files changed, 140 insertions(+), 73 deletions(-)

diff --git a/DeepSpeech.py b/DeepSpeech.py
index c0897cf4a8..5ea8a6318f 100755
--- a/DeepSpeech.py
+++ b/DeepSpeech.py
@@ -431,7 +431,8 @@ def train():
         FLAGS.augmentation_freq_and_time_masking or
         FLAGS.augmentation_pitch_and_tempo_scaling or
         FLAGS.augmentation_speed_up_std > 0 or
-        FLAGS.train_augmentation_files):
+        FLAGS.train_augmentation_noise_files or
+        FLAGS.train_augmentation_speech_files):
         do_cache_dataset = False

     # Create training and validation datasets
@@ -440,7 +441,8 @@ def train():
         enable_cache=FLAGS.feature_cache and do_cache_dataset,
         cache_path=FLAGS.feature_cache,
         train_phase=True,
-        noise_sources=FLAGS.train_augmentation_files)
+        noise_sources=FLAGS.train_augmentation_noise_files,
+        speech_sources=FLAGS.train_augmentation_speech_files)

     iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set),
                                                  tfv1.data.get_output_shapes(train_set),
@@ -451,7 +453,7 @@ def train():

     if FLAGS.dev_files:
         dev_csvs = FLAGS.dev_files.split(',')
-        dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_sources=FLAGS.dev_augmentation_files) for csv in dev_csvs]
+        dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False, noise_sources=FLAGS.dev_augmentation_noise_files, speech_sources=FLAGS.dev_augmentation_speech_files) for csv in dev_csvs]
         dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets]

     # Dropout
@@ -690,7 +692,7 @@ def __call__(self, progress, data, **kwargs):


 def test():
-    samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_sources=FLAGS.test_augmentation_files)
+    samples = evaluate(FLAGS.test_files.split(','), create_model, try_loading, noise_sources=FLAGS.test_augmentation_noise_files, speech_sources=FLAGS.test_augmentation_speech_files)
     if FLAGS.test_output_file:
         # Save decoded tuples as JSON, converting NumPy floats to Python floats
         json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float)
diff --git a/bin/normalize_noise_audio.py b/bin/normalize_noise_audio.py
index b487ece01b..2a15fad562 100644
--- 
a/bin/normalize_noise_audio.py +++ b/bin/normalize_noise_audio.py @@ -3,7 +3,6 @@ # Make sure we can import stuff from util/ # This script needs to be run from the root of the DeepSpeech repository -from util.feeding import secs_to_hours from librosa import get_duration from multiprocessing import Pool from functools import partial @@ -11,13 +10,10 @@ import argparse import sys import os +import progressbar sys.path.insert(1, os.path.join(sys.path[0], '..')) -try: - import tqdm -except ImportError as err: - print('[ImportError] try `pip install tqdm`') - raise err +from util.feeding import secs_to_hours try: from pydub import AudioSegment @@ -152,8 +148,10 @@ def main(src_dir, max_duration_seconds=max_duration_seconds) pool = Pool(processes=None) - for _ in tqdm.tqdm(pool.imap_unordered(convert_func, filenames), total=len(filenames)): - pass + pbar = progressbar.ProgressBar(prefix='Preparing Noise Dataset', max_value=len(filenames)).start() + for i, _ in enumerate(pool.imap_unordered(convert_func, filenames)): + pbar.update(i) + pbar.finish() if __name__ == "__main__": diff --git a/evaluate.py b/evaluate.py index d8a3ec853d..113a748835 100755 --- a/evaluate.py +++ b/evaluate.py @@ -41,7 +41,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet): return [alphabet.decode(res) for res in results] -def evaluate(test_csvs, create_model, try_loading, noise_sources=None): +def evaluate(test_csvs, create_model, try_loading, noise_sources=None, speech_sources=None): if FLAGS.lm_binary_path: scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path, FLAGS.lm_trie_path, @@ -50,7 +50,7 @@ def evaluate(test_csvs, create_model, try_loading, noise_sources=None): scorer = None test_csvs = FLAGS.test_files.split(',') - test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources) for csv in test_csvs] + test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources, speech_sources=speech_sources) for csv in test_csvs] iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]), tfv1.data.get_output_shapes(test_sets[0]), output_classes=tfv1.data.get_output_classes(test_sets[0])) diff --git a/util/audio_augmentation.py b/util/audio_augmentation.py index fa3b39a528..6a98f887f4 100644 --- a/util/audio_augmentation.py +++ b/util/audio_augmentation.py @@ -109,13 +109,21 @@ def collect_noise_filenames(sources, read_csvs_func): def augment_noise(audio, - noise, - min_audio_dbfs=0.0, - max_audio_dbfs=-35.0, - min_snr_db=3.0, - max_snr_db=30.0, + noise_iterator=None, + speech_iterator=None, + min_n_noises=0, + max_n_noises=1, + min_n_speakers=0, + max_n_speakers=1, + min_audio_dbfs=-35.0, + max_audio_dbfs=0.0, + min_noise_snr_db=3.0, + max_noise_snr_db=30.0, + min_speech_snr_db=3.0, + max_speech_snr_db=30.0, limit_audio_peak_dbfs=7.0, limit_noise_peak_dbfs=3.0, + limit_speech_peak_dbfs=7.0, sample_rate=16000): r"""Mix audio Tensor with noise Tensor @@ -125,13 +133,21 @@ def augment_noise(audio, Args: audio: A 2-D Tensor with shape [`time-steps`, 1]. - noise: A 2-D Tensor with shape [`time-steps`, 1]. + noise_iterator: A one shot iterator for noise file, the yield item shape is [`time-steps`, 1]. + speech_iterator: A one shot iterator for speech file, the yield item shape is [`time-steps`, 1]. 
+        min_n_noises: An int, the min number of noise audios mixed into each audio
+        max_n_noises: An int, the max number of noise audios mixed into each audio
+        min_n_speakers: An int, the min number of speakers mixed into each audio
+        max_n_speakers: An int, the max number of speakers mixed into each audio
         min_audio_dbfs: A float in dbfs unit, specifying the `minimum` volume of audio during gaining audio.
         max_audio_dbfs: A float in dbfs unit, specifying the `maximum` volume of audio during gaining audio.
-        min_snr_db: A float in db unit, specifying the minimum signal-to-noise ratio during gaining audio and noise.
-        max_snr_db: A float in db unit, specifying the maximum signal-to-noise ratio during gaining audio and noise.
+        min_noise_snr_db: A float in db unit, specifying the `minimum` signal-to-noise ratio during gaining noise.
+        max_noise_snr_db: A float in db unit, specifying the `maximum` signal-to-noise ratio during gaining noise.
+        min_speech_snr_db: A float in db unit, specifying the `minimum` signal-to-noise ratio during gaining speech.
+        max_speech_snr_db: A float in db unit, specifying the `maximum` signal-to-noise ratio during gaining speech.
-        limit_audio_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the audio; the audio volume will not be gained above this value.
-        limit_noise_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the noise; the noise volume will not be gained above this value.
+        limit_audio_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the `audio`; the audio volume will not be gained above this value.
+        limit_noise_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the `noise`; the noise volume will not be gained above this value.
+        limit_speech_peak_dbfs: A float, specifying the maximum chunk dbfs allowed for the `speech`; the speech volume will not be gained above this value.
         sample_rate: An integer, specifying the audio sample rate used to determine the chunk size for the dbfs measurement. 
    Returns:
@@ -139,51 +155,84 @@
     """

     audio_len = tf.shape(audio)[0]
-    noise_len = tf.shape(noise)[0]

     audio_mean_dbfs, audio_max_dbfs = audio_to_dbfs(audio, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+    target_audio_dbfs = tfv1.random_uniform([], minval=min_audio_dbfs, maxval=max_audio_dbfs)
+    audio_gain_db = target_audio_dbfs - audio_mean_dbfs
+
+    # limit audio peak
+    audio_gain_db = tf.minimum(limit_audio_peak_dbfs - audio_max_dbfs, audio_gain_db)
+    target_audio_dbfs = audio_mean_dbfs + audio_gain_db
+    audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 20.0)
+    mixed_audio = tf.multiply(audio, audio_gain_ratio)
+
+    if noise_iterator:
+        n_noise = tfv1.random_uniform([], minval=min_n_noises, maxval=max_n_noises, dtype=tf.int32) if min_n_noises != max_n_noises else min_n_noises
+        def mix_noise_func(au):
+            noise = noise_iterator.get_next()
+            noise, noise_mean_dbfs, noise_max_dbfs = extract_noise(noise, audio_len, sample_rate)
+            return mix(au, target_audio_dbfs, noise, noise_mean_dbfs, noise_max_dbfs, min_noise_snr_db, max_noise_snr_db, limit_noise_peak_dbfs)
+        mixed_audio = tf.while_loop(lambda _: True, mix_noise_func, [mixed_audio], maximum_iterations=n_noise)
+
+    if speech_iterator:
+        n_speakers = tfv1.random_uniform([], minval=min_n_speakers, maxval=max_n_speakers, dtype=tf.int32) if min_n_speakers != max_n_speakers else min_n_speakers
+        def mix_speech_func(au):
+            speech = speech_iterator.get_next()
+            speech, speech_mean_dbfs, speech_max_dbfs = extract_noise(speech, audio_len, sample_rate)
+            return mix(au, target_audio_dbfs, speech, speech_mean_dbfs, speech_max_dbfs, min_speech_snr_db, max_speech_snr_db, limit_speech_peak_dbfs)
+        mixed_audio = tf.while_loop(lambda _: True, mix_speech_func, [mixed_audio], maximum_iterations=n_speakers)
+
+    mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0)
+
+    return mixed_audio
+
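Both tf.while_loop calls above use a condition that is always True, so maximum_iterations alone bounds how many noise or speech segments get mixed in, and the loop variable threads the partially mixed audio through each iteration. A standalone sketch of the idiom (assuming a TF 1.x runtime, as in this codebase; illustration only):

    import tensorflow as tf
    import tensorflow.compat.v1 as tfv1

    x = tf.zeros([4, 1])
    n = tfv1.random_uniform([], minval=1, maxval=3, dtype=tf.int32)  # like n_noise

    # The condition never terminates the loop on its own; `maximum_iterations`
    # bounds it instead, and `x` plays the role of the partially mixed audio.
    y = tf.while_loop(lambda _: True, lambda au: au + 1.0, [x], maximum_iterations=n)

    with tfv1.Session() as sess:
        print(sess.run(y))  # x incremented n times (here: once or twice)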
+def extract_noise(noise, audio_len, sample_rate=16000):
+    r"""Prepare a noise segment that can be mixed into the audio
+
+    Args:
+        noise: A 2-D Tensor with shape [`time-steps`, 1]
+        audio_len: A tf.int32 scalar, the audio length
+        sample_rate: An integer, specifying the audio sample rate used to determine the chunk size for the dbfs measurement.
+
+    Returns:
+        A 2-D Tensor with shape [`audio_len`, 1].
+        A float, the extracted noise mean dbfs
+        A float, the extracted noise max dbfs
+    """
+    noise_len = tf.shape(noise)[0]
     multiply = tf.math.floordiv(audio_len, noise_len) + 1
     noise_tile = tf.tile(noise, [multiply, 1])

-    # Now, noise_len must > audio_len
     noise_tile_len = tf.shape(noise_tile)[0]

     mix_decoded_start_point = tfv1.random_uniform([], minval=0, maxval=noise_tile_len-audio_len, dtype=tf.int32)
     mix_decoded_end_point = mix_decoded_start_point + audio_len
-    extract_noise = noise_tile[mix_decoded_start_point:mix_decoded_end_point, :]
-
-    extract_noise_mean_dbfs, extract_noise_max_dbfs = audio_to_dbfs(extract_noise, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
-
-    target_audio_dbfs = tfv1.random_uniform([], minval=min_audio_dbfs, maxval=max_audio_dbfs)
+    extracted_noise = noise_tile[mix_decoded_start_point:mix_decoded_end_point, :]
+    extracted_noise_mean_dbfs, extracted_noise_max_dbfs = audio_to_dbfs(extracted_noise, sample_rate, reduce_funcs=[tf.reduce_mean, tf.reduce_max])
+    return extracted_noise, extracted_noise_mean_dbfs, extracted_noise_max_dbfs

-    audio_gain_db = target_audio_dbfs - audio_mean_dbfs
+def mix(audio, audio_dbfs, noise, noise_mean_dbfs, noise_max_dbfs, min_noise_snr_db, max_noise_snr_db, limit_noise_peak_dbfs):
+    r"""Mix a gained `noise` into `audio`; the input audio length must equal the noise length

-    # limit audio peak
-    audio_gain_db = tf.minimum(limit_audio_peak_dbfs - audio_max_dbfs, audio_gain_db)
-    target_audio_dbfs = audio_mean_dbfs + audio_gain_db
-
-    audio_gain_ratio = tf.math.pow(10.0, audio_gain_db / 20.0)
+    Returns:
+        A 2-D Tensor with shape [`time-steps`, 1]. Has the same type and shape as `audio`.
+    """

     # target_snr_db := target_audio_dbfs - target_noise_dbfs
-    target_snr_db = tfv1.random_uniform([], minval=min_snr_db, maxval=max_snr_db)
+    target_snr_db = tfv1.random_uniform([], minval=min_noise_snr_db, maxval=max_noise_snr_db)

-    target_noise_dbfs = target_audio_dbfs - target_snr_db
-    noise_gain_db = target_noise_dbfs - extract_noise_mean_dbfs
+    target_noise_dbfs = audio_dbfs - target_snr_db
+    noise_gain_db = target_noise_dbfs - noise_mean_dbfs

     # limit noise peak
-    noise_gain_db = tf.minimum(limit_noise_peak_dbfs - extract_noise_max_dbfs, noise_gain_db)
+    noise_gain_db = tf.minimum(limit_noise_peak_dbfs - noise_max_dbfs, noise_gain_db)
     noise_gain_ratio = tf.math.pow(10.0, noise_gain_db / 20.0)

-    mixed_audio = tf.multiply(audio, audio_gain_ratio) + tf.multiply(extract_noise, noise_gain_ratio)
-
-    mixed_audio = tf.maximum(tf.minimum(mixed_audio, 1.0), -1.0)
-
-    return mixed_audio
+    audio += tf.multiply(noise, noise_gain_ratio)
+    return audio

-def gla(spectrogram):
-    r"""Use the Griffin-Lim algorithm to reconstruct audio; iterations are fixed at 10 so prefetch does not waste too much performance
+def gla(spectrogram, n_iter=10):
+    r"""Use the Griffin-Lim algorithm to reconstruct audio

     Args:
         spectrogram: A 3-D Tensor with shape [1, `time-steps`, `features`]. 
@@ -206,7 +255,7 @@ def reconstruct_phases(prev_phases):
     rands = tfv1.random_uniform(tf.shape(spectrogram), dtype=tf.float32)
     phases = tf.math.exp(tf.complex(0.0, 2.0 * np.pi * rands))

-    reconstructed_phases = tf.while_loop(lambda _: True, reconstruct_phases, [phases], maximum_iterations=10)
+    reconstructed_phases = tf.while_loop(lambda _: True, reconstruct_phases, [phases], maximum_iterations=n_iter)
     xi = tf.complex(abs_spectrogram, 0.0) * reconstructed_phases
     audio = tf.signal.inverse_stft(xi, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
     return tf.transpose(audio)
diff --git a/util/feeding.py b/util/feeding.py
index 24fd59f0ed..00ff40e316 100644
--- a/util/feeding.py
+++ b/util/feeding.py
@@ -68,28 +68,35 @@ def samples_to_mfccs(samples, sample_rate, train_phase=False):
             FLAGS.augmentation_freq_and_time_masking,
             FLAGS.augmentation_pitch_and_tempo_scaling,
             FLAGS.augmentation_speed_up_std > 0]):
-        review_audio = gla(spectrogram)
+        review_audio = gla(spectrogram, FLAGS.review_audio_gla_iterations)

     return mfccs, tf.shape(input=mfccs)[0], review_audio


-def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None):
+def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None, speech_iterator=None):
     samples = tf.io.read_file(wav_filename)
     decoded = contrib_audio.decode_wav(samples, desired_channels=1)
     audio = decoded.audio

     # augment audio
-    if noise_iterator:
-        noise = noise_iterator.get_next()
+    if noise_iterator or speech_iterator:
         audio = augment_noise(
             audio,
-            noise,
+            noise_iterator,
+            speech_iterator,
+            min_n_noises=FLAGS.audio_aug_min_n_noises,
+            max_n_noises=FLAGS.audio_aug_max_n_noises,
+            min_n_speakers=FLAGS.audio_aug_min_n_speakers,
+            max_n_speakers=FLAGS.audio_aug_max_n_speakers,
             min_audio_dbfs=FLAGS.audio_aug_min_audio_dbfs,
             max_audio_dbfs=FLAGS.audio_aug_max_audio_dbfs,
-            min_snr_db=FLAGS.audio_aug_min_snr_db,
-            max_snr_db=FLAGS.audio_aug_max_snr_db,
+            min_noise_snr_db=FLAGS.audio_aug_min_noise_snr_db,
+            max_noise_snr_db=FLAGS.audio_aug_max_noise_snr_db,
+            min_speech_snr_db=FLAGS.audio_aug_min_speech_snr_db,
+            max_speech_snr_db=FLAGS.audio_aug_max_speech_snr_db,
             limit_audio_peak_dbfs=FLAGS.audio_aug_limit_audio_peak_dbfs,
             limit_noise_peak_dbfs=FLAGS.audio_aug_limit_noise_peak_dbfs,
+            limit_speech_peak_dbfs=FLAGS.audio_aug_limit_speech_peak_dbfs,
             sample_rate=FLAGS.audio_sample_rate,
         )

@@ -106,9 +113,9 @@ def audiofile_to_features(wav_filename, train_phase=False, noise_iterator=None):
     return features, features_len, review_audio


-def entry_to_features(wav_filename, transcript, train_phase, noise_iterator):
+def entry_to_features(wav_filename, transcript, train_phase, noise_iterator, speech_iterator):
     # https://bugs.python.org/issue32117
-    features, features_len, review_audio = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator)
+    features, features_len, review_audio = audiofile_to_features(wav_filename, train_phase=train_phase, noise_iterator=noise_iterator, speech_iterator=speech_iterator)
     return wav_filename, features, features_len, tf.SparseTensor(*transcript), review_audio

@@ -121,7 +128,7 @@ def to_sparse_tuple(sequence):
     return indices, sequence, shape


-def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_sources=None):
+def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_sources=None, speech_sources=None):
     df = read_csvs(csvs)
     df.sort_values(by='wav_filesize', inplace=True)
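A hypothetical call that wires the new noise_sources/speech_sources arguments together (paths and values invented for illustration):

    train_set = create_dataset(FLAGS.train_files.split(','),
                               batch_size=FLAGS.train_batch_size,
                               train_phase=True,
                               noise_sources='/data/noise_wavs,/data/noise_index.csv',
                               speech_sources='/data/other_speakers.csv')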
@@ -156,12 +163,11 @@ def batch_fn(wav_filenames, features, features_len, transcripts, review_audios):

     num_gpus = len(Config.available_devices)

-    if noise_sources:
-        noise_iterator = create_noise_iterator(noise_sources, read_csvs)
-    else:
-        noise_iterator = None
+    noise_iterator = create_noise_iterator(noise_sources, read_csvs) if noise_sources else None
+    speech_iterator = create_noise_iterator(speech_sources, read_csvs) if speech_sources else None

-    process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator)
+
+    process_fn = partial(entry_to_features, train_phase=train_phase, noise_iterator=noise_iterator, speech_iterator=speech_iterator)

     dataset = tf.data.Dataset.from_generator(generate_values,
                                              output_types=(tf.string, (tf.int64, tf.int32, tf.int64)))
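Note that create_noise_iterator is called here but not defined anywhere in this series; it is assumed to expose the noise/speech sources as an endlessly repeating stream of decoded clips so that get_next() can be consumed once per training sample. A rough, hypothetical sketch of such a helper (the name, CSV column, and shuffle policy are assumptions, not the actual implementation):

    import tensorflow as tf
    from tensorflow.python.ops import gen_audio_ops as contrib_audio

    def create_noise_iterator_sketch(noise_sources, read_csvs_fn):
        # Hypothetical stand-in: decode each wav, reshuffle on every pass, repeat forever.
        wav_files = read_csvs_fn(noise_sources.split(','))['wav_filename'].tolist()

        def decode(wav_filename):
            samples = tf.io.read_file(wav_filename)
            return contrib_audio.decode_wav(samples, desired_channels=1).audio

        dataset = (tf.data.Dataset.from_tensor_slices(tf.constant(wav_files))
                   .shuffle(buffer_size=len(wav_files))
                   .map(decode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
                   .repeat())
        return tf.compat.v1.data.make_one_shot_iterator(dataset)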
diff --git a/util/flags.py b/util/flags.py
index 740894edd5..1b1b61c6e3 100644
--- a/util/flags.py
+++ b/util/flags.py
@@ -24,15 +24,26 @@ def create_flags():
     # Data Augmentation
     # ================

-    f.DEFINE_string('train_augmentation_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into train dataset, if empty, no mix will be run in train phase')
-    f.DEFINE_string('dev_augmentation_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into dev dataset, if empty, no mix will be run in dev phase')
-    f.DEFINE_string('test_augmentation_files', '', 'comma separated list of files or dirs, specifying the dataset used for mixing audio into test dataset, if empty, no mix will be run in test phase')
-    f.DEFINE_float('audio_aug_min_audio_dbfs', 0, 'min value of dbfs to specify the min volume of audio during gaining audio')
-    f.DEFINE_float('audio_aug_max_audio_dbfs', -35, 'max value of dbfs to specify the max volume of audio during gaining audio')
-    f.DEFINE_float('audio_aug_min_snr_db', 3, 'min value of db to specify the min signal-to-noise ratio during gaining audio and noise')
-    f.DEFINE_float('audio_aug_max_snr_db', 30, 'max value of db to specify the max signal-to-noise ratio during gaining audio and noise')
+    f.DEFINE_string('train_augmentation_noise_files', '', 'comma separated list of files or dirs, specifying the noise dataset used for mixing audio into train dataset, if empty, no mix will be run in train phase')
+    f.DEFINE_string('dev_augmentation_noise_files', '', 'comma separated list of files or dirs, specifying the noise dataset used for mixing audio into dev dataset, if empty, no mix will be run in dev phase')
+    f.DEFINE_string('test_augmentation_noise_files', '', 'comma separated list of files or dirs, specifying the noise dataset used for mixing audio into test dataset, if empty, no mix will be run in test phase')
+    f.DEFINE_string('train_augmentation_speech_files', '', 'comma separated list of files or dirs, specifying the speech dataset used for mixing audio into train dataset, if empty, no mix will be run in train phase')
+    f.DEFINE_string('dev_augmentation_speech_files', '', 'comma separated list of files or dirs, specifying the speech dataset used for mixing audio into dev dataset, if empty, no mix will be run in dev phase')
+    f.DEFINE_string('test_augmentation_speech_files', '', 'comma separated list of files or dirs, specifying the speech dataset used for mixing audio into test dataset, if empty, no mix will be run in test phase')
+    f.DEFINE_float('audio_aug_min_audio_dbfs', -35, 'min value of dbfs to specify the min volume of audio during gaining audio')
+    f.DEFINE_float('audio_aug_max_audio_dbfs', 0, 'max value of dbfs to specify the max volume of audio during gaining audio')
+    f.DEFINE_float('audio_aug_min_noise_snr_db', 3, 'min value of db to specify the min signal-to-noise ratio during gaining noise')
+    f.DEFINE_float('audio_aug_max_noise_snr_db', 30, 'max value of db to specify the max signal-to-noise ratio during gaining noise')
+    f.DEFINE_float('audio_aug_min_speech_snr_db', 10, 'min value of db to specify the min signal-to-noise ratio during gaining speech')
+    f.DEFINE_float('audio_aug_max_speech_snr_db', 50, 'max value of db to specify the max signal-to-noise ratio during gaining speech')
     f.DEFINE_float('audio_aug_limit_audio_peak_dbfs', 7.0, 'max value of dbfs to specify the limitation of max audio dbfs of chunks, the audio volume will not gain over than the specified value')
     f.DEFINE_float('audio_aug_limit_noise_peak_dbfs', 3.0, 'max value of dbfs to specify the limitation of max noise dbfs of chunks, the noise volume will not gain over than the specified value')
+    f.DEFINE_float('audio_aug_limit_speech_peak_dbfs', 3.0, 'max value of dbfs to specify the limitation of max speech dbfs of chunks, the speech volume will not gain over than the specified value')
+    f.DEFINE_integer('audio_aug_min_n_noises', 0, 'min number of the noises per audio mixing')
+    f.DEFINE_integer('audio_aug_max_n_noises', 1, 'max number of the noises per audio mixing')
+    f.DEFINE_integer('audio_aug_min_n_speakers', 0, 'min number of the speakers per audio mixing')
+    f.DEFINE_integer('audio_aug_max_n_speakers', 1, 'max number of the speakers per audio mixing')
+
     f.DEFINE_float('data_aug_features_additive', 0, 'std of the Gaussian additive noise')
     f.DEFINE_float('data_aug_features_multiplicative', 0, 'std of normal distribution around 1 for multiplicative noise')
@@ -52,7 +63,8 @@ def create_flags():
     f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_pitch', 1.2, 'max value of pitch scaling')
     f.DEFINE_float('augmentation_pitch_and_tempo_scaling_max_tempo', 1.2, 'max vlaue of tempo scaling')

-    f.DEFINE_integer('review_audio_steps', 0, 'number of audio, push the audio into summary directory (if 0, no audio will be dumped)')
+    f.DEFINE_integer('review_audio_steps', 0, 'number of audio, push the audio into summary directory (if 0, no audio will be dumped), one file per step is saved until the given count is reached')
+    f.DEFINE_integer('review_audio_gla_iterations', 10, 'number of iterations to reconstruct audio from features, using Griffin-Lim Algorithm')

     # Global Constants
     # ================
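Taken together, the new flags give every training clip a target loudness drawn from [audio_aug_min_audio_dbfs, audio_aug_max_audio_dbfs], then mix in up to audio_aug_max_n_noises noise clips and audio_aug_max_n_speakers competing-speech clips at SNRs drawn from the respective ranges. One illustrative parameter draw in plain Python, using the defaults above (the real pipeline samples these per clip inside the TF graph):

    import random

    draw = {
        'n_noises':      random.randint(0, 1),     # audio_aug_{min,max}_n_noises
        'n_speakers':    random.randint(0, 1),     # audio_aug_{min,max}_n_speakers
        'audio_dbfs':    random.uniform(-35, 0),   # audio_aug_{min,max}_audio_dbfs
        'noise_snr_db':  random.uniform(3, 30),    # audio_aug_{min,max}_noise_snr_db
        'speech_snr_db': random.uniform(10, 50),   # audio_aug_{min,max}_speech_snr_db
    }
    print(draw)

Since a higher SNR means quieter interference, the higher speech range keeps competing speakers further below the target speaker than background noise.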
From c089b7fdf100861d6b2d12bfa4153b98a1730121 Mon Sep 17 00:00:00 2001
From: Daniel
Date: Fri, 17 Apr 2020 20:35:15 +0200
Subject: [PATCH 23/25] Fix merge not detecting moved scripts.

---
 training/deepspeech_training/evaluate.py | 12 ++--
 training/deepspeech_training/train.py    | 61 ++++++++++++++-----
 .../util}/audio_augmentation.py           |  4 +-
 3 files changed, 57 insertions(+), 20 deletions(-)
 rename {util => training/deepspeech_training/util}/audio_augmentation.py (99%)

diff --git a/training/deepspeech_training/evaluate.py b/training/deepspeech_training/evaluate.py
index 5877b618ad..0d9f02c9e9 100755
--- a/training/deepspeech_training/evaluate.py
+++ b/training/deepspeech_training/evaluate.py
@@ -43,7 +43,7 @@ def sparse_tuple_to_texts(sp_tuple, alphabet):
     return [alphabet.decode(res) for res in results]


-def evaluate(test_csvs, create_model):
+def evaluate(test_csvs, create_model, noise_sources=None, speech_sources=None):
     if FLAGS.scorer_path:
         scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                         FLAGS.scorer_path, Config.alphabet)
@@ -51,13 +51,13 @@ def evaluate(test_csvs, create_model):
         scorer = None

     test_csvs = FLAGS.test_files.split(',')
-    test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False) for csv in test_csvs]
+    test_sets = [create_dataset([csv], batch_size=FLAGS.test_batch_size, train_phase=False, noise_sources=noise_sources, speech_sources=speech_sources) for csv in test_csvs]
     iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(test_sets[0]),
                                                  tfv1.data.get_output_shapes(test_sets[0]),
                                                  output_classes=tfv1.data.get_output_classes(test_sets[0]))
     test_init_ops = [iterator.make_initializer(test_set) for test_set in test_sets]

-    batch_wav_filename, (batch_x, batch_x_len), batch_y = iterator.get_next()
+    batch_wav_filename, (batch_x, batch_x_len), batch_y, _ = iterator.get_next()

     # One rate per layer
     no_dropout = [None] * 6
@@ -71,7 +71,8 @@ def evaluate(test_csvs, create_model):

     loss = tfv1.nn.ctc_loss(labels=batch_y,
                             inputs=logits,
-                            sequence_length=batch_x_len)
+                            sequence_length=batch_x_len,
+                            ignore_longer_outputs_than_inputs=True)

     tfv1.train.get_or_create_global_step()

@@ -106,6 +107,9 @@ def run_test(init_op, dataset):
                 session.run([batch_wav_filename, transposed, loss, batch_x_len, batch_y])
             except tf.errors.OutOfRangeError:
                 break
+            except tf.errors.InvalidArgumentError as e:
+                print("Ignoring error:", e)
+                continue

         decoded = ctc_beam_search_decoder_batch(batch_logits, batch_lengths, Config.alphabet, FLAGS.beam_width,
                                                 num_processes=num_processes, scorer=scorer,
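The ignore_longer_outputs_than_inputs=True argument (and the matching InvalidArgumentError guard above) is there because ctc_loss otherwise aborts on any sample whose transcript cannot be aligned to the available output frames. A small sketch of the constraint being relaxed:

    # CTC needs at least one output frame per label, plus one extra frame for
    # every directly repeated character (a blank must separate the repeats).
    def transcript_fits(num_feature_frames, transcript):
        repeats = sum(1 for a, b in zip(transcript, transcript[1:]) if a == b)
        return num_feature_frames >= len(transcript) + repeats

    assert transcript_fits(6, 'hello')      # 'll' needs a separating blank: 5 + 1 frames
    assert not transcript_fits(5, 'hello')  # too few frames, ctc_loss would raise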
diff --git a/training/deepspeech_training/train.py b/training/deepspeech_training/train.py
index eed8cd9eb2..600dff3560 100644
--- a/training/deepspeech_training/train.py
+++ b/training/deepspeech_training/train.py
@@ -228,7 +228,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse):
         the decoded result and the batch's original Y.
     '''
     # Obtain the next batch of data
-    batch_filenames, (batch_x, batch_seq_len), batch_y = iterator.get_next()
+    batch_filenames, (batch_x, batch_seq_len), batch_y, review_audio = iterator.get_next()

     if FLAGS.train_cudnn:
         rnn_impl = rnn_impl_cudnn_rnn
@@ -239,7 +239,10 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse):
     logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl)

     # Compute the CTC loss using TensorFlow's `ctc_loss`
-    total_loss = tfv1.nn.ctc_loss(labels=batch_y, inputs=logits, sequence_length=batch_seq_len)
+    total_loss = tfv1.nn.ctc_loss(labels=batch_y,
+                                  inputs=logits,
+                                  sequence_length=batch_seq_len,
+                                  ignore_longer_outputs_than_inputs=True)

     # Check if any files lead to non finite loss
     non_finite_files = tf.gather(batch_filenames, tfv1.where(~tf.math.is_finite(total_loss)))
@@ -248,7 +251,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse):
     avg_loss = tf.reduce_mean(input_tensor=total_loss)

     # Finally we return the average loss
-    return avg_loss, non_finite_files
+    return avg_loss, non_finite_files, review_audio


 # Adam Optimization
@@ -309,7 +312,7 @@ def get_tower_results(iterator, optimizer, dropout_rates):
             with tf.name_scope('tower_%d' % i):
                 # Calculate the avg_loss and mean_edit_distance and retrieve the decoded
                 # batch along with the original batch's labels (Y) of this tower
-                avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0)
+                avg_loss, non_finite_files, review_audio = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0)

                 # Allow for variables to be re-used by the next tower
                 tfv1.get_variable_scope().reuse_variables()
@@ -326,6 +329,8 @@ def get_tower_results(iterator, optimizer, dropout_rates):
             tower_non_finite_files.append(non_finite_files)

     avg_loss_across_towers = tf.reduce_mean(input_tensor=tower_avg_losses, axis=0)
+    if FLAGS.review_audio_steps:
+        tfv1.summary.audio(name='step_audio', tensor=review_audio, sample_rate=FLAGS.audio_sample_rate, collections=['step_audio_summaries'])
     tfv1.summary.scalar(name='step_loss', tensor=avg_loss_across_towers, collections=['step_summaries'])

     all_non_finite_files = tf.concat(tower_non_finite_files, axis=0)
@@ -415,7 +420,9 @@ def train():
             FLAGS.augmentation_freq_and_time_masking or
             FLAGS.augmentation_pitch_and_tempo_scaling or
             FLAGS.augmentation_speed_up_std > 0 or
-            FLAGS.augmentation_sparse_warp):
+            FLAGS.augmentation_sparse_warp or
+            FLAGS.train_augmentation_noise_files or
+            FLAGS.train_augmentation_speech_files):
         do_cache_dataset = False

     exception_box = ExceptionBox()
@@ -428,7 +435,9 @@ def train():
                                train_phase=True,
                                exception_box=exception_box,
                                process_ahead=len(Config.available_devices) * FLAGS.train_batch_size * 2,
-                               buffering=FLAGS.read_buffer)
+                               buffering=FLAGS.read_buffer,
+                               noise_sources=FLAGS.train_augmentation_noise_files,
+                               speech_sources=FLAGS.train_augmentation_speech_files)

     iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set),
                                                  tfv1.data.get_output_shapes(train_set),
@@ -444,7 +453,9 @@ def train():
                                  train_phase=False,
                                  exception_box=exception_box,
                                  process_ahead=len(Config.available_devices) * FLAGS.dev_batch_size * 2,
-                                 buffering=FLAGS.read_buffer) for source in dev_sources]
+                                 buffering=FLAGS.read_buffer,
+                                 noise_sources=FLAGS.dev_augmentation_noise_files,
+                                 speech_sources=FLAGS.dev_augmentation_speech_files) for source in dev_sources]
         dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets]

     # Dropout
@@ -482,6 +493,7 @@ def train():
         apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step)

     # Summaries
+    step_audio_summaries_op = tfv1.summary.merge_all('step_audio_summaries')
     step_summaries_op = tfv1.summary.merge_all('step_summaries')
     step_summary_writers = {
         'train': tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120),
@@ -541,17 +553,29 @@ def __call__(self, progress, data, **kwargs):
             session.run(init_op)

             # Batch loop
+
+            audio_summary_steps = 0
             while True:
                 try:
-                    _, current_step, batch_loss, problem_files, step_summary = \
-                        session.run([train_op, global_step, loss, non_finite_files, step_summaries_op],
-                                    feed_dict=feed_dict)
+                    step_audio_summary = None
+                    if audio_summary_steps < FLAGS.review_audio_steps and epoch == 0:
+                        _, current_step, batch_loss, problem_files, step_summary, step_audio_summary = \
+                            session.run([train_op, global_step, loss, non_finite_files, step_summaries_op, step_audio_summaries_op],
+                                        feed_dict=feed_dict)
+                        audio_summary_steps += 1
+                    else:
+                        _, current_step, batch_loss, problem_files, step_summary = \
+                            session.run([train_op, global_step, loss, non_finite_files, step_summaries_op],
+                                        feed_dict=feed_dict)
+
                     exception_box.raise_if_set()
                 except tf.errors.InvalidArgumentError as err:
                     if FLAGS.augmentation_sparse_warp:
                         log_info("Ignoring sparse warp error: {}".format(err))
                         continue
-                    raise
+                    else:
+                        print("Ignoring error:", err)
+                        continue
                 except tf.errors.OutOfRangeError:
                     exception_box.raise_if_set()
                     break
@@ -561,11 +585,20 @@ def __call__(self, progress, data, **kwargs):
                     log_error('The following files caused an infinite (or NaN) '
                               'loss: {}'.format(','.join(problem_files)))

+                    # Save invalid files
+                    sys.path.append("/DeepSpeech/deepspeech-german/training/")
+                    from filter_invalid_files import add_files_to_excluded
+                    add_files_to_excluded(problem_files)
+                    sys.exit(1)
+
                 total_loss += batch_loss
                 step_count += 1

                 pbar.update(step_count)

+                if step_audio_summary is not None:
+                    step_summary_writer.add_summary(step_audio_summary, current_step)
+
                 step_summary_writer.add_summary(step_summary, current_step)

                 if is_train and FLAGS.checkpoint_secs > 0 and time.time() - checkpoint_time > FLAGS.checkpoint_secs:
@@ -639,7 +672,7 @@ def __call__(self, progress, data, **kwargs):


 def test():
-    samples = evaluate(FLAGS.test_files.split(','), create_model)
+    samples = evaluate(FLAGS.test_files.split(','), create_model, noise_sources=FLAGS.test_augmentation_noise_files, speech_sources=FLAGS.test_augmentation_speech_files)
     if FLAGS.test_output_file:
         # Save decoded tuples as JSON, converting NumPy floats to Python floats
         json.dump(samples, open(FLAGS.test_output_file, 'w'), default=float)
@@ -651,7 +684,7 @@ def create_inference_graph(batch_size=1, n_steps=16, tflite=False):
     # Create feature computation graph
     input_samples = tfv1.placeholder(tf.float32, [Config.audio_window_samples], 'input_samples')
     samples = tf.expand_dims(input_samples, -1)
-    mfccs, _ = samples_to_mfccs(samples, FLAGS.audio_sample_rate)
+    mfccs, _, _ = samples_to_mfccs(samples, FLAGS.audio_sample_rate)
     mfccs = tf.identity(mfccs, name='mfccs')

     # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input]
@@ -851,7 +884,7 @@ def do_single_file_inference(input_file_path):
         # Restore variables from training checkpoint
         load_graph_for_evaluation(session)

-        features, features_len = audiofile_to_features(input_file_path)
+        features, features_len, _ = audiofile_to_features(input_file_path)

         previous_state_c = np.zeros([1, Config.n_cell_dim])
         previous_state_h = np.zeros([1, Config.n_cell_dim])
diff --git a/util/audio_augmentation.py b/training/deepspeech_training/util/audio_augmentation.py
similarity index 99%
rename from util/audio_augmentation.py
rename to training/deepspeech_training/util/audio_augmentation.py
index 6a98f887f4..55b8957178 100644
--- a/util/audio_augmentation.py
+++ b/training/deepspeech_training/util/audio_augmentation.py
@@ -5,8 +5,8 @@
 import numpy as np
 from tensorflow.python.ops import gen_audio_ops as contrib_audio
 import os
-from util.logging import log_info
-from util.config import Config
+from .logging import log_info
+from .config import Config

 DBFS_COEF = 10.0 / np.log(10.0)
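The review-audio path exists so augmented inputs can be audited by ear in TensorBoard: for up to review_audio_steps batches of the first epoch, gla() reconstructs a waveform from the augmented spectrogram and writes it as an audio summary. In NumPy terms, the Griffin-Lim loop that gla() runs in-graph amounts to the following sketch (stft and istft are assumed to be a mutually consistent transform pair supplied by the caller):

    import numpy as np

    def griffin_lim_sketch(magnitudes, stft, istft, n_iter=10):
        # Start from random phases and keep the known magnitudes fixed.
        phases = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
        for _ in range(n_iter):
            audio = istft(magnitudes * phases)           # back to the time domain
            phases = np.exp(1j * np.angle(stft(audio)))  # keep only the new phase estimate
        return istft(magnitudes * phases)

More iterations trade prefetch throughput for cleaner-sounding review audio, which is exactly the trade-off the review_audio_gla_iterations flag exposes.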
From 491a4b06f9393b338346b6dd58c7dccef6cca4b2 Mon Sep 17 00:00:00 2001
From: Daniel
Date: Fri, 17 Apr 2020 20:50:11 +0200
Subject: [PATCH 24/25] Undo personal changes.

---
 training/deepspeech_training/evaluate.py |  6 +-----
 training/deepspeech_training/train.py    | 13 ++-----------
 2 files changed, 3 insertions(+), 16 deletions(-)

diff --git a/training/deepspeech_training/evaluate.py b/training/deepspeech_training/evaluate.py
index 0d9f02c9e9..10043213b4 100755
--- a/training/deepspeech_training/evaluate.py
+++ b/training/deepspeech_training/evaluate.py
@@ -71,8 +71,7 @@ def evaluate(test_csvs, create_model, noise_sources=None, speech_sources=None):

     loss = tfv1.nn.ctc_loss(labels=batch_y,
                             inputs=logits,
-                            sequence_length=batch_x_len,
-                            ignore_longer_outputs_than_inputs=True)
+                            sequence_length=batch_x_len)

     tfv1.train.get_or_create_global_step()

@@ -107,9 +106,6 @@ def run_test(init_op, dataset):
                 session.run([batch_wav_filename, transposed, loss, batch_x_len, batch_y])
             except tf.errors.OutOfRangeError:
                 break
-            except tf.errors.InvalidArgumentError as e:
-                print("Ignoring error:", e)
-                continue

         decoded = ctc_beam_search_decoder_batch(batch_logits, batch_lengths, Config.alphabet, FLAGS.beam_width,
                                                 num_processes=num_processes, scorer=scorer,
diff --git a/training/deepspeech_training/train.py b/training/deepspeech_training/train.py
index 600dff3560..dc48cef3f7 100644
--- a/training/deepspeech_training/train.py
+++ b/training/deepspeech_training/train.py
@@ -241,8 +241,7 @@ def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse):
     # Compute the CTC loss using TensorFlow's `ctc_loss`
     total_loss = tfv1.nn.ctc_loss(labels=batch_y,
                                   inputs=logits,
-                                  sequence_length=batch_seq_len,
-                                  ignore_longer_outputs_than_inputs=True)
+                                  sequence_length=batch_seq_len)

     # Check if any files lead to non finite loss
     non_finite_files = tf.gather(batch_filenames, tfv1.where(~tf.math.is_finite(total_loss)))
@@ -573,9 +572,7 @@ def __call__(self, progress, data, **kwargs):
                     if FLAGS.augmentation_sparse_warp:
                         log_info("Ignoring sparse warp error: {}".format(err))
                         continue
-                    else:
-                        print("Ignoring error:", err)
-                        continue
+                    raise
                 except tf.errors.OutOfRangeError:
                     exception_box.raise_if_set()
                     break
@@ -585,12 +582,6 @@ def __call__(self, progress, data, **kwargs):
                     log_error('The following files caused an infinite (or NaN) '
                               'loss: {}'.format(','.join(problem_files)))

-                    # Save invalid files
-                    sys.path.append("/DeepSpeech/deepspeech-german/training/")
-                    from filter_invalid_files import add_files_to_excluded
-                    add_files_to_excluded(problem_files)
-                    sys.exit(1)
-
                 total_loss += batch_loss
                 step_count += 1

From 2fa91e8c871c99f3f1950aa2801f9c39b7b93add Mon Sep 17 00:00:00 2001
From: Yi-Hua Chiu
Date: Tue, 12 May 2020 16:56:53 +0800
Subject: [PATCH 25/25] To recover the incorrect merge

Revert "Merge branch 'no-sort' into more-augment-options"

This reverts commit 77922262464c5c0fb9d2c5c90e9046f59769bb9d, reversing
changes made to f7d1279d2c4bd56d782db083a9d3d355f4239424.
---
 requirements.txt       |  2 +-
 util/evaluate_tools.py |  7 ++++++-
 util/feeding.py        |  2 +-
 util/text.py           | 19 ++++++++++++-------
 4 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index e15e2a9f5b..d399ac4e8f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Main training requirements
-
+tensorflow == 1.14.0
 numpy == 1.15.4
 progressbar2
 pandas
diff --git a/util/evaluate_tools.py b/util/evaluate_tools.py
index 59fb542be2..7f6a8ffb78 100644
--- a/util/evaluate_tools.py
+++ b/util/evaluate_tools.py
@@ -66,7 +66,12 @@ def calculate_report(wav_filenames, labels, decodings, losses):
     samples_wer, samples_cer = wer_cer_batch(samples)

     # Order the remaining items by their loss (lowest loss on top)
+    samples.sort(key=lambda s: s.loss)
-
+    # Then order by descending WER/CER
+    if FLAGS.utf8:
+        samples.sort(key=lambda s: s.cer, reverse=True)
+    else:
+        samples.sort(key=lambda s: s.wer, reverse=True)

     return samples_wer, samples_cer, samples
diff --git a/util/feeding.py b/util/feeding.py
index 76d8953d97..00ff40e316 100644
--- a/util/feeding.py
+++ b/util/feeding.py
@@ -130,7 +130,7 @@ def to_sparse_tuple(sequence):

 def create_dataset(csvs, batch_size, enable_cache=False, cache_path=None, train_phase=False, noise_sources=None, speech_sources=None):
     df = read_csvs(csvs)
-    #df.sort_values(by='wav_filesize', inplace=True)
+    df.sort_values(by='wav_filesize', inplace=True)

     df['transcript'] = df.apply(text_to_char_array, alphabet=Config.alphabet, result_type='reduce', axis=1)

diff --git a/util/text.py b/util/text.py
index c2d450c3a9..d3be7eb88a 100644
--- a/util/text.py
+++ b/util/text.py
@@ -14,13 +14,18 @@ def __init__(self, config_file):
         self._label_to_str = {}
         self._str_to_label = {}
         self._size = 0
-        with codecs.open(config_file, 'r', 'utf-8') as fin:
-            for line in fin:
-                self._label_to_str += line[:-1] # remove the line ending
-                self._str_to_label[line[:-1]] = self._size
-                self._size += 1
-
-    def string_from_label(self, label):
+        if config_file:
+            with codecs.open(config_file, 'r', 'utf-8') as fin:
+                for line in fin:
+                    if line[0:2] == '\\#':
+                        line = '#\n'
+                    elif line[0] == '#':
+                        continue
+                    self._label_to_str[self._size] = line[:-1] # remove the line ending
+                    self._str_to_label[line[:-1]] = self._size
+                    self._size += 1
+
+    def _string_from_label(self, label):
         return self._label_to_str[label]

     def _label_from_string(self, string):
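The restored Alphabet parsing treats lines beginning with '#' as comments and accepts the escaped line '\#' for a literal '#' label. A self-contained illustration of that convention (the sample lines are made up):

    # Mirrors the restored parsing loop above for an in-memory alphabet file.
    lines = ['# this is a comment\n', 'a\n', 'b\n', '\\#\n']
    labels = []
    for line in lines:
        if line[0:2] == '\\#':
            line = '#\n'   # escaped literal '#'
        elif line[0] == '#':
            continue       # comment line, skipped
        labels.append(line[:-1])  # strip the line ending

    assert labels == ['a', 'b', '#']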