fix example channel dim (#123)

keunwoochoi · Mar 25, 2021 · ff6fe77 · ff6fe77
1 parent 7643354
commit ff6fe77
Showing 1 changed file with 46 additions and 9 deletions.
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
@@ -45,13 +45,14 @@ Use STFT Magnitude
 
 .. code-block:: python
 
+    import tensorflow as tf
     from tensorflow.keras.models import Sequential
     from kapre import STFT, Magnitude, MagnitudeToDecibel
 
     sampling_rate = 16000  # sampling rate of your input audio
     duration = 20.0  # duration of the audio
     num_channel = 2  # number of channels of the audio
-    input_shape = (num_channel, int(sampling_rate * duration))  # let's follow `channels_last` convention even for audio
+    input_shape = (int(sampling_rate * duration), num_channel)  # let's follow `channels_last` convention
 
     model = Sequential()
     model.add(STFT(n_fft=2048, win_length=2018, hop_length=1024,
@@ -66,17 +67,50 @@ Use STFT Magnitude
     _________________________________________________________________
     Layer (type)                 Output Shape              Param #
     =================================================================
-    stft (STFT)                  (None, 0, 1025, 320000)   0
+    stft (STFT)                  (None, 311, 1025, 2)      0
     _________________________________________________________________
-    magnitude (Magnitude)        (None, 0, 1025, 320000)   0
+    magnitude (Magnitude)        (None, 311, 1025, 2)      0
     _________________________________________________________________
-    magnitude_to_decibel (Magnit (None, 0, 1025, 320000)   0
+    magnitude_to_decibel (Magnit (None, 311, 1025, 2)      0
     =================================================================
     Total params: 0
     Trainable params: 0
     Non-trainable params: 0
     _________________________________________________________________
     """
+    # A 20-second stereo audio signal is converted to a (311, 1025, 2) tensor.
+
+    # Now you can add your own model. For example, let's add a ResNet50
+    # with global average pooling and no pre-trained weights,
+    # set up for 10-class classification.
+
+    model.add(
+        tf.keras.applications.ResNet50(
+            include_top=True, weights=None, input_tensor=None,
+            input_shape=(311, 1025, 2), pooling='avg', classes=10
+        )
+    )
+
+    model.summary()
+    """
+    Model: "sequential"
+    _________________________________________________________________
+    Layer (type)                 Output Shape              Param #
+    =================================================================
+    stft (STFT)                  (None, 311, 1025, 2)      0
+    _________________________________________________________________
+    magnitude (Magnitude)        (None, 311, 1025, 2)      0
+    _________________________________________________________________
+    magnitude_to_decibel (Magnit (None, 311, 1025, 2)      0
+    _________________________________________________________________
+    resnet50 (Functional)        (None, 10)                23605066
+    =================================================================
+    Total params: 23,605,066
+    Trainable params: 23,551,946
+    Non-trainable params: 53,120
+    _________________________________________________________________
+    """
+
 
 
 Use STFT Magnitude -- a lazy version
@@ -90,7 +124,7 @@ Use STFT Magnitude -- a lazy version
     sampling_rate = 16000  # sampling rate of your input audio
     duration = 20.0  # duration of the audio
     num_channel = 2  # number of channels of the audio
-    input_shape = (num_channel, int(sampling_rate * duration))  # let's follow `channels_last` convention even for audio
+    input_shape = (int(sampling_rate * duration), num_channel)  # let's follow `channels_last` convention
 
     model = Sequential(get_stft_magnitude_layer(input_shape=input_shape, return_decibel=True))
 
@@ -100,25 +134,28 @@ Use STFT Magnitude -- a lazy version
     _________________________________________________________________
     Layer (type)                 Output Shape              Param #
     =================================================================
-    stft_magnitude (Sequential)  (None, 0, 1025, 320000)   0
+    stft_magnitude (Sequential)  (None, 622, 1025, 2)      0
     =================================================================
     Total params: 0
     Trainable params: 0
     Non-trainable params: 0
     _________________________________________________________________
     """
+    # Here, a 20-second stereo audio signal is converted to a (622, 1025, 2) tensor.
+    # That is twice as many temporal frames as in the example above, because we didn't set
+    # hop_length here, which means a 25% hop length is used instead of the 50% used above.
 
     model.layers[0].summary()  # let's deep dive one level
     """
     Model: "stft_magnitude"
     _________________________________________________________________
     Layer (type)                 Output Shape              Param #
     =================================================================
-    stft (STFT)                  (None, 0, 1025, 320000)   0
+    stft (STFT)                  (None, 622, 1025, 2)      0
     _________________________________________________________________
-    magnitude (Magnitude)        (None, 0, 1025, 320000)   0
+    magnitude (Magnitude)        (None, 622, 1025, 2)      0
     _________________________________________________________________
-    magnitude_to_decibel (Magnit (None, 0, 1025, 320000)   0
+    magnitude_to_decibel (Magnit (None, 622, 1025, 2)      0
     =================================================================
     Total params: 0
     Trainable params: 0