diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 4490c01..e6925df 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -45,13 +45,14 @@ Use STFT Magnitude .. code-block:: python + import tensorflow as tf from tensorflow.keras.models import Sequential from kapre import STFT, Magnitude, MagnitudeToDecibel sampling_rate = 16000 # sampling rate of your input audio duration = 20.0 # duration of the audio num_channel = 2 # number of channels of the audio - input_shape = (num_channel, int(sampling_rate * duration)) # let's follow `channels_last` convention even for audio + input_shape = (int(sampling_rate * duration), num_channel) # let's follow `channels_last` convention model = Sequential() model.add(STFT(n_fft=2048, win_length=2018, hop_length=1024, @@ -66,17 +67,50 @@ Use STFT Magnitude _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= - stft (STFT) (None, 0, 1025, 320000) 0 + stft (STFT) (None, 311, 1025, 2) 0 _________________________________________________________________ - magnitude (Magnitude) (None, 0, 1025, 320000) 0 + magnitude (Magnitude) (None, 311, 1025, 2) 0 _________________________________________________________________ - magnitude_to_decibel (Magnit (None, 0, 1025, 320000) 0 + magnitude_to_decibel (Magnit (None, 311, 1025, 2) 0 ================================================================= Total params: 0 Trainable params: 0 Non-trainable params: 0 _________________________________________________________________ """ + # A 20-second stereo audio signal is converted to a (311, 1025, 2) tensor. + + # Now, you can add your own model. For example, let's add ResNet50 + # with global average pooling, no pre-trained weights, + # and for a 10-class classification. 
+ + model.add( + tf.keras.applications.ResNet50( + include_top=True, weights=None, input_tensor=None, + input_shape=(311, 1025, 2), pooling='avg', classes=10 + ) + ) + + model.summary() + """ + Model: "sequential" + _________________________________________________________________ + Layer (type) Output Shape Param # + ================================================================= + stft (STFT) (None, 311, 1025, 2) 0 + _________________________________________________________________ + magnitude (Magnitude) (None, 311, 1025, 2) 0 + _________________________________________________________________ + magnitude_to_decibel (Magnit (None, 311, 1025, 2) 0 + _________________________________________________________________ + resnet50 (Functional) (None, 10) 23605066 + ================================================================= + Total params: 23,605,066 + Trainable params: 23,551,946 + Non-trainable params: 53,120 + _________________________________________________________________ + """ + Use STFT Magnitude -- a lazy version @@ -90,7 +124,7 @@ Use STFT Magnitude -- a lazy version sampling_rate = 16000 # sampling rate of your input audio duration = 20.0 # duration of the audio num_channel = 2 # number of channels of the audio - input_shape = (num_channel, int(sampling_rate * duration)) # let's follow `channels_last` convention even for audio + input_shape = (int(sampling_rate * duration), num_channel) # let's follow `channels_last` convention model = Sequential(get_stft_magnitude_layer(input_shape=input_shape, return_decibel=True)) @@ -100,13 +134,16 @@ Use STFT Magnitude -- a lazy version _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= - stft_magnitude (Sequential) (None, 0, 1025, 320000) 0 + stft_magnitude (Sequential) (None, 622, 1025, 2) 0 ================================================================= Total params: 0 Trainable params: 0 
Non-trainable params: 0 _________________________________________________________________ """ + # Here, a 20-second stereo audio signal is converted to a (622, 1025, 2) tensor. + # That is twice as many temporal frames as in the example above because we didn't set hop_length here, + # which means it's set to a 25% hop length, not 50% as above. model.layers[0].summary() # let's deep dive one level """ @@ -114,11 +151,11 @@ Use STFT Magnitude -- a lazy version _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= - stft (STFT) (None, 0, 1025, 320000) 0 + stft (STFT) (None, 622, 1025, 2) 0 _________________________________________________________________ - magnitude (Magnitude) (None, 0, 1025, 320000) 0 + magnitude (Magnitude) (None, 622, 1025, 2) 0 _________________________________________________________________ - magnitude_to_decibel (Magnit (None, 0, 1025, 320000) 0 + magnitude_to_decibel (Magnit (None, 622, 1025, 2) 0 ================================================================= Total params: 0 Trainable params: 0