# #################################
# Basic training parameters for speaker-id. We first have a network that
# computes embeddings. On top of that, we employ a classifier.
#
# #################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
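# hyperpyyaml runs the torch.manual_seed call above while this file is being
# loaded, so the seed takes effect before any parameters are initialized.
# Assuming the usual SpeechBrain argument parsing in train.py, hyperparameters
# like this can be overridden from the command line, e.g.:
#   python train.py train.yaml --seed=2021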
# If you plan to train a system on an HPC cluster with a big dataset,
# we strongly suggest doing the following:
# 1- Compress the dataset in a single tar or zip file.
# 2- Copy your dataset locally (i.e., the local disk of the computing node).
# 3- Uncompress the dataset in the local folder.
# 4- Set data_folder with the local path.
# Reading data from the local disk of the compute node (e.g. $SLURM_TMPDIR with SLURM-based clusters) is very important.
# It allows you to read the data much faster without slowing down the shared filesystem.
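# For example, a SLURM job script might stage the data like this before
# launching training (a sketch; the archive name and paths are placeholders
# to adapt to your setup):
#   cp /shared/datasets/my_dataset.tar $SLURM_TMPDIR/
#   tar -xf $SLURM_TMPDIR/my_dataset.tar -C $SLURM_TMPDIR/
#   python train.py train.yaml --data_folder=$SLURM_TMPDIR/my_dataset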
data_folder: ./data
output_folder: !ref ./results/speaker_id/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
# Path where data manifest files will be stored
# The data manifest files are created by the data preparation script.
train_annotation: train.json
valid_annotation: valid.json
test_annotation: test.json
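# A rough sketch of one entry in these JSON manifests (the exact field names
# come from the data preparation script and may differ in other recipes):
#   "some_utterance_id": {
#       "wav": "{data_root}/path/to/some_utterance.flac",
#       "length": 3.0,
#       "spk_id": "spk_001"
#   }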
# The train logger writes training statistics to a file, as well as stdout.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch
ckpt_interval_minutes: 15 # save checkpoint every N min
# Feature parameters
n_mels: 23
# Training Parameters
sample_rate: 16000
number_of_epochs: 35
batch_size: 32
lr_start: 0.001
lr_final: 0.0001
n_classes: 250 # In this case, we have 250 speakers
emb_dim: 512 # dimensionality of the embeddings
dataloader_options:
    batch_size: !ref <batch_size>
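    # These options are forwarded to the PyTorch DataLoader, so other standard
    # DataLoader arguments could be added here too, e.g. (optional):
    # num_workers: 4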
# Added noise and reverb come from OpenRIR dataset, automatically
# downloaded and prepared with this Environmental Corruption class.
env_corrupt: !new:speechbrain.lobes.augment.EnvCorrupt
    openrir_folder: !ref <data_folder>
    babble_prob: 0.0
    reverb_prob: 0.0
    noise_prob: 1.0
    noise_snr_low: 0
    noise_snr_high: 15
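# With the probabilities above, only additive noise is applied; reverberation
# and babble are disabled. For each example, the noise SNR is drawn uniformly
# between noise_snr_low and noise_snr_high (0-15 dB here).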
# Adds speed change + time and frequency dropouts (time-domain implementation).
# A small speed change also helps to improve the performance of speaker-id.
augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
    sample_rate: !ref <sample_rate>
    speeds: [95, 100, 105]
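# The speeds are percentages of the original rate: 95 slows the audio down by
# 5%, 100 leaves it unchanged, and 105 speeds it up by 5%.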
# Feature extraction
compute_features: !new:speechbrain.lobes.features.Fbank
    n_mels: !ref <n_mels>
# Mean and std normalization of the input features
mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False
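# norm_type "sentence" normalizes each utterance with its own statistics;
# std_norm: False means only the mean is subtracted (no variance scaling).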
# To design a custom model, either just edit the simple CustomModel
# class that's listed here, or replace this `!new` call with a line
# pointing to a different file you've defined (a commented sketch of the
# latter follows this block).
embedding_model: !new:custom_model.Xvector
    in_channels: !ref <n_mels>
    activation: !name:torch.nn.LeakyReLU
    tdnn_blocks: 5
    tdnn_channels: [512, 512, 512, 512, 1500]
    tdnn_kernel_sizes: [5, 3, 3, 1, 1]
    tdnn_dilations: [1, 2, 3, 1, 1]
    lin_neurons: !ref <emb_dim>
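# A hypothetical sketch of that second option: `my_model.MyEmbedder` is a
# made-up module/class, not part of this template. Any class whose constructor
# accepts these arguments could be dropped in the same way:
# embedding_model: !new:my_model.MyEmbedder
#     in_channels: !ref <n_mels>
#     lin_neurons: !ref <emb_dim>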
classifier: !new:custom_model.Classifier
    input_shape: [null, null, !ref <emb_dim>]
    activation: !name:torch.nn.LeakyReLU
    lin_blocks: 1
    lin_neurons: !ref <emb_dim>
    out_neurons: !ref <n_classes>
# The first object passed to the Brain class is this "Epoch Counter"
# which is saved by the Checkpointer so that training can be resumed
# if it gets interrupted at any point.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>
# Objects in "modules" dict will have their parameters moved to the correct
# device, as well as having train()/eval() called on them by the Brain class.
modules:
    compute_features: !ref <compute_features>
    env_corrupt: !ref <env_corrupt>
    augmentation: !ref <augmentation>
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    mean_var_norm: !ref <mean_var_norm>
# This optimizer will be constructed by the Brain class after all parameters
# are moved to the correct device. Then it will be added to the checkpointer.
opt_class: !name:torch.optim.Adam
    lr: !ref <lr_start>
# This function manages learning rate annealing over the epochs.
# We here use the simple lr annealing method that linearly decreases
# the lr from the initial value to the final one.
lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
    initial_value: !ref <lr_start>
    final_value: !ref <lr_final>
    epoch_count: !ref <number_of_epochs>
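# With the values above, the lr goes from 0.001 down to 0.0001 over 35 epochs,
# i.e. a decrease of roughly (0.001 - 0.0001) / 35 ~= 2.6e-5 per epoch (the
# exact step depends on how the scheduler counts the endpoints).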
# This object is used for saving the state of training both so that it
# can be resumed if it gets interrupted, and also so that the best checkpoint
# can be later loaded for evaluation or inference.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        embedding_model: !ref <embedding_model>
        classifier: !ref <classifier>
        normalizer: !ref <mean_var_norm>
        counter: !ref <epoch_counter>
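# Since the checkpointer stores its state under <save_folder>, re-launching
# the same training script with the same output_folder should resume from the
# most recent checkpoint automatically (the Brain class attempts recovery at
# the start of fitting when a checkpointer is present).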