From 73cb21527c80fdbc8d38c8cc6e5718e7ed597691 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Tue, 12 Oct 2021 08:54:55 +0000
Subject: [PATCH 1/3] refactor of examples

---
 README.md | 10 +-
 examples/GANVocoder/README.md | 1 +
 .../GANVocoder/normalize.py | 0
 .../parallelwave_gan/baker/README.md | 22 +-
 .../parallelwave_gan/baker/conf/default.yaml | 5 +-
 .../parallelwave_gan/baker/preprocess.sh | 16 +-
 .../parallelwave_gan/baker/run.sh | 4 +-
 .../parallelwave_gan/baker/synthesize.sh | 4 +-
 .../baker/synthesize_from_wav.py | 4 +-
 .../parallelwave_gan/ljspeech/README.md | 22 +-
 .../ljspeech/conf/default.yaml | 4 -
 .../parallelwave_gan/ljspeech/preprocess.sh | 16 +-
 .../parallelwave_gan/ljspeech/run.sh | 4 +-
 .../parallelwave_gan/ljspeech/synthesize.sh | 4 +-
 .../GANVocoder/parallelwave_gan/synthesize.py | 2 +-
 .../GANVocoder/parallelwave_gan/train.py | 8 +-
 .../GANVocoder/preprocess.py | 8 -
 examples/fastspeech2/aishell3/README.md | 19 +-
 examples/fastspeech2/aishell3/preprocess.sh | 15 +-
 examples/fastspeech2/aishell3/run.sh | 4 +-
 examples/fastspeech2/aishell3/synthesize.sh | 4 +-
 .../fastspeech2/aishell3/synthesize_e2e.py | 22 +-
 examples/fastspeech2/baker/README.md | 35 ++--
 examples/fastspeech2/baker/preprocess.sh | 16 +-
 examples/fastspeech2/baker/run.sh | 3 +-
 examples/fastspeech2/baker/synthesize.sh | 4 +-
 examples/fastspeech2/baker/synthesize_e2e.py | 10 +-
 examples/fastspeech2/ljspeech/README.md | 34 +--
 examples/fastspeech2/ljspeech/preprocess.sh | 14 +-
 examples/fastspeech2/ljspeech/run.sh | 3 +-
 examples/fastspeech2/ljspeech/synthesize.sh | 3 +-
 .../fastspeech2/ljspeech/synthesize_e2e.py | 8 +-
 .../fastspeech2/normalize.py | 0
 .../fastspeech2/preprocess.py | 11 +-
 .../fastspeech2/synthesize.py | 51 +++--
 .../fastspeech2/train.py | 63 +++---
 examples/speedyspeech/baker/README.md | 16 +-
 examples/speedyspeech/baker/conf/default.yaml | 4 -
 examples/speedyspeech/baker/inference.py | 2 +-
 examples/speedyspeech/baker/preprocess.sh | 14 +-
 examples/speedyspeech/baker/run.sh | 4 +-
 examples/speedyspeech/baker/synthesize.sh | 4 +-
 examples/speedyspeech/baker/synthesize_e2e.py | 10 +-
 .../speedyspeech/normalize.py | 0
 .../speedyspeech/preprocess.py | 16 +-
 .../speedyspeech/synthesize.py | 8 +-
 .../speedyspeech/train.py | 9 +-
 examples/text_frontend/test_g2p.py | 4 +-
 examples/text_frontend/test_textnorm.py | 2 +-
 examples/transformer_tts/ljspeech/README.md | 5 +-
 .../transformer_tts/ljspeech/preprocess.sh | 8 +-
 examples/transformer_tts/ljspeech/run.sh | 2 +-
 .../transformer_tts/ljspeech/synthesize.sh | 2 +-
 .../ljspeech/synthesize_e2e.py | 4 +-
 .../{ljspeech => }/normalize.py | 10 +-
 .../{ljspeech => }/preprocess.py | 0
 .../{ljspeech => }/synthesize.py | 9 +-
 .../transformer_tts/{ljspeech => }/train.py | 52 +----
 .../datasets}/preprocess_utils.py | 0
 parakeet/frontend/__init__.py | 2 +-
 .../{cn_frontend.py => zh_frontend.py} | 2 +-
 .../README.md | 0
 .../__init__.py | 2 +-
 .../char_convert.py | 0
 .../chronology.py | 0
 .../constants.py | 0
 .../num.py | 0
 .../phonecode.py | 0
 .../quantifier.py | 0
 .../text_normlization.py | 0
 .../models/{ => fastspeech2}/fastspeech2.py | 0
 .../fastspeech2}/fastspeech2_updater.py | 12 +-
 .../parallel_wavegan.py | 0
 .../parallel_wavegan_updater.py} | 0
 .../models/{ => speedyspeech}/speedyspeech.py | 0
 .../speedyspeech}/speedyspeech_updater.py | 0
 .../{ => transformer_tts}/transformer_tts.py | 0
 .../transformer_tts_updater.py | 4 +-
 .../training/multispk_fastspeech2_updater.py | 162 --------------
 .../training/optimizer.py | 0
 .../benchmark}/PWGAN/README.md | 2 +-
 .../benchmark}/PWGAN/run_all.sh | 12 +-
 .../benchmark}/PWGAN/run_benchmark.sh | 6 +-
 tests/{ => chain}/README.md | 0
 tests/{ => chain}/infer.sh | 0
 tests/{ => chain}/lite_train_infer.sh | 0
 tests/{ => chain}/prepare.sh | 0
 .../speedyspeech_params_lite_multi_gpu.txt | 50 +++++
 .../speedyspeech_params_lite_single_gpu.txt | 51 +++++
 .../speedyspeech_params_whole_multi_gpu.txt | 50 +++++
 .../speedyspeech_params_whole_single_gpu.txt | 50 +++++
 tests/{ => chain}/test.sh | 0
 tests/{ => chain}/whole_train_infer.sh | 0
 tests/speedyspeech_params_lite_multi_gpu.txt | 50 -----
 tests/speedyspeech_params_lite_single_gpu.txt | 51 -----
 tests/speedyspeech_params_whole_multi_gpu.txt | 50 -----
 .../speedyspeech_params_whole_single_gpu.txt | 50 -----
 utils/fs2_pwg_syn.py | 141 -------------
 utils/fs2_train.py | 198 ------------------
 utils/gen_duration_from_textgrid.py | 8 +-
 100 files changed, 499 insertions(+), 1092 deletions(-)
 create mode 100644 examples/GANVocoder/README.md
 rename utils/vocoder_normalize.py => examples/GANVocoder/normalize.py (100%)
 rename examples/{ => GANVocoder}/parallelwave_gan/baker/README.md (88%)
 rename examples/{ => GANVocoder}/parallelwave_gan/baker/conf/default.yaml (96%)
 rename examples/{ => GANVocoder}/parallelwave_gan/baker/preprocess.sh (80%)
 rename examples/{ => GANVocoder}/parallelwave_gan/baker/run.sh (75%)
 rename examples/{ => GANVocoder}/parallelwave_gan/baker/synthesize.sh (68%)
 rename examples/{ => GANVocoder}/parallelwave_gan/baker/synthesize_from_wav.py (96%)
 rename examples/{ => GANVocoder}/parallelwave_gan/ljspeech/README.md (87%)
 rename examples/{ => GANVocoder}/parallelwave_gan/ljspeech/conf/default.yaml (95%)
 rename examples/{ => GANVocoder}/parallelwave_gan/ljspeech/preprocess.sh (80%)
 rename examples/{ => GANVocoder}/parallelwave_gan/ljspeech/run.sh (75%)
 rename examples/{ => GANVocoder}/parallelwave_gan/ljspeech/synthesize.sh (68%)
 rename utils/pwg_syn.py => examples/GANVocoder/parallelwave_gan/synthesize.py (97%)
 rename utils/pwg_train.py => examples/GANVocoder/parallelwave_gan/train.py (96%)
 rename utils/vocoder_preprocess.py => examples/GANVocoder/preprocess.py (97%)
 rename utils/fs2_normalize.py => examples/fastspeech2/normalize.py (100%)
 rename utils/fs2_preprocess.py => examples/fastspeech2/preprocess.py (97%)
 rename utils/multi_spk_fs2_pwg_syn.py => examples/fastspeech2/synthesize.py (79%)
 rename utils/multi_spk_fs2_train.py => examples/fastspeech2/train.py (81%)
 rename utils/ss_normalize.py => examples/speedyspeech/normalize.py (100%)
 rename utils/ss_preprocess.py => examples/speedyspeech/preprocess.py (95%)
 rename utils/ss_pwg_syn.py => examples/speedyspeech/synthesize.py (95%)
 rename utils/ss_train.py => examples/speedyspeech/train.py (96%)
 rename examples/transformer_tts/{ljspeech => }/normalize.py (95%)
 rename examples/transformer_tts/{ljspeech => }/preprocess.py (100%)
 rename examples/transformer_tts/{ljspeech => }/synthesize.py (95%)
 rename examples/transformer_tts/{ljspeech => }/train.py (81%)
 rename {utils => parakeet/datasets}/preprocess_utils.py (100%)
 rename parakeet/frontend/{cn_frontend.py => zh_frontend.py} (99%)
 rename parakeet/frontend/{cn_normalization => zh_normalization}/README.md (100%)
 rename parakeet/frontend/{cn_normalization => zh_normalization}/__init__.py (90%)
 rename parakeet/frontend/{cn_normalization => zh_normalization}/char_convert.py (100%)
 rename parakeet/frontend/{cn_normalization => zh_normalization}/chronology.py (100%)
 rename parakeet/frontend/{cn_normalization => zh_normalization}/constants.py (100%)
 rename parakeet/frontend/{cn_normalization => zh_normalization}/num.py (100%)
 rename parakeet/frontend/{cn_normalization => zh_normalization}/phonecode.py (100%)
 rename parakeet/frontend/{cn_normalization => zh_normalization}/quantifier.py (100%)
 rename parakeet/frontend/{cn_normalization => zh_normalization}/text_normlization.py (100%)
 rename parakeet/models/{ => fastspeech2}/fastspeech2.py (100%)
 rename parakeet/{training => models/fastspeech2}/fastspeech2_updater.py (93%)
 rename parakeet/models/{ => parallel_wavegan}/parallel_wavegan.py (100%)
 rename parakeet/{training/pwg_updater.py => models/parallel_wavegan/parallel_wavegan_updater.py} (100%)
 rename parakeet/models/{ => speedyspeech}/speedyspeech.py (100%)
 rename parakeet/{training => models/speedyspeech}/speedyspeech_updater.py (100%)
 rename parakeet/models/{ => transformer_tts}/transformer_tts.py (100%)
 rename parakeet/{training => models/transformer_tts}/transformer_tts_updater.py (98%)
 delete mode 100644 parakeet/training/multispk_fastspeech2_updater.py
 rename utils/train_utils.py => parakeet/training/optimizer.py (100%)
 rename {benchmark => tests/benchmark}/PWGAN/README.md (85%)
 rename {benchmark => tests/benchmark}/PWGAN/run_all.sh (69%)
 rename {benchmark => tests/benchmark}/PWGAN/run_benchmark.sh (86%)
 rename tests/{ => chain}/README.md (100%)
 rename tests/{ => chain}/infer.sh (100%)
 rename tests/{ => chain}/lite_train_infer.sh (100%)
 rename tests/{ => chain}/prepare.sh (100%)
 create mode 100644 tests/chain/speedyspeech_params_lite_multi_gpu.txt
 create mode 100644 tests/chain/speedyspeech_params_lite_single_gpu.txt
 create mode 100644 tests/chain/speedyspeech_params_whole_multi_gpu.txt
 create mode 100644 tests/chain/speedyspeech_params_whole_single_gpu.txt
 rename tests/{ => chain}/test.sh (100%)
 rename tests/{ => chain}/whole_train_infer.sh (100%)
 delete mode 100644 tests/speedyspeech_params_lite_multi_gpu.txt
 delete mode 100644 tests/speedyspeech_params_lite_single_gpu.txt
 delete mode 100644 tests/speedyspeech_params_whole_multi_gpu.txt
 delete mode 100644 tests/speedyspeech_params_whole_single_gpu.txt
 delete mode 100644 utils/fs2_pwg_syn.py
 delete mode 100644 utils/fs2_train.py

diff --git a/README.md b/README.md
index 572b7860..61bc8235 100644
--- a/README.md
+++ b/README.md
@@ -7,12 +7,14 @@ Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-spee
 
 ## News
+- Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech).
+- Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech).
 - Sep-14-2021, Reconstruction of TransformerTTS. Check [examples/transformer_tts/ljspeech](./examples/transformer_tts/ljspeech).
 - Aug-31-2021, Chinese Text Frontend. Check [examples/text_frontend](./examples/text_frontend).
 - Aug-23-2021, FastSpeech2/FastPitch with AISHELL-3. Check [examples/fastspeech2/aishell3](./examples/fastspeech2/aishell3).
 - Aug-03-2021, FastSpeech2/FastPitch with CSMSC. Check [examples/fastspeech2/baker](./examples/fastspeech2/baker).
 - Jul-19-2021, SpeedySpeech with CSMSC. Check [examples/speedyspeech/baker](./examples/speedyspeech/baker).
-- Jul-01-2021, Parallel WaveGAN with CSMSC. Check [examples/parallelwave_gan/baker](./examples/parallelwave_gan/baker).
+- Jul-01-2021, Parallel WaveGAN with CSMSC. Check [examples/GANVocoder/parallelwave_gan/baker](./examples/GANVocoder/parallelwave_gan/baker).
 - Jul-01-2021, Montreal-Forced-Aligner. Check [examples/use_mfa](./examples/use_mfa).
 - May-07-2021, Voice Cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3).
@@ -68,7 +70,7 @@ Entries to the introduction, and the launch of training and synthsis for differe
 - [>>> Chinese Text Frontend](./examples/text_frontend)
 - [>>> FastSpeech2/FastPitch](./examples/fastspeech2)
 - [>>> Montreal-Forced-Aligner](./examples/use_mfa)
-- [>>> Parallel WaveGAN](./examples/parallelwave_gan)
+- [>>> Parallel WaveGAN](./examples/GANVocoder/parallelwave_gan)
 - [>>> SpeedySpeech](./examples/speedyspeech)
 - [>>> Tacotron2_AISHELL3](./examples/tacotron2_aishell3)
 - [>>> GE2E](./examples/ge2e)
@@ -87,9 +89,10 @@ Check our [website](https://paddleparakeet.readthedocs.io/en/latest/demo.html) f
 #### FastSpeech2/FastPitch
 1. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
 2. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
+3. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
 
 #### SpeedySpeech
-1. [speedyspeech_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_baker_ckpt_0.4.zip)
+1. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
 
 #### TransformerTTS
@@ -109,6 +112,7 @@ Check our [website](https://paddleparakeet.readthedocs.io/en/latest/demo.html) f
 #### Parallel WaveGAN
 1. [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)
+2. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)
 
 ### Voice Cloning
diff --git a/examples/GANVocoder/README.md b/examples/GANVocoder/README.md
new file mode 100644
index 00000000..3109be17
--- /dev/null
+++ b/examples/GANVocoder/README.md
@@ -0,0 +1 @@
+Different GAN Vocoders have the same preprocess.py and normalize.py.
diff --git a/utils/vocoder_normalize.py b/examples/GANVocoder/normalize.py
similarity index 100%
rename from utils/vocoder_normalize.py
rename to examples/GANVocoder/normalize.py
diff --git a/examples/parallelwave_gan/baker/README.md b/examples/GANVocoder/parallelwave_gan/baker/README.md
similarity index 88%
rename from examples/parallelwave_gan/baker/README.md
rename to examples/GANVocoder/parallelwave_gan/baker/README.md
index 61bb4d0e..9d65b155 100644
--- a/examples/parallelwave_gan/baker/README.md
+++ b/examples/GANVocoder/parallelwave_gan/baker/README.md
@@ -37,19 +37,19 @@ Also there is a `metadata.jsonl` in each subfolder. It is a table-like file whic
 
 ## Train the model
-`./run.sh` calls `Parakeet/utils/pwg_train.py`.
+`./run.sh` calls `../train.py`.
 ```bash
 ./run.sh
 ```
 Here's the complete help message.
 ```text
-usage: pwg_train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
-                    [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                    [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
-                    [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
-                    [--run-benchmark RUN_BENCHMARK]
-                    [--profiler_options PROFILER_OPTIONS]
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+                [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+                [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
+                [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
+                [--run-benchmark RUN_BENCHMARK]
+                [--profiler_options PROFILER_OPTIONS]
 
 Train a ParallelWaveGAN model.
@@ -102,14 +102,14 @@ pwg_baker_ckpt_0.4
 
 ## Synthesize
-`synthesize.sh` calls `Parakeet/utils/pwg_syn.py `, which can synthesize waveform from `metadata.jsonl`.
+`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`.
 ```bash
 ./synthesize.sh
 ```
 ```text
-usage: pwg_syn.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
-                  [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
-                  [--device DEVICE] [--verbose VERBOSE]
+usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
+                     [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
+                     [--device DEVICE] [--verbose VERBOSE]
 
 Synthesize with parallel wavegan.
diff --git a/examples/parallelwave_gan/baker/conf/default.yaml b/examples/GANVocoder/parallelwave_gan/baker/conf/default.yaml
similarity index 96%
rename from examples/parallelwave_gan/baker/conf/default.yaml
rename to examples/GANVocoder/parallelwave_gan/baker/conf/default.yaml
index 96cb9390..5628b7f7 100644
--- a/examples/parallelwave_gan/baker/conf/default.yaml
+++ b/examples/GANVocoder/parallelwave_gan/baker/conf/default.yaml
@@ -15,10 +15,7 @@ window: "hann"           # Window function.
 n_mels: 80               # Number of mel basis.
 fmin: 80                 # Minimum freq in mel basis calculation. (Hz)
 fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)
-trim_silence: false      # Whether to trim the start and end of silence.
-top_db: 60               # Need to tune carefully if the recording is not good.
-trim_frame_length: 2048  # Frame size in trimming. (in samples)
-trim_hop_length: 512     # Hop size in trimming. (in samples)
+
 
 ###########################################################
 #         GENERATOR NETWORK ARCHITECTURE SETTING          #
diff --git a/examples/parallelwave_gan/baker/preprocess.sh b/examples/GANVocoder/parallelwave_gan/baker/preprocess.sh
similarity index 80%
rename from examples/parallelwave_gan/baker/preprocess.sh
rename to examples/GANVocoder/parallelwave_gan/baker/preprocess.sh
index 228b1464..df5b7d22 100755
--- a/examples/parallelwave_gan/baker/preprocess.sh
+++ b/examples/GANVocoder/parallelwave_gan/baker/preprocess.sh
@@ -3,10 +3,7 @@
 stage=0
 stop_stage=100
 
-fs=24000
-n_shift=300
-
-export MAIN_ROOT=`realpath ${PWD}/../../../`
+export MAIN_ROOT=`realpath ${PWD}/../../../../`
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # get durations from MFA's result
@@ -14,13 +11,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
-       --sample-rate=${fs} \
-       --n-shift=${n_shift}
+       --config=conf/default.yaml
 fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "Extract features ..."
-    python3 ${MAIN_ROOT}/utils/vocoder_preprocess.py \
+    python3 ../../preprocess.py \
        --rootdir=~/datasets/BZNSYP/ \
        --dataset=baker \
        --dumpdir=dump \
@@ -42,16 +38,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
-    python3 ${MAIN_ROOT}/utils/vocoder_normalize.py \
+    python3 ../../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
 
-    python3 ${MAIN_ROOT}/utils/vocoder_normalize.py \
+    python3 ../../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy
 
-    python3 ${MAIN_ROOT}/utils/vocoder_normalize.py \
+    python3 ../../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
diff --git a/examples/parallelwave_gan/baker/run.sh b/examples/GANVocoder/parallelwave_gan/baker/run.sh
similarity index 75%
rename from examples/parallelwave_gan/baker/run.sh
rename to examples/GANVocoder/parallelwave_gan/baker/run.sh
index f79ae6ed..df8cefd8 100755
--- a/examples/parallelwave_gan/baker/run.sh
+++ b/examples/GANVocoder/parallelwave_gan/baker/run.sh
@@ -1,10 +1,8 @@
 #!/bin/bash
 
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
-python ${MAIN_ROOT}/utils/pwg_train.py \
+python ../train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=conf/default.yaml \
diff --git a/examples/parallelwave_gan/baker/synthesize.sh b/examples/GANVocoder/parallelwave_gan/baker/synthesize.sh
similarity index 68%
rename from examples/parallelwave_gan/baker/synthesize.sh
rename to examples/GANVocoder/parallelwave_gan/baker/synthesize.sh
index 3887fc09..938e7869 100755
--- a/examples/parallelwave_gan/baker/synthesize.sh
+++ b/examples/GANVocoder/parallelwave_gan/baker/synthesize.sh
@@ -1,8 +1,6 @@
 #!/bin/bash
 
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-python3 ${MAIN_ROOT}/utils/pwg_syn.py \
+python3 ../synthesize.py \
    --config=conf/default.yaml \
    --checkpoint=exp/default/checkpoints/snapshot_iter_400000.pdz\
    --test-metadata=dump/test/norm/metadata.jsonl \
diff --git a/examples/parallelwave_gan/baker/synthesize_from_wav.py b/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py
similarity index 96%
rename from examples/parallelwave_gan/baker/synthesize_from_wav.py
rename to examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py
index 16db38b4..5435422c 100644
--- a/examples/parallelwave_gan/baker/synthesize_from_wav.py
+++ b/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py
@@ -23,8 +23,8 @@ import soundfile as sf
 import yaml
 
 from parakeet.data.get_feats import LogMelFBank
-from parakeet.models.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan import PWGInference
+from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference
 from parakeet.modules.normalizer import ZScore
 from yacs.config import CfgNode as Configuration
 
diff --git a/examples/parallelwave_gan/ljspeech/README.md b/examples/GANVocoder/parallelwave_gan/ljspeech/README.md
similarity index 87%
rename from examples/parallelwave_gan/ljspeech/README.md
rename to examples/GANVocoder/parallelwave_gan/ljspeech/README.md
index 3f2a6714..066181a1 100644
--- a/examples/parallelwave_gan/ljspeech/README.md
+++ b/examples/GANVocoder/parallelwave_gan/ljspeech/README.md
@@ -39,19 +39,19 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains id and paths to spectrogam of each utterance.
 
 ## Train the model
-`./run.sh` calls `Parakeet/utils/pwg_train.py`.
+`./run.sh` calls `../train.py`.
 ```bash
 ./run.sh
 ```
 Here's the complete help message.
 ```text
-usage: pwg_train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
-                    [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                    [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
-                    [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
-                    [--run-benchmark RUN_BENCHMARK]
-                    [--profiler_options PROFILER_OPTIONS]
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+                [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+                [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
+                [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
+                [--run-benchmark RUN_BENCHMARK]
+                [--profiler_options PROFILER_OPTIONS]
 
 Train a ParallelWaveGAN model.
@@ -102,14 +102,14 @@ pwg_ljspeech_ckpt_0.5
 ```
 
 ## Synthesize
-`synthesize.sh` calls `Parakeet/utils/pwg_syn.py `, which can synthesize waveform from `metadata.jsonl`.
+`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`.
 ```bash
 ./synthesize.sh
 ```
 ```text
-usage: pwg_syn.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
-                  [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
-                  [--device DEVICE] [--verbose VERBOSE]
+usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
+                     [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
+                     [--device DEVICE] [--verbose VERBOSE]
 
 Synthesize with parallel wavegan.
diff --git a/examples/parallelwave_gan/ljspeech/conf/default.yaml b/examples/GANVocoder/parallelwave_gan/ljspeech/conf/default.yaml
similarity index 95%
rename from examples/parallelwave_gan/ljspeech/conf/default.yaml
rename to examples/GANVocoder/parallelwave_gan/ljspeech/conf/default.yaml
index f046f5ba..2edec3b9 100644
--- a/examples/parallelwave_gan/ljspeech/conf/default.yaml
+++ b/examples/GANVocoder/parallelwave_gan/ljspeech/conf/default.yaml
@@ -15,10 +15,6 @@ window: "hann"           # Window function.
 n_mels: 80               # Number of mel basis.
 fmin: 80                 # Minimum freq in mel basis calculation. (Hz)
 fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)
-trim_silence: false      # Whether to trim the start and end of silence.
-top_db: 60               # Need to tune carefully if the recording is not good.
-trim_frame_length: 2048  # Frame size in trimming. (in samples)
-trim_hop_length: 512     # Hop size in trimming. (in samples)
 
 ###########################################################
 #         GENERATOR NETWORK ARCHITECTURE SETTING          #
diff --git a/examples/parallelwave_gan/ljspeech/preprocess.sh b/examples/GANVocoder/parallelwave_gan/ljspeech/preprocess.sh
similarity index 80%
rename from examples/parallelwave_gan/ljspeech/preprocess.sh
rename to examples/GANVocoder/parallelwave_gan/ljspeech/preprocess.sh
index 5185a146..d88d2989 100755
--- a/examples/parallelwave_gan/ljspeech/preprocess.sh
+++ b/examples/GANVocoder/parallelwave_gan/ljspeech/preprocess.sh
@@ -3,10 +3,7 @@
 stage=0
 stop_stage=100
 
-fs=22050
-n_shift=256
-
-export MAIN_ROOT=`realpath ${PWD}/../../../`
+export MAIN_ROOT=`realpath ${PWD}/../../../../`
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
@@ -14,14 +11,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./ljspeech_alignment \
        --output=durations.txt \
-       --sample-rate=${fs} \
-       --n-shift=${n_shift}
+       --config=conf/default.yaml
 fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
-    python3 ${MAIN_ROOT}/utils/vocoder_preprocess.py \
+    python3 ../../preprocess.py \
        --rootdir=~/datasets/LJSpeech-1.1/ \
        --dataset=ljspeech \
        --dumpdir=dump \
@@ -43,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
-    python3 ${MAIN_ROOT}/utils/vocoder_normalize.py \
+    python3 ../../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
 
-    python3 ${MAIN_ROOT}/utils/vocoder_normalize.py \
+    python3 ../../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy
 
-    python3 ${MAIN_ROOT}/utils/vocoder_normalize.py \
+    python3 ../../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
diff --git a/examples/parallelwave_gan/ljspeech/run.sh b/examples/GANVocoder/parallelwave_gan/ljspeech/run.sh
similarity index 75%
rename from examples/parallelwave_gan/ljspeech/run.sh
rename to examples/GANVocoder/parallelwave_gan/ljspeech/run.sh
index f79ae6ed..df8cefd8 100755
--- a/examples/parallelwave_gan/ljspeech/run.sh
+++ b/examples/GANVocoder/parallelwave_gan/ljspeech/run.sh
@@ -1,10 +1,8 @@
 #!/bin/bash
 
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
-python ${MAIN_ROOT}/utils/pwg_train.py \
+python ../train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=conf/default.yaml \
diff --git a/examples/parallelwave_gan/ljspeech/synthesize.sh b/examples/GANVocoder/parallelwave_gan/ljspeech/synthesize.sh
similarity index 68%
rename from examples/parallelwave_gan/ljspeech/synthesize.sh
rename to examples/GANVocoder/parallelwave_gan/ljspeech/synthesize.sh
index 3887fc09..938e7869 100755
--- a/examples/parallelwave_gan/ljspeech/synthesize.sh
+++ b/examples/GANVocoder/parallelwave_gan/ljspeech/synthesize.sh
@@ -1,8 +1,6 @@
 #!/bin/bash
 
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-python3 ${MAIN_ROOT}/utils/pwg_syn.py \
+python3 ../synthesize.py \
    --config=conf/default.yaml \
    --checkpoint=exp/default/checkpoints/snapshot_iter_400000.pdz\
    --test-metadata=dump/test/norm/metadata.jsonl \
diff --git a/utils/pwg_syn.py b/examples/GANVocoder/parallelwave_gan/synthesize.py
similarity index 97%
rename from utils/pwg_syn.py
rename to examples/GANVocoder/parallelwave_gan/synthesize.py
index 7c37e340..1ee52dbc 100644
--- a/utils/pwg_syn.py
+++ b/examples/GANVocoder/parallelwave_gan/synthesize.py
@@ -24,7 +24,7 @@ import yaml
 from paddle import distributed as dist
 
 from parakeet.datasets.data_table import DataTable
-from parakeet.models.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
 from yacs.config import CfgNode
 
diff --git a/utils/pwg_train.py b/examples/GANVocoder/parallelwave_gan/train.py
similarity index 96%
rename from utils/pwg_train.py
rename to examples/GANVocoder/parallelwave_gan/train.py
index 61444a01..ec357a30 100644
--- a/utils/pwg_train.py
+++ b/examples/GANVocoder/parallelwave_gan/train.py
@@ -30,13 +30,13 @@ from paddle.optimizer.lr import StepDecay
 
 from parakeet.datasets.data_table import DataTable
 from parakeet.datasets.vocoder_batch_fn import Clip
-from parakeet.models.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan import PWGDiscriminator
+from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan.parallel_wavegan import PWGDiscriminator
+from parakeet.models.parallel_wavegan.parallel_wavegan_updater import PWGUpdater
+from parakeet.models.parallel_wavegan.parallel_wavegan_updater import PWGEvaluator
 from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
 from parakeet.training.extensions.snapshot import Snapshot
 from parakeet.training.extensions.visualizer import VisualDL
-from parakeet.training.pwg_updater import PWGUpdater
-from parakeet.training.pwg_updater import PWGEvaluator
 from parakeet.training.seeding import seed_everything
 from parakeet.training.trainer import Trainer
 from pathlib import Path
diff --git a/utils/vocoder_preprocess.py b/examples/GANVocoder/preprocess.py
similarity index 97%
rename from utils/vocoder_preprocess.py
rename to examples/GANVocoder/preprocess.py
index 776281e6..e536f595 100644
--- a/utils/vocoder_preprocess.py
+++ b/examples/GANVocoder/preprocess.py
@@ -132,14 +132,6 @@ def process_sentence(config: Dict[str, Any],
         start, end = librosa.time_to_samples([start, end], sr=config.fs)
         y = y[start:end]
 
-    # energy based silence trimming
-    if config.trim_silence:
-        y, _ = librosa.effects.trim(
-            y,
-            top_db=config.top_db,
-            frame_length=config.trim_frame_length,
-            hop_length=config.trim_hop_length)
-
     # extract mel feats
     logmel = mel_extractor.get_log_mel_fbank(y)
 
diff --git a/examples/fastspeech2/aishell3/README.md b/examples/fastspeech2/aishell3/README.md
index c58f0b70..d7f90067 100644
--- a/examples/fastspeech2/aishell3/README.md
+++ b/examples/fastspeech2/aishell3/README.md
@@ -48,21 +48,18 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
 
 ## Train the model
-`./run.sh` calls `Parakeet/utils/multi_spk_fs2_train.py`.
+`./run.sh` calls `../train.py`.
 ```bash
 ./run.sh
 ```
 Here's the complete help message.
 ```text
-usage: multi_spk_fs2_train.py [-h] [--config CONFIG]
-                              [--train-metadata TRAIN_METADATA]
-                              [--dev-metadata DEV_METADATA]
-                              [--output-dir OUTPUT_DIR] [--device DEVICE]
-                              [--nprocs NPROCS] [--verbose VERBOSE]
-                              [--phones-dict PHONES_DICT]
-                              [--speaker-dict SPEAKER_DICT]
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+                [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+                [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
+                [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT]
 
-Train a FastSpeech2 model with multiple speaker dataset.
+Train a FastSpeech2 model.
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -79,7 +76,7 @@ optional arguments:
   --phones-dict PHONES_DICT
                         phone vocabulary file.
   --speaker-dict SPEAKER_DICT
-                        speaker id map file.
+                        speaker id map file for multiple speaker model.
 ```
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
@@ -148,7 +145,7 @@ optional arguments:
   --phones-dict PHONES_DICT
                         phone vocabulary file.
   --speaker-dict SPEAKER_DICT
-                        speaker id map file.
+                        speaker id map file for multiple speaker model.
   --test-metadata TEST_METADATA
                         test metadata.
   --output-dir OUTPUT_DIR
diff --git a/examples/fastspeech2/aishell3/preprocess.sh b/examples/fastspeech2/aishell3/preprocess.sh
index 8713cc6e..281abee0 100755
--- a/examples/fastspeech2/aishell3/preprocess.sh
+++ b/examples/fastspeech2/aishell3/preprocess.sh
@@ -2,8 +2,6 @@
 
 stage=0
 stop_stage=100
-fs=24000
-n_shift=300
 
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 
@@ -13,20 +11,19 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./aishell3_alignment_tone \
        --output durations.txt \
-       --sample-rate=${fs} \
-       --n-shift=${n_shift}
+       --config=conf/default.yaml
 fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
-    python3 ${MAIN_ROOT}/utils/fs2_preprocess.py \
+    python3 ../preprocess.py \
        --dataset=aishell3 \
        --rootdir=~/datasets/data_aishell3/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=conf/default.yaml \
-       --num-cpu=8 \
+       --num-cpu=20 \
        --cut-sil=True
 fi
 
@@ -49,7 +46,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and covert phone/speaker to id, dev and test should use train's stats
    echo "Normalize ..."
-    python3 ${MAIN_ROOT}/utils/fs2_normalize.py \
+    python3 ../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
@@ -58,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
 
-    python3 ${MAIN_ROOT}/utils/fs2_normalize.py \
+    python3 ../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
@@ -67,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
 
-    python3 ${MAIN_ROOT}/utils/fs2_normalize.py \
+    python3 ../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
diff --git a/examples/fastspeech2/aishell3/run.sh b/examples/fastspeech2/aishell3/run.sh
index adbe9da5..d4f06da9 100755
--- a/examples/fastspeech2/aishell3/run.sh
+++ b/examples/fastspeech2/aishell3/run.sh
@@ -1,8 +1,6 @@
 #!/bin/bash
 
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-python3 ${MAIN_ROOT}/utils/multi_spk_fs2_train.py \
+python3 ../train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=conf/default.yaml \
diff --git a/examples/fastspeech2/aishell3/synthesize.sh b/examples/fastspeech2/aishell3/synthesize.sh
index fdb73508..f14bace9 100755
--- a/examples/fastspeech2/aishell3/synthesize.sh
+++ b/examples/fastspeech2/aishell3/synthesize.sh
@@ -1,8 +1,6 @@
 #!/bin/bash
 
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-python3 ${MAIN_ROOT}/utils/multi_spk_fs2_pwg_syn.py \
+python3 ../synthesize.py \
    --fastspeech2-config=conf/default.yaml \
    --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_96400.pdz \
    --fastspeech2-stat=dump/train/speech_stats.npy \
diff --git a/examples/fastspeech2/aishell3/synthesize_e2e.py b/examples/fastspeech2/aishell3/synthesize_e2e.py
index f28db866..b4be5db7 100644
--- a/examples/fastspeech2/aishell3/synthesize_e2e.py
+++ b/examples/fastspeech2/aishell3/synthesize_e2e.py
@@ -20,11 +20,11 @@ import paddle
 import soundfile as sf
 import yaml
 
-from parakeet.frontend.cn_frontend import Frontend
-from parakeet.models.fastspeech2 import FastSpeech2
-from parakeet.models.fastspeech2 import FastSpeech2Inference
-from parakeet.models.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan import PWGInference
+from parakeet.frontend.zh_frontend import Frontend
+from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2
+from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2Inference
+from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference
 from parakeet.modules.normalizer import ZScore
 from yacs.config import CfgNode
 
@@ -106,7 +106,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
             str(output_dir / (str(spk_id) + "_" + utt_id + ".wav")),
             wav.numpy(),
             samplerate=fastspeech2_config.fs)
-        print(f"{utt_id} done!")
+        print(f"{spk_id}_{utt_id} done!")
 
 
 def main():
@@ -136,15 +136,9 @@ def main():
         help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
     )
     parser.add_argument(
-        "--phones-dict",
-        type=str,
-        default="phone_id_map.txt",
-        help="phone vocabulary file.")
+        "--phones-dict", type=str, default=None, help="phone vocabulary file.")
     parser.add_argument(
-        "--speaker-dict",
-        type=str,
-        default="speaker_id_map.txt ",
-        help="speaker id map file.")
+        "--speaker-dict", type=str, default=None, help="speaker id map file.")
     parser.add_argument(
         "--text",
         type=str,
diff --git a/examples/fastspeech2/baker/README.md b/examples/fastspeech2/baker/README.md
index 221ab192..b23e6295 100644
--- a/examples/fastspeech2/baker/README.md
+++ b/examples/fastspeech2/baker/README.md
@@ -41,18 +41,18 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
 
 ## Train the model
-`./run.sh` calls `Parakeet/utils/fs2_train.py`.
+`./run.sh` calls `../train.py`.
 ```bash
 ./run.sh
 ```
 Here's the complete help message.
 ```text
-usage: fs2_train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
-                    [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                    [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
-                    [--phones-dict PHONES_DICT]
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+                [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+                [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
+                [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT]
 
-Train a FastSpeech2 model with sigle speaker dataset.
+Train a FastSpeech2 model.
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -68,6 +68,8 @@ optional arguments:
   --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
+  --speaker-dict SPEAKER_DICT
+                        speaker id map file for multiple speaker model.
 ```
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
@@ -100,20 +102,19 @@ pwg_baker_ckpt_0.4
 ├── pwg_snapshot_iter_400000.pdz  # model parameters of parallel wavegan
 └── pwg_stats.npy                 # statistics used to normalize spectrogram when training parallel wavegan
 ```
-`synthesize.sh` calls `Parakeet/utils/fs2_pwg_syn.py`, which can synthesize waveform from `metadata.jsonl`.
+`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
 ./synthesize.sh
 ```
 ```text
-usage: fs2_pwg_syn.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
-                      [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
-                      [--fastspeech2-stat FASTSPEECH2_STAT]
-                      [--pwg-config PWG_CONFIG]
-                      [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT]
-                      [--phones-dict PHONES_DICT]
-                      [--test-metadata TEST_METADATA]
-                      [--output-dir OUTPUT_DIR] [--device DEVICE]
-                      [--verbose VERBOSE]
+usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
+                     [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
+                     [--fastspeech2-stat FASTSPEECH2_STAT]
+                     [--pwg-config PWG_CONFIG]
+                     [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT]
+                     [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT]
+                     [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
+                     [--device DEVICE] [--verbose VERBOSE]
 
 Synthesize with fastspeech2 & parallel wavegan.
@@ -134,6 +135,8 @@ optional arguments:
                         spectrogram when training parallel wavegan.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
+  --speaker-dict SPEAKER_DICT
+                        speaker id map file for multiple speaker model.
   --test-metadata TEST_METADATA
                         test metadata.
   --output-dir OUTPUT_DIR
diff --git a/examples/fastspeech2/baker/preprocess.sh b/examples/fastspeech2/baker/preprocess.sh
index 0f6b43a2..dff3e349 100755
--- a/examples/fastspeech2/baker/preprocess.sh
+++ b/examples/fastspeech2/baker/preprocess.sh
@@ -3,9 +3,6 @@
 stage=0
 stop_stage=100
 
-fs=24000
-n_shift=300
-
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
@@ -14,20 +11,19 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
-       --sample-rate=${fs} \
-       --n-shift=${n_shift}
+       --config=conf/default.yaml
 fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
-    python3 ${MAIN_ROOT}/utils/fs2_preprocess.py \
+    python3 ../preprocess.py \
        --dataset=baker \
        --rootdir=~/datasets/BZNSYP/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=conf/default.yaml \
-       --num-cpu=8 \
+       --num-cpu=20 \
        --cut-sil=True
 fi
 
@@ -50,7 +46,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and covert phone/speaker to id, dev and test should use train's stats
    echo "Normalize ..."
-    python3 ${MAIN_ROOT}/utils/fs2_normalize.py \
+    python3 ../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
@@ -59,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
 
-    python3 ${MAIN_ROOT}/utils/fs2_normalize.py \
+    python3 ../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
@@ -68,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
 
-    python3 ${MAIN_ROOT}/utils/fs2_normalize.py \
+    python3 ../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
diff --git a/examples/fastspeech2/baker/run.sh b/examples/fastspeech2/baker/run.sh
index 7ded4220..3e9a5e22 100755
--- a/examples/fastspeech2/baker/run.sh
+++ b/examples/fastspeech2/baker/run.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-python3 ${MAIN_ROOT}/utils/fs2_train.py \
+python3 ../train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=conf/default.yaml \
diff --git a/examples/fastspeech2/baker/synthesize.sh b/examples/fastspeech2/baker/synthesize.sh
index 0f2227ae..6b363d88 100755
--- a/examples/fastspeech2/baker/synthesize.sh
+++ b/examples/fastspeech2/baker/synthesize.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-python3 ${MAIN_ROOT}/utils/fs2_pwg_syn.py \
+python3 ../synthesize.py \
    --fastspeech2-config=conf/default.yaml \
    --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_76000.pdz \
    --fastspeech2-stat=dump/train/speech_stats.npy \
diff --git a/examples/fastspeech2/baker/synthesize_e2e.py b/examples/fastspeech2/baker/synthesize_e2e.py
index 4815c648..0321bda4 100644
--- a/examples/fastspeech2/baker/synthesize_e2e.py
+++ b/examples/fastspeech2/baker/synthesize_e2e.py
@@ -20,11 +20,11 @@ import paddle
 import soundfile as sf
 import yaml
 
-from parakeet.frontend.cn_frontend import Frontend
-from parakeet.models.fastspeech2 import FastSpeech2
-from parakeet.models.fastspeech2 import FastSpeech2Inference
-from parakeet.models.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan import PWGInference
+from parakeet.frontend.zh_frontend import Frontend
+from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2
+from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2Inference
+from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference
 from parakeet.modules.normalizer import ZScore
 from yacs.config import CfgNode
 
diff --git a/examples/fastspeech2/ljspeech/README.md b/examples/fastspeech2/ljspeech/README.md
index 1eb74f67..7aa4f1ba 100644
--- a/examples/fastspeech2/ljspeech/README.md
+++ b/examples/fastspeech2/ljspeech/README.md
@@ -41,18 +41,18 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
 
 ## Train the model
-`./run.sh` calls `Parakeet/utils/fs2_train.py`.
+`./run.sh` calls `../train.py`.
 ```bash
 ./run.sh
 ```
 Here's the complete help message.
 ```text
-usage: fs2_train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
-                    [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                    [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
-                    [--phones-dict PHONES_DICT]
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+                [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+                [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
+                [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT]
 
-Train a FastSpeech2 model with sigle speaker dataset.
+Train a FastSpeech2 model.
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -68,6 +68,8 @@ optional arguments:
   --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
+  --speaker-dict SPEAKER_DICT
+                        speaker id map file for multiple speaker model.
 ```
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
@@ -100,19 +102,19 @@ pwg_ljspeech_ckpt_0.5
 ├── pwg_snapshot_iter_400000.pdz  # generator parameters of parallel wavegan
 └── pwg_stats.npy                 # statistics used to normalize spectrogram when training parallel wavegan
 ```
-`synthesize.sh` calls `Parakeet/utils/fs2_pwg_syn.py`, which can synthesize waveform from `metadata.jsonl`.
+`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
 ./synthesize.sh
 ```
 ```text
-usage: fs2_pwg_syn.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
-                      [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
-                      [--fastspeech2-stat FASTSPEECH2_STAT]
-                      [--pwg-config PWG_CONFIG]
-                      [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT]
-                      [--phones-dict PHONES_DICT]
-                      [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
-                      [--device DEVICE] [--verbose VERBOSE]
+usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
+                     [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
+                     [--fastspeech2-stat FASTSPEECH2_STAT]
+                     [--pwg-config PWG_CONFIG]
+                     [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT]
+                     [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT]
+                     [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
+                     [--device DEVICE] [--verbose VERBOSE]
 
 Synthesize with fastspeech2 & parallel wavegan.
 
@@ -133,6 +135,8 @@ optional arguments:
                         spectrogram when training parallel wavegan.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
+  --speaker-dict SPEAKER_DICT
+                        speaker id map file for multiple speaker model.
   --test-metadata TEST_METADATA
                         test metadata.
   --output-dir OUTPUT_DIR
diff --git a/examples/fastspeech2/ljspeech/preprocess.sh b/examples/fastspeech2/ljspeech/preprocess.sh
index a96f1c62..ff2e765d 100755
--- a/examples/fastspeech2/ljspeech/preprocess.sh
+++ b/examples/fastspeech2/ljspeech/preprocess.sh
@@ -3,9 +3,6 @@
 stage=0
 stop_stage=100
 
-fs=22050
-n_shift=256
-
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
@@ -14,14 +11,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./ljspeech_alignment \
        --output=durations.txt \
-       --sample-rate=${fs} \
-       --n-shift=${n_shift}
+       --config=conf/default.yaml
 fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
-    python3 ${MAIN_ROOT}/utils/fs2_preprocess.py \
+    python3 ../preprocess.py \
        --dataset=ljspeech \
        --rootdir=~/datasets/LJSpeech-1.1/ \
        --dumpdir=dump \
@@ -50,7 +46,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and covert phone/speaker to id, dev and test should use train's stats
    echo "Normalize ..."
-    python3 ${MAIN_ROOT}/utils/fs2_normalize.py \
+    python3 ../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
@@ -59,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
 
-    python3 ${MAIN_ROOT}/utils/fs2_normalize.py \
+    python3 ../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
@@ -68,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
 
-    python3 ${MAIN_ROOT}/utils/fs2_normalize.py \
+    python3 ../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
diff --git a/examples/fastspeech2/ljspeech/run.sh b/examples/fastspeech2/ljspeech/run.sh
index d42974df..fd5e2c68 100755
--- a/examples/fastspeech2/ljspeech/run.sh
+++ b/examples/fastspeech2/ljspeech/run.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-python3 ${MAIN_ROOT}/utils/fs2_train.py \
+python3 ../train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=conf/default.yaml \
diff --git a/examples/fastspeech2/ljspeech/synthesize.sh b/examples/fastspeech2/ljspeech/synthesize.sh
index b71b17d9..dd16c867 100755
--- a/examples/fastspeech2/ljspeech/synthesize.sh
+++ b/examples/fastspeech2/ljspeech/synthesize.sh
@@ -1,8 +1,7 @@
 #!/bin/bash
 
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-python3 ${MAIN_ROOT}/utils/fs2_pwg_syn.py \
+python3 ../synthesize.py \
    --fastspeech2-config=conf/default.yaml \
    --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \
    --fastspeech2-stat=dump/train/speech_stats.npy \
diff --git a/examples/fastspeech2/ljspeech/synthesize_e2e.py b/examples/fastspeech2/ljspeech/synthesize_e2e.py
index 93ed91c0..16890658 100644
--- a/examples/fastspeech2/ljspeech/synthesize_e2e.py
+++ b/examples/fastspeech2/ljspeech/synthesize_e2e.py
@@ -22,10 +22,10 @@ import yaml
 from yacs.config import CfgNode
 
 from parakeet.frontend import English
-from parakeet.models.fastspeech2 import FastSpeech2
-from parakeet.models.fastspeech2 import FastSpeech2Inference
-from parakeet.models.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan import PWGInference
+from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2
+from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2Inference
+from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference
 from parakeet.modules.normalizer import ZScore
 
diff --git a/utils/fs2_normalize.py b/examples/fastspeech2/normalize.py
similarity index 100%
rename from utils/fs2_normalize.py
rename to examples/fastspeech2/normalize.py
diff --git a/utils/fs2_preprocess.py b/examples/fastspeech2/preprocess.py
similarity index 97%
rename from utils/fs2_preprocess.py
rename to examples/fastspeech2/preprocess.py
index 62ff3756..49716fb7 100644
--- a/utils/fs2_preprocess.py
+++ b/examples/fastspeech2/preprocess.py
@@ -29,14 +29,13 @@
 from parakeet.data.get_feats import Energy
 from parakeet.data.get_feats import LogMelFBank
 from parakeet.data.get_feats import Pitch
+from parakeet.datasets.preprocess_utils import compare_duration_and_mel_length
+from parakeet.datasets.preprocess_utils import merge_silence
+from parakeet.datasets.preprocess_utils import get_input_token
+from parakeet.datasets.preprocess_utils import get_phn_dur
+from parakeet.datasets.preprocess_utils import get_spk_id_map
 from yacs.config import CfgNode
 
-from preprocess_utils import compare_duration_and_mel_length
-from preprocess_utils import merge_silence
-from preprocess_utils import get_input_token
-from preprocess_utils import get_phn_dur
-from preprocess_utils import get_spk_id_map
-
 
 def process_sentence(config: Dict[str, Any],
                      fp: Path,
diff --git a/utils/multi_spk_fs2_pwg_syn.py b/examples/fastspeech2/synthesize.py
similarity index 79%
rename from utils/multi_spk_fs2_pwg_syn.py
rename to examples/fastspeech2/synthesize.py
index 23bb95e3..af427ab6 100644
--- a/utils/multi_spk_fs2_pwg_syn.py
+++ b/examples/fastspeech2/synthesize.py
@@ -23,10 +23,10 @@ import yaml
 from yacs.config import CfgNode
 
 from parakeet.datasets.data_table import DataTable
-from parakeet.models.fastspeech2 import FastSpeech2
-from parakeet.models.fastspeech2 import FastSpeech2Inference
-from parakeet.models.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan import PWGInference
+from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2
+from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2Inference
+from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference
 from parakeet.modules.normalizer import ZScore
 
 
@@ -37,20 +37,28 @@ def evaluate(args, fastspeech2_config, pwg_config):
     # construct dataset for evaluation
     with jsonlines.open(args.test_metadata, 'r') as reader:
         test_metadata = list(reader)
-    test_dataset = DataTable(
-        data=test_metadata, fields=["utt_id", "text", "spk_id"])
+    fields = ["utt_id", "text"]
+
+    if args.speaker_dict is not None:
+        print("multiple speaker fastspeech2!")
+        with open(args.speaker_dict, 'rt') as f:
+            spk_id = [line.strip().split() for line in f.readlines()]
+        num_speakers = len(spk_id)
+        fields += ["spk_id"]
+    else:
+        print("single speaker fastspeech2!")
+        num_speakers = None
+    print("num_speakers:", num_speakers)
+
+    test_dataset = DataTable(data=test_metadata, fields=fields)
+
+    odim = fastspeech2_config.n_mels
     with open(args.phones_dict, "r") as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
 
-    with open(args.speaker_dict, 'rt') as f:
-        spk_id = [line.strip().split() for line in f.readlines()]
-    num_speakers = len(spk_id)
-    print("num_speakers:", num_speakers)
-
-    odim = fastspeech2_config.n_mels
     model = FastSpeech2(
         idim=vocab_size,
         odim=odim,
@@ -79,7 +87,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
         std = paddle.to_tensor(std)
         pwg_normalizer = ZScore(mu, std)
 
-    fastspeech2_inferencce = FastSpeech2Inference(fastspeech2_normalizer, model)
+    fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
     pwg_inference = PWGInference(pwg_normalizer, vocoder)
 
     output_dir = Path(args.output_dir)
@@ -88,10 +96,12 @@ def evaluate(args, fastspeech2_config, pwg_config):
     for datum in test_dataset:
         utt_id = datum["utt_id"]
         text = paddle.to_tensor(datum["text"])
-        spk_id = paddle.to_tensor(datum["spk_id"])
-
+        if "spk_id" in datum:
+            spk_id = paddle.to_tensor(datum["spk_id"])
+        else:
+            spk_id = None
         with paddle.no_grad():
-            wav = pwg_inference(fastspeech2_inferencce(text, spk_id=spk_id))
+            wav = pwg_inference(fastspeech2_inference(text, spk_id=spk_id))
         sf.write(
             str(output_dir / (utt_id + ".wav")),
".wav")), wav.numpy(), @@ -126,15 +136,12 @@ def main(): help="mean and standard deviation used to normalize spectrogram when training parallel wavegan." ) parser.add_argument( - "--phones-dict", - type=str, - default="phone_id_map.txt", - help="phone vocabulary file.") + "--phones-dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( "--speaker-dict", type=str, - default="speaker_id_map.txt ", - help="speaker id map file.") + default=None, + help="speaker id map file for multiple speaker model.") parser.add_argument("--test-metadata", type=str, help="test metadata.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( diff --git a/utils/multi_spk_fs2_train.py b/examples/fastspeech2/train.py similarity index 81% rename from utils/multi_spk_fs2_train.py rename to examples/fastspeech2/train.py index abdd02bb..99c5c7b9 100644 --- a/utils/multi_spk_fs2_train.py +++ b/examples/fastspeech2/train.py @@ -23,23 +23,22 @@ import yaml from paddle import DataParallel from paddle import distributed as dist -from paddle.io import DataLoader -from paddle.io import DistributedBatchSampler +from paddle.io import DataLoader, DistributedBatchSampler from parakeet.datasets.data_table import DataTable +from parakeet.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn from parakeet.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn -from parakeet.models.fastspeech2 import FastSpeech2 +from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2 +from parakeet.models.fastspeech2.fastspeech2_updater import FastSpeech2Evaluator +from parakeet.models.fastspeech2.fastspeech2_updater import FastSpeech2Updater from parakeet.training.extensions.snapshot import Snapshot from parakeet.training.extensions.visualizer import VisualDL -from parakeet.training.multispk_fastspeech2_updater import FastSpeech2Evaluator -from parakeet.training.multispk_fastspeech2_updater import FastSpeech2Updater +from parakeet.training.optimizer import build_optimizers from parakeet.training.seeding import seed_everything from parakeet.training.trainer import Trainer from pathlib import Path from visualdl import LogWriter from yacs.config import CfgNode -from train_utils import build_optimizers - def train_sp(args, config): # decides device type and whether to run in parallel @@ -58,6 +57,22 @@ def train_sp(args, config): print( f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", ) + fields = [ + "text", "text_lengths", "speech", "speech_lengths", "durations", + "pitch", "energy" + ] + if args.speaker_dict is not None: + print("multiple speaker fastspeech2!") + collate_fn = fastspeech2_multi_spk_batch_fn + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + num_speakers = len(spk_id) + fields += ["spk_id"] + else: + print("single speaker fastspeech2!") + collate_fn = fastspeech2_single_spk_batch_fn + num_speakers = None + print("num_speakers:", num_speakers) # dataloader has been too verbose logging.getLogger("DataLoader").disabled = True @@ -67,27 +82,21 @@ def train_sp(args, config): train_metadata = list(reader) train_dataset = DataTable( data=train_metadata, - fields=[ - "text", "text_lengths", "speech", "speech_lengths", "durations", - "pitch", "energy", "spk_id" - ], + fields=fields, converters={"speech": np.load, "pitch": np.load, "energy": np.load}, ) with jsonlines.open(args.dev_metadata, 'r') as reader: dev_metadata = list(reader) - dev_dataset = DataTable( data=dev_metadata, - 
fields=[ - "text", "text_lengths", "speech", "speech_lengths", "durations", - "pitch", "energy", "spk_id" - ], + fields=fields, converters={"speech": np.load, "pitch": np.load, "energy": np.load}, ) # collate function and dataloader + train_sampler = DistributedBatchSampler( train_dataset, batch_size=config.batch_size, @@ -99,7 +108,7 @@ def train_sp(args, config): train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, - collate_fn=fastspeech2_multi_spk_batch_fn, + collate_fn=collate_fn, num_workers=config.num_workers) dev_dataloader = DataLoader( @@ -107,7 +116,7 @@ def train_sp(args, config): shuffle=False, drop_last=False, batch_size=config.batch_size, - collate_fn=fastspeech2_multi_spk_batch_fn, + collate_fn=collate_fn, num_workers=config.num_workers) print("dataloaders done!") @@ -116,11 +125,6 @@ def train_sp(args, config): vocab_size = len(phn_id) print("vocab_size:", vocab_size) - with open(args.speaker_dict, 'rt') as f: - spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) - print("num_speakers:", num_speakers) - odim = config.n_mels model = FastSpeech2( idim=vocab_size, @@ -165,8 +169,7 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser( - description="Train a FastSpeech2 model with multiple speaker dataset.") + parser = argparse.ArgumentParser(description="Train a FastSpeech2 model.") parser.add_argument("--config", type=str, help="fastspeech2 config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") @@ -177,19 +180,17 @@ def main(): "--nprocs", type=int, default=1, help="number of processes.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") parser.add_argument( - "--phones-dict", - type=str, - default="phone_id_map.txt ", - help="phone vocabulary file.") + "--phones-dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( "--speaker-dict", type=str, - default="speaker_id_map.txt ", - help="speaker id map file.") + default=None, + help="speaker id map file for multiple speaker model.") args = parser.parse_args() if args.device == "cpu" and args.nprocs > 1: raise RuntimeError("Multiprocess training on CPU is not supported.") + with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) diff --git a/examples/speedyspeech/baker/README.md b/examples/speedyspeech/baker/README.md index 456fecd8..5dcf43b7 100644 --- a/examples/speedyspeech/baker/README.md +++ b/examples/speedyspeech/baker/README.md @@ -38,18 +38,18 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, tones, durations, path of spectrogram, and id of each utterance. ## Train the model -`./run.sh` calls `Parakeet/utils/ss_train.py`. +`./run.sh` calls `../train.py`. ```bash ./run.sh ``` Here's the complete help message. 
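The `examples/fastspeech2/train.py` and `synthesize.py` rewrites above key everything off `--speaker-dict`: when it is given, the metadata fields gain `spk_id`, the multi-speaker collate function is chosen, and `num_speakers` is derived from the map file. A minimal sketch of that switch (editorial illustration only, assuming the two-column `speaker_id_map.txt` format the patch reads):
```python
# Sketch of the single-/multi-speaker switch used by the unified scripts.
# speaker_dict is the path passed via --speaker-dict, or None.
def speaker_setup(speaker_dict=None):
    fields = ["utt_id", "text"]
    if speaker_dict is not None:
        # one "<speaker> <id>" pair per line
        with open(speaker_dict, 'rt') as f:
            spk_id = [line.strip().split() for line in f.readlines()]
        num_speakers = len(spk_id)  # size of the speaker embedding table
        fields += ["spk_id"]        # metadata rows must carry a speaker id
    else:
        num_speakers = None         # model is built without a speaker embedding
    return fields, num_speakers
```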
```text -usage: ss_train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] - [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--use-relative-path USE_RELATIVE_PATH] - [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] + [--use-relative-path USE_RELATIVE_PATH] + [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] Train a Speedyspeech model with a single speaker dataset. @@ -107,12 +107,12 @@ pwg_baker_ckpt_0.4 ├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` -`synthesize.sh` calls `Parakeet/utils/ss_pwg_syn.py`, which can synthesize waveform from `metadata.jsonl`. +`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash ./synthesize.sh ``` ```text -usage: ss_pwg_syn.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG] +usage: synthesize.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG] [--speedyspeech-checkpoint SPEEDYSPEECH_CHECKPOINT] [--speedyspeech-stat SPEEDYSPEECH_STAT] [--pwg-config PWG_CONFIG] diff --git a/examples/speedyspeech/baker/conf/default.yaml b/examples/speedyspeech/baker/conf/default.yaml index 43a8dca0..8be96aaa 100644 --- a/examples/speedyspeech/baker/conf/default.yaml +++ b/examples/speedyspeech/baker/conf/default.yaml @@ -10,10 +10,6 @@ window: "hann" # Window function. n_mels: 80 # Number of mel basis. fmin: 80 # Minimum freq in mel basis calculation. fmax: 7600 # Maximum frequency in mel basis calculation. -trim_silence: false # Whether to trim the start and end of silence. -top_db: 60 # Need to tune carefully if the recording is not good. -trim_frame_length: 2048 # Frame size in trimming.(in samples) -trim_hop_length: 512 # Hop size in trimming.(in samples) ########################################################### # DATA SETTING # diff --git a/examples/speedyspeech/baker/inference.py b/examples/speedyspeech/baker/inference.py index 3bf4019a..a1d18540 100644 --- a/examples/speedyspeech/baker/inference.py +++ b/examples/speedyspeech/baker/inference.py @@ -18,7 +18,7 @@ import soundfile as sf from paddle import inference -from parakeet.frontend.cn_frontend import Frontend +from parakeet.frontend.zh_frontend import Frontend def main(): diff --git a/examples/speedyspeech/baker/preprocess.sh b/examples/speedyspeech/baker/preprocess.sh index 47cda8cc..422caa31 100755 --- a/examples/speedyspeech/baker/preprocess.sh +++ b/examples/speedyspeech/baker/preprocess.sh @@ -3,9 +3,6 @@ stage=0 stop_stage=100 -fs=24000 -n_shift=300 - export MAIN_ROOT=`realpath ${PWD}/../../../` if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then @@ -14,13 +11,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./baker_alignment_tone \ --output=durations.txt \ - --sample-rate=${fs} \ - --n-shift=${n_shift} + --config=conf/default.yaml fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "Extract features ..."
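    # (Editorial note, not part of the patch) Stage 0 above now reads fs and
    # n_shift from conf/default.yaml via --config, which is why the fs=24000 /
    # n_shift=300 shell variables were dropped; the YAML values still have to
    # match the alignments' sample rate, or the frame counts written to
    # durations.txt will be off.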
- python3 ${MAIN_ROOT}/utils/ss_preprocess.py \ + python3 ../preprocess.py \ --dataset=baker \ --rootdir=~/datasets/BZNSYP/ \ --dumpdir=dump \ @@ -42,7 +38,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize and convert phone/tone to id, dev and test should use train's stats echo "Normalize ..." - python3 ${MAIN_ROOT}/utils/ss_normalize.py \ + python3 ../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --stats=dump/train/feats_stats.npy \ @@ -50,7 +46,7 @@ --tones-dict=dump/tone_id_map.txt \ --use-relative-path=True - python3 ${MAIN_ROOT}/utils/ss_normalize.py \ + python3 ../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --stats=dump/train/feats_stats.npy \ @@ -58,7 +54,7 @@ --tones-dict=dump/tone_id_map.txt \ --use-relative-path=True - python3 ${MAIN_ROOT}/utils/ss_normalize.py \ + python3 ../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --stats=dump/train/feats_stats.npy \ diff --git a/examples/speedyspeech/baker/run.sh b/examples/speedyspeech/baker/run.sh index 69b4962e..64936ef3 100755 --- a/examples/speedyspeech/baker/run.sh +++ b/examples/speedyspeech/baker/run.sh @@ -1,9 +1,7 @@ #!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -python ${MAIN_ROOT}/utils/ss_train.py \ +python ../train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=conf/default.yaml \ diff --git a/examples/speedyspeech/baker/synthesize.sh b/examples/speedyspeech/baker/synthesize.sh index da49eabe..1528e615 100755 --- a/examples/speedyspeech/baker/synthesize.sh +++ b/examples/speedyspeech/baker/synthesize.sh @@ -1,8 +1,6 @@ #!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -python3 ${MAIN_ROOT}/utils/ss_pwg_syn.py \ +python3 ../synthesize.py \ --speedyspeech-config=conf/default.yaml \ --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \ --speedyspeech-stat=dump/train/feats_stats.npy \ diff --git a/examples/speedyspeech/baker/synthesize_e2e.py b/examples/speedyspeech/baker/synthesize_e2e.py index 0f5bf54c..80330f29 100644 --- a/examples/speedyspeech/baker/synthesize_e2e.py +++ b/examples/speedyspeech/baker/synthesize_e2e.py @@ -23,11 +23,11 @@ import yaml from paddle import jit from paddle.static import InputSpec -from parakeet.frontend.cn_frontend import Frontend -from parakeet.models.speedyspeech import SpeedySpeech -from parakeet.models.speedyspeech import SpeedySpeechInference -from parakeet.models.parallel_wavegan import PWGGenerator -from parakeet.models.parallel_wavegan import PWGInference +from parakeet.frontend.zh_frontend import Frontend +from parakeet.models.speedyspeech.speedyspeech import SpeedySpeech +from parakeet.models.speedyspeech.speedyspeech import SpeedySpeechInference +from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator +from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference from parakeet.modules.normalizer import ZScore from yacs.config import CfgNode diff --git a/utils/ss_normalize.py b/examples/speedyspeech/normalize.py similarity index 100% rename from utils/ss_normalize.py rename to examples/speedyspeech/normalize.py diff --git a/utils/ss_preprocess.py b/examples/speedyspeech/preprocess.py similarity index 95% rename from utils/ss_preprocess.py rename to examples/speedyspeech/preprocess.py index
37d88087..c0890489 100644 --- a/utils/ss_preprocess.py +++ b/examples/speedyspeech/preprocess.py @@ -26,14 +26,13 @@ import yaml from concurrent.futures import ThreadPoolExecutor from parakeet.data.get_feats import LogMelFBank +from parakeet.datasets.preprocess_utils import compare_duration_and_mel_length +from parakeet.datasets.preprocess_utils import merge_silence +from parakeet.datasets.preprocess_utils import get_phones_tones +from parakeet.datasets.preprocess_utils import get_phn_dur from pathlib import Path from yacs.config import CfgNode -from preprocess_utils import compare_duration_and_mel_length -from preprocess_utils import merge_silence -from preprocess_utils import get_phones_tones -from preprocess_utils import get_phn_dur - def process_sentence(config: Dict[str, Any], fp: Path, @@ -73,13 +72,6 @@ def process_sentence(config: Dict[str, Any], sentences[utt_id][1] = durations start, end = librosa.time_to_samples([start, end], sr=config.fs) wav = wav[start:end] - # energy based silence trimming - if config.trim_silence: - wav, _ = librosa.effects.trim( - wav, - top_db=config.top_db, - frame_length=config.trim_frame_length, - hop_length=config.trim_hop_length) # extract mel feats logmel = mel_extractor.get_log_mel_fbank(wav) diff --git a/utils/ss_pwg_syn.py b/examples/speedyspeech/synthesize.py similarity index 95% rename from utils/ss_pwg_syn.py rename to examples/speedyspeech/synthesize.py index 550f9405..82d5e6fa 100644 --- a/utils/ss_pwg_syn.py +++ b/examples/speedyspeech/synthesize.py @@ -26,10 +26,10 @@ from yacs.config import CfgNode from parakeet.datasets.data_table import DataTable -from parakeet.models.speedyspeech import SpeedySpeech -from parakeet.models.speedyspeech import SpeedySpeechInference -from parakeet.models.parallel_wavegan import PWGGenerator -from parakeet.models.parallel_wavegan import PWGInference +from parakeet.models.speedyspeech.speedyspeech import SpeedySpeech +from parakeet.models.speedyspeech.speedyspeech import SpeedySpeechInference +from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator +from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference from parakeet.modules.normalizer import ZScore diff --git a/utils/ss_train.py b/examples/speedyspeech/train.py similarity index 96% rename from utils/ss_train.py rename to examples/speedyspeech/train.py index 4dfd7307..f9436e57 100644 --- a/utils/ss_train.py +++ b/examples/speedyspeech/train.py @@ -27,19 +27,18 @@ from paddle.io import DistributedBatchSampler from parakeet.datasets.data_table import DataTable from parakeet.datasets.am_batch_fn import speedyspeech_batch_fn -from parakeet.models.speedyspeech import SpeedySpeech +from parakeet.models.speedyspeech.speedyspeech import SpeedySpeech +from parakeet.models.speedyspeech.speedyspeech_updater import SpeedySpeechEvaluator +from parakeet.models.speedyspeech.speedyspeech_updater import SpeedySpeechUpdater from parakeet.training.extensions.snapshot import Snapshot from parakeet.training.extensions.visualizer import VisualDL +from parakeet.training.optimizer import build_optimizers from parakeet.training.seeding import seed_everything -from parakeet.training.speedyspeech_updater import SpeedySpeechEvaluator -from parakeet.training.speedyspeech_updater import SpeedySpeechUpdater from parakeet.training.trainer import Trainer from pathlib import Path from visualdl import LogWriter from yacs.config import CfgNode -from train_utils import build_optimizers - def train_sp(args, config): # decides device type and whether to run 
in parallel diff --git a/examples/text_frontend/test_g2p.py b/examples/text_frontend/test_g2p.py index 45d6c44e..0515e994 100644 --- a/examples/text_frontend/test_g2p.py +++ b/examples/text_frontend/test_g2p.py @@ -16,7 +16,7 @@ import re from pathlib import Path -from parakeet.frontend.cn_frontend import Frontend as cnFrontend +from parakeet.frontend.zh_frontend import Frontend as zhFrontend from parakeet.utils.error_rate import word_errors SILENCE_TOKENS = {"sp", "sil", "sp1", "spl"} @@ -90,7 +90,7 @@ def main(): line_list = line.split(" ") utt_id, phones = line_list[0], " ".join(line_list[1:]) ref_dict[utt_id] = phones - frontend = cnFrontend() + frontend = zhFrontend() avg_wer = get_avg_wer(raw_dict, ref_dict, frontend, output_dir) print("The avg WER of g2p is:", avg_wer) diff --git a/examples/text_frontend/test_textnorm.py b/examples/text_frontend/test_textnorm.py index 0de3c5cf..99eed290 100644 --- a/examples/text_frontend/test_textnorm.py +++ b/examples/text_frontend/test_textnorm.py @@ -16,7 +16,7 @@ import re from pathlib import Path -from parakeet.frontend.cn_normalization.text_normlization import TextNormalizer +from parakeet.frontend.zh_normalization.text_normlization import TextNormalizer from parakeet.utils.error_rate import char_errors diff --git a/examples/transformer_tts/ljspeech/README.md b/examples/transformer_tts/ljspeech/README.md index 00b0016d..4951612c 100644 --- a/examples/transformer_tts/ljspeech/README.md +++ b/examples/transformer_tts/ljspeech/README.md @@ -36,10 +36,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, path of speech features, speaker and id of each utterance. ## Train the model +`./run.sh` calls `../train.py`. ```bash ./run.sh ``` -Or you can use `train.py` directly. Here's the complete help message. +Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] @@ -93,7 +94,7 @@ waveflow_ljspeech_ckpt_0.3 ├── config.yaml # default config used to train waveflow └── step-2000000.pdparams # model parameters of waveflow ``` -`synthesize.sh` calls `synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash ./synthesize.sh ``` diff --git a/examples/transformer_tts/ljspeech/preprocess.sh b/examples/transformer_tts/ljspeech/preprocess.sh index 2af20faf..7fc5247b 100755 --- a/examples/transformer_tts/ljspeech/preprocess.sh +++ b/examples/transformer_tts/ljspeech/preprocess.sh @@ -8,7 +8,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../` if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # extract features echo "Extract features ..." - python3 preprocess.py \ + python3 ../preprocess.py \ --dataset=ljspeech \ --rootdir=~/datasets/LJSpeech-1.1/ \ --dumpdir=dump \ @@ -27,21 +27,21 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize and convert phone to id, dev and test should use train's stats echo "Normalize ..."
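    # (Editorial sketch, not part of this patch) The train-set stats consumed
    # by the normalize steps below come from the elided stage 2, presumably
    # something along the lines of:
    #   python3 ${MAIN_ROOT}/utils/compute_statistics.py \
    #       --metadata=dump/train/raw/metadata.jsonl --field-name="speech"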
- python3 normalize.py \ + python3 ../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --speech-stats=dump/train/speech_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 normalize.py \ + python3 ../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --speech-stats=dump/train/speech_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 normalize.py \ + python3 ../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --speech-stats=dump/train/speech_stats.npy \ diff --git a/examples/transformer_tts/ljspeech/run.sh b/examples/transformer_tts/ljspeech/run.sh index 736401a0..f448bdfc 100755 --- a/examples/transformer_tts/ljspeech/run.sh +++ b/examples/transformer_tts/ljspeech/run.sh @@ -1,6 +1,6 @@ #!/bin/bash -python3 train.py \ +python3 ../train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=conf/default.yaml \ diff --git a/examples/transformer_tts/ljspeech/synthesize.sh b/examples/transformer_tts/ljspeech/synthesize.sh index d7dba1f3..7f4f5314 100755 --- a/examples/transformer_tts/ljspeech/synthesize.sh +++ b/examples/transformer_tts/ljspeech/synthesize.sh @@ -1,6 +1,6 @@ #!/bin/bash -python3 synthesize.py \ +python3 ../synthesize.py \ --transformer-tts-config=conf/default.yaml \ --transformer-tts-checkpoint=exp/default/checkpoints/snapshot_iter_201500.pdz \ --transformer-tts-stat=dump/train/speech_stats.npy \ diff --git a/examples/transformer_tts/ljspeech/synthesize_e2e.py b/examples/transformer_tts/ljspeech/synthesize_e2e.py index 534b6aa0..d3680b05 100644 --- a/examples/transformer_tts/ljspeech/synthesize_e2e.py +++ b/examples/transformer_tts/ljspeech/synthesize_e2e.py @@ -22,8 +22,8 @@ import yaml from yacs.config import CfgNode from parakeet.frontend import English -from parakeet.models.transformer_tts import TransformerTTS -from parakeet.models.transformer_tts import TransformerTTSInference +from parakeet.models.transformer_tts.transformer_tts import TransformerTTS +from parakeet.models.transformer_tts.transformer_tts import TransformerTTSInference from parakeet.models.waveflow import ConditionalWaveFlow from parakeet.modules.normalizer import ZScore from parakeet.utils import layer_tools diff --git a/examples/transformer_tts/ljspeech/normalize.py b/examples/transformer_tts/normalize.py similarity index 95% rename from examples/transformer_tts/ljspeech/normalize.py rename to examples/transformer_tts/normalize.py index cb4bea95..a666ca2f 100644 --- a/examples/transformer_tts/ljspeech/normalize.py +++ b/examples/transformer_tts/normalize.py @@ -48,15 +48,9 @@ def main(): required=True, help="speech statistics file.") parser.add_argument( - "--phones-dict", - type=str, - default="phone_id_map.txt ", - help="phone vocabulary file.") + "--phones-dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( - "--speaker-dict", - type=str, - default="speaker_id_map.txt ", - help="speaker id map file.") + "--speaker-dict", type=str, default=None, help="speaker id map file.") parser.add_argument( "--verbose", type=int, diff --git a/examples/transformer_tts/ljspeech/preprocess.py b/examples/transformer_tts/preprocess.py similarity index 100% rename from examples/transformer_tts/ljspeech/preprocess.py rename to examples/transformer_tts/preprocess.py diff --git a/examples/transformer_tts/ljspeech/synthesize.py 
b/examples/transformer_tts/synthesize.py similarity index 95% rename from examples/transformer_tts/ljspeech/synthesize.py rename to examples/transformer_tts/synthesize.py index 2c42c7cb..2af508bf 100644 --- a/examples/transformer_tts/ljspeech/synthesize.py +++ b/examples/transformer_tts/synthesize.py @@ -23,8 +23,8 @@ import yaml from yacs.config import CfgNode from parakeet.datasets.data_table import DataTable -from parakeet.models.transformer_tts import TransformerTTS -from parakeet.models.transformer_tts import TransformerTTSInference +from parakeet.models.transformer_tts.transformer_tts import TransformerTTS +from parakeet.models.transformer_tts.transformer_tts import TransformerTTSInference from parakeet.models.waveflow import ConditionalWaveFlow from parakeet.modules.normalizer import ZScore from parakeet.utils import layer_tools @@ -113,10 +113,7 @@ def main(): parser.add_argument( "--waveflow-checkpoint", type=str, help="waveflow checkpoint to load.") parser.add_argument( - "--phones-dict", - type=str, - default="phone_id_map.txt", - help="phone vocabulary file.") + "--phones-dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument("--test-metadata", type=str, help="test metadata.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/examples/transformer_tts/ljspeech/train.py b/examples/transformer_tts/train.py similarity index 81% rename from examples/transformer_tts/ljspeech/train.py rename to examples/transformer_tts/train.py index 77b1d3c2..2899a44e 100644 --- a/examples/transformer_tts/ljspeech/train.py +++ b/examples/transformer_tts/train.py @@ -24,44 +24,21 @@ import yaml from paddle import DataParallel from paddle import distributed as dist -from paddle import nn -from paddle.io import DataLoader, DistributedBatchSampler +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler from parakeet.datasets.data_table import DataTable from parakeet.datasets.am_batch_fn import transformer_single_spk_batch_fn -from parakeet.models.transformer_tts import TransformerTTS +from parakeet.models.transformer_tts.transformer_tts import TransformerTTS +from parakeet.models.transformer_tts.transformer_tts_updater import TransformerTTSUpdater +from parakeet.models.transformer_tts.transformer_tts_updater import TransformerTTSEvaluator from parakeet.training.extensions.snapshot import Snapshot from parakeet.training.extensions.visualizer import VisualDL +from parakeet.training.optimizer import build_optimizers from parakeet.training.seeding import seed_everything from parakeet.training.trainer import Trainer -from parakeet.training.transformer_tts_updater import TransformerTTSUpdater -from parakeet.training.transformer_tts_updater import TransformerTTSEvaluator from visualdl import LogWriter from yacs.config import CfgNode -optim_classes = dict( - adadelta=paddle.optimizer.Adadelta, - adagrad=paddle.optimizer.Adagrad, - adam=paddle.optimizer.Adam, - adamax=paddle.optimizer.Adamax, - adamw=paddle.optimizer.AdamW, - lamb=paddle.optimizer.Lamb, - momentum=paddle.optimizer.Momentum, - rmsprop=paddle.optimizer.RMSProp, - sgd=paddle.optimizer.SGD, ) - - -def build_optimizers(model: nn.Layer, optim='adadelta', - learning_rate=0.01) -> paddle.optimizer: - optim_class = optim_classes.get(optim) - if optim_class is None: - raise ValueError(f"must be one of {list(optim_classes)}: {optim}") - else: - optim = optim_class( - parameters=model.parameters(), learning_rate=learning_rate) - - optimizers = optim - return optimizers - def 
train_sp(args, config): # decides device type and whether to run in parallel @@ -179,13 +156,6 @@ def train_sp(args, config): trainer.run() -def get_cfg_default(): - config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve() - with open(config_path, 'rt') as f: - config = CfgNode(yaml.safe_load(f)) - return config - - def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a TransformerTTS " @@ -201,18 +171,14 @@ def main(): "--nprocs", type=int, default=1, help="number of processes.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") parser.add_argument( - "--phones-dict", - type=str, - default="phone_id_map.txt ", - help="phone vocabulary file.") + "--phones-dict", type=str, default=None, help="phone vocabulary file.") args = parser.parse_args() if args.device == "cpu" and args.nprocs > 1: raise RuntimeError("Multiprocess training on CPU is not supported.") - config = get_cfg_default() - if args.config: - config.merge_from_file(args.config) + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) print("========Args========") print(yaml.safe_dump(vars(args))) diff --git a/utils/preprocess_utils.py b/parakeet/datasets/preprocess_utils.py similarity index 100% rename from utils/preprocess_utils.py rename to parakeet/datasets/preprocess_utils.py diff --git a/parakeet/frontend/__init__.py b/parakeet/frontend/__init__.py index 51d36f99..b8779b65 100644 --- a/parakeet/frontend/__init__.py +++ b/parakeet/frontend/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .cn_normalization import * +from .zh_normalization import * from .generate_lexicon import * from .normalizer import * from .phonectic import * diff --git a/parakeet/frontend/cn_frontend.py b/parakeet/frontend/zh_frontend.py similarity index 99% rename from parakeet/frontend/cn_frontend.py rename to parakeet/frontend/zh_frontend.py index f600160d..8a0c1668 100644 --- a/parakeet/frontend/cn_frontend.py +++ b/parakeet/frontend/zh_frontend.py @@ -22,7 +22,7 @@ from pypinyin import lazy_pinyin from pypinyin import Style -from parakeet.frontend.cn_normalization.text_normlization import TextNormalizer +from parakeet.frontend.zh_normalization.text_normlization import TextNormalizer from parakeet.frontend.generate_lexicon import generate_lexicon from parakeet.frontend.tone_sandhi import ToneSandhi diff --git a/parakeet/frontend/cn_normalization/README.md b/parakeet/frontend/zh_normalization/README.md similarity index 100% rename from parakeet/frontend/cn_normalization/README.md rename to parakeet/frontend/zh_normalization/README.md diff --git a/parakeet/frontend/cn_normalization/__init__.py b/parakeet/frontend/zh_normalization/__init__.py similarity index 90% rename from parakeet/frontend/cn_normalization/__init__.py rename to parakeet/frontend/zh_normalization/__init__.py index ea322906..77e10ebb 100644 --- a/parakeet/frontend/cn_normalization/__init__.py +++ b/parakeet/frontend/zh_normalization/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from parakeet.frontend.cn_normalization.text_normlization import * +from parakeet.frontend.zh_normalization.text_normlization import * diff --git a/parakeet/frontend/cn_normalization/char_convert.py b/parakeet/frontend/zh_normalization/char_convert.py similarity index 100% rename from parakeet/frontend/cn_normalization/char_convert.py rename to parakeet/frontend/zh_normalization/char_convert.py diff --git a/parakeet/frontend/cn_normalization/chronology.py b/parakeet/frontend/zh_normalization/chronology.py similarity index 100% rename from parakeet/frontend/cn_normalization/chronology.py rename to parakeet/frontend/zh_normalization/chronology.py diff --git a/parakeet/frontend/cn_normalization/constants.py b/parakeet/frontend/zh_normalization/constants.py similarity index 100% rename from parakeet/frontend/cn_normalization/constants.py rename to parakeet/frontend/zh_normalization/constants.py diff --git a/parakeet/frontend/cn_normalization/num.py b/parakeet/frontend/zh_normalization/num.py similarity index 100% rename from parakeet/frontend/cn_normalization/num.py rename to parakeet/frontend/zh_normalization/num.py diff --git a/parakeet/frontend/cn_normalization/phonecode.py b/parakeet/frontend/zh_normalization/phonecode.py similarity index 100% rename from parakeet/frontend/cn_normalization/phonecode.py rename to parakeet/frontend/zh_normalization/phonecode.py diff --git a/parakeet/frontend/cn_normalization/quantifier.py b/parakeet/frontend/zh_normalization/quantifier.py similarity index 100% rename from parakeet/frontend/cn_normalization/quantifier.py rename to parakeet/frontend/zh_normalization/quantifier.py diff --git a/parakeet/frontend/cn_normalization/text_normlization.py b/parakeet/frontend/zh_normalization/text_normlization.py similarity index 100% rename from parakeet/frontend/cn_normalization/text_normlization.py rename to parakeet/frontend/zh_normalization/text_normlization.py diff --git a/parakeet/models/fastspeech2.py b/parakeet/models/fastspeech2/fastspeech2.py similarity index 100% rename from parakeet/models/fastspeech2.py rename to parakeet/models/fastspeech2/fastspeech2.py diff --git a/parakeet/training/fastspeech2_updater.py b/parakeet/models/fastspeech2/fastspeech2_updater.py similarity index 93% rename from parakeet/training/fastspeech2_updater.py rename to parakeet/models/fastspeech2/fastspeech2_updater.py index ad9e610c..b579fc27 100644 --- a/parakeet/training/fastspeech2_updater.py +++ b/parakeet/models/fastspeech2/fastspeech2_updater.py @@ -14,7 +14,7 @@ import logging from paddle import distributed as dist -from parakeet.models.fastspeech2 import FastSpeech2Loss +from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2Loss from parakeet.training.extensions.evaluator import StandardEvaluator from parakeet.training.reporter import report from parakeet.training.updaters.standard_updater import StandardUpdater @@ -51,6 +51,8 @@ def __init__(self, def update_core(self, batch): self.msg = "Rank: {}, ".format(dist.get_rank()) losses_dict = {} + # spk_id!=None in multiple spk fastspeech2 + spk_id = batch["spk_id"] if "spk_id" in batch else None before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( text=batch["text"], @@ -59,7 +61,8 @@ def update_core(self, batch): speech_lengths=batch["speech_lengths"], durations=batch["durations"], pitch=batch["pitch"], - energy=batch["energy"], ) + energy=batch["energy"], + spk_id=spk_id) l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( after_outs=after_outs, @@ -120,6 +123,8 @@ def 
__init__(self, def evaluate_core(self, batch): self.msg = "Evaluate: " losses_dict = {} + # spk_id!=None in multiple spk fastspeech2 + spk_id = batch["spk_id"] if "spk_id" in batch else None before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( text=batch["text"], @@ -128,7 +133,8 @@ def evaluate_core(self, batch): speech_lengths=batch["speech_lengths"], durations=batch["durations"], pitch=batch["pitch"], - energy=batch["energy"]) + energy=batch["energy"], + spk_id=spk_id) l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( after_outs=after_outs, diff --git a/parakeet/models/parallel_wavegan.py b/parakeet/models/parallel_wavegan/parallel_wavegan.py similarity index 100% rename from parakeet/models/parallel_wavegan.py rename to parakeet/models/parallel_wavegan/parallel_wavegan.py diff --git a/parakeet/training/pwg_updater.py b/parakeet/models/parallel_wavegan/parallel_wavegan_updater.py similarity index 100% rename from parakeet/training/pwg_updater.py rename to parakeet/models/parallel_wavegan/parallel_wavegan_updater.py diff --git a/parakeet/models/speedyspeech.py b/parakeet/models/speedyspeech/speedyspeech.py similarity index 100% rename from parakeet/models/speedyspeech.py rename to parakeet/models/speedyspeech/speedyspeech.py diff --git a/parakeet/training/speedyspeech_updater.py b/parakeet/models/speedyspeech/speedyspeech_updater.py similarity index 100% rename from parakeet/training/speedyspeech_updater.py rename to parakeet/models/speedyspeech/speedyspeech_updater.py diff --git a/parakeet/models/transformer_tts.py b/parakeet/models/transformer_tts/transformer_tts.py similarity index 100% rename from parakeet/models/transformer_tts.py rename to parakeet/models/transformer_tts/transformer_tts.py diff --git a/parakeet/training/transformer_tts_updater.py b/parakeet/models/transformer_tts/transformer_tts_updater.py similarity index 98% rename from parakeet/training/transformer_tts_updater.py rename to parakeet/models/transformer_tts/transformer_tts_updater.py index 7e75a860..a9ae48ad 100644 --- a/parakeet/training/transformer_tts_updater.py +++ b/parakeet/models/transformer_tts/transformer_tts_updater.py @@ -16,8 +16,8 @@ import paddle from paddle import distributed as dist -from parakeet.models.transformer_tts import GuidedMultiHeadAttentionLoss -from parakeet.models.transformer_tts import TransformerTTSLoss +from parakeet.models.transformer_tts.transformer_tts import GuidedMultiHeadAttentionLoss +from parakeet.models.transformer_tts.transformer_tts import TransformerTTSLoss from parakeet.training.extensions.evaluator import StandardEvaluator from parakeet.training.reporter import report from parakeet.training.updaters.standard_updater import StandardUpdater diff --git a/parakeet/training/multispk_fastspeech2_updater.py b/parakeet/training/multispk_fastspeech2_updater.py deleted file mode 100644 index c7fa04ca..00000000 --- a/parakeet/training/multispk_fastspeech2_updater.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import logging - -from paddle import distributed as dist -from parakeet.models.fastspeech2 import FastSpeech2Loss -from parakeet.training.extensions.evaluator import StandardEvaluator -from parakeet.training.reporter import report -from parakeet.training.updaters.standard_updater import StandardUpdater -logging.basicConfig( - format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', - datefmt='[%Y-%m-%d %H:%M:%S]') -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -class FastSpeech2Updater(StandardUpdater): - def __init__(self, - model, - optimizer, - dataloader, - init_state=None, - use_masking=False, - use_weighted_masking=False, - output_dir=None): - super().__init__(model, optimizer, dataloader, init_state=None) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - - self.criterion = FastSpeech2Loss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking) - - log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) - self.filehandler = logging.FileHandler(str(log_file)) - logger.addHandler(self.filehandler) - self.logger = logger - self.msg = "" - - def update_core(self, batch): - self.msg = "Rank: {}, ".format(dist.get_rank()) - losses_dict = {} - - before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( - text=batch["text"], - text_lengths=batch["text_lengths"], - speech=batch["speech"], - speech_lengths=batch["speech_lengths"], - durations=batch["durations"], - pitch=batch["pitch"], - energy=batch["energy"], - spk_id=batch["spk_id"], ) - - l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( - after_outs=after_outs, - before_outs=before_outs, - d_outs=d_outs, - p_outs=p_outs, - e_outs=e_outs, - ys=ys, - ds=batch["durations"], - ps=batch["pitch"], - es=batch["energy"], - ilens=batch["text_lengths"], - olens=olens) - - loss = l1_loss + duration_loss + pitch_loss + energy_loss - - optimizer = self.optimizer - optimizer.clear_grad() - loss.backward() - optimizer.step() - - report("train/loss", float(loss)) - report("train/l1_loss", float(l1_loss)) - report("train/duration_loss", float(duration_loss)) - report("train/pitch_loss", float(pitch_loss)) - report("train/energy_loss", float(energy_loss)) - - losses_dict["l1_loss"] = float(l1_loss) - losses_dict["duration_loss"] = float(duration_loss) - losses_dict["pitch_loss"] = float(pitch_loss) - losses_dict["energy_loss"] = float(energy_loss) - losses_dict["loss"] = float(loss) - self.msg += ', '.join('{}: {:>.6f}'.format(k, v) - for k, v in losses_dict.items()) - - -class FastSpeech2Evaluator(StandardEvaluator): - def __init__(self, - model, - dataloader, - use_masking=False, - use_weighted_masking=False, - output_dir=None): - super().__init__(model, dataloader) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - - self.criterion = FastSpeech2Loss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking) - - log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) - self.filehandler = logging.FileHandler(str(log_file)) - logger.addHandler(self.filehandler) - self.logger = logger - self.msg = "" - - def evaluate_core(self, batch): - self.msg = "Evaluate: " - losses_dict = {} - - before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( - text=batch["text"], - text_lengths=batch["text_lengths"], - speech=batch["speech"], - 
speech_lengths=batch["speech_lengths"], - durations=batch["durations"], - pitch=batch["pitch"], - energy=batch["energy"], - spk_id=batch["spk_id"], ) - - l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( - after_outs=after_outs, - before_outs=before_outs, - d_outs=d_outs, - p_outs=p_outs, - e_outs=e_outs, - ys=ys, - ds=batch["durations"], - ps=batch["pitch"], - es=batch["energy"], - ilens=batch["text_lengths"], - olens=olens, ) - loss = l1_loss + duration_loss + pitch_loss + energy_loss - - report("eval/loss", float(loss)) - report("eval/l1_loss", float(l1_loss)) - report("eval/duration_loss", float(duration_loss)) - report("eval/pitch_loss", float(pitch_loss)) - report("eval/energy_loss", float(energy_loss)) - - losses_dict["l1_loss"] = float(l1_loss) - losses_dict["duration_loss"] = float(duration_loss) - losses_dict["pitch_loss"] = float(pitch_loss) - losses_dict["energy_loss"] = float(energy_loss) - losses_dict["loss"] = float(loss) - self.msg += ', '.join('{}: {:>.6f}'.format(k, v) - for k, v in losses_dict.items()) - self.logger.info(self.msg) diff --git a/utils/train_utils.py b/parakeet/training/optimizer.py similarity index 100% rename from utils/train_utils.py rename to parakeet/training/optimizer.py diff --git a/benchmark/PWGAN/README.md b/tests/benchmark/PWGAN/README.md similarity index 85% rename from benchmark/PWGAN/README.md rename to tests/benchmark/PWGAN/README.md index dedc0bf1..3d2267ae 100644 --- a/benchmark/PWGAN/README.md +++ b/tests/benchmark/PWGAN/README.md @@ -4,7 +4,7 @@ ``` to run it. Execution logic: -1. cd to ../../ (i.e. the Parakeet directory) +1. cd to ../../../ (i.e. the Parakeet directory) 2. install the dependencies parakeet needs 3. download the dataset from bos and unpack it 4. preprocess the dataset into the format needed for training pwg, saved under the Parakeet/dump folder diff --git a/benchmark/PWGAN/run_all.sh b/tests/benchmark/PWGAN/run_all.sh similarity index 69% rename from benchmark/PWGAN/run_all.sh rename to tests/benchmark/PWGAN/run_all.sh index 6ac7e2cf..e26db317 100755 --- a/benchmark/PWGAN/run_all.sh +++ b/tests/benchmark/PWGAN/run_all.sh @@ -5,7 +5,7 @@ stop_stage=100 # Provides a script whose performance numbers reproduce stably; by default it runs under py37 in the standard docker image: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37 # Working directory: must be stated -cd ../../ +cd ../../../ # 1 Install the dependencies this model needs (state it here if optimization strategies are enabled) if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then sudo apt-get install libsndfile1 @@ -22,11 +22,11 @@ fi # Data preprocessing if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - python utils/vocoder_preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=examples/parallelwave_gan/baker/conf/default.yaml + python examples/GANVocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=examples/GANVocoder/parallelwave_gan/baker/conf/default.yaml python utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" - python utils/vocoder_normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/feats_stats.npy - python utils/vocoder_normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/feats_stats.npy - python utils/vocoder_normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy + python
examples/GANVocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy fi # 3 Run everything in batch (if batching is inconvenient, steps 1 and 2 need to go into each individual model) if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then @@ -40,7 +40,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then for bs_item in ${bs_item_list[@]}; do echo "index is speed, 1gpus, begin, ${model_name}" run_mode=sp - CUDA_VISIBLE_DEVICES=0 bash benchmark/PWGAN/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 100 ${model_mode} # (5min) + CUDA_VISIBLE_DEVICES=0 bash tests/benchmark/PWGAN/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 100 ${model_mode} # (5min) sleep 60 echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}" run_mode=mp diff --git a/benchmark/PWGAN/run_benchmark.sh b/tests/benchmark/PWGAN/run_benchmark.sh similarity index 86% rename from benchmark/PWGAN/run_benchmark.sh rename to tests/benchmark/PWGAN/run_benchmark.sh index f9ca7827..bcdccccf 100755 --- a/benchmark/PWGAN/run_benchmark.sh +++ b/tests/benchmark/PWGAN/run_benchmark.sh @@ -24,13 +24,13 @@ function _train(){ --max-iter=${max_iter} --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=examples/parallelwave_gan/baker/conf/default.yaml \ + --config=examples/GANVocoder/parallelwave_gan/baker/conf/default.yaml \ --output-dir=exp/default \ --run-benchmark=true" case ${run_mode} in - sp) train_cmd="python3 utils/pwg_train.py --nprocs=1 ${train_cmd}" ;; - mp) train_cmd="python3 utils/pwg_train.py --nprocs=8 ${train_cmd}" + sp) train_cmd="python3 examples/GANVocoder/parallelwave_gan/train.py --nprocs=1 ${train_cmd}" ;; + mp) train_cmd="python3 examples/GANVocoder/parallelwave_gan/train.py --nprocs=8 ${train_cmd}" log_parse_file="mylog/workerlog.0" ;; *) echo "choose run_mode(sp or mp)"; exit 1; esac diff --git a/tests/README.md b/tests/chain/README.md similarity index 100% rename from tests/README.md rename to tests/chain/README.md diff --git a/tests/infer.sh b/tests/chain/infer.sh similarity index 100% rename from tests/infer.sh rename to tests/chain/infer.sh diff --git a/tests/lite_train_infer.sh b/tests/chain/lite_train_infer.sh similarity index 100% rename from tests/lite_train_infer.sh rename to tests/chain/lite_train_infer.sh diff --git a/tests/prepare.sh b/tests/chain/prepare.sh similarity index 100% rename from tests/prepare.sh rename to tests/chain/prepare.sh diff --git a/tests/chain/speedyspeech_params_lite_multi_gpu.txt b/tests/chain/speedyspeech_params_lite_multi_gpu.txt new file mode 100644 index 00000000..98026241 --- /dev/null +++ b/tests/chain/speedyspeech_params_lite_multi_gpu.txt @@ -0,0 +1,50 @@ +===========================train_params=========================== +model_name:speedyspeech +python:python3.7 +gpu_list:0,1 +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +## +trainer:norm_train +norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --batch_size=32 --max_epoch=20 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True +null:null +null:null +null:null +null:null +null:null +## +===========================eval_params===========================
+eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt +null:null +## +===========================infer_params=========================== +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +## +null:null +null:null +null:null +inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null diff --git a/tests/chain/speedyspeech_params_lite_single_gpu.txt b/tests/chain/speedyspeech_params_lite_single_gpu.txt new file mode 100644 index 00000000..e821183a --- /dev/null +++ b/tests/chain/speedyspeech_params_lite_single_gpu.txt @@ -0,0 +1,51 @@ +===========================train_params=========================== +model_name:speedyspeech +python:python3.7 +gpu_list:0 +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +## +trainer:norm_train +norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --batch_size=32 --max_epoch=10 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True +null:null +null:null +null:null +null:null +null:null +## +===========================eval_params=========================== +eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt +null:null +## +===========================infer_params=========================== +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +## +null:null +null:null +null:null +inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log 
--phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt +--use_gpu:True +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null diff --git a/tests/chain/speedyspeech_params_whole_multi_gpu.txt b/tests/chain/speedyspeech_params_whole_multi_gpu.txt new file mode 100644 index 00000000..7c517119 --- /dev/null +++ b/tests/chain/speedyspeech_params_whole_multi_gpu.txt @@ -0,0 +1,50 @@ +===========================train_params=========================== +model_name:speedyspeech +python:python3.7 +gpu_list:0,1 +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +## +trainer:norm_train +norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True +null:null +null:null +null:null +null:null +null:null +## +===========================eval_params=========================== +eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt +null:null +## +===========================infer_params=========================== +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +## +null:null +null:null +null:null +inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null diff --git a/tests/chain/speedyspeech_params_whole_single_gpu.txt b/tests/chain/speedyspeech_params_whole_single_gpu.txt new file mode 100644 index 00000000..9a6c611e --- /dev/null +++ b/tests/chain/speedyspeech_params_whole_single_gpu.txt @@ -0,0 +1,50 @@ +===========================train_params=========================== +model_name:speedyspeech +python:python3.7 +gpu_list:0 +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +## +trainer:norm_train +norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl 
--config=../../examples/speedyspeech/baker/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True +null:null +null:null +null:null +null:null +null:null +## +===========================eval_params=========================== +eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt +null:null +## +===========================infer_params=========================== +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +## +null:null +null:null +null:null +inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null diff --git a/tests/test.sh b/tests/chain/test.sh similarity index 100% rename from tests/test.sh rename to tests/chain/test.sh diff --git a/tests/whole_train_infer.sh b/tests/chain/whole_train_infer.sh similarity index 100% rename from tests/whole_train_infer.sh rename to tests/chain/whole_train_infer.sh diff --git a/tests/speedyspeech_params_lite_multi_gpu.txt b/tests/speedyspeech_params_lite_multi_gpu.txt deleted file mode 100644 index 4a1c4091..00000000 --- a/tests/speedyspeech_params_lite_multi_gpu.txt +++ /dev/null @@ -1,50 +0,0 @@ -===========================train_params=========================== -model_name:speedyspeech -python:python3.7 -gpu_list:0,1 -null:null -null:null -null:null -null:null -null:null -null:null -null:null -null:null -null:null -## -trainer:norm_train -norm_train:../utils/ss_train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../examples/speedyspeech/baker/conf/default.yaml --batch_size=32 --max_epoch=20 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True -null:null -null:null -null:null -null:null -null:null -## -===========================eval_params=========================== -eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml 
--pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt -null:null -## -===========================infer_params=========================== -null:null -null:null -null:null -null:null -null:null -null:null -null:null -null:null -## -null:null -null:null -null:null -inference:../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt -null:null -null:null -null:null -null:null -null:null -null:null -null:null -null:null -null:null -null:null diff --git a/tests/speedyspeech_params_lite_single_gpu.txt b/tests/speedyspeech_params_lite_single_gpu.txt deleted file mode 100644 index 6c5428f9..00000000 --- a/tests/speedyspeech_params_lite_single_gpu.txt +++ /dev/null @@ -1,51 +0,0 @@ -===========================train_params=========================== -model_name:speedyspeech -python:python3.7 -gpu_list:0 -null:null -null:null -null:null -null:null -null:null -null:null -null:null -null:null -null:null -## -trainer:norm_train -norm_train:../utils/ss_train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../examples/speedyspeech/baker/conf/default.yaml --batch_size=32 --max_epoch=10 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True -null:null -null:null -null:null -null:null -null:null -## -===========================eval_params=========================== -eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt -null:null -## -===========================infer_params=========================== -null:null -null:null -null:null -null:null -null:null -null:null -null:null -null:null -## -null:null -null:null -null:null -inference:../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt ---use_gpu:True -null:null -null:null -null:null -null:null -null:null -null:null -null:null -null:null -null:null -null:null diff --git a/tests/speedyspeech_params_whole_multi_gpu.txt b/tests/speedyspeech_params_whole_multi_gpu.txt 
diff --git a/tests/speedyspeech_params_whole_multi_gpu.txt b/tests/speedyspeech_params_whole_multi_gpu.txt
deleted file mode 100644
index a53b7ff2..00000000
--- a/tests/speedyspeech_params_whole_multi_gpu.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-===========================train_params===========================
-model_name:speedyspeech
-python:python3.7
-gpu_list:0,1
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-##
-trainer:norm_train
-norm_train:../utils/ss_train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../examples/speedyspeech/baker/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
-null:null
-null:null
-null:null
-null:null
-null:null
-##
-===========================eval_params===========================
-eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
-null:null
-##
-===========================infer_params===========================
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-##
-null:null
-null:null
-null:null
-inference:../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
diff --git a/tests/speedyspeech_params_whole_single_gpu.txt b/tests/speedyspeech_params_whole_single_gpu.txt
deleted file mode 100644
index d3248ca4..00000000
--- a/tests/speedyspeech_params_whole_single_gpu.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-===========================train_params===========================
-model_name:speedyspeech
-python:python3.7
-gpu_list:0
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-##
-trainer:norm_train
-norm_train:../utils/ss_train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../examples/speedyspeech/baker/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
-null:null
-null:null
-null:null
-null:null
-null:null
-##
-===========================eval_params===========================
-eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
-null:null
-##
-===========================infer_params===========================
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-##
-null:null
-null:null
-null:null
-inference:../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
-null:null
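
The deleted utility below (superseded by examples/fastspeech2/synthesize.py) chains two models: FastSpeech2 predicts a mel spectrogram and Parallel WaveGAN renders it to a waveform, with each model wrapped in a ZScore normalizer built from the mean/std saved at training time. As a reference for what that wrapper contributes, here is a minimal stand-in (illustrative only, not the real parakeet.modules.normalizer.ZScore implementation):

    import paddle

    class ZScoreSketch:
        def __init__(self, mu, std):
            self.mu = paddle.to_tensor(mu)
            self.std = paddle.to_tensor(std)

        def __call__(self, x):
            # normalize: map features to zero mean, unit variance
            return (x - self.mu) / self.std

        def inverse(self, x):
            # denormalize: map model outputs back to the feature scale
            return x * self.std + self.mu

Both models train on z-scored mels, so synthesis has to invert that normalization before handing the mel to the vocoder and before writing audio.
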
diff --git a/utils/fs2_pwg_syn.py b/utils/fs2_pwg_syn.py
deleted file mode 100644
index e717677f..00000000
--- a/utils/fs2_pwg_syn.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-from pathlib import Path
-
-import jsonlines
-import numpy as np
-import paddle
-import soundfile as sf
-import yaml
-from yacs.config import CfgNode
-from parakeet.datasets.data_table import DataTable
-from parakeet.models.fastspeech2 import FastSpeech2
-from parakeet.models.fastspeech2 import FastSpeech2Inference
-from parakeet.models.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan import PWGInference
-from parakeet.modules.normalizer import ZScore
-
-
-def evaluate(args, fastspeech2_config, pwg_config):
-    # dataloader has been too verbose
-    logging.getLogger("DataLoader").disabled = True
-
-    # construct dataset for evaluation
-    with jsonlines.open(args.test_metadata, 'r') as reader:
-        test_metadata = list(reader)
-    test_dataset = DataTable(data=test_metadata, fields=["utt_id", "text"])
-
-    with open(args.phones_dict, "r") as f:
-        phn_id = [line.strip().split() for line in f.readlines()]
-    vocab_size = len(phn_id)
-    print("vocab_size:", vocab_size)
-    odim = fastspeech2_config.n_mels
-    model = FastSpeech2(
-        idim=vocab_size, odim=odim, **fastspeech2_config["model"])
-
-    model.set_state_dict(
-        paddle.load(args.fastspeech2_checkpoint)["main_params"])
-    model.eval()
-
-    vocoder = PWGGenerator(**pwg_config["generator_params"])
-    vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
-    vocoder.remove_weight_norm()
-    vocoder.eval()
-    print("model done!")
-
-    stat = np.load(args.fastspeech2_stat)
-    mu, std = stat
-    mu = paddle.to_tensor(mu)
-    std = paddle.to_tensor(std)
-    fastspeech2_normalizer = ZScore(mu, std)
-
-    stat = np.load(args.pwg_stat)
-    mu, std = stat
-    mu = paddle.to_tensor(mu)
-    std = paddle.to_tensor(std)
-    pwg_normalizer = ZScore(mu, std)
-
-    fastspeech2_inferencce = FastSpeech2Inference(fastspeech2_normalizer, model)
-    pwg_inference = PWGInference(pwg_normalizer, vocoder)
-
-    output_dir = Path(args.output_dir)
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    for datum in test_dataset:
-        utt_id = datum["utt_id"]
-        text = paddle.to_tensor(datum["text"])
-
-        with paddle.no_grad():
-            wav = pwg_inference(fastspeech2_inferencce(text))
-        sf.write(
-            str(output_dir / (utt_id + ".wav")),
-            wav.numpy(),
-            samplerate=fastspeech2_config.fs)
-        print(f"{utt_id} done!")
-
-
-def main():
-    # parse args and config and redirect to train_sp
-    parser = argparse.ArgumentParser(
-        description="Synthesize with fastspeech2 & parallel wavegan.")
-    parser.add_argument(
-        "--fastspeech2-config", type=str, help="fastspeech2 config file.")
-    parser.add_argument(
-        "--fastspeech2-checkpoint",
-        type=str,
-        help="fastspeech2 checkpoint to load.")
-    parser.add_argument(
-        "--fastspeech2-stat",
-        type=str,
-        help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
-    )
-    parser.add_argument(
-        "--pwg-config", type=str, help="parallel wavegan config file.")
-    parser.add_argument(
-        "--pwg-checkpoint",
-        type=str,
-        help="parallel wavegan generator parameters to load.")
-    parser.add_argument(
-        "--pwg-stat",
-        type=str,
-        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
-    )
-    parser.add_argument(
-        "--phones-dict", type=str, default=None, help="phone vocabulary file.")
-    parser.add_argument("--test-metadata", type=str, help="test metadata.")
-    parser.add_argument("--output-dir", type=str, help="output dir.")
-    parser.add_argument(
-        "--device", type=str, default="gpu", help="device type to use.")
-    parser.add_argument("--verbose", type=int, default=1, help="verbose.")
-
-    args = parser.parse_args()
-    with open(args.fastspeech2_config) as f:
-        fastspeech2_config = CfgNode(yaml.safe_load(f))
-    with open(args.pwg_config) as f:
-        pwg_config = CfgNode(yaml.safe_load(f))
-
-    print("========Args========")
-    print(yaml.safe_dump(vars(args)))
-    print("========Config========")
-    print(fastspeech2_config)
-    print(pwg_config)
-
-    evaluate(args, fastspeech2_config, pwg_config)
-
-
-if __name__ == "__main__":
-    main()
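
The deleted trainer below (replaced by examples/fastspeech2/train.py) builds its datasets from jsonl metadata: one JSON object per line, with scalar fields plus paths to .npy feature files that DataTable's converters resolve into arrays. Roughly (the path is illustrative, taken from the chain-test files above):

    import jsonlines
    import numpy as np

    with jsonlines.open("train_data/mini_BZNSYP/train/norm/metadata.jsonl") as reader:
        records = list(reader)

    # each record carries utterance-level fields; array-valued fields are stored
    # as .npy paths, which converters={"speech": np.load, ...} turn into arrays
    speech = np.load(records[0]["speech"])
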
"energy": np.load}, ) - with jsonlines.open(args.dev_metadata, 'r') as reader: - dev_metadata = list(reader) - dev_dataset = DataTable( - data=dev_metadata, - fields=[ - "text", "text_lengths", "speech", "speech_lengths", "durations", - "pitch", "energy" - ], - converters={"speech": np.load, - "pitch": np.load, - "energy": np.load}, ) - - # collate function and dataloader - - train_sampler = DistributedBatchSampler( - train_dataset, - batch_size=config.batch_size, - shuffle=True, - drop_last=True) - - print("samplers done!") - - train_dataloader = DataLoader( - train_dataset, - batch_sampler=train_sampler, - collate_fn=fastspeech2_single_spk_batch_fn, - num_workers=config.num_workers) - - dev_dataloader = DataLoader( - dev_dataset, - shuffle=False, - drop_last=False, - batch_size=config.batch_size, - collate_fn=fastspeech2_single_spk_batch_fn, - num_workers=config.num_workers) - print("dataloaders done!") - - with open(args.phones_dict, "r") as f: - phn_id = [line.strip().split() for line in f.readlines()] - vocab_size = len(phn_id) - print("vocab_size:", vocab_size) - - odim = config.n_mels - model = FastSpeech2(idim=vocab_size, odim=odim, **config["model"]) - if world_size > 1: - model = DataParallel(model) - print("model done!") - - optimizer = build_optimizers(model, **config["optimizer"]) - print("optimizer done!") - - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - if dist.get_rank() == 0: - config_name = args.config.split("/")[-1] - # copy conf to output_dir - shutil.copyfile(args.config, output_dir / config_name) - - updater = FastSpeech2Updater( - model=model, - optimizer=optimizer, - dataloader=train_dataloader, - output_dir=output_dir, - **config["updater"]) - - trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) - - evaluator = FastSpeech2Evaluator( - model, dev_dataloader, output_dir=output_dir, **config["updater"]) - - if dist.get_rank() == 0: - trainer.extend(evaluator, trigger=(1, "epoch")) - writer = LogWriter(str(output_dir)) - trainer.extend(VisualDL(writer), trigger=(1, "iteration")) - trainer.extend( - Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) - # print(trainer.extensions) - trainer.run() - - -def main(): - # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser( - description="Train a FastSpeech2 model with sigle speaker dataset.") - parser.add_argument("--config", type=str, help="fastspeech2 config file.") - parser.add_argument("--train-metadata", type=str, help="training data.") - parser.add_argument("--dev-metadata", type=str, help="dev data.") - parser.add_argument("--output-dir", type=str, help="output dir.") - parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") - parser.add_argument( - "--phones-dict", - type=str, - default="phone_id_map.txt ", - help="phone vocabulary file.") - - args = parser.parse_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") - - with open(args.config) as f: - config = CfgNode(yaml.safe_load(f)) - - print("========Args========") - print(yaml.safe_dump(vars(args))) - print("========Config========") - print(config) - print( - f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" - ) - - # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, 
config), nprocs=args.nprocs) - else: - train_sp(args, config) - - -if __name__ == "__main__": - main() diff --git a/utils/gen_duration_from_textgrid.py b/utils/gen_duration_from_textgrid.py index f75a8b01..b2a5fa3d 100644 --- a/utils/gen_duration_from_textgrid.py +++ b/utils/gen_duration_from_textgrid.py @@ -17,7 +17,9 @@ import librosa import numpy as np +import yaml from praatio import tgio +from yacs.config import CfgNode def readtg(tg_path, sample_rate=24000, n_shift=300): @@ -95,12 +97,16 @@ def main(): "--n-shift", type=int, help="the n_shift of time_to_freames, also called hop_length.") + parser.add_argument( + "--config", type=str, help="config file with fs and n_shift.") args = parser.parse_args() + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) inputdir = Path(args.inputdir).expanduser() output = Path(args.output).expanduser() - gen_duration_from_textgrid(inputdir, output, args.sample_rate, args.n_shift) + gen_duration_from_textgrid(inputdir, output, config.fs, config.n_shift) if __name__ == "__main__": From 4b94af98e5dcd364936c996f9e3dd661b6c481e9 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 12 Oct 2021 10:39:21 +0000 Subject: [PATCH 2/3] add __init__ for models --- .../baker/synthesize_from_wav.py | 4 ++-- .../GANVocoder/parallelwave_gan/synthesize.py | 2 +- examples/GANVocoder/parallelwave_gan/train.py | 8 ++++---- examples/fastspeech2/aishell3/synthesize_e2e.py | 8 ++++---- examples/fastspeech2/baker/synthesize_e2e.py | 8 ++++---- examples/fastspeech2/ljspeech/synthesize_e2e.py | 8 ++++---- examples/fastspeech2/synthesize.py | 8 ++++---- examples/fastspeech2/train.py | 9 +++++---- examples/speedyspeech/baker/synthesize_e2e.py | 8 ++++---- examples/speedyspeech/synthesize.py | 8 ++++---- examples/speedyspeech/train.py | 6 +++--- .../transformer_tts/ljspeech/synthesize_e2e.py | 4 ++-- examples/transformer_tts/synthesize.py | 4 ++-- examples/transformer_tts/train.py | 6 +++--- parakeet/models/fastspeech2/__init__.py | 16 ++++++++++++++++ .../models/fastspeech2/fastspeech2_updater.py | 2 +- parakeet/models/parallel_wavegan/__init__.py | 16 ++++++++++++++++ parakeet/models/speedyspeech/__init__.py | 16 ++++++++++++++++ .../models/speedyspeech/speedyspeech_updater.py | 3 ++- parakeet/models/transformer_tts/__init__.py | 16 ++++++++++++++++ .../transformer_tts/transformer_tts_updater.py | 4 ++-- 21 files changed, 115 insertions(+), 49 deletions(-) create mode 100644 parakeet/models/fastspeech2/__init__.py create mode 100644 parakeet/models/parallel_wavegan/__init__.py create mode 100644 parakeet/models/speedyspeech/__init__.py create mode 100644 parakeet/models/transformer_tts/__init__.py diff --git a/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py b/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py index 5435422c..16db38b4 100644 --- a/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py +++ b/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py @@ -23,8 +23,8 @@ import soundfile as sf import yaml from parakeet.data.get_feats import LogMelFBank -from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator -from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference +from parakeet.models.parallel_wavegan import PWGGenerator +from parakeet.models.parallel_wavegan import PWGInference from parakeet.modules.normalizer import ZScore from yacs.config import CfgNode as Configuration diff --git a/examples/GANVocoder/parallelwave_gan/synthesize.py 
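
The gen_duration_from_textgrid.py change above replaces the separate --sample-rate/--n-shift flags with a single --config, so durations are always computed with the same fs and n_shift the acoustic features were extracted with. The conversion itself is seconds-to-frames at hop resolution; readtg's exact rounding is not visible in this hunk, so the sketch below is approximate, using the defaults from readtg's signature:

    import numpy as np

    def interval_to_frames(start_sec, end_sec, fs=24000, n_shift=300):
        # a (end - start) second TextGrid interval covers
        # (end - start) * fs / n_shift hop-sized frames
        return int(np.round((end_sec - start_sec) * fs / n_shift))

    assert interval_to_frames(0.0, 0.25) == 20  # 0.25 s * 24000 / 300

Reading both values from the YAML config removes the failure mode where durations are generated at one hop length and features at another.
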
From 4b94af98e5dcd364936c996f9e3dd661b6c481e9 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Tue, 12 Oct 2021 10:39:21 +0000
Subject: [PATCH 2/3] add __init__ for models

---
 .../baker/synthesize_from_wav.py | 4 ++--
 examples/GANVocoder/parallelwave_gan/synthesize.py | 2 +-
 examples/GANVocoder/parallelwave_gan/train.py | 8 ++++----
 examples/fastspeech2/aishell3/synthesize_e2e.py | 8 ++++----
 examples/fastspeech2/baker/synthesize_e2e.py | 8 ++++----
 examples/fastspeech2/ljspeech/synthesize_e2e.py | 8 ++++----
 examples/fastspeech2/synthesize.py | 8 ++++----
 examples/fastspeech2/train.py | 9 +++++----
 examples/speedyspeech/baker/synthesize_e2e.py | 8 ++++----
 examples/speedyspeech/synthesize.py | 8 ++++----
 examples/speedyspeech/train.py | 6 +++---
 .../transformer_tts/ljspeech/synthesize_e2e.py | 4 ++--
 examples/transformer_tts/synthesize.py | 4 ++--
 examples/transformer_tts/train.py | 6 +++---
 parakeet/models/fastspeech2/__init__.py | 16 ++++++++++++++++
 .../models/fastspeech2/fastspeech2_updater.py | 2 +-
 parakeet/models/parallel_wavegan/__init__.py | 16 ++++++++++++++++
 parakeet/models/speedyspeech/__init__.py | 16 ++++++++++++++++
 .../models/speedyspeech/speedyspeech_updater.py | 3 ++-
 parakeet/models/transformer_tts/__init__.py | 16 ++++++++++++++++
 .../transformer_tts/transformer_tts_updater.py | 4 ++--
 21 files changed, 115 insertions(+), 49 deletions(-)
 create mode 100644 parakeet/models/fastspeech2/__init__.py
 create mode 100644 parakeet/models/parallel_wavegan/__init__.py
 create mode 100644 parakeet/models/speedyspeech/__init__.py
 create mode 100644 parakeet/models/transformer_tts/__init__.py

diff --git a/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py b/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py
index 5435422c..16db38b4 100644
--- a/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py
+++ b/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py
@@ -23,8 +23,8 @@
 import soundfile as sf
 import yaml
 
 from parakeet.data.get_feats import LogMelFBank
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference
+from parakeet.models.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan import PWGInference
 from parakeet.modules.normalizer import ZScore
 from yacs.config import CfgNode as Configuration
diff --git a/examples/GANVocoder/parallelwave_gan/synthesize.py b/examples/GANVocoder/parallelwave_gan/synthesize.py
index 1ee52dbc..7c37e340 100644
--- a/examples/GANVocoder/parallelwave_gan/synthesize.py
+++ b/examples/GANVocoder/parallelwave_gan/synthesize.py
@@ -24,7 +24,7 @@
 import yaml
 
 from paddle import distributed as dist
 from parakeet.datasets.data_table import DataTable
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan import PWGGenerator
 from yacs.config import CfgNode
 
diff --git a/examples/GANVocoder/parallelwave_gan/train.py b/examples/GANVocoder/parallelwave_gan/train.py
index ec357a30..7e6aa9a6 100644
--- a/examples/GANVocoder/parallelwave_gan/train.py
+++ b/examples/GANVocoder/parallelwave_gan/train.py
@@ -30,10 +30,10 @@
 from paddle.optimizer.lr import StepDecay
 from parakeet.datasets.data_table import DataTable
 from parakeet.datasets.vocoder_batch_fn import Clip
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGDiscriminator
-from parakeet.models.parallel_wavegan.parallel_wavegan_updater import PWGUpdater
-from parakeet.models.parallel_wavegan.parallel_wavegan_updater import PWGEvaluator
+from parakeet.models.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan import PWGDiscriminator
+from parakeet.models.parallel_wavegan import PWGUpdater
+from parakeet.models.parallel_wavegan import PWGEvaluator
 from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
 from parakeet.training.extensions.snapshot import Snapshot
 from parakeet.training.extensions.visualizer import VisualDL
diff --git a/examples/fastspeech2/aishell3/synthesize_e2e.py b/examples/fastspeech2/aishell3/synthesize_e2e.py
index b4be5db7..bc7c2f24 100644
--- a/examples/fastspeech2/aishell3/synthesize_e2e.py
+++ b/examples/fastspeech2/aishell3/synthesize_e2e.py
@@ -21,10 +21,10 @@
 import soundfile as sf
 import yaml
 
 from parakeet.frontend.zh_frontend import Frontend
-from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2
-from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2Inference
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference
+from parakeet.models.fastspeech2 import FastSpeech2
+from parakeet.models.fastspeech2 import FastSpeech2Inference
+from parakeet.models.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan import PWGInference
 from parakeet.modules.normalizer import ZScore
 from yacs.config import CfgNode
diff --git a/examples/fastspeech2/baker/synthesize_e2e.py b/examples/fastspeech2/baker/synthesize_e2e.py
index 0321bda4..7b39ab6a 100644
--- a/examples/fastspeech2/baker/synthesize_e2e.py
+++ b/examples/fastspeech2/baker/synthesize_e2e.py
@@ -21,10 +21,10 @@
 import soundfile as sf
 import yaml
 
 from parakeet.frontend.zh_frontend import Frontend
-from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2
-from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2Inference
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference
+from parakeet.models.fastspeech2 import FastSpeech2
+from parakeet.models.fastspeech2 import FastSpeech2Inference
+from parakeet.models.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan import PWGInference
 from parakeet.modules.normalizer import ZScore
 from yacs.config import CfgNode
diff --git a/examples/fastspeech2/ljspeech/synthesize_e2e.py b/examples/fastspeech2/ljspeech/synthesize_e2e.py
index 16890658..93ed91c0 100644
--- a/examples/fastspeech2/ljspeech/synthesize_e2e.py
+++ b/examples/fastspeech2/ljspeech/synthesize_e2e.py
@@ -22,10 +22,10 @@
 import yaml
 from yacs.config import CfgNode
 
 from parakeet.frontend import English
-from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2
-from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2Inference
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference
+from parakeet.models.fastspeech2 import FastSpeech2
+from parakeet.models.fastspeech2 import FastSpeech2Inference
+from parakeet.models.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan import PWGInference
 from parakeet.modules.normalizer import ZScore
 
diff --git a/examples/fastspeech2/synthesize.py b/examples/fastspeech2/synthesize.py
index af427ab6..aee7bcee 100644
--- a/examples/fastspeech2/synthesize.py
+++ b/examples/fastspeech2/synthesize.py
@@ -23,10 +23,10 @@
 import yaml
 from yacs.config import CfgNode
 
 from parakeet.datasets.data_table import DataTable
-from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2
-from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2Inference
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference
+from parakeet.models.fastspeech2 import FastSpeech2
+from parakeet.models.fastspeech2 import FastSpeech2Inference
+from parakeet.models.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan import PWGInference
 from parakeet.modules.normalizer import ZScore
 
diff --git a/examples/fastspeech2/train.py b/examples/fastspeech2/train.py
index 99c5c7b9..1ea2c561 100644
--- a/examples/fastspeech2/train.py
+++ b/examples/fastspeech2/train.py
@@ -23,13 +23,14 @@
 import yaml
 from paddle import DataParallel
 from paddle import distributed as dist
-from paddle.io import DataLoader, DistributedBatchSampler
+from paddle.io import DataLoader
+from paddle.io import DistributedBatchSampler
 from parakeet.datasets.data_table import DataTable
 from parakeet.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn
 from parakeet.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn
-from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2
-from parakeet.models.fastspeech2.fastspeech2_updater import FastSpeech2Evaluator
-from parakeet.models.fastspeech2.fastspeech2_updater import FastSpeech2Updater
+from parakeet.models.fastspeech2 import FastSpeech2
+from parakeet.models.fastspeech2 import FastSpeech2Evaluator
+from parakeet.models.fastspeech2 import FastSpeech2Updater
 from parakeet.training.extensions.snapshot import Snapshot
 from parakeet.training.extensions.visualizer import VisualDL
 from parakeet.training.optimizer import build_optimizers
diff --git a/examples/speedyspeech/baker/synthesize_e2e.py b/examples/speedyspeech/baker/synthesize_e2e.py
index 80330f29..f633aeab 100644
--- a/examples/speedyspeech/baker/synthesize_e2e.py
+++ b/examples/speedyspeech/baker/synthesize_e2e.py
@@ -24,10 +24,10 @@
 from paddle import jit
 from paddle.static import InputSpec
 
 from parakeet.frontend.zh_frontend import Frontend
-from parakeet.models.speedyspeech.speedyspeech import SpeedySpeech
-from parakeet.models.speedyspeech.speedyspeech import SpeedySpeechInference
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference
+from parakeet.models.speedyspeech import SpeedySpeech
+from parakeet.models.speedyspeech import SpeedySpeechInference
+from parakeet.models.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan import PWGInference
 from parakeet.modules.normalizer import ZScore
 from yacs.config import CfgNode
diff --git a/examples/speedyspeech/synthesize.py b/examples/speedyspeech/synthesize.py
index 82d5e6fa..550f9405 100644
--- a/examples/speedyspeech/synthesize.py
+++ b/examples/speedyspeech/synthesize.py
@@ -26,10 +26,10 @@
 from yacs.config import CfgNode
 
 
 from parakeet.datasets.data_table import DataTable
-from parakeet.models.speedyspeech.speedyspeech import SpeedySpeech
-from parakeet.models.speedyspeech.speedyspeech import SpeedySpeechInference
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGGenerator
-from parakeet.models.parallel_wavegan.parallel_wavegan import PWGInference
+from parakeet.models.speedyspeech import SpeedySpeech
+from parakeet.models.speedyspeech import SpeedySpeechInference
+from parakeet.models.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan import PWGInference
 from parakeet.modules.normalizer import ZScore
diff --git a/examples/speedyspeech/train.py b/examples/speedyspeech/train.py
index f9436e57..f7a4e301 100644
--- a/examples/speedyspeech/train.py
+++ b/examples/speedyspeech/train.py
@@ -27,9 +27,9 @@
 from paddle.io import DistributedBatchSampler
 from parakeet.datasets.data_table import DataTable
 from parakeet.datasets.am_batch_fn import speedyspeech_batch_fn
-from parakeet.models.speedyspeech.speedyspeech import SpeedySpeech
-from parakeet.models.speedyspeech.speedyspeech_updater import SpeedySpeechEvaluator
-from parakeet.models.speedyspeech.speedyspeech_updater import SpeedySpeechUpdater
+from parakeet.models.speedyspeech import SpeedySpeech
+from parakeet.models.speedyspeech import SpeedySpeechEvaluator
+from parakeet.models.speedyspeech import SpeedySpeechUpdater
 from parakeet.training.extensions.snapshot import Snapshot
 from parakeet.training.extensions.visualizer import VisualDL
 from parakeet.training.optimizer import build_optimizers
diff --git a/examples/transformer_tts/ljspeech/synthesize_e2e.py b/examples/transformer_tts/ljspeech/synthesize_e2e.py
index d3680b05..534b6aa0 100644
--- a/examples/transformer_tts/ljspeech/synthesize_e2e.py
+++ b/examples/transformer_tts/ljspeech/synthesize_e2e.py
@@ -22,8 +22,8 @@
 import yaml
 from yacs.config import CfgNode
 from parakeet.frontend import English
-from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
-from parakeet.models.transformer_tts.transformer_tts import TransformerTTSInference
+from parakeet.models.transformer_tts import TransformerTTS
+from parakeet.models.transformer_tts import TransformerTTSInference
 from parakeet.models.waveflow import ConditionalWaveFlow
 from parakeet.modules.normalizer import ZScore
 from parakeet.utils import layer_tools
diff --git a/examples/transformer_tts/synthesize.py b/examples/transformer_tts/synthesize.py
index 2af508bf..c71b4065 100644
--- a/examples/transformer_tts/synthesize.py
+++ b/examples/transformer_tts/synthesize.py
@@ -23,8 +23,8 @@
 import yaml
 from yacs.config import CfgNode
 from parakeet.datasets.data_table import DataTable
-from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
-from parakeet.models.transformer_tts.transformer_tts import TransformerTTSInference
+from parakeet.models.transformer_tts import TransformerTTS
+from parakeet.models.transformer_tts import TransformerTTSInference
 from parakeet.models.waveflow import ConditionalWaveFlow
 from parakeet.modules.normalizer import ZScore
 from parakeet.utils import layer_tools
diff --git a/examples/transformer_tts/train.py b/examples/transformer_tts/train.py
index 2899a44e..b1263bcc 100644
--- a/examples/transformer_tts/train.py
+++ b/examples/transformer_tts/train.py
@@ -28,9 +28,9 @@
 from paddle.io import DistributedBatchSampler
 from parakeet.datasets.data_table import DataTable
 from parakeet.datasets.am_batch_fn import transformer_single_spk_batch_fn
-from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
-from parakeet.models.transformer_tts.transformer_tts_updater import TransformerTTSUpdater
-from parakeet.models.transformer_tts.transformer_tts_updater import TransformerTTSEvaluator
+from parakeet.models.transformer_tts import TransformerTTS
+from parakeet.models.transformer_tts import TransformerTTSUpdater
+from parakeet.models.transformer_tts import TransformerTTSEvaluator
 from parakeet.training.extensions.snapshot import Snapshot
 from parakeet.training.extensions.visualizer import VisualDL
 from parakeet.training.optimizer import build_optimizers
diff --git a/parakeet/models/fastspeech2/__init__.py b/parakeet/models/fastspeech2/__init__.py
new file mode 100644
index 00000000..83479d6f
--- /dev/null
+++ b/parakeet/models/fastspeech2/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .fastspeech2 import *
+from .fastspeech2_updater import *
diff --git a/parakeet/models/fastspeech2/fastspeech2_updater.py b/parakeet/models/fastspeech2/fastspeech2_updater.py
index b579fc27..789965f4 100644
--- a/parakeet/models/fastspeech2/fastspeech2_updater.py
+++ b/parakeet/models/fastspeech2/fastspeech2_updater.py
@@ -14,7 +14,7 @@
 import logging
 
 from paddle import distributed as dist
-from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2Loss
+from parakeet.models.fastspeech2 import FastSpeech2Loss
 from parakeet.training.extensions.evaluator import StandardEvaluator
 from parakeet.training.reporter import report
 from parakeet.training.updaters.standard_updater import StandardUpdater
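
The pattern is the same for every model package this patch touches (the parallel_wavegan, speedyspeech, and transformer_tts packages below follow identically): a new __init__.py star-imports the model and its updater, so call sites no longer name the defining submodule twice. Taken straight from the hunks above:

    # before: the module path repeats the model name
    from parakeet.models.fastspeech2.fastspeech2 import FastSpeech2
    from parakeet.models.fastspeech2.fastspeech2_updater import FastSpeech2Updater

    # after: the package root re-exports both submodules
    from parakeet.models.fastspeech2 import FastSpeech2
    from parakeet.models.fastspeech2 import FastSpeech2Updater

One caveat worth keeping in mind: `from .fastspeech2 import *` re-exports every public name unless the submodule defines __all__, so the submodules' public surfaces should stay deliberate.
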
diff --git a/parakeet/models/parallel_wavegan/__init__.py b/parakeet/models/parallel_wavegan/__init__.py
new file mode 100644
index 00000000..89403c0e
--- /dev/null
+++ b/parakeet/models/parallel_wavegan/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .parallel_wavegan import *
+from .parallel_wavegan_updater import *
diff --git a/parakeet/models/speedyspeech/__init__.py b/parakeet/models/speedyspeech/__init__.py
new file mode 100644
index 00000000..6d9b7088
--- /dev/null
+++ b/parakeet/models/speedyspeech/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .speedyspeech import *
+from .speedyspeech_updater import *
diff --git a/parakeet/models/speedyspeech/speedyspeech_updater.py b/parakeet/models/speedyspeech/speedyspeech_updater.py
index e897fdf3..3135d342 100644
--- a/parakeet/models/speedyspeech/speedyspeech_updater.py
+++ b/parakeet/models/speedyspeech/speedyspeech_updater.py
@@ -17,7 +17,8 @@
 from paddle import distributed as dist
 from paddle.fluid.layers import huber_loss
 from paddle.nn import functional as F
-from parakeet.modules.losses import masked_l1_loss, weighted_mean
+from parakeet.modules.losses import masked_l1_loss
+from parakeet.modules.losses import weighted_mean
 from parakeet.modules.ssim import ssim
 from parakeet.training.extensions.evaluator import StandardEvaluator
 from parakeet.training.reporter import report
diff --git a/parakeet/models/transformer_tts/__init__.py b/parakeet/models/transformer_tts/__init__.py
new file mode 100644
index 00000000..0456c300
--- /dev/null
+++ b/parakeet/models/transformer_tts/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .transformer_tts import *
+from .transformer_tts_updater import *
diff --git a/parakeet/models/transformer_tts/transformer_tts_updater.py b/parakeet/models/transformer_tts/transformer_tts_updater.py
index a9ae48ad..7e75a860 100644
--- a/parakeet/models/transformer_tts/transformer_tts_updater.py
+++ b/parakeet/models/transformer_tts/transformer_tts_updater.py
@@ -16,8 +16,8 @@
 import paddle
 from paddle import distributed as dist
 
-from parakeet.models.transformer_tts.transformer_tts import GuidedMultiHeadAttentionLoss
-from parakeet.models.transformer_tts.transformer_tts import TransformerTTSLoss
+from parakeet.models.transformer_tts import GuidedMultiHeadAttentionLoss
+from parakeet.models.transformer_tts import TransformerTTSLoss
 from parakeet.training.extensions.evaluator import StandardEvaluator
 from parakeet.training.reporter import report
 from parakeet.training.updaters.standard_updater import StandardUpdater

From a60392436e0167b3e3ec5354878be8bad767f960 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Tue, 12 Oct 2021 11:13:21 +0000
Subject: [PATCH 3/3] add paddle.no_grad() when running inference

---
 .../parallelwave_gan/baker/synthesize_from_wav.py | 3 ++-
 examples/GANVocoder/parallelwave_gan/synthesize.py | 3 ++-
 examples/speedyspeech/baker/synthesize_e2e.py | 10 +++++-----
 examples/transformer_tts/ljspeech/synthesize_e2e.py | 11 ++++++-----
 4 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py b/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py
index 16db38b4..948a2870 100644
--- a/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py
+++ b/examples/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py
@@ -76,7 +76,8 @@ def evaluate(args, config):
     # extract mel feats
     mel = mel_extractor.get_log_mel_fbank(wav)
     mel = paddle.to_tensor(mel)
-    gen_wav = pwg_inference(mel)
+    with paddle.no_grad():
+        gen_wav = pwg_inference(mel)
     sf.write(
         str(output_dir / ("gen_" + utt_name)),
         gen_wav.numpy(),
diff --git a/examples/GANVocoder/parallelwave_gan/synthesize.py b/examples/GANVocoder/parallelwave_gan/synthesize.py
index 7c37e340..e57ddf88 100644
--- a/examples/GANVocoder/parallelwave_gan/synthesize.py
+++ b/examples/GANVocoder/parallelwave_gan/synthesize.py
@@ -80,7 +80,8 @@ def main():
         mel = example['feats']
         mel = paddle.to_tensor(mel)  # (T, C)
         with timer() as t:
-            wav = generator.inference(c=mel)
+            with paddle.no_grad():
+                wav = generator.inference(c=mel)
         wav = wav.numpy()
         N += wav.size
         T += t.elapse
diff --git a/examples/speedyspeech/baker/synthesize_e2e.py b/examples/speedyspeech/baker/synthesize_e2e.py
index f633aeab..8e8dad30 100644
--- a/examples/speedyspeech/baker/synthesize_e2e.py
+++ b/examples/speedyspeech/baker/synthesize_e2e.py
@@ -121,11 +121,11 @@ def evaluate(args, speedyspeech_config, pwg_config):
             with paddle.no_grad():
                 mel = speedyspeech_inference(part_phone_ids, part_tone_ids)
                 temp_wav = pwg_inference(mel)
-            if flags == 0:
-                wav = temp_wav
-                flags = 1
-            else:
-                wav = paddle.concat([wav, temp_wav])
+                if flags == 0:
+                    wav = temp_wav
+                    flags = 1
+                else:
+                    wav = paddle.concat([wav, temp_wav])
         sf.write(
             output_dir / (utt_id + ".wav"),
             wav.numpy(),
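
Each hunk in this patch makes the same fix (the transformer_tts hunk below included): forward passes that only synthesize audio now run under paddle.no_grad(), so no autograd graph is recorded and intermediate activations can be freed immediately, which matters for long utterances on the GPU. A minimal illustration of the behavior (the Linear layer is just a stand-in for the inference networks):

    import paddle

    net = paddle.nn.Linear(80, 80)
    net.eval()
    x = paddle.randn([1, 80])

    with paddle.no_grad():
        y = net(x)  # no gradient bookkeeping for this forward pass

    print(y.stop_gradient)  # True: y carries no gradient history
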
diff --git a/examples/transformer_tts/ljspeech/synthesize_e2e.py b/examples/transformer_tts/ljspeech/synthesize_e2e.py
index 534b6aa0..a5566e4b 100644
--- a/examples/transformer_tts/ljspeech/synthesize_e2e.py
+++ b/examples/transformer_tts/ljspeech/synthesize_e2e.py
@@ -89,11 +89,12 @@ def evaluate(args, acoustic_model_config, vocoder_config):
         phones = [phn for phn in phones if not phn.isspace()]
         phones = [phn if phn in phone_id_map else "," for phn in phones]
         phone_ids = [phone_id_map[phn] for phn in phones]
-        mel = transformer_tts_inference(paddle.to_tensor(phone_ids))
-        # mel shape is (T, feats) and waveflow's input shape is (batch, feats, T)
-        mel = mel.unsqueeze(0).transpose([0, 2, 1])
-        # wavflow's output shape is (B, T)
-        wav = vocoder.infer(mel)[0]
+        with paddle.no_grad():
+            mel = transformer_tts_inference(paddle.to_tensor(phone_ids))
+            # mel shape is (T, feats) and waveflow's input shape is (batch, feats, T)
+            mel = mel.unsqueeze(0).transpose([0, 2, 1])
+            # waveflow's output shape is (B, T)
+            wav = vocoder.infer(mel)[0]
         sf.write(
             str(output_dir / (utt_id + ".wav")),