diff --git a/parakeet/models/speedyspeech.py b/parakeet/models/speedyspeech.py index 24197fe3..dea567b6 100644 --- a/parakeet/models/speedyspeech.py +++ b/parakeet/models/speedyspeech.py @@ -206,7 +206,9 @@ def inference(self, text, tones=None): k = paddle.full([1], 0, dtype=paddle.int64) for j in range(t_enc): d = durations_to_expand[0, j] - M[0, k:k + d, j] = 1 + # If d == 0, the slice is empty; assigning to it is meaningless and not supported + if d >= 1: + M[0, k:k + d, j] = 1 k += d encodings = paddle.matmul(M, encodings) diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 00000000..60969212 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,16 @@ +Please use one of lite_train_infer.sh, whole_train_infer.sh, or infer.sh. + +For lite_train_infer +``` +bash lite_train_infer.sh +``` + +For whole_train_infer +``` +bash whole_train_infer.sh +``` + +For infer +``` +bash infer.sh +``` diff --git a/tests/infer.sh b/tests/infer.sh index 754f6330..c1d19de0 100644 --- a/tests/infer.sh +++ b/tests/infer.sh @@ -1,2 +1,2 @@ bash prepare.sh infer -bash test.sh speedyspeech_params_lite.txt infer +bash test.sh speedyspeech_params_lite_single_gpu.txt infer diff --git a/tests/lite_train_infer.sh b/tests/lite_train_infer.sh index 28a84305..de67efa0 100644 --- a/tests/lite_train_infer.sh +++ b/tests/lite_train_infer.sh @@ -1,2 +1,7 @@ +rm exp -rf +rm e2e -rf bash prepare.sh lite_train_infer -bash test.sh speedyspeech_params_lite.txt lite_train_infer +bash test.sh speedyspeech_params_lite_single_gpu.txt lite_train_infer +rm exp -rf +rm e2e -rf +bash test.sh speedyspeech_params_lite_multi_gpu.txt lite_train_infer diff --git a/tests/prepare.sh b/tests/prepare.sh index 2fa63477..ad389171 100644 --- a/tests/prepare.sh +++ b/tests/prepare.sh @@ -36,7 +36,8 @@ if [ ${MODE} = "lite_train_infer" ];then wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip (cd ./pretrain_models && unzip speedyspeech_baker_ckpt_0.4.zip && unzip pwg_baker_ckpt_0.4.zip) # 
generate a config patch - echo 'max_epoch: 30' > lite_train_infer.yaml + echo 'max_epoch: 10' > lite_train_infer.yaml + echo 'num_snapshots: 10' >> lite_train_infer.yaml # download data rm -rf ./train_data/mini_BZNSYP wget -nc -P ./train_data/ https://paddlespeech.bj.bcebos.com/datasets/CE/speedyspeech/mini_BZNSYP.tar.gz diff --git a/tests/speedyspeech_params_lite.txt b/tests/speedyspeech_params_lite_multi_gpu.txt similarity index 96% rename from tests/speedyspeech_params_lite.txt rename to tests/speedyspeech_params_lite_multi_gpu.txt index c1cfb8f5..c7c66a20 100644 --- a/tests/speedyspeech_params_lite.txt +++ b/tests/speedyspeech_params_lite_multi_gpu.txt @@ -1,7 +1,7 @@ ===========================train_params=========================== model_name:speedyspeech python:python3.7 -gpu_list:1|0,1 +gpu_list:2,3 null:null null:null null:null @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_90.pdz --speedyspeech-stat=pretrain_models/speedyspeech_baker_ckpt_0.4/speedy_speech_stats.npy --pwg-config=../examples/parallelwave_gan/baker/conf/default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/baker/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=../examples/speedyspeech/baker/phones.txt --tones-dict=../examples/speedyspeech/baker/tones.txt +eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_10.pdz --speedyspeech-stat=pretrain_models/speedyspeech_baker_ckpt_0.4/speedy_speech_stats.npy 
--pwg-config=../examples/parallelwave_gan/baker/conf/default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/baker/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=../examples/speedyspeech/baker/phones.txt --tones-dict=../examples/speedyspeech/baker/tones.txt null:null ## ===========================infer_params=========================== diff --git a/tests/speedyspeech_params_lite_single_gpu.txt b/tests/speedyspeech_params_lite_single_gpu.txt new file mode 100644 index 00000000..431e624e --- /dev/null +++ b/tests/speedyspeech_params_lite_single_gpu.txt @@ -0,0 +1,51 @@ +===========================train_params=========================== +model_name:speedyspeech +python:python3.7 +gpu_list:2 +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +## +trainer:norm_train +norm_train:../examples/speedyspeech/baker/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=lite_train_infer.yaml --output-dir=exp/default +null:null +null:null +null:null +null:null +null:null +## +===========================eval_params=========================== +eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=pretrain_models/speedyspeech_baker_ckpt_0.4/speedy_speech_stats.npy --pwg-config=../examples/parallelwave_gan/baker/conf/default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/baker/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" 
--phones-dict=../examples/speedyspeech/baker/phones.txt --tones-dict=../examples/speedyspeech/baker/tones.txt +null:null +## +===========================infer_params=========================== +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +## +null:null +null:null +null:null +inference:../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.4 --text=../examples/speedyspeech/baker/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=../examples/speedyspeech/baker/phones.txt --tones-dict=../examples/speedyspeech/baker/tones.txt --output-dir=e2e --inference-dir=inference +--use_gpu:True +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null diff --git a/tests/speedyspeech_params_whole.txt b/tests/speedyspeech_params_whole_multi_gpu.txt similarity index 96% rename from tests/speedyspeech_params_whole.txt rename to tests/speedyspeech_params_whole_multi_gpu.txt index e171cf5b..1cb77a74 100644 --- a/tests/speedyspeech_params_whole.txt +++ b/tests/speedyspeech_params_whole_multi_gpu.txt @@ -13,7 +13,7 @@ null:null null:null ## trainer:norm_train -norm_train:../examples/speedyspeech/baker/train.py --train-metadata=train_data/BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/BZNSYP/dev/norm/metadata.jsonl --output-dir=exp/lite +norm_train:../examples/speedyspeech/baker/train.py --train-metadata=train_data/BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/BZNSYP/dev/norm/metadata.jsonl --output-dir=exp/whole null:null null:null null:null diff --git a/tests/speedyspeech_params_whole_single_gpu.txt b/tests/speedyspeech_params_whole_single_gpu.txt new file mode 100644 index 00000000..356445b4 --- /dev/null +++ b/tests/speedyspeech_params_whole_single_gpu.txt @@ -0,0 +1,51 @@ +===========================train_params=========================== +model_name:speedyspeech +python:python3.7 +gpu_list:1 
+null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +## +trainer:norm_train +norm_train:../examples/speedyspeech/baker/train.py --train-metadata=train_data/BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/BZNSYP/dev/norm/metadata.jsonl --output-dir=exp/whole +null:null +null:null +null:null +null:null +null:null +## +===========================eval_params=========================== +eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_baker_ckpt_0.4/speedyspeech_snapshot_iter_91800.pdz --speedyspeech-stat=pretrain_models/speedyspeech_baker_ckpt_0.4/speedy_speech_stats.npy --pwg-config=../examples/parallelwave_gan/baker/conf/default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/baker/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=../examples/speedyspeech/baker/phones.txt --tones-dict=../examples/speedyspeech/baker/tones.txt +null:null +## +===========================infer_params=========================== +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +## +null:null +null:null +null:null +inference:../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.4 --text=../examples/speedyspeech/baker/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=../examples/speedyspeech/baker/phones.txt --tones-dict=../examples/speedyspeech/baker/tones.txt --output-dir=e2e --inference-dir=inference +--use_gpu:True +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null diff --git a/tests/test.sh b/tests/test.sh index f2808d90..17565d88 100644 --- a/tests/test.sh +++ 
b/tests/test.sh @@ -323,6 +323,7 @@ else elif [ ${#gpu} -le 15 ];then # train with multi-gpu gsu=${gpu//,/ } nump=`echo $gsu | wc -w` + CUDA_VISIBLE_DEVICES=${gpu} cmd="${python} ${run_train} --nprocs=$nump" else # train with multi-machine cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" diff --git a/tests/whole_train_infer.sh b/tests/whole_train_infer.sh index 125264f6..fa2b89df 100644 --- a/tests/whole_train_infer.sh +++ b/tests/whole_train_infer.sh @@ -1,2 +1,7 @@ +rm exp -rf +rm e2e -rf bash prepare.sh whole_train_infer -bash test.sh speedyspeech_params_whole.txt whole_train_infer +bash test.sh speedyspeech_params_whole_single_gpu.txt whole_train_infer +rm exp -rf +rm e2e -rf +bash test.sh speedyspeech_params_whole_multi_gpu.txt whole_train_infer