diff --git a/DeepSpeech/Dockerfile.train b/DeepSpeech/Dockerfile.train index 736bddac..ac43f519 100644 --- a/DeepSpeech/Dockerfile.train +++ b/DeepSpeech/Dockerfile.train @@ -1,8 +1,8 @@ FROM nvidia/cuda:10.0-cudnn7-runtime-ubuntu18.04 ARG ds_repo=mozilla/DeepSpeech -ARG ds_branch=ba56407376f1e1109be33ac87bcb6eb9709b18be -ARG ds_sha1=ba56407376f1e1109be33ac87bcb6eb9709b18be +ARG ds_branch=9ac8cebb3b5257fc502622c34e36ae3de55dceee +ARG ds_sha1=9ac8cebb3b5257fc502622c34e36ae3de55dceee ARG kenlm_repo=kpu/kenlm ARG kenlm_branch=2ad7cb56924cd3c6811c604973f592cb5ef604eb diff --git a/DeepSpeech/generate_alphabet.sh b/DeepSpeech/generate_alphabet.sh index 54a11ad6..d17f7ff5 100755 --- a/DeepSpeech/generate_alphabet.sh +++ b/DeepSpeech/generate_alphabet.sh @@ -8,9 +8,9 @@ pushd $HOME/ds/ all_test_csv="$(find /mnt/extracted/data/ -type f -name '*test.csv' -printf '%p,' | sed -e 's/,$//g')" #replace '#' with '' in the whole dataset due to an error in sentence validator that allowed this char - sed -i 's/#//g' /mnt/extracted/data/*test.csv - sed -i 's/#//g' /mnt/extracted/data/*train.csv - sed -i 's/#//g' /mnt/extracted/data/*dev.csv + sed -i 's/#//g' /mnt/extracted/data/cv-it/clips/*test.csv + sed -i 's/#//g' /mnt/extracted/data/cv-it/clips/*train.csv + sed -i 's/#//g' /mnt/extracted/data/cv-it/clips/*dev.csv if [ ! -f "/mnt/models/alphabet.txt" ]; then if [ "${ENGLISH_COMPATIBLE}" = "1" ]; then diff --git a/DeepSpeech/import_lingualibre.sh b/DeepSpeech/import_lingualibre.sh deleted file mode 100755 index 802a3a20..00000000 --- a/DeepSpeech/import_lingualibre.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -set -xe - -pushd $HOME/ds/ - if [ ! -f "/mnt/sources/lingua_libre_Q385-ita-Italian_train.zip" ]; then - wget https://lingualibre.fr/datasets/Q385-ita-Italian.zip -O /mnt/source/lingua_libre_Q385-ita-Italian_train.zip - unzip /mnt/sources/lingua_libre_Q385-ita-Italian_train.zip -d /mnt/extracted/data/lingualibre - fi; - if [ "${ENGLISH_COMPATIBLE}" = "1" ]; then - IMPORT_AS_ENGLISH="--normalize" - fi; - - if [ ! -f "/mnt/extracted/data/lingualibre/lingua_libre_Q385-ita-Italian_train.csv" ]; then - python bin/import_lingua_libre.py \ - --qId 385 \ - --iso639-3 ita \ - --english-name Italian \ - ${IMPORT_AS_ENGLISH} \ - --bogus-records $HOME/lingua_libre_skiplist.txt \ - /mnt/extracted/data/lingualibre - fi; -popd diff --git a/DeepSpeech/run_it.sh b/DeepSpeech/run_it.sh index 2e9235e9..70a46937 100644 --- a/DeepSpeech/run_it.sh +++ b/DeepSpeech/run_it.sh @@ -4,8 +4,6 @@ set -xe $HOME/import_cvit.sh -$HOME/import_lingualibre.sh - $HOME/generate_alphabet.sh $HOME/build_lm.sh diff --git a/DeepSpeech/train_it.sh b/DeepSpeech/train_it.sh index be100884..123588e6 100755 --- a/DeepSpeech/train_it.sh +++ b/DeepSpeech/train_it.sh @@ -57,7 +57,6 @@ pushd $HOME/ds/ --checkpoint_dir /mnt/checkpoints/ \ --export_dir /mnt/models/ \ --export_tflite \ - --nouse_seq_length \ --export_language "it" fi;