diff --git a/banglaconversion b/banglaconversion
new file mode 100644
index 0000000000..a9af48c90f
--- /dev/null
+++ b/banglaconversion
@@ -0,0 +1,65 @@
+# Train a Bangla (Bengali) language model for DeepSpeech.
+#
+# Prerequisites (shell):
+#   pip install -r requirements_eval_tflite.txt
+#
+# To evaluate against a test set once the files below exist, use the
+# lm_binary/trie flags of the pre-0.7 DeepSpeech releases this trie-based
+# flow targets (to evaluate an exported .tflite model instead, see
+# evaluate_tflite.py):
+#   python -u DeepSpeech.py \
+#     --alphabet_config_path alphabet.txt \
+#     --lm_binary_path lm.binary \
+#     --lm_trie_path trie \
+#     --checkpoint_dir path/to/bangla/checkpoints \
+#     --test_files test.csv
+
+import os
+import argparse
+import subprocess
+import shutil
+
+
+def train_bangla_language_model(data_dir, output_dir):
+    # Define paths
+    alphabet_path = 'alphabet.txt'
+    lm_arpa_path = 'lm.arpa'
+    lm_binary_path = 'lm.binary'
+    lm_trie_path = 'trie'
+
+    # Generate the alphabet file; DeepSpeech expects one label per line
+    with open(alphabet_path, 'w', encoding='utf-8') as f:
+        f.write('\n'.join('ঀঁংঃঅআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহঽািীুূৃৄেৈোৌ্ৎৗড়ঢ়য়০১২৩৪৫৬৭৮৯'))
+        f.write('\n')
+
+    # Train a 5-gram KenLM model: lmplz emits an ARPA file, which
+    # build_binary then converts to the binary format DeepSpeech loads
+    print('Training language model...')
+    subprocess.check_call(['./kenlm/build/bin/lmplz', '--order', '5',
+                           '--text', os.path.join(data_dir, 'text.txt'),
+                           '--arpa', lm_arpa_path, '--discount_fallback'])
+    subprocess.check_call(['./kenlm/build/bin/build_binary', lm_arpa_path, lm_binary_path])
+
+    # Build the language model trie; generate_trie ships with the DeepSpeech
+    # native client in pre-0.7 releases
+    print('Building language model trie...')
+    subprocess.check_call(['generate_trie', alphabet_path, lm_binary_path, lm_trie_path])
+
+    # Move the generated files to the output directory
+    os.makedirs(output_dir, exist_ok=True)
+    shutil.move(alphabet_path, os.path.join(output_dir, alphabet_path))
+    shutil.move(lm_binary_path, os.path.join(output_dir, lm_binary_path))
+    shutil.move(lm_trie_path, os.path.join(output_dir, lm_trie_path))
+    os.remove(lm_arpa_path)
+
+    print('Training completed. Language model files saved to', output_dir)
+
+
+if __name__ == '__main__':
+    # Usage: python banglaconversion /path/to/data_dir /path/to/output_dir
+    parser = argparse.ArgumentParser(description='Train a Bangla language model for DeepSpeech.')
+    parser.add_argument('data_dir', type=str, help='Path to the directory containing audio and text data.')
+    parser.add_argument('output_dir', type=str, help='Path to the output directory to save the trained model files.')
+    args = parser.parse_args()
+
+    train_bangla_language_model(args.data_dir, args.output_dir)
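
For a quick sanity check of the binary LM this script produces, the kenlm
Python bindings can score a sentence directly. A minimal sketch, assuming the
`kenlm` package is installed; the model path is hypothetical:

    import kenlm

    # Hypothetical path: wherever train_bangla_language_model() moved lm.binary
    model = kenlm.Model('output_dir/lm.binary')
    print(model.order)  # should report 5 for the lmplz invocation above
    # log10 probability; closer to zero means a better fit to the corpus
    print(model.score('এটা একটা বাক্য', bos=True, eos=True))
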
diff --git a/bin/import_aishell.py b/bin/import_aishell.py
index 341d0d881b..c3f283549f 100755
--- a/bin/import_aishell.py
+++ b/bin/import_aishell.py
@@ -1,21 +1,17 @@
 #!/usr/bin/env python
 import glob
 import os
 import tarfile
 
-import pandas
+import pandas as pd
 
 from deepspeech_training.util.importers import get_importers_parser
 
-COLUMNNAMES = ["wav_filename", "wav_filesize", "transcript"]
-
+COLUMN_NAMES = ["wav_filename", "wav_filesize", "transcript"]
 
 def extract(archive_path, target_dir):
-    print("Extracting {} into {}...".format(archive_path, target_dir))
+    print(f"Extracting {archive_path} into {target_dir}...")
     with tarfile.open(archive_path) as tar:
         tar.extractall(target_dir)
 
-
 def preprocess_data(tgz_file, target_dir):
     # First extract main archive and sub-archives
     extract(tgz_file, target_dir)
@@ -25,23 +21,7 @@ def preprocess_data(tgz_file, target_dir):
     for targz in glob.glob(os.path.join(wav_archives_folder, "*.tar.gz")):
         extract(targz, main_folder)
 
-    # Folder structure is now:
-    # - data_aishell/
-    #   - train/S****/*.wav
-    #   - dev/S****/*.wav
-    #   - test/S****/*.wav
-    #   - wav/S****.tar.gz
-    #   - transcript/aishell_transcript_v0.8.txt
-
-    # Transcripts file has one line per WAV file, where each line consists of
-    # the WAV file name without extension followed by a single space followed
-    # by the transcript.
-
-    # Since the transcripts themselves can contain spaces, we split on space but
-    # only once, then build a mapping from file name to transcript
-    transcripts_path = os.path.join(
-        main_folder, "transcript", "aishell_transcript_v0.8.txt"
-    )
+    # The transcript file has one line per WAV file: the file name without
+    # extension, a single space, then the transcript. Transcripts can contain
+    # spaces themselves, so split on the first space only.
+    transcripts_path = os.path.join(main_folder, "transcript", "aishell_transcript_v0.8.txt")
     with open(transcripts_path) as fin:
         transcripts = dict((line.split(" ", maxsplit=1) for line in fin))
 
@@ -52,36 +32,33 @@ def load_set(glob_path):
                 wav_filename = wav
                 wav_filesize = os.path.getsize(wav)
                 transcript_key = os.path.splitext(os.path.basename(wav))[0]
                 transcript = transcripts[transcript_key].strip("\n")
                 set_files.append((wav_filename, wav_filesize, transcript))
             except KeyError:
-                print("Warning: Missing transcript for WAV file {}.".format(wav))
+                print(f"Warning: Missing transcript for WAV file {wav}.")
         return set_files
 
-    for subset in ("train", "dev", "test"):
-        print("Loading {} set samples...".format(subset))
+    for subset in ["train", "dev", "test"]:
+        print(f"Loading {subset} set samples...")
         subset_files = load_set(os.path.join(main_folder, subset, "S*", "*.wav"))
-        df = pandas.DataFrame(data=subset_files, columns=COLUMNNAMES)
+        df = pd.DataFrame(data=subset_files, columns=COLUMN_NAMES)
 
-        # Trim train set to under 10s by removing the last couple hundred samples
         if subset == "train":
+            # Estimate durations from file size: 44-byte WAV header, 16 kHz, 16-bit
            durations = (df["wav_filesize"] - 44) / 16000 / 2
            df = df[durations <= 10.0]
-            print("Trimming {} samples > 10 seconds".format((durations > 10.0).sum()))
+            print(f"Trimming {(durations > 10.0).sum()} samples > 10 seconds")
 
-        dest_csv = os.path.join(target_dir, "aishell_{}.csv".format(subset))
-        print("Saving {} set into {}...".format(subset, dest_csv))
+        dest_csv = os.path.join(target_dir, f"aishell_{subset}.csv")
+        print(f"Saving {subset} set into {dest_csv}...")
         df.to_csv(dest_csv, index=False)
 
-
 def main():
     # http://www.openslr.org/33/
     parser = get_importers_parser(description="Import AISHELL corpus")
     parser.add_argument("aishell_tgz_file", help="Path to data_aishell.tgz")
     parser.add_argument(
         "--target_dir",
         default="",
-        help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
+        help="Target folder to extract files into and put the resulting CSVs. Defaults to the same folder as the main archive.",
     )
     params = parser.parse_args()
@@ -90,6 +67,6 @@ def main():
 
     preprocess_data(params.aishell_tgz_file, params.target_dir)
 
-
 if __name__ == "__main__":
     main()
+
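
The `maxsplit=1` split in `preprocess_data()` above is what keeps multi-word
transcripts intact; a small illustration with a made-up line in the AISHELL
transcript format:

    # One transcript line: "<wav name without extension> <transcript>"
    line = "BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购\n"
    name, transcript = line.split(" ", maxsplit=1)
    assert name == "BAC009S0002W0122"
    # Only the first space separates the key; the rest stays together
    assert transcript.strip("\n") == "而 对 楼市 成交 抑制 作用 最 大 的 限 购"
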
diff --git a/ci_scripts/cppwin-tests.sh b/ci_scripts/cppwin-tests.sh
index 81acf862ba..81fe5cb9e7 100755
--- a/ci_scripts/cppwin-tests.sh
+++ b/ci_scripts/cppwin-tests.sh
@@ -2,19 +2,20 @@
 
 set -xe
 
-source $(dirname "$0")/all-vars.sh
-source $(dirname "$0")/all-utils.sh
-source $(dirname "$0")/asserts.sh
+source "$(dirname "$0")/all-vars.sh"
+source "$(dirname "$0")/all-utils.sh"
+source "$(dirname "$0")/asserts.sh"
 
 bitrate=$1
 set_ldc_sample_filename "${bitrate}"
 
 download_material "${CI_TMP_DIR}/ds"
-export PATH=${CI_TMP_DIR}/ds/:$PATH
+export PATH="${CI_TMP_DIR}/ds/:$PATH"
 
 check_versions
 
 ensure_cuda_usage "$2"
 
 run_basic_inference_tests
+
diff --git a/ci_scripts/cppwin_tflite-tests.sh b/ci_scripts/cppwin_tflite-tests.sh
index 2558d92926..69d6d91d0e 100755
--- a/ci_scripts/cppwin_tflite-tests.sh
+++ b/ci_scripts/cppwin_tflite-tests.sh
@@ -1,22 +1,21 @@
 #!/bin/bash
-
 set -xe
 
-source $(dirname "$0")/all-vars.sh
-source $(dirname "$0")/all-utils.sh
-source $(dirname "$0")/asserts.sh
+source "$(dirname "$0")/all-vars.sh"
+source "$(dirname "$0")/all-utils.sh"
+source "$(dirname "$0")/asserts.sh"
 
 bitrate=$1
 set_ldc_sample_filename "${bitrate}"
 
-model_source=${DEEPSPEECH_TEST_MODEL//.pb/.tflite}
+model_source="${DEEPSPEECH_TEST_MODEL//.pb/.tflite}"
 model_name=$(basename "${model_source}")
+# The TFLite test reuses the model file name for the mmap variant on purpose;
+# there is no separate mmap artifact for TFLite models
 model_name_mmap=$(basename "${model_source}")
 
 export DATA_TMP_DIR=${CI_TMP_DIR}
 
 download_material "${CI_TMP_DIR}/ds"
-export PATH=${CI_TMP_DIR}/ds/:$PATH
+export PATH="${CI_TMP_DIR}/ds/:$PATH"
 
 check_versions
diff --git a/ci_scripts/tf-package.sh b/ci_scripts/tf-package.sh
index 998aeb68a8..eca806911c 100755
--- a/ci_scripts/tf-package.sh
+++ b/ci_scripts/tf-package.sh
@@ -2,53 +2,41 @@
 
 set -xe
 
-source $(dirname $0)/tf-vars.sh
+source "$(dirname "$0")/tf-vars.sh"
 
-mkdir -p ${CI_ARTIFACTS_DIR} || true
+mkdir -p "${CI_ARTIFACTS_DIR}" || true
 
-cp ${DS_ROOT_TASK}/tensorflow/bazel_*.log ${CI_ARTIFACTS_DIR} || true
+# Keep the glob outside the quotes so it still expands
+cp "${DS_ROOT_TASK}"/tensorflow/bazel_*.log "${CI_ARTIFACTS_DIR}" || true
 
 OUTPUT_ROOT="${DS_ROOT_TASK}/tensorflow/bazel-bin"
 
 for output_bin in \
     tensorflow/lite/experimental/c/libtensorflowlite_c.so \
     tensorflow/tools/graph_transforms/transform_graph \
     tensorflow/tools/graph_transforms/summarize_graph \
     tensorflow/tools/benchmark/benchmark_model \
     tensorflow/contrib/util/convert_graphdef_memmapped_format \
     tensorflow/lite/toco/toco; do
     if [ -f "${OUTPUT_ROOT}/${output_bin}" ]; then
-        cp ${OUTPUT_ROOT}/${output_bin} ${CI_ARTIFACTS_DIR}/
-    fi;
-done;
+        cp "${OUTPUT_ROOT}/${output_bin}" "${CI_ARTIFACTS_DIR}/"
+    fi
+done
 
 if [ -f "${OUTPUT_ROOT}/tensorflow/lite/tools/benchmark/benchmark_model" ]; then
-    cp ${OUTPUT_ROOT}/tensorflow/lite/tools/benchmark/benchmark_model ${CI_ARTIFACTS_DIR}/lite_benchmark_model
+    cp "${OUTPUT_ROOT}/tensorflow/lite/tools/benchmark/benchmark_model" "${CI_ARTIFACTS_DIR}/lite_benchmark_model"
 fi
 
-# It seems that bsdtar and gnutar are behaving a bit differently on the way
-# they deal with --exclude="./public/*" ; this caused ./DeepSpeech/tensorflow/core/public/
-# to be ditched when we just wanted to get rid of ./public/ on OSX.
-# Switching to gnutar (already needed for the --transform on DeepSpeech tasks)
-# does the trick.
+# gnutar (${TAR}) is required here: bsdtar interprets --exclude="./public/*"
+# differently and also drops ./tensorflow/core/public/
 TAR_EXCLUDE="--exclude=./dls/*"
 if [ "${OS}" = "Darwin" ]; then
     TAR_EXCLUDE="--exclude=./dls/* --exclude=./public/* --exclude=./generic-worker/* --exclude=./homebrew/* --exclude=./homebrew.cache/* --exclude=./homebrew.logs/*"
-fi;
-
-# Make a tar of
-#   - /home/build-user/ (linux
-#   - /Users/build-user/TaskCluster/HeavyTasks/X/ (OSX)
-#   - C:\builds\tc-workdir\ (windows)
+fi
 
 if [ "${OS}" = "${CI_MSYS_VERSION}" ]; then
     export PATH=$PATH:'/c/Program Files/7-Zip/'
-    pushd ${DS_ROOT_TASK}
-    7z a '-xr!.\dls\' '-xr!.\tmp\' '-xr!.\msys64\' -snl -snh -so home.tar . | 7z a -si ${CI_ARTIFACTS_DIR}/home.tar.xz
-    popd
+    pushd "${DS_ROOT_TASK}"
+    7z a '-xr!.\dls\' '-xr!.\tmp\' '-xr!.\msys64\' -snl -snh -so home.tar . | 7z a -si "${CI_ARTIFACTS_DIR}/home.tar.xz"
+    popd
 else
-    ${TAR} -C ${DS_ROOT_TASK} ${TAR_EXCLUDE} -cf - . | ${XZ} > ${CI_ARTIFACTS_DIR}/home.tar.xz
+    # TAR_EXCLUDE stays unquoted on purpose: it must word-split into options
+    ${TAR} -C "${DS_ROOT_TASK}" ${TAR_EXCLUDE} -cf - . | ${XZ} > "${CI_ARTIFACTS_DIR}/home.tar.xz"
 fi
 
 if [ "${OS}" = "Linux" ]; then
@@ -57,6 +45,7 @@ elif [ "${OS}" = "${CI_MSYS_VERSION}" ]; then
     SHA_SUM_GEN="sha256sum"
 elif [ "${OS}" = "Darwin" ]; then
     SHA_SUM_GEN="shasum -a 256"
-fi;
+fi
+
+${SHA_SUM_GEN} "${CI_ARTIFACTS_DIR}"/* > "${CI_ARTIFACTS_DIR}/checksums.txt"
 
-${SHA_SUM_GEN} ${CI_ARTIFACTS_DIR}/* > ${CI_ARTIFACTS_DIR}/checksums.txt
diff --git a/evaluate_tflite.py b/evaluate_tflite.py
index 0d46261551..1e26b07bbb 100644
--- a/evaluate_tflite.py
+++ b/evaluate_tflite.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, division, print_function
 
@@ -12,7 +11,6 @@
 
 from deepspeech import Model
 from deepspeech_training.util.evaluate_tools import calculate_and_print_report
-from deepspeech_training.util.flags import create_flags
 from functools import partial
 from multiprocessing import JoinableQueue, Process, cpu_count, Manager
 from six.moves import zip, range
@@ -52,7 +50,7 @@ def tflite_worker(model, scorer, queue_in, queue_out, gpu_mask):
         print(queue_out.qsize(), end='\r')  # Update the current progress
         queue_in.task_done()
 
-def main(args, _):
+def main(args):
     manager = Manager()
     work_todo = JoinableQueue()   # this is where we are going to store input data
     work_done = manager.Queue()   # this where we are gonna push them out
@@ -80,7 +78,7 @@ def main(args):
         if not os.path.isabs(row['wav_filename']):
             row['wav_filename'] = os.path.join(os.path.dirname(args.csv), row['wav_filename'])
         work_todo.put({'filename': row['wav_filename'], 'transcript': row['transcript']})
-        wav_filenames.extend(row['wav_filename'])
+        wav_filenames.append(row['wav_filename'])
 
     print('Totally %d wav entries found in csv\n' % count)
     work_todo.join()
@@ -122,5 +120,5 @@ def parse_args():
     return args
 
 if __name__ == '__main__':
-    create_flags()
-    absl.app.run(partial(main, parse_args()))
+    # absl flag parsing went away with create_flags(), so call main() directly
+    main(parse_args())
+
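
The worker/queue wiring in evaluate_tflite.py condenses to the standard
JoinableQueue fan-out below; a minimal runnable sketch with illustrative
names, assuming nothing beyond the standard library:

    from multiprocessing import JoinableQueue, Manager, Process, cpu_count

    def worker(queue_in, queue_out):
        while True:
            msg = queue_in.get()                    # blocks until work arrives
            queue_out.put(msg['filename'].upper())  # stand-in for STT inference
            queue_in.task_done()                    # lets queue_in.join() advance

    if __name__ == '__main__':
        manager = Manager()
        work_todo = JoinableQueue()
        work_done = manager.Queue()
        for _ in range(cpu_count()):
            Process(target=worker, args=(work_todo, work_done), daemon=True).start()
        for name in ['a.wav', 'b.wav']:
            work_todo.put({'filename': name})
        work_todo.join()                            # returns once every task_done() fired
        while not work_done.empty():
            print(work_done.get())
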
diff --git a/lm_optimizer.py b/lm_optimizer.py
index 25d8a05eb2..8d52f173b9 100644
--- a/lm_optimizer.py
+++ b/lm_optimizer.py
@@ -1,7 +1,3 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import, print_function
-
 import absl.app
 import optuna
 import sys
diff --git a/tests/test_text.py b/tests/test_text.py
index 5bdda19ef6..d1ca8055a2 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -6,20 +6,21 @@ class TestAlphabetParsing(unittest.TestCase):
 
     def _ending_tester(self, file, expected):
+        # Pass the file path straight to Alphabet so its own line-ending
+        # parsing is what gets exercised
         alphabet = Alphabet(os.path.join(os.path.dirname(__file__), 'test_data', file))
-        label = ''
-        label_id = -1
         for expected_label, expected_label_id in expected:
             try:
                 label_id = alphabet.Encode(expected_label)
+                self.assertEqual(label_id, [expected_label_id])
             except KeyError:
-                pass
-            self.assertEqual(label_id, [expected_label_id])
+                self.fail(f"Failed to encode label '{expected_label}'")
             try:
                 label = alphabet.Decode([expected_label_id])
+                self.assertEqual(label, expected_label)
             except KeyError:
-                pass
-            self.assertEqual(label, expected_label)
+                self.fail(f"Failed to decode label id '{expected_label_id}'")
 
     def test_macos_ending(self):
         self._ending_tester('alphabet_macos.txt', [('a', 0), ('b', 1), ('c', 2)])
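
These tests read fixture files that differ only in their line endings. A
sketch of how such fixtures can be generated; only alphabet_macos.txt appears
above, so the unix and windows file names are assumptions:

    # Same three labels, three end-of-line conventions
    endings = {
        'alphabet_macos.txt': '\r',      # classic Mac OS
        'alphabet_unix.txt': '\n',       # assumed fixture name
        'alphabet_windows.txt': '\r\n',  # assumed fixture name
    }
    for filename, eol in endings.items():
        # newline='' stops Python from translating the line endings we write
        with open(filename, 'w', newline='') as f:
            f.write(eol.join('abc') + eol)
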