diff --git a/machine/translation/thot/simplex_model_weight_tuner.py b/machine/translation/thot/simplex_model_weight_tuner.py index 77b3cd8..3491732 100644 --- a/machine/translation/thot/simplex_model_weight_tuner.py +++ b/machine/translation/thot/simplex_model_weight_tuner.py @@ -83,11 +83,8 @@ def _generate_translations( try: model = load_smt_model(self._word_alignment_model_type, parameters) decoder = load_smt_decoder(model, parameters) - if decoder is not None: - translations = decoder.translate_batch([to_sentence(s) for s in source_corpus]) - return [to_target_tokens(t.target) for t in translations] - else: - raise ValueError("Decoder could not be loaded.") + translations = decoder.translate_batch([to_sentence(s) for s in source_corpus]) + return [to_target_tokens(t.target) for t in translations] finally: if decoder is not None: decoder.clear() diff --git a/machine/translation/thot/thot_smt_model_trainer.py b/machine/translation/thot/thot_smt_model_trainer.py index a2d151e..097baed 100644 --- a/machine/translation/thot/thot_smt_model_trainer.py +++ b/machine/translation/thot/thot_smt_model_trainer.py @@ -489,11 +489,7 @@ def _train_tune_corpus( for i in range(len(tune_source_corpus)): if i > 0: progress(ProgressStatus.from_step(i, len(tune_source_corpus))) - if decoder is None or smt_model is None: - raise RuntimeError("Decoder or SMT model is None") decoder.train_sentence_pair(to_sentence(tune_source_corpus[i]), to_sentence(tune_target_corpus[i])) - if smt_model is None: - raise RuntimeError("SMT model is None") smt_model.print_translation_model(parameters.translation_model_filename_prefix) smt_model.print_language_model(parameters.language_model_filename_prefix) progress(ProgressStatus.from_step(len(tune_source_corpus), len(tune_source_corpus))) diff --git a/machine/translation/thot/thot_word_alignment_model_trainer.py b/machine/translation/thot/thot_word_alignment_model_trainer.py index 8bc4527..f97579c 100644 --- a/machine/translation/thot/thot_word_alignment_model_trainer.py +++ b/machine/translation/thot/thot_word_alignment_model_trainer.py @@ -80,8 +80,6 @@ def __init__( if model_type >= ThotWordAlignmentModelType.IBM2: if parameters.get_hmm_iteration_count(model_type) > 0: ibm2_or_hmm = ta.HmmAlignmentModel(ibm1) - if ibm2_or_hmm is None: - raise ValueError("ibm2_or_hmm should not be None") if parameters.hmm_p0 is not None: ibm2_or_hmm.hmm_p0 = parameters.hmm_p0 if parameters.hmm_lexical_smoothing_factor is not None: @@ -100,8 +98,6 @@ def __init__( and parameters.get_ibm3_iteration_count(model_type) > 0 ): ibm3 = ta.Ibm3AlignmentModel(ibm2_or_hmm) - if ibm3 is None: - raise ValueError("ibm3 should not be None") if parameters.ibm3_fertility_smoothing_factor is not None: ibm3.fertility_smoothing_factor = parameters.ibm3_fertility_smoothing_factor if parameters.ibm3_count_threshold is not None: diff --git a/poetry.lock b/poetry.lock index bb7168a..1ec8e1c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -439,17 +439,17 @@ css = ["tinycss2 (>=1.1.0,<1.3)"] [[package]] name = "boto3" -version = "1.35.47" +version = "1.35.48" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" files = [ - {file = "boto3-1.35.47-py3-none-any.whl", hash = "sha256:0b307f685875e9c7857ce21c0d3050d8d4f3778455a6852d5f98ac75194b400e"}, - {file = "boto3-1.35.47.tar.gz", hash = "sha256:65b808e4cf1af8c2f405382d53656a0d92eee8f85c7388c43d64c7a5571b065f"}, + {file = "boto3-1.35.48-py3-none-any.whl", hash = "sha256:60889bb6e21f0af662ac9404e00125d3b8a5808f190e89462e5ddf73604adfc1"}, + {file = "boto3-1.35.48.tar.gz", hash = "sha256:5007a5cdd09e4db9309adf2ee090455a34ae639bd10a68a1fefca72cd77070fc"}, ] [package.dependencies] -botocore = ">=1.35.47,<1.36.0" +botocore = ">=1.35.48,<1.36.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.10.0,<0.11.0" @@ -458,13 +458,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.35.47" +version = "1.35.48" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" files = [ - {file = "botocore-1.35.47-py3-none-any.whl", hash = "sha256:05f4493119a96799ff84d43e78691efac3177e1aec8840cca99511de940e342a"}, - {file = "botocore-1.35.47.tar.gz", hash = "sha256:f8f703463d3cd8b6abe2bedc443a7ab29f0e2ff1588a2e83164b108748645547"}, + {file = "botocore-1.35.48-py3-none-any.whl", hash = "sha256:34fa25fd717208b05745e60f271a39636108fa87a3512fbca18e7e6f787a3239"}, + {file = "botocore-1.35.48.tar.gz", hash = "sha256:3e766cc251053c9ef98542fdf225381ed58531769c3811a6282bd7247f7e2bdf"}, ] [package.dependencies] @@ -736,22 +736,22 @@ toml = ["tomli"] [[package]] name = "datasets" -version = "3.0.2" +version = "2.21.0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" files = [ - {file = "datasets-3.0.2-py3-none-any.whl", hash = "sha256:220bfbea0be9bf81d121bd2ac76fe4ef3f7defe0e8586ce1e7f66dcaaf69f88d"}, - {file = "datasets-3.0.2.tar.gz", hash = "sha256:07204c389ce0491ef3ad50dd79966d3fd40422a12b831cf84a117323ac74fbc1"}, + {file = "datasets-2.21.0-py3-none-any.whl", hash = "sha256:25e4e097110ce28824b746a107727ada94024cba11db8bc588d468414692b65a"}, + {file = "datasets-2.21.0.tar.gz", hash = "sha256:998f85a8460f1bd982e5bd058f8a0808eef424249e3df1e8cdd594ccd0dc8ba2"}, ] [package.dependencies] aiohttp = "*" dill = ">=0.3.0,<0.3.9" filelock = "*" -fsspec = {version = ">=2023.1.0,<=2024.9.0", extras = ["http"]} -huggingface-hub = ">=0.23.0" -multiprocess = "<0.70.17" +fsspec = {version = ">=2023.1.0,<=2024.6.1", extras = ["http"]} +huggingface-hub = ">=0.21.2" +multiprocess = "*" numpy = ">=1.17" packaging = "*" pandas = "*" @@ -762,17 +762,19 @@ tqdm = ">=4.66.3" xxhash = "*" [package.extras] +apache-beam = ["apache-beam (>=2.26.0)"] audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "transformers", "transformers (>=4.42.0)", "typing-extensions (>=4.6.1)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] +metrics-tests = ["Werkzeug (>=1.0.1)", "accelerate", "bert-score (>=0.3.6)", "jiwer", "langdetect", "mauve-text", "nltk (<3.8.2)", "requests-file (>=1.5.1)", "rouge-score", "sacrebleu", "sacremoses", "scikit-learn", "scipy", "sentencepiece", "seqeval", "six (>=1.15.0,<1.16.0)", "spacy (>=3.0.0)", "texttable (>=1.6.3)", "tldextract", "tldextract (>=3.1.0)", "toml (>=0.10.1)", "typer (<0.5.0)"] quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "transformers (>=4.42.0)", "typing-extensions (>=4.6.1)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "typing-extensions (>=4.6.1)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] @@ -1068,13 +1070,13 @@ files = [ [[package]] name = "fsspec" -version = "2024.9.0" +version = "2024.6.1" description = "File-system specification" optional = false python-versions = ">=3.8" files = [ - {file = "fsspec-2024.9.0-py3-none-any.whl", hash = "sha256:a0947d552d8a6efa72cc2c730b12c41d043509156966cca4fb157b0f2a0c574b"}, - {file = "fsspec-2024.9.0.tar.gz", hash = "sha256:4b0afb90c2f21832df142f292649035d80b421f60a9e1c027802e5a0da2b04e8"}, + {file = "fsspec-2024.6.1-py3-none-any.whl", hash = "sha256:3cb443f8bcd2efb31295a5b9fdb02aee81d8452c80d28f97a6d0959e6cee101e"}, + {file = "fsspec-2024.6.1.tar.gz", hash = "sha256:fad7d7e209dd4c1208e3bbfda706620e0da5142bebbd9c384afb95b07e798e49"}, ] [package.dependencies] @@ -3653,19 +3655,19 @@ crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"] [[package]] name = "sacremoses" -version = "0.1.1" +version = "0.0.53" description = "SacreMoses" optional = false -python-versions = ">=3.8" +python-versions = "*" files = [ - {file = "sacremoses-0.1.1-py3-none-any.whl", hash = "sha256:31e04c98b169bfd902144824d191825cd69220cdb4ae4bcf1ec58a7db5587b1a"}, - {file = "sacremoses-0.1.1.tar.gz", hash = "sha256:b6fd5d3a766b02154ed80b962ddca91e1fd25629c0978c7efba21ebccf663934"}, + {file = "sacremoses-0.0.53.tar.gz", hash = "sha256:43715868766c643b35de4b8046cce236bfe59a7fa88b25eaf6ddf02bacf53a7a"}, ] [package.dependencies] click = "*" joblib = "*" regex = "*" +six = "*" tqdm = "*" [[package]] @@ -4781,4 +4783,4 @@ thot = ["sil-thot"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "d5b3124b114cf521123dd3f9d65b5cc913054b671d7061e26c720ae0b878a3c7" +content-hash = "a32c0f7ba2fd188daf88de61a4f57d64124491bb617461a4042bd4f8860f7dfa" diff --git a/pyproject.toml b/pyproject.toml index 8cbb29f..e38c9f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ norecursedirs = "tests/testutils" [tool.pyright] typeCheckingMode = "basic" extraPaths = ["tests"] +reportMissingModuleSource = false [tool.poetry] name = "sil-machine" @@ -61,11 +62,11 @@ charset-normalizer = "^2.1.1" sentencepiece = "^0.2.0" -sil-thot = "^3.4.4" +sil-thot = "^3.4.5" -transformers = "^4.38.0, <4.46.0" -datasets = "^3.0.0" -sacremoses = "^0.1.0" +transformers = ">=4.38.0, <4.46.0" +datasets = "^2.4.0" +sacremoses = "^0.0.53" clearml = { extras = ["s3"], version = "^1.13.1" } botocore = "^1.35.41"