sillsdev · TaperChipmunk32 · Oct 29, 2024 · Oct 17, 2024
diff --git a/.devcontainer/dockerfile b/.devcontainer/dockerfile
@@ -1,13 +1,14 @@
 #compatibility with TensorFlow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
-ARG PYTHON_VERSION=3.8
-ARG UBUNTU_VERSION=focal
+ARG PYTHON_VERSION=3.9
+ARG UBUNTU_VERSION=noble
 ARG POETRY_VERSION=1.6.1
-ARG CUDA_VERSION=11.2.2-cudnn8-runtime-ubuntu20.04
+ARG CUDA_VERSION=12.6.1-base-ubuntu24.04
 
 FROM nvidia/cuda:$CUDA_VERSION
 ARG PYTHON_VERSION
 ARG POETRY_VERSION
 
+ENV POETRY_VENV=/opt/poetry-venv
 ENV PIP_DISABLE_PIP_VERSION_CHECK=on
 ENV TZ=America/New_York
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
@@ -33,7 +34,8 @@ RUN ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python3  & \
 # Install python packages
 RUN pip install -U pip setuptools \
     && pip install poetry==${POETRY_VERSION} black pipenv virtualenv clearml \
-    && poetry config virtualenvs.in-project true
+    && poetry config virtualenvs.in-project true \
+    && pip install cffi
 
 COPY ./.devcontainer/clearml.conf /root/clearml.conf
 

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -14,16 +14,16 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-12, windows-latest]
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     defaults:
       run:
         shell: bash
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
         id: setup-python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install Poetry
@@ -34,7 +34,7 @@ jobs:
           installer-parallel: true
       - name: Load cached venv
         id: cached-poetry-dependencies
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: .venv
           key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
@@ -55,7 +55,7 @@ jobs:
           node-version: "14"
       - name: Lint with pyright
         run: |
-          npm install -g pyright@1.1.362
+          npm install -g pyright@1.1.386
           poetry run pyright
       - name: Test with pytest
         run: poetry run pytest --cov --cov-report=xml
@@ -70,12 +70,12 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python 3.8
+      - uses: actions/checkout@v4
+      - name: Set up Python 3.9
         id: setup-python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
-          python-version: "3.8"
+          python-version: "3.9"
       - name: Install Poetry
         uses: snok/install-poetry@v1
         with:
@@ -84,14 +84,14 @@ jobs:
           installer-parallel: true
       - name: Load cached venv
         id: cached-poetry-dependencies
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: .venv
           key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
       - name: Build
         run: poetry build
       - name: Upload package
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: wheel
           path: dist/*.whl

diff --git a/dockerfile b/dockerfile
@@ -1,8 +1,8 @@
 #compatibility with TensorFlow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
-ARG PYTHON_VERSION=3.11
-ARG UBUNTU_VERSION=focal
+ARG PYTHON_VERSION=3.12
+ARG UBUNTU_VERSION=noble
 ARG POETRY_VERSION=1.6.1
-ARG CUDA_VERSION=11.2.2-cudnn8-runtime-ubuntu20.04
+ARG CUDA_VERSION=12.6.1-base-ubuntu24.04
 
 FROM python:$PYTHON_VERSION-slim as builder
 ARG POETRY_VERSION
@@ -35,13 +35,11 @@ WORKDIR /root
 
 RUN apt-get update && \
     apt-get install --no-install-recommends -y software-properties-common && \
-    add-apt-repository ppa:deadsnakes/ppa -y && \
     apt-get update && \
     apt-get install --no-install-recommends -y \
     curl \
     python$PYTHON_VERSION \
-    python$PYTHON_VERSION-distutils \
-# these are needed for ClearML
+    # these are needed for ClearML
     git libsm6 libxext6 libxrender-dev libglib2.0-0 && \
     rm -rf /var/lib/apt/lists/* && \
     apt-get clean

diff --git a/dockerfile.cpu_only b/dockerfile.cpu_only
@@ -1,6 +1,6 @@
 #compatibility with TensorFlow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
-ARG PYTHON_VERSION=3.11
-ARG UBUNTU_VERSION=focal
+ARG PYTHON_VERSION=3.12
+ARG UBUNTU_VERSION=noble
 ARG POETRY_VERSION=1.6.1
 
 FROM python:$PYTHON_VERSION-slim AS builder

diff --git a/machine/corpora/parallel_text_corpus.py b/machine/corpora/parallel_text_corpus.py
@@ -490,7 +490,7 @@ def iterable() -> Iterable[Tuple[Union[str, int], dict]]:
                         example[alignment_column] = {source_lang: src_indices, target_lang: trg_indices}
                     yield key, example
 
-        return IterableDataset(ExamplesIterable(iterable, {}), info, split)
+        return IterableDataset(ExamplesIterable(iterable, {}), info, split)  # type: ignore
 
 
 class _TransformParallelTextCorpus(ParallelTextCorpus):
@@ -617,8 +617,9 @@ def count(self, include_empty: bool = True, text_ids: Optional[Iterable[str]] =
             if include_empty:
                 return len(self._df)
             return len(self._df[(self._df[self._source_column] != "") & (self._df[self._target_column] != "")])
-        return len(self._df[self._df[self._source_column].isin(set(text_ids))]) & (
-            len(self._df[self._target_column].isin(set(text_ids)))
+        text_ids = list(text_ids)  # materialize: the iterable is consumed twice below
+        return len(self._df[self._df[self._source_column].isin(text_ids)]) & (
+            len(self._df[self._target_column].isin(text_ids))  # NOTE(review): int & int here is bitwise; confirm
         )
 
     def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[ParallelTextRow, None, None]:

diff --git a/machine/corpora/usfm_text_base.py b/machine/corpora/usfm_text_base.py
@@ -68,9 +68,10 @@ def _get_rows(self) -> Generator[TextRow, None, None]:
         return gen(row_collector.rows)
 
     def _read_usfm(self) -> str:
-        with self._create_stream_container() as stream_container, TextIOWrapper(
-            stream_container.open_stream(), encoding=self._encoding, errors="replace"
-        ) as reader:
+        with (
+            self._create_stream_container() as stream_container,
+            TextIOWrapper(stream_container.open_stream(), encoding=self._encoding, errors="replace") as reader,
+        ):
             return reader.read()
 
 

diff --git a/machine/jobs/huggingface/hugging_face_nmt_model_factory.py b/machine/jobs/huggingface/hugging_face_nmt_model_factory.py
@@ -39,7 +39,10 @@ def __init__(self, config: Any) -> None:
             and self._training_args.report_to is not None
             and "clearml" in self._training_args.report_to
         ):
-            self._training_args.report_to.remove("clearml")
+            if isinstance(self._training_args.report_to, list):
+                self._training_args.report_to.remove("clearml")
+            elif isinstance(self._training_args.report_to, str) and self._training_args.report_to == "clearml":
+                self._training_args.report_to = None
 
         # The default of training_args.log_level is passive, so we set log level at info here to have that default.
         transformers_logging.set_verbosity_info()

diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py
@@ -69,9 +69,10 @@ def _train_model(
                 check_canceled()
 
         logger.info("Training NMT model")
-        with progress_reporter.start_next_phase() as phase_progress, self._nmt_model_factory.create_model_trainer(
-            parallel_corpus
-        ) as model_trainer:
+        with (
+            progress_reporter.start_next_phase() as phase_progress,
+            self._nmt_model_factory.create_model_trainer(parallel_corpus) as model_trainer,
+        ):
             model_trainer.train(progress=phase_progress, check_canceled=check_canceled)
             model_trainer.save()
             train_corpus_size = model_trainer.stats.train_corpus_size

diff --git a/machine/jobs/smt_engine_build_job.py b/machine/jobs/smt_engine_build_job.py
@@ -49,17 +49,19 @@ def _train_model(
         check_canceled: Optional[Callable[[], None]],
     ) -> Tuple[int, float]:
 
-        with progress_reporter.start_next_phase() as phase_progress, self._smt_model_factory.create_model_trainer(
-            self._tokenizer, parallel_corpus
-        ) as trainer:
+        with (
+            progress_reporter.start_next_phase() as phase_progress,
+            self._smt_model_factory.create_model_trainer(self._tokenizer, parallel_corpus) as trainer,
+        ):
             trainer.train(progress=phase_progress, check_canceled=check_canceled)
             trainer.save()
             train_corpus_size = trainer.stats.train_corpus_size
             confidence = trainer.stats.metrics["bleu"] * 100
 
-        with progress_reporter.start_next_phase() as phase_progress, self._smt_model_factory.create_truecaser_trainer(
-            self._tokenizer, target_corpus
-        ) as truecase_trainer:
+        with (
+            progress_reporter.start_next_phase() as phase_progress,
+            self._smt_model_factory.create_truecaser_trainer(self._tokenizer, target_corpus) as truecase_trainer,
+        ):
             truecase_trainer.train(progress=phase_progress, check_canceled=check_canceled)
             truecase_trainer.save()
 

diff --git a/machine/jobs/word_alignment_build_job.py b/machine/jobs/word_alignment_build_job.py
@@ -69,9 +69,10 @@ def _train_model(
         check_canceled: Optional[Callable[[], None]],
     ) -> int:
 
-        with progress_reporter.start_next_phase() as phase_progress, self._word_alignment_model_factory.create_model_trainer(
-            self._tokenizer, parallel_corpus
-        ) as trainer:
+        with (
+            progress_reporter.start_next_phase() as phase_progress,
+            self._word_alignment_model_factory.create_model_trainer(self._tokenizer, parallel_corpus) as trainer,
+        ):
             trainer.train(progress=phase_progress, check_canceled=check_canceled)
             trainer.save()
             train_corpus_size = trainer.stats.train_corpus_size

diff --git a/machine/translation/huggingface/hugging_face_nmt_engine.py b/machine/translation/huggingface/hugging_face_nmt_engine.py
@@ -44,15 +44,21 @@ def __init__(
         self._pipeline_kwargs = pipeline_kwargs
         if isinstance(self._model, PreTrainedModel):
             self._model.eval()
+            self._is_model_owned = False
         else:
             model_config = AutoConfig.from_pretrained(str(self._model), label2id={}, id2label={}, num_labels=0)
             self._model = cast(
                 PreTrainedModel, AutoModelForSeq2SeqLM.from_pretrained(str(self._model), config=model_config)
             )
+            self._is_model_owned = True
         self._tokenizer = AutoTokenizer.from_pretrained(self._model.name_or_path, use_fast=True)
         if isinstance(self._tokenizer, (NllbTokenizer, NllbTokenizerFast)):
             self._mpn = MosesPunctNormalizer()
-            self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]  # type: ignore
+            self._mpn.substitutions = [  # type: ignore
+                (re.compile(r), sub)  # a Pattern's str() is its repr, not the regex source
+                for r, sub in self._mpn.substitutions
+                if isinstance(r, str) and isinstance(sub, str)
+            ]
         else:
             self._mpn = None
 
@@ -93,6 +99,10 @@ def __init__(
             **self._pipeline_kwargs,
         )
 
+    @property
+    def tokenizer(self) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
+        return self._tokenizer
+
     def translate(self, segment: Union[str, Sequence[str]]) -> TranslationResult:
         return self.translate_batch([segment])[0]
 
@@ -161,6 +171,8 @@ def __enter__(self) -> HuggingFaceNmtEngine:
 
     def close(self) -> None:
         del self._pipeline
+        if self._is_model_owned:
+            del self._model
         gc.collect()
         with torch.no_grad():
             torch.cuda.empty_cache()

diff --git a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
@@ -99,7 +99,11 @@ def __init__(
         self._add_unk_src_tokens = add_unk_src_tokens
         self._add_unk_tgt_tokens = add_unk_tgt_tokens
         self._mpn = MosesPunctNormalizer()
-        self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]  # type: ignore
+        self._mpn.substitutions = [  # type: ignore
+            (re.compile(r), sub)  # a Pattern's str() is its repr, not the regex source
+            for r, sub in self._mpn.substitutions
+            if isinstance(r, str) and isinstance(sub, str)
+        ]
         self._stats = TrainStats()
 
     @property
@@ -222,7 +226,8 @@ def add_lang_code_to_tokenizer(tokenizer: Any, lang_code: str):
             )
             lang_id = tokenizer.convert_tokens_to_ids(lang_code)
             tokenizer.lang_code_to_id[lang_code] = lang_id
-            if isinstance(tokenizer, (NllbTokenizer, MBart50Tokenizer, MBartTokenizer)):
+
+            if isinstance(tokenizer, (MBart50Tokenizer, MBartTokenizer)):
                 tokenizer.id_to_lang_code[lang_id] = lang_code
                 tokenizer.fairseq_tokens_to_ids[lang_code] = lang_id
                 tokenizer.fairseq_ids_to_tokens[lang_id] = lang_code
@@ -271,7 +276,7 @@ def add_lang_code_to_tokenizer(tokenizer: Any, lang_code: str):
 
             # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token
             # as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument.
-            forced_bos_token_id = tokenizer.lang_code_to_id[self._tgt_lang]
+            forced_bos_token_id = tokenizer.convert_tokens_to_ids(self._tgt_lang)
             model.config.forced_bos_token_id = forced_bos_token_id
             if model.generation_config is not None:
                 model.generation_config.forced_bos_token_id = forced_bos_token_id
@@ -372,7 +377,7 @@ def save(self) -> None:
 
     def __exit__(self, type: Any, value: Any, traceback: Any) -> None:
         if self._trainer is not None:
-            self._trainer = None
+            del self._trainer
             gc.collect()
             with torch.no_grad():
                 torch.cuda.empty_cache()

diff --git a/machine/translation/thot/thot_smt_model_trainer.py b/machine/translation/thot/thot_smt_model_trainer.py
@@ -129,9 +129,10 @@ def _filter_phrase_table_using_corpus(filename: Path, source_corpus: Sequence[Se
                 j += 1
 
     temp_filename = filename.parent / f"{filename.name}.temp"
-    with filename.open("r", encoding="utf-8-sig") as file, temp_filename.open(
-        "w", encoding="utf-8", newline="\n"
-    ) as temp_file:
+    with (
+        filename.open("r", encoding="utf-8-sig") as file,
+        temp_filename.open("w", encoding="utf-8", newline="\n") as temp_file,
+    ):
         for line in file:
             fields = line.strip().split("|||")
             phrase = fields[1].strip()
@@ -295,9 +296,10 @@ def _write_ngram_counts_file(self, lm_prefix: Path, ngram_size: int, train_corpu
 
     def _write_word_prediction_file(self, lm_prefix: Path, train_corpus: ParallelTextCorpus) -> None:
         rand = Random(self.seed)
-        with (lm_prefix.parent / f"{lm_prefix.name}.wp").open(
-            "w", encoding="utf-8", newline="\n"
-        ) as file, train_corpus.take(100000).get_rows() as rows:
+        with (
+            (lm_prefix.parent / f"{lm_prefix.name}.wp").open("w", encoding="utf-8", newline="\n") as file,
+            train_corpus.take(100000).get_rows() as rows,
+        ):
             for row in sorted(rows, key=lambda _: rand.randint(0, sys.maxsize)):
                 segment_str = " ".join(escape_token(t) for t in row.target_segment)
                 file.write(segment_str + "\n")
@@ -414,9 +416,10 @@ def _generate_best_alignments(
     ) -> None:
         model = create_thot_word_alignment_model(self._word_alignment_model_type)
         model.load(swm_prefix)
-        with filename.open("w", encoding="utf-8", newline="\n") as file, train_corpus.transform(
-            _escape_tokens_row
-        ).get_rows() as rows:
+        with (
+            filename.open("w", encoding="utf-8", newline="\n") as file,
+            train_corpus.transform(_escape_tokens_row).get_rows() as rows,
+        ):
             i = 0
             for row in rows:
                 file.write("# 1\n")