Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update to support Python 3.9-3.12 #131

Merged
merged 1 commit into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions .devcontainer/dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
#compatibility with Tensorflow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
ARG PYTHON_VERSION=3.8
ARG UBUNTU_VERSION=focal
ARG PYTHON_VERSION=3.9
ARG UBUNTU_VERSION=noble
ARG POETRY_VERSION=1.6.1
ARG CUDA_VERSION=11.2.2-cudnn8-runtime-ubuntu20.04
ARG CUDA_VERSION=12.6.1-base-ubuntu24.04

FROM nvidia/cuda:$CUDA_VERSION
ARG PYTHON_VERSION
ARG POETRY_VERSION

ENV POETRY_VENV=/opt/poetry-venv
ENV PIP_DISABLE_PIP_VERSION_CHECK=on
ENV TZ=America/New_York
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
Expand All @@ -33,7 +34,8 @@ RUN ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 & \
# Install python packages
RUN pip install -U pip setuptools \
&& pip install poetry==${POETRY_VERSION} black pipenv virtualenv clearml \
&& poetry config virtualenvs.in-project true
&& poetry config virtualenvs.in-project true \
&& pip install cffi

COPY ./.devcontainer/clearml.conf /root/clearml.conf

Expand Down
22 changes: 11 additions & 11 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,16 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-12, windows-latest]
python-version: ["3.8", "3.9", "3.10", "3.11"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
defaults:
run:
shell: bash

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
id: setup-python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install Poetry
Expand All @@ -34,7 +34,7 @@ jobs:
installer-parallel: true
- name: Load cached venv
id: cached-poetry-dependencies
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: .venv
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
Expand All @@ -55,7 +55,7 @@ jobs:
node-version: "14"
- name: Lint with pyright
run: |
npm install -g pyright@1.1.362
npm install -g pyright@1.1.386
poetry run pyright
- name: Test with pytest
run: poetry run pytest --cov --cov-report=xml
Expand All @@ -70,12 +70,12 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: Set up Python 3.8
- uses: actions/checkout@v4
- name: Set up Python 3.9
id: setup-python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.8"
python-version: "3.9"
- name: Install Poetry
uses: snok/install-poetry@v1
with:
Expand All @@ -84,14 +84,14 @@ jobs:
installer-parallel: true
- name: Load cached venv
id: cached-poetry-dependencies
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: .venv
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
- name: Build
run: poetry build
- name: Upload package
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: wheel
path: dist/*.whl
Expand Down
10 changes: 4 additions & 6 deletions dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#compatibility with Tensorflow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
ARG PYTHON_VERSION=3.11
ARG UBUNTU_VERSION=focal
ARG PYTHON_VERSION=3.12
ARG UBUNTU_VERSION=noble
ARG POETRY_VERSION=1.6.1
ARG CUDA_VERSION=11.2.2-cudnn8-runtime-ubuntu20.04
ARG CUDA_VERSION=12.6.1-base-ubuntu24.04

FROM python:$PYTHON_VERSION-slim as builder
ARG POETRY_VERSION
Expand Down Expand Up @@ -35,13 +35,11 @@ WORKDIR /root

RUN apt-get update && \
apt-get install --no-install-recommends -y software-properties-common && \
add-apt-repository ppa:deadsnakes/ppa -y && \
apt-get update && \
apt-get install --no-install-recommends -y \
curl \
python$PYTHON_VERSION \
python$PYTHON_VERSION-distutils \
# these are needed for ClearML
# these are needed for ClearML
git libsm6 libxext6 libxrender-dev libglib2.0-0 && \
rm -rf /var/lib/apt/lists/* && \
apt-get clean
Expand Down
4 changes: 2 additions & 2 deletions dockerfile.cpu_only
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#compatibility with Tensorflow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
ARG PYTHON_VERSION=3.11
ARG UBUNTU_VERSION=focal
ARG PYTHON_VERSION=3.12
ARG UBUNTU_VERSION=noble
ARG POETRY_VERSION=1.6.1

FROM python:$PYTHON_VERSION-slim AS builder
Expand Down
7 changes: 4 additions & 3 deletions machine/corpora/parallel_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ def iterable() -> Iterable[Tuple[Union[str, int], dict]]:
example[alignment_column] = {source_lang: src_indices, target_lang: trg_indices}
yield key, example

return IterableDataset(ExamplesIterable(iterable, {}), info, split)
return IterableDataset(ExamplesIterable(iterable, {}), info, split) # type: ignore


class _TransformParallelTextCorpus(ParallelTextCorpus):
Expand Down Expand Up @@ -617,8 +617,9 @@ def count(self, include_empty: bool = True, text_ids: Optional[Iterable[str]] =
if include_empty:
return len(self._df)
return len(self._df[(self._df[self._source_column] != "") & (self._df[self._target_column] != "")])
return len(self._df[self._df[self._source_column].isin(set(text_ids))]) & (
len(self._df[self._target_column].isin(set(text_ids)))
text_ids = list(text_ids)
return len(self._df[self._df[self._source_column].isin(text_ids)]) & (
len(self._df[self._target_column].isin(text_ids))
)

def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[ParallelTextRow, None, None]:
Expand Down
7 changes: 4 additions & 3 deletions machine/corpora/usfm_text_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,10 @@ def _get_rows(self) -> Generator[TextRow, None, None]:
return gen(row_collector.rows)

def _read_usfm(self) -> str:
with self._create_stream_container() as stream_container, TextIOWrapper(
stream_container.open_stream(), encoding=self._encoding, errors="replace"
) as reader:
with (
self._create_stream_container() as stream_container,
TextIOWrapper(stream_container.open_stream(), encoding=self._encoding, errors="replace") as reader,
):
return reader.read()


Expand Down
5 changes: 4 additions & 1 deletion machine/jobs/huggingface/hugging_face_nmt_model_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,10 @@ def __init__(self, config: Any) -> None:
and self._training_args.report_to is not None
and "clearml" in self._training_args.report_to
):
self._training_args.report_to.remove("clearml")
if isinstance(self._training_args.report_to, list):
self._training_args.report_to.remove("clearml")
elif isinstance(self._training_args.report_to, str) and self._training_args.report_to == "clearml":
self._training_args.report_to = None

# The default of training_args.log_level is passive, so we set log level at info here to have that default.
transformers_logging.set_verbosity_info()
Expand Down
7 changes: 4 additions & 3 deletions machine/jobs/nmt_engine_build_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,10 @@ def _train_model(
check_canceled()

logger.info("Training NMT model")
with progress_reporter.start_next_phase() as phase_progress, self._nmt_model_factory.create_model_trainer(
parallel_corpus
) as model_trainer:
with (
progress_reporter.start_next_phase() as phase_progress,
self._nmt_model_factory.create_model_trainer(parallel_corpus) as model_trainer,
):
model_trainer.train(progress=phase_progress, check_canceled=check_canceled)
model_trainer.save()
train_corpus_size = model_trainer.stats.train_corpus_size
Expand Down
14 changes: 8 additions & 6 deletions machine/jobs/smt_engine_build_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,17 +49,19 @@ def _train_model(
check_canceled: Optional[Callable[[], None]],
) -> Tuple[int, float]:

with progress_reporter.start_next_phase() as phase_progress, self._smt_model_factory.create_model_trainer(
self._tokenizer, parallel_corpus
) as trainer:
with (
progress_reporter.start_next_phase() as phase_progress,
self._smt_model_factory.create_model_trainer(self._tokenizer, parallel_corpus) as trainer,
):
trainer.train(progress=phase_progress, check_canceled=check_canceled)
trainer.save()
train_corpus_size = trainer.stats.train_corpus_size
confidence = trainer.stats.metrics["bleu"] * 100

with progress_reporter.start_next_phase() as phase_progress, self._smt_model_factory.create_truecaser_trainer(
self._tokenizer, target_corpus
) as truecase_trainer:
with (
progress_reporter.start_next_phase() as phase_progress,
self._smt_model_factory.create_truecaser_trainer(self._tokenizer, target_corpus) as truecase_trainer,
):
truecase_trainer.train(progress=phase_progress, check_canceled=check_canceled)
truecase_trainer.save()

Expand Down
7 changes: 4 additions & 3 deletions machine/jobs/word_alignment_build_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,10 @@ def _train_model(
check_canceled: Optional[Callable[[], None]],
) -> int:

with progress_reporter.start_next_phase() as phase_progress, self._word_alignment_model_factory.create_model_trainer(
self._tokenizer, parallel_corpus
) as trainer:
with (
progress_reporter.start_next_phase() as phase_progress,
self._word_alignment_model_factory.create_model_trainer(self._tokenizer, parallel_corpus) as trainer,
):
trainer.train(progress=phase_progress, check_canceled=check_canceled)
trainer.save()
train_corpus_size = trainer.stats.train_corpus_size
Expand Down
14 changes: 13 additions & 1 deletion machine/translation/huggingface/hugging_face_nmt_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,21 @@ def __init__(
self._pipeline_kwargs = pipeline_kwargs
if isinstance(self._model, PreTrainedModel):
self._model.eval()
self._is_model_owned = False
else:
model_config = AutoConfig.from_pretrained(str(self._model), label2id={}, id2label={}, num_labels=0)
self._model = cast(
PreTrainedModel, AutoModelForSeq2SeqLM.from_pretrained(str(self._model), config=model_config)
)
self._is_model_owned = True
self._tokenizer = AutoTokenizer.from_pretrained(self._model.name_or_path, use_fast=True)
if isinstance(self._tokenizer, (NllbTokenizer, NllbTokenizerFast)):
self._mpn = MosesPunctNormalizer()
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions] # type: ignore
self._mpn.substitutions = [
(str(re.compile(r)), sub)
for r, sub in self._mpn.substitutions
if isinstance(r, str) and isinstance(sub, str)
]
else:
self._mpn = None

Expand Down Expand Up @@ -93,6 +99,10 @@ def __init__(
**self._pipeline_kwargs,
)

@property
def tokenizer(self) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
return self._tokenizer

def translate(self, segment: Union[str, Sequence[str]]) -> TranslationResult:
return self.translate_batch([segment])[0]

Expand Down Expand Up @@ -161,6 +171,8 @@ def __enter__(self) -> HuggingFaceNmtEngine:

def close(self) -> None:
del self._pipeline
if self._is_model_owned:
del self._model
gc.collect()
with torch.no_grad():
torch.cuda.empty_cache()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,11 @@ def __init__(
self._add_unk_src_tokens = add_unk_src_tokens
self._add_unk_tgt_tokens = add_unk_tgt_tokens
self._mpn = MosesPunctNormalizer()
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions] # type: ignore
self._mpn.substitutions = [
(str(re.compile(r)), sub)
for r, sub in self._mpn.substitutions
if isinstance(r, str) and isinstance(sub, str)
]
self._stats = TrainStats()

@property
Expand Down Expand Up @@ -222,7 +226,8 @@ def add_lang_code_to_tokenizer(tokenizer: Any, lang_code: str):
)
lang_id = tokenizer.convert_tokens_to_ids(lang_code)
tokenizer.lang_code_to_id[lang_code] = lang_id
if isinstance(tokenizer, (NllbTokenizer, MBart50Tokenizer, MBartTokenizer)):

if isinstance(tokenizer, (MBart50Tokenizer, MBartTokenizer)):
tokenizer.id_to_lang_code[lang_id] = lang_code
tokenizer.fairseq_tokens_to_ids[lang_code] = lang_id
tokenizer.fairseq_ids_to_tokens[lang_id] = lang_code
Expand Down Expand Up @@ -271,7 +276,7 @@ def add_lang_code_to_tokenizer(tokenizer: Any, lang_code: str):

# For multilingual translation models like mBART-50 and M2M100 we need to force the target language token
# as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument.
forced_bos_token_id = tokenizer.lang_code_to_id[self._tgt_lang]
forced_bos_token_id = tokenizer.convert_tokens_to_ids(self._tgt_lang)
model.config.forced_bos_token_id = forced_bos_token_id
if model.generation_config is not None:
model.generation_config.forced_bos_token_id = forced_bos_token_id
Expand Down Expand Up @@ -372,7 +377,7 @@ def save(self) -> None:

def __exit__(self, type: Any, value: Any, traceback: Any) -> None:
if self._trainer is not None:
self._trainer = None
del self._trainer
gc.collect()
with torch.no_grad():
torch.cuda.empty_cache()
Expand Down
21 changes: 12 additions & 9 deletions machine/translation/thot/thot_smt_model_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,10 @@ def _filter_phrase_table_using_corpus(filename: Path, source_corpus: Sequence[Se
j += 1

temp_filename = filename.parent / f"{filename.name}.temp"
with filename.open("r", encoding="utf-8-sig") as file, temp_filename.open(
"w", encoding="utf-8", newline="\n"
) as temp_file:
with (
filename.open("r", encoding="utf-8-sig") as file,
temp_filename.open("w", encoding="utf-8", newline="\n") as temp_file,
):
for line in file:
fields = line.strip().split("|||")
phrase = fields[1].strip()
Expand Down Expand Up @@ -295,9 +296,10 @@ def _write_ngram_counts_file(self, lm_prefix: Path, ngram_size: int, train_corpu

def _write_word_prediction_file(self, lm_prefix: Path, train_corpus: ParallelTextCorpus) -> None:
rand = Random(self.seed)
with (lm_prefix.parent / f"{lm_prefix.name}.wp").open(
"w", encoding="utf-8", newline="\n"
) as file, train_corpus.take(100000).get_rows() as rows:
with (
(lm_prefix.parent / f"{lm_prefix.name}.wp").open("w", encoding="utf-8", newline="\n") as file,
train_corpus.take(100000).get_rows() as rows,
):
for row in sorted(rows, key=lambda _: rand.randint(0, sys.maxsize)):
segment_str = " ".join(escape_token(t) for t in row.target_segment)
file.write(segment_str + "\n")
Expand Down Expand Up @@ -414,9 +416,10 @@ def _generate_best_alignments(
) -> None:
model = create_thot_word_alignment_model(self._word_alignment_model_type)
model.load(swm_prefix)
with filename.open("w", encoding="utf-8", newline="\n") as file, train_corpus.transform(
_escape_tokens_row
).get_rows() as rows:
with (
filename.open("w", encoding="utf-8", newline="\n") as file,
train_corpus.transform(_escape_tokens_row).get_rows() as rows,
):
i = 0
for row in rows:
file.write("# 1\n")
Expand Down
Loading
Loading