From 20ee9f237ec11fbb7422332fe0e83ae7180095d0 Mon Sep 17 00:00:00 2001 From: Mathieu Bernard Date: Fri, 24 Jul 2020 12:15:21 +0200 Subject: [PATCH] improved comments --- phonemizer/backend/espeak.py | 23 ++++++++++++++--------- phonemizer/utils.py | 23 ++++++++++++++++------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/phonemizer/backend/espeak.py b/phonemizer/backend/espeak.py index 5237470..387c11c 100644 --- a/phonemizer/backend/espeak.py +++ b/phonemizer/backend/espeak.py @@ -167,7 +167,6 @@ def __init__(self, language, 'lang_switch argument "{}" invalid, must be in {}' .format(language_switch, ", ".join(valid_lang_switch))) self._lang_switch = language_switch - # self._lang_switch_list = [] self._with_stress = with_stress @@ -201,18 +200,23 @@ def phonemize(self, text, separator=default_separator, log_storage = self.logger self.logger = None + # divide the input text in chunks, each chunk being processed in a + # separate job text_chunks = chunks(text, njobs) + + # offset used below to recover the line numbers in the input text + # wrt the chunks offset = [0] + cumsum( - [c.count('\n') + 1 for c in text_chunks[:-1]]) + (c.count('\n') + 1 for c in text_chunks[:-1])) - # we have here a list of phonemized chunks, output is a list of - # (text, lang_switches) + # we have here a list of (phonemized chunk, lang_switches) output = joblib.Parallel(n_jobs=njobs)( joblib.delayed(self._phonemize_aux)(t, separator, strip) for t in text_chunks) - # flatten them in a single list. For language switches lines we - # need to add an offset for each text chunk + # flatten both the phonemized chunks and language switches in a + # list. For language switches lines we need to add an offset to + # have the correct line numbers wrt the input text. 
text = list(itertools.chain(*(chunk[0] for chunk in output))) lang_switches = [chunk[1] for chunk in output] for i in range(len(lang_switches)): @@ -223,11 +227,12 @@ def phonemize(self, text, separator=default_separator, # restore the log as it was before parallel processing self.logger = log_storage + # warn the user if language switches occurred during phonemization self._warn_on_lang_switch(lang_switches) - result = self._phonemize_postprocess( - text, text_type, punctuation_marks) - return result + # finally restore the punctuation + return self._phonemize_postprocess( + text, text_type, punctuation_marks) def _command(self, fname): return ( diff --git a/phonemizer/utils.py b/phonemizer/utils.py index 31edf75..2633f01 100644 --- a/phonemizer/utils.py +++ b/phonemizer/utils.py @@ -30,24 +30,33 @@ def cumsum(l): def str2list(s): - """Returns the string `s` as a list of lines""" + """Returns the string `s` as a list of lines, split by \n""" return s.strip().split('\n') if isinstance(s, six.string_types) else s def list2str(s): - """Returns the list of lines `s` as a single string""" + """Returns the list of lines `s` as a single string separated by \n""" return '\n'.join(s) if not isinstance(s, six.string_types) else s def chunks(text, n): - """Return `n` equally sized chunks of a `text` + """Return a maximum of `n` equally sized chunks of a `text` - `n` must be an integer greater than 0. + This method is useful when phonemizing a single text on multiple jobs. - Only the n-1 first chunks have equal size. The last chunk can be longer. - The input `text` can be a list or a string. Return a list of `n` strings. + The exact number of chunks returned is `m = min(n, len(str2list(text)))`. + Only the m-1 first chunks have equal size. The last chunk can be longer. + The input `text` can be a list or a string. Return a list of `m` strings. - This method is usefull when phonemizing a single text on multiple jobs. 
+ Parameters + ---------- + text (str or list) : The text to divide in chunks + + n (int) : The number of chunks to build, must be an integer greater than 0. + + Returns + ------- + The chunked text as a list of str. """ text = str2list(text)