Skip to content

Commit

Permalink
improved comments
Browse files Browse the repository at this point in the history
  • Loading branch information
mmmaat committed Jul 24, 2020
1 parent 5be90ee commit 20ee9f2
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 16 deletions.
23 changes: 14 additions & 9 deletions phonemizer/backend/espeak.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,6 @@ def __init__(self, language,
'lang_switch argument "{}" invalid, must be in {}'
.format(language_switch, ", ".join(valid_lang_switch)))
self._lang_switch = language_switch
# self._lang_switch_list = []

self._with_stress = with_stress

Expand Down Expand Up @@ -201,18 +200,23 @@ def phonemize(self, text, separator=default_separator,
log_storage = self.logger
self.logger = None

# divide the input text in chunks, each chunk being processed in a
# separate job
text_chunks = chunks(text, njobs)

# offset used below to recover the line numbers in the input text
# wrt the chunks
offset = [0] + cumsum(
[c.count('\n') + 1 for c in text_chunks[:-1]])
(c.count('\n') + 1 for c in text_chunks[:-1]))

# we have here a list of phonemized chunks, output is a list of
# (text, lang_switches)
# we have here a list of (phonemized chunk, lang_switches)
output = joblib.Parallel(n_jobs=njobs)(
joblib.delayed(self._phonemize_aux)(t, separator, strip)
for t in text_chunks)

# flatten them in a single list. For language switches lines we
# need to add an offset for each text chunk
# flatten both the phonemized chunks and language switches in a
# list. For language switches lines we need to add an offset to
# have the correct lines numbers wrt the input text.
text = list(itertools.chain(*(chunk[0] for chunk in output)))
lang_switches = [chunk[1] for chunk in output]
for i in range(len(lang_switches)):
Expand All @@ -223,11 +227,12 @@ def phonemize(self, text, separator=default_separator,
# restore the log as it was before parallel processing
self.logger = log_storage

# warn the user if language switches occurred during phonemization
self._warn_on_lang_switch(lang_switches)
result = self._phonemize_postprocess(
text, text_type, punctuation_marks)

return result
# finally restore the punctuation
return self._phonemize_postprocess(
text, text_type, punctuation_marks)

def _command(self, fname):
return (
Expand Down
23 changes: 16 additions & 7 deletions phonemizer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,24 +30,33 @@ def cumsum(l):


def str2list(s):
"""Returns the string `s` as a list of lines"""
"""Returns the string `s` as a list of lines, split by \n"""
return s.strip().split('\n') if isinstance(s, six.string_types) else s


def list2str(s):
"""Returns the list of lines `s` as a single string"""
"""Returns the list of lines `s` as a single string separated by \n"""
return '\n'.join(s) if not isinstance(s, six.string_types) else s


def chunks(text, n):
"""Return `n` equally sized chunks of a `text`
"""Return a maximum of `n` equally sized chunks of a `text`
`n` must be an integer greater than 0.
This method is useful when phonemizing a single text on multiple jobs.
Only the n-1 first chunks have equal size. The last chunk can be longer.
The input `text` can be a list or a string. Return a list of `n` strings.
The exact number of chunks returned is `m = min(n, len(str2list(text)))`.
Only the m-1 first chunks have equal size. The last chunk can be longer.
The input `text` can be a list or a string. Return a list of `m` strings.
This method is useful when phonemizing a single text on multiple jobs.
Parameters
----------
text (str or list) : The text to divide in chunks
n (int) : The number of chunks to build, must be an integer greater than 0.
Returns
-------
The chunked text as a list of str.
"""
text = str2list(text)
Expand Down

0 comments on commit 20ee9f2

Please sign in to comment.