From 5cf97d6219032927289616b8f1cf43cd960598bd Mon Sep 17 00:00:00 2001
From: vince62s
Date: Sat, 5 Oct 2024 19:38:25 +0200
Subject: [PATCH] fix test

---
 eole/transforms/tokenize.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/eole/transforms/tokenize.py b/eole/transforms/tokenize.py
index fe0a27bc..e2b7f451 100644
--- a/eole/transforms/tokenize.py
+++ b/eole/transforms/tokenize.py
@@ -172,9 +172,12 @@ def _tokenize(self, tokens, side="src", is_train=False):
         # This method embeds a custom logic to correctly handle certain placeholders
         # in case the tokenizer doesn't preserve them.
         sentence = " ".join(tokens)
-        delim_list = [mapped_toks[0] for mapped_toks in self.mapped_tokens] + [
-            self.eos_token
-        ]
+        if self.mapped_tokens is not None:
+            delim_list = [mapped_toks[0] for mapped_toks in self.mapped_tokens] + [
+                self.eos_token
+            ]
+        else:
+            delim_list = [self.eos_token]
         pattern = f"({'|'.join(map(re.escape, delim_list))})"
         # Split sentence on EOS and Added-Tokens
         sent_list = re.split(pattern, sentence)