Version 1.5.1: Minor bugfixes and improvements. (#48)
* Improve regex efficiency.

* Fix an edge case for SRT subtitles.

* Add LLM-based translation quality evaluator.

* Fix an issue that caused format checking for context generation to always fail.

* Update noise-suppression test suites.

* Bump up to version 1.5.1
zh-plus authored Jul 1, 2024
1 parent b4cc646 commit d9dde29
Showing 14 changed files with 224 additions and 36 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,13 @@
## 1.5.1

Minor bugfixes and improvements.

### Other Changes:

- Fix the default check_format always returning False.
- Fix edge cases in SRT file generation.
- Prepare translation evaluator for benchmarking.

## 1.5.0

This update adds Gemini model support for translation.
1 change: 1 addition & 0 deletions README.md
@@ -252,6 +252,7 @@ To maintain context between translation segments, the process is sequential for
languages.
- [x] [Feature] Add custom OpenAI & Anthropic endpoint support.
- [ ] [Feature] Add local translation model support (e.g. [SakuraLLM](https://github.com/SakuraLLM/Sakura-13B-Galgame)).
- [ ] [Quality] Construct translation quality benchmark test for each patch.
- [ ] [Others] Add transcribed examples.
- [ ] Song
- [ ] Podcast
36 changes: 35 additions & 1 deletion openlrc/agents.py
@@ -1,14 +1,17 @@
# Copyright (C) 2024. Hao Zheng
# All rights reserved.
import abc
import json
import re
from typing import Optional, Tuple, List, Type, Union

from json_repair import repair_json

from openlrc.chatbot import route_chatbot, GPTBot, ClaudeBot
from openlrc.context import TranslationContext, TranslateInfo
from openlrc.logger import logger
from openlrc.prompter import ChunkedTranslatePrompter, ContextReviewPrompter, ProofreaderPrompter, PROOFREAD_PREFIX, \
ContextReviewerValidatePrompter
ContextReviewerValidatePrompter, TranslationEvaluatorPrompter
from openlrc.validators import POTENTIAL_PREFIX_COMBOS


@@ -211,3 +214,34 @@ def proofread(self, texts: List[str], translations, context: TranslationContext)
        resp = self.chatbot.message(messages_list, output_checker=self.prompter.check_format)[0]
        revised = self._parse_responses(resp)
        return revised


class TranslationEvaluatorAgent(Agent):
    TEMPERATURE = 0.95

    def __init__(self, chatbot_model: str = 'gpt-3.5-turbo', fee_limit: float = 0.3, proxy: str = None,
                 base_url_config: Optional[dict] = None):
        super().__init__()
        self.chatbot = self._initialize_chatbot(chatbot_model, fee_limit, proxy, base_url_config)
        self.prompter = TranslationEvaluatorPrompter()

    def evaluate(self, src_texts, target_texts) -> dict:
        messages_list = [
            {'role': 'system', 'content': self.prompter.system()},
            {'role': 'user', 'content': self.prompter.user(src_texts, target_texts)},
        ]
        resp = self.chatbot.message(messages_list, stop_sequences=[self.prompter.stop_sequence])[0]
        content = self.chatbot.get_content(resp)

        # Repair potentially broken JSON before parsing.
        content = repair_json(content)

        # The response is expected to be a JSON object keyed by criterion:
        # accuracy, fluency, completeness, cultural adaptation, consistency.
        json_resp = json.loads(content)

        return json_resp
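For reference, a minimal usage sketch of the new agent (not part of the commit); the sample texts are illustrative, and a valid API key for the default chatbot model is assumed:

```python
# Hypothetical usage sketch of TranslationEvaluatorAgent; the sample texts are
# illustrative and a valid API key for the default chatbot model is assumed.
from openlrc.agents import TranslationEvaluatorAgent

agent = TranslationEvaluatorAgent(fee_limit=0.3)

src_texts = ['Those who resist change may find themselves left behind.']
target_texts = ['那些抗拒变化的人可能会发现自己被抛在后面。']

# evaluate() returns the repaired, parsed JSON dict keyed by criterion,
# e.g. {'accuracy': {'score': ..., 'justification': ...}, ...}.
scores = agent.evaluate(src_texts, target_texts)
print(scores['accuracy']['score'], scores['fluency']['justification'])
```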
2 changes: 1 addition & 1 deletion openlrc/context.py
@@ -19,7 +19,7 @@ def update(self, **args):

@property
def non_glossary_guideline(self) -> str:
cleaned_text = re.sub(r'### Glossary(.*?\n)*?### Characters', '### Characters', self.guideline, flags=re.DOTALL)
cleaned_text = re.sub(r'### Glossary.*?### Characters', '### Characters', self.guideline, flags=re.DOTALL)
return cleaned_text
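A quick illustration of what the simplified pattern does (the guideline text below is made up for demonstration); the old `(.*?\n)*?` form matched the same spans but invited heavy backtracking on long guidelines:

```python
# Illustrative only: the guideline text is made up. The simplified pattern
# strips everything between '### Glossary' and '### Characters' with a single
# non-greedy wildcard instead of a nested quantifier.
import re

guideline = (
    '### Glossary\n'
    'LRC: a lyric/subtitle file format\n'
    '### Characters\n'
    'Narrator: neutral tone\n'
)

cleaned = re.sub(r'### Glossary.*?### Characters', '### Characters', guideline, flags=re.DOTALL)
print(cleaned)
# ### Characters
# Narrator: neutral tone
```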


2 changes: 1 addition & 1 deletion openlrc/defaults.py
@@ -2,7 +2,7 @@
# All rights reserved.
from lingua import Language

# Check https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py#L184 for details
# Check https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py for details
default_asr_options = {
"beam_size": 3,
"best_of": 5,
54 changes: 54 additions & 0 deletions openlrc/evaluate.py
@@ -0,0 +1,54 @@
# Copyright (C) 2024. Hao Zheng
# All rights reserved.
import abc

from openlrc.agents import TranslationEvaluatorAgent
from openlrc.logger import logger


class TranslationEvaluator(abc.ABC):
    """
    Base class for all evaluators.
    """

    @abc.abstractmethod
    def evaluate(self, src_texts, target_texts, src_lang, target_lang):
        """
        Evaluate the translated texts.
        :return: The evaluation result.
        """
        raise NotImplementedError()


class LLMTranslationEvaluator(TranslationEvaluator):
    """
    Evaluate the translated texts using large language models.
    """

    def __init__(self, chatbot_model: str = 'gpt-3.5-turbo'):
        self.agent = TranslationEvaluatorAgent(chatbot_model=chatbot_model)
        self.recommended_models = {
            'gpt-4',
            'claude-3-sonnet',
            'claude-3-opus',
            'gemini-1.5-pro'
        }

        # The agent is already constructed above; only warn if the chosen model
        # is not in the recommended list for evaluating translations.
        if not any(chatbot_model.startswith(m) for m in self.recommended_models):
            logger.warning(f'Chatbot model {chatbot_model} is not in the recommended list for evaluating translations.')

    def evaluate(self, src_texts, target_texts, src_lang=None, target_lang=None):
        return self.agent.evaluate(src_texts, target_texts)


class EmbeddingTranslationEvaluator(TranslationEvaluator):
    """
    Evaluate the translated texts using embeddings.
    """

    def evaluate(self, src_texts, target_texts, src_lang, target_lang):
        pass
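`EmbeddingTranslationEvaluator` is left as a stub in this commit; one hypothetical way to fill it in later is cosine similarity over multilingual sentence embeddings. The sentence-transformers dependency and model name below are assumptions for illustration, not part of openlrc:

```python
# Hypothetical sketch only: sentence-transformers is not a dependency of this
# commit, and the model name is an assumption chosen for illustration.
import numpy as np
from sentence_transformers import SentenceTransformer


def embedding_similarity(src_texts, target_texts,
                         model_name='paraphrase-multilingual-MiniLM-L12-v2'):
    model = SentenceTransformer(model_name)
    src_emb = model.encode(src_texts, normalize_embeddings=True)
    tgt_emb = model.encode(target_texts, normalize_embeddings=True)
    # With normalized embeddings, pairwise cosine similarity reduces to a dot product.
    return float(np.mean(np.sum(src_emb * tgt_emb, axis=1)))
```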
6 changes: 3 additions & 3 deletions openlrc/preprocess.py
@@ -59,7 +59,7 @@ def noise_suppression(self, audio_paths: Union[str, Path, List[str], List[Path]]
atten_lim_db = self.options['atten_lim_db']

model, df_state, _ = init_df()
chunk_size = 300 # 5 min
chunk_size = 180 # 3 min

ns_audio_paths = []
for audio_path, output_path in zip(audio_paths, self.output_paths):
@@ -69,7 +69,7 @@ def noise_suppression(self, audio_paths: Union[str, Path, List[str], List[Path]]
if not ns_path.exists():
audio, info = load_audio(audio_path, sr=df_state.sr())

# Split audio into 10 min chunks
# Split audio into 3 min chunks
audio_chunks = [audio[:, i:i + chunk_size * info.sample_rate]
for i in range(0, audio.shape[1], chunk_size * info.sample_rate)]

@@ -79,7 +79,7 @@ def noise_suppression(self, audio_paths: Union[str, Path, List[str], List[Path]]

enhanced = torch.cat(enhanced_chunks, dim=1)

assert enhanced.shape == audio.shape, 'Enhanced audio shape does not match original audio shape.'
assert enhanced.shape == audio.shape, f'Enhanced audio shape does not match original audio shape: {enhanced.shape} != {audio.shape}'

save_audio(ns_path, enhanced, sr=df_state.sr())

80 changes: 76 additions & 4 deletions openlrc/prompter.py
@@ -9,7 +9,7 @@

from openlrc.context import TranslateInfo
from openlrc.validators import ChunkedTranslateValidator, AtomicTranslateValidator, ProofreaderValidator, \
ContextReviewerValidateValidator
ContextReviewerValidateValidator, TranslationEvaluatorValidator

ORIGINAL_PREFIX = 'Original>'
TRANSLATION_PREFIX = 'Translation>'
@@ -91,7 +91,7 @@
The translation should be in a lovely colloquial style and suitable for high-quality subtitles.
I’m going to tip \$1000 for a better translation!
I’m going to tip $1000 for a better translation!
### retry_instructions
There was an issue with the previous translation.
@@ -108,7 +108,7 @@ def check_format(self, user_input: str, generated_content: str) -> bool:
if hasattr(self, 'validator') and self.validator:
return self.validator.validate(user_input, generated_content)
else:
return False
return True


class TranslatePrompter(Prompter, ABC):
@@ -303,7 +303,7 @@ def system(self):

class ContextReviewerValidatePrompter(Prompter):
def __init__(self):
self.validator = ContextReviewerValidateValidator('en')
self.validator = ContextReviewerValidateValidator()

def system(self):
return f'''Ignore all previous instructions.
@@ -370,5 +370,77 @@ def system(self):
Output:
False'''

    def user(self, context):
        return f'''Input:\n{context}\nOutput:'''


class TranslationEvaluatorPrompter(Prompter):
    def __init__(self):
        self.validator = TranslationEvaluatorValidator()
        self.stop_sequence = '<--END-OF-JSON-->'

    def system(self):
        return f'''Ignore all previous instructions.
### Context:
You are an expert in evaluating subtitle translations. Your task is to assess the quality of a translated subtitle text based on several key factors. The original text and its translation are provided for your review.
### Objective:
The goal is to provide a comprehensive evaluation of the translated subtitle text by scoring it on five specific criteria: Accuracy, Fluency, Completeness, Cultural Adaptation, and Consistency. Each criterion should be rated on a scale from 1 to 10, with 1 being the lowest quality and 10 being the highest.
### Style:
The evaluation should be detailed, objective, and professional. Use clear and concise language to convey your assessment.
### Tone:
Maintain a constructive and neutral tone throughout your evaluation. Focus on providing actionable feedback that can help improve the quality of the translation.
### Audience:
Your evaluation will be read by subtitle translators, quality assurance teams, and project managers who are looking to understand the strengths and weaknesses of the translation.
### Response Format:
Please provide your evaluation in the following JSON format:
{{
"accuracy": {{"score": [1-10], "justification": "[Justification]"}},
"fluency": {{"score": [1-10], "justification": "[Justification]"}},
"completeness": {{"score": [1-10], "justification": "[Justification]"}},
"cultural adaptation": {{"score": [1-10], "justification": "[Justification]"}},
"consistency": {{"score": [1-10], "justification": "[Justification]"}}
}}
{self.stop_sequence}
### Example1:
Input:
Original Texts:
Those who resist change may find themselves left behind.
On the other hand, those who embrace change can thrive in the new environment.
Translated Texts:
那些抗拒变化的人可能会发现自己被抛在后面。
另一方面,那些接受变化的人可以在新环境中发展。
Output:
result = {{
"accuracy": {{"score": <example integer score>, "justification": "<example-string>"}},
"fluency": {{"score": <example integer score>, "justification": "<example-string>"}},
"completeness": {{"score": <example integer score>, "justification": "<example-string>"}},
"cultural adaptation": {{"score": <example integer score>, "justification": "<example-string>"}},
"consistency"': {{"score": <example integer score>, "justification": "<example-string>"}}
}}
{self.stop_sequence}
Note that the results are processed by an automated system, so it is imperative that you adhere to the required output format.
'''

    def user(self, original: List[str], translation: List[str]):
        original_str = '\n'.join(original)
        translation_str = '\n'.join(translation)
        return f'''Input:
Original Texts:
{original_str}
Translated Texts:
{translation_str}
Output:
'''
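Downstream, the reply is expected to be a JSON object terminated by the stop sequence; a small sketch of the defensive parsing this enables (the raw reply below is made up and deliberately truncated to show why `repair_json` sits in the pipeline):

```python
# Illustrative post-processing of an evaluator reply; the reply string is made
# up and deliberately truncated to show why repair_json is involved.
import json
from json_repair import repair_json

STOP_SEQUENCE = '<--END-OF-JSON-->'
raw_reply = '{"accuracy": {"score": 9, "justification": "Faithful"}, "fluency": {"score": 8'

# The stop sequence is passed to the chat API, so it normally never reaches the
# client, but stripping it defensively is cheap.
raw_reply = raw_reply.split(STOP_SEQUENCE)[0]

scores = json.loads(repair_json(raw_reply))
print(scores['accuracy']['score'])  # 9
```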
12 changes: 2 additions & 10 deletions openlrc/subtitle.py
@@ -172,7 +172,7 @@ def from_lrc(cls, filename):
return cls(language=lang, segments=segments, filename=filename)

@classmethod
def from_srt(cls, filename):
def from_srt(cls, filename: Union[str, Path]):
"""
Processes an SRT (SubRip Subtitle) file according to the SRT specifications outlined
at http://www.textfiles.com/uploads/kds-srt.txt.
@@ -184,14 +184,6 @@ def from_srt(cls, filename):
- A blank line indicating the end of a subtitle entry.
This function is designed to read or manipulate an SRT file based on the provided filename.
Args:
filename (str): The path to the SRT file to be processed.
Returns:
The return value is not specified in the provided docstring. Depending on the implementation,
this function could return a data structure representing the parsed SRT file, a success status,
or possibly nothing.
"""
filename = Path(filename)
with open(filename, encoding='utf-8') as f:
@@ -212,7 +204,7 @@ def from_srt(cls, filename):

# Multi-line subtitle
text = []
while lines[i].strip():
while i < len(lines) and lines[i].strip():
text.append(lines[i].strip())
i += 1
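The added `i < len(lines)` guard fixes an IndexError for SRT files that end without a trailing blank line; a minimal sketch of the pattern (not the library's full parser) with a made-up input:

```python
# Minimal sketch of the bounds check, not the library's full SRT parser:
# without `i < len(lines)`, a file whose last entry has no trailing blank
# line would raise IndexError.
def collect_text(lines, i):
    text = []
    while i < len(lines) and lines[i].strip():
        text.append(lines[i].strip())
        i += 1
    return text, i

lines = ['1', '00:00:01,000 --> 00:00:03,000', 'Hello', 'world']  # no trailing blank line
print(collect_text(lines, 2))  # (['Hello', 'world'], 4)
```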

5 changes: 1 addition & 4 deletions openlrc/transcribe.py
@@ -81,9 +81,6 @@ def seg_from_words(seg: Segment, seg_id, words, tokens):
seg.temperature, seg.avg_logprob, seg.compression_ratio, seg.no_speech_prob, words)

def mid_split(seg_entry):
"""
Todo: Split into multiple segments (>2)
"""
text = seg_entry.text
doc = nlp(text)

@@ -118,7 +115,7 @@ def is_punct(char):
for k in range(len(seg_entry.words) - 1):
gaps.append(seg_entry.words[k + 1].start - seg_entry.words[k].end)
max_gap = max(gaps)
split_idx = gaps.index(max_gap) # TODO: Multiple largest or Multiple long gap
split_idx = gaps.index(max_gap)

if max_gap >= 2: # Split using the max gap
former_words = seg_entry.words[:split_idx]
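For context, the split heuristic finds the largest silence between consecutive words and splits there if it is at least 2 seconds; a small standalone illustration with made-up word timings:

```python
# Standalone illustration of the max-gap split heuristic; the (start, end)
# word timings below are made up.
words = [(0.0, 0.4), (0.5, 0.9), (1.0, 1.3), (3.8, 4.2), (4.3, 4.7)]

gaps = [words[k + 1][0] - words[k][1] for k in range(len(words) - 1)]
max_gap = max(gaps)
split_idx = gaps.index(max_gap)

if max_gap >= 2:  # same 2-second threshold as above
    former, latter = words[:split_idx + 1], words[split_idx + 1:]
    print(former)  # [(0.0, 0.4), (0.5, 0.9), (1.0, 1.3)]
    print(latter)  # [(3.8, 4.2), (4.3, 4.7)]
```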
