From d8f02d694d33f0c6e8a8fa2f6795b69e692fee0f Mon Sep 17 00:00:00 2001
From: zh-plus
Date: Wed, 29 May 2024 16:35:48 +0800
Subject: [PATCH 1/2] Remove deprecated context.
---
README.md | 32 ---------
openlrc/context.py | 81 -----------------------
openlrc/gui_streamlit/home.py | 3 -
openlrc/gui_streamlit/pages/2_context.py | 26 --------
openlrc/openlrc.py | 23 +------
openlrc/prompter.py | 8 +--
openlrc/translate.py | 10 +--
tests/test_context.py | 82 ------------------------
tests/test_prompter.py | 4 +-
9 files changed, 7 insertions(+), 262 deletions(-)
delete mode 100644 openlrc/context.py
delete mode 100644 openlrc/gui_streamlit/pages/2_context.py
delete mode 100644 tests/test_context.py
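
After this patch, `context.yaml` files and the `context_path` argument are no longer accepted; only the remaining options (such as the glossary) influence translation. A minimal usage sketch of the resulting API, using the placeholder file paths from the README:

```python
from openlrc import LRCer

# Glossary support is unchanged; context.yaml / context_path are gone.
lrcer = LRCer(glossary='./data/aoe4-glossary.yaml')
lrcer.run('./data/test_audio.mp3', target_lang='zh-cn')
```
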
diff --git a/README.md b/README.md
index 9b549e0..0619dfd 100644
--- a/README.md
+++ b/README.md
@@ -19,14 +19,6 @@ e.g. [OpenAI-GPT](https://github.com/openai/openai-python), [Anthropic-Claude](h
## New 🚨
-- 2024.3.29: Claude models are now available for translation. According to the testing, Claude 3 Sonnet performs way
- better than GPT-3.5 Turbo. We recommend using Claude 3 Sonnet for non-english audio (source language) translation (For
- now, the default model
- are still GPT-3.5 Turbo):
- ```python
- lrcer = LRCer(chatbot_model='claude-3-sonnet-20240229')
- ```
-- 2024.4.4: ~~Add basic streamlit GUI support. Try `openlrc gui` to start the GUI.~~
- 2024.5.7:
- Add custom endpoint (base_url) support for OpenAI & Anthropic:
```python
@@ -127,9 +119,6 @@ if __name__ == '__main__':
lrcer.run(['./data/test_audio.mp3', './data/test_video.mp4'], target_lang='zh-cn')
# Generate translated ./data/test_audio.lrc and ./data/test_video.srt
- # Use context.yaml to improve translation
- lrcer.run('./data/test.mp3', target_lang='zh-cn', context_path='./data/context.yaml')
-
# Use glossary to improve translation
lrcer = LRCer(glossary='./data/aoe4-glossary.yaml')
@@ -165,27 +154,6 @@ if __name__ == '__main__':
Check more details in [Documentation](https://zh-plus.github.io/openlrc/#/).
-### Context
-
-Utilize the available context to enhance the quality of your translation.
-Save them as `context.yaml` in the same directory as your audio file.
-
-> [!NOTE]
-> The improvement of translation quality from Context is **NOT** guaranteed.
-
-```yaml
-background: "This is a multi-line background.
-This is a basic example."
-audio_type: Movie
-description_map: {
- movie_name1 (without extension): "This
- is a multi-line description for movie1.",
- movie_name2 (without extension): "This
- is a multi-line description for movie2.",
- movie_name3 (without extension): "This is a single-line description for movie 3.",
-}
-```
-
### Glossary
Add glossary to improve domain specific translation. For example `aoe4-glossary.yaml`:
diff --git a/openlrc/context.py b/openlrc/context.py
deleted file mode 100644
index 7202d0f..0000000
--- a/openlrc/context.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (C) 2024. Hao Zheng
-# All rights reserved.
-from difflib import get_close_matches
-from pathlib import Path
-from typing import Union
-
-import yaml
-
-from openlrc.logger import logger
-
-
-class Context:
- def __init__(self, background='', description_map=None, audio_type='Anime', config_path=None):
- """
- Context(optional) for translation.
-
- Args:
- background (str): Providing background information for establishing context for the translation.
- description_map (dict, optional): {"name(without extension)": "description", ...}
- audio_type (str, optional): Audio type, default to Anime.
- config_path (str, optional): Path to config file.
-
- Raises:
- FileNotFoundError: If the config file specified by config_path does not exist.
-
- """
- self.config_path = None
- self.background = background
- self.audio_type = audio_type
- self.description_map = description_map if description_map else dict()
-
- # if config_path exist, load yaml file
- if config_path:
- config_path = Path(config_path)
- if config_path.exists():
- self.load_config(config_path)
- else:
- raise FileNotFoundError(f'Config file {config_path} not found.')
-
- def load_config(self, config_path: Union[str, Path]):
- config_path = Path(config_path)
- if not config_path.exists():
- raise FileNotFoundError(f'Config file {config_path} not found.')
-
- with open(config_path, 'r', encoding='utf-8') as f:
- config: dict = yaml.safe_load(f)
-
- if config.get('background'):
- self.background = config['background']
-
- if config.get('audio_type'):
- self.audio_type = config['audio_type']
-
- if config.get('description_map'):
- self.description_map = config['description_map']
-
- self.config_path = config_path
-
- def save_config(self):
- with open(self.config_path, 'w') as f:
- yaml.dump({
- 'background': self.background,
- 'audio_type': self.audio_type,
- 'description_map': self.description_map,
- }, f)
-
- def get_description(self, audio_name):
- value = ''
- if self.description_map:
- matches = get_close_matches(audio_name, self.description_map.keys())
- if matches:
- key = matches[0]
- value = self.description_map.get(key)
- logger.info(f'Found description map: {key} -> {value}')
- else:
- logger.info(f'No description map for {audio_name} found.')
-
- return value
-
- def __str__(self):
- return f'Context(background={self.background}, audio_type={self.audio_type}, description_map={self.description_map})'
diff --git a/openlrc/gui_streamlit/home.py b/openlrc/gui_streamlit/home.py
index e93a3cf..ba2f082 100644
--- a/openlrc/gui_streamlit/home.py
+++ b/openlrc/gui_streamlit/home.py
@@ -156,8 +156,6 @@
help='Currently bottleneck-ed by Spacy')
target_lang = st.text_input("Target Language", value='zh-cn', help='Language code for translation target')
prompter = st.selectbox("Prompter", options=['base'], disabled=True, help='Currently, only `base` is supported.')
- context_path = st.text_input("Context Path",
- help='Additional context to aid translation. Check [context](/context) for more details. ')
col1, col2, col3 = st.columns(3)
with col1:
@@ -189,7 +187,6 @@
preprocess_options=get_preprocess_options(atten_lim_db),
proxy=proxy, )
results = lrcer.run(paths, src_lang=src_lang, target_lang=target_lang, prompter=prompter,
- context_path=context_path if context_path else None,
skip_trans=skip_trans, noise_suppress=noise_suppress, bilingual_sub=bilingual_sub)
print(paths)
print(results)
diff --git a/openlrc/gui_streamlit/pages/2_context.py b/openlrc/gui_streamlit/pages/2_context.py
deleted file mode 100644
index 9517763..0000000
--- a/openlrc/gui_streamlit/pages/2_context.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (C) 2024. Hao Zheng
-# All rights reserved.
-
-import streamlit as st
-
-context_msg = """## Context 📝
-
-Utilize the available context to enhance the quality of your translation.
-Save them as `context.yaml` in the same directory as your audio file.
-
-> [!NOTE]
-> The improvement of translation quality from Context is **NOT** guaranteed.
-
-```yaml
-background: "This is a multi-line background.
-This is a basic example."
-audio_type: Movie
-description_map: {
- movie_name1 (without extension): "This
- is a multi-line description for movie1.",
- movie_name2 (without extension): "This
- is a multi-line description for movie2.",
- movie_name3 (without extension): "This is a single-line description for movie 3.",
-}
-```"""
-st.write(context_msg)
diff --git a/openlrc/openlrc.py b/openlrc/openlrc.py
index 68b24b8..3457cff 100644
--- a/openlrc/openlrc.py
+++ b/openlrc/openlrc.py
@@ -13,7 +13,6 @@
from faster_whisper.transcribe import Segment
-from openlrc.context import Context
from openlrc.defaults import default_asr_options, default_vad_options, default_preprocess_options
from openlrc.logger import logger
from openlrc.opt import SubtitleOptimizer
@@ -58,7 +57,6 @@ def __init__(self, whisper_model='large-v3', compute_type='float16', chatbot_mod
self.fee_limit = fee_limit
self.api_fee = 0 # Can be updated in different thread, operation should be thread-safe
self.from_video = set()
- self.context: Context = Context()
self.proxy = proxy
self.base_url_config = base_url_config
self.glossary = self.parse_glossary(glossary)
@@ -222,16 +220,12 @@ def _translate(self, audio_name, prompter, target_lang, transcribed_opt_sub, tra
translator = LLMTranslator(chatbot_model=self.chatbot_model, prompter=prompter, fee_limit=self.fee_limit,
proxy=self.proxy, base_url_config=self.base_url_config,
retry_model=self.retry_model)
- context = self.context
target_texts = translator.translate(
transcribed_opt_sub.texts,
src_lang=transcribed_opt_sub.lang,
target_lang=target_lang,
title=audio_name,
- audio_type=context.audio_type,
- background=context.background,
- description=context.get_description(audio_name),
compare_path=compare_path,
glossary=self.glossary
)
@@ -254,7 +248,7 @@ def _translate(self, audio_name, prompter, target_lang, transcribed_opt_sub, tra
return final_subtitle
def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optional[str] = None, target_lang='zh-cn',
- prompter='base_trans', context_path: Optional[Union[str, Path]] = None, skip_trans=False,
+ prompter='base_trans', skip_trans=False,
noise_suppress=False,
bilingual_sub=False, clear_temp_folder=False) -> List[str]:
"""
@@ -268,7 +262,6 @@ def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optiona
src_lang (str): Language of the audio, default to auto-detect.
target_lang (str): Target language, default to Mandarin Chinese.
prompter (str): Currently, only `base_trans` is supported.
- context_path (str): path to context config file. (Default to use `context.yaml` in the first audio's directory)
skip_trans (bool): Whether to skip the translation process. (Default to False)
noise_suppress (bool): Whether to suppress the noise in the audio. (Default to False)
bilingual_sub (bool): Whether to generate bilingual subtitles. (Default to False)
@@ -292,20 +285,6 @@ def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optiona
paths = list(map(Path, paths))
- if context_path:
- context_path = Path(context_path)
- self.context.load_config(context_path)
- logger.info(f'Found context config: {context_path}')
- logger.debug(f'Context: {self.context}')
- else:
- # Try to find the default `context.yaml` in the first audio's directory
- try:
- context_path = paths[0].parent / 'context.yaml'
- self.context.load_config(context_path)
- logger.info(f'Found context config file: {context_path}')
- except FileNotFoundError:
- logger.info(f'Default context config not found: {self.context}, using default context.')
-
audio_paths = self.pre_process(paths, noise_suppress=noise_suppress)
logger.info(f'Working on {len(audio_paths)} audio files: {pformat(audio_paths)}')
diff --git a/openlrc/prompter.py b/openlrc/prompter.py
index 8c3ba3b..f34cbdd 100644
--- a/openlrc/prompter.py
+++ b/openlrc/prompter.py
@@ -15,7 +15,7 @@
base_instruction = f'''Ignore all previous instructions.
You are a translator tasked with revising and translating subtitles into a target language. Your goal is to ensure accurate, concise, and natural-sounding translations for each line of dialogue. The input consists of transcribed audio, which may contain transcription errors. Your task is to first correct any errors you find in the sentences based on their context, and then translate them to the target language according to the revised sentences.
The user will provide a chunk of lines, you should respond with an accurate, concise, and natural-sounding translation for the dialogue, with appropriate punctuation.
-The user may provide additional context, such as background, description or title of the source material, a summary of the current scene, or a list of character names. Use this information to improve the quality of your translation.
+The user may provide additional context, such as the title of the source material, a summary of the current scene, or a list of character names. Use this information to improve the quality of your translation.
Your response will be processed by an automated system, so it is imperative that you adhere to the required output format.
The source subtitles were AI-generated with a speech-to-text tool so they are likely to contain errors. Where the input seems likely to be incorrect, use ALL available context to determine what the correct text should be, to the best of your ability.
@@ -115,7 +115,7 @@ def check_format(self, messages, output_str):
class BaseTranslatePrompter(TranslatePrompter):
- def __init__(self, src_lang, target_lang, audio_type=None, title='', background='', description='', glossary=None):
+ def __init__(self, src_lang, target_lang, audio_type=None, title='', glossary=None):
self.src_lang = src_lang
self.target_lang = target_lang
self.src_lang_display = Language.get(src_lang).display_name('en')
@@ -124,8 +124,6 @@ def __init__(self, src_lang, target_lang, audio_type=None, title='', background=
self.audio_type = audio_type
self.title = title
- self.background = background
- self.description = description
self.glossary = glossary
self.potential_prefix_combo = [
[original_prefix, translation_prefix],
@@ -136,8 +134,6 @@ def __init__(self, src_lang, target_lang, audio_type=None, title='', background=
['Original>', 'Translation>']
]
self.user_prompt = f'''{f"{self.title}" if self.title else ""}
-{f"{self.background}" if self.background else ""}
-{f"{self.description}" if self.description else ""}
{{scene}}
{{summaries_str}}
diff --git a/openlrc/translate.py b/openlrc/translate.py
index c022e1b..4bfe300 100644
--- a/openlrc/translate.py
+++ b/openlrc/translate.py
@@ -20,7 +20,7 @@
class Translator(ABC):
@abstractmethod
- def translate(self, texts: Union[str, List[str]], src_lang, target_lang):
+ def translate(self, texts: Union[str, List[str]], src_lang, target_lang) -> List[str]:
pass
@@ -202,8 +202,7 @@ def send_and_parse(messages, chatbot):
return summary, scene, translated
def translate(self, texts: Union[str, List[str]], src_lang, target_lang, audio_type='Anime', title='',
- background='', description='', compare_path: Path = Path('translate_intermediate.json'),
- glossary: dict = None):
+ compare_path: Path = Path('translate_intermediate.json'), glossary: dict = None) -> List[str]:
"""
Translate a list of texts from source language to target language.
@@ -213,8 +212,6 @@ def translate(self, texts: Union[str, List[str]], src_lang, target_lang, audio_t
target_lang (str): The target language.
audio_type (str): The type of audio (e.g., 'Anime').
title (str): The title of the content.
- background (str): The background context.
- description (str): The description of the content.
compare_path (Path): The path to save intermediate translation results.
glossary (dict): The glossary to use for translation.
@@ -225,8 +222,7 @@ def translate(self, texts: Union[str, List[str]], src_lang, target_lang, audio_t
texts = [texts]
prompter: BaseTranslatePrompter = prompter_map[self.prompter](
- src_lang, target_lang, audio_type, title=title, background=background, description=description,
- glossary=glossary
+ src_lang, target_lang, audio_type, title=title, glossary=glossary
)
chunks = self.make_chunks(texts, chunk_size=self.chunk_size)
diff --git a/tests/test_context.py b/tests/test_context.py
deleted file mode 100644
index 13cd993..0000000
--- a/tests/test_context.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (C) 2024. Hao Zheng
-# All rights reserved.
-
-import tempfile
-import unittest
-from pathlib import Path
-
-from openlrc.context import Context
-
-
-class TestContext(unittest.TestCase):
- def setUp(self) -> None:
- self.context = Context(background='test background', audio_type='test audio type',
- description_map={'test audio name': 'description'})
-
- def test_init(self):
- context = self.context
- self.assertEqual(context.background, 'test background')
- self.assertEqual(context.audio_type, 'test audio type')
- self.assertEqual(context.description_map, {'test audio name': 'description'})
- self.assertIsNone(context.config_path)
-
- def test_init_with_config_file(self):
- with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
- f.write(
- 'background: config background\naudio_type: config audio type\ndescription_map:\n config: config description\n')
- config_path = Path(f.name)
-
- context = Context(config_path=config_path)
-
- self.assertEqual(context.background, 'config background')
- self.assertEqual(context.audio_type, 'config audio type')
- self.assertEqual(context.description_map, {'config': 'config description'})
- self.assertEqual(context.config_path, config_path)
-
- config_path.unlink()
-
- def test_init_with_invalid_config_file(self):
- with self.assertRaises(FileNotFoundError):
- Context(config_path='invalid_path')
-
- def test_load_config(self):
- context = self.context
- with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
- f.write(
- 'background: config background\naudio_type: config audio type\ndescription_map:\n config: config description\n')
- config_path = Path(f.name)
-
- context.load_config(config_path)
-
- self.assertEqual(context.background, 'config background')
- self.assertEqual(context.audio_type, 'config audio type')
- self.assertEqual(context.description_map, {'config': 'config description'})
- self.assertEqual(context.config_path, config_path)
-
- config_path.unlink()
-
- def test_save_config(self):
- context = self.context
- with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
- config_path = Path(f.name)
-
- context.config_path = config_path
- context.save_config()
-
- with open(config_path, 'r') as file:
- config = file.read()
-
- self.assertIn('background: test background', config)
- self.assertIn('audio_type: test audio type', config)
- self.assertIn('description_map:\n test audio name: description', config)
-
- config_path.unlink()
-
- def test_get_description(self):
- context = self.context
- self.assertEqual(context.get_description('test audio name'), 'description')
- self.assertEqual(context.get_description('audio name without description'), '')
-
- def test_str(self):
- self.assertEqual(str(self.context),
- 'Context(background=test background, audio_type=test audio type, description_map={\'test audio name\': \'description\'})')
diff --git a/tests/test_prompter.py b/tests/test_prompter.py
index dc55b06..4b2f792 100644
--- a/tests/test_prompter.py
+++ b/tests/test_prompter.py
@@ -6,8 +6,6 @@
from openlrc.prompter import BaseTranslatePrompter
formatted_user_input = '''Title
-Background
-Description
test scene content
Chunk 1: test chunk1 summary
@@ -32,7 +30,7 @@
class TestPrompter(unittest.TestCase):
def setUp(self) -> None:
- self.prompter = BaseTranslatePrompter('ja', 'zh-cn', 'movie', 'Title', 'Background', 'Description')
+ self.prompter = BaseTranslatePrompter('ja', 'zh-cn', 'movie', 'Title')
self.formatted_user_input = formatted_user_input
def test_user_prompt(self):
From bd42a395e47a9ce143f4fced76801139d39729ab Mon Sep 17 00:00:00 2001
From: zh-plus
Date: Fri, 31 May 2024 17:52:49 +0800
Subject: [PATCH 2/2] Enhance translation workflow by adding Context Reviewer
 Agent to generate a translation guideline.
---
openlrc/agents.py | 163 ++++++++++++++++++
openlrc/context.py | 29 ++++
openlrc/gui_streamlit/home.py | 2 +-
openlrc/openlrc.py | 28 ++--
openlrc/prompter.py | 237 ++++++++++++++++++++++----
openlrc/translate.py | 307 ++++++++++++----------------------
tests/test_prompter.py | 24 +--
tests/test_translate.py | 2 +-
8 files changed, 532 insertions(+), 260 deletions(-)
create mode 100644 openlrc/agents.py
create mode 100644 openlrc/context.py
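
With this patch, context handling moves into agents: a ContextReviewerAgent builds a translation guideline from the full text once, and ChunkedTranslatorAgent then translates each chunk against that guideline. A rough sketch of the new call shape introduced here (the sample texts, title, and languages are placeholders):

```python
from openlrc.context import TranslateInfo
from openlrc.translate import LLMTranslator

texts = ['Original line 1', 'Original line 2']  # placeholder subtitle lines
info = TranslateInfo(title='test_audio', audio_type='Movie', glossary=None)

translator = LLMTranslator(chatbot_model='gpt-3.5-turbo', fee_limit=0.2)
# Internally, ContextReviewerAgent.build_context() produces the guideline once,
# then ChunkedTranslatorAgent.translate_chunk() runs per chunk with that guideline.
translated = translator.translate(texts, src_lang='en', target_lang='zh-cn', info=info)
```
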
diff --git a/openlrc/agents.py b/openlrc/agents.py
new file mode 100644
index 0000000..cd78836
--- /dev/null
+++ b/openlrc/agents.py
@@ -0,0 +1,163 @@
+# Copyright (C) 2024. Hao Zheng
+# All rights reserved.
+import abc
+import re
+from typing import Optional, Tuple, List
+
+from openlrc.chatbot import route_chatbot
+from openlrc.context import TranslationContext, TranslateInfo
+from openlrc.logger import logger
+from openlrc.prompter import BaseTranslatePrompter, ContextReviewPrompter, potential_prefix_combo, \
+ ProofreaderPrompter, proofread_prefix
+
+
+class Agent(abc.ABC):
+ TEMPERATURE = 0.5
+ """
+ Base class for all agents.
+ """
+
+ def _initialize_chatbot(self, chatbot_model: str, fee_limit: float, proxy: str, base_url_config: Optional[dict]):
+ chatbot_cls, model_name = route_chatbot(chatbot_model)
+ return chatbot_cls(model=model_name, fee_limit=fee_limit, proxy=proxy, retry=3,
+ temperature=self.TEMPERATURE, base_url_config=base_url_config)
+
+
+class ChunkedTranslatorAgent(Agent):
+ """
+    Translate a well-defined chunk of text into the target language by sending it to the chatbot and parsing the response.
+ """
+
+ TEMPERATURE = 0.9
+
+ def __init__(self, src_lang, target_lang, info: TranslateInfo = TranslateInfo(),
+ chatbot_model: str = 'gpt-3.5-turbo', fee_limit: float = 0.2, proxy: str = None,
+ base_url_config: Optional[dict] = None):
+ super().__init__()
+ self.chatbot_model = chatbot_model
+ self.info = info
+ self.chatbot = self._initialize_chatbot(chatbot_model, fee_limit, proxy, base_url_config)
+ self.prompter = BaseTranslatePrompter(src_lang, target_lang, info)
+ self.cost = 0
+
+ def __str__(self):
+ return f'Translator Agent ({self.chatbot_model})'
+
+ def _parse_responses(self, resp) -> Tuple[List[str], str, str]:
+ """
+ Parse the response from the chatbot API.
+
+ Args:
+ resp: The response from the chatbot API.
+
+ Returns:
+ Tuple[List[str], str, str]: Parsed translations, summary, and scene from the response.
+ """
+ content = self.chatbot.get_content(resp)
+
+ try:
+ summary = self._extract_tag_content(content, 'summary')
+ scene = self._extract_tag_content(content, 'scene')
+ translations = self._extract_translations(content)
+
+ return [t.strip() for t in translations], summary.strip(), scene.strip()
+ except Exception as e:
+ logger.error(f'Failed to extract contents from response: {content}')
+ raise e
+
+ def _extract_tag_content(self, content: str, tag: str) -> str:
+        match = re.search(rf'<{tag}>(.*?)</{tag}>', content)
+ return match.group(1) if match else ''
+
+ def _extract_translations(self, content: str) -> List[str]:
+ for _, trans_prefix in potential_prefix_combo:
+            translations = re.findall(f'{trans_prefix}\n*(.*?)(?:#\d+|<summary>|\n*$)', content, re.DOTALL)
+ if translations:
+ return self._clean_translations(translations, content)
+ return []
+
+ def _clean_translations(self, translations: List[str], content: str) -> List[str]:
+        if any(re.search(r'(<.*?>|</.*?>)', t) for t in translations):
+ logger.warning(f'The extracted translation from response contains tags: {content}, tags removed')
+            return [re.sub(r'(<.*?>|</.*?>).*', '', t, flags=re.DOTALL) for t in translations]
+ return translations
+
+ def translate_chunk(self, chunk_id: int, chunk: List[Tuple[int, str]],
+ context: TranslationContext = TranslationContext(),
+ use_glossary: bool = True) -> Tuple[List[str], TranslationContext]:
+ user_input = self.prompter.format_texts(chunk)
+ guideline = context.guideline if use_glossary else context.non_glossary_guideline
+ messages_list = [
+ {'role': 'system', 'content': self.prompter.system()},
+ {'role': 'user', 'content': self.prompter.user(chunk_id, user_input, context.summary, guideline)},
+ ]
+ resp = self.chatbot.message(messages_list, output_checker=self.prompter.check_format)[0]
+ translations, summary, scene = self._parse_responses(resp)
+ self.cost += self.chatbot.api_fees[-1]
+ context.update(summary=summary, scene=scene, model=self.chatbot_model)
+
+ return translations, context
+
+
+class ContextReviewerAgent(Agent):
+ """
+ Review the context of the subtitles to ensure accuracy and completeness.
+ """
+
+ TEMPERATURE = 0.8
+
+ def __init__(self, src_lang, target_lang, info: TranslateInfo = TranslateInfo(),
+ chatbot_model: str = 'gpt-3.5-turbo', fee_limit: float = 0.2, proxy: str = None,
+ base_url_config: Optional[dict] = None):
+ super().__init__()
+ self.src_lang = src_lang
+ self.target_lang = target_lang
+ self.info = info
+ self.chatbot_model = chatbot_model
+ self.prompter = ContextReviewPrompter(src_lang, target_lang)
+ self.chatbot = self._initialize_chatbot(chatbot_model, fee_limit, proxy, base_url_config)
+
+ def __str__(self):
+ return f'Context Reviewer Agent ({self.chatbot_model})'
+
+ def build_context(self, texts, title='', glossary: Optional[dict] = None) -> str:
+ text_content = '\n'.join(texts)
+ messages_list = [
+ {'role': 'system', 'content': self.prompter.system()},
+ {'role': 'user', 'content': self.prompter.user(text_content, title=title, given_glossary=glossary)},
+ ]
+ resp = self.chatbot.message(messages_list, output_checker=self.prompter.check_format)[0]
+ context = self.chatbot.get_content(resp)
+ return context
+
+
+class ProofreaderAgent(Agent):
+ """
+ Adapt subtitles to ensure cultural relevance and appropriateness.
+ """
+ TEMPERATURE = 0.8
+
+ def __init__(self, src_lang, target_lang, info: TranslateInfo = TranslateInfo(),
+ chatbot_model: str = 'gpt-3.5-turbo', fee_limit: float = 0.2, proxy: str = None,
+ base_url_config: Optional[dict] = None):
+ super().__init__()
+ self.src_lang = src_lang
+ self.target_lang = target_lang
+ self.info = info
+ self.prompter = ProofreaderPrompter(src_lang, target_lang)
+ self.chatbot = self._initialize_chatbot(chatbot_model, fee_limit, proxy, base_url_config)
+
+ def _parse_responses(self, resp) -> List[str]:
+ content = self.chatbot.get_content(resp)
+ revised = re.findall(proofread_prefix + r'\s*(.*)', content, re.MULTILINE)
+
+ return revised
+
+ def proofread(self, texts: List[str], translations, context: TranslationContext) -> List[str]:
+ messages_list = [
+ {'role': 'system', 'content': self.prompter.system()},
+ {'role': 'user', 'content': self.prompter.user(texts, translations, context.guideline)},
+ ]
+ resp = self.chatbot.message(messages_list, output_checker=self.prompter.check_format)[0]
+ revised = self._parse_responses(resp)
+ return revised
diff --git a/openlrc/context.py b/openlrc/context.py
new file mode 100644
index 0000000..17433eb
--- /dev/null
+++ b/openlrc/context.py
@@ -0,0 +1,29 @@
+# Copyright (C) 2024. Hao Zheng
+# All rights reserved.
+import re
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class TranslationContext(BaseModel):
+ summary: Optional[str] = ''
+ scene: Optional[str] = ''
+ model: Optional[str] = None
+ guideline: Optional[str] = None
+
+ def update(self, **args):
+ for key, value in args.items():
+ if hasattr(self, key):
+ setattr(self, key, value)
+
+ @property
+ def non_glossary_guideline(self) -> str:
+ cleaned_text = re.sub(r'Glossary:\n(.*?\n)*?\nCharacters:', 'Characters:', self.guideline, flags=re.DOTALL)
+ return cleaned_text
+
+
+class TranslateInfo(BaseModel):
+ title: Optional[str] = ''
+ audio_type: str = 'Movie'
+ glossary: Optional[dict] = None
diff --git a/openlrc/gui_streamlit/home.py b/openlrc/gui_streamlit/home.py
index ba2f082..b67038c 100644
--- a/openlrc/gui_streamlit/home.py
+++ b/openlrc/gui_streamlit/home.py
@@ -186,7 +186,7 @@
window_size_samples, speech_pad_ms),
preprocess_options=get_preprocess_options(atten_lim_db),
proxy=proxy, )
- results = lrcer.run(paths, src_lang=src_lang, target_lang=target_lang, prompter=prompter,
+ results = lrcer.run(paths, src_lang=src_lang, target_lang=target_lang,
skip_trans=skip_trans, noise_suppress=noise_suppress, bilingual_sub=bilingual_sub)
print(paths)
print(results)
diff --git a/openlrc/openlrc.py b/openlrc/openlrc.py
index 3457cff..6557c11 100644
--- a/openlrc/openlrc.py
+++ b/openlrc/openlrc.py
@@ -13,6 +13,7 @@
from faster_whisper.transcribe import Segment
+from openlrc.context import TranslateInfo
from openlrc.defaults import default_asr_options, default_vad_options, default_preprocess_options
from openlrc.logger import logger
from openlrc.opt import SubtitleOptimizer
@@ -130,18 +131,18 @@ def transcription_producer(self, transcription_queue, audio_paths, src_lang):
transcription_queue.put(None)
logger.info('Transcription producer finished.')
- def transcription_consumer(self, transcription_queue, target_lang, prompter, skip_trans, bilingual_sub):
+ def transcription_consumer(self, transcription_queue, target_lang, skip_trans, bilingual_sub):
"""
Parallel Consumer.
"""
with concurrent.futures.ThreadPoolExecutor() as executor:
- futures = [executor.submit(self.consumer_worker, transcription_queue, target_lang, prompter, skip_trans,
+ futures = [executor.submit(self.consumer_worker, transcription_queue, target_lang, skip_trans,
bilingual_sub)
for _ in range(self.consumer_thread)]
concurrent.futures.wait(futures)
logger.info('Transcription consumer finished.')
- def consumer_worker(self, transcription_queue, target_lang, prompter, skip_trans, bilingual_sub):
+ def consumer_worker(self, transcription_queue, target_lang, skip_trans, bilingual_sub):
"""
Parallel translation.
"""
@@ -173,7 +174,7 @@ def consumer_worker(self, transcription_queue, target_lang, prompter, skip_trans
else:
with Timer('Translation process'):
try:
- final_subtitle = self._translate(audio_name, prompter, target_lang, transcribed_opt_sub,
+ final_subtitle = self._translate(audio_name, target_lang, transcribed_opt_sub,
translated_path)
except Exception as e:
self.exception = e
@@ -212,12 +213,14 @@ def consumer_worker(self, transcription_queue, target_lang, prompter, skip_trans
self.transcribed_paths.append(result_path)
- def _translate(self, audio_name, prompter, target_lang, transcribed_opt_sub, translated_path):
+ def _translate(self, audio_name, target_lang, transcribed_opt_sub, translated_path):
+ context = TranslateInfo(title=audio_name, audio_type='Movie', glossary=self.glossary)
+
json_filename = Path(translated_path.parent / (audio_name + '.json'))
compare_path = Path(translated_path.parent, f'{audio_name}_compare.json')
if not translated_path.exists():
# Translate the transcribed json
- translator = LLMTranslator(chatbot_model=self.chatbot_model, prompter=prompter, fee_limit=self.fee_limit,
+ translator = LLMTranslator(chatbot_model=self.chatbot_model, fee_limit=self.fee_limit,
proxy=self.proxy, base_url_config=self.base_url_config,
retry_model=self.retry_model)
@@ -225,9 +228,8 @@ def _translate(self, audio_name, prompter, target_lang, transcribed_opt_sub, tra
transcribed_opt_sub.texts,
src_lang=transcribed_opt_sub.lang,
target_lang=target_lang,
- title=audio_name,
- compare_path=compare_path,
- glossary=self.glossary
+ info=context,
+ compare_path=compare_path
)
with self._lock:
@@ -248,9 +250,7 @@ def _translate(self, audio_name, prompter, target_lang, transcribed_opt_sub, tra
return final_subtitle
def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optional[str] = None, target_lang='zh-cn',
- prompter='base_trans', skip_trans=False,
- noise_suppress=False,
- bilingual_sub=False, clear_temp_folder=False) -> List[str]:
+ skip_trans=False, noise_suppress=False, bilingual_sub=False, clear_temp_folder=False) -> List[str]:
"""
Split the translation into 2 phases: transcription and translation. They're running in parallel.
Firstly, transcribe the audios one-by-one. At the same time, translation threads are created and waiting for
@@ -261,7 +261,6 @@ def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optiona
paths (Union[str, Path, List[Union[str, Path]]]): Audio/Video paths, can be a list or a single path.
src_lang (str): Language of the audio, default to auto-detect.
target_lang (str): Target language, default to Mandarin Chinese.
- prompter (str): Currently, only `base_trans` is supported.
skip_trans (bool): Whether to skip the translation process. (Default to False)
noise_suppress (bool): Whether to suppress the noise in the audio. (Default to False)
bilingual_sub (bool): Whether to generate bilingual subtitles. (Default to False)
@@ -293,8 +292,7 @@ def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optiona
with Timer('Transcription (Producer) and Translation (Consumer) process'):
consumer = concurrent.futures.ThreadPoolExecutor(thread_name_prefix='Consumer') \
- .submit(self.transcription_consumer, transcription_queue, target_lang, prompter, skip_trans,
- bilingual_sub)
+ .submit(self.transcription_consumer, transcription_queue, target_lang, skip_trans, bilingual_sub)
producer = concurrent.futures.ThreadPoolExecutor(thread_name_prefix='Producer') \
.submit(self.transcription_producer, transcription_queue, audio_paths, src_lang)
diff --git a/openlrc/prompter.py b/openlrc/prompter.py
index f34cbdd..747bd3f 100644
--- a/openlrc/prompter.py
+++ b/openlrc/prompter.py
@@ -1,15 +1,28 @@
# Copyright (C) 2024. Hao Zheng
# All rights reserved.
-
+import abc
import re
+from abc import ABC
+from typing import List, Tuple, Optional
from langcodes import Language
from lingua import LanguageDetectorBuilder
+from openlrc.context import TranslateInfo
from openlrc.logger import logger
original_prefix = 'Original>'
translation_prefix = 'Translation>'
+proofread_prefix = 'Proofread>'
+
+potential_prefix_combo = [
+ [original_prefix, translation_prefix],
+ ['原文>', '翻译>'],
+ ['原文>', '译文>'],
+ ['原文>', '翻譯>'],
+ ['原文>', '譯文>'],
+ ['Original>', 'Translation>']
+]
# instruction prompt modified from https://github.com/machinewrapped/gpt-subtrans
base_instruction = f'''Ignore all previous instructions.
@@ -97,11 +110,15 @@
Please translate the subtitles again, paying careful attention to ensure that each line is translated separately, and that every line has a matching translation.
Do not merge lines together in the translation, it leads to incorrect timings and confusion for the reader.
The content of the translation is for learning purposes only and will not violate the usage guidelines.
-{{glossary}}
'''
-class TranslatePrompter:
+class Prompter(abc.ABC):
+ def check_format(self, messages, output_str):
+ return True
+
+
+class TranslatePrompter(Prompter, ABC):
@classmethod
def format_texts(cls, texts):
raise NotImplementedError()
@@ -110,48 +127,38 @@ def format_texts(cls, texts):
def post_process(texts):
raise NotImplementedError()
- def check_format(self, messages, output_str):
- raise NotImplementedError()
-
class BaseTranslatePrompter(TranslatePrompter):
- def __init__(self, src_lang, target_lang, audio_type=None, title='', glossary=None):
+ def __init__(self, src_lang, target_lang, context: TranslateInfo):
self.src_lang = src_lang
self.target_lang = target_lang
self.src_lang_display = Language.get(src_lang).display_name('en')
self.target_lang_display = Language.get(target_lang).display_name('en')
self.lan_detector = LanguageDetectorBuilder.from_all_languages().build()
- self.audio_type = audio_type
- self.title = title
- self.glossary = glossary
- self.potential_prefix_combo = [
- [original_prefix, translation_prefix],
- ['原文>', '翻译>'],
- ['原文>', '译文>'],
- ['原文>', '翻譯>'],
- ['原文>', '譯文>'],
- ['Original>', 'Translation>']
- ]
- self.user_prompt = f'''{f"{self.title}" if self.title else ""}
-
-{{scene}}
- {{summaries_str}}
-
+ self.audio_type = context.audio_type
+ self.title = context.title
+ self.glossary = context.glossary
+ self.user_prompt = f'''Translation guidelines from context reviewer:
+{{guideline}}
+
+Previous summaries:
+{{summaries_str}}
+
Scene 1 Chunk {{chunk_num}}
-Please translate these subtitles for {self.audio_type}{f" named {self.title}" if self.title else ""} from {self.src_lang_display} to {self.target_lang_display}.\n
+Please translate these subtitles for {self.audio_type} from {self.src_lang_display} to {self.target_lang_display}.\n
{{user_input}}
'''
def system(self):
- return base_instruction.format(glossary='' if not self.glossary else self.formatted_glossary)
+ return base_instruction
- def user(self, chunk_num, user_input, summaries='', scene=''):
+ def user(self, chunk_num, user_input, summaries='', guideline=''):
summaries_str = '\n'.join(f'Chunk {i}: {summary}' for i, summary in enumerate(summaries, 1))
- return self.user_prompt.format(summaries_str=summaries_str, scene=scene, chunk_num=chunk_num,
- user_input=user_input).strip()
+ return self.user_prompt.format(
+ summaries_str=summaries_str, chunk_num=chunk_num, user_input=user_input, guideline=guideline).strip()
@property
def formatted_glossary(self):
@@ -166,7 +173,7 @@ def formatted_glossary(self):
return result
@classmethod
- def format_texts(cls, texts):
+ def format_texts(cls, texts: List[Tuple[int, str]]):
"""
Reconstruct list of text into desired format.
@@ -189,7 +196,7 @@ def check_format(self, messages, content):
logger.error(f'Fail to extract original text.')
return False
- for potential_ori_prefix, potential_trans_prefix in self.potential_prefix_combo:
+ for potential_ori_prefix, potential_trans_prefix in potential_prefix_combo:
             translation = re.findall(potential_trans_prefix + r'\n*(.*?)(?:#\d+|<summary>|\n*$)', content, re.DOTALL)
if translation:
@@ -274,6 +281,170 @@ def check_format(self, messages, output_str):
return True
-prompter_map = {
- 'base_trans': BaseTranslatePrompter,
-}
+class ContextReviewPrompter(Prompter):
+ def __init__(self, src_lang, target_lang):
+ self.src_lang = src_lang
+ self.target_lang = target_lang
+ self.src_lang_display = Language.get(src_lang).display_name('en')
+ self.target_lang_display = Language.get(target_lang).display_name('en')
+ self.lan_detector = LanguageDetectorBuilder.from_all_languages().build()
+
+ def system(self):
+ return f'''Ignore all previous instructions.
+You are a context reviewer responsible for building the necessary context during translation to ensure consistency and accuracy.
+ Build a comprehensive glossary of key terms and phrases used in the {self.src_lang_display} to {self.target_lang_display} translations. This glossary should include any technical terms, slang, or culturally specific references that need to be consistently translated or localized.
+ Note that the glossary should only contain terms that may cause confusion or inconsistency in translation, such as abbreviations or technical words.
+ Write a concise story summary that captures the main plot points, characters, and themes of the video. This summary will help other team members understand the context and ensure consistency in translation and localization.
+ Provide the character name translations to ensure consistency in the subtitles. Include any relevant information about the characters, such as their relationships, roles, or personalities.
+ Define the tone and style of the subtitles, ensuring they match the intended mood and atmosphere of the texts. Provide guidelines on language use, formality, and any stylistic preferences.
+ Identify the target audience for the subtitles, considering factors such as age, cultural background, and language proficiency. Provide insights on how to tailor the subtitles to meet the needs and expectations of this audience.
+ Compile all this information into a reference document that can be used by translators, localization specialists, and proofreaders to ensure consistency and accuracy throughout the subtitling process.
+ Note: the user may provide a pre-defined glossary; carefully review the given texts and update it. Do not change the given glossary unless necessary; only add new terms.
+
+ You are given the untranslated texts, split into lines.
+
+Example input:
+Please review the following text (title:The detectors) and provide the necessary context for the translation from English to Chinese:
+John and Sarah discuss their plan to locate a suspect, deducing that he is likely in the uptown area.
+John: "As a 10 years experienced detector, my advice is We should start our search in the uptown area."
+Sarah: "Agreed. Let's gather more information before we move."
+Then, they prepare to start their investigation.
+
+Example output:
+Title: The detectors
+
+Glossary:
+- suspect: 嫌疑人
+- uptown: 市中心
+
+Characters:
+- John: 约翰, a detector with 10 years of experience
+- Sarah: 萨拉, John's detector partner
+
+Summary:
+John and Sarah discuss their plan to locate a suspect in the uptown area. They decide to gather more information before starting their investigation.
+
+Tone and Style:
+The subtitles should be formal and professional, reflecting the serious nature of the investigation. Avoid slang and colloquial language.
+
+Target Audience:
+The target audience is adult viewers with an interest in crime dramas. They are likely to be familiar with police procedurals and enjoy suspenseful storytelling.
+
+
+### retry_instructions
+There was an issue with the previous translation.
+
+Only output the glossary, characters, summary, tone and style, and target audience. Do not output any translated text.
+Remember to include "characters" section in your response, if there is information about any characters in texts.
+'''
+
+ def user(self, text, title='', given_glossary: Optional[dict] = None):
+ glossary_text = f'Given glossary: {given_glossary}' if given_glossary else ''
+ return f'''{glossary_text}
+Please review the following text (title:{title}) and provide the necessary context for the translation from {self.src_lang_display} to {self.target_lang_display}:
+{text}'''
+
+
+class ProofreaderPrompter(Prompter):
+ def __init__(self, src_lang, target_lang):
+ self.src_lang = src_lang
+ self.target_lang = target_lang
+ self.src_lang_display = Language.get(src_lang).display_name('en')
+ self.target_lang_display = Language.get(target_lang).display_name('en')
+ self.lan_detector = LanguageDetectorBuilder.from_all_languages().build()
+
+ def system(self):
+ return f'''Ignore all previous instructions.
+You are an experienced proofreader, responsible for meticulously reviewing the translated subtitles to ensure they are free of grammatical errors, spelling mistakes, and inconsistencies. The Proofreader ensures that the subtitles are clear, concise, and adhere to the provided glossary and style guidelines.
+Carefully read through the translated subtitles provided by translators. Ensure that the subtitles make sense in the context of the video and are easy to understand.
+Check for and correct any grammatical errors, including punctuation, syntax, and sentence structure. Ensure that all words are spelled correctly and consistently throughout the subtitles.
+Refer to the glossary and style guidelines provided by the Context Reviewer. Ensure that key terms, names, and phrases are used consistently and correctly throughout the subtitles. Verify that the tone and style of the subtitles are consistent with the guidelines.
+Ensure that the subtitles are clear and concise, avoiding overly complex or ambiguous language. Make sure that the subtitles are easy to read and understand, especially considering the target audience's language proficiency.
+Ensure that the subtitles accurately reflect the context and intent of the original dialogue. Make sure that any cultural references, jokes, or idiomatic expressions are appropriately localized and understandable.
+Conduct a final review to ensure there are no remaining errors or inconsistencies. Make any necessary corrections to ensure the subtitles are accurate, natural-sounding, and of the highest quality.
+
+Example input:
+Please proofread the following translated text (the original texts are for reference only, focus on the translated text):
+#1
+{original_prefix}
+Those who resist change may find themselves left behind.
+{translation_prefix}
+那些抗拒变化的人可能会发现自己被抛在后面。
+
+#2
+{original_prefix}
+On the other hand, those who embrace change can thrive in the new environment.
+{translation_prefix}
+另一方面,那些接受变化的人可以在新环境中发展。
+
+#3
+{original_prefix}
+Thus, it is important to adapt to changing circumstances and remain open to new opportunities.
+{translation_prefix}
+因此,适应变化的环境并对新机会持开放态度是很重要的。
+
+
+Example output:
+#1
+{translation_prefix}
+那些抗拒变化的人可能会发现自己被抛在后面。
+{proofread_prefix}
+那些抗拒变化的人可能会发现自己落伍了。
+
+#2
+{translation_prefix}
+另一方面,那些接受变化的人可以在新环境中发展。
+{proofread_prefix}
+相反,那些拥抱变化的人可以在新环境中如鱼得水。
+
+#3
+{translation_prefix}
+因此,适应变化的环境并对新机会持开放态度是很重要的。
+{proofread_prefix}
+因此,适应变化的环境并对新机会保持开放态度是非常重要的。
+
+
+### retry_instructions
+Please proofread the subtitles again, paying careful attention to ensure that each line is proofread separately, and that every line has a matching text.
+Do not merge lines together during the proofread, it leads to incorrect timings and confusion for the reader.
+'''
+
+ def user(self, texts, translations, guideline=''):
+        formatted_texts = '\n'.join(
+ [
+ f'#{i}\n{original_prefix}\n{text}\n{translation_prefix}\n{trans}\n' for i, (text, trans) in
+ enumerate(zip(texts, translations), start=1)
+ ])
+ return f'''Translation guidelines from context reviewer:
+{guideline}
+
+Please proofread the following subtitles, which were translated from {self.src_lang_display} to {self.target_lang_display}:
+{formatted_texts}
+
+Output:
+'''
+
+ def check_format(self, messages, content):
+ # If message is for claude, use messages[0]
+ user_input = messages[1]['content'] if len(messages) == 2 else messages[0]['content']
+ original = re.findall(original_prefix + r'\n(.*?)\n' + translation_prefix, user_input, re.DOTALL)
+ if not original:
+            logger.error(f'Failed to extract original text.')
+ return False
+
+ localized = re.findall(proofread_prefix + r'\s*(.*)', content, re.MULTILINE)
+
+ if not localized:
+ # TODO: Try to change chatbot_model if always fail
+            logger.warning(f'Failed to extract proofread text.')
+ logger.debug(f'Content: {content}')
+ return False
+
+ if len(original) != len(localized):
+ logger.warning(
+                f'Failed to ensure consistent length: original is {len(original)}, proofread is {len(localized)}')
+            logger.debug(f'original: {original}')
+            logger.debug(f'proofread: {localized}')
+ return False
+
+ return True
diff --git a/openlrc/translate.py b/openlrc/translate.py
index 4bfe300..226fce6 100644
--- a/openlrc/translate.py
+++ b/openlrc/translate.py
@@ -3,69 +3,50 @@
import json
import os
-import re
import uuid
from abc import ABC, abstractmethod
from itertools import zip_longest
from pathlib import Path
-from typing import Union, List
+from typing import Union, List, Optional, Tuple
import requests
-from openlrc.chatbot import route_chatbot, all_pricing
+from openlrc.agents import ChunkedTranslatorAgent, ContextReviewerAgent
+from openlrc.chatbot import all_pricing
+from openlrc.context import TranslationContext, TranslateInfo
from openlrc.logger import logger
-from openlrc.prompter import prompter_map, BaseTranslatePrompter, AtomicTranslatePrompter
+from openlrc.prompter import AtomicTranslatePrompter
class Translator(ABC):
@abstractmethod
- def translate(self, texts: Union[str, List[str]], src_lang, target_lang) -> List[str]:
+ def translate(self, texts: Union[str, List[str]], src_lang: str, target_lang: str,
+ info: TranslateInfo) -> List[str]:
pass
class LLMTranslator(Translator):
- def __init__(self, chatbot_model: str = 'gpt-3.5-turbo', prompter: str = 'base_trans', fee_limit=0.2,
- chunk_size=30, intercept_line=None, proxy=None, base_url_config=None, retry_model=None):
+ CHUNK_SIZE = 30
+
+ def __init__(self, chatbot_model: str = 'gpt-3.5-turbo', fee_limit: float = 0.2, chunk_size: int = CHUNK_SIZE,
+ intercept_line: Optional[int] = None, proxy: Optional[str] = None,
+ base_url_config: Optional[dict] = None,
+ retry_model: Optional[str] = None):
"""
Initialize the LLMTranslator with given parameters.
-
- Args:
- chatbot_model (str): The model of the chatbot to use.
- prompter (str): The prompter to format the texts for translation.
- fee_limit (float): The fee limit for the API.
- chunk_size (int): The size of text chunks for translation.
- intercept_line (int): The line number to intercept.
- proxy (str): The proxy server to use.
- base_url_config (dict): The base URL configuration for the chatbot API.
- retry_model (str): The model to use for retrying translation if the primary model fails.
"""
- if prompter not in prompter_map:
- raise ValueError(f'Prompter {prompter} not found.')
-
- self.temperature = 0.9
-
- chatbot_cls, model_name = route_chatbot(chatbot_model)
- self.chatbot = chatbot_cls(model=model_name, fee_limit=fee_limit, proxy=proxy, retry=3,
- temperature=self.temperature, base_url_config=base_url_config)
-
- self.retry_chatbot = None
- if retry_model:
- retry_chatbot_cls, retry_model_name = route_chatbot(retry_model)
- self.retry_chatbot = retry_chatbot_cls(
- model=retry_model_name, fee_limit=fee_limit, proxy=proxy, retry=3, temperature=self.temperature,
- base_url_config=base_url_config
- )
-
- self.prompter = prompter
+ self.chatbot_model = chatbot_model
self.fee_limit = fee_limit
+ self.proxy = proxy
+ self.base_url_config = base_url_config
self.chunk_size = chunk_size
self.api_fee = 0
self.intercept_line = intercept_line
self.retry_model = retry_model
@staticmethod
- def list_chatbots():
+ def list_chatbots() -> List[str]:
"""
List available chatbot models.
@@ -75,7 +56,7 @@ def list_chatbots():
return list(all_pricing.keys())
@staticmethod
- def make_chunks(texts, chunk_size=30):
+ def make_chunks(texts: List[str], chunk_size: int = 30) -> List[List[Tuple[int, str]]]:
"""
Split the text into chunks of specified size.
@@ -99,203 +80,131 @@ def make_chunks(texts, chunk_size=30):
return chunks
- def _parse_responses(self, resp, potential_prefix_combo, changed_chatbot=None):
- """
- Parse the response from the chatbot API.
-
- Args:
- resp (str): The response from the chatbot API.
- potential_prefix_combo (List[Tuple[str, str]]): Potential prefix combinations for parsing.
- changed_chatbot: The chatbot instance used for parsing if different from the primary chatbot.
-
- Returns:
- Tuple[str, str, List[str]]: Parsed summary, scene, and translations from the response.
- """
- content = changed_chatbot.get_content(resp) if changed_chatbot else self.chatbot.get_content(resp)
-
- try:
-            summary = re.search(r'<summary>(.*)</summary>', content)
-            scene = re.search(r'<scene>(.*)</scene>', content)
-
- summary = summary.group(1) if summary else ''
- scene = scene.group(1) if scene else ''
-
- for _, trans_prefix in potential_prefix_combo:
-                translation = re.findall(f'{trans_prefix}\n*(.*?)(?:#\d+|<summary>|\n*$)', content, re.DOTALL)
- if translation:
- break
- else:
- return summary.strip(), scene.strip(), []
-
-            # Remove "<summary>\nxxx</summary>" tags (or some weird tags like ❓) from translation
-            if any([re.search(r'(<.*?>|</.*?>)', t) for t in translation]):
- logger.warning(f'The extracted translation from response contains tags: {content}, tags removed')
- translation = [
- re.sub(
-                        r'(<.*?>|</.*?>).*',
- '', t, flags=re.DOTALL
- )
- for t in translation
- ]
-
- return summary.strip(), scene.strip(), [t.strip() for t in translation]
-
- except Exception as e:
- logger.error(f'Failed to extract contents from response: {content}')
- raise e
-
- def _translate_chunk(self, chunk, prompter, summaries, scene, i):
+ def _translate_chunk(self, translator_agent: ChunkedTranslatorAgent, chunk: List[Tuple[int, str]],
+ context: TranslationContext, chunk_id: int,
+ retry_agent: Optional[ChunkedTranslatorAgent] = None) -> Tuple[
+ List[str], TranslationContext]:
"""
Translate a single chunk of text.
-
- Args:
- chunk (List[Tuple[int, str]]): The chunk of text to be translated.
- prompter (BaseTranslatePrompter): The prompter instance to format the text.
- summaries (List[str]): List of summaries for context.
- scene (str): The current scene context.
- i (int): The chunk index.
-
- Returns:
- Tuple[str, str, List[str]]: The summary, scene, and translated texts for the chunk.
"""
+ translated, context = translator_agent.translate_chunk(chunk_id, chunk, context)
+
+ if len(translated) != len(chunk) and translator_agent.info.glossary:
+ logger.warning(f'Cannot translate chunk {chunk_id} with glossary, trying to remove glossary.')
+ translated, context = translator_agent.translate_chunk(chunk_id, chunk, context, use_glossary=False)
- def send_and_parse(messages, chatbot):
- """
- Helper function to send messages to the chatbot and parse the response.
-
- Args:
- messages (List[dict]): List of messages to send.
- chatbot: The chatbot instance to use.
-
- Returns:
- Tuple[str, str, List[str]]: The parsed summary, scene, and translations.
- """
- resp = chatbot.message(messages, output_checker=prompter.check_format)[0]
- return self._parse_responses(resp, prompter.potential_prefix_combo, changed_chatbot=chatbot)
-
- user_input = prompter.format_texts(chunk)
- glossary_sys_prompt = prompter.system()
- non_glossary_sys_prompt = prompter_map[self.prompter](prompter.src_lang, prompter.target_lang).system()
- messages_list = [
- {'role': 'system', 'content': glossary_sys_prompt},
- {'role': 'user', 'content': prompter.user(i, user_input, summaries, scene)},
- ]
- summary, scene, translated = send_and_parse(messages_list.copy(), self.chatbot)
-
- if len(translated) != len(chunk):
- logger.warning(f'Cant translate chunk {i} with glossary, trying to remove glossary.')
- messages_list[0]['content'] = non_glossary_sys_prompt
- summary, scene, translated = send_and_parse(messages_list.copy(), self.chatbot)
-
- if self.retry_chatbot and len(translated) != len(chunk):
+ if retry_agent and len(translated) != len(chunk):
logger.warning(
- f'Trying to change chatbot to keep performing chunked translation. Retry chatbot: {self.retry_model}'
- )
- messages_list[0]['content'] = glossary_sys_prompt
- summary, scene, translated = send_and_parse(messages_list.copy(), self.retry_chatbot)
+ f'Trying to change chatbot to keep performing chunked translation. Retry chatbot: {retry_agent}')
+ translated, context = retry_agent.translate_chunk(chunk_id, chunk, context)
- if len(translated) != len(chunk):
+ if len(translated) != len(chunk) and retry_agent.info.glossary:
logger.warning(f'New bot: Trying to remove glossary to keep performing chunked translation.')
- messages_list[0]['content'] = non_glossary_sys_prompt
- summary, scene, translated = send_and_parse(messages_list.copy(), self.retry_chatbot)
+ translated, context = retry_agent.translate_chunk(chunk_id, chunk, context, use_glossary=False)
- return summary, scene, translated
+ return translated, context
- def translate(self, texts: Union[str, List[str]], src_lang, target_lang, audio_type='Anime', title='',
- compare_path: Path = Path('translate_intermediate.json'), glossary: dict = None) -> List[str]:
+ def translate(self, texts: Union[str, List[str]], src_lang: str, target_lang: str,
+ info: TranslateInfo = TranslateInfo(),
+ compare_path: Path = Path('translate_intermediate.json')) -> List[str]:
"""
Translate a list of texts from source language to target language.
-
- Args:
- texts (Union[str, List[str]]): The texts to be translated.
- src_lang (str): The source language.
- target_lang (str): The target language.
- audio_type (str): The type of audio (e.g., 'Anime').
- title (str): The title of the content.
- compare_path (Path): The path to save intermediate translation results.
- glossary (dict): The glossary to use for translation.
-
- Returns:
- List[str]: The translated texts.
"""
if not isinstance(texts, list):
texts = [texts]
- prompter: BaseTranslatePrompter = prompter_map[self.prompter](
- src_lang, target_lang, audio_type, title=title, glossary=glossary
- )
+ translator_agent = ChunkedTranslatorAgent(src_lang, target_lang, info, self.chatbot_model, self.fee_limit,
+ self.proxy, self.base_url_config)
- chunks = self.make_chunks(texts, chunk_size=self.chunk_size)
- logger.info(f'Translating {title}: {len(chunks)} chunks, {len(texts)} lines in total.')
+ retry_agent = ChunkedTranslatorAgent(src_lang, target_lang, info, self.retry_model, self.fee_limit,
+ self.proxy, self.base_url_config) if self.retry_model else None
- translations = []
- summaries = []
- summary, scene = '', ''
- compare_list = []
- start_chunk = 0
+ # proofreader = ProofreaderAgent(src_lang, target_lang, info)
- if compare_path.exists():
- logger.info(f'Resume from {compare_path}')
- with open(compare_path, 'r', encoding='utf-8') as f:
- compare_results = json.load(f)
- compare_list = compare_results['compare']
- summaries = compare_results['summaries']
- scene = compare_results['scene']
- translations = [item['output'] for item in compare_list]
- start_chunk = compare_list[-1]['chunk']
- logger.info(f'Resume translation from chunk {start_chunk}')
+ chunks = self.make_chunks(texts, chunk_size=self.chunk_size)
+ logger.info(f'Translating {info.title}: {len(chunks)} chunks, {len(texts)} lines in total.')
+
+ translations, summaries, compare_list, start_chunk, guideline = self._resume_translation(compare_path)
+ if not guideline:
+ context_reviewer = ContextReviewerAgent(src_lang, target_lang, info)
+ guideline = context_reviewer.build_context(texts, title=info.title, glossary=info.glossary)
+ logger.info(f'Translation Guideline:\n{guideline}')
+ context = TranslationContext(guideline=guideline)
for i, chunk in list(enumerate(chunks, start=1))[start_chunk:]:
atomic = False
- summary, scene, translated = self._translate_chunk(chunk, prompter, summaries, scene, i)
+ translated, context = self._translate_chunk(translator_agent, chunk, context, i, retry_agent=retry_agent)
+ # Proofreader: not fully tested yet.
+ # localized_trans = proofreader.localize_subtitles(
+ # texts=[c[1] for c in chunk], translations=translated, context=context
+ # )
if len(translated) != len(chunk):
logger.warning(f'Chunk {i} translation length inconsistent: {len(translated)} vs {len(chunk)},'
- f'Trying to use atomic translation instead.')
+ f' Trying to use atomic translation instead.')
chunk_texts = [item[1] for item in chunk]
- translated = self.atomic_translate(chunk_texts, src_lang, target_lang)
+ translated = self.atomic_translate(self.chatbot_model, chunk_texts, src_lang, target_lang)
atomic = True
translations.extend(translated)
- summaries.append(summary)
- logger.info(f'Translated {title}: {i}/{len(chunks)}')
- logger.info(f'summary: {summary}')
- logger.info(f'scene: {scene}')
-
- compare_list.extend([{'chunk': i,
- 'idx': item[0] if item else 'N\\A',
- 'method': 'atomic' if atomic else 'chunked',
- 'model': self.chatbot.model,
- 'input': item[1] if item else 'N\\A',
- 'output': trans if trans else 'N\\A'}
- for (item, trans) in zip_longest(chunk, translated)])
- compare_results = {'compare': compare_list, 'summaries': summaries, 'scene': scene}
- with open(compare_path, 'w', encoding='utf-8') as f:
- json.dump(compare_results, f, indent=4, ensure_ascii=False)
-
- self.api_fee += sum(self.chatbot.api_fees)
+ summaries.append(context.summary)
+ logger.info(f'Translated {info.title}: {i}/{len(chunks)}')
+ logger.info(f'summary: {context.summary}')
+ logger.info(f'scene: {context.scene}')
+
+ compare_list.extend(self._generate_compare_list(chunk, translated, i, atomic, context))
+ self._save_intermediate_results(compare_path, compare_list, summaries, context.scene, guideline)
+
+ self.api_fee += translator_agent.cost + (retry_agent.cost if retry_agent else 0)
return translations
- def atomic_translate(self, texts, src_lang, target_lang):
- """
- Perform atomic translation for each text.
+ def _resume_translation(self, compare_path: Path) -> Tuple[List[str], List[str], List[dict], int, str]:
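+ """Load any intermediate results saved at compare_path so an interrupted translation can resume."""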
+ translations, summaries, compare_list, start_chunk, guideline = [], [], [], 0, ''
- Args:
- texts (List[str]): List of texts to be translated.
- src_lang (str): Source language.
- target_lang (str): Target language.
+ if compare_path.exists():
+ logger.info(f'Resume from {compare_path}')
+ with open(compare_path, 'r', encoding='utf-8') as f:
+ compare_results = json.load(f)
+ compare_list = compare_results['compare']
+ summaries = compare_results['summaries']
+ translations = [item['output'] for item in compare_list]
+ start_chunk = compare_list[-1]['chunk']
+ guideline = compare_results['guideline']
+ logger.info(f'Resume translation from chunk {start_chunk}')
- Returns:
- List[str]: List of translated texts.
+ return translations, summaries, compare_list, start_chunk, guideline
+
+ def _generate_compare_list(self, chunk: List[Tuple[int, str]], translated: List[str], chunk_id: int,
+ atomic: bool, context: TranslationContext) -> List[dict]:
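+ """Build per-line comparison records (chunk id, index, method, model, input, output) for this chunk."""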
+ return [{'chunk': chunk_id,
+ 'idx': item[0] if item else 'N\\A',
+ 'method': 'atomic' if atomic else 'chunked',
+ 'model': context.model,
+ 'input': item[1] if item else 'N\\A',
+ 'output': trans if trans else 'N\\A'}
+ for (item, trans) in zip_longest(chunk, translated)]
+
+ def _save_intermediate_results(self, compare_path: Path, compare_list: List[dict], summaries: List[str],
+ scene: str, guideline: str):
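+ """Dump the comparison list, summaries, scene, and guideline to compare_path as JSON."""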
+ compare_results = {'compare': compare_list, 'summaries': summaries, 'scene': scene, 'guideline': guideline}
+ with open(compare_path, 'w', encoding='utf-8') as f:
+ json.dump(compare_results, f, indent=4, ensure_ascii=False)
+
+ def atomic_translate(self, chatbot_model: str, texts: List[str], src_lang: str, target_lang: str) -> List[str]:
"""
+ Perform atomic translation for each text.
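+
+ Args:
+ chatbot_model (str): The chatbot model used for the per-text translation requests.
+ texts (List[str]): List of texts to be translated.
+ src_lang (str): Source language.
+ target_lang (str): Target language.
+
+ Returns:
+ List[str]: List of translated texts.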
+ """
+ chatbot = ChunkedTranslatorAgent(src_lang, target_lang, TranslateInfo(), chatbot_model, self.fee_limit,
+ self.proxy, self.base_url_config).chatbot
+
prompter = AtomicTranslatePrompter(src_lang, target_lang)
- message_lists = [[
- {'role': 'user', 'content': prompter.user(text)}
- ] for text in texts]
+ message_lists = [[{'role': 'user', 'content': prompter.user(text)}] for text in texts]
- responses = self.chatbot.message(message_lists, output_checker=prompter.check_format)
- translated = list(map(self.chatbot.get_content, responses))
+ responses = chatbot.message(message_lists, output_checker=prompter.check_format)
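+ # Each text is sent as its own request, so only the fees of the last len(texts) calls are accumulated here.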
+ self.api_fee += sum(chatbot.api_fees[-(len(texts)):])
+ translated = list(map(chatbot.get_content, responses))
assert len(translated) == len(texts), f'Atomic translation failed: {len(translated)} vs {len(texts)}'
@@ -317,7 +226,7 @@ def __init__(self):
'X-ClientTraceId': str(uuid.uuid4())
}
- def translate(self, texts: Union[str, List[str]], src_lang, target_lang):
+ def translate(self, texts: Union[str, List[str]], src_lang, target_lang, info=None):
params = {
'api-version': '3.0',
'from': src_lang,
diff --git a/tests/test_prompter.py b/tests/test_prompter.py
index 4b2f792..e7fd5a9 100644
--- a/tests/test_prompter.py
+++ b/tests/test_prompter.py
@@ -3,17 +3,19 @@
import unittest
+from openlrc.context import TranslateInfo
from openlrc.prompter import BaseTranslatePrompter
-formatted_user_input = '''Title
-
-test scene content
- Chunk 1: test chunk1 summary
-Chunk 2: test chunk2 summary
-
+formatted_user_input = '''Translation guidelines from context reviewer:
+This is a guideline.
+
+Previews summaries:
+Chunk 1: test chunk1 summary
+Chunk 2: test chunk2 summary
+
Scene 1 Chunk 1
-Please translate these subtitles for movie named Title from Japanese to Chinese (China).
+Please translate these subtitles for movie from Japanese to Chinese (China).
#1
Original>
@@ -30,7 +32,8 @@
class TestPrompter(unittest.TestCase):
def setUp(self) -> None:
- self.prompter = BaseTranslatePrompter('ja', 'zh-cn', 'movie', 'Title')
+ context = TranslateInfo(title='Title', audio_type='movie')
+ self.prompter = BaseTranslatePrompter('ja', 'zh-cn', context)
self.formatted_user_input = formatted_user_input
def test_user_prompt(self):
@@ -44,7 +47,8 @@ def test_user_prompt(self):
生き残る秘訣は、進化し続けることです。
Translation>'''
self.assertEqual(
- self.prompter.user(1, user_input, ['test chunk1 summary', 'test chunk2 summary'], 'test scene content'),
+ self.prompter.user(1, user_input, ['test chunk1 summary', 'test chunk2 summary'],
+ guideline='This is a guideline.'),
self.formatted_user_input
)
@@ -58,8 +62,6 @@ def test_check_format(self):
messages = [{'role': 'system', 'content': 'system content'},
{'role': 'user', 'content': formatted_user_input}]
content = '''Title
-Background
-Description
Scene
Chunk 1:
diff --git a/tests/test_translate.py b/tests/test_translate.py
index 9a0094b..5984c61 100644
--- a/tests/test_translate.py
+++ b/tests/test_translate.py
@@ -59,7 +59,7 @@ def test_atomic_translate(self):
for chatbot_model in test_models:
texts = ['Hello, how are you?', 'I am fine, thank you.']
translator = LLMTranslator(chatbot_model)
- translations = translator.atomic_translate(texts, 'en', 'zh')
+ translations = translator.atomic_translate(chatbot_model, texts, 'en', 'zh')
self.assertGreater(get_similarity(translations[0], '你好,你好吗?'), 0.5)
self.assertGreater(get_similarity(translations[1], '我很好,谢谢。'), 0.5)