Skip to content

Commit

Permalink
fix: fix infinite hang, filter None-fail translation
Browse files Browse the repository at this point in the history
  • Loading branch information
vTuanpham committed Dec 10, 2023
1 parent 4220a7a commit 3849032
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 23 deletions.
3 changes: 1 addition & 2 deletions examples/ShareGPTV3/ShareGPTV3.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ def __init__(self, file_path: str, output_path: str, target_lang: str="vi",
super().__init__(file_path, output_path,
parser_name=PARSER_NAME,
do_translate=True,
no_translated_code=True,
target_config=DialogsConfig,
target_fields=['user_prompts', 'agent_responses'],
target_lang=target_lang,
Expand Down Expand Up @@ -65,7 +64,7 @@ def convert(self) -> None:
data_converted.append(data_dict)

# Be sure to assign the final data list to self.converted_data
self.converted_data = data_converted
self.converted_data = data_converted[:5000]

return None

Expand Down
2 changes: 1 addition & 1 deletion examples/TIGER-Lab-MathInstruct/TigerLabMathInstruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def convert(self):
data_converted.append(data_dict)

# Be sure to assign the final data list to self.converted_data
self.converted_data = data_converted[20000:120000]
self.converted_data = data_converted

pass

Expand Down
72 changes: 53 additions & 19 deletions translator/data_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,13 @@
import random
import sys
import string
import multiprocessing
import threading
import warnings
from pprint import pprint

sys.path.insert(0, r'./')
try:
from google.colab import files

IN_COLAB = True
except ImportError:
IN_COLAB = False
Expand All @@ -27,7 +26,7 @@

from configs import BaseConfig, QAConfig, DialogsConfig
from .utils import force_super_call, ForceBaseCallMeta, timeit, have_internet
from .filters import have_code
from .filters import have_code, have_re_code


if not have_internet:
Expand All @@ -48,6 +47,7 @@ def __init__(self, file_path: str,
max_list_length_per_thread: int = 3,
source_lang: str = "en",
target_lang: str = "vi",
fail_translation_code: str="P1OP1_F"
) -> None:

self.data_read = None
Expand All @@ -63,6 +63,7 @@ def __init__(self, file_path: str,
self.do_translate = do_translate

if self.do_translate:
self.fail_translation_code = fail_translation_code
self.enable_sub_task_thread = enable_sub_task_thread
self.source_lang = source_lang
self.target_lang = target_lang
Expand Down Expand Up @@ -113,8 +114,23 @@ def pre_translate_validate(self) -> None:
print(f"\nTotal data left after filtering for translation: {len(validated_translate_data)}\n")
self.converted_data = validated_translate_data

@timeit
def post_translate_validate(self) -> None:
    """Drop translated examples that carry the fail-translation marker.

    Each example in ``self.converted_data_translated`` is checked on every
    field listed in ``self.target_fields``. An example is kept only when
    none of those fields contains ``self.fail_translation_code``.

    The filtered list replaces ``self.converted_data_translated``; nothing
    is returned.
    """
    post_validated_translate_data = []
    # Running total of rejected examples. It must live outside the loops,
    # otherwise it is reset for every field and can never exceed 1.
    example_filters = 0

    # Note: this validation overrides the original self.converted_data_translated
    for example in tqdm(self.converted_data_translated, desc="Validating data after translation:"):
        has_fail_code = any(
            have_re_code(example[key], code=self.fail_translation_code)
            for key in self.target_fields
        )
        if has_fail_code:
            example_filters += 1
        else:
            # Appended exactly once per clean example, even if a field name
            # is repeated in self.target_fields.
            post_validated_translate_data.append(example)

    # Always report the total, not only when the last example happens to fail.
    tqdm.write(f"Number of example with fail code: {example_filters}")

    print(f"\nTotal data left after filtering fail translation: {len(post_validated_translate_data)}\n")
    self.converted_data_translated = post_validated_translate_data

@staticmethod
def id_generator(size=6, chars=string.ascii_uppercase + string.digits) -> str:
Expand Down Expand Up @@ -169,24 +185,30 @@ def multithread_list_str_translate(self, list_str: List[str],
with ThreadPoolExecutor(max_workers=num_threads) as executor:
futures = []
finished_task = 0
manager = multiprocessing.Manager()
lock = manager.Lock()
lock = threading.Lock()

def callback_list_done(future):
nonlocal translated_list_data
nonlocal finished_task
nonlocal manager
nonlocal lock
if not future.exception():
translated_list_data.extend(future.result())
finished_task += 1
with lock:
# This need to be .append to keep the list structure
# Since this deal with sub-list and needed to be merged later
translated_list_data.append(future.result())
finished_task += 1
else:
tqdm.write(f"Sub task of chunk {progress_idx} with field {field_name} failed with the following error: {future.exception()}."
f"\nRestarting thread when others finished...")
pass

for idx, list_chunk in enumerate(sub_str_lists):
# Assign each thread with a new Translator instance
future_chunk = executor.submit(self.translate_en2vi, list_chunk, data_type, Translator(), idx)
future_chunk = executor.submit(self.translate_en2vi,
src_texts=list_chunk,
data_type=data_type,
translator=Translator(),
sub_list_idx=idx)
future_chunk.add_done_callback(callback_list_done)
future_dict = {
"future": future_chunk,
Expand All @@ -201,8 +223,11 @@ def callback_list_done(future):
if future_dict['future'].exception():
tqdm.write(
f"\n Thread {future_dict['idx']} failed, restarting thread with chunk {future_dict['idx']}\n")
backup_future_chunk = executor.submit(self.translate_en2vi, sub_str_lists[future_dict['idx']],
data_type, Translator(), future_dict['idx'])
backup_future_chunk = executor.submit(self.translate_en2vi,
src_texts=sub_str_lists[future_dict['idx']],
data_type=data_type,
translator=Translator(),
sub_list_idx=future_dict['idx'])
backup_future_chunk.add_done_callback(callback_list_done)
backup_future_dict = {
"future": backup_future_chunk,
Expand Down Expand Up @@ -242,10 +267,16 @@ def translate_en2vi(self, src_texts: Union[List[str], str],
try:
target_texts = translator_instance.translate(src_texts, src=self.source_lang, dest=self.target_lang)
except TypeError:
# TypeError likely due to gender-specific translation, which has no fix yet. Please refer to
# ssut/py-googletrans#260 for more info
if sub_list_idx is None:
target_texts = translator_instance.translate("Translate fail ERR ERR", src=self.source_lang, dest=self.target_lang)
target_texts = translator_instance.translate(self.fail_translation_code,
src=self.source_lang,
dest=self.target_lang)
else:
target_texts = translator_instance.translate(["Translate fail ERR ERR", "Translate fail ERR ERR"], src=self.source_lang, dest=self.target_lang)
target_texts = translator_instance.translate([self.fail_translation_code, self.fail_translation_code],
src=self.source_lang,
dest=self.target_lang)

def extract_texts(obj):
if isinstance(obj, list):
Expand Down Expand Up @@ -308,17 +339,19 @@ def translate_converted(self,
with ThreadPoolExecutor(max_workers=num_threads) as executor:
futures = []
finished_task = 0
manager = multiprocessing.Manager()
lock = manager.Lock()
lock = threading.Lock()

def callback_done(future):
nonlocal translated_data
nonlocal finished_task
nonlocal progress_bar
nonlocal lock
if not future.exception():
translated_data.extend(future.result())
finished_task += 1
progress_bar.update(1)
with lock:
# This need to be += or .extend to shallow flatten the list structure
translated_data += future.result()
finished_task += 1
progress_bar.update(1)
tqdm.write("\nTask finished, adding translated data to result...\n")
else:
tqdm.write(f"\nTask failed with the following error: {future.exception()}."
Expand Down Expand Up @@ -426,6 +459,7 @@ def save(self) -> None:
if self.do_translate:
self.pre_translate_validate()
self.translate_converted()
self.post_translate_validate()
assert self.converted_data_translated is not None, "Converted data haven't been translated yet!"
output_translated_path = os.path.join(self.output_dir,
f"{self.parser_name}_translated_{self.target_lang}.json")
Expand Down
3 changes: 2 additions & 1 deletion translator/filters/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .code_filter import have_code
from .code_filter import have_code
from .fail_translation_filter import have_re_code
39 changes: 39 additions & 0 deletions translator/filters/fail_translation_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import re
from typing import Union, List


def have_re_code(text: Union[str, List[str]], code: str="P1OP1_F") -> bool:
    """Tell whether the fail-translation marker ``code`` occurs in ``text``.

    Despite the name, this is a plain substring check, not a regex match.

    Args:
        text: A single string, or an iterable of strings (typically a list).
        code: Marker substring to look for.

    Returns:
        True if ``code`` is a substring of ``text`` (when ``text`` is a
        string) or of at least one item of ``text`` (when it is a
        collection); False otherwise, including for an empty collection.
    """
    # A str is itself iterable, so it has to be handled before the
    # collection branch to keep the whole-string substring semantics.
    if isinstance(text, str):
        return code in text

    # any() stops at the first hit instead of scanning the remaining items.
    return any(code in str_text for str_text in text)


# Manual smoke test: the first sample embeds the default fail code
# ("P1OP1_F") inside its fenced snippet and the second sample does not,
# so the call at the bottom prints True.
if __name__ == "__main__":
code_text =[\
'''
Can you generate a title that accurately reflects the impact of the pandemic on the hospitality industry? To help you out, use this Python code to extract keywords with five or more letters from this passage about the industry's response to new health and safety protocols:
```
P1OP1_Frtfhbrth
import re
text = "The hospitality industry has faced significant challenges due to the pandemic, including the need to implement new health and safety protocols. Many hotels and restaurants have had to adapt quickly to these changes in order to ensure the safety of their customers and staff. This has resulted in increased costs and decreased revenue for many businesses. However, some companies have been able to innovate and find new ways to serve their customers, such as offering contactless delivery and online ordering options."
keywords = re.findall(r'\b\w{5,}\b', text)
```
Once you have your list of keywords, combine them with this title prompt: "Can You Solve the Puzzle and Craft a Title for This Hospitality Industry Passage?" Be sure that your title accurately reflects the impact of the pandemic on the industry. Good luck, puzzle master!
''',
'''
Can you generate a title that accurately reflects the impact of the pandemic on the hospitality industry? To help you out, use this Python code to extract keywords with five or more letters from this passage about the industry's response to new health and safety protocols:
```
import re
text = "The hospitality industry has faced significant challenges due to the pandemic, including the need to implement new health and safety protocols. Many hotels and restaurants have had to adapt quickly to these changes in order to ensure the safety of their customers and staff. This has resulted in increased costs and decreased revenue for many businesses. However, some companies have been able to innovate and find new ways to serve their customers, such as offering contactless delivery and online ordering options."
keywords = re.findall(r'\b\w{5,}\b', text)
```
Once you have your list of keywords, combine them with this title prompt: "Can You Solve the Puzzle and Craft a Title for This Hospitality Industry Passage?" Be sure that your title accurately reflects the impact of the pandemic on the industry. Good luck, puzzle master!
'''
]
print(have_re_code(code_text))

0 comments on commit 3849032

Please sign in to comment.