From 6b934113827cebd2be93d13d83408ed49fc91b72 Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Sat, 10 Feb 2024 10:20:54 -0500 Subject: [PATCH 01/13] WIP --- .../data/questions/generate_translation.yml | 13 ++ docassemble/ALDashboard/translation.py | 148 +++++++++++++++++- setup.py | 2 +- 3 files changed, 158 insertions(+), 5 deletions(-) diff --git a/docassemble/ALDashboard/data/questions/generate_translation.yml b/docassemble/ALDashboard/data/questions/generate_translation.yml index edab62f..a149617 100644 --- a/docassemble/ALDashboard/data/questions/generate_translation.yml +++ b/docassemble/ALDashboard/data/questions/generate_translation.yml @@ -28,6 +28,19 @@ fields: ] - "Language codes (like: es, one per line)": tr_langs datatype: area + - Include a draft translation with GPT or Google Translate: use_gpt + datatype: yesno + show if: + code: | + gpt_is_available() or google_translate_is_available() + - note: | + If you want to use Google Translate, you need to set up a Google Cloud project and enable the Google Translate API. + + To use OpenAI's GPT-3, you need to set up an OpenAI account and get an API key. 
+ show if: + code: | + not (gpt_is_available() or google_translate_is_available()) + --- mandatory: True question: | diff --git a/docassemble/ALDashboard/translation.py b/docassemble/ALDashboard/translation.py index 7d525a9..25b20d0 100644 --- a/docassemble/ALDashboard/translation.py +++ b/docassemble/ALDashboard/translation.py @@ -3,7 +3,7 @@ import os import re import tempfile -from typing import Tuple +from typing import List, Optional, Tuple, Union import xml.etree.ElementTree as ET import zipfile @@ -38,23 +38,120 @@ import xlsxwriter -from docassemble.base.util import DAFile +from docassemble.base.util import DAFile, language_name, get_config from docassemble.webapp.server import mako_parts from typing import NamedTuple, Dict +from docassemble.ALToolbox.llms import * + DEFAULT_LANGUAGE = "en" __all__ = [ "Translation", "translation_file", + "translate_fragments", + "gpt_is_available", ] +def gpt_is_available() -> bool: + """ + Return True if the GPT API is available. + """ + return get_config("open ai", {}).get("key") is not None + + +def may_have_mako(text:str) -> bool: + """ + Return True if the text appears to contain any Mako code, such as ${...} or % at the beginning of a line. + """ + return re.search(r'\${|^\s*%', text, flags=re.MULTILINE) is not None + +def may_have_html(text:str) -> bool: + """ + Return True if the text appears to contain any HTML code, such as

or

. + """ + return re.search(r'<\w+.*?>.*?<\/\w+>', text, flags=re.MULTILINE) is not None + +def translate_fragments_gpt(fragments:Union[str,List[str]], source_language:str, tr_lang:str, special_words: Optional[Dict[str,str]] = None) -> Dict[str]: + """Use GPT-3.5 to translate a list of fragments (strings) from one language to another and provide a dictionary + with the original text and the translated text. + """ + try: + language_in_english = language_name(source_language) + except: + language_in_english = source_language + try: + tr_language_in_english = language_name(tr_lang) + except: + tr_language_in_english = tr_lang + + if isinstance(fragments, str): + fragments = [fragments] + + system_prompt = f"""You are a helpful translator that translates Docassemble interviews from "{language_in_english}" to "{tr_language_in_english}". You + preserve the meaning of all sentences while aiming to produce a translation at or below a 9th grade reading level. + + When you see Mako tags or HTML tags, you do not translate them. You can translate text in quotes that appears to be intended to be shown + to the user, but if there is a chance text is intended for the program logic you do not translate it. You do not change the whitespace because + whitespace can have meaning in Docassemble. 
+ """ + if special_words is not None: + system_prompt += """ + When you see one of the special words in the following table in the first column, you use a form of the suggested replacement rather than inventing a new translation: + + {special_words} + + Your only reply is a JSON object that looks like this: + {{ + "original text": "translated text", + }} + """ + user_message = f""" + {fragments} + """ + +def translate_fragments_google(fragments:Union[str,List[str]], source_language:str, tr_lang:str, special_words: Optional[Dict[str,str]] = None) -> Dict[str]: + """Use Google Translate to translate a list of fragments (strings) from one language to another and provide a dictionary + with the original text and the translated text. + """ + return fragments + + +def translate_fragments(fragments:Union[str,List[str]], language:str, tr_lang:str, allow_gpt=True, allow_google=True, special_words=Dict[str,str]) -> Dict[str]: + """ + Translate a list of fragments (strings) from one language to another. 
+ """ + if not (allow_google or allow_gpt): + raise ValueError("You must allow at least one translation method") + + if isinstance(fragments, str): + fragments = [fragments] + if language == tr_lang: + return fragments + + fragments_with_code = [] + fragments_without_code = [] + + if allow_gpt and allow_google: + for fragment in fragments: + if may_have_html(fragment) or may_have_mako(fragment): + fragments_with_code.append(fragment) + else: + fragments_without_code.append(fragment) + results = translate_fragments_gpt(fragments_with_code, language, tr_lang, special_words) + results.update(translate_fragments_google(fragments_without_code, language, tr_lang, special_words)) + elif allow_gpt: + results = translate_fragments_gpt(fragments, language, tr_lang, special_words) + else: # allow_google + results = translate_fragments_google(fragments, language, tr_lang, special_words) + return results + class Translation(NamedTuple): file: DAFile # an XLSX or XLIFF file untranslated_words: int # Word count for all untranslated segments that are not Mako or HTML untranslated_segments: int # Number of rows in the output that have untranslated text - one for each question, subquestion, field, etc. total_rows: int -def translation_file(yaml_filename:str, tr_lang:str ) -> Translation: +def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_translate=False) -> Translation: """ Return a tuple of the translation file in XLSX format, plus a count of the number of words and segments that need to be translated. 
@@ -78,6 +175,8 @@ def translation_file(yaml_filename:str, tr_lang:str ) -> Translation: interview_source.update() interview_source.translating = True interview = interview_source.get_interview() + + # Load the existing translation files and build a cache tr_cache: Dict = {} if len(interview.translations) > 0: for item in interview.translations: @@ -190,7 +289,10 @@ def translation_file(yaml_filename:str, tr_lang:str ) -> Translation: tr_cache[orig_text][source_lang] = {} tr_cache[orig_text][source_lang][target_lang] = the_dict indexno += 1 - if filetype == 'XLSX': + + # Create the output file + # We don't have any other filetypes for now, but we could add XLIFF support later + if filetype == 'XLSX': xlsx_filename = docassemble.base.functions.space_to_underscore(os.path.splitext(os.path.basename(re.sub(r'.*:', '', yaml_filename)))[0]) + "_" + tr_lang + ".xlsx" output_file.initialize(filename=xlsx_filename) workbook = xlsxwriter.Workbook(output_file.path()) @@ -248,6 +350,37 @@ def translation_file(yaml_filename:str, tr_lang:str ) -> Translation: wholefixedunlockedtwo.set_align('top') wholefixedunlockedtwo.set_text_wrap() # wholefixedunlockedtwo.set_locked(False) + draft_translation_format = workbook.add_format() + draft_translation_format.set_bg_color('yellow') + draft_translation_format_one = workbook.add_format() + draft_translation_format_one.set_bg_color('yellow') + draft_translation_format_one.set_bold() + draft_translation_format_one.set_font_color('green') + + draft_translation_format_two = workbook.add_format() + draft_translation_format_two.set_bg_color('yellow') + draft_translation_format_two.set_bold() + draft_translation_format_two.set_font_color('blue') + + whole_draft_translation_format = workbook.add_format() + whole_draft_translation_format.set_bg_color('yellow') + whole_draft_translation_format.set_align('top') + whole_draft_translation_format.set_text_wrap() + + whole_draft_translation_format_one = workbook.add_format() + 
whole_draft_translation_format_one.set_bg_color('yellow') + whole_draft_translation_format_one.set_bold() + whole_draft_translation_format_one.set_font_color('green') + whole_draft_translation_format_one.set_align('top') + whole_draft_translation_format_one.set_text_wrap() + + whole_draft_translation_format_two = workbook.add_format() + whole_draft_translation_format_two.set_bg_color('yellow') + whole_draft_translation_format_two.set_bold() + whole_draft_translation_format_two.set_font_color('blue') + whole_draft_translation_format_two.set_align('top') + whole_draft_translation_format_two.set_text_wrap() + numb = workbook.add_format() numb.set_align('top') worksheet.write('A1', 'interview', bold) @@ -269,6 +402,8 @@ def translation_file(yaml_filename:str, tr_lang:str ) -> Translation: untranslated_segments = 0 untranslated_text = "" total_rows = 0 + + hold_for_draft_translation = [] for question in interview.all_questions: if not hasattr(question, 'translations'): continue @@ -327,10 +462,14 @@ def translation_file(yaml_filename:str, tr_lang:str ) -> Translation: parts.extend([fixedtwo, part[0]]) parts.append(fixedcell) worksheet.write_rich_string(*parts) + + # + mako = mako_parts(tr_text) if len(mako) == 0: worksheet.write_string(row, 7, '', wholefixedunlocked) elif len(mako) == 1: + # mode 0 is normal text, mode 1 is Mako or HTML, mode 2 is a variable if mako[0][1] == 0: worksheet.write_string(row, 7, tr_text, wholefixedunlocked) elif mako[0][1] == 1: @@ -356,6 +495,7 @@ def translation_file(yaml_filename:str, tr_lang:str ) -> Translation: indexno += 1 row += 1 seen.append(item) + for item, cache_item in tr_cache.items(): if item in seen or language not in cache_item or tr_lang not in cache_item[language]: continue diff --git a/setup.py b/setup.py index c4afb63..90915a0 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ def find_package_data(where='.', package='', exclude=standard_exclude, exclude_d 
url='https://github.com/SuffolkLITLab/docassemble-ALDashboard', packages=find_packages(), namespace_packages=['docassemble'], - install_requires=['PyGithub>=2.1.1', 'docassemble.ALToolbox>=0.9.2', 'openai>=1.0', 'tiktoken', 'pyaml'], + install_requires=['PyGithub>=2.1.1', 'docassemble.ALToolbox>=0.10.0', 'openai>=1.0', 'tiktoken', 'pyaml'], zip_safe=False, package_data=find_package_data(where='docassemble/ALDashboard/', package='docassemble.ALDashboard'), ) From 5135a323134d4c79d3ebde3ec469f049cf026668 Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Mon, 25 Mar 2024 21:37:17 -0400 Subject: [PATCH 02/13] WIP --- .../data/questions/generate_translation.yml | 43 +++- .../ALDashboard/data/questions/menu.yml | 5 + docassemble/ALDashboard/translation.py | 188 ++++++++++++++---- 3 files changed, 191 insertions(+), 45 deletions(-) diff --git a/docassemble/ALDashboard/data/questions/generate_translation.yml b/docassemble/ALDashboard/data/questions/generate_translation.yml index a149617..d08b89a 100644 --- a/docassemble/ALDashboard/data/questions/generate_translation.yml +++ b/docassemble/ALDashboard/data/questions/generate_translation.yml @@ -9,12 +9,45 @@ metadata: title: | Translation support tool --- +mandatory: True +code: | + the_yaml_path + if not the_task.ready(): + waiting_screen + show_translation_results +--- +# code: | +# translations = [ +# translation_file(the_yaml_path, tr_lang, use_gpt=use_gpt) +# for tr_lang +# in tr_langs.split() +# ] +--- +code: | + the_task = background_action('translate_file') +--- +event: waiting_screen +question: | + Please wait while we translate your file +subquestion: | +
+ Processing... +
+reload: True +--- +event: translate_file code: | translations = [ - translation_file(the_yaml_path, tr_lang) + translation_file(the_yaml_path, tr_lang, use_gpt=use_gpt) for tr_lang in tr_langs.split() ] + background_response_action('save_translations', translations=translations) +--- +event: save_translations +code: | + translations = action_argument('translations') + background_response() --- question: | What file do you want to translate? @@ -32,17 +65,15 @@ fields: datatype: yesno show if: code: | - gpt_is_available() or google_translate_is_available() + gpt_is_available() # or google_translate_is_available() - note: | - If you want to use Google Translate, you need to set up a Google Cloud project and enable the Google Translate API. - To use OpenAI's GPT-3, you need to set up an OpenAI account and get an API key. show if: code: | - not (gpt_is_available() or google_translate_is_available()) + not gpt_is_available() --- -mandatory: True +event: show_translation_results question: | Translation results subquestion: | diff --git a/docassemble/ALDashboard/data/questions/menu.yml b/docassemble/ALDashboard/data/questions/menu.yml index 61054e2..68b9f4a 100644 --- a/docassemble/ALDashboard/data/questions/menu.yml +++ b/docassemble/ALDashboard/data/questions/menu.yml @@ -61,6 +61,11 @@ data: privilege: - admin - developer + - name: Manage answer file viewers + url: ${ interview_url(i=user_info().package + ":manage_answer_viewers.yml", reset=1) } + image: pencil-alt + privilege: + - admin - name: Generate a review screen draft url: ${ interview_url(i=user_info().package + ":review_screen_generator.yml", reset=1) } image: pencil-alt diff --git a/docassemble/ALDashboard/translation.py b/docassemble/ALDashboard/translation.py index 25b20d0..f0a1c62 100644 --- a/docassemble/ALDashboard/translation.py +++ b/docassemble/ALDashboard/translation.py @@ -41,7 +41,9 @@ from docassemble.base.util import DAFile, language_name, get_config from docassemble.webapp.server import 
mako_parts from typing import NamedTuple, Dict -from docassemble.ALToolbox.llms import * +from docassemble.ALToolbox.llms import chat_completion + +import tiktoken DEFAULT_LANGUAGE = "en" @@ -50,6 +52,7 @@ "translation_file", "translate_fragments", "gpt_is_available", + "translate_fragments_gpt", ] def gpt_is_available() -> bool: @@ -71,9 +74,18 @@ def may_have_html(text:str) -> bool: """ return re.search(r'<\w+.*?>.*?<\/\w+>', text, flags=re.MULTILINE) is not None -def translate_fragments_gpt(fragments:Union[str,List[str]], source_language:str, tr_lang:str, special_words: Optional[Dict[str,str]] = None) -> Dict[str]: - """Use GPT-3.5 to translate a list of fragments (strings) from one language to another and provide a dictionary +def translate_fragments_gpt(fragments:Union[str,List[str]], source_language:str, tr_lang:str, special_words: Optional[Dict[str,str]] = None, model="gpt-3.5-turbo-1106", max_tokens=3900) -> Dict[int, str]: + """Use GPT-3.5-1106 to translate a list of fragments (strings) from one language to another and provide a dictionary with the original text and the translated text. + + You can optionally provide an alternative model, but it must support JSON mode. + + Args: + fragments: A list of strings to be translated. + source_language: The language of the original text. + tr_lang: The language to translate the text into. + special_words: A dictionary of special words that should be translated in a specific way. + model: The GPT model to use. The default is "gpt-3.5-turbo-1106". """ try: language_in_english = language_name(source_language) @@ -90,6 +102,14 @@ def translate_fragments_gpt(fragments:Union[str,List[str]], source_language:str, system_prompt = f"""You are a helpful translator that translates Docassemble interviews from "{language_in_english}" to "{tr_language_in_english}". You preserve the meaning of all sentences while aiming to produce a translation at or below a 9th grade reading level. 
+ You will get input that looks like this that indicates a row in a table and the untranslated text in that row: + + [ + {{0, "Your name"}}, + {{10, "When was ${{ user.name }} born?"}}, + {{32, "
Here is some text and a link.
}} + ] + When you see Mako tags or HTML tags, you do not translate them. You can translate text in quotes that appears to be intended to be shown to the user, but if there is a chance text is intended for the program logic you do not translate it. You do not change the whitespace because whitespace can have meaning in Docassemble. @@ -99,24 +119,56 @@ def translate_fragments_gpt(fragments:Union[str,List[str]], source_language:str, When you see one of the special words in the following table in the first column, you use a form of the suggested replacement rather than inventing a new translation: {special_words} - - Your only reply is a JSON object that looks like this: - {{ - "original text": "translated text", - }} """ - user_message = f""" - {fragments} + system_prompt += """ + Your only reply is a JSON object that looks like this: + { + [ROW NUMBER]: "[TRANSLATED TEXT]", + } + + Where [ROW NUMBER] is the matching row index number, and [TRANSLATED TEXT] is the translated text. """ -def translate_fragments_google(fragments:Union[str,List[str]], source_language:str, tr_lang:str, special_words: Optional[Dict[str,str]] = None) -> Dict[str]: + encoding = tiktoken.encoding_for_model(model) + system_token_count = len(encoding.encode(system_prompt)) + user_message_token_count = len(encoding.encode(repr(fragments))) + token_count = system_token_count + user_message_token_count + number_of_chunks_to_make = 1 + if token_count > max_tokens: + # Divide the fragments into smaller chunks + max_chunk_size = max_tokens - system_token_count + chunked_fragments = [] + + # Most of the time, each fragment will be well under the max token limit, + # so heuristic of just assuming each fragment is equal size should be OK + number_of_chunks_to_make = math.ceil(token_count / max_tokens) + + results = {} + for c in range(number_of_chunks_to_make): + chunked_fragments = fragments + if number_of_chunks_to_make > 1: + chunked_fragments = fragments[c*max_chunk_size:(c+1)*max_chunk_size] + 
response = chat_completion( + system_prompt, + user_message = repr(chunked_fragments), + temperature = 0.0, + json_mode = True, + model=model, + ) + + results.update(response) + + return results + + +def translate_fragments_google(fragments:Union[str,List[str]], source_language:str, tr_lang:str, special_words: Optional[Dict[str,str]] = None) -> Dict[int, str]: """Use Google Translate to translate a list of fragments (strings) from one language to another and provide a dictionary with the original text and the translated text. """ return fragments -def translate_fragments(fragments:Union[str,List[str]], language:str, tr_lang:str, allow_gpt=True, allow_google=True, special_words=Dict[str,str]) -> Dict[str]: +def translate_fragments(fragments:Union[str,List[str]], language:str, tr_lang:str, allow_gpt=True, allow_google=True, special_words=Dict[str,str]) -> Dict[int, str]: """ Translate a list of fragments (strings) from one language to another. """ @@ -151,6 +203,17 @@ class Translation(NamedTuple): untranslated_segments: int # Number of rows in the output that have untranslated text - one for each question, subquestion, field, etc. 
total_rows: int +class TranslationRow(NamedTuple): + source: str + question_id: str + index_num: int + hash: str + orig_lang: str + tr_lang: str + orig_text: str + tr_text: str + + def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_translate=False) -> Translation: """ Return a tuple of the translation file in XLSX format, plus a count of the @@ -291,15 +354,35 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t indexno += 1 # Create the output file - # We don't have any other filetypes for now, but we could add XLIFF support later - if filetype == 'XLSX': + if filetype == 'XLSX': # We only support XLSX for now, but this came from upstream implementation xlsx_filename = docassemble.base.functions.space_to_underscore(os.path.splitext(os.path.basename(re.sub(r'.*:', '', yaml_filename)))[0]) + "_" + tr_lang + ".xlsx" output_file.initialize(filename=xlsx_filename) workbook = xlsxwriter.Workbook(output_file.path()) worksheet = workbook.add_worksheet() + + # Add a bold format for the header bold = workbook.add_format({'bold': 1}) - text = workbook.add_format() - text.set_align('top') + + # Add the table headings + worksheet.write('A1', 'interview', bold) + worksheet.write('B1', 'question_id', bold) + worksheet.write('C1', 'index_num', bold) + worksheet.write('D1', 'hash', bold) + worksheet.write('E1', 'orig_lang', bold) + worksheet.write('F1', 'tr_lang', bold) + worksheet.write('G1', 'orig_text', bold) + worksheet.write('H1', 'tr_text', bold) + + # Set column widths + worksheet.set_column(0, 0, 25) # interview source + worksheet.set_column(1, 1, 15) # question_id + worksheet.set_column(2, 2, 12) # index_num + worksheet.set_column(6, 6, 75) # orig_text + worksheet.set_column(6, 7, 75) # tr_text + + # Create some formats to use for syntax highlighting + text_format = workbook.add_format() + text_format.set_align('top') fixedcell = workbook.add_format() fixedcell.set_align('top') fixedcell.set_text_wrap() @@ -350,6 
+433,8 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t wholefixedunlockedtwo.set_align('top') wholefixedunlockedtwo.set_text_wrap() # wholefixedunlockedtwo.set_locked(False) + + # This is a variation on above formats to be used to mark "draft" translations (from GPT-4) draft_translation_format = workbook.add_format() draft_translation_format.set_bg_color('yellow') draft_translation_format_one = workbook.add_format() @@ -381,22 +466,11 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t whole_draft_translation_format_two.set_align('top') whole_draft_translation_format_two.set_text_wrap() + # Default number format numb = workbook.add_format() numb.set_align('top') - worksheet.write('A1', 'interview', bold) - worksheet.write('B1', 'question_id', bold) - worksheet.write('C1', 'index_num', bold) - worksheet.write('D1', 'hash', bold) - worksheet.write('E1', 'orig_lang', bold) - worksheet.write('F1', 'tr_lang', bold) - worksheet.write('G1', 'orig_text', bold) - worksheet.write('H1', 'tr_text', bold) - worksheet.set_column(0, 0, 25) - worksheet.set_column(1, 1, 15) - worksheet.set_column(2, 2, 12) - worksheet.set_column(6, 6, 75) - worksheet.set_column(6, 7, 75) + # Write the data row = 1 seen = [] untranslated_segments = 0 @@ -428,13 +502,17 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t tr_text = str(tr_cache[item][language][tr_lang]['tr_text']) else: # This string needs to be translated tr_text = '' + hold_for_draft_translation.append((row, item )) # item is the original untranslated string, pre-mako parsing untranslated_segments += 1 - worksheet.write_string(row, 0, question.from_source.get_name(), text) - worksheet.write_string(row, 1, question_id, text) + + # Add the metadata + + worksheet.write_string(row, 0, question.from_source.get_name(), text_format) + worksheet.write_string(row, 1, question_id, text_format) worksheet.write_number(row, 2, indexno, numb) - 
worksheet.write_string(row, 3, hashlib.md5(item.encode('utf-8')).hexdigest(), text) - worksheet.write_string(row, 4, language, text) - worksheet.write_string(row, 5, tr_lang, text) + worksheet.write_string(row, 3, hashlib.md5(item.encode('utf-8')).hexdigest(), text_format) + worksheet.write_string(row, 4, language, text_format) + worksheet.write_string(row, 5, tr_lang, text_format) mako = mako_parts(item) if not tr_text: @@ -496,15 +574,46 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t row += 1 seen.append(item) + # Now we need to translate the hold_for_draft_translation items + if use_gpt: + translated_fragments = translate_fragments_gpt( + [{item[0]: item[1]} for item in hold_for_draft_translation], + source_language=language, + tr_lang=tr_lang, + ) + for row, item in translated_fragments.items(): + # Get the mako parts + mako = mako_parts(item) + if len(mako) == 0: + worksheet.write_string(row, 7, tr_text, whole_draft_translation_format) + elif len(mako) == 1: + if mako[0][1] == 0: + worksheet.write_string(row, 7, tr_text, whole_draft_translation_format) + elif mako[0][1] == 1: + worksheet.write_string(row, 7, tr_text, whole_draft_translation_format_one) + elif mako[0][1] == 2: + worksheet.write_string(row, 7, tr_text, whole_draft_translation_format_two) + else: + parts = [row, 7] + for part in mako: + if part[1] == 0: + parts.extend([fixed, part[0]]) + elif part[1] == 1: + parts.extend([fixedone, part[0]]) + elif part[1] == 2: + parts.extend([fixedtwo, part[0]]) + parts.append(fixedcell) + worksheet.write_rich_string(*parts) + for item, cache_item in tr_cache.items(): if item in seen or language not in cache_item or tr_lang not in cache_item[language]: continue - worksheet.write_string(row, 0, cache_item[language][tr_lang]['interview'], text) - worksheet.write_string(row, 1, cache_item[language][tr_lang]['question_id'], text) + worksheet.write_string(row, 0, cache_item[language][tr_lang]['interview'], text_format) + 
worksheet.write_string(row, 1, cache_item[language][tr_lang]['question_id'], text_format) worksheet.write_number(row, 2, 1000 + cache_item[language][tr_lang]['index_num'], numb) - worksheet.write_string(row, 3, cache_item[language][tr_lang]['hash'], text) - worksheet.write_string(row, 4, cache_item[language][tr_lang]['orig_lang'], text) - worksheet.write_string(row, 5, cache_item[language][tr_lang]['tr_lang'], text) + worksheet.write_string(row, 3, cache_item[language][tr_lang]['hash'], text_format) + worksheet.write_string(row, 4, cache_item[language][tr_lang]['orig_lang'], text_format) + worksheet.write_string(row, 5, cache_item[language][tr_lang]['tr_lang'], text_format) mako = mako_parts(cache_item[language][tr_lang]['orig_text']) if len(mako) == 1: if mako[0][1] == 0: @@ -550,4 +659,5 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t workbook.close() untranslated_words = len(re.findall(r"\w+", untranslated_text)) return Translation(output_file, untranslated_words,untranslated_segments, total_rows) + raise ValueError("That's not a valid filetype for a translation file") \ No newline at end of file From c1f72e451c5ad3c1395d6ab4fbba048253233a3c Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Mon, 25 Mar 2024 23:35:26 -0400 Subject: [PATCH 03/13] Add a draft translation with GPT-3.5 feature --- .../data/questions/generate_translation.yml | 2 +- docassemble/ALDashboard/translation.py | 45 +++++++++---------- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/docassemble/ALDashboard/data/questions/generate_translation.yml b/docassemble/ALDashboard/data/questions/generate_translation.yml index d08b89a..09119b3 100644 --- a/docassemble/ALDashboard/data/questions/generate_translation.yml +++ b/docassemble/ALDashboard/data/questions/generate_translation.yml @@ -38,7 +38,7 @@ reload: True event: translate_file code: | translations = [ - translation_file(the_yaml_path, tr_lang, use_gpt=use_gpt) + 
translation_file(the_yaml_path, tr_lang, use_gpt=use_gpt, openai_api=get_config("openai api key", get_config("open ai", {}).get("key"))) for tr_lang in tr_langs.split() ] diff --git a/docassemble/ALDashboard/translation.py b/docassemble/ALDashboard/translation.py index 1c39dce..c9fccc0 100644 --- a/docassemble/ALDashboard/translation.py +++ b/docassemble/ALDashboard/translation.py @@ -76,7 +76,7 @@ def may_have_html(text:str) -> bool: """ return re.search(r'<\w+.*?>.*?<\/\w+>', text, flags=re.MULTILINE) is not None -def translate_fragments_gpt(fragments:Union[str,List[str]], source_language:str, tr_lang:str, special_words: Optional[Dict[str,str]] = None, model="gpt-3.5-turbo-1106", max_tokens=3900) -> Dict[int, str]: +def translate_fragments_gpt(fragments:Union[str,List[str]], source_language:str, tr_lang:str, special_words: Optional[Dict[str,str]] = None, model="gpt-3.5-turbo-1106", max_tokens=3900, openai_api:Optional[str]=None) -> Dict[int, str]: """Use GPT-3.5-1106 to translate a list of fragments (strings) from one language to another and provide a dictionary with the original text and the translated text. @@ -156,6 +156,7 @@ def translate_fragments_gpt(fragments:Union[str,List[str]], source_language:str, temperature = 0.0, json_mode = True, model=model, + openai_api=openai_api, ) results.update(response) @@ -205,18 +206,8 @@ class Translation(NamedTuple): untranslated_segments: int # Number of rows in the output that have untranslated text - one for each question, subquestion, field, etc. 
total_rows: int -class TranslationRow(NamedTuple): - source: str - question_id: str - index_num: int - hash: str - orig_lang: str - tr_lang: str - orig_text: str - tr_text: str - -def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_translate=False) -> Translation: +def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_translate=False, openai_api:Optional[str]=None) -> Translation: """ Return a tuple of the translation file in XLSX format, plus a count of the number of words and segments that need to be translated. @@ -524,6 +515,7 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t whole_draft_translation_format = workbook.add_format() whole_draft_translation_format.set_bg_color('yellow') + whole_draft_translation_format.set_font_color('black') whole_draft_translation_format.set_align('top') whole_draft_translation_format.set_text_wrap() @@ -541,6 +533,11 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t whole_draft_translation_format_two.set_align('top') whole_draft_translation_format_two.set_text_wrap() + draft_fixedcell = workbook.add_format() + draft_fixedcell.set_align("top") + draft_fixedcell.set_text_wrap() + draft_fixedcell.set_bg_color('yellow') + # Default number format numb = workbook.add_format() numb.set_align('top') @@ -652,32 +649,34 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t # Now we need to translate the hold_for_draft_translation items if use_gpt: translated_fragments = translate_fragments_gpt( - [{item[0]: item[1]} for item in hold_for_draft_translation], + [{item[0]: item[1]} for item in hold_for_draft_translation], # We send a list of dictionaries for easier partitioning if we exceed max_tokens source_language=language, tr_lang=tr_lang, + openai_api=openai_api, ) - for row, item in translated_fragments.items(): - # Get the mako parts + for row, item in translated_fragments.items(): # But 
we get back one dictionary + row = int(row) # it seems sometimes GPT-3.5 makes it a string, not an int + # Get the mako mako = mako_parts(item) if len(mako) == 0: - worksheet.write_string(row, 7, tr_text, whole_draft_translation_format) + worksheet.write_string(row, 7, item, whole_draft_translation_format) elif len(mako) == 1: if mako[0][1] == 0: - worksheet.write_string(row, 7, tr_text, whole_draft_translation_format) + worksheet.write_string(row, 7, item, whole_draft_translation_format) elif mako[0][1] == 1: - worksheet.write_string(row, 7, tr_text, whole_draft_translation_format_one) + worksheet.write_string(row, 7, item, whole_draft_translation_format_one) elif mako[0][1] == 2: - worksheet.write_string(row, 7, tr_text, whole_draft_translation_format_two) + worksheet.write_string(row, 7, item, whole_draft_translation_format_two) else: parts = [row, 7] for part in mako: if part[1] == 0: - parts.extend([fixed, part[0]]) + parts.extend([whole_draft_translation_format, part[0]]) elif part[1] == 1: - parts.extend([fixedone, part[0]]) + parts.extend([whole_draft_translation_format_one, part[0]]) elif part[1] == 2: - parts.extend([fixedtwo, part[0]]) - parts.append(fixedcell) + parts.extend([whole_draft_translation_format_two, part[0]]) + parts.append(draft_fixedcell) worksheet.write_rich_string(*parts) for item, cache_item in tr_cache.items(): From 1e123342921db1e6e43e63730607a9176fe91dc6 Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Mon, 25 Mar 2024 23:39:20 -0400 Subject: [PATCH 04/13] Remove accidentally added menu item --- docassemble/ALDashboard/data/questions/menu.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docassemble/ALDashboard/data/questions/menu.yml b/docassemble/ALDashboard/data/questions/menu.yml index 68b9f4a..61054e2 100644 --- a/docassemble/ALDashboard/data/questions/menu.yml +++ b/docassemble/ALDashboard/data/questions/menu.yml @@ -61,11 +61,6 @@ data: privilege: - admin - developer - - name: Manage answer file viewers - url: ${ 
interview_url(i=user_info().package + ":manage_answer_viewers.yml", reset=1) } - image: pencil-alt - privilege: - - admin - name: Generate a review screen draft url: ${ interview_url(i=user_info().package + ":review_screen_generator.yml", reset=1) } image: pencil-alt From 9d3efc67721365aa58a97b4e7d4a4c0e1d66f5d6 Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Mon, 25 Mar 2024 23:53:33 -0400 Subject: [PATCH 05/13] Format with Black --- docassemble/ALDashboard/package_scanner.py | 6 +- docassemble/ALDashboard/translation.py | 321 +++++++++++++-------- 2 files changed, 207 insertions(+), 120 deletions(-) diff --git a/docassemble/ALDashboard/package_scanner.py b/docassemble/ALDashboard/package_scanner.py index 28cfe35..3258331 100644 --- a/docassemble/ALDashboard/package_scanner.py +++ b/docassemble/ALDashboard/package_scanner.py @@ -163,9 +163,9 @@ def fetch_github_repo_version(repo_list, key_pkgs, github_user) -> dict: version_num = decoded_line[str_start:str_end][9:].replace( "',\n", "" ) - v[ - "version" - ] = version_num # Add version number to the original repo_list. + v["version"] = ( + version_num # Add version number to the original repo_list. + ) has_version_num = True break diff --git a/docassemble/ALDashboard/translation.py b/docassemble/ALDashboard/translation.py index c9fccc0..f1aeced 100644 --- a/docassemble/ALDashboard/translation.py +++ b/docassemble/ALDashboard/translation.py @@ -52,31 +52,41 @@ __all__ = [ "Translation", "translation_file", - "translate_fragments", "gpt_is_available", "translate_fragments_gpt", ] + def gpt_is_available() -> bool: """ Return True if the GPT API is available. """ return get_config("open ai", {}).get("key") is not None - -def may_have_mako(text:str) -> bool: + +def may_have_mako(text: str) -> bool: """ Return True if the text appears to contain any Mako code, such as ${...} or % at the beginning of a line. 
""" - return re.search(r'\${|^\s*%', text, flags=re.MULTILINE) is not None + return re.search(r"\${|^\s*%", text, flags=re.MULTILINE) is not None -def may_have_html(text:str) -> bool: + +def may_have_html(text: str) -> bool: """ Return True if the text appears to contain any HTML code, such as
<a href="...">...</a> or <div>...</div>.
@@ -144,17 +154,17 @@ def translate_fragments_gpt(fragments:Union[str,List[str]], source_language:str, # Most of the time, each fragment will be well under the max token limit, # so heuristic of just assuming each fragment is equal size should be OK number_of_chunks_to_make = math.ceil(token_count / max_tokens) - + results = {} for c in range(number_of_chunks_to_make): chunked_fragments = fragments if number_of_chunks_to_make > 1: - chunked_fragments = fragments[c*max_chunk_size:(c+1)*max_chunk_size] + chunked_fragments = fragments[c * max_chunk_size : (c + 1) * max_chunk_size] response = chat_completion( system_prompt, - user_message = repr(chunked_fragments), - temperature = 0.0, - json_mode = True, + user_message=repr(chunked_fragments), + temperature=0.0, + json_mode=True, model=model, openai_api=openai_api, ) @@ -164,50 +174,79 @@ def translate_fragments_gpt(fragments:Union[str,List[str]], source_language:str, return results -def translate_fragments_google(fragments:Union[str,List[str]], source_language:str, tr_lang:str, special_words: Optional[Dict[str,str]] = None) -> Dict[int, str]: - """Use Google Translate to translate a list of fragments (strings) from one language to another and provide a dictionary - with the original text and the translated text. - """ - return fragments - - -def translate_fragments(fragments:Union[str,List[str]], language:str, tr_lang:str, allow_gpt=True, allow_google=True, special_words=Dict[str,str]) -> Dict[int, str]: - """ - Translate a list of fragments (strings) from one language to another. 
- """ - if not (allow_google or allow_gpt): - raise ValueError("You must allow at least one translation method") - - if isinstance(fragments, str): - fragments = [fragments] - if language == tr_lang: - return fragments - - fragments_with_code = [] - fragments_without_code = [] +# def translate_fragments_google( +# fragments: Union[str, List[str]], +# source_language: str, +# tr_lang: str, +# special_words: Optional[Dict[str, str]] = None, +# ) -> Dict[int, str]: +# """Use Google Translate to translate a list of fragments (strings) from one language to another and provide a dictionary +# with the original text and the translated text. +# """ +# return fragments + + +# def translate_fragments( +# fragments: Union[str, List[str]], +# language: str, +# tr_lang: str, +# allow_gpt=True, +# allow_google=True, +# special_words=Dict[str, str], +# ) -> Dict[int, str]: +# """ +# Translate a list of fragments (strings) from one language to another. +# """ +# if not (allow_google or allow_gpt): +# raise ValueError("You must allow at least one translation method") + +# if isinstance(fragments, str): +# fragments = [fragments] +# if language == tr_lang: +# return fragments + +# fragments_with_code = [] +# fragments_without_code = [] + +# if allow_gpt and allow_google: +# for fragment in fragments: +# if may_have_html(fragment) or may_have_mako(fragment): +# fragments_with_code.append(fragment) +# else: +# fragments_without_code.append(fragment) +# results = translate_fragments_gpt( +# fragments_with_code, language, tr_lang, special_words +# ) +# results.update( +# translate_fragments_google( +# fragments_without_code, language, tr_lang, special_words +# ) +# ) +# elif allow_gpt: +# results = translate_fragments_gpt(fragments, language, tr_lang, special_words) +# else: # allow_google +# results = translate_fragments_google( +# fragments, language, tr_lang, special_words +# ) +# return results - if allow_gpt and allow_google: - for fragment in fragments: - if 
may_have_html(fragment) or may_have_mako(fragment): - fragments_with_code.append(fragment) - else: - fragments_without_code.append(fragment) - results = translate_fragments_gpt(fragments_with_code, language, tr_lang, special_words) - results.update(translate_fragments_google(fragments_without_code, language, tr_lang, special_words)) - elif allow_gpt: - results = translate_fragments_gpt(fragments, language, tr_lang, special_words) - else: # allow_google - results = translate_fragments_google(fragments, language, tr_lang, special_words) - return results class Translation(NamedTuple): file: DAFile # an XLSX or XLIFF file - untranslated_words: int # Word count for all untranslated segments that are not Mako or HTML + untranslated_words: ( + int # Word count for all untranslated segments that are not Mako or HTML + ) untranslated_segments: int # Number of rows in the output that have untranslated text - one for each question, subquestion, field, etc. total_rows: int -def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_translate=False, openai_api:Optional[str]=None) -> Translation: +def translation_file( + yaml_filename: str, + tr_lang: str, + use_gpt=False, + use_google_translate=False, + openai_api: Optional[str] = None, +) -> Translation: """ Return a tuple of the translation file in XLSX format, plus a count of the number of words and segments that need to be translated. @@ -217,7 +256,9 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t This code was adjusted from the Flask endpoint-only version in server.py. XLIFF support was removed for now but can be added later. 
""" - filetype: str = "XLSX" # Look in server.py for support of XLIFF format, but we won't implement it here + filetype: str = ( + "XLSX" # Look in server.py for support of XLIFF format, but we won't implement it here + ) output_file = DAFile() setup_translation() if yaml_filename is None or not re.search(r"\S", yaml_filename): @@ -418,37 +459,46 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t tr_cache[orig_text][source_lang] = {} tr_cache[orig_text][source_lang][target_lang] = the_dict indexno += 1 - + # Create the output file - if filetype == 'XLSX': # We only support XLSX for now, but this came from upstream implementation - xlsx_filename = docassemble.base.functions.space_to_underscore(os.path.splitext(os.path.basename(re.sub(r'.*:', '', yaml_filename)))[0]) + "_" + tr_lang + ".xlsx" + if ( + filetype == "XLSX" + ): # We only support XLSX for now, but this came from upstream implementation + xlsx_filename = ( + docassemble.base.functions.space_to_underscore( + os.path.splitext(os.path.basename(re.sub(r".*:", "", yaml_filename)))[0] + ) + + "_" + + tr_lang + + ".xlsx" + ) output_file.initialize(filename=xlsx_filename) workbook = xlsxwriter.Workbook(output_file.path()) worksheet = workbook.add_worksheet() # Add a bold format for the header - bold = workbook.add_format({'bold': 1}) + bold = workbook.add_format({"bold": 1}) # Add the table headings - worksheet.write('A1', 'interview', bold) - worksheet.write('B1', 'question_id', bold) - worksheet.write('C1', 'index_num', bold) - worksheet.write('D1', 'hash', bold) - worksheet.write('E1', 'orig_lang', bold) - worksheet.write('F1', 'tr_lang', bold) - worksheet.write('G1', 'orig_text', bold) - worksheet.write('H1', 'tr_text', bold) + worksheet.write("A1", "interview", bold) + worksheet.write("B1", "question_id", bold) + worksheet.write("C1", "index_num", bold) + worksheet.write("D1", "hash", bold) + worksheet.write("E1", "orig_lang", bold) + worksheet.write("F1", "tr_lang", bold) + 
worksheet.write("G1", "orig_text", bold) + worksheet.write("H1", "tr_text", bold) # Set column widths - worksheet.set_column(0, 0, 25) # interview source - worksheet.set_column(1, 1, 15) # question_id - worksheet.set_column(2, 2, 12) # index_num - worksheet.set_column(6, 6, 75) # orig_text - worksheet.set_column(6, 7, 75) # tr_text + worksheet.set_column(0, 0, 25) # interview source + worksheet.set_column(1, 1, 15) # question_id + worksheet.set_column(2, 2, 12) # index_num + worksheet.set_column(6, 6, 75) # orig_text + worksheet.set_column(6, 7, 75) # tr_text # Create some formats to use for syntax highlighting text_format = workbook.add_format() - text_format.set_align('top') + text_format.set_align("top") fixedcell = workbook.add_format() fixedcell.set_align("top") fixedcell.set_text_wrap() @@ -502,45 +552,45 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t # This is a variation on above formats to be used to mark "draft" translations (from GPT-4) draft_translation_format = workbook.add_format() - draft_translation_format.set_bg_color('yellow') + draft_translation_format.set_bg_color("yellow") draft_translation_format_one = workbook.add_format() - draft_translation_format_one.set_bg_color('yellow') + draft_translation_format_one.set_bg_color("yellow") draft_translation_format_one.set_bold() - draft_translation_format_one.set_font_color('green') + draft_translation_format_one.set_font_color("green") draft_translation_format_two = workbook.add_format() - draft_translation_format_two.set_bg_color('yellow') + draft_translation_format_two.set_bg_color("yellow") draft_translation_format_two.set_bold() - draft_translation_format_two.set_font_color('blue') + draft_translation_format_two.set_font_color("blue") whole_draft_translation_format = workbook.add_format() - whole_draft_translation_format.set_bg_color('yellow') - whole_draft_translation_format.set_font_color('black') - whole_draft_translation_format.set_align('top') + 
whole_draft_translation_format.set_bg_color("yellow") + whole_draft_translation_format.set_font_color("black") + whole_draft_translation_format.set_align("top") whole_draft_translation_format.set_text_wrap() whole_draft_translation_format_one = workbook.add_format() - whole_draft_translation_format_one.set_bg_color('yellow') + whole_draft_translation_format_one.set_bg_color("yellow") whole_draft_translation_format_one.set_bold() - whole_draft_translation_format_one.set_font_color('green') - whole_draft_translation_format_one.set_align('top') + whole_draft_translation_format_one.set_font_color("green") + whole_draft_translation_format_one.set_align("top") whole_draft_translation_format_one.set_text_wrap() whole_draft_translation_format_two = workbook.add_format() - whole_draft_translation_format_two.set_bg_color('yellow') + whole_draft_translation_format_two.set_bg_color("yellow") whole_draft_translation_format_two.set_bold() - whole_draft_translation_format_two.set_font_color('blue') - whole_draft_translation_format_two.set_align('top') - whole_draft_translation_format_two.set_text_wrap() + whole_draft_translation_format_two.set_font_color("blue") + whole_draft_translation_format_two.set_align("top") + whole_draft_translation_format_two.set_text_wrap() draft_fixedcell = workbook.add_format() draft_fixedcell.set_align("top") draft_fixedcell.set_text_wrap() - draft_fixedcell.set_bg_color('yellow') + draft_fixedcell.set_bg_color("yellow") # Default number format numb = workbook.add_format() - numb.set_align('top') + numb.set_align("top") # Write the data row = 1 @@ -570,19 +620,29 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t continue total_rows += 1 # The segment has already been translated and the translation is still valid - if item in tr_cache and language in tr_cache[item] and tr_lang in tr_cache[item][language]: - tr_text = str(tr_cache[item][language][tr_lang]['tr_text']) - else: # This string needs to be translated - 
tr_text = '' - hold_for_draft_translation.append((row, item )) # item is the original untranslated string, pre-mako parsing + if ( + item in tr_cache + and language in tr_cache[item] + and tr_lang in tr_cache[item][language] + ): + tr_text = str(tr_cache[item][language][tr_lang]["tr_text"]) + else: # This string needs to be translated + tr_text = "" + hold_for_draft_translation.append( + (row, item) + ) # item is the original untranslated string, pre-mako parsing untranslated_segments += 1 - - # Add the metadata - - worksheet.write_string(row, 0, question.from_source.get_name(), text_format) + + # Add the metadata + + worksheet.write_string( + row, 0, question.from_source.get_name(), text_format + ) worksheet.write_string(row, 1, question_id, text_format) worksheet.write_number(row, 2, indexno, numb) - worksheet.write_string(row, 3, hashlib.md5(item.encode('utf-8')).hexdigest(), text_format) + worksheet.write_string( + row, 3, hashlib.md5(item.encode("utf-8")).hexdigest(), text_format + ) worksheet.write_string(row, 4, language, text_format) worksheet.write_string(row, 5, tr_lang, text_format) mako = mako_parts(item) @@ -612,8 +672,8 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t parts.extend([fixedtwo, part[0]]) parts.append(fixedcell) worksheet.write_rich_string(*parts) - - # + + # mako = mako_parts(tr_text) if len(mako) == 0: @@ -645,28 +705,41 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t indexno += 1 row += 1 seen.append(item) - + # Now we need to translate the hold_for_draft_translation items if use_gpt: translated_fragments = translate_fragments_gpt( - [{item[0]: item[1]} for item in hold_for_draft_translation], # We send a list of dictionaries for easier partitioning if we exceed max_tokens + [ + {item[0]: item[1]} for item in hold_for_draft_translation + ], # We send a list of dictionaries for easier partitioning if we exceed max_tokens source_language=language, tr_lang=tr_lang, 
openai_api=openai_api, ) - for row, item in translated_fragments.items(): # But we get back one dictionary - row = int(row) # it seems sometimes GPT-3.5 makes it a string, not an int + for ( + row, + item, + ) in translated_fragments.items(): # But we get back one dictionary + row = int( + row + ) # it seems sometimes GPT-3.5 makes it a string, not an int # Get the mako mako = mako_parts(item) if len(mako) == 0: worksheet.write_string(row, 7, item, whole_draft_translation_format) elif len(mako) == 1: if mako[0][1] == 0: - worksheet.write_string(row, 7, item, whole_draft_translation_format) + worksheet.write_string( + row, 7, item, whole_draft_translation_format + ) elif mako[0][1] == 1: - worksheet.write_string(row, 7, item, whole_draft_translation_format_one) + worksheet.write_string( + row, 7, item, whole_draft_translation_format_one + ) elif mako[0][1] == 2: - worksheet.write_string(row, 7, item, whole_draft_translation_format_two) + worksheet.write_string( + row, 7, item, whole_draft_translation_format_two + ) else: parts = [row, 7] for part in mako: @@ -678,7 +751,7 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t parts.extend([whole_draft_translation_format_two, part[0]]) parts.append(draft_fixedcell) worksheet.write_rich_string(*parts) - + for item, cache_item in tr_cache.items(): if ( item in seen @@ -686,13 +759,25 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t or tr_lang not in cache_item[language] ): continue - worksheet.write_string(row, 0, cache_item[language][tr_lang]['interview'], text_format) - worksheet.write_string(row, 1, cache_item[language][tr_lang]['question_id'], text_format) - worksheet.write_number(row, 2, 1000 + cache_item[language][tr_lang]['index_num'], numb) - worksheet.write_string(row, 3, cache_item[language][tr_lang]['hash'], text_format) - worksheet.write_string(row, 4, cache_item[language][tr_lang]['orig_lang'], text_format) - worksheet.write_string(row, 5, 
cache_item[language][tr_lang]['tr_lang'], text_format) - mako = mako_parts(cache_item[language][tr_lang]['orig_text']) + worksheet.write_string( + row, 0, cache_item[language][tr_lang]["interview"], text_format + ) + worksheet.write_string( + row, 1, cache_item[language][tr_lang]["question_id"], text_format + ) + worksheet.write_number( + row, 2, 1000 + cache_item[language][tr_lang]["index_num"], numb + ) + worksheet.write_string( + row, 3, cache_item[language][tr_lang]["hash"], text_format + ) + worksheet.write_string( + row, 4, cache_item[language][tr_lang]["orig_lang"], text_format + ) + worksheet.write_string( + row, 5, cache_item[language][tr_lang]["tr_lang"], text_format + ) + mako = mako_parts(cache_item[language][tr_lang]["orig_text"]) if len(mako) == 1: if mako[0][1] == 0: worksheet.write_string( @@ -762,7 +847,9 @@ def translation_file(yaml_filename:str, tr_lang:str, use_gpt=False, use_google_t worksheet.set_row(row, 15 * (num_lines + 1)) row += 1 workbook.close() - untranslated_words = len(re.findall(r"\w+", untranslated_text)) - return Translation(output_file, untranslated_words,untranslated_segments, total_rows) - + untranslated_words = len(re.findall(r"\w+", untranslated_text)) + return Translation( + output_file, untranslated_words, untranslated_segments, total_rows + ) + raise ValueError("That's not a valid filetype for a translation file") From 76094c9bd0f12aa46bf49b233046e7632eeddc5a Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Tue, 26 Mar 2024 12:09:07 -0400 Subject: [PATCH 06/13] Typing --- docassemble/ALDashboard/translation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docassemble/ALDashboard/translation.py b/docassemble/ALDashboard/translation.py index f1aeced..1225230 100644 --- a/docassemble/ALDashboard/translation.py +++ b/docassemble/ALDashboard/translation.py @@ -155,7 +155,7 @@ def translate_fragments_gpt( # so heuristic of just assuming each fragment is equal size should be OK 
number_of_chunks_to_make = math.ceil(token_count / max_tokens) - results = {} + results:Dict[int, str] = {} for c in range(number_of_chunks_to_make): chunked_fragments = fragments if number_of_chunks_to_make > 1: @@ -169,7 +169,9 @@ def translate_fragments_gpt( openai_api=openai_api, ) - results.update(response) + assert isinstance(response, dict) + + results.update(response) # type-ignore return results From 1fa86083422848ba578b6faed33e53802eba647a Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Wed, 15 May 2024 11:44:39 -0400 Subject: [PATCH 07/13] WIP --- .../data/questions/generate_translation.yml | 39 +++++++- .../data/questions/manage_answer_viewers.yml | 91 +++++++++++++++++++ .../questions/test_translate_fragments.yml | 27 ++++++ docassemble/ALDashboard/translation.py | 31 ++++--- 4 files changed, 176 insertions(+), 12 deletions(-) create mode 100644 docassemble/ALDashboard/data/questions/manage_answer_viewers.yml create mode 100644 docassemble/ALDashboard/data/questions/test_translate_fragments.yml diff --git a/docassemble/ALDashboard/data/questions/generate_translation.yml b/docassemble/ALDashboard/data/questions/generate_translation.yml index 09119b3..0a19d71 100644 --- a/docassemble/ALDashboard/data/questions/generate_translation.yml +++ b/docassemble/ALDashboard/data/questions/generate_translation.yml @@ -14,6 +14,8 @@ code: | the_yaml_path if not the_task.ready(): waiting_screen + if the_task.failed(): + error_screen show_translation_results --- # code: | @@ -37,6 +39,7 @@ reload: True --- event: translate_file code: | + background_error_action('bg_fail', stage='calculation') translations = [ translation_file(the_yaml_path, tr_lang, use_gpt=use_gpt, openai_api=get_config("openai api key", get_config("open ai", {}).get("key"))) for tr_lang @@ -46,6 +49,7 @@ code: | --- event: save_translations code: | + background_error_action('bg_fail', stage='saving') translations = action_argument('translations') background_response() --- @@ -86,4 +90,37 @@ 
subquestion: | Number of untranslated rows: ${ translations[index].untranslated_segments } Percentage of rows that are not translated: %${ translations[index].untranslated_segments/translations[index].total_rows * 100 } - % endfor \ No newline at end of file + % endfor +--- +event: bg_fail +code: | + errmess = "Failure at the " \ + + action_argument('stage') \ + + " stage due to a " \ + + action_argument('error_type') \ + + " error" + background_response('handled_error') +--- +event: error_screen +question: | + There was an error. +subquestion: | + The saved error message was + ${ errmess }. + + The value was + `${ the_task.get() }`. + + The error was + `${ the_task.result().error_type }`. + + The trace was + + ${ indent(the_task.result().error_trace) } + + The message was + + ${ indent(the_task.result().error_message) } + + The variables were + `${ repr(the_task.result().variables) }`. \ No newline at end of file diff --git a/docassemble/ALDashboard/data/questions/manage_answer_viewers.yml b/docassemble/ALDashboard/data/questions/manage_answer_viewers.yml new file mode 100644 index 0000000..9a427e4 --- /dev/null +++ b/docassemble/ALDashboard/data/questions/manage_answer_viewers.yml @@ -0,0 +1,91 @@ +--- +include: + - nav.yml +--- +modules: + - .aldashboard +--- +metadata: + required privileges: + - admin + title: | + Manage limited answer viewers +--- +code: | + # Get the list of dispatch interviews + interviews = {interview['filename']:interview for interview in interview_menu()} +--- +objects: + - viewers: DADict.using(object_type = DAObject, auto_gather=False) + - viewers[i].allowed_interviews: DAList.using(there_are_any=False) +--- +table: viewers.table +rows: viewers +columns: + - Privilege: | + row_index + - Allowed Interviews: | + comma_and_list(row_item.allowed_interviews) +--- +table: viewers[i].allowed_interviews.table +rows: viewers[i].allowed_interviews +columns: + - Interview: | + row_item +--- +code: | + existing_viewers = get_config("assembly 
line",{}).get("interview viewers",{}) + + for privilege in manage_privileges('list'): + viewers.initializeObject(privilege) + if privilege in existing_viewers: + viewers[privilege].allowed_interviews = DAList( + viewers[privilege].attr_name("allowed_interviews"), + elements=existing_viewers[privilege], + auto_gather=False, + gathered=True + ) + viewers.gathered = True +--- +id: interview order +mandatory: True +code: | + view_viewers +--- +id: allowed interviews i +question: | + Add an interview that users with the privilege "${ i }" are allowed to view +subquestion: | + % if len(viewers[i].allowed_interviews): + The following interviews are currently allowed for this privilege: + + ${ comma_and_list(viewers[i].allowed_interviews) } + % endif +fields: + - Interview name: viewers[i].allowed_interviews[j] + datatype: combobox + code: | + sorted([{interview: interviews[interview].get('title')} for interview in interviews], key=lambda y: next(iter(y.values()), '')) +validation code: | + if viewers[i].allowed_interviews[j] in viewers[i].allowed_interviews[:j]: + validation_error("This interview is already in the list", field="viewers[i].allowed_interviews[j]") +--- +event: view_viewers +id: viewers +question: | + Who is allowed to view limited answers? +subquestion: | + The answer viewing feature makes use of Docassemble's built-in privilege system. + + To assign a user the right to view a particular interview's sessions, you must add a matching + privilege and then assign the interview to that privilege. + + % for privilege in viewers: +
+      <h2 class="h4">${ privilege }</h2>
model=model, - openai_api=openai_api, - ) - - results.update(response) + try: + response = chat_completion( + system_prompt, + user_message=repr(chunked_fragments), + temperature=0.0, + json_mode=True, + model=model, + openai_api=openai_api, + ) + # Get the exception and log it + except Exception as e: + log(f"Exception when calling chatcompletion: { e }") + response = str(e) + try: + results.update(response) + except: + log(f"Unexpected format in response from GPT: { response }") return results @@ -246,6 +253,7 @@ def translation_file( use_gpt=False, use_google_translate=False, openai_api: Optional[str] = None, + max_tokens=4000, ) -> Translation: """ Return a tuple of the translation file in XLSX format, plus a count of the @@ -715,6 +723,7 @@ def translation_file( source_language=language, tr_lang=tr_lang, openai_api=openai_api, + max_tokens=max_tokens, ) for ( row, From e7204a4004aa7b8c21c5c9a7bba9240b3e1a2275 Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Wed, 15 May 2024 11:50:22 -0400 Subject: [PATCH 08/13] Remove accidental addition --- .../data/questions/manage_answer_viewers.yml | 91 ------------------- 1 file changed, 91 deletions(-) delete mode 100644 docassemble/ALDashboard/data/questions/manage_answer_viewers.yml diff --git a/docassemble/ALDashboard/data/questions/manage_answer_viewers.yml b/docassemble/ALDashboard/data/questions/manage_answer_viewers.yml deleted file mode 100644 index 9a427e4..0000000 --- a/docassemble/ALDashboard/data/questions/manage_answer_viewers.yml +++ /dev/null @@ -1,91 +0,0 @@ ---- -include: - - nav.yml ---- -modules: - - .aldashboard ---- -metadata: - required privileges: - - admin - title: | - Manage limited answer viewers ---- -code: | - # Get the list of dispatch interviews - interviews = {interview['filename']:interview for interview in interview_menu()} ---- -objects: - - viewers: DADict.using(object_type = DAObject, auto_gather=False) - - viewers[i].allowed_interviews: DAList.using(there_are_any=False) ---- 
-table: viewers.table -rows: viewers -columns: - - Privilege: | - row_index - - Allowed Interviews: | - comma_and_list(row_item.allowed_interviews) ---- -table: viewers[i].allowed_interviews.table -rows: viewers[i].allowed_interviews -columns: - - Interview: | - row_item ---- -code: | - existing_viewers = get_config("assembly line",{}).get("interview viewers",{}) - - for privilege in manage_privileges('list'): - viewers.initializeObject(privilege) - if privilege in existing_viewers: - viewers[privilege].allowed_interviews = DAList( - viewers[privilege].attr_name("allowed_interviews"), - elements=existing_viewers[privilege], - auto_gather=False, - gathered=True - ) - viewers.gathered = True ---- -id: interview order -mandatory: True -code: | - view_viewers ---- -id: allowed interviews i -question: | - Add an interview that users with the privilege "${ i }" are allowed to view -subquestion: | - % if len(viewers[i].allowed_interviews): - The following interviews are currently allowed for this privilege: - - ${ comma_and_list(viewers[i].allowed_interviews) } - % endif -fields: - - Interview name: viewers[i].allowed_interviews[j] - datatype: combobox - code: | - sorted([{interview: interviews[interview].get('title')} for interview in interviews], key=lambda y: next(iter(y.values()), '')) -validation code: | - if viewers[i].allowed_interviews[j] in viewers[i].allowed_interviews[:j]: - validation_error("This interview is already in the list", field="viewers[i].allowed_interviews[j]") ---- -event: view_viewers -id: viewers -question: | - Who is allowed to view limited answers? -subquestion: | - The answer viewing feature makes use of Docassemble's built-in privilege system. - - To assign a user the right to view a particular interview's sessions, you must add a matching - privilege and then assign the interview to that privilege. - - % for privilege in viewers: -

${ privilege }

- - ${ viewers[privilege].allowed_interviews.table } - - ${ viewers[privilege].allowed_interviews.add_action() } - % endfor - - ${ action_button_html(url_action("save_changes", label="Save to global configuration", color="primary", )) } \ No newline at end of file From d8d39540e5a20cd719ff2c7b7576003670eacfb0 Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Wed, 4 Jun 2025 18:59:14 -0400 Subject: [PATCH 09/13] Simplify implementation - not worth saving a few pennies to get the bulk translation to work, esp. now we have cached prompts --- .../data/questions/generate_translation.yml | 23 ++- docassemble/ALDashboard/translation.py | 161 ++++++------------ 2 files changed, 74 insertions(+), 110 deletions(-) diff --git a/docassemble/ALDashboard/data/questions/generate_translation.yml b/docassemble/ALDashboard/data/questions/generate_translation.yml index 0a19d71..e8bfc82 100644 --- a/docassemble/ALDashboard/data/questions/generate_translation.yml +++ b/docassemble/ALDashboard/data/questions/generate_translation.yml @@ -41,7 +41,14 @@ event: translate_file code: | background_error_action('bg_fail', stage='calculation') translations = [ - translation_file(the_yaml_path, tr_lang, use_gpt=use_gpt, openai_api=get_config("openai api key", get_config("open ai", {}).get("key"))) + translation_file( + the_yaml_path, + tr_lang, + use_gpt=use_gpt, + openai_api=get_config("openai api key", get_config("open ai", {}).get("key")), + interview_context=interview_context if use_context else None, + + ) for tr_lang in tr_langs.split() ] @@ -70,8 +77,20 @@ fields: show if: code: | gpt_is_available() # or google_translate_is_available() + - Include context to help the translation: use_context + datatype: yesno + show if: use_gpt + - Context (explain what the interview is about, so isolated fragments get a better translation): interview_context + datatype: area + show if: use_context + - Add a glossary of special terms to help the translation: use_special_words + datatype: yesno + show 
if: use_gpt + - 'Add a list of terms and their translations (one per line, like: "term: translation")': special_words + datatype: area + show if: use_special_words - note: | - To use OpenAI's GPT-3, you need to set up an OpenAI account and get an API key. + To use AI translation, you need to set up an OpenAI account and get an API key. show if: code: | not gpt_is_available() diff --git a/docassemble/ALDashboard/translation.py b/docassemble/ALDashboard/translation.py index af3d6b3..bdb6069 100644 --- a/docassemble/ALDashboard/translation.py +++ b/docassemble/ALDashboard/translation.py @@ -82,12 +82,15 @@ def translate_fragments_gpt( fragments: Union[str, List[Dict[int, str]]], source_language: str, tr_lang: str, + interview_context: Optional[str] = None, special_words: Optional[Dict[int, str]] = None, - model="gpt-3.5-turbo-1106", - max_tokens=3900, + model="gpt-4.1-nano", + openai_base_url: Optional[str] = None, + max_output_tokens: int = None, + max_input_tokens: int = None, openai_api: Optional[str] = None, ) -> Dict[int, str]: - """Use GPT-3.5-1106 to translate a list of fragments (strings) from one language to another and provide a dictionary + """Use an AI model to translate a list of fragments (strings) from one language to another and provide a dictionary with the original text and the translated text. You can optionally provide an alternative model, but it must support JSON mode. @@ -97,8 +100,16 @@ def translate_fragments_gpt( source_language: The language of the original text. tr_lang: The language to translate the text into. special_words: A dictionary of special words that should be translated in a specific way. - model: The GPT model to use. The default is "gpt-3.5-turbo-1106". + model: The GPT model to use. The default is "gpt-4.1-nano" + openai_base_url: The base URL for the OpenAI API. If not provided, the default OpenAI URL will be used. + max_output_tokens: The maximum number of tokens to generate in the output. 
+ max_input_tokens: The maximum number of tokens in the input. If not provided, it will be set to 4000. + openai_api: The OpenAI API key. If not provided, it will use the key from the configuration. + Returns: + A dictionary where the keys are the indices of the fragments and the values are the translated text. """ + if not model: + model = "gpt-4.1-nano" try: language_in_english = language_name(source_language) except: @@ -111,133 +122,55 @@ def translate_fragments_gpt( if isinstance(fragments, str): fragments = [{0: fragments}] - system_prompt = f"""You are a helpful translator that translates Docassemble interviews from "{language_in_english}" to "{tr_language_in_english}". You + system_prompt = f"""You translate Docassemble interviews from "{language_in_english}" to "{tr_language_in_english}". You preserve the meaning of all sentences while aiming to produce a translation at or below a 9th grade reading level. - - You will get input that looks like this that indicates a row in a table and the untranslated text in that row: - [ - {{0, "Your name"}}, - {{10, "When was ${{ user.name }} born?"}}, - {{32, "
Here is some text and a link.
}} - ] + Sometimes the input text may contain Mako tags or HTML tags. You do not translate these tags. + + You do not change the whitespace because whitespace can have meaning in Docassemble. - When you see Mako tags or HTML tags, you do not translate them. You can translate text in quotes that appears to be intended to be shown - to the user, but if there is a chance text is intended for the program logic you do not translate it. You do not change the whitespace because - whitespace can have meaning in Docassemble. + **Reply only with the translated text. Do not include any additional text or explanations.** """ + if interview_context is not None: + system_prompt += f"""When translating, keep in mind the purpose of this interview: ```{ interview_context }``` + """ + if special_words is not None: - system_prompt += """ + system_prompt += f""" When you see one of the special words in the following table in the first column, you use a form of the suggested replacement rather than inventing a new translation: + ``` {special_words} + ``` """ - system_prompt += """ - Your only reply is a JSON object that looks like this: - { - [ROW NUMBER]: "[TRANSLATED TEXT]", - } - - Where [ROW NUMBER] is the matching row index number, and [TRANSLATED TEXT] is the translated text. 
- """ - - encoding = tiktoken.encoding_for_model(model) - system_token_count = len(encoding.encode(system_prompt)) - user_message_token_count = len(encoding.encode(repr(fragments))) - token_count = system_token_count + user_message_token_count - number_of_chunks_to_make = 1 - if token_count > max_tokens: - # Divide the fragments into smaller chunks - max_chunk_size = max_tokens - system_token_count - chunked_fragments = [] - - # Most of the time, each fragment will be well under the max token limit, - # so heuristic of just assuming each fragment is equal size should be OK - number_of_chunks_to_make = math.ceil(token_count / max_tokens) + # row number: text to translate results:Dict[int, str] = {} - for c in range(number_of_chunks_to_make): - chunked_fragments = fragments - if number_of_chunks_to_make > 1: - chunked_fragments = fragments[c * max_chunk_size : (c + 1) * max_chunk_size] + + for row_number, text_to_translate in fragments: try: response = chat_completion( system_prompt, - user_message=repr(chunked_fragments), + user_message=text_to_translate, temperature=0.0, - json_mode=True, model=model, + max_output_tokens=max_output_tokens, + openai_base_url=openai_base_url, + max_input_tokens=max_input_tokens, openai_api=openai_api, ) + if isinstance(response, str): + results[row_number] = response.rstrip() # Remove any trailing whitespace some LLM models might add + else: + log(f"Unexpected response type from chat completion: {type(response)}") # Get the exception and log it except Exception as e: log(f"Exception when calling chatcompletion: { e }") response = str(e) - try: - results.update(response) - except: - log(f"Unexpected format in response from GPT: { response }") return results -# def translate_fragments_google( -# fragments: Union[str, List[str]], -# source_language: str, -# tr_lang: str, -# special_words: Optional[Dict[str, str]] = None, -# ) -> Dict[int, str]: -# """Use Google Translate to translate a list of fragments (strings) from one language to 
another and provide a dictionary -# with the original text and the translated text. -# """ -# return fragments - - -# def translate_fragments( -# fragments: Union[str, List[str]], -# language: str, -# tr_lang: str, -# allow_gpt=True, -# allow_google=True, -# special_words=Dict[str, str], -# ) -> Dict[int, str]: -# """ -# Translate a list of fragments (strings) from one language to another. -# """ -# if not (allow_google or allow_gpt): -# raise ValueError("You must allow at least one translation method") - -# if isinstance(fragments, str): -# fragments = [fragments] -# if language == tr_lang: -# return fragments - -# fragments_with_code = [] -# fragments_without_code = [] - -# if allow_gpt and allow_google: -# for fragment in fragments: -# if may_have_html(fragment) or may_have_mako(fragment): -# fragments_with_code.append(fragment) -# else: -# fragments_without_code.append(fragment) -# results = translate_fragments_gpt( -# fragments_with_code, language, tr_lang, special_words -# ) -# results.update( -# translate_fragments_google( -# fragments_without_code, language, tr_lang, special_words -# ) -# ) -# elif allow_gpt: -# results = translate_fragments_gpt(fragments, language, tr_lang, special_words) -# else: # allow_google -# results = translate_fragments_google( -# fragments, language, tr_lang, special_words -# ) -# return results - - class Translation(NamedTuple): file: DAFile # an XLSX or XLIFF file untranslated_words: ( @@ -254,6 +187,12 @@ def translation_file( use_google_translate=False, openai_api: Optional[str] = None, max_tokens=4000, + interview_context: Optional[str] = None, + special_words: Optional[Dict[int, str]] = None, + model: Optional[str] = None, + openai_base_url: Optional[str] = None, + max_input_tokens: Optional[int] = None, + max_output_tokens: Optional[int] = None, ) -> Translation: """ Return a tuple of the translation file in XLSX format, plus a count of the @@ -718,12 +657,18 @@ def translation_file( if use_gpt: translated_fragments = 
translate_fragments_gpt( [ - {item[0]: item[1]} for item in hold_for_draft_translation - ], # We send a list of dictionaries for easier partitioning if we exceed max_tokens + # row, text to translate + (item[0], item[1]) for item in hold_for_draft_translation + ], # We send a list of tuples for easier partitioning if we exceed max_tokens source_language=language, tr_lang=tr_lang, openai_api=openai_api, - max_tokens=max_tokens, + interview_context=interview_context, + special_words=special_words, + model=model, + openai_base_url=openai_base_url, + max_input_tokens=max_input_tokens, + max_output_tokens=max_output_tokens, ) for ( row, From 2a6fad17c1ea87d947635ec2fb534bca06d2eaa7 Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Tue, 1 Jul 2025 15:45:30 -0400 Subject: [PATCH 10/13] Explicit optional --- docassemble/ALDashboard/create_package.py | 6 +++--- docassemble/ALDashboard/translation.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docassemble/ALDashboard/create_package.py b/docassemble/ALDashboard/create_package.py index 1909437..3b5acff 100644 --- a/docassemble/ALDashboard/create_package.py +++ b/docassemble/ALDashboard/create_package.py @@ -15,7 +15,7 @@ import zipfile import os import re -from typing import Any, Dict, List, Tuple, Union # , Set +from typing import Optional, Any, Dict, List, Tuple, Union # , Set __all__ = [ "get_files", @@ -48,7 +48,7 @@ def project_name(name): def create_user_playground_zip( - user_id: int, name: str, project: str = "default", fileobj: DAFile = None + user_id: int, name: str, project: str = "default", fileobj: Optional[DAFile] = None ): folders_and_files = {} for section in ( @@ -82,7 +82,7 @@ def create_package_zip( info: dict, author_info: dict, folders_and_files: dict, - fileobj: DAFile = None, + fileobj: Optional[DAFile] = None, ) -> DAFile: """ Given a dictionary of lists, with the keys representing folders and the values diff --git a/docassemble/ALDashboard/translation.py 
b/docassemble/ALDashboard/translation.py index bdb6069..baab942 100644 --- a/docassemble/ALDashboard/translation.py +++ b/docassemble/ALDashboard/translation.py @@ -86,8 +86,8 @@ def translate_fragments_gpt( special_words: Optional[Dict[int, str]] = None, model="gpt-4.1-nano", openai_base_url: Optional[str] = None, - max_output_tokens: int = None, - max_input_tokens: int = None, + max_output_tokens: Optional[int] = None, + max_input_tokens: Optional[int] = None, openai_api: Optional[str] = None, ) -> Dict[int, str]: """Use an AI model to translate a list of fragments (strings) from one language to another and provide a dictionary From aafa7d507573a9d79ee18fb51e6d5e02b63ef472 Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Tue, 1 Jul 2025 15:52:54 -0400 Subject: [PATCH 11/13] Update types to match signature --- docassemble/ALDashboard/docx_wrangling.py | 3 +- docassemble/ALDashboard/translation.py | 58 +++++++++++++---------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/docassemble/ALDashboard/docx_wrangling.py b/docassemble/ALDashboard/docx_wrangling.py index 0c3b45d..28ef942 100644 --- a/docassemble/ALDashboard/docx_wrangling.py +++ b/docassemble/ALDashboard/docx_wrangling.py @@ -33,7 +33,8 @@ def add_paragraph_before(paragraph, text): def update_docx( - document: Union[docx.document.Document, str], modified_runs: List[Tuple[int, int, str, int]] + document: Union[docx.document.Document, str], + modified_runs: List[Tuple[int, int, str, int]], ) -> docx.document.Document: """Update the document with modified runs. 
diff --git a/docassemble/ALDashboard/translation.py b/docassemble/ALDashboard/translation.py index baab942..918e935 100644 --- a/docassemble/ALDashboard/translation.py +++ b/docassemble/ALDashboard/translation.py @@ -145,28 +145,33 @@ def translate_fragments_gpt( """ # row number: text to translate - results:Dict[int, str] = {} - - for row_number, text_to_translate in fragments: - try: - response = chat_completion( - system_prompt, - user_message=text_to_translate, - temperature=0.0, - model=model, - max_output_tokens=max_output_tokens, - openai_base_url=openai_base_url, - max_input_tokens=max_input_tokens, - openai_api=openai_api, - ) - if isinstance(response, str): - results[row_number] = response.rstrip() # Remove any trailing whitespace some LLM models might add - else: - log(f"Unexpected response type from chat completion: {type(response)}") - # Get the exception and log it - except Exception as e: - log(f"Exception when calling chatcompletion: { e }") - response = str(e) + results: Dict[int, str] = {} + + for fragment_dict in fragments: + for row_number, text_to_translate in fragment_dict.items(): + try: + response = chat_completion( + system_prompt, + user_message=text_to_translate, + temperature=0.0, + model=model, + max_output_tokens=max_output_tokens, + openai_base_url=openai_base_url, + max_input_tokens=max_input_tokens, + openai_api=openai_api, + ) + if isinstance(response, str): + results[row_number] = ( + response.rstrip() + ) # Remove any trailing whitespace some LLM models might add + else: + log( + f"Unexpected response type from chat completion: {type(response)}" + ) + # Get the exception and log it + except Exception as e: + log(f"Exception when calling chatcompletion: { e }") + response = str(e) return results @@ -192,7 +197,7 @@ def translation_file( model: Optional[str] = None, openai_base_url: Optional[str] = None, max_input_tokens: Optional[int] = None, - max_output_tokens: Optional[int] = None, + max_output_tokens: Optional[int] = None, ) 
-> Translation: """ Return a tuple of the translation file in XLSX format, plus a count of the @@ -657,9 +662,10 @@ def translation_file( if use_gpt: translated_fragments = translate_fragments_gpt( [ - # row, text to translate - (item[0], item[1]) for item in hold_for_draft_translation - ], # We send a list of tuples for easier partitioning if we exceed max_tokens + # row: text to translate + {item[0]: item[1]} + for item in hold_for_draft_translation + ], # We send a list of dicts for easier partitioning if we exceed max_tokens source_language=language, tr_lang=tr_lang, openai_api=openai_api, From 9571eb0a6a10e05052c860ccc97ee89a0bb937c6 Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Tue, 1 Jul 2025 15:55:41 -0400 Subject: [PATCH 12/13] Remove unnecessary comment triggering secrets error --- docassemble/ALDashboard/aldashboard.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docassemble/ALDashboard/aldashboard.py b/docassemble/ALDashboard/aldashboard.py index ddfe275..0c15419 100644 --- a/docassemble/ALDashboard/aldashboard.py +++ b/docassemble/ALDashboard/aldashboard.py @@ -70,7 +70,6 @@ def install_from_github_url(url: str, branch: str = "", pat: Optional[str] = None): giturl = url.strip().rstrip("/") if pat: - # modify so it looks like https://ghp_...:x-oauth-basic@github.com/johnsmith/docassemble-missouri-familylaw giturl = re.sub(r"^https://", f"https://{pat}:x-oauth-basic@", giturl) if isinstance(branch, str): branch = branch.strip() From bfbef8df9a9bbadfa823961d1009bb9179042e33 Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Tue, 1 Jul 2025 16:58:48 -0400 Subject: [PATCH 13/13] Revert to tuples, slightly tweak prompt to improve performance with Mako tags, allow custom model selection --- .../data/questions/generate_translation.yml | 10 ++- .../questions/test_translate_fragments.yml | 25 +++++-- docassemble/ALDashboard/translation.py | 69 ++++++++++--------- 3 files changed, 64 insertions(+), 40 deletions(-) diff --git 
a/docassemble/ALDashboard/data/questions/generate_translation.yml b/docassemble/ALDashboard/data/questions/generate_translation.yml index e8bfc82..b8149a8 100644 --- a/docassemble/ALDashboard/data/questions/generate_translation.yml +++ b/docassemble/ALDashboard/data/questions/generate_translation.yml @@ -47,7 +47,7 @@ code: | use_gpt=use_gpt, openai_api=get_config("openai api key", get_config("open ai", {}).get("key")), interview_context=interview_context if use_context else None, - + model = model, ) for tr_lang in tr_langs.split() @@ -77,6 +77,14 @@ fields: show if: code: | gpt_is_available() # or google_translate_is_available() + - Model to use (nano is normally smart enough): model + input type: radio + choices: + - Nano (cheapest): gpt-4.1-nano + - Mini (still cheap): gpt-4.1-mini + - Normal (moderately expensive): gpt-4.1 + show if: use_gpt + default: gpt-4.1-nano - Include context to help the translation: use_context datatype: yesno show if: use_gpt diff --git a/docassemble/ALDashboard/data/questions/test_translate_fragments.yml b/docassemble/ALDashboard/data/questions/test_translate_fragments.yml index 4bfa372..af16570 100644 --- a/docassemble/ALDashboard/data/questions/test_translate_fragments.yml +++ b/docassemble/ALDashboard/data/questions/test_translate_fragments.yml @@ -8,20 +8,31 @@ modules: - .translation --- code: | - example_strings = { - 0: "The quick brown fox jumps over the lazy dog.", - 20: "Your name", - 22: "${ users[0] }'s birthdate is" - } + example_strings = ( + (0, "The quick brown fox jumps over the lazy dog."), + (20, "Your name"), + (22, "${ users[0] }'s birthdate is"), + (55, """% if user_is_cool_status == "cool": + You are cool! + ${ "and it's a cool day!" if the_date < today() else "" } + % else: + You are not cool! 
+ % endif + """), + ) --- question: | - Click to send the dictionary to GPT-3.5-turbo-1106 + Click to test the translation subquestion: | + ``` ${ repr(example_strings) } + ``` continue button field: intro_screen --- continue button field: show_results question: | Here are the results subquestion: | - ${ repr(translate_fragments_gpt(example_strings, "en", "fr")) } \ No newline at end of file + ``` + ${ repr(translate_fragments_gpt(example_strings, "en", "es")) } + ``` \ No newline at end of file diff --git a/docassemble/ALDashboard/translation.py b/docassemble/ALDashboard/translation.py index 918e935..75827ea 100644 --- a/docassemble/ALDashboard/translation.py +++ b/docassemble/ALDashboard/translation.py @@ -79,7 +79,7 @@ def may_have_html(text: str) -> bool: def translate_fragments_gpt( - fragments: Union[str, List[Dict[int, str]]], + fragments: Union[str, List[Tuple[int, str]]], source_language: str, tr_lang: str, interview_context: Optional[str] = None, @@ -120,16 +120,25 @@ def translate_fragments_gpt( tr_language_in_english = tr_lang if isinstance(fragments, str): - fragments = [{0: fragments}] + fragments = [(0, fragments)] system_prompt = f"""You translate Docassemble interviews from "{language_in_english}" to "{tr_language_in_english}". You preserve the meaning of all sentences while aiming to produce a translation at or below a 9th grade reading level. - Sometimes the input text may contain Mako tags or HTML tags. You do not translate these tags. + Whenever you see anything that looks like code, variable interpolation, Mako template syntax, HTML tags, or Python keywords, assume it is code and do not touch it. + You are only translating natural-language content. - You do not change the whitespace because whitespace can have meaning in Docassemble. 
+ **Do not translate** any text matching these patterns (pass it through verbatim): + • `% if …:` / `% endif` / `% for …:` / `% endfor` + • `% elif …:` / `% else:` + • `${{…}}` + • `{{% … %}}` + • `<…>` HTML tags + • Python keywords: def, if, else, elif, import, for, while, return - **Reply only with the translated text. Do not include any additional text or explanations.** + You only translate natural-language text. + Preserve all whitespace exactly. + Reply *only* with the translated text—no extra commentary. """ if interview_context is not None: system_prompt += f"""When translating, keep in mind the purpose of this interview: ```{ interview_context }``` @@ -147,32 +156,28 @@ def translate_fragments_gpt( # row number: text to translate results: Dict[int, str] = {} - for fragment_dict in fragments: - for row_number, text_to_translate in fragment_dict.items(): - try: - response = chat_completion( - system_prompt, - user_message=text_to_translate, - temperature=0.0, - model=model, - max_output_tokens=max_output_tokens, - openai_base_url=openai_base_url, - max_input_tokens=max_input_tokens, - openai_api=openai_api, - ) - if isinstance(response, str): - results[row_number] = ( - response.rstrip() - ) # Remove any trailing whitespace some LLM models might add - else: - log( - f"Unexpected response type from chat completion: {type(response)}" - ) - # Get the exception and log it - except Exception as e: - log(f"Exception when calling chatcompletion: { e }") - response = str(e) - + for row_number, text_to_translate in fragments: + try: + response = chat_completion( + system_prompt, + user_message=text_to_translate, + temperature=0.0, + model=model, + max_output_tokens=max_output_tokens, + openai_base_url=openai_base_url, + max_input_tokens=max_input_tokens, + openai_api=openai_api, + ) + if isinstance(response, str): + results[row_number] = ( + response.rstrip() + ) # Remove any trailing whitespace some LLM models might add + else: + log(f"Unexpected response type from 
chat completion: {type(response)}") + # Get the exception and log it + except Exception as e: + log(f"Exception when calling chatcompletion: { e }") + response = str(e) return results @@ -663,7 +668,7 @@ def translation_file( translated_fragments = translate_fragments_gpt( [ # row: text to translate - {item[0]: item[1]} + (item[0], item[1]) for item in hold_for_draft_translation ], # We send a list of dicts for easier partitioning if we exceed max_tokens source_language=language,