diff --git a/docassemble/ALDashboard/data/questions/docx_wrangling.yml b/docassemble/ALDashboard/data/questions/docx_wrangling.yml index 4d67ab3..5c2d542 100644 --- a/docassemble/ALDashboard/data/questions/docx_wrangling.yml +++ b/docassemble/ALDashboard/data/questions/docx_wrangling.yml @@ -12,10 +12,31 @@ id: interview order mandatory: True code: | docx_file + if not started_task.ready(): + waiting_screen + show_stats ask_about_labels save_changes show_final_docx --- +event: waiting_screen +question: | + Please wait while we process your file +subquestion: | +
+ Processing... +
+reload: True +--- +continue button field: show_stats +question: | + Your DOCX file has been processed +subquestion: | + GPT-4 found ${ len(draft_labels) } labels in your DOCX file. + + On the next screen, you can review and make any necessary changes + to the draft Jinja2 labels. +--- question: | Upload a DOCX file subquestion: | @@ -23,9 +44,14 @@ subquestion: | [AssemblyLine convention](https://suffolklitlab.org/docassemble-AssemblyLine-documentation/docs/label_variables) to your DOCX file. + + Your upload can have up to 300 pages, but the result cannot be larger than about 4,000 words. The result will + only include the modified paragraphs. fields: - DOCX file: docx_file datatype: file + accept: | + ".docx, application/vnd.openxmlformats-officedocument.wordprocessingml.document" - Include custom people names: include_custom_people_names datatype: yesno default: False @@ -43,12 +69,22 @@ fields: These names will be used to label variables in the DOCX file. --- +code: | + started_task = background_action('task_draft_labels') +--- +event: task_draft_labels code: | if include_custom_people_names: custom_people_names = [tuple(line.split(':')) for line in custom_people_names_text.split('\n')] else: custom_people_names = None draft_labels = get_labeled_docx_runs(docx_file[0].path(), custom_people_names = custom_people_names) + background_response_action('save_draft_labels', draft_labels=draft_labels) +--- +event: save_draft_labels +code: | + draft_labels = action_argument('draft_labels') + background_response() --- objects: - final_labels: DAList.using(object_type=DAObject, auto_gather=False, gathered=True) @@ -59,26 +95,51 @@ code: | label_question = [] for idx, item in enumerate(draft_labels): + new_obj = final_labels.appendObject() + new_obj.paragraph = item[0] + new_obj.run = item[1] + new_obj.draft_label_text = item[2] + # Results will be a tuple of paragraph number, run, modified text with label label_question.append({ 'label': original_doc.paragraphs[item[0]].runs[item[1]].text, - 'field': f'final_labels[idx].label', + 'field': f'final_labels[{idx}].label', 'default': item[2], + 'label above field': True, + 'grid': 8, + 'hide if': f'final_labels[{idx}].leave_unchanged' + }) + label_question.append({ + 'label': 'Leave unchanged', + 'field': f'final_labels[{idx}].leave_unchanged', + 'datatype': 'yesno', + 'grid': { + 'width': 4, + 'end': True, + }, + 'label above field': True, }) del docx del original_doc --- code: | - for idx, item in enumerate(draft_labels): - item[2] = final_labels[idx].label - new_doc_obj = update_docx(docx_file[0].path(), draft_labels) + new_doc_obj = update_docx( + docx_file[0].path(), + [ + (item.paragraph, item.run, item.label, 0) + for item in final_labels + if not item.leave_unchanged + ] + ) new_docx.initialize(filename=docx_file[0].filename) new_doc_obj.save(new_docx.path()) new_docx.commit() + del new_doc_obj + save_changes = True --- continue button field: ask_about_labels diff --git a/docassemble/ALDashboard/docx_wrangling.py b/docassemble/ALDashboard/docx_wrangling.py index bf491f6..20fedc9 100644 --- a/docassemble/ALDashboard/docx_wrangling.py +++ b/docassemble/ALDashboard/docx_wrangling.py @@ -11,7 +11,7 @@ os.environ["OPENAI_API_KEY"] = get_config("openai api key") -from typing import List, Tuple, Optional +from typing import List, Tuple, Optional, Union __all__ = [ "get_labeled_docx_runs", @@ -33,12 +33,12 @@ def add_paragraph_before(paragraph, text): def update_docx( - document: docx.Document, modified_runs: List[Tuple[int, int, str, int]] + document: Union[docx.Document, str], modified_runs: List[Tuple[int, int, str, int]] ) -> docx.Document: """Update the document with the modified runs. Args: - document: the docx.Document object + document: the docx.Document object, or the path to the DOCX file modified_runs: a tuple of paragraph number, run number, the modified text, a question (not used), and whether a new paragraph should be inserted (for conditional text) Returns: @@ -50,6 +50,9 @@ def update_docx( ## also sort each run in the modified_runs so that the runs are in the correct order # modified_runs = sorted(modified_runs, key=lambda x: x[1], reverse=True) + if isinstance(document, str): + document = docx.Document(document) + for paragraph_number, run_number, modified_text, new_paragraph in modified_runs: paragraph = document.paragraphs[paragraph_number] run = paragraph.runs[run_number] @@ -150,7 +153,8 @@ def get_labeled_docx_runs( Name Forms: users (full name of all users) - users[0] (Full name) + users[0] (full name of first user) + users[0].name.full() (Alternate full name of first user) users[0].name.first (First name only) users[0].name.middle (Middle name only) users[0].name.middle_initial() (First letter of middle name) @@ -225,6 +229,10 @@ def get_labeled_docx_runs( encoding = tiktoken.encoding_for_model("gpt-4") token_count = len(encoding.encode(role_description + rules + repr(items))) + if token_count > 128000: + raise Exception( + f"Input to OpenAI is too long ({token_count} tokens). Maximum is 128000 tokens." + ) response = openai_client.chat.completions.create( model="gpt-4-1106-preview",