Get a working MVP

SuffolkLITLab · Nov 21, 2023 · aa43845 · aa43845
1 parent c022809
commit aa43845
Show file tree

Hide file tree

Showing 2 changed files with 77 additions and 8 deletions.
diff --git a/docassemble/ALDashboard/data/questions/docx_wrangling.yml b/docassemble/ALDashboard/data/questions/docx_wrangling.yml
@@ -12,20 +12,46 @@ id: interview order
 mandatory: True
 code: |
   docx_file
+  if not started_task.ready():
+    waiting_screen
+  show_stats
   ask_about_labels
   save_changes
   show_final_docx
 ---
+event: waiting_screen
+question: |
+  Please wait while we process your file
+subquestion: |
+  <div class="spinner-border" role="status">
+    <span class="visually-hidden">Processing...</span>
+  </div>
+reload: True
+---
+continue button field: show_stats
+question: |
+  Your DOCX file has been processed
+subquestion: |
+  GPT-4 found ${ len(draft_labels) } labels in your DOCX file.
+
+  On the next screen, you can review and make any necessary changes
+  to the draft Jinja2 labels.
+---
 question: |
   Upload a DOCX file
 subquestion: |
   We will use GPT-4 to try to add variables in the
   [AssemblyLine 
   convention](https://suffolklitlab.org/docassemble-AssemblyLine-documentation/docs/label_variables) 
   to your DOCX file.
+
+  Your upload can have up to 300 pages, but the result cannot be larger than about 4,000 words. The result will
+  only include the modified paragraphs.
 fields:
   - DOCX file: docx_file
     datatype: file
+    accept: |
+      ".docx, application/vnd.openxmlformats-officedocument.wordprocessingml.document"
   - Include custom people names: include_custom_people_names
     datatype: yesno
     default: False
@@ -43,12 +69,22 @@ fields:
       These names will be used to label variables
       in the DOCX file.
 ---
+code: |
+  started_task = background_action('task_draft_labels')      
+---
+event: task_draft_labels
 code: |
   if include_custom_people_names:
     custom_people_names = [tuple(line.split(':')) for line in custom_people_names_text.split('\n')]
   else:
     custom_people_names = None
   draft_labels = get_labeled_docx_runs(docx_file[0].path(), custom_people_names = custom_people_names)
+  background_response_action('save_draft_labels', draft_labels=draft_labels)
+---
+event: save_draft_labels
+code: |
+  draft_labels = action_argument('draft_labels')
+  background_response()
 ---
 objects:
   - final_labels: DAList.using(object_type=DAObject, auto_gather=False, gathered=True)
@@ -59,26 +95,51 @@ code: |
 
   label_question = []
   for idx, item in enumerate(draft_labels):
+    new_obj = final_labels.appendObject()
+    new_obj.paragraph = item[0]
+    new_obj.run = item[1]
+    new_obj.draft_label_text = item[2]
+
     # Results will be a tuple of paragraph number, run, modified text with label
     label_question.append({
       'label': original_doc.paragraphs[item[0]].runs[item[1]].text,
-      'field': f'final_labels[idx].label',
+      'field': f'final_labels[{idx}].label',
       'default': item[2],
+      'label above field': True,
+      'grid': 8,
+      'hide if': f'final_labels[{idx}].leave_unchanged'
+    })
+    label_question.append({
+      'label': 'Leave unchanged',
+      'field': f'final_labels[{idx}].leave_unchanged',
+      'datatype': 'yesno',
+      'grid': {
+        'width': 4,
+        'end': True,
+      },
+      'label above field': True,
     })
 
   del docx
   del original_doc
 ---
 code: |
-  for idx, item in enumerate(draft_labels):
-    item[2] = final_labels[idx].label
 
-  new_doc_obj = update_docx(docx_file[0].path(), draft_labels)
+  new_doc_obj = update_docx(
+    docx_file[0].path(), 
+    [
+      (item.paragraph, item.run, item.label, 0) 
+      for item in final_labels 
+      if not item.leave_unchanged
+    ]
+  )
 
   new_docx.initialize(filename=docx_file[0].filename)
   new_doc_obj.save(new_docx.path())
   new_docx.commit()
 
+  del new_doc_obj
+
   save_changes = True
 ---
 continue button field: ask_about_labels

diff --git a/docassemble/ALDashboard/docx_wrangling.py b/docassemble/ALDashboard/docx_wrangling.py
@@ -11,7 +11,7 @@
 
 os.environ["OPENAI_API_KEY"] = get_config("openai api key")
 
-from typing import List, Tuple, Optional
+from typing import List, Tuple, Optional, Union
 
 __all__ = [
     "get_labeled_docx_runs",
@@ -33,12 +33,12 @@ def add_paragraph_before(paragraph, text):
 
 
 def update_docx(
-    document: docx.Document, modified_runs: List[Tuple[int, int, str, int]]
+    document: Union[docx.Document, str], modified_runs: List[Tuple[int, int, str, int]]
 ) -> docx.Document:
     """Update the document with the modified runs.
 
     Args:
-        document: the docx.Document object
+        document: the docx.Document object, or the path to the DOCX file
         modified_runs: a list of tuples, each holding the paragraph number, the run number, the modified text, and whether a new paragraph should be inserted (for conditional text)
 
     Returns:
@@ -50,6 +50,9 @@ def update_docx(
     ## also sort each run in the modified_runs so that the runs are in the correct order
     # modified_runs = sorted(modified_runs, key=lambda x: x[1], reverse=True)
 
+    if isinstance(document, str):
+        document = docx.Document(document)
+
     for paragraph_number, run_number, modified_text, new_paragraph in modified_runs:
         paragraph = document.paragraphs[paragraph_number]
         run = paragraph.runs[run_number]
@@ -150,7 +153,8 @@ def get_labeled_docx_runs(
 
         Name Forms:
             users (full name of all users)
-            users[0] (Full name)
+            users[0] (full name of first user)
+            users[0].name.full() (Alternate full name of first user)
             users[0].name.first (First name only)
             users[0].name.middle (Middle name only)
             users[0].name.middle_initial() (First letter of middle name)
@@ -225,6 +229,10 @@ def get_labeled_docx_runs(
 
     encoding = tiktoken.encoding_for_model("gpt-4")
     token_count = len(encoding.encode(role_description + rules + repr(items)))
+    if token_count > 128000:
+        raise Exception(
+            f"Input to OpenAI is too long ({token_count} tokens). Maximum is 128000 tokens."
+        )
 
     response = openai_client.chat.completions.create(
         model="gpt-4-1106-preview",