diff --git a/docassemble/ALDashboard/data/questions/docx_wrangling.yml b/docassemble/ALDashboard/data/questions/docx_wrangling.yml
index 4d67ab3..5c2d542 100644
--- a/docassemble/ALDashboard/data/questions/docx_wrangling.yml
+++ b/docassemble/ALDashboard/data/questions/docx_wrangling.yml
@@ -12,10 +12,31 @@ id: interview order
mandatory: True
code: |
docx_file
+ if not started_task.ready():
+ waiting_screen
+ show_stats
ask_about_labels
save_changes
show_final_docx
---
+event: waiting_screen
+question: |
+ Please wait while we process your file
+subquestion: |
+
+ Processing...
+
+reload: True
+---
+continue button field: show_stats
+question: |
+ Your DOCX file has been processed
+subquestion: |
+ GPT-4 found ${ len(draft_labels) } labels in your DOCX file.
+
+ On the next screen, you can review and make any necessary changes
+ to the draft Jinja2 labels.
+---
question: |
Upload a DOCX file
subquestion: |
@@ -23,9 +44,14 @@ subquestion: |
[AssemblyLine
convention](https://suffolklitlab.org/docassemble-AssemblyLine-documentation/docs/label_variables)
to your DOCX file.
+
+ Your upload can have up to 300 pages, but the result cannot be larger than about 4,000 words. The result will
+ only include the modified paragraphs.
fields:
- DOCX file: docx_file
datatype: file
+ accept: |
+ ".docx, application/vnd.openxmlformats-officedocument.wordprocessingml.document"
- Include custom people names: include_custom_people_names
datatype: yesno
default: False
@@ -43,12 +69,22 @@ fields:
These names will be used to label variables
in the DOCX file.
---
+code: |
+ started_task = background_action('task_draft_labels')
+---
+event: task_draft_labels
code: |
if include_custom_people_names:
custom_people_names = [tuple(line.split(':')) for line in custom_people_names_text.split('\n')]
else:
custom_people_names = None
draft_labels = get_labeled_docx_runs(docx_file[0].path(), custom_people_names = custom_people_names)
+ background_response_action('save_draft_labels', draft_labels=draft_labels)
+---
+event: save_draft_labels
+code: |
+ draft_labels = action_argument('draft_labels')
+ background_response()
---
objects:
- final_labels: DAList.using(object_type=DAObject, auto_gather=False, gathered=True)
@@ -59,26 +95,51 @@ code: |
label_question = []
for idx, item in enumerate(draft_labels):
+ new_obj = final_labels.appendObject()
+ new_obj.paragraph = item[0]
+ new_obj.run = item[1]
+ new_obj.draft_label_text = item[2]
+
       # Each item in draft_labels is a tuple of (paragraph number, run number, modified text with label)
label_question.append({
'label': original_doc.paragraphs[item[0]].runs[item[1]].text,
- 'field': f'final_labels[idx].label',
+ 'field': f'final_labels[{idx}].label',
'default': item[2],
+ 'label above field': True,
+ 'grid': 8,
+ 'hide if': f'final_labels[{idx}].leave_unchanged'
+ })
+ label_question.append({
+ 'label': 'Leave unchanged',
+ 'field': f'final_labels[{idx}].leave_unchanged',
+ 'datatype': 'yesno',
+ 'grid': {
+ 'width': 4,
+ 'end': True,
+ },
+ 'label above field': True,
})
del docx
del original_doc
---
code: |
- for idx, item in enumerate(draft_labels):
- item[2] = final_labels[idx].label
- new_doc_obj = update_docx(docx_file[0].path(), draft_labels)
+ new_doc_obj = update_docx(
+ docx_file[0].path(),
+ [
+ (item.paragraph, item.run, item.label, 0)
+ for item in final_labels
+ if not item.leave_unchanged
+ ]
+ )
new_docx.initialize(filename=docx_file[0].filename)
new_doc_obj.save(new_docx.path())
new_docx.commit()
+ del new_doc_obj
+
save_changes = True
---
continue button field: ask_about_labels
diff --git a/docassemble/ALDashboard/docx_wrangling.py b/docassemble/ALDashboard/docx_wrangling.py
index bf491f6..20fedc9 100644
--- a/docassemble/ALDashboard/docx_wrangling.py
+++ b/docassemble/ALDashboard/docx_wrangling.py
@@ -11,7 +11,7 @@
os.environ["OPENAI_API_KEY"] = get_config("openai api key")
-from typing import List, Tuple, Optional
+from typing import List, Tuple, Optional, Union
__all__ = [
"get_labeled_docx_runs",
@@ -33,12 +33,12 @@ def add_paragraph_before(paragraph, text):
def update_docx(
- document: docx.Document, modified_runs: List[Tuple[int, int, str, int]]
+ document: Union[docx.Document, str], modified_runs: List[Tuple[int, int, str, int]]
) -> docx.Document:
"""Update the document with the modified runs.
Args:
- document: the docx.Document object
+ document: the docx.Document object, or the path to the DOCX file
         modified_runs: a list of tuples, each holding the paragraph number, the run number, the modified text, and whether a new paragraph should be inserted (for conditional text)
Returns:
@@ -50,6 +50,9 @@ def update_docx(
## also sort each run in the modified_runs so that the runs are in the correct order
# modified_runs = sorted(modified_runs, key=lambda x: x[1], reverse=True)
+ if isinstance(document, str):
+ document = docx.Document(document)
+
for paragraph_number, run_number, modified_text, new_paragraph in modified_runs:
paragraph = document.paragraphs[paragraph_number]
run = paragraph.runs[run_number]
@@ -150,7 +153,8 @@ def get_labeled_docx_runs(
Name Forms:
users (full name of all users)
- users[0] (Full name)
+ users[0] (full name of first user)
+ users[0].name.full() (Alternate full name of first user)
users[0].name.first (First name only)
users[0].name.middle (Middle name only)
users[0].name.middle_initial() (First letter of middle name)
@@ -225,6 +229,10 @@ def get_labeled_docx_runs(
encoding = tiktoken.encoding_for_model("gpt-4")
token_count = len(encoding.encode(role_description + rules + repr(items)))
+ if token_count > 128000:
+ raise Exception(
+ f"Input to OpenAI is too long ({token_count} tokens). Maximum is 128000 tokens."
+ )
response = openai_client.chat.completions.create(
model="gpt-4-1106-preview",