Skip to content

Commit

Permalink
Get a working MVP
Browse files Browse the repository at this point in the history
  • Loading branch information
nonprofittechy committed Nov 21, 2023
1 parent c022809 commit aa43845
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 8 deletions.
69 changes: 65 additions & 4 deletions docassemble/ALDashboard/data/questions/docx_wrangling.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,46 @@ id: interview order
mandatory: True
code: |
docx_file
if not started_task.ready():
waiting_screen
show_stats
ask_about_labels
save_changes
show_final_docx
---
event: waiting_screen
question: |
Please wait while we process your file
subquestion: |
<div class="spinner-border" role="status">
<span class="visually-hidden">Processing...</span>
</div>
reload: True
---
continue button field: show_stats
question: |
Your DOCX file has been processed
subquestion: |
GPT-4 found ${ len(draft_labels) } labels in your DOCX file.
On the next screen, you can review and make any necessary changes
to the draft Jinja2 labels.
---
question: |
Upload a DOCX file
subquestion: |
We will use GPT-4 to try to add variables in the
[AssemblyLine
convention](https://suffolklitlab.org/docassemble-AssemblyLine-documentation/docs/label_variables)
to your DOCX file.
Your upload can have up to 300 pages, but the result cannot be larger than about 4,000 words. The result will
only include the modified paragraphs.
fields:
- DOCX file: docx_file
datatype: file
accept: |
".docx, application/vnd.openxmlformats-officedocument.wordprocessingml.document"
- Include custom people names: include_custom_people_names
datatype: yesno
default: False
Expand All @@ -43,12 +69,22 @@ fields:
These names will be used to label variables
in the DOCX file.
---
code: |
started_task = background_action('task_draft_labels')
---
event: task_draft_labels
code: |
if include_custom_people_names:
custom_people_names = [tuple(line.split(':')) for line in custom_people_names_text.split('\n')]
else:
custom_people_names = None
draft_labels = get_labeled_docx_runs(docx_file[0].path(), custom_people_names = custom_people_names)
background_response_action('save_draft_labels', draft_labels=draft_labels)
---
event: save_draft_labels
code: |
draft_labels = action_argument('draft_labels')
background_response()
---
objects:
- final_labels: DAList.using(object_type=DAObject, auto_gather=False, gathered=True)
Expand All @@ -59,26 +95,51 @@ code: |
label_question = []
for idx, item in enumerate(draft_labels):
new_obj = final_labels.appendObject()
new_obj.paragraph = item[0]
new_obj.run = item[1]
new_obj.draft_label_text = item[2]
# Results will be a tuple of paragraph number, run, modified text with label
label_question.append({
'label': original_doc.paragraphs[item[0]].runs[item[1]].text,
'field': f'final_labels[idx].label',
'field': f'final_labels[{idx}].label',
'default': item[2],
'label above field': True,
'grid': 8,
'hide if': f'final_labels[{idx}].leave_unchanged'
})
label_question.append({
'label': 'Leave unchanged',
'field': f'final_labels[{idx}].leave_unchanged',
'datatype': 'yesno',
'grid': {
'width': 4,
'end': True,
},
'label above field': True,
})
del docx
del original_doc
---
code: |
for idx, item in enumerate(draft_labels):
item[2] = final_labels[idx].label
new_doc_obj = update_docx(docx_file[0].path(), draft_labels)
new_doc_obj = update_docx(
docx_file[0].path(),
[
(item.paragraph, item.run, item.label, 0)
for item in final_labels
if not item.leave_unchanged
]
)
new_docx.initialize(filename=docx_file[0].filename)
new_doc_obj.save(new_docx.path())
new_docx.commit()
del new_doc_obj
save_changes = True
---
continue button field: ask_about_labels
Expand Down
16 changes: 12 additions & 4 deletions docassemble/ALDashboard/docx_wrangling.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

os.environ["OPENAI_API_KEY"] = get_config("openai api key")

from typing import List, Tuple, Optional
from typing import List, Tuple, Optional, Union

__all__ = [
"get_labeled_docx_runs",
Expand All @@ -33,12 +33,12 @@ def add_paragraph_before(paragraph, text):


def update_docx(
document: docx.Document, modified_runs: List[Tuple[int, int, str, int]]
document: Union[docx.Document, str], modified_runs: List[Tuple[int, int, str, int]]
) -> docx.Document:
"""Update the document with the modified runs.
Args:
document: the docx.Document object
document: the docx.Document object, or the path to the DOCX file
modified_runs: a list of tuples, each holding the paragraph number, run number, the modified text, and whether a new paragraph should be inserted (for conditional text)
Returns:
Expand All @@ -50,6 +50,9 @@ def update_docx(
## also sort each run in the modified_runs so that the runs are in the correct order
# modified_runs = sorted(modified_runs, key=lambda x: x[1], reverse=True)

if isinstance(document, str):
document = docx.Document(document)

for paragraph_number, run_number, modified_text, new_paragraph in modified_runs:
paragraph = document.paragraphs[paragraph_number]
run = paragraph.runs[run_number]
Expand Down Expand Up @@ -150,7 +153,8 @@ def get_labeled_docx_runs(
Name Forms:
users (full name of all users)
users[0] (Full name)
users[0] (full name of first user)
users[0].name.full() (Alternate full name of first user)
users[0].name.first (First name only)
users[0].name.middle (Middle name only)
users[0].name.middle_initial() (First letter of middle name)
Expand Down Expand Up @@ -225,6 +229,10 @@ def get_labeled_docx_runs(

encoding = tiktoken.encoding_for_model("gpt-4")
token_count = len(encoding.encode(role_description + rules + repr(items)))
if token_count > 128000:
raise Exception(
f"Input to OpenAI is too long ({token_count} tokens). Maximum is 128000 tokens."
)

response = openai_client.chat.completions.create(
model="gpt-4-1106-preview",
Expand Down

0 comments on commit aa43845

Please sign in to comment.