From e4d023e6964b7027a77ae4b346e40121fd6cfb61 Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Thu, 11 Jul 2024 13:23:09 -0400 Subject: [PATCH 1/2] Unpin scikit-learn because it conflicts with docassemble --- formfyxer/requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/formfyxer/requirements.txt b/formfyxer/requirements.txt index 3f84a83..9b5aa6d 100644 --- a/formfyxer/requirements.txt +++ b/formfyxer/requirements.txt @@ -13,7 +13,7 @@ pikepdf reportlab requests ocrmypdf -scikit-learn==1.2.2 +scikit-learn spacy textstat transformers @@ -21,4 +21,4 @@ types-requests types-PyYAML en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl typer>=0.4.1,<0.5.0 -python-docx \ No newline at end of file +python-docx diff --git a/setup.py b/setup.py index f13a7e8..7af26a9 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ def run(self): license='MIT', packages=['formfyxer'], install_requires=['spacy', 'pdfminer.six', 'pandas', 'pikepdf', - 'textstat', 'requests', 'numpy', 'scikit-learn==1.2.2', 'networkx', 'joblib', + 'textstat', 'requests', 'numpy', 'scikit-learn', 'networkx', 'joblib', 'nltk', 'boxdetect', 'pdf2image', 'reportlab>=3.6.13', 'pdfminer.six', 'opencv-python', 'ocrmypdf', 'eyecite', 'passivepy>=0.2.16', 'sigfig', 'typer>=0.4.1,<0.5.0', # typer pre 0.4.1 was broken by click 8.1.0: https://github.com/explosion/spaCy/issues/10564 From 5f7862540db11ed406eca09757d9935995789b9b Mon Sep 17 00:00:00 2001 From: Quinten Steenhuis Date: Thu, 11 Jul 2024 13:39:50 -0400 Subject: [PATCH 2/2] New Black formatting rules --- formfyxer/docx_wrangling.py | 79 +++++++++++++------------ formfyxer/lit_explorer.py | 87 ++++++++++++++++------------ formfyxer/pdf_wrangling.py | 6 +- formfyxer/tests/test_lit_explorer.py | 50 ++++++++-------- setup.py | 2 +- 5 files changed, 123 insertions(+), 101 deletions(-) diff --git a/formfyxer/docx_wrangling.py b/formfyxer/docx_wrangling.py index 4c80c46..6959ce6 100644 --- a/formfyxer/docx_wrangling.py +++ b/formfyxer/docx_wrangling.py @@ -22,8 +22,8 @@ def add_paragraph_after(paragraph, text): p = OxmlElement("w:p") - r = OxmlElement('w:r') - t = OxmlElement('w:t') + r = OxmlElement("w:r") + t = OxmlElement("w:t") t.text = text r.append(t) @@ -33,17 +33,18 @@ def add_paragraph_after(paragraph, text): def add_paragraph_before(paragraph, text): p = OxmlElement("w:p") - r = OxmlElement('w:r') - t = OxmlElement('w:t') + r = OxmlElement("w:r") + t = OxmlElement("w:t") t.text = text r.append(t) p.append(r) paragraph._element.addprevious(p) + def add_run_after(run, text): - r = OxmlElement('w:r') - t = OxmlElement('w:t') + r = OxmlElement("w:r") + t = OxmlElement("w:t") t.text = text r.append(t) @@ -55,8 +56,8 @@ def update_docx( ) -> docx.Document: """Update the document with the modified runs. - Note: OpenAI is probabilistic, so the modified run indices may not be correct. - When the index of a run or paragraph is out of range, a new paragraph + Note: OpenAI is probabilistic, so the modified run indices may not be correct. + When the index of a run or paragraph is out of range, a new paragraph will be inserted at the end of the document or a new run at the end of the paragraph's runs. @@ -88,19 +89,22 @@ def update_docx( continue run = paragraph.runs[run_number] if new_paragraph == 1: - add_paragraph_after(paragraph, modified_text) + add_paragraph_after(paragraph, modified_text) elif new_paragraph == -1: - add_paragraph_before(paragraph, modified_text) + add_paragraph_before(paragraph, modified_text) else: run.text = modified_text return document -def get_docx_repr(docx_path: str, paragraph_start:int=0, paragraph_end:Optional[int]=None): + +def get_docx_repr( + docx_path: str, paragraph_start: int = 0, paragraph_end: Optional[int] = None +): """Return a JSON representation of the paragraphs and runs in the DOCX file. Args: docx_path: path to the DOCX file - + Returns: A JSON representation of the paragraphs and runs in the DOCX file. """ @@ -117,9 +121,10 @@ def get_docx_repr(docx_path: str, paragraph_start:int=0, paragraph_end:Optional[ ) return repr(items) + def get_labeled_docx_runs( docx_path: Optional[str] = None, - docx_repr = Optional[str], + docx_repr=Optional[str], custom_people_names: Optional[Tuple[str, str]] = None, openai_client: Optional[OpenAI] = None, api_key: Optional[str] = None, @@ -264,22 +269,23 @@ def get_labeled_docx_runs( "(State the reason for eviction)" transforms into `{{ eviction_reason }}`. """ return get_modified_docx_runs( - docx_path = docx_path, - docx_repr = docx_repr, + docx_path=docx_path, + docx_repr=docx_repr, custom_example=custom_example, instructions=instructions, openai_client=openai_client, api_key=api_key, ) + def get_modified_docx_runs( - docx_path: Optional[str] = None, - docx_repr: Optional[str] = None, - custom_example:str = "", - instructions:str = "", - openai_client: Optional[OpenAI] = None, - api_key:Optional[str]=None, - temperature=0.5, + docx_path: Optional[str] = None, + docx_repr: Optional[str] = None, + custom_example: str = "", + instructions: str = "", + openai_client: Optional[OpenAI] = None, + api_key: Optional[str] = None, + temperature=0.5, ) -> List[Tuple[int, int, str, int]]: """Use GPT to rewrite the contents of a DOCX file paragraph by paragraph. Does not handle tables, footers, or other structures yet. @@ -301,9 +307,9 @@ def get_modified_docx_runs( [1, 0, "I hope this letter finds you well."], ] - Your custom instructions should include an example of how the sample will be modified, like the one below: - - Example reply, indicating paragraph, run, the new text, and a number indicating if this changes the + Your custom instructions should include an example of how the sample will be modified, like the one below: + + Example reply, indicating paragraph, run, the new text, and a number indicating if this changes the current paragraph, adds one before, or adds one after (-1, 0, 1): {"results": @@ -336,9 +342,7 @@ def get_modified_docx_runs( assert isinstance(docx_repr, str) if not openai_client: - openai_client = OpenAI( - api_key = api_key or os.environ.get("OPENAI_API_KEY") - ) + openai_client = OpenAI(api_key=api_key or os.environ.get("OPENAI_API_KEY")) if not custom_example: custom_example = """[ @@ -347,7 +351,9 @@ def get_modified_docx_runs( [1, 0, "I hope this letter finds you well."], ]""" - if not "[" in instructions: # Make sure we have at least a minimal example of the output + if ( + not "[" in instructions + ): # Make sure we have at least a minimal example of the output instructions += """The result will look like this: {"results": @@ -357,7 +363,7 @@ def get_modified_docx_runs( ] } """ - + role_description = f""" You will process a DOCX document and return a JSON structure that transforms the DOCX file based on the following guidelines and examples. The DOCX will be provided as an annotated series of @@ -386,11 +392,11 @@ def get_modified_docx_runs( f"Input to OpenAI is too long ({token_count} tokens). Maximum is 128000 tokens." ) - moderation_response = openai_client.moderations.create(input=role_description + docx_repr) + moderation_response = openai_client.moderations.create( + input=role_description + docx_repr + ) if moderation_response.results[0].flagged: - raise Exception( - f"OpenAI moderation error: {moderation_response.results[0]}" - ) + raise Exception(f"OpenAI moderation error: {moderation_response.results[0]}") response = openai_client.chat.completions.create( model="gpt-4-1106-preview", @@ -416,6 +422,7 @@ def get_modified_docx_runs( guesses = json.loads(response.choices[0].message.content)["results"] return guesses + def make_docx_plain_language(docx_path: str) -> docx.Document: """ Convert a DOCX file to plain language with the help of OpenAI. @@ -439,10 +446,10 @@ def make_docx_plain_language(docx_path: str) -> docx.Document: ] } """, - ) return update_docx(docx.Document(docx_path), guesses) + def modify_docx_with_openai_guesses(docx_path: str) -> docx.Document: """Uses OpenAI to guess the variable names for a document and then modifies the document with the guesses. @@ -459,4 +466,4 @@ def modify_docx_with_openai_guesses(docx_path: str) -> docx.Document: if __name__ == "__main__": new_doc = modify_docx_with_openai_guesses(sys.argv[1]) - new_doc.save(sys.argv[1] + ".output.docx") \ No newline at end of file + new_doc.save(sys.argv[1] + ".output.docx") diff --git a/formfyxer/lit_explorer.py b/formfyxer/lit_explorer.py index 6e90b41..d1e38e6 100644 --- a/formfyxer/lit_explorer.py +++ b/formfyxer/lit_explorer.py @@ -34,7 +34,7 @@ FieldType, unlock_pdf_in_place, is_tagged, -) +) try: from nltk.corpus import stopwords @@ -131,18 +131,20 @@ with open( os.path.join(os.path.dirname(__file__), "keys", "openai_key.txt"), "r" ) as in_file: - default_key:Optional[str] = in_file.read().rstrip() + default_key: Optional[str] = in_file.read().rstrip() except: default_key = None try: with open( os.path.join(os.path.dirname(__file__), "keys", "openai_org.txt"), "r" ) as in_file: - default_org:Optional[str] = in_file.read().rstrip() + default_org: Optional[str] = in_file.read().rstrip() except: default_org = None if default_key: - client:Optional[OpenAI] = OpenAI(api_key=default_key, organization=default_org or None) + client: Optional[OpenAI] = OpenAI( + api_key=default_key, organization=default_org or None + ) elif os.getenv("OPENAI_API_KEY"): client = OpenAI() else: @@ -160,6 +162,7 @@ CURRENT_DIRECTORY, "data", "simplified_words.yml" ) + # This creates a timeout exception that can be triggered when something hangs too long. class TimeoutException(Exception): pass @@ -429,18 +432,19 @@ def normalize_name( not, to a snake_case variable name of appropriate length. HACK: temporarily all we do is re-case it and normalize it using regex rules. - Will be replaced with call to LLM soon. + Will be replaced with call to LLM soon. """ - + if this_field not in included_fields: this_field = re_case(this_field) this_field = regex_norm_field(this_field) if this_field in included_fields: return f"*{this_field}", 0.01 - + return reformat_field(this_field, tools_token=tools_token), 0.5 + # Take a list of AL variables and spits out suggested groupings. Here's what's going on: # # 1. It reads in a list of fields (e.g., `["user_name","user_address"]`) @@ -652,23 +656,21 @@ def classify_field(field: FieldInfo, new_name: str) -> AnswerType: return AnswerType.GATHERED -def get_adjusted_character_count( - field: FieldInfo -)-> float: +def get_adjusted_character_count(field: FieldInfo) -> float: """ - Determines the bracketed length of an input field based on its max_length attribute, - returning a float representing the approximate length of the field content. + Determines the bracketed length of an input field based on its max_length attribute, + returning a float representing the approximate length of the field content. The function chunks the answers into 5 different lengths (checkboxes, 2 words, short, medium, and long) instead of directly using the character count, as forms can allocate different spaces for the same data without considering the space the user actually needs. Args: - field (FieldInfo): An object containing information about the input field, + field (FieldInfo): An object containing information about the input field, including the "max_length" attribute. Returns: - float: The approximate length of the field content, categorized into checkboxes, 2 words, short, + float: The approximate length of the field content, categorized into checkboxes, 2 words, short, medium, or long based on the max_length attribute. Examples: @@ -694,10 +696,8 @@ def get_adjusted_character_count( ) # Anything over 10 lines probably needs a full page but form author skimped on space if field["type"] != InputType.TEXT: return ONE_WORD - - if field["max_length"] <= ONE_LINE or ( - field["max_length"] <= ONE_LINE * 2 - ): + + if field["max_length"] <= ONE_LINE or (field["max_length"] <= ONE_LINE * 2): return ONE_WORD * 2 elif field["max_length"] <= SHORT_ANSWER: return SHORT_ANSWER @@ -816,7 +816,12 @@ class OpenAiCreds(TypedDict): key: str -def text_complete(prompt:str, max_tokens:int=500, creds: Optional[OpenAiCreds] = None, temperature:float=0) -> str: +def text_complete( + prompt: str, + max_tokens: int = 500, + creds: Optional[OpenAiCreds] = None, + temperature: float = 0, +) -> str: """Run a prompt via openAI's API and return the result. Args: @@ -836,16 +841,13 @@ def text_complete(prompt:str, max_tokens:int=500, creds: Optional[OpenAiCreds] = response = openai_client.chat.completions.create( model="gpt-3.5-turbo", messages=[ - { - "role": "system", - "content": prompt - }, + {"role": "system", "content": prompt}, ], temperature=temperature, max_tokens=max_tokens, top_p=1.0, frequency_penalty=0.0, - presence_penalty=0.0 + presence_penalty=0.0, ) return str((response.choices[0].message.content or "").strip()) except Exception as ex: @@ -1004,7 +1006,9 @@ def substitute_phrases( # Find all matches for the substitution phrases for original, replacement in sorted_phrases: - for match in re.finditer(r"\b" + re.escape(original) + r"\b", input_string, re.IGNORECASE): + for match in re.finditer( + r"\b" + re.escape(original) + r"\b", input_string, re.IGNORECASE + ): matches.append((match.start(), match.end(), replacement)) # Sort the matches based on their starting position @@ -1108,7 +1112,11 @@ def parse_form( except: readability = -1 # Still attempt to re-evaluate if not using openai - if not original_text or (openai_creds and description == "abortthisnow.") or readability > 30: + if ( + not original_text + or (openai_creds and description == "abortthisnow.") + or readability > 30 + ): # We do not care what the PDF output is, doesn't add that much time ocr_p = [ "ocrmypdf", @@ -1216,9 +1224,11 @@ def parse_form( "category": cat, "pages": pages_count, "reading grade level": readability, - "time to answer": time_to_answer_form(field_types_and_sizes(ff), new_names) - if ff - else [-1, -1], + "time to answer": ( + time_to_answer_form(field_types_and_sizes(ff), new_names) + if ff + else [-1, -1] + ), "list": nsmi, "avg fields per page": f_per_page, "fields": new_names, @@ -1236,16 +1246,21 @@ def parse_form( "slotin percent": slotin_count / field_count if field_count > 0 else 0, "gathered percent": gathered_count / field_count if field_count > 0 else 0, "created percent": created_count / field_count if field_count > 0 else 0, - "third party percent": third_party_count / field_count - if field_count > 0 - else 0, + "third party percent": ( + third_party_count / field_count if field_count > 0 else 0 + ), "passive voice percent": ( passive_sentences_count / sentence_count if sentence_count > 0 else 0 ), "citations per field": citation_count / field_count if field_count > 0 else 0, "citation count": citation_count, "all caps percent": all_caps_count / word_count, - "normalized characters per field": sum(get_adjusted_character_count(field) for field in field_types ) / field_count if ff else 0, + "normalized characters per field": ( + sum(get_adjusted_character_count(field) for field in field_types) + / field_count + if ff + else 0 + ), "difficult words": difficult_words, "difficult word count": difficult_word_count, "difficult word percent": difficult_word_count / word_count, @@ -1304,7 +1319,7 @@ def _form_complexity_per_metric(stats): {"name": "pages", "weight": 2}, {"name": "citations per field", "weight": 1.2}, {"name": "avg fields per page", "weight": 1 / 8}, - {"name": "normalized characters per field", "weight": 1/8}, + {"name": "normalized characters per field", "weight": 1 / 8}, {"name": "sentences per page", "weight": 0.05}, # percents will have a higher weight, because they are between 0 and 1 {"name": "slotin percent", "weight": 2}, @@ -1322,11 +1337,11 @@ def weight(stats, metric): weight = metric.get("weight") or 1 val = 0 if "clip" in metric: - val = min(max(stats.get(name,0), metric["clip"][0]), metric["clip"][1]) + val = min(max(stats.get(name, 0), metric["clip"][0]), metric["clip"][1]) elif isinstance(stats.get(name), bool): val = 1 if stats.get(name) else 0 else: - val = stats.get(name,0) + val = stats.get(name, 0) if "intercept" in metric: val -= metric["intercept"] return val * weight diff --git a/formfyxer/pdf_wrangling.py b/formfyxer/pdf_wrangling.py index 0a5a509..370a6b8 100644 --- a/formfyxer/pdf_wrangling.py +++ b/formfyxer/pdf_wrangling.py @@ -1234,10 +1234,12 @@ def sort_contours(cnts, method: str = "left-to-right"): # construct tuple of bounding boxes and sort them top to bottom boundingBoxes = tuple(cv2.boundingRect(c) for c in cnts) if not boundingBoxes: - return (),() + return (), () # Sort the contours and bounding boxes - sorted_zip = sorted(zip(cnts, boundingBoxes), key=lambda b: b[1][coord], reverse=reverse) + sorted_zip = sorted( + zip(cnts, boundingBoxes), key=lambda b: b[1][coord], reverse=reverse + ) if not sorted_zip: return (), () diff --git a/formfyxer/tests/test_lit_explorer.py b/formfyxer/tests/test_lit_explorer.py index 5c57076..bd7158a 100644 --- a/formfyxer/tests/test_lit_explorer.py +++ b/formfyxer/tests/test_lit_explorer.py @@ -115,44 +115,42 @@ def test_phrase_and_position_various_orders(self): class TestSpot(unittest.TestCase): def setUp(self) -> None: self.request_args = { - 'url': 'https://spot.suffolklitlab.org/v0/entities-nested/', - 'headers': { - 'Authorization': 'Bearer your_SPOT_API_token goes here', - 'Content-Type': 'application/json' + "url": "https://spot.suffolklitlab.org/v0/entities-nested/", + "headers": { + "Authorization": "Bearer your_SPOT_API_token goes here", + "Content-Type": "application/json", + }, + "data": { + "text": "", + "save-text": 0, + "cutoff-lower": 0.25, + "cutoff-pred": 0.5, + "cutoff-upper": 0.6, }, - 'data': { - 'text': '', - 'save-text': 0, - 'cutoff-lower': 0.25, - 'cutoff-pred': 0.5, - 'cutoff-upper': 0.6, - } } return super().setUp() - - @mock.patch('requests.post') + @mock.patch("requests.post") def test_calls_spot(self, mock_post): - text = 'The quick brown fox jumps over the lazy dog.' - self.request_args['data']['text'] = text + text = "The quick brown fox jumps over the lazy dog." + self.request_args["data"]["text"] = text spot(text) mock_post.assert_called_with( - self.request_args['url'], - headers=self.request_args['headers'], - data=json.dumps(self.request_args['data']) + self.request_args["url"], + headers=self.request_args["headers"], + data=json.dumps(self.request_args["data"]), ) - - @mock.patch('requests.post') + @mock.patch("requests.post") def test_calls_spot_with_reduced_character_count(self, mock_post): - text = 'a' * 5001 - reduced_text = 'a' * 5000 - self.request_args['data']['text'] = reduced_text + text = "a" * 5001 + reduced_text = "a" * 5000 + self.request_args["data"]["text"] = reduced_text spot(text) mock_post.assert_called_with( - self.request_args['url'], - headers=self.request_args['headers'], - data=json.dumps(self.request_args['data']) + self.request_args["url"], + headers=self.request_args["headers"], + data=json.dumps(self.request_args["data"]), ) diff --git a/setup.py b/setup.py index 7af26a9..2ec7834 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def run(self): setuptools.setup( name='formfyxer', - version='0.3.0a1', + version='0.3.0a2', author='Suffolk LIT Lab', author_email='litlab@suffolk.edu', description='A tool for learning about and pre-processing pdf forms.',