From e4d023e6964b7027a77ae4b346e40121fd6cfb61 Mon Sep 17 00:00:00 2001
From: Quinten Steenhuis <qsteenhuis@gmail.com>
Date: Thu, 11 Jul 2024 13:23:09 -0400
Subject: [PATCH 1/2] Unpin scikit-learn because it conflicts with docassemble

---
 formfyxer/requirements.txt | 4 ++--
 setup.py                   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/formfyxer/requirements.txt b/formfyxer/requirements.txt
index 3f84a83..9b5aa6d 100644
--- a/formfyxer/requirements.txt
+++ b/formfyxer/requirements.txt
@@ -13,7 +13,7 @@ pikepdf
 reportlab
 requests
 ocrmypdf
-scikit-learn==1.2.2
+scikit-learn
 spacy
 textstat
 transformers
@@ -21,4 +21,4 @@ types-requests
 types-PyYAML
 en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl
 typer>=0.4.1,<0.5.0
-python-docx
\ No newline at end of file
+python-docx
diff --git a/setup.py b/setup.py
index f13a7e8..7af26a9 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ def run(self):
     license='MIT',
     packages=['formfyxer'],
     install_requires=['spacy', 'pdfminer.six', 'pandas', 'pikepdf',
-        'textstat', 'requests', 'numpy', 'scikit-learn==1.2.2', 'networkx', 'joblib',
+        'textstat', 'requests', 'numpy', 'scikit-learn', 'networkx', 'joblib',
         'nltk', 'boxdetect', 'pdf2image', 'reportlab>=3.6.13', 'pdfminer.six',
         'opencv-python', 'ocrmypdf', 'eyecite', 'passivepy>=0.2.16', 'sigfig',
         'typer>=0.4.1,<0.5.0', # typer pre 0.4.1 was broken by click 8.1.0: https://github.com/explosion/spaCy/issues/10564

From 5f7862540db11ed406eca09757d9935995789b9b Mon Sep 17 00:00:00 2001
From: Quinten Steenhuis <qsteenhuis@gmail.com>
Date: Thu, 11 Jul 2024 13:39:50 -0400
Subject: [PATCH 2/2] Apply new Black formatting rules; bump version to 0.3.0a2

---
 formfyxer/docx_wrangling.py          | 79 +++++++++++++------------
 formfyxer/lit_explorer.py            | 87 ++++++++++++++++------------
 formfyxer/pdf_wrangling.py           |  6 +-
 formfyxer/tests/test_lit_explorer.py | 50 ++++++++--------
 setup.py                             |  2 +-
 5 files changed, 123 insertions(+), 101 deletions(-)

diff --git a/formfyxer/docx_wrangling.py b/formfyxer/docx_wrangling.py
index 4c80c46..6959ce6 100644
--- a/formfyxer/docx_wrangling.py
+++ b/formfyxer/docx_wrangling.py
@@ -22,8 +22,8 @@
 
 def add_paragraph_after(paragraph, text):
     p = OxmlElement("w:p")
-    r = OxmlElement('w:r')
-    t = OxmlElement('w:t')
+    r = OxmlElement("w:r")
+    t = OxmlElement("w:t")
     t.text = text
 
     r.append(t)
@@ -33,17 +33,18 @@ def add_paragraph_after(paragraph, text):
 
 def add_paragraph_before(paragraph, text):
     p = OxmlElement("w:p")
-    r = OxmlElement('w:r')
-    t = OxmlElement('w:t')
+    r = OxmlElement("w:r")
+    t = OxmlElement("w:t")
     t.text = text
 
     r.append(t)
     p.append(r)
     paragraph._element.addprevious(p)
 
+
 def add_run_after(run, text):
-    r = OxmlElement('w:r')
-    t = OxmlElement('w:t')
+    r = OxmlElement("w:r")
+    t = OxmlElement("w:t")
     t.text = text
 
     r.append(t)
@@ -55,8 +56,8 @@ def update_docx(
 ) -> docx.Document:
     """Update the document with the modified runs.
 
-    Note: OpenAI is probabilistic, so the modified run indices may not be correct. 
-    When the index of a run or paragraph is out of range, a new paragraph 
+    Note: OpenAI is probabilistic, so the modified run indices may not be correct.
+    When the index of a run or paragraph is out of range, a new paragraph
     will be inserted at the end of the document or a new run at the end of the
     paragraph's runs.
 
@@ -88,19 +89,22 @@ def update_docx(
             continue
         run = paragraph.runs[run_number]
         if new_paragraph == 1:
-           add_paragraph_after(paragraph, modified_text)
+            add_paragraph_after(paragraph, modified_text)
         elif new_paragraph == -1:
-           add_paragraph_before(paragraph, modified_text)
+            add_paragraph_before(paragraph, modified_text)
         else:
             run.text = modified_text
     return document
 
-def get_docx_repr(docx_path: str, paragraph_start:int=0, paragraph_end:Optional[int]=None):
+
+def get_docx_repr(
+    docx_path: str, paragraph_start: int = 0, paragraph_end: Optional[int] = None
+):
     """Return a JSON representation of the paragraphs and runs in the DOCX file.
 
     Args:
         docx_path: path to the DOCX file
-    
+
     Returns:
         A JSON representation of the paragraphs and runs in the DOCX file.
     """
@@ -117,9 +121,10 @@ def get_docx_repr(docx_path: str, paragraph_start:int=0, paragraph_end:Optional[
             )
     return repr(items)
 
+
 def get_labeled_docx_runs(
     docx_path: Optional[str] = None,
-    docx_repr = Optional[str],
+    docx_repr=Optional[str],
     custom_people_names: Optional[Tuple[str, str]] = None,
     openai_client: Optional[OpenAI] = None,
     api_key: Optional[str] = None,
@@ -264,22 +269,23 @@ def get_labeled_docx_runs(
         "(State the reason for eviction)" transforms into `{{ eviction_reason }}`.
     """
     return get_modified_docx_runs(
-        docx_path = docx_path,
-        docx_repr = docx_repr,
+        docx_path=docx_path,
+        docx_repr=docx_repr,
         custom_example=custom_example,
         instructions=instructions,
         openai_client=openai_client,
         api_key=api_key,
     )
 
+
 def get_modified_docx_runs(
-        docx_path: Optional[str] = None,
-        docx_repr: Optional[str] = None,
-        custom_example:str = "",
-        instructions:str = "",
-        openai_client: Optional[OpenAI] = None, 
-        api_key:Optional[str]=None,
-        temperature=0.5,
+    docx_path: Optional[str] = None,
+    docx_repr: Optional[str] = None,
+    custom_example: str = "",
+    instructions: str = "",
+    openai_client: Optional[OpenAI] = None,
+    api_key: Optional[str] = None,
+    temperature=0.5,
 ) -> List[Tuple[int, int, str, int]]:
     """Use GPT to rewrite the contents of a DOCX file paragraph by paragraph. Does not handle tables, footers, or
     other structures yet.
@@ -301,9 +307,9 @@ def get_modified_docx_runs(
         [1, 0, "I hope this letter finds you well."],
     ]
 
-    Your custom instructions should include an example of how the sample will be modified, like the one below: 
-    
-    Example reply, indicating paragraph, run, the new text, and a number indicating if this changes the 
+    Your custom instructions should include an example of how the sample will be modified, like the one below:
+
+    Example reply, indicating paragraph, run, the new text, and a number indicating if this changes the
     current paragraph, adds one before, or adds one after (-1, 0, 1):
 
     {"results":
@@ -336,9 +342,7 @@ def get_modified_docx_runs(
     assert isinstance(docx_repr, str)
 
     if not openai_client:
-        openai_client = OpenAI(
-            api_key = api_key or os.environ.get("OPENAI_API_KEY")
-        )
+        openai_client = OpenAI(api_key=api_key or os.environ.get("OPENAI_API_KEY"))
 
     if not custom_example:
         custom_example = """[
@@ -347,7 +351,9 @@ def get_modified_docx_runs(
         [1, 0, "I hope this letter finds you well."],
     ]"""
 
-    if not "[" in instructions: # Make sure we have at least a minimal example of the output
+    if (
+        not "[" in instructions
+    ):  # Make sure we have at least a minimal example of the output
         instructions += """The result will look like this:
 
     {"results":
@@ -357,7 +363,7 @@ def get_modified_docx_runs(
         ]
     }
     """
-        
+
     role_description = f"""
     You will process a DOCX document and return a JSON structure that transforms the DOCX file
     based on the following guidelines and examples. The DOCX will be provided as an annotated series of
@@ -386,11 +392,11 @@ def get_modified_docx_runs(
             f"Input to OpenAI is too long ({token_count} tokens). Maximum is 128000 tokens."
         )
 
-    moderation_response = openai_client.moderations.create(input=role_description + docx_repr)
+    moderation_response = openai_client.moderations.create(
+        input=role_description + docx_repr
+    )
     if moderation_response.results[0].flagged:
-        raise Exception(
-            f"OpenAI moderation error: {moderation_response.results[0]}"
-        )
+        raise Exception(f"OpenAI moderation error: {moderation_response.results[0]}")
 
     response = openai_client.chat.completions.create(
         model="gpt-4-1106-preview",
@@ -416,6 +422,7 @@ def get_modified_docx_runs(
     guesses = json.loads(response.choices[0].message.content)["results"]
     return guesses
 
+
 def make_docx_plain_language(docx_path: str) -> docx.Document:
     """
     Convert a DOCX file to plain language with the help of OpenAI.
@@ -439,10 +446,10 @@ def make_docx_plain_language(docx_path: str) -> docx.Document:
         ]
     }
     """,
-    
     )
     return update_docx(docx.Document(docx_path), guesses)
 
+
 def modify_docx_with_openai_guesses(docx_path: str) -> docx.Document:
     """Uses OpenAI to guess the variable names for a document and then modifies the document with the guesses.
 
@@ -459,4 +466,4 @@ def modify_docx_with_openai_guesses(docx_path: str) -> docx.Document:
 
 if __name__ == "__main__":
     new_doc = modify_docx_with_openai_guesses(sys.argv[1])
-    new_doc.save(sys.argv[1] + ".output.docx")
\ No newline at end of file
+    new_doc.save(sys.argv[1] + ".output.docx")
diff --git a/formfyxer/lit_explorer.py b/formfyxer/lit_explorer.py
index 6e90b41..d1e38e6 100644
--- a/formfyxer/lit_explorer.py
+++ b/formfyxer/lit_explorer.py
@@ -34,7 +34,7 @@
     FieldType,
     unlock_pdf_in_place,
     is_tagged,
-)   
+)
 
 try:
     from nltk.corpus import stopwords
@@ -131,18 +131,20 @@
     with open(
         os.path.join(os.path.dirname(__file__), "keys", "openai_key.txt"), "r"
     ) as in_file:
-        default_key:Optional[str] = in_file.read().rstrip()
+        default_key: Optional[str] = in_file.read().rstrip()
 except:
     default_key = None
 try:
     with open(
         os.path.join(os.path.dirname(__file__), "keys", "openai_org.txt"), "r"
     ) as in_file:
-        default_org:Optional[str] = in_file.read().rstrip()
+        default_org: Optional[str] = in_file.read().rstrip()
 except:
     default_org = None
 if default_key:
-    client:Optional[OpenAI] = OpenAI(api_key=default_key, organization=default_org or None)
+    client: Optional[OpenAI] = OpenAI(
+        api_key=default_key, organization=default_org or None
+    )
 elif os.getenv("OPENAI_API_KEY"):
     client = OpenAI()
 else:
@@ -160,6 +162,7 @@
     CURRENT_DIRECTORY, "data", "simplified_words.yml"
 )
 
+
 # This creates a timeout exception that can be triggered when something hangs too long.
 class TimeoutException(Exception):
     pass
@@ -429,18 +432,19 @@ def normalize_name(
     not, to a snake_case variable name of appropriate length.
 
     HACK: temporarily all we do is re-case it and normalize it using regex rules.
-    Will be replaced with call to LLM soon.        
+    Will be replaced with call to LLM soon.
     """
-    
+
     if this_field not in included_fields:
         this_field = re_case(this_field)
         this_field = regex_norm_field(this_field)
 
     if this_field in included_fields:
         return f"*{this_field}", 0.01
-    
+
     return reformat_field(this_field, tools_token=tools_token), 0.5
 
+
 # Take a list of AL variables and spits out suggested groupings. Here's what's going on:
 #
 # 1. It reads in a list of fields (e.g., `["user_name","user_address"]`)
@@ -652,23 +656,21 @@ def classify_field(field: FieldInfo, new_name: str) -> AnswerType:
     return AnswerType.GATHERED
 
 
-def get_adjusted_character_count(
-        field: FieldInfo
-)-> float:
+def get_adjusted_character_count(field: FieldInfo) -> float:
     """
-    Determines the bracketed length of an input field based on its max_length attribute, 
-    returning a float representing the approximate length of the field content. 
+    Determines the bracketed length of an input field based on its max_length attribute,
+    returning a float representing the approximate length of the field content.
 
     The function chunks the answers into 5 different lengths (checkboxes, 2 words, short, medium, and long)
     instead of directly using the character count, as forms can allocate different spaces
     for the same data without considering the space the user actually needs.
 
     Args:
-        field (FieldInfo): An object containing information about the input field, 
+        field (FieldInfo): An object containing information about the input field,
                            including the "max_length" attribute.
 
     Returns:
-        float: The approximate length of the field content, categorized into checkboxes, 2 words, short, 
+        float: The approximate length of the field content, categorized into checkboxes, 2 words, short,
                medium, or long based on the max_length attribute.
 
     Examples:
@@ -694,10 +696,8 @@ def get_adjusted_character_count(
     )  # Anything over 10 lines probably needs a full page but form author skimped on space
     if field["type"] != InputType.TEXT:
         return ONE_WORD
-    
-    if field["max_length"] <= ONE_LINE or (
-        field["max_length"] <= ONE_LINE * 2
-    ):
+
+    if field["max_length"] <= ONE_LINE or (field["max_length"] <= ONE_LINE * 2):
         return ONE_WORD * 2
     elif field["max_length"] <= SHORT_ANSWER:
         return SHORT_ANSWER
@@ -816,7 +816,12 @@ class OpenAiCreds(TypedDict):
     key: str
 
 
-def text_complete(prompt:str, max_tokens:int=500, creds: Optional[OpenAiCreds] = None, temperature:float=0) -> str:
+def text_complete(
+    prompt: str,
+    max_tokens: int = 500,
+    creds: Optional[OpenAiCreds] = None,
+    temperature: float = 0,
+) -> str:
     """Run a prompt via openAI's API and return the result.
 
     Args:
@@ -836,16 +841,13 @@ def text_complete(prompt:str, max_tokens:int=500, creds: Optional[OpenAiCreds] =
         response = openai_client.chat.completions.create(
             model="gpt-3.5-turbo",
             messages=[
-                {
-                    "role": "system", 
-                    "content": prompt
-                },
+                {"role": "system", "content": prompt},
             ],
             temperature=temperature,
             max_tokens=max_tokens,
             top_p=1.0,
             frequency_penalty=0.0,
-            presence_penalty=0.0
+            presence_penalty=0.0,
         )
         return str((response.choices[0].message.content or "").strip())
     except Exception as ex:
@@ -1004,7 +1006,9 @@ def substitute_phrases(
 
     # Find all matches for the substitution phrases
     for original, replacement in sorted_phrases:
-        for match in re.finditer(r"\b" + re.escape(original) + r"\b", input_string, re.IGNORECASE):
+        for match in re.finditer(
+            r"\b" + re.escape(original) + r"\b", input_string, re.IGNORECASE
+        ):
             matches.append((match.start(), match.end(), replacement))
 
     # Sort the matches based on their starting position
@@ -1108,7 +1112,11 @@ def parse_form(
     except:
         readability = -1
     # Still attempt to re-evaluate if not using openai
-    if not original_text or (openai_creds and description == "abortthisnow.") or readability > 30:
+    if (
+        not original_text
+        or (openai_creds and description == "abortthisnow.")
+        or readability > 30
+    ):
         # We do not care what the PDF output is, doesn't add that much time
         ocr_p = [
             "ocrmypdf",
@@ -1216,9 +1224,11 @@ def parse_form(
         "category": cat,
         "pages": pages_count,
         "reading grade level": readability,
-        "time to answer": time_to_answer_form(field_types_and_sizes(ff), new_names)
-        if ff
-        else [-1, -1],
+        "time to answer": (
+            time_to_answer_form(field_types_and_sizes(ff), new_names)
+            if ff
+            else [-1, -1]
+        ),
         "list": nsmi,
         "avg fields per page": f_per_page,
         "fields": new_names,
@@ -1236,16 +1246,21 @@ def parse_form(
         "slotin percent": slotin_count / field_count if field_count > 0 else 0,
         "gathered percent": gathered_count / field_count if field_count > 0 else 0,
         "created percent": created_count / field_count if field_count > 0 else 0,
-        "third party percent": third_party_count / field_count
-        if field_count > 0
-        else 0,
+        "third party percent": (
+            third_party_count / field_count if field_count > 0 else 0
+        ),
         "passive voice percent": (
             passive_sentences_count / sentence_count if sentence_count > 0 else 0
         ),
         "citations per field": citation_count / field_count if field_count > 0 else 0,
         "citation count": citation_count,
         "all caps percent": all_caps_count / word_count,
-        "normalized characters per field": sum(get_adjusted_character_count(field) for field in field_types ) / field_count if ff else 0,
+        "normalized characters per field": (
+            sum(get_adjusted_character_count(field) for field in field_types)
+            / field_count
+            if ff
+            else 0
+        ),
         "difficult words": difficult_words,
         "difficult word count": difficult_word_count,
         "difficult word percent": difficult_word_count / word_count,
@@ -1304,7 +1319,7 @@ def _form_complexity_per_metric(stats):
         {"name": "pages", "weight": 2},
         {"name": "citations per field", "weight": 1.2},
         {"name": "avg fields per page", "weight": 1 / 8},
-        {"name": "normalized characters per field", "weight": 1/8},
+        {"name": "normalized characters per field", "weight": 1 / 8},
         {"name": "sentences per page", "weight": 0.05},
         # percents will have a higher weight, because they are between 0 and 1
         {"name": "slotin percent", "weight": 2},
@@ -1322,11 +1337,11 @@ def weight(stats, metric):
         weight = metric.get("weight") or 1
         val = 0
         if "clip" in metric:
-            val = min(max(stats.get(name,0), metric["clip"][0]), metric["clip"][1])
+            val = min(max(stats.get(name, 0), metric["clip"][0]), metric["clip"][1])
         elif isinstance(stats.get(name), bool):
             val = 1 if stats.get(name) else 0
         else:
-            val = stats.get(name,0)
+            val = stats.get(name, 0)
         if "intercept" in metric:
             val -= metric["intercept"]
         return val * weight
diff --git a/formfyxer/pdf_wrangling.py b/formfyxer/pdf_wrangling.py
index 0a5a509..370a6b8 100644
--- a/formfyxer/pdf_wrangling.py
+++ b/formfyxer/pdf_wrangling.py
@@ -1234,10 +1234,12 @@ def sort_contours(cnts, method: str = "left-to-right"):
         # construct tuple of bounding boxes and sort them top to bottom
         boundingBoxes = tuple(cv2.boundingRect(c) for c in cnts)
         if not boundingBoxes:
-            return (),()
+            return (), ()
 
         # Sort the contours and bounding boxes
-        sorted_zip = sorted(zip(cnts, boundingBoxes), key=lambda b: b[1][coord], reverse=reverse)
+        sorted_zip = sorted(
+            zip(cnts, boundingBoxes), key=lambda b: b[1][coord], reverse=reverse
+        )
 
         if not sorted_zip:
             return (), ()
diff --git a/formfyxer/tests/test_lit_explorer.py b/formfyxer/tests/test_lit_explorer.py
index 5c57076..bd7158a 100644
--- a/formfyxer/tests/test_lit_explorer.py
+++ b/formfyxer/tests/test_lit_explorer.py
@@ -115,44 +115,42 @@ def test_phrase_and_position_various_orders(self):
 class TestSpot(unittest.TestCase):
     def setUp(self) -> None:
         self.request_args = {
-            'url': 'https://spot.suffolklitlab.org/v0/entities-nested/',
-            'headers': {
-                'Authorization': 'Bearer your_SPOT_API_token goes here',
-                'Content-Type': 'application/json'
+            "url": "https://spot.suffolklitlab.org/v0/entities-nested/",
+            "headers": {
+                "Authorization": "Bearer your_SPOT_API_token goes here",
+                "Content-Type": "application/json",
+            },
+            "data": {
+                "text": "",
+                "save-text": 0,
+                "cutoff-lower": 0.25,
+                "cutoff-pred": 0.5,
+                "cutoff-upper": 0.6,
             },
-            'data': {
-                'text': '',
-                'save-text': 0,
-                'cutoff-lower': 0.25,
-                'cutoff-pred': 0.5,
-                'cutoff-upper': 0.6,
-            }
         }
         return super().setUp()
 
-
-    @mock.patch('requests.post')
+    @mock.patch("requests.post")
     def test_calls_spot(self, mock_post):
-        text = 'The quick brown fox jumps over the lazy dog.'
-        self.request_args['data']['text'] = text
+        text = "The quick brown fox jumps over the lazy dog."
+        self.request_args["data"]["text"] = text
         spot(text)
         mock_post.assert_called_with(
-            self.request_args['url'],
-            headers=self.request_args['headers'],
-            data=json.dumps(self.request_args['data'])
+            self.request_args["url"],
+            headers=self.request_args["headers"],
+            data=json.dumps(self.request_args["data"]),
         )
 
-
-    @mock.patch('requests.post')
+    @mock.patch("requests.post")
     def test_calls_spot_with_reduced_character_count(self, mock_post):
-        text = 'a' * 5001
-        reduced_text = 'a' * 5000
-        self.request_args['data']['text'] = reduced_text
+        text = "a" * 5001
+        reduced_text = "a" * 5000
+        self.request_args["data"]["text"] = reduced_text
         spot(text)
         mock_post.assert_called_with(
-            self.request_args['url'],
-            headers=self.request_args['headers'],
-            data=json.dumps(self.request_args['data'])
+            self.request_args["url"],
+            headers=self.request_args["headers"],
+            data=json.dumps(self.request_args["data"]),
         )
 
 
diff --git a/setup.py b/setup.py
index 7af26a9..2ec7834 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ def run(self):
 
 setuptools.setup(
     name='formfyxer',
-    version='0.3.0a1',
+    version='0.3.0a2',
     author='Suffolk LIT Lab',
     author_email='litlab@suffolk.edu',
     description='A tool for learning about and pre-processing pdf forms.',