diff --git a/.gitignore b/.gitignore index a1e09aa..949d3de 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ dev-testing/.DS_Store .env .venv venv +formfyxer/keys/** \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 46037e4..ff374f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # CHANGELOG + +## Version v0.3.0a2 + +### Added +* Add warning when sensitive fields are detected by @codestronger in https://github.com/SuffolkLITLab/RateMyPDF/issues/25 + +### Changed +N/A + +### Fixed +N/A + +**Full Changelog**: https://github.com/SuffolkLITLab/FormFyxer/compare/v0.2.0...v0.3.0a2 + ## Version v0.2.0 ### Added @@ -22,7 +36,7 @@ ### Fixed -* If GPT-3 says the readability is too high (i.e. high likelyhood we have garabage), we will use ocrmypydf to re-evaluate the text in a PDF (https://github.com/SuffolkLITLab/FormFyxer/commit/a6dcd9872d2d0a6542f687aa46b1b9b00f16d3e5) +* If GPT-3 says the readability is too high (i.e. high likelihood we have garbage), we will use ocrmypdf to re-evaluate the text in a PDF (https://github.com/SuffolkLITLab/FormFyxer/commit/a6dcd9872d2d0a6542f687aa46b1b9b00f16d3e5) * Adds more actionable information to the stats returned from `parse_form` (https://github.com/SuffolkLITLab/FormFyxer/pull/83): * Gives more context for citations in found in the text: https://github.com/SuffolkLITLab/FormFyxer/pull/83/commits/b62bd41958fc1bd0373b7698adde1a234779f77a diff --git a/README.md b/README.md index c005281..02b5d01 100644 --- a/README.md +++ b/README.md @@ -80,9 +80,12 @@ Functions from `pdf_wrangling` are found on [our documentation site](https://suf - [Parameters:](#parameters-10) - [Returns:](#returns-10) - [Example:](#example-10) + - [formfyxer.get\_sensitive\_fields(fields)](#formfyxerget_sensitive_fieldsfields) + - [Parameters:](#parameters-11) + - [Returns:](#returns-11) + - [Example:](#example-11) - [License](#license) - ### formfyxer.re_case(text) Reformats snake_case, camelCase, and 
similarly-formatted text into individual words. #### Parameters: @@ -99,9 +102,9 @@ A string where words combined by cases like snake_case are split back into indiv ### formfyxer.regex_norm_field(text) -Given an auto-generated field name (e.g., those applied by a PDF editor's find form feilds function), this function uses regular expressions to replace common auto-generated field names for those found in our [standard field names](https://suffolklitlab.org/docassemble-AssemblyLine-documentation/docs/label_variables/). +Given an auto-generated field name (e.g., those applied by a PDF editor's find form fields function), this function uses regular expressions to replace common auto-generated field names for those found in our [standard field names](https://suffolklitlab.org/docassemble-AssemblyLine-documentation/docs/label_variables/). #### Parameters: -* **text : str** A string of words, such as that found in an auto-generated field name (e.g., those applied by a PDF editor's find form feilds function). +* **text : str** A string of words, such as that found in an auto-generated field name (e.g., those applied by a PDF editor's find form fields function). #### Returns: Either the original string/field name, or if a standard field name is found, the standard field name. #### Example: @@ -124,7 +127,7 @@ A snake_case string summarizing the input sentence. #### Example: ```python >>> import formfyxer ->>> reformat_field("this is a variable where you fill out your name") +>>> formfyxer.reformat_field("this is a variable where you fill out your name") 'variable_fill_name' ``` [back to top](#formfyxer) @@ -345,7 +348,7 @@ A string with a proposed plain language rewrite. ### formfyxer.describe_form(text) An OpenAI-enabled tool that will write a draft plain language description for a form. In order to use this feature **you must edit the `openai_org.txt` and `openai_key.txt` files found in this package to contain your OpenAI credentials**. 
# Regex fragments for each category of sensitive information, keyed by the
# human-readable label that is reported back to the caller. Raw strings are
# required so that `\W` reaches the regex engine instead of being treated as an
# (invalid) Python string escape.
# NOTE: omitting CID since it has a lot of false positives.
_SENSITIVE_FIELD_PATTERNS = {
    "Social Security Number": [
        r"social[\W_]*security[\W_]*number",
        r"SSN",
        r"TIN$",
    ],
    "Bank Account Number": [
        r"account[\W_]*number",
        r"ABA$",
        r"routing[\W_]*number",
        r"checking",
    ],
    "Credit Card Number": [
        r"credit[\W_]*card",
        r"(CV[CDV]2?|CCV|CSC)",
    ],
    "Driver's License": [
        # Accepts "drivers license", "driver's_license", "driver license" and
        # the "licence" spelling.
        r"driver[\W_]*s?[\W_]*licen[sc]e",
        # Any field name that ends in "DL".
        r"DL$",
    ],
}

# One combined regex per category, compiled once at import time.
# MULTILINE makes `$` match at the end of every field name, because the names
# are joined with newlines before searching.
_SENSITIVE_FIELD_REGEXES = {
    name: re.compile("|".join(patterns), re.IGNORECASE | re.MULTILINE)
    for name, patterns in _SENSITIVE_FIELD_PATTERNS.items()
}


def get_sensitive_fields(fields: List[str]) -> List[str]:
    """
    Given a list of fields, identify those related to sensitive information.

    Sensitive fields include Social Security Number (SSN), Driver's License
    (DL), bank account numbers, and credit card numbers. Matching is
    case-insensitive.

    Args:
        fields: Field names to inspect.

    Returns:
        The labels of the sensitive categories detected anywhere in `fields`
        (e.g. "Social Security Number"), not the matching field names
        themselves. Each label appears at most once, in the order the
        categories are declared. An empty list is returned when nothing
        matches, including when `fields` is empty.
    """
    # Search all names in one pass; one name per line keeps the `$` anchors
    # tied to the end of an individual field name.
    text = "\n".join(fields)
    return [
        name for name, regex in _SENSITIVE_FIELD_REGEXES.items() if regex.search(text)
    ]
+ sensitive_fields = get_sensitive_fields(field_names + new_names) slotin_count = sum(1 for c in classified if c == AnswerType.SLOT_IN) gathered_count = sum(1 for c in classified if c == AnswerType.GATHERED) @@ -1224,6 +1261,7 @@ def parse_form( "fields": new_names, "fields_conf": new_names_conf, "fields_old": field_names, + "sensitive fields": sensitive_fields, "text": text, "original_text": original_text, "number of sentences": sentence_count, diff --git a/setup.py b/setup.py index f13a7e8..b51b4d2 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def run(self): setuptools.setup( name='formfyxer', - version='0.3.0a1', + version='0.3.0a2', author='Suffolk LIT Lab', author_email='litlab@suffolk.edu', description='A tool for learning about and pre-processing pdf forms.', @@ -33,7 +33,7 @@ def run(self): 'nltk', 'boxdetect', 'pdf2image', 'reportlab>=3.6.13', 'pdfminer.six', 'opencv-python', 'ocrmypdf', 'eyecite', 'passivepy>=0.2.16', 'sigfig', 'typer>=0.4.1,<0.5.0', # typer pre 0.4.1 was broken by click 8.1.0: https://github.com/explosion/spaCy/issues/10564 - 'openai', 'transformers' + 'openai', 'python-docx', 'tiktoken', 'transformers' ], cmdclass={ 'install': InstallSpacyModelCommand,