From 593c5c2b72041aae07ef479ee9baa7398f9752c1 Mon Sep 17 00:00:00 2001
From: Mart Ratas <mart.ratas@gmail.com>
Date: Mon, 27 Nov 2023 06:25:32 -0600
Subject: [PATCH] Cu 8692y0n0a Add mypy (#9)

* CU-8692y0n0a Add dev-requirements

* CU-8692y0n0a Install dev-requirements in workflow

* CU-8692y0n0a Add mypy config

* CU-8692y0n0a Some typing fixes in cogstack and credentials modules

* CU-8692y0n0a Some typing fixes in python modules regarding model creation and training

* CU-8692y0n0a Fix method name typo and a typing fix for model running module

* CU-8692y0n0a Some typing fixes in mct evaluate module

* CU-8692y0n0a Add mypy to workflow

* CU-8692y0n0a Add type ignore comment for pandas chained assignment in cdb creation

* CU-8692y0n0a Add type ignore comment for pandas chained assignment in cdb creation (UMLS)
---
 .github/workflows/main.yml                    |  7 +++++++
 cogstack.py                                   | 19 ++++++++-----------
 credentials.py                                |  3 ++-
 .../1_create_model/create_cdb/create_cdb.py   |  4 ++--
 .../create_cdb/create_umls_cdb.py             |  2 +-
 .../1_unsupervised_training/splitter.py       |  4 ++--
 .../unsupervised_medcattraining.py            |  6 +++---
 medcat/3_run_model/run_model.py               |  4 ++--
 medcat/evaluate_mct_export/mct_analysis.py    | 13 +++++++------
 mypy.ini                                      |  5 +++++
 requirements-dev.txt                          |  5 +++++
 11 files changed, 44 insertions(+), 28 deletions(-)
 create mode 100644 mypy.ini
 create mode 100644 requirements-dev.txt

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 922f8a8..04427ea 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -25,6 +25,13 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements.txt
+          pip install -r requirements-dev.txt
+      - name: Typing
+        # run mypy on all tracked non-test python modules
+        # and use explicit package base since the project
+        # is not set up as a python package
+        run: |
+          python -m mypy `git ls-tree --full-tree --name-only -r HEAD | grep ".py$" | grep -v "tests/"` --explicit-package-bases --follow-imports=normal
       - name: Test
         run: |
           python -m unittest discover
diff --git a/cogstack.py b/cogstack.py
index 22c27b3..2da9e50 100644
--- a/cogstack.py
+++ b/cogstack.py
@@ -1,5 +1,5 @@
 import getpass
-from typing import Dict, List, Any, Optional
+from typing import Dict, List, Any, Optional, Iterable, Tuple
 import elasticsearch
 import elasticsearch.helpers
 import pandas as pd
@@ -22,7 +22,8 @@ class CogStack(object):
         password (str, optional): The password to use when connecting to Elasticsearch. If not provided, the user will be prompted to enter a password.
         api (bool, optional): A boolean value indicating whether to use API keys or basic authentication to connect to Elasticsearch. Defaults to False (i.e., use basic authentication).
     """
-    def __init__(self, hosts: List, username: str=None, password: str=None, api=False):
+    def __init__(self, hosts: List, username: Optional[str] = None, password: Optional[str] = None,
+                 api: bool = False):
 
         if api:
             api_username, api_password = self._check_auth_details(username, password)
@@ -36,7 +37,7 @@ def __init__(self, hosts: List, username: str=None, password: str=None, api=Fals
                                                        verify_certs=False)
 
 
-    def _check_auth_details(self, username=None, password=None):
+    def _check_auth_details(self, username=None, password=None) -> Tuple[str, str]:
         """
         Prompt the user for a username and password if the values are not provided as function arguments.
         
@@ -53,7 +54,7 @@ def _check_auth_details(self, username=None, password=None):
             password = getpass.getpass("Password: ")
         return username, password
 
-    def get_docs_generator(self, index: List, query: Dict, es_gen_size: int=800, request_timeout: int=300):
+    def get_docs_generator(self, index: List, query: Dict, es_gen_size: int=800, request_timeout: Optional[int] = 300):
         """
         Retrieve a generator object that can be used to iterate through documents in an Elasticsearch index.
         
@@ -95,12 +96,8 @@ def cogstack2df(self, query: Dict, index: str, column_headers=None, es_gen_size:
                                                     size=es_gen_size,
                                                     request_timeout=request_timeout)
         temp_results = []
-        results = self.elastic.count(index=index, query=query['query'], request_timeout=300)
-        if show_progress:
-            _tqdm = tqdm
-        else:
-            _tqdm = _no_progress_bar
-        for hit in _tqdm(docs_generator, total=results['count'], desc="CogStack retrieved..."):
+        results = self.elastic.count(index=index, query=query['query'], request_timeout=300)  # type: ignore
+        for hit in tqdm(docs_generator, total=results['count'], desc="CogStack retrieved...", disable=not show_progress):
             row = dict()
             row['_index'] = hit['_index']
             row['_id'] = hit['_id']
@@ -144,6 +141,6 @@ def list_chunker(user_list: List[Any], n: int) -> List[List[Any]]:
     return [user_list[i:i+n] for i in range(0, len(user_list), n)]
 
 
-def _no_progress_bar(iterable: list, **kwargs):
+def _no_progress_bar(iterable: Iterable, **kwargs):
     return iterable
 
diff --git a/credentials.py b/credentials.py
index fd1e4f2..d33b4b9 100644
--- a/credentials.py
+++ b/credentials.py
@@ -1,7 +1,8 @@
+from typing import List
 # CogStack login details
 ## Any questions on what these details are please contact your local CogStack administrator.
 
-hosts = []  # This is a list of your CogStack ElasticSearch instances.
+hosts: List[str] = []  # This is a list of your CogStack ElasticSearch instances.
 
 ## These are your login details (either via http_auth or API) Should be in str format
 username = None
diff --git a/medcat/1_create_model/create_cdb/create_cdb.py b/medcat/1_create_model/create_cdb/create_cdb.py
index 6ba2bbe..b163422 100644
--- a/medcat/1_create_model/create_cdb/create_cdb.py
+++ b/medcat/1_create_model/create_cdb/create_cdb.py
@@ -3,7 +3,7 @@
 from medcat.config import Config
 from medcat.cdb_maker import CDBMaker
 
-pd.options.mode.chained_assignment = None
+pd.options.mode.chained_assignment = None  # type: ignore
 
 # relative to file path
 _FILE_DIR = os.path.dirname(__file__)
@@ -41,7 +41,7 @@
 print("Cleaning acronyms...")
 for i, row in csv[(~csv['acronym'].isnull()) & (csv['name_status'] == 'A')][['name', 'acronym']].iterrows():
     if row['name'][0:len(row['acronym'])] == row['acronym']:
-        csv['name'].iloc[i] = row['acronym']
+        csv['name'].iloc[i] = row['acronym']  # type: ignore
 
 print("acronyms complete")
 
diff --git a/medcat/1_create_model/create_cdb/create_umls_cdb.py b/medcat/1_create_model/create_cdb/create_umls_cdb.py
index 51dbf16..f692024 100644
--- a/medcat/1_create_model/create_cdb/create_umls_cdb.py
+++ b/medcat/1_create_model/create_cdb/create_umls_cdb.py
@@ -3,7 +3,7 @@
 from medcat.config import Config
 from medcat.cdb_maker import CDBMaker
 
-pd.options.mode.chained_assignment = None
+pd.options.mode.chained_assignment = None  # type: ignore
 
 # relative to file path
 _FILE_DIR = os.path.dirname(__file__)
diff --git a/medcat/2_train_model/1_unsupervised_training/splitter.py b/medcat/2_train_model/1_unsupervised_training/splitter.py
index c980d8a..8dae276 100644
--- a/medcat/2_train_model/1_unsupervised_training/splitter.py
+++ b/medcat/2_train_model/1_unsupervised_training/splitter.py
@@ -134,8 +134,8 @@ def split(self, in_file: str):
                         1, self.opts, self.split_identifier, header=line)
                     continue
                 buffer = buffer.process_or_write(line_nr, line)
-        if len(buffer.lines) > 1: # if there's more than just a header
-            buffer.save() # saver remaining
+        if buffer and len(buffer.lines) > 1:  # if there's more than just a header
+            buffer.save()  # save the remaining lines
 
 
 def split_file(in_file: str, nr_of_lines: int, out_file_format: str) -> None:
diff --git a/medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py b/medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py
index e4c1199..89c9b51 100644
--- a/medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py
+++ b/medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py
@@ -11,7 +11,7 @@
 medcat_logger.addHandler(fh)
 
 ###Change parameters here###
-cogstack_indices = []  # list of cogstack indexes here
+cogstack_indices: list = []  # list of cogstack indexes here
 text_columns = ['body_analysed']  # list of all text containing fields
 # relative to file path
 _FILE_DIR = os.path.dirname(__file__)
@@ -24,8 +24,8 @@
 model_pack_name = ''
 output_modelpack_name = ''  # name of modelpack to save
 
-cs = CogStack(hosts, api_username=api_username, api_password=api_password, api=True)
-df = cs.DataFrame(index=cogstack_indices, columns=text_columns)
+cs = CogStack(hosts, username=username, password=password, api=True)
+df = cs.DataFrame(index=cogstack_indices, columns=text_columns)  # type: ignore
 
 cat = CAT.load_model_pack(model_pack_path+model_pack_name)
 cat.cdb.print_stats()
diff --git a/medcat/3_run_model/run_model.py b/medcat/3_run_model/run_model.py
index dcc6745..3d97c42 100644
--- a/medcat/3_run_model/run_model.py
+++ b/medcat/3_run_model/run_model.py
@@ -29,7 +29,7 @@
 cogstack_indices = [''] # Enter your list of relevant cogstack indices here
 
 # log size of indices
-df = cs.DataFrame(index=cogstack_indices, columns=['body_analysed'])
+df = cs.DataFrame(index=cogstack_indices, columns=['body_analysed'])  # type: ignore
 medcat_logger.warning(f'The index size is {df.shape[0]}!')
 del df
 
@@ -44,7 +44,7 @@
 
 data_dir = 'data'
 ann_folder_path = os.path.join(base_path, data_dir, f'annotated_docs')
-if not os.path.exisits(ann_folder_path):
+if not os.path.exists(ann_folder_path):
     os.makedirs(ann_folder_path)
 
 medcat_logger.warning(f'Anntotations will be saved here: {ann_folder_path}')
diff --git a/medcat/evaluate_mct_export/mct_analysis.py b/medcat/evaluate_mct_export/mct_analysis.py
index 5fc0a2a..286fb31 100644
--- a/medcat/evaluate_mct_export/mct_analysis.py
+++ b/medcat/evaluate_mct_export/mct_analysis.py
@@ -147,7 +147,7 @@ def user_stats(self, by_user: bool = True):
             return data
         return data[['user', 'count', 'date']]
 
-    def plot_user_stats(self, save_fig: bool = False, save_fig_filename: str = False):
+    def plot_user_stats(self, save_fig: bool = False, save_fig_filename: str = ''):
         """
         Plot annotator user stats against time.
         An alternative method of saving the file is: plot_user_stats().write_image("path/filename.png")
@@ -352,7 +352,7 @@ def meta_anns_concept_summary(self):
         meta_anns_df.insert(1, 'concept_name', meta_anns_df['cui'].map(self.cat.cdb.cui2preferred_name))
         return meta_anns_df
 
-    def generate_report(self, path: str = 'mct_report.xlsx', meta_ann=False, concept_filter: List = None):
+    def generate_report(self, path: str = 'mct_report.xlsx', meta_ann=False, concept_filter: Optional[List] = None):
         """
         :param path: Outfile path
         :param meta_ann: Include Meta_annotation evaluation in the summary as well
@@ -362,8 +362,9 @@ def generate_report(self, path: str = 'mct_report.xlsx', meta_ann=False, concept
         if concept_filter:
             with pd.ExcelWriter(path, engine_kwargs={'options': {'remove_timezone': True}}) as writer:
                 print('Generating report...')
-                df = pd.DataFrame.from_dict([self.cat.get_model_card(as_dict=True)]).T.reset_index(drop=False)
-                df.columns = ['MCT report', f'Generated on {date.today().strftime("%Y/%m/%d")}']
+                # array-like input is allowed by the pandas documentation but rejected by the type stubs
+                df = pd.DataFrame.from_dict([self.cat.get_model_card(as_dict=True)]).T.reset_index(drop=False)  # type: ignore
+                df.columns = ['MCT report', f'Generated on {date.today().strftime("%Y/%m/%d")}']  # type: ignore
                 df = pd.concat([df, pd.DataFrame([['MCT Custom filter', concept_filter]], columns=df.columns)],
                                ignore_index = True)
                 df.to_excel(writer, index=False, sheet_name='medcat_model_card')
@@ -390,8 +391,8 @@ def generate_report(self, path: str = 'mct_report.xlsx', meta_ann=False, concept
         else:
             with pd.ExcelWriter(path, engine_kwargs={'options': {'remove_timezone': True}}) as writer:
                 print('Generating report...')
-                df = pd.DataFrame.from_dict([self.cat.get_model_card(as_dict=True)]).T.reset_index(drop=False)
-                df.columns = ['MCT report', f'Generated on {date.today().strftime("%Y/%m/%d")}']
+                df = pd.DataFrame.from_dict([self.cat.get_model_card(as_dict=True)]).T.reset_index(drop=False)  # type: ignore
+                df.columns = ['MCT report', f'Generated on {date.today().strftime("%Y/%m/%d")}']  # type: ignore
                 df.to_excel(writer, index=False, sheet_name='medcat_model_card')
                 self.user_stats().to_excel(writer, index=False, sheet_name='user_stats')
                 #self.plot_user_stats().to_excel(writer, index=False, sheet_name='user_stats_plot')
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 0000000..9738850
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,5 @@
+# Global options:
+
+[mypy]
+ignore_missing_imports = True
+allow_redefinition = True
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..7889f13
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,5 @@
+mypy
+pandas-stubs
+types-tqdm
+types-requests
+types-regex
\ No newline at end of file