51/new address features (#60)
* #51 add WordEmbeddingConditions, Abbreviations and Emails

* #51 fix test pre-processing path

---------

Co-authored-by: atarchetti <atarchetti@avantstay.com>
apmt and atarchetti authored Feb 2, 2023
1 parent 6fd16f3 commit 39229b3
Showing 11 changed files with 162 additions and 66 deletions.
7 changes: 7 additions & 0 deletions data/dicts/mexico_abbreviations.csv
@@ -0,0 +1,7 @@
abbreviation,replace
AV,AVENUE
NO,NUMBER
NUM,NUMBER
PO,POST OFFICE
BLVD,BOULEVARD
LT,LOTE
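To see what these rows do in practice, here is a minimal sketch of whole-word abbreviation expansion using plain `csv` and `re`, independent of hygia's own loader in `pre_process_data.py` below; the sample address is invented:

```python
import csv
import re

# Load the dictionary added above (path assumes the repository root).
with open('data/dicts/mexico_abbreviations.csv') as f:
    abbreviations = {row['abbreviation']: row['replace'] for row in csv.DictReader(f)}

def expand(text: str) -> str:
    # Replace whole words only, case-insensitively, so 'AV' matches but 'AVENIDA' does not.
    for abbreviation, replacement in abbreviations.items():
        text = re.sub(rf'\b{abbreviation}\b', replacement, text, flags=re.IGNORECASE)
    return text

print(expand('AV JUAREZ NO 12'))  # AVENUE JUAREZ NUMBER 12
```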
3 changes: 3 additions & 0 deletions data/dicts/mexico_context.csv
@@ -0,0 +1,3 @@
text,valid
pendiente,0
pending,0
Binary file not shown.
95 changes: 63 additions & 32 deletions examples/MEXICO_retrain_predict_example.ipynb
@@ -2,9 +2,25 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/anapaula/.local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"2023-01-28 20:08:59.580652: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2023-01-28 20:08:59.659490: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
"2023-01-28 20:08:59.659504: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n",
"2023-01-28 20:09:00.126530: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
"2023-01-28 20:09:00.126588: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
"2023-01-28 20:09:00.126592: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
]
}
],
"source": [
"import pandas as pd\n",
"import hygia as hg\n",
@@ -73,7 +89,16 @@
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"aliases indified: \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2 -> \u001b[22m['STREET_ADDRESS_1', 'STREET_ADDRESS_2']\n",
"handle null values in the column \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2\u001b[22m\n"
]
}
],
"source": [
"concatened_column_name = 'concat_STREET_ADDRESS_1_STREET_ADDRESS_2'\n",
"df = pre_process_data.pre_process_data(df, ['STREET_ADDRESS_1', 'STREET_ADDRESS_2'], concatened_column_name)\n",
@@ -133,7 +158,7 @@
" 'feature_re_contains_email_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_url_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_date_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_exactly_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_is_substring_of_column_name_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_only_one_char_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_only_white_spaces_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
@@ -166,15 +191,18 @@
{
"data": {
"text/plain": [
"valid 1363810\n",
"key_smash 904\n",
"contains_email 412\n",
"contains_exactly_the_word_test 182\n",
"only_special_characters 151\n",
"contains_exactly_the_word_dell 126\n",
"only_one_char 16\n",
"contains_invalid_words 10\n",
"only_white_spaces 2\n",
"valid 1346998\n",
"key_smash 739\n",
"contains_email 569\n",
"contains_exactly_the_word_test 177\n",
"only_special_characters 144\n",
"contains_exactly_the_word_dell 125\n",
"only_numbers 106\n",
"only_one_char 14\n",
"contains_exactly_invalid_words 10\n",
"is_substring_of_column_name 3\n",
"contains_date 1\n",
"empty 1\n",
"Name: target, dtype: int64"
]
},
@@ -204,15 +232,18 @@
{
"data": {
"text/plain": [
"valid 2514911\n",
"key_smash 1770\n",
"valid 2514338\n",
"key_smash 1762\n",
"only_special_characters 1291\n",
"contains_email 720\n",
"contains_email 1048\n",
"contains_exactly_the_word_test 667\n",
"contains_exactly_the_word_dell 553\n",
"only_one_char 287\n",
"only_white_spaces 71\n",
"contains_invalid_words 26\n",
"only_numbers 239\n",
"empty 71\n",
"contains_exactly_invalid_words 26\n",
"is_substring_of_column_name 12\n",
"contains_date 2\n",
"Name: target, dtype: int64"
]
},
@@ -241,10 +272,10 @@
{
"data": {
"text/plain": [
"{'accuracy': 0.998769987699877,\n",
" 'precision': 0.9965156794425087,\n",
" 'recall': 1.0,\n",
" 'f1': 0.9982547993019197}"
"{'accuracy': 0.9885931558935361,\n",
" 'precision': 0.9822222222222222,\n",
" 'recall': 0.9778761061946902,\n",
" 'f1': 0.9800443458980044}"
]
},
"execution_count": 9,
@@ -274,15 +305,15 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/home/anapaula/dell/Playground/env/lib/python3.8/site-packages/sklearn/base.py:409: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names\n",
"/home/anapaula/.local/lib/python3.8/site-packages/sklearn/base.py:409: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"0.0 1362585\n",
"1.0 3028\n",
"0.0 1344545\n",
"1.0 4342\n",
"Name: prediction, dtype: int64"
]
},
@@ -306,25 +337,25 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"pickle.dump(clf, open('../data/models/RandomForest_Ksmash_WordEmbedding_Regex.pkl', 'wb'))"
"pickle.dump(clf, open('../data/models/RandomForest_Ksmash_WordEmbedding_Regex_Enrichments.pkl', 'wb'))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"df[df['prediction'] == 1][[concatened_column_name, 'prediction']].drop_duplicates(subset=[concatened_column_name]).to_csv('../data/tmp/prediction.csv')"
"df[df['prediction'] == 1][[concatened_column_name, 'prediction']].drop_duplicates(subset=[concatened_column_name]).to_csv('../data/tmp/prediction_enrichments.csv')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -338,7 +369,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.16 ('env': venv)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -357,7 +388,7 @@
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e46586f29fe5e457cab913e550e4adb40fe0e0134f73028154375029c57105e1"
"hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
}
}
},
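For completeness, the counterpart to the `pickle.dump` cell above is equally short. This sketch assumes the same working directory as the notebook:

```python
import pickle

# Reload the classifier persisted by the notebook cell above.
with open('../data/models/RandomForest_Ksmash_WordEmbedding_Regex_Enrichments.pkl', 'rb') as f:
    clf = pickle.load(f)
```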
9 changes: 5 additions & 4 deletions hygia/data_pipeline/feature_engineering/regex.py
@@ -18,8 +18,9 @@ def only_special_characters(self, text:str) -> bool:
         return bool(re.search(pattern, text, re.IGNORECASE))
 
     def contains_email(self, text:str) -> bool:
-        pattern = r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
-        return bool(re.search(pattern, text, re.IGNORECASE))
+        pattern_1 = r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
+        pattern_2 = r'(GMAIL|HOTMAIL|YAHOO|OUTLOOK)'
+        return bool(re.search(pattern_1, text, re.IGNORECASE)) or bool(re.search(pattern_2, text, re.IGNORECASE))
 
     def contains_url(self, text:str) -> bool:
         pattern = r'\b(https?:\/\/|www\.)[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
@@ -29,7 +30,7 @@ def contains_date(self, text:str) -> bool:
         pattern = r'^(?P<day>\d{1,2})(?:-|\.|/)(?P<month>\d{1,2})(?:-|\.|/)(?P<year>\d{4})$'
         return bool(re.search(pattern, text, re.IGNORECASE))
 
-    def contains_invalid_words(self, text:str) -> bool:
+    def contains_exactly_invalid_words(self, text:str) -> bool:
         pattern = r'\b(null|undefined|dummy)\b'
         return bool(re.search(pattern, text, re.IGNORECASE))
 
@@ -57,7 +58,7 @@ def extract_regex_features(self, df:pd.DataFrame, column_name:str) -> pd.DataFrame:
             self.contains_email,
             self.contains_url,
             self.contains_date,
-            self.contains_invalid_words,
+            self.contains_exactly_invalid_words,
             self.is_substring_of_column_name,
             self.only_one_char,
             self.only_white_spaces,
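The widened email check is easiest to see outside the class. A minimal standalone sketch of the two-pattern logic from the hunk above:

```python
import re

PATTERN_1 = r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'  # full email address
PATTERN_2 = r'(GMAIL|HOTMAIL|YAHOO|OUTLOOK)'                # bare provider name

def contains_email(text: str) -> bool:
    return (bool(re.search(PATTERN_1, text, re.IGNORECASE))
            or bool(re.search(PATTERN_2, text, re.IGNORECASE)))

print(contains_email('EXAMPLE@GMAIL.COM'))  # True  - full address, PATTERN_1
print(contains_email('EXAMPLEHOTMAILCOM'))  # True  - provider name, PATTERN_2
print(contains_email('EXAMPLE'))            # False - matches neither
```

The provider-name pattern is what makes the mangled variants in the new tests below (`EXAMPLEHOTMAILCOM`, `EXAMPLEHOTMAIL.COM`) count as emails.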
18 changes: 16 additions & 2 deletions hygia/data_pipeline/feature_engineering/word_embedding.py
@@ -6,6 +6,7 @@
 # from whatlies.language import Glove
 # from whatlies.language import BagOfWords
 import pandas as pd
+import re
 
 class WordEmbedding:
     """
@@ -58,6 +59,10 @@ def _load_model(self) -> Any:
             raise NotImplementedError
         else:
             raise ValueError
 
+    def _pre_embedding(self, text: str) -> str:
+        text = ' '.join(e for e in text.split() if e.isalpha() and len(e) >= 3 and not e.isspace())
+        return text
+
     def get_embedding(self, text: str) -> np.ndarray:
         """
@@ -82,8 +87,17 @@ def get_embedding(self, text: str) -> np.ndarray:
         print(embedding)
         # Output: [0.5, 0.6, ..., 0.7] (a list of float values representing the word embedding vector)
         """
-        return self.word_embedding_model[text].vector
+
+        empty_vector = [0.0] * self.dimensions
+
+        text = self._pre_embedding(text)
+
+        # White space string
+        if len(text.strip().split()) == 0:
+            return empty_vector
+
+        return self.word_embedding_model[text].vector
 
     def extract_word_embedding_features(self, df: pd.DataFrame, column_name: str, normalize: bool = False) -> pd.DataFrame:
         """
         Extract word embedding features from a given dataframe and column.
@@ -110,7 +124,7 @@ def extract_word_embedding_features(self, df: pd.DataFrame, column_name: str, normalize: bool = False) -> pd.DataFrame:
         print(df.head())
         """
 
-        feature_we_tmp = df[column_name].fillna('').apply(lambda x: self.get_embedding(x) if len(x.strip().split()) > 0 else [0.0] * self.dimensions)
+        feature_we_tmp = df[column_name].fillna('').apply(lambda x: self.get_embedding(x))
 
         for i in range(self.dimensions):
             df[f'feature_we_{i}_{column_name}'] = feature_we_tmp.apply(lambda x: x[i])
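To illustrate the new token filter and the empty-vector guard together, here is `_pre_embedding` re-implemented standalone, with the same logic as the hunk above minus the redundant `isspace` check (which can never be true for a token produced by `split()`):

```python
def pre_embedding(text: str) -> str:
    # Keep only purely alphabetic tokens of three or more characters.
    return ' '.join(e for e in text.split() if e.isalpha() and len(e) >= 3)

print(pre_embedding('AV 123 JUAREZ #5'))  # 'JUAREZ' - digits and short tokens dropped
print(pre_embedding('123 #5'))            # ''      - get_embedding now returns the zero vector
```

Because the filter can reduce a noisy string to nothing, `get_embedding` now handles the empty case itself, which is why the lambda in `extract_word_embedding_features` no longer needs its own whitespace guard.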
5 changes: 2 additions & 3 deletions hygia/data_pipeline/pre_process_data/augment_data.py
@@ -15,7 +15,6 @@ def __init__(self, country:str='MEXICO') -> None:
             'south_america': 'zip_to_lat_lon_South America.pkl'
         }
         country_mappings = {
-            # TODO document list of supported countries
             # TODO implement only numbers validation in zipcode
             'BRAZIL': {'code': 'BR', 'zipcode_file': continent_files['south_america'], 'length':7, 'only_numbers':True},
             'US': {'code': 'US', 'zipcode_file': continent_files['north_america'], 'length':5, 'only_numbers':True},
@@ -33,14 +32,14 @@ def validate_zipcode(self, text:str) -> bool:
         return text in self.country_zipcode_df['postal code'].values
 
     def validate_zipcodes(self, df:pd.DataFrame, zipcode_column_name:str) -> pd.DataFrame:
-        # TODO raise errors if zipcode_column_name not in df
+        if zipcode_column_name not in df:
+            return
         validated_column = f"{zipcode_column_name}_is_valid"
         indicator_column = f"{zipcode_column_name}_is_valid_indicator"
         df_aux = pd.merge(df, self.country_zipcode_df, how='left', left_on=zipcode_column_name, right_on='postal code', indicator=indicator_column)
         df_aux[validated_column] = df_aux[indicator_column] == 'both'
         return df_aux[[validated_column]]
 
-
     def augment_data(self, df:pd.DataFrame, zipcode_column_name:str) -> pd.DataFrame:
         df = pd.concat([df, self.validate_zipcodes(df, zipcode_column_name)], axis=1)
         return df
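The merge in `validate_zipcodes` can be illustrated with made-up frames standing in for the real zipcode reference (the actual reference comes from the pickle files named above):

```python
import pandas as pd

# Hypothetical stand-ins: a zipcode reference and a dataframe to validate.
country_zipcode_df = pd.DataFrame({'postal code': ['01000', '06600']})
df = pd.DataFrame({'ZIP': ['01000', '99999']})

indicator_column = 'ZIP_is_valid_indicator'
df_aux = pd.merge(df, country_zipcode_df, how='left',
                  left_on='ZIP', right_on='postal code', indicator=indicator_column)
df_aux['ZIP_is_valid'] = df_aux[indicator_column] == 'both'
print(df_aux[['ZIP', 'ZIP_is_valid']])
#      ZIP  ZIP_is_valid
# 0  01000          True
# 1  99999         False
```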
24 changes: 22 additions & 2 deletions hygia/data_pipeline/pre_process_data/pre_process_data.py
@@ -1,7 +1,17 @@
 import pandas as pd
+import re
 from colorama import Style
 
 class PreProcessData:
+    def __init__(self, country:str='MEXICO') -> None:
+        country_mappings = {
+            'MEXICO': {'code': 'MX', 'abbreviations_file': 'data/dicts/mexico_abbreviations.csv'},
+        }
+        self.abbreviations_dict = {}
+        with open(country_mappings[country]['abbreviations_file'], 'r') as f:
+            for line in f.readlines()[1:]:  # [1:] skips the CSV header row
+                key, value = line.strip().split(',')
+                self.abbreviations_dict.update({key: value})
+
     def concatenate_columns(self, df, columns, concatenated_column_name):
         print(f'aliases identified: {Style.BRIGHT}{concatenated_column_name} -> {Style.NORMAL}{columns}')
 
@@ -14,12 +24,22 @@ def handle_nulls(self, df, column_name):
         df[column_name] = df[column_name].fillna('').astype(str)
         return df
 
-    def pre_process_data(self, df, columns_to_concat=None, column_name=None):
+    def _replace_abbreviation(self, text:str) -> str:
+        for abbreviation in self.abbreviations_dict:
+            text = ' '.join([re.sub(rf'(\b|(?<=[^a-zA-Z])){abbreviation}(\b|(?=[^a-zA-Z]))', self.abbreviations_dict[abbreviation], e, flags=re.IGNORECASE) for e in text.split()])
+        return text
+
+    def handle_abbreviations(self, df, column_name):
+        df[column_name] = df[column_name].apply(lambda x: self._replace_abbreviation(x))
+        return df
+
+    def pre_process_data(self, df, columns_to_concat=None, column_name=None, zipcode_columns=None):
 
         if columns_to_concat and column_name:
             df = self.concatenate_columns(df, columns_to_concat, column_name)
 
         if column_name and column_name in df.columns:
             df = self.handle_nulls(df, column_name)
+            df = self.handle_abbreviations(df, column_name)
 
         return df
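Putting the new pieces together, here is a sketch of the intended call path. The import path mirrors the test file below, the frame is made up, and it assumes the process runs from the repository root so the abbreviations CSV path resolves:

```python
import pandas as pd
from hygia.data_pipeline.pre_process_data.pre_process_data import PreProcessData

df = pd.DataFrame({'STREET_ADDRESS_1': ['AV JUAREZ', None],
                   'STREET_ADDRESS_2': ['NO 12', 'BLVD REFORMA']})

pre = PreProcessData(country='MEXICO')
# Concatenates the two columns, fills nulls, then expands abbreviations.
df = pre.pre_process_data(df, ['STREET_ADDRESS_1', 'STREET_ADDRESS_2'],
                          'concat_STREET_ADDRESS_1_STREET_ADDRESS_2')
# The concatenated column now reads e.g. 'AVENUE JUAREZ NUMBER 12'.
```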
29 changes: 17 additions & 12 deletions tests/data_pipeline/feature_engineering/test_regex.py
@@ -1,5 +1,3 @@
-import pytest
-import pandas as pd
 from hygia.data_pipeline.feature_engineering.regex import Regex
 
 class TestRegex:
@@ -28,6 +26,13 @@ def test_only_special_characters(self):
 
     def test_contains_email(self):
         assert self.regex.contains_email("EXAMPLE@GMAIL.COM") == True
+        assert self.regex.contains_email("EXAMPLE@HOTMAIL.COM") == True
+        assert self.regex.contains_email("EXAMPLE@YAHOO.COM") == True
+        assert self.regex.contains_email("EXAMPLE@OUTLOOK.COM") == True
+        assert self.regex.contains_email("EXAMPLE@HOTMAIL.COM") == True
+        assert self.regex.contains_email("EXAMPLE@HOTMAILCOM") == True
+        assert self.regex.contains_email("EXAMPLEHOTMAILCOM") == True
+        assert self.regex.contains_email("EXAMPLEHOTMAIL.COM") == True
         assert self.regex.contains_email("EXAMPLE") == False
         assert self.regex.contains_email("") == False
 
@@ -54,16 +59,16 @@ def test_contains_date(self):
         assert self.regex.contains_date("LAGUNA 375-1-1001") == False
         assert self.regex.contains_date("") == False
 
-    def test_contains_invalid_words(self):
-        assert self.regex.contains_invalid_words("NULL") == True
-        assert self.regex.contains_invalid_words("A NULL") == True
-        assert self.regex.contains_invalid_words("UNDEFINED") == True
-        assert self.regex.contains_invalid_words("A UNDEFINED") == True
-        assert self.regex.contains_invalid_words("DUMMY") == True
-        assert self.regex.contains_invalid_words("A DUMMY") == True
-        assert self.regex.contains_invalid_words("ABC") == False
-        assert self.regex.contains_invalid_words("A ABC") == False
-        assert self.regex.contains_invalid_words("") == False
+    def test_contains_exactly_invalid_words(self):
+        assert self.regex.contains_exactly_invalid_words("NULL") == True
+        assert self.regex.contains_exactly_invalid_words("A NULL") == True
+        assert self.regex.contains_exactly_invalid_words("UNDEFINED") == True
+        assert self.regex.contains_exactly_invalid_words("A UNDEFINED") == True
+        assert self.regex.contains_exactly_invalid_words("DUMMY") == True
+        assert self.regex.contains_exactly_invalid_words("A DUMMY") == True
+        assert self.regex.contains_exactly_invalid_words("ABC") == False
+        assert self.regex.contains_exactly_invalid_words("A ABC") == False
+        assert self.regex.contains_exactly_invalid_words("") == False
 
     def test_is_substring_of_column_name(self):
         assert self.regex.is_substring_of_column_name("STREET_ADDRESS_1", "concat_STREET_ADDRESS_1_STREET_ADDRESS_2") == True
