51/new address features (#60)
* #51 add WordEmbeddingConditions, Abbreviations and Emails

* #51 fix test pre-processing path

---------

Co-authored-by: atarchetti <atarchetti@avantstay.com>
apmt and atarchetti authored Feb 2, 2023
1 parent 6fd16f3 commit 39229b3
Showing 11 changed files with 162 additions and 66 deletions.
7 changes: 7 additions & 0 deletions data/dicts/mexico_abbreviations.csv
@@ -0,0 +1,7 @@
abbreviation,replace
AV,AVENUE
NO,NUMBER
NUM,NUMBER
PO,POST OFFICE
BLVD,BOULEVARD
LT,LOTE
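To see what these rows do in practice, here is a minimal sketch of whole-word abbreviation expansion using plain `csv` and `re`, independent of hygia's own loader in `pre_process_data.py` below; the sample address is invented:

```python
import csv
import re

# Load the dictionary added above (path assumes the repository root).
with open('data/dicts/mexico_abbreviations.csv') as f:
    abbreviations = {row['abbreviation']: row['replace'] for row in csv.DictReader(f)}

def expand(text: str) -> str:
    # Replace whole words only, case-insensitively, so 'AV' matches but 'AVENIDA' does not.
    for abbreviation, replacement in abbreviations.items():
        text = re.sub(rf'\b{abbreviation}\b', replacement, text, flags=re.IGNORECASE)
    return text

print(expand('AV JUAREZ NO 12'))  # AVENUE JUAREZ NUMBER 12
```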
3 changes: 3 additions & 0 deletions data/dicts/mexico_context.csv
@@ -0,0 +1,3 @@
text,valid
pendiente,0
pending,0
Binary file not shown.
95 changes: 63 additions & 32 deletions examples/MEXICO_retrain_predict_example.ipynb
@@ -2,9 +2,25 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/anapaula/.local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"2023-01-28 20:08:59.580652: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2023-01-28 20:08:59.659490: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
"2023-01-28 20:08:59.659504: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n",
"2023-01-28 20:09:00.126530: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
"2023-01-28 20:09:00.126588: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
"2023-01-28 20:09:00.126592: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
]
}
],
"source": [
"import pandas as pd\n",
"import hygia as hg\n",
@@ -73,7 +89,16 @@
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"aliases indified: \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2 -> \u001b[22m['STREET_ADDRESS_1', 'STREET_ADDRESS_2']\n",
"handle null values in the column \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2\u001b[22m\n"
]
}
],
"source": [
"concatened_column_name = 'concat_STREET_ADDRESS_1_STREET_ADDRESS_2'\n",
"df = pre_process_data.pre_process_data(df, ['STREET_ADDRESS_1', 'STREET_ADDRESS_2'], concatened_column_name)\n",
@@ -133,7 +158,7 @@
" 'feature_re_contains_email_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_url_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_date_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_exactly_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_is_substring_of_column_name_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_only_one_char_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_only_white_spaces_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
@@ -166,15 +191,18 @@
{
"data": {
"text/plain": [
"valid 1363810\n",
"key_smash 904\n",
"contains_email 412\n",
"contains_exactly_the_word_test 182\n",
"only_special_characters 151\n",
"contains_exactly_the_word_dell 126\n",
"only_one_char 16\n",
"contains_invalid_words 10\n",
"only_white_spaces 2\n",
"valid 1346998\n",
"key_smash 739\n",
"contains_email 569\n",
"contains_exactly_the_word_test 177\n",
"only_special_characters 144\n",
"contains_exactly_the_word_dell 125\n",
"only_numbers 106\n",
"only_one_char 14\n",
"contains_exactly_invalid_words 10\n",
"is_substring_of_column_name 3\n",
"contains_date 1\n",
"empty 1\n",
"Name: target, dtype: int64"
]
},
@@ -204,15 +232,18 @@
{
"data": {
"text/plain": [
"valid 2514911\n",
"key_smash 1770\n",
"valid 2514338\n",
"key_smash 1762\n",
"only_special_characters 1291\n",
"contains_email 720\n",
"contains_email 1048\n",
"contains_exactly_the_word_test 667\n",
"contains_exactly_the_word_dell 553\n",
"only_one_char 287\n",
"only_white_spaces 71\n",
"contains_invalid_words 26\n",
"only_numbers 239\n",
"empty 71\n",
"contains_exactly_invalid_words 26\n",
"is_substring_of_column_name 12\n",
"contains_date 2\n",
"Name: target, dtype: int64"
]
},
@@ -241,10 +272,10 @@
{
"data": {
"text/plain": [
"{'accuracy': 0.998769987699877,\n",
" 'precision': 0.9965156794425087,\n",
" 'recall': 1.0,\n",
" 'f1': 0.9982547993019197}"
"{'accuracy': 0.9885931558935361,\n",
" 'precision': 0.9822222222222222,\n",
" 'recall': 0.9778761061946902,\n",
" 'f1': 0.9800443458980044}"
]
},
"execution_count": 9,
@@ -274,15 +305,15 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/home/anapaula/dell/Playground/env/lib/python3.8/site-packages/sklearn/base.py:409: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names\n",
"/home/anapaula/.local/lib/python3.8/site-packages/sklearn/base.py:409: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"0.0 1362585\n",
"1.0 3028\n",
"0.0 1344545\n",
"1.0 4342\n",
"Name: prediction, dtype: int64"
]
},
@@ -306,25 +337,25 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"pickle.dump(clf, open('../data/models/RandomForest_Ksmash_WordEmbedding_Regex.pkl', 'wb'))"
"pickle.dump(clf, open('../data/models/RandomForest_Ksmash_WordEmbedding_Regex_Enrichments.pkl', 'wb'))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"df[df['prediction'] == 1][[concatened_column_name, 'prediction']].drop_duplicates(subset=[concatened_column_name]).to_csv('../data/tmp/prediction.csv')"
"df[df['prediction'] == 1][[concatened_column_name, 'prediction']].drop_duplicates(subset=[concatened_column_name]).to_csv('../data/tmp/prediction_enrichments.csv')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -338,7 +369,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.16 ('env': venv)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -357,7 +388,7 @@
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e46586f29fe5e457cab913e550e4adb40fe0e0134f73028154375029c57105e1"
"hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
}
}
},
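For completeness, the counterpart to the `pickle.dump` cell above is equally short. This sketch assumes the same working directory as the notebook:

```python
import pickle

# Reload the classifier persisted by the notebook cell above.
with open('../data/models/RandomForest_Ksmash_WordEmbedding_Regex_Enrichments.pkl', 'rb') as f:
    clf = pickle.load(f)
```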
9 changes: 5 additions & 4 deletions hygia/data_pipeline/feature_engineering/regex.py
@@ -18,8 +18,9 @@ def only_special_characters(self, text:str) -> bool:
         return bool(re.search(pattern, text, re.IGNORECASE))
 
     def contains_email(self, text:str) -> bool:
-        pattern = r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
-        return bool(re.search(pattern, text, re.IGNORECASE))
+        pattern_1 = r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
+        pattern_2 = r'(GMAIL|HOTMAIL|YAHOO|OUTLOOK)'
+        return bool(re.search(pattern_1, text, re.IGNORECASE)) or bool(re.search(pattern_2, text, re.IGNORECASE))
 
     def contains_url(self, text:str) -> bool:
         pattern = r'\b(https?:\/\/|www\.)[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
@@ -29,7 +30,7 @@ def contains_date(self, text:str) -> bool:
         pattern = r'^(?P<day>\d{1,2})(?:-|\.|/)(?P<month>\d{1,2})(?:-|\.|/)(?P<year>\d{4})$'
         return bool(re.search(pattern, text, re.IGNORECASE))
 
-    def contains_invalid_words(self, text:str) -> bool:
+    def contains_exactly_invalid_words(self, text:str) -> bool:
         pattern = r'\b(null|undefined|dummy)\b'
         return bool(re.search(pattern, text, re.IGNORECASE))
 
@@ -57,7 +58,7 @@ def extract_regex_features(self, df:pd.DataFrame, column_name:str) -> pd.DataFrame:
             self.contains_email,
             self.contains_url,
             self.contains_date,
-            self.contains_invalid_words,
+            self.contains_exactly_invalid_words,
             self.is_substring_of_column_name,
             self.only_one_char,
             self.only_white_spaces,
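The widened email check is easiest to see outside the class. A minimal standalone sketch of the two-pattern logic from the hunk above:

```python
import re

PATTERN_1 = r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'  # full email address
PATTERN_2 = r'(GMAIL|HOTMAIL|YAHOO|OUTLOOK)'                # bare provider name

def contains_email(text: str) -> bool:
    return (bool(re.search(PATTERN_1, text, re.IGNORECASE))
            or bool(re.search(PATTERN_2, text, re.IGNORECASE)))

print(contains_email('EXAMPLE@GMAIL.COM'))  # True  - full address, PATTERN_1
print(contains_email('EXAMPLEHOTMAILCOM'))  # True  - provider name, PATTERN_2
print(contains_email('EXAMPLE'))            # False - matches neither
```

The provider-name pattern is what makes the mangled variants in the new tests below (`EXAMPLEHOTMAILCOM`, `EXAMPLEHOTMAIL.COM`) count as emails.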
18 changes: 16 additions & 2 deletions hygia/data_pipeline/feature_engineering/word_embedding.py
@@ -6,6 +6,7 @@
 # from whatlies.language import Glove
 # from whatlies.language import BagOfWords
 import pandas as pd
+import re
 
 class WordEmbedding:
     """
@@ -58,6 +59,10 @@ def _load_model(self) -> Any:
             raise NotImplementedError
         else:
             raise ValueError
 
+    def _pre_embedding(self, text: str) -> str:
+        text = ' '.join(e for e in text.split() if e.isalpha() and len(e) >= 3 and not e.isspace())
+        return text
+
     def get_embedding(self, text: str) -> np.ndarray:
         """
@@ -82,8 +87,17 @@ def get_embedding(self, text: str) -> np.ndarray:
         print(embedding)
         # Output: [0.5, 0.6, ..., 0.7] (a list of float values representing the word embedding vector)
         """
-        return self.word_embedding_model[text].vector
+
+        empty_vector = [0.0] * self.dimensions
+
+        text = self._pre_embedding(text)
+
+        # White space string
+        if len(text.strip().split()) == 0:
+            return empty_vector
+
+        return self.word_embedding_model[text].vector
 
     def extract_word_embedding_features(self, df: pd.DataFrame, column_name: str, normalize: bool = False) -> pd.DataFrame:
         """
         Extract word embedding features from a given dataframe and column.
@@ -110,7 +124,7 @@ def extract_word_embedding_features(self, df: pd.DataFrame, column_name: str, normalize: bool = False) -> pd.DataFrame:
         print(df.head())
         """
 
-        feature_we_tmp = df[column_name].fillna('').apply(lambda x: self.get_embedding(x) if len(x.strip().split()) > 0 else [0.0] * self.dimensions)
+        feature_we_tmp = df[column_name].fillna('').apply(lambda x: self.get_embedding(x))
 
         for i in range(self.dimensions):
             df[f'feature_we_{i}_{column_name}'] = feature_we_tmp.apply(lambda x: x[i])
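To illustrate the new token filter and the empty-vector guard together, here is `_pre_embedding` re-implemented standalone, with the same logic as the hunk above minus the redundant `isspace` check (which can never be true for a token produced by `split()`):

```python
def pre_embedding(text: str) -> str:
    # Keep only purely alphabetic tokens of three or more characters.
    return ' '.join(e for e in text.split() if e.isalpha() and len(e) >= 3)

print(pre_embedding('AV 123 JUAREZ #5'))  # 'JUAREZ' - digits and short tokens dropped
print(pre_embedding('123 #5'))            # ''      - get_embedding now returns the zero vector
```

Because the filter can reduce a noisy string to nothing, `get_embedding` now handles the empty case itself, which is why the lambda in `extract_word_embedding_features` no longer needs its own whitespace guard.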
5 changes: 2 additions & 3 deletions hygia/data_pipeline/pre_process_data/augment_data.py
@@ -15,7 +15,6 @@ def __init__(self, country:str='MEXICO') -> None:
             'south_america': 'zip_to_lat_lon_South America.pkl'
         }
         country_mappings = {
-            # TODO document list of supported countries
             # TODO implement only numbers validation in zipcode
             'BRAZIL': {'code': 'BR', 'zipcode_file': continent_files['south_america'], 'length':7, 'only_numbers':True},
             'US': {'code': 'US', 'zipcode_file': continent_files['north_america'], 'length':5, 'only_numbers':True},
@@ -33,14 +32,14 @@ def validate_zipcode(self, text:str) -> bool:
         return text in self.country_zipcode_df['postal code'].values
 
     def validate_zipcodes(self, df:pd.DataFrame, zipcode_column_name:str) -> pd.DataFrame:
-        # TODO raise errors if zipcode_column_name not in df
+        if zipcode_column_name not in df:
+            return
         validated_column = f"{zipcode_column_name}_is_valid"
         indicator_column = f"{zipcode_column_name}_is_valid_indicator"
         df_aux = pd.merge(df, self.country_zipcode_df, how='left', left_on=zipcode_column_name, right_on='postal code', indicator=indicator_column)
         df_aux[validated_column] = df_aux[indicator_column] == 'both'
         return df_aux[[validated_column]]
 
-
     def augment_data(self, df:pd.DataFrame, zipcode_column_name:str) -> pd.DataFrame:
         df = pd.concat([df, self.validate_zipcodes(df, zipcode_column_name)], axis=1)
         return df
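The merge in `validate_zipcodes` can be illustrated with made-up frames standing in for the real zipcode reference (the actual reference comes from the pickle files named above):

```python
import pandas as pd

# Hypothetical stand-ins: a zipcode reference and a dataframe to validate.
country_zipcode_df = pd.DataFrame({'postal code': ['01000', '06600']})
df = pd.DataFrame({'ZIP': ['01000', '99999']})

indicator_column = 'ZIP_is_valid_indicator'
df_aux = pd.merge(df, country_zipcode_df, how='left',
                  left_on='ZIP', right_on='postal code', indicator=indicator_column)
df_aux['ZIP_is_valid'] = df_aux[indicator_column] == 'both'
print(df_aux[['ZIP', 'ZIP_is_valid']])
#      ZIP  ZIP_is_valid
# 0  01000          True
# 1  99999         False
```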
24 changes: 22 additions & 2 deletions hygia/data_pipeline/pre_process_data/pre_process_data.py
@@ -1,7 +1,17 @@
 import pandas as pd
+import re
 from colorama import Style
 
 class PreProcessData:
+    def __init__(self, country:str='MEXICO') -> None:
+        country_mappings = {
+            'MEXICO': {'code': 'MX', 'abbreviations_file': 'data/dicts/mexico_abbreviations.csv'},
+        }
+        self.abbreviations_dict = {}
+        with open(country_mappings[country]['abbreviations_file'], 'r') as f:
+            for line in f.readlines()[1:]:  # [1:] skips the CSV header row
+                key, value = line.strip().split(',')
+                self.abbreviations_dict.update({key: value})
+
     def concatenate_columns(self, df, columns, concatenated_column_name):
         print(f'aliases identified: {Style.BRIGHT}{concatenated_column_name} -> {Style.NORMAL}{columns}')
 
@@ -14,12 +24,22 @@ def handle_nulls(self, df, column_name):
         df[column_name] = df[column_name].fillna('').astype(str)
         return df
 
-    def pre_process_data(self, df, columns_to_concat=None, column_name=None):
+    def _replace_abbreviation(self, text:str) -> str:
+        for abbreviation in self.abbreviations_dict:
+            text = ' '.join([re.sub(rf'(\b|(?<=[^a-zA-Z])){abbreviation}(\b|(?=[^a-zA-Z]))', self.abbreviations_dict[abbreviation], e, flags=re.IGNORECASE) for e in text.split()])
+        return text
+
+    def handle_abbreviations(self, df, column_name):
+        df[column_name] = df[column_name].apply(lambda x: self._replace_abbreviation(x))
+        return df
+
+    def pre_process_data(self, df, columns_to_concat=None, column_name=None, zipcode_columns=None):
 
         if columns_to_concat and column_name:
             df = self.concatenate_columns(df, columns_to_concat, column_name)
 
         if column_name and column_name in df.columns:
             df = self.handle_nulls(df, column_name)
+            df = self.handle_abbreviations(df, column_name)
 
         return df
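Putting the new pieces together, here is a sketch of the intended call path. The import path mirrors the test file below, the frame is made up, and it assumes the process runs from the repository root so the abbreviations CSV path resolves:

```python
import pandas as pd
from hygia.data_pipeline.pre_process_data.pre_process_data import PreProcessData

df = pd.DataFrame({'STREET_ADDRESS_1': ['AV JUAREZ', None],
                   'STREET_ADDRESS_2': ['NO 12', 'BLVD REFORMA']})

pre = PreProcessData(country='MEXICO')
# Concatenates the two columns, fills nulls, then expands abbreviations.
df = pre.pre_process_data(df, ['STREET_ADDRESS_1', 'STREET_ADDRESS_2'],
                          'concat_STREET_ADDRESS_1_STREET_ADDRESS_2')
# The concatenated column now reads e.g. 'AVENUE JUAREZ NUMBER 12'.
```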
29 changes: 17 additions & 12 deletions tests/data_pipeline/feature_engineering/test_regex.py
@@ -1,5 +1,3 @@
-import pytest
-import pandas as pd
 from hygia.data_pipeline.feature_engineering.regex import Regex
 
 class TestRegex:
@@ -28,6 +26,13 @@ def test_only_special_characters(self):
 
     def test_contains_email(self):
         assert self.regex.contains_email("EXAMPLE@GMAIL.COM") == True
+        assert self.regex.contains_email("EXAMPLE@HOTMAIL.COM") == True
+        assert self.regex.contains_email("EXAMPLE@YAHOO.COM") == True
+        assert self.regex.contains_email("EXAMPLE@OUTLOOK.COM") == True
+        assert self.regex.contains_email("EXAMPLE@HOTMAIL.COM") == True
+        assert self.regex.contains_email("EXAMPLE@HOTMAILCOM") == True
+        assert self.regex.contains_email("EXAMPLEHOTMAILCOM") == True
+        assert self.regex.contains_email("EXAMPLEHOTMAIL.COM") == True
         assert self.regex.contains_email("EXAMPLE") == False
         assert self.regex.contains_email("") == False
 
@@ -54,16 +59,16 @@ def test_contains_date(self):
         assert self.regex.contains_date("LAGUNA 375-1-1001") == False
         assert self.regex.contains_date("") == False
 
-    def test_contains_invalid_words(self):
-        assert self.regex.contains_invalid_words("NULL") == True
-        assert self.regex.contains_invalid_words("A NULL") == True
-        assert self.regex.contains_invalid_words("UNDEFINED") == True
-        assert self.regex.contains_invalid_words("A UNDEFINED") == True
-        assert self.regex.contains_invalid_words("DUMMY") == True
-        assert self.regex.contains_invalid_words("A DUMMY") == True
-        assert self.regex.contains_invalid_words("ABC") == False
-        assert self.regex.contains_invalid_words("A ABC") == False
-        assert self.regex.contains_invalid_words("") == False
+    def test_contains_exactly_invalid_words(self):
+        assert self.regex.contains_exactly_invalid_words("NULL") == True
+        assert self.regex.contains_exactly_invalid_words("A NULL") == True
+        assert self.regex.contains_exactly_invalid_words("UNDEFINED") == True
+        assert self.regex.contains_exactly_invalid_words("A UNDEFINED") == True
+        assert self.regex.contains_exactly_invalid_words("DUMMY") == True
+        assert self.regex.contains_exactly_invalid_words("A DUMMY") == True
+        assert self.regex.contains_exactly_invalid_words("ABC") == False
+        assert self.regex.contains_exactly_invalid_words("A ABC") == False
+        assert self.regex.contains_exactly_invalid_words("") == False
 
     def test_is_substring_of_column_name(self):
         assert self.regex.is_substring_of_column_name("STREET_ADDRESS_1", "concat_STREET_ADDRESS_1_STREET_ADDRESS_2") == True
