Commit

#65 save notebook outputs
atarchetti committed Feb 8, 2023
1 parent e322c85 commit b40eae5
Showing 4 changed files with 100 additions and 117 deletions.
66 changes: 42 additions & 24 deletions examples/MEXICO_predict_example.ipynb
@@ -30,7 +30,7 @@
 "pre_process_data = hg.PreProcessData(country=\"MEXICO\")\n",
 "augment_data = hg.AugmentData(country=\"MEXICO\")\n",
 "feature_engineering = hg.FeatureEngineering(country=\"MEXICO\")\n",
-"rf_model = hg.RandomForestModel('../data/models/RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl',\n",
+"rf_model = hg.RandomForestModel('../data/models/RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl',\n",
 " normalization_absolutes_file='../data/models/normalization_absolutes.csv')"
 ]
 },
@@ -49,8 +49,7 @@
 "outputs": [],
 "source": [
 "file_path = '../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv'\n",
-"# df = pd.read_csv(file_path, sep='¨', nrows=None, engine='python')\n",
-"df = pd.read_csv(file_path, sep='¨', nrows=5000, engine='python')"
+"df = pd.read_csv(file_path, sep='¨', nrows=None, engine='python')"
 ]
 },
 {
@@ -125,6 +124,31 @@
 " 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
 " 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
 " 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_0_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_1_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_2_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_3_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_4_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_5_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_6_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_7_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_8_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_9_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_10_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_11_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_12_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_13_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_14_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_15_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_16_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_17_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_18_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_19_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_20_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_21_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_22_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_23_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
+" 'feature_we_24_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
 " 'feature_re_contains_context_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
 " 'feature_re_contains_exactly_the_word_dell_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
 " 'feature_re_contains_exactly_the_word_test_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
@@ -147,9 +171,7 @@
 ],
 "source": [
 "all_features_columns = [col for col in df if col.startswith('feature_ks') or col.startswith('feature_we') or col.startswith('feature_re')]\n",
-"model_features_columns = [col for col in all_features_columns \\\n",
-" if not col.startswith('feature_we') \\\n",
-" and 'ratio_of_numeric_digits_squared' not in col]\n",
+"model_features_columns = [col for col in all_features_columns if 'ratio_of_numeric_digits_squared' not in col]\n",
 "model_features_columns"
 ]
 },
@@ -170,24 +192,20 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"\u001b[33mrunning model...\u001b[37m\n",
-"AAAAAAAAAAA concatened_column_name concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n",
-"AAAAAAAAAAA features_column_to_normalize feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n",
-"AAAAAAAAAAA 1 feature_ks_count_sequence_squared_vowels\n"
+"\u001b[33mrunning model...\u001b[37m\n"
 ]
 },
 {
-"ename": "TypeError",
-"evalue": "'NoneType' object is not subscriptable",
-"output_type": "error",
-"traceback": [
-"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
-"Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df[\u001b[39m'\u001b[39m\u001b[39mprediction_is_key_smash\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m rf_model\u001b[39m.\u001b[39;49mpredict(df[model_features_columns], concatened_column_name)\n\u001b[1;32m 2\u001b[0m df[\u001b[39m'\u001b[39m\u001b[39mprediction_is_key_smash\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m.\u001b[39mvalue_counts()\n",
-"File \u001b[0;32m~/dell/Playground/hygia/data_pipeline/model/random_forest.py:107\u001b[0m, in \u001b[0;36mRandomForestModel.predict\u001b[0;34m(self, X, concatened_column_name)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mFore\u001b[39m.\u001b[39mYELLOW\u001b[39m}\u001b[39;00m\u001b[39mrunning model...\u001b[39m\u001b[39m{\u001b[39;00mFore\u001b[39m.\u001b[39mWHITE\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m)\n\u001b[1;32m 106\u001b[0m key_smash_features_columns \u001b[39m=\u001b[39m [column \u001b[39mfor\u001b[39;00m column \u001b[39min\u001b[39;00m X\u001b[39m.\u001b[39mcolumns \u001b[39mif\u001b[39;00m column\u001b[39m.\u001b[39mstartswith(\u001b[39m'\u001b[39m\u001b[39mfeature_ks\u001b[39m\u001b[39m'\u001b[39m)]\n\u001b[0;32m--> 107\u001b[0m X \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_normalization(X\u001b[39m.\u001b[39;49mcopy(), key_smash_features_columns, concatened_column_name)\n\u001b[1;32m 109\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel\u001b[39m.\u001b[39mpredict(X\u001b[39m.\u001b[39mvalues)\n",
-"File \u001b[0;32m~/dell/Playground/hygia/data_pipeline/model/random_forest.py:46\u001b[0m, in \u001b[0;36mRandomForestModel._normalization\u001b[0;34m(self, df, features_columns_to_normalize, concatened_column_name)\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mAAAAAAAAAAA features_column_to_normalize\u001b[39m\u001b[39m'\u001b[39m, features_column_to_normalize)\n\u001b[1;32m 45\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mAAAAAAAAAAA 1\u001b[39m\u001b[39m'\u001b[39m, features_column_to_normalize\u001b[39m.\u001b[39mreplace(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00mconcatened_column_name\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m'\u001b[39m))\n\u001b[0;32m---> 46\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mAAAAAAAAAAA\u001b[39m\u001b[39m'\u001b[39m, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mnormalization_absolutes[features_column_to_normalize\u001b[39m.\u001b[39;49mreplace(\u001b[39mf\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m_\u001b[39;49m\u001b[39m{\u001b[39;49;00mconcatened_column_name\u001b[39m}\u001b[39;49;00m\u001b[39m\"\u001b[39;49m, \u001b[39m'\u001b[39;49m\u001b[39m'\u001b[39;49m)])\n\u001b[1;32m 47\u001b[0m column_absolute_maximum \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnormalization_absolutes[features_column_to_normalize\u001b[39m.\u001b[39mreplace(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00mconcatened_column_name\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m'\u001b[39m)]\u001b[39m.\u001b[39mvalues[\u001b[39m0\u001b[39m]\n\u001b[1;32m 48\u001b[0m df[features_column_to_normalize] \u001b[39m=\u001b[39m df[features_column_to_normalize] \u001b[39m/\u001b[39m column_absolute_maximum\n",
-"\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not subscriptable"
-]
+"data": {
+"text/plain": [
+"0.0 2512460\n",
+"1.0 7836\n",
+"Name: prediction_is_key_smash, dtype: int64"
+]
+},
+"execution_count": 6,
+"metadata": {},
+"output_type": "execute_result"
+}
 ],
 "source": [
@@ -205,13 +223,13 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 7,
 "metadata": {},
 "outputs": [],
 "source": [
-"df[df['prediction_is_key_smash'] == 1][[concatened_column_name, 'target', 'prediction']] \\\n",
+"df[df['prediction_is_key_smash'] == 1][[concatened_column_name, 'prediction_is_key_smash']] \\\n",
 " .drop_duplicates(subset=[concatened_column_name]) \\\n",
-" .to_csv(f'../data/tmp/prediction_rf_ks_we_regex_enrich_normal.csv')"
+" .to_csv(f'../data/tmp/prediction_rf_ks_regex_enrich_normal.csv')"
 ]
 }
 ],
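
Taken together, the changes to this notebook swap in the word-embedding model, read the full CSV instead of a 5,000-row sample, stop filtering out the feature_we_* columns, and save the outputs of a successful run. Below is a minimal sketch of the resulting flow, reconstructed only from the cells in this diff; the hygia import alias and the concatenated column name are assumptions inferred from the cell contents and the printed output, not taken from the repository.

    import pandas as pd
    import hygia as hg  # assumed import behind the 'hg' alias used in the cells

    # Load the retrained model together with its normalization absolutes.
    rf_model = hg.RandomForestModel(
        '../data/models/RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl',
        normalization_absolutes_file='../data/models/normalization_absolutes.csv')

    # The full file is now read (nrows=None) rather than a 5,000-row sample.
    df = pd.read_csv('../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv',
                     sep='¨', nrows=None, engine='python')

    # Keep all key-smash, word-embedding, and regex features, dropping only
    # the ratio_of_numeric_digits_squared variants; feature_we_* columns are
    # no longer excluded.
    all_features_columns = [col for col in df
                            if col.startswith(('feature_ks', 'feature_we', 'feature_re'))]
    model_features_columns = [col for col in all_features_columns
                              if 'ratio_of_numeric_digits_squared' not in col]

    # Column name assumed from the saved value_counts output.
    concatened_column_name = 'concat_STREET_ADDRESS_1_STREET_ADDRESS_2'
    df['prediction_is_key_smash'] = rf_model.predict(
        df[model_features_columns], concatened_column_name)
    df['prediction_is_key_smash'].value_counts()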
30 changes: 13 additions & 17 deletions examples/MEXICO_predict_example_no_embeddings copy.ipynb
@@ -170,24 +170,20 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"\u001b[33mrunning model...\u001b[37m\n",
-"AAAAAAAAAAA concatened_column_name concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n",
-"AAAAAAAAAAA features_column_to_normalize feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n",
-"AAAAAAAAAAA 1 feature_ks_count_sequence_squared_vowels\n"
+"\u001b[33mrunning model...\u001b[37m\n"
 ]
 },
 {
-"ename": "TypeError",
-"evalue": "'NoneType' object is not subscriptable",
-"output_type": "error",
-"traceback": [
-"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
-"Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df[\u001b[39m'\u001b[39m\u001b[39mprediction_is_key_smash\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m rf_model\u001b[39m.\u001b[39;49mpredict(df[model_features_columns], concatened_column_name)\n\u001b[1;32m 2\u001b[0m df[\u001b[39m'\u001b[39m\u001b[39mprediction_is_key_smash\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m.\u001b[39mvalue_counts()\n",
-"File \u001b[0;32m~/dell/Playground/hygia/data_pipeline/model/random_forest.py:107\u001b[0m, in \u001b[0;36mRandomForestModel.predict\u001b[0;34m(self, X, concatened_column_name)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mFore\u001b[39m.\u001b[39mYELLOW\u001b[39m}\u001b[39;00m\u001b[39mrunning model...\u001b[39m\u001b[39m{\u001b[39;00mFore\u001b[39m.\u001b[39mWHITE\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m)\n\u001b[1;32m 106\u001b[0m key_smash_features_columns \u001b[39m=\u001b[39m [column \u001b[39mfor\u001b[39;00m column \u001b[39min\u001b[39;00m X\u001b[39m.\u001b[39mcolumns \u001b[39mif\u001b[39;00m column\u001b[39m.\u001b[39mstartswith(\u001b[39m'\u001b[39m\u001b[39mfeature_ks\u001b[39m\u001b[39m'\u001b[39m)]\n\u001b[0;32m--> 107\u001b[0m X \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_normalization(X\u001b[39m.\u001b[39;49mcopy(), key_smash_features_columns, concatened_column_name)\n\u001b[1;32m 109\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel\u001b[39m.\u001b[39mpredict(X\u001b[39m.\u001b[39mvalues)\n",
-"File \u001b[0;32m~/dell/Playground/hygia/data_pipeline/model/random_forest.py:46\u001b[0m, in \u001b[0;36mRandomForestModel._normalization\u001b[0;34m(self, df, features_columns_to_normalize, concatened_column_name)\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mAAAAAAAAAAA features_column_to_normalize\u001b[39m\u001b[39m'\u001b[39m, features_column_to_normalize)\n\u001b[1;32m 45\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mAAAAAAAAAAA 1\u001b[39m\u001b[39m'\u001b[39m, features_column_to_normalize\u001b[39m.\u001b[39mreplace(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00mconcatened_column_name\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m'\u001b[39m))\n\u001b[0;32m---> 46\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mAAAAAAAAAAA\u001b[39m\u001b[39m'\u001b[39m, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mnormalization_absolutes[features_column_to_normalize\u001b[39m.\u001b[39;49mreplace(\u001b[39mf\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m_\u001b[39;49m\u001b[39m{\u001b[39;49;00mconcatened_column_name\u001b[39m}\u001b[39;49;00m\u001b[39m\"\u001b[39;49m, \u001b[39m'\u001b[39;49m\u001b[39m'\u001b[39;49m)])\n\u001b[1;32m 47\u001b[0m column_absolute_maximum \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnormalization_absolutes[features_column_to_normalize\u001b[39m.\u001b[39mreplace(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00mconcatened_column_name\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m'\u001b[39m)]\u001b[39m.\u001b[39mvalues[\u001b[39m0\u001b[39m]\n\u001b[1;32m 48\u001b[0m df[features_column_to_normalize] \u001b[39m=\u001b[39m df[features_column_to_normalize] \u001b[39m/\u001b[39m column_absolute_maximum\n",
-"\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not subscriptable"
-]
+"data": {
+"text/plain": [
+"0.0 4984\n",
+"1.0 16\n",
+"Name: prediction_is_key_smash, dtype: int64"
+]
+},
+"execution_count": 6,
+"metadata": {},
+"output_type": "execute_result"
+}
 ],
 "source": [
@@ -205,11 +201,11 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 9,
 "metadata": {},
 "outputs": [],
 "source": [
-"df[df['prediction_is_key_smash'] == 1][[concatened_column_name, 'target', 'prediction']] \\\n",
+"df[df['prediction_is_key_smash'] == 1][[concatened_column_name, 'prediction_is_key_smash']] \\\n",
 " .drop_duplicates(subset=[concatened_column_name]) \\\n",
 " .to_csv(f'../data/tmp/prediction_rf_ks_we_regex_enrich_normal.csv')"
 ]
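
The error output removed above shows where the old runs failed: RandomForestModel._normalization indexed self.normalization_absolutes while it was None, hence "'NoneType' object is not subscriptable". Below is a hedged reconstruction of that normalization step based only on lines 46-48 of the traceback, written as a free function for illustration; it assumes normalization_absolutes is a one-row DataFrame of per-feature absolute maxima loaded from the normalization_absolutes_file argument, which is not confirmed by this diff.

    import pandas as pd

    def _normalization(df, features_columns_to_normalize, concatened_column_name,
                       normalization_absolutes):
        # The removed outputs failed on the indexing below because
        # normalization_absolutes was None (its CSV had never been loaded).
        if normalization_absolutes is None:
            raise ValueError('normalization absolutes were never loaded')
        for column in features_columns_to_normalize:
            # Strip the concatenated-column suffix to find the matching absolute.
            base_name = column.replace(f"_{concatened_column_name}", '')
            column_absolute_maximum = normalization_absolutes[base_name].values[0]
            # Scale the feature by its absolute maximum, mirroring the traceback.
            df[column] = df[column] / column_absolute_maximum
        return df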
(The remaining two changed files did not render in this view.)