diff --git a/examples/MEXICO_predict_example.ipynb b/examples/MEXICO_predict_example.ipynb index 544584fe..4e461654 100644 --- a/examples/MEXICO_predict_example.ipynb +++ b/examples/MEXICO_predict_example.ipynb @@ -30,7 +30,7 @@ "pre_process_data = hg.PreProcessData(country=\"MEXICO\")\n", "augment_data = hg.AugmentData(country=\"MEXICO\")\n", "feature_engineering = hg.FeatureEngineering(country=\"MEXICO\")\n", - "rf_model = hg.RandomForestModel('../data/models/RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl',\n", + "rf_model = hg.RandomForestModel('../data/models/RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl',\n", " normalization_absolutes_file='../data/models/normalization_absolutes.csv')" ] }, @@ -49,8 +49,7 @@ "outputs": [], "source": [ "file_path = '../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv'\n", - "# df = pd.read_csv(file_path, sep='¨', nrows=None, engine='python')\n", - "df = pd.read_csv(file_path, sep='¨', nrows=5000, engine='python')" + "df = pd.read_csv(file_path, sep='¨', nrows=None, engine='python')" ] }, { @@ -125,6 +124,31 @@ " 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", " 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", " 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_0_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_1_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_2_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_3_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_4_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_5_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_6_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_7_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_8_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_9_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_10_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_11_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_12_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_13_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_14_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_15_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_16_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_17_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_18_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_19_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_20_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_21_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_22_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_23_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_24_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", " 'feature_re_contains_context_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", " 'feature_re_contains_exactly_the_word_dell_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", " 'feature_re_contains_exactly_the_word_test_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", @@ -147,9 +171,7 @@ ], "source": [ "all_features_columns = [col for col in df if col.startswith('feature_ks') or col.startswith('feature_we') or col.startswith('feature_re')]\n", - "model_features_columns = [col for col in all_features_columns \\\n", - " if not col.startswith('feature_we') \\\n", - " and 'ratio_of_numeric_digits_squared' not in col]\n", + "model_features_columns = [col for col in all_features_columns if 'ratio_of_numeric_digits_squared' not in col]\n", "model_features_columns" ] }, @@ -170,24 +192,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[33mrunning model...\u001b[37m\n", - "AAAAAAAAAAA concatened_column_name concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n", - "AAAAAAAAAAA features_column_to_normalize feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n", - "AAAAAAAAAAA 1 feature_ks_count_sequence_squared_vowels\n" + "\u001b[33mrunning model...\u001b[37m\n" ] }, { - "ename": "TypeError", - "evalue": "'NoneType' object is not subscriptable", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df[\u001b[39m'\u001b[39m\u001b[39mprediction_is_key_smash\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m rf_model\u001b[39m.\u001b[39;49mpredict(df[model_features_columns], concatened_column_name)\n\u001b[1;32m 2\u001b[0m df[\u001b[39m'\u001b[39m\u001b[39mprediction_is_key_smash\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m.\u001b[39mvalue_counts()\n", - "File \u001b[0;32m~/dell/Playground/hygia/data_pipeline/model/random_forest.py:107\u001b[0m, in \u001b[0;36mRandomForestModel.predict\u001b[0;34m(self, X, concatened_column_name)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mFore\u001b[39m.\u001b[39mYELLOW\u001b[39m}\u001b[39;00m\u001b[39mrunning model...\u001b[39m\u001b[39m{\u001b[39;00mFore\u001b[39m.\u001b[39mWHITE\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m)\n\u001b[1;32m 106\u001b[0m key_smash_features_columns \u001b[39m=\u001b[39m [column \u001b[39mfor\u001b[39;00m column \u001b[39min\u001b[39;00m X\u001b[39m.\u001b[39mcolumns \u001b[39mif\u001b[39;00m column\u001b[39m.\u001b[39mstartswith(\u001b[39m'\u001b[39m\u001b[39mfeature_ks\u001b[39m\u001b[39m'\u001b[39m)]\n\u001b[0;32m--> 107\u001b[0m X \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_normalization(X\u001b[39m.\u001b[39;49mcopy(), key_smash_features_columns, concatened_column_name)\n\u001b[1;32m 109\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel\u001b[39m.\u001b[39mpredict(X\u001b[39m.\u001b[39mvalues)\n", - "File \u001b[0;32m~/dell/Playground/hygia/data_pipeline/model/random_forest.py:46\u001b[0m, in \u001b[0;36mRandomForestModel._normalization\u001b[0;34m(self, df, features_columns_to_normalize, concatened_column_name)\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mAAAAAAAAAAA features_column_to_normalize\u001b[39m\u001b[39m'\u001b[39m, features_column_to_normalize)\n\u001b[1;32m 45\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mAAAAAAAAAAA 1\u001b[39m\u001b[39m'\u001b[39m, features_column_to_normalize\u001b[39m.\u001b[39mreplace(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00mconcatened_column_name\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m'\u001b[39m))\n\u001b[0;32m---> 46\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mAAAAAAAAAAA\u001b[39m\u001b[39m'\u001b[39m, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mnormalization_absolutes[features_column_to_normalize\u001b[39m.\u001b[39;49mreplace(\u001b[39mf\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m_\u001b[39;49m\u001b[39m{\u001b[39;49;00mconcatened_column_name\u001b[39m}\u001b[39;49;00m\u001b[39m\"\u001b[39;49m, \u001b[39m'\u001b[39;49m\u001b[39m'\u001b[39;49m)])\n\u001b[1;32m 47\u001b[0m column_absolute_maximum \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnormalization_absolutes[features_column_to_normalize\u001b[39m.\u001b[39mreplace(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00mconcatened_column_name\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m'\u001b[39m)]\u001b[39m.\u001b[39mvalues[\u001b[39m0\u001b[39m]\n\u001b[1;32m 48\u001b[0m df[features_column_to_normalize] \u001b[39m=\u001b[39m df[features_column_to_normalize] \u001b[39m/\u001b[39m column_absolute_maximum\n", - "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not subscriptable" - ] + "data": { + "text/plain": [ + "0.0 2512460\n", + "1.0 7836\n", + "Name: prediction_is_key_smash, dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -205,13 +223,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "df[df['prediction_is_key_smash'] == 1][[concatened_column_name, 'target', 'prediction']] \\\n", + "df[df['prediction_is_key_smash'] == 1][[concatened_column_name, 'prediction_is_key_smash']] \\\n", " .drop_duplicates(subset=[concatened_column_name]) \\\n", - " .to_csv(f'../data/tmp/prediction_rf_ks_we_regex_enrich_normal.csv')" + " .to_csv(f'../data/tmp/prediction_rf_ks_regex_enrich_normal.csv')" ] } ], diff --git a/examples/MEXICO_predict_example_no_embeddings copy.ipynb b/examples/MEXICO_predict_example_no_embeddings copy.ipynb index 544584fe..df311d97 100644 --- a/examples/MEXICO_predict_example_no_embeddings copy.ipynb +++ b/examples/MEXICO_predict_example_no_embeddings copy.ipynb @@ -170,24 +170,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[33mrunning model...\u001b[37m\n", - "AAAAAAAAAAA concatened_column_name concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n", - "AAAAAAAAAAA features_column_to_normalize feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n", - "AAAAAAAAAAA 1 feature_ks_count_sequence_squared_vowels\n" + "\u001b[33mrunning model...\u001b[37m\n" ] }, { - "ename": "TypeError", - "evalue": "'NoneType' object is not subscriptable", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df[\u001b[39m'\u001b[39m\u001b[39mprediction_is_key_smash\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m rf_model\u001b[39m.\u001b[39;49mpredict(df[model_features_columns], concatened_column_name)\n\u001b[1;32m 2\u001b[0m df[\u001b[39m'\u001b[39m\u001b[39mprediction_is_key_smash\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m.\u001b[39mvalue_counts()\n", - "File \u001b[0;32m~/dell/Playground/hygia/data_pipeline/model/random_forest.py:107\u001b[0m, in \u001b[0;36mRandomForestModel.predict\u001b[0;34m(self, X, concatened_column_name)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mFore\u001b[39m.\u001b[39mYELLOW\u001b[39m}\u001b[39;00m\u001b[39mrunning model...\u001b[39m\u001b[39m{\u001b[39;00mFore\u001b[39m.\u001b[39mWHITE\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m)\n\u001b[1;32m 106\u001b[0m key_smash_features_columns \u001b[39m=\u001b[39m [column \u001b[39mfor\u001b[39;00m column \u001b[39min\u001b[39;00m X\u001b[39m.\u001b[39mcolumns \u001b[39mif\u001b[39;00m column\u001b[39m.\u001b[39mstartswith(\u001b[39m'\u001b[39m\u001b[39mfeature_ks\u001b[39m\u001b[39m'\u001b[39m)]\n\u001b[0;32m--> 107\u001b[0m X \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_normalization(X\u001b[39m.\u001b[39;49mcopy(), key_smash_features_columns, concatened_column_name)\n\u001b[1;32m 109\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel\u001b[39m.\u001b[39mpredict(X\u001b[39m.\u001b[39mvalues)\n", - "File \u001b[0;32m~/dell/Playground/hygia/data_pipeline/model/random_forest.py:46\u001b[0m, in \u001b[0;36mRandomForestModel._normalization\u001b[0;34m(self, df, features_columns_to_normalize, concatened_column_name)\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mAAAAAAAAAAA features_column_to_normalize\u001b[39m\u001b[39m'\u001b[39m, features_column_to_normalize)\n\u001b[1;32m 45\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mAAAAAAAAAAA 1\u001b[39m\u001b[39m'\u001b[39m, features_column_to_normalize\u001b[39m.\u001b[39mreplace(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00mconcatened_column_name\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m'\u001b[39m))\n\u001b[0;32m---> 46\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mAAAAAAAAAAA\u001b[39m\u001b[39m'\u001b[39m, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mnormalization_absolutes[features_column_to_normalize\u001b[39m.\u001b[39;49mreplace(\u001b[39mf\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m_\u001b[39;49m\u001b[39m{\u001b[39;49;00mconcatened_column_name\u001b[39m}\u001b[39;49;00m\u001b[39m\"\u001b[39;49m, \u001b[39m'\u001b[39;49m\u001b[39m'\u001b[39;49m)])\n\u001b[1;32m 47\u001b[0m column_absolute_maximum \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnormalization_absolutes[features_column_to_normalize\u001b[39m.\u001b[39mreplace(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00mconcatened_column_name\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m'\u001b[39m)]\u001b[39m.\u001b[39mvalues[\u001b[39m0\u001b[39m]\n\u001b[1;32m 48\u001b[0m df[features_column_to_normalize] \u001b[39m=\u001b[39m df[features_column_to_normalize] \u001b[39m/\u001b[39m column_absolute_maximum\n", - "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not subscriptable" - ] + "data": { + "text/plain": [ + "0.0 4984\n", + "1.0 16\n", + "Name: prediction_is_key_smash, dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -205,11 +201,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "df[df['prediction_is_key_smash'] == 1][[concatened_column_name, 'target', 'prediction']] \\\n", + "df[df['prediction_is_key_smash'] == 1][[concatened_column_name, 'prediction_is_key_smash']] \\\n", " .drop_duplicates(subset=[concatened_column_name]) \\\n", " .to_csv(f'../data/tmp/prediction_rf_ks_we_regex_enrich_normal.csv')" ] diff --git a/examples/MEXICO_retrain_predict_example.ipynb b/examples/MEXICO_retrain_predict_example.ipynb index 2d670ad0..b13f60f0 100644 --- a/examples/MEXICO_retrain_predict_example.ipynb +++ b/examples/MEXICO_retrain_predict_example.ipynb @@ -35,18 +35,6 @@ "\u001b[1mlanguage -> \u001b[22mes\n", "\u001b[1mdimensions -> \u001b[22m25\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/anapaula/dell/Playground/hygia/data_pipeline/augment_data/augment_data.py:29: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " country_zipcode_df_raw['postal code'] = country_zipcode_df_raw['postal code'].astype(str).str.pad(country_mappings[country]['length'],fillchar='0')\n" - ] } ], "source": [ @@ -207,6 +195,31 @@ " 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", " 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", " 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_0_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_1_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_2_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_3_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_4_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_5_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_6_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_7_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_8_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_9_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_10_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_11_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_12_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_13_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_14_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_15_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_16_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_17_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_18_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_19_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_20_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_21_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_22_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_23_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_24_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", " 'feature_re_contains_context_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", " 'feature_re_contains_exactly_the_word_dell_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", " 'feature_re_contains_exactly_the_word_test_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", @@ -257,11 +270,12 @@ { "data": { "text/plain": [ - "valid 1344306\n", - "key_smash 665\n", + "valid 1344182\n", + "key_smash 661\n", "contains_email 569\n", "contains_exactly_the_word_test 177\n", "only_special_characters 144\n", + "contains_context_invalid_words 128\n", "contains_exactly_the_word_dell 125\n", "only_numbers 106\n", "only_one_char 14\n", @@ -298,8 +312,9 @@ { "data": { "text/plain": [ - "valid 2514579\n", - "key_smash 1521\n", + "valid 2511507\n", + "contains_context_invalid_words 3079\n", + "key_smash 1514\n", "only_special_characters 1291\n", "contains_email 1048\n", "contains_exactly_the_word_test 667\n", @@ -340,16 +355,12 @@ "output_type": "stream", "text": [ "\u001b[33mtranning model...\u001b[37m\n", - "AAAAAA feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n", - "AAAAAA feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n", - "AAAAAA feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n", - "AAAAAA feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n", "\u001b[32mdone\u001b[37m\n", "\u001b[33mget model score...\u001b[37m\n", - "\u001b[1maccuracy -> \u001b[22m1.0\n", - "\u001b[1mprecision -> \u001b[22m1.0\n", - "\u001b[1mrecall -> \u001b[22m1.0\n", - "\u001b[1mf1 -> \u001b[22m1.0\n" + "\u001b[1maccuracy -> \u001b[22m0.9910256410256411\n", + "\u001b[1mprecision -> \u001b[22m0.9776536312849162\n", + "\u001b[1mrecall -> \u001b[22m0.9831460674157303\n", + "\u001b[1mf1 -> \u001b[22m0.9803921568627452\n" ] } ], @@ -377,25 +388,11 @@ "\u001b[33mrunning model...\u001b[37m\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/anapaula/dell/Playground/hygia/data_pipeline/model/random_forest.py:45: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df[features_column_to_normalize] = df[features_column_to_normalize] / column_absolute_maximum\n", - "/home/anapaula/dell/Playground/env/lib/python3.8/site-packages/sklearn/base.py:409: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names\n", - " warnings.warn(\n" - ] - }, { "data": { "text/plain": [ - "0.0 1345115\n", - "1.0 1006\n", + "0.0 1343185\n", + "1.0 2936\n", "Name: prediction, dtype: int64" ] }, diff --git a/examples/MEXICO_retrain_predict_example_no_embedding.ipynb b/examples/MEXICO_retrain_predict_example_no_embedding.ipynb index e6198a66..49d27c08 100644 --- a/examples/MEXICO_retrain_predict_example_no_embedding.ipynb +++ b/examples/MEXICO_retrain_predict_example_no_embedding.ipynb @@ -35,18 +35,6 @@ "\u001b[1mlanguage -> \u001b[22mes\n", "\u001b[1mdimensions -> \u001b[22m25\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/anapaula/dell/Playground/hygia/data_pipeline/augment_data/augment_data.py:29: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " country_zipcode_df_raw['postal code'] = country_zipcode_df_raw['postal code'].astype(str).str.pad(country_mappings[country]['length'],fillchar='0')\n" - ] } ], "source": [ @@ -259,11 +247,12 @@ { "data": { "text/plain": [ - "valid 1344306\n", - "key_smash 665\n", + "valid 1344182\n", + "key_smash 661\n", "contains_email 569\n", "contains_exactly_the_word_test 177\n", "only_special_characters 144\n", + "contains_context_invalid_words 128\n", "contains_exactly_the_word_dell 125\n", "only_numbers 106\n", "only_one_char 14\n", @@ -300,8 +289,9 @@ { "data": { "text/plain": [ - "valid 2514579\n", - "key_smash 1521\n", + "valid 2511507\n", + "contains_context_invalid_words 3079\n", + "key_smash 1514\n", "only_special_characters 1291\n", "contains_email 1048\n", "contains_exactly_the_word_test 667\n", @@ -342,10 +332,6 @@ "output_type": "stream", "text": [ "\u001b[33mtranning model...\u001b[37m\n", - "AAAAAA feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n", - "AAAAAA feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n", - "AAAAAA feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n", - "AAAAAA feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n", "\u001b[32mdone\u001b[37m\n", "\u001b[33mget model score...\u001b[37m\n", "\u001b[1maccuracy -> \u001b[22m1.0\n", @@ -379,25 +365,11 @@ "\u001b[33mrunning model...\u001b[37m\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/anapaula/dell/Playground/hygia/data_pipeline/model/random_forest.py:45: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df[features_column_to_normalize] = df[features_column_to_normalize] / column_absolute_maximum\n", - "/home/anapaula/dell/Playground/env/lib/python3.8/site-packages/sklearn/base.py:409: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names\n", - " warnings.warn(\n" - ] - }, { "data": { "text/plain": [ - "0.0 1345115\n", - "1.0 1006\n", + "0.0 1345148\n", + "1.0 973\n", "Name: prediction, dtype: int64" ] },