forked from danzzzlll/It_purple_hack
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 2521c65
Showing
3 changed files
with
3 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from catboost import CatBoostClassifier, Pool\n",
    "from lightgbm import LGBMClassifier\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn import metrics\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "import random\n",
    "import warnings\n",
    "from sklearn.model_selection import train_test_split \n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.metrics import precision_score\n",
    "from sklearn.metrics import recall_score\n",
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "from sklearn.model_selection import train_test_split, StratifiedKFold\n",
    "from imblearn.over_sampling import SMOTE\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "from sklearn.linear_model import LogisticRegressionCV, LogisticRegression\n",
    "from sklearn.metrics import f1_score, precision_score, recall_score\n",
    "import seaborn as sns\n",
    "from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, auc, f1_score\n",
    "import tqdm\n",
    "\n",
    "# Seed the stdlib and NumPy global RNGs; the split and the model below are\n",
    "# additionally given their own explicit seeds.\n",
    "random.seed(42)\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the training frame and the hold-out test frame from the Kaggle inputs.\n",
    "df = pd.read_parquet('/kaggle/input/purple-hack/train_ai_comp_final_dp.parquet')\n",
    "test = pd.read_parquet('/kaggle/input/purple-hack-with-test/test_sber.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shrink the memory footprint of `df`: 64-bit numerics become 32-bit and\n",
    "# low-cardinality object columns become `category`.\n",
    "# NOTE(review): the int32 cast assumes every int64 value fits in 32 bits - confirm (e.g. for `id`).\n",
    "for col in df.columns:\n",
    "    if df[col].dtype == 'int64':\n",
    "        df[col] = df[col].astype('int32')\n",
    "    elif df[col].dtype == 'float64':\n",
    "        df[col] = df[col].astype('float32')\n",
    "    elif df[col].dtype == 'object':\n",
    "        # Convert only when fewer than half of the rows hold distinct values.\n",
    "        if len(df[col].unique()) / len(df[col]) < 0.5:\n",
    "            df[col] = df[col].astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_correlated_feats(corr_matrix, feat_stats, greater_is_better=True, corr_threshold=0.95):\n",
    "    \"\"\"Pick the features to drop from groups of highly correlated columns.\n",
    "\n",
    "    Columns are visited in `corr_matrix` column order. For each column that has\n",
    "    not been marked yet, every column whose absolute correlation with it is\n",
    "    >= `corr_threshold` forms a group (the column itself is part of the group\n",
    "    as long as its self-correlation is not NaN).\n",
    "\n",
    "    Args:\n",
    "        corr_matrix: square DataFrame of pairwise correlations with feature\n",
    "            names as both index and columns.\n",
    "        feat_stats: Series of per-feature scores indexed by feature name, or\n",
    "            None. With None, all group members except the first one (in index\n",
    "            order) are marked.\n",
    "        greater_is_better: if True the member with the largest absolute score\n",
    "            is kept, otherwise the one with the smallest. On ties only the\n",
    "            first tied member is kept.\n",
    "        corr_threshold: absolute-correlation cut-off used to form groups\n",
    "            (0.95 by default).\n",
    "\n",
    "    Returns:\n",
    "        List of feature names marked for removal, in `corr_matrix` column order.\n",
    "    \"\"\"\n",
    "    cols = corr_matrix.columns.to_list()\n",
    "    # Counter per column; anything > 0 at the end is reported as droppable.\n",
    "    dropped = {col: 0 for col in cols}\n",
    "    for col in tqdm.tqdm(cols, desc='Get correlated features'):\n",
    "        if dropped[col] == 0:\n",
    "            columns_to_check = corr_matrix.index.values[np.abs(corr_matrix[col]) >= corr_threshold]\n",
    "            if len(columns_to_check) > 1:\n",
    "                if feat_stats is None:\n",
    "                    bad_cols = columns_to_check[1:]\n",
    "                else:\n",
    "                    sel_stats = feat_stats.loc[columns_to_check]\n",
    "                    if greater_is_better:\n",
    "                        bad_cond = np.abs(sel_stats) < np.abs(sel_stats).max()\n",
    "                    else:\n",
    "                        bad_cond = np.abs(sel_stats) > np.abs(sel_stats).min()\n",
    "\n",
    "                    bad_cols = sel_stats[bad_cond].index.to_list()\n",
    "                    norm_cols = sel_stats[~bad_cond].index.to_list()\n",
    "                    # Several members share the best score: keep only the first.\n",
    "                    if len(norm_cols) > 1:\n",
    "                        for norm_col in norm_cols[1:]:\n",
    "                            dropped[norm_col] += 1\n",
    "\n",
    "                for bad_col in bad_cols:\n",
    "                    dropped[bad_col] += 1\n",
    "    high_corr_cols = [c for c in dropped.keys() if dropped[c] > 0]\n",
    "    return high_corr_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clean_df = df.copy()\n",
    "# Share of exact zeros per column (this is a zero share, not a NaN share).\n",
    "zero_fraction = (clean_df == 0).mean()\n",
    "cols_to_drop = list(zero_fraction[zero_fraction > 0.95].index)\n",
    "\n",
    "# dict.fromkeys removes duplicates while keeping order, so a column that is both\n",
    "# listed explicitly and mostly zero is counted and dropped once.\n",
    "ignore_features = list(dict.fromkeys(['id', 'target', 'sample_ml_new', 'feature756'] + cols_to_drop))\n",
    "print(len(ignore_features))\n",
    "clean_df = clean_df.drop(columns=ignore_features)\n",
    "\n",
    "# NOTE(review): np.corrcoef assumes every remaining column is numeric and NaN-free - confirm.\n",
    "corr_mx = pd.DataFrame(np.corrcoef(clean_df.values, rowvar=False), columns=clean_df.columns, index=clean_df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Within each correlated group keep the feature with the smallest share of zeros.\n",
    "corr_feats = get_correlated_feats(corr_mx, feat_stats=zero_fraction, greater_is_better=False, corr_threshold=0.95)\n",
    "len(corr_feats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_no_corr = clean_df.drop(columns=corr_feats)\n",
    "print(\"Было:\", df.shape)\n",
    "print(\"Стало:\", df_no_corr.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_feats = df_no_corr.columns\n",
    "\n",
    "# NOTE(review): `feature642` is excluded by hand and must have survived the\n",
    "# filtering above, otherwise this drop raises KeyError - confirm the reason for excluding it.\n",
    "X = df[final_feats].drop(columns = ['feature642'])\n",
    "y = df['target']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = X.reset_index(drop = True)\n",
    "y = y.reset_index(drop = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns with fewer than three distinct non-null values.\n",
    "# NOTE(review): `cat_cols` is collected here but not passed to the model fitted below.\n",
    "cat_cols = []\n",
    "for col in X.columns:\n",
    "    if len(X[col].value_counts()) < 3:\n",
    "        cat_cols.append(col)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stratified 75/25 split so both parts keep the target class ratio.\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.25, random_state = 42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Class weights [1, 10] up-weight the second class relative to the first.\n",
    "cat = CatBoostClassifier(random_seed = 42, class_weights = [1, 10])\n",
    "cat.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Importance scores of the fitted model, one per training column.\n",
    "feature_importance = cat.get_feature_importance()\n",
    "importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})\n",
    "\n",
    "importance_df = importance_df.sort_values(by='Importance', ascending=False)\n",
    "\n",
    "top_90_features = importance_df.head(90)['Feature'].tolist()\n",
    "\n",
    "# Explicit copies: new columns are added to these frames later, which must not\n",
    "# touch (or warn about touching) X_train / X_test.\n",
    "X_train_top_90 = X_train[top_90_features].copy()\n",
    "X_test_top_90 = X_test[top_90_features].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_engineered_features(frame):\n",
    "    \"\"\"Return a new frame equal to `frame` plus the columns `sin341`, `log940` and `new1`.\n",
    "\n",
    "    `frame` must contain `feature341`, `feature940`, `feature1004` and\n",
    "    `feature994`; a missing column raises KeyError. The input is not modified.\n",
    "    \"\"\"\n",
    "    return frame.assign(\n",
    "        sin341=np.sin(frame['feature341']),\n",
    "        # NOTE(review): log1p yields NaN/-inf for values <= -1 - confirm feature940 is non-negative.\n",
    "        log940=np.log1p(frame['feature940']),\n",
    "        new1=frame['feature1004'] * frame['feature994'],\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): assumes the four source columns are among the top 90 features - confirm.\n",
    "X_train_top_90 = add_engineered_features(X_train_top_90)\n",
    "X_test_top_90 = add_engineered_features(X_test_top_90)"
   ]
  }
 ],
 "metadata": {
  "kaggle": {
   "accelerator": "none",
   "dataSources": [],
   "isGpuEnabled": false,
   "isInternetEnabled": true,
   "language": "python",
   "sourceType": "notebook"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
Large diffs are not rendered by default.
Oops, something went wrong.