From 2dfe16c7950678efab4d2433abab2928642282a7 Mon Sep 17 00:00:00 2001 From: Layla Nyrabia Date: Mon, 12 Aug 2024 10:25:45 +0200 Subject: [PATCH] scaled, resampled for class imbalance, stratified k-fold, feature selection, logreg --- dummies-stratified-scaled-kfold-logreg.ipynb | 1268 ++++++++++++++++++ 1 file changed, 1268 insertions(+) create mode 100644 dummies-stratified-scaled-kfold-logreg.ipynb diff --git a/dummies-stratified-scaled-kfold-logreg.ipynb b/dummies-stratified-scaled-kfold-logreg.ipynb new file mode 100644 index 0000000..d7706b3 --- /dev/null +++ b/dummies-stratified-scaled-kfold-logreg.ipynb @@ -0,0 +1,1268 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%run base.ipynb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import original Data " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df = get_original_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategorySubcategoryCountryLaunchedDeadlineGoalPledgedBackersStateDuration
0FashionFashionUnited_States2009-04-21 21:02:482009-05-31100062530Failed39
1Film__VideoShortsUnited_States2009-04-23 00:07:532009-07-2080000223Failed87
2ArtIllustrationUnited_States2009-04-24 21:52:032009-05-0320353Successful8
3TechnologySoftwareUnited_States2009-04-25 17:36:212009-07-149914525Successful79
4FashionFashionUnited_States2009-04-27 14:10:392009-05-26190038710Failed28
\n", + "
" + ], + "text/plain": [ + " Category Subcategory Country Launched Deadline \n", + "0 Fashion Fashion United_States 2009-04-21 21:02:48 2009-05-31 \\\n", + "1 Film__Video Shorts United_States 2009-04-23 00:07:53 2009-07-20 \n", + "2 Art Illustration United_States 2009-04-24 21:52:03 2009-05-03 \n", + "3 Technology Software United_States 2009-04-25 17:36:21 2009-07-14 \n", + "4 Fashion Fashion United_States 2009-04-27 14:10:39 2009-05-26 \n", + "\n", + " Goal Pledged Backers State Duration \n", + "0 1000 625 30 Failed 39 \n", + "1 80000 22 3 Failed 87 \n", + "2 20 35 3 Successful 8 \n", + "3 99 145 25 Successful 79 \n", + "4 1900 387 10 Failed 28 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Clean categorical strings from spaces and troublesome special characters" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "# Define a function to clean up string values\n", + "def clean_string(s):\n", + " # Remove leading/trailing spaces\n", + " s = s.strip()\n", + " # Replace all spaces with underscores (or remove them if desired)\n", + " s = re.sub(r'\\s+', '_', s) # Replace spaces with underscores\n", + " # Remove problematic special characters\n", + " s = re.sub(r'[^\\w\\s]', '', s)\n", + " return s\n", + "\n", + "# Apply the function to all columns in the DataFrame\n", + "df = df.applymap(lambda x: clean_string(x) if isinstance(x, str) else x)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategorySubcategoryCountryLaunchedDeadlineGoalPledgedBackersStateDuration
0FashionFashionUnited_States2009-04-21 21:02:482009-05-31100062530Failed39
1Film__VideoShortsUnited_States2009-04-23 00:07:532009-07-2080000223Failed87
2ArtIllustrationUnited_States2009-04-24 21:52:032009-05-0320353Successful8
3TechnologySoftwareUnited_States2009-04-25 17:36:212009-07-149914525Successful79
4FashionFashionUnited_States2009-04-27 14:10:392009-05-26190038710Failed28
\n", + "
" + ], + "text/plain": [ + " Category Subcategory Country Launched Deadline \n", + "0 Fashion Fashion United_States 2009-04-21 21:02:48 2009-05-31 \\\n", + "1 Film__Video Shorts United_States 2009-04-23 00:07:53 2009-07-20 \n", + "2 Art Illustration United_States 2009-04-24 21:52:03 2009-05-03 \n", + "3 Technology Software United_States 2009-04-25 17:36:21 2009-07-14 \n", + "4 Fashion Fashion United_States 2009-04-27 14:10:39 2009-05-26 \n", + "\n", + " Goal Pledged Backers State Duration \n", + "0 1000 625 30 Failed 39 \n", + "1 80000 22 3 Failed 87 \n", + "2 20 35 3 Successful 8 \n", + "3 99 145 25 Successful 79 \n", + "4 1900 387 10 Failed 28 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 331462 entries, 0 to 374605\n", + "Data columns (total 10 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Category 331462 non-null object \n", + " 1 Subcategory 331462 non-null object \n", + " 2 Country 331462 non-null object \n", + " 3 Launched 331462 non-null datetime64[ns]\n", + " 4 Deadline 331462 non-null datetime64[ns]\n", + " 5 Goal 331462 non-null int64 \n", + " 6 Pledged 331462 non-null int64 \n", + " 7 Backers 331462 non-null int64 \n", + " 8 State 331462 non-null object \n", + " 9 Duration 331462 non-null int64 \n", + "dtypes: datetime64[ns](2), int64(4), object(4)\n", + "memory usage: 27.8+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Encode categorical features as dummy (binary) variables" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.get_dummies(df, drop_first=True, 
columns=df.select_dtypes(include=['object']).columns)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LaunchedDeadlineGoalPledgedBackersDurationCategory_ComicsCategory_CraftsCategory_DanceCategory_Design...Country_NetherlandsCountry_New_ZealandCountry_NorwayCountry_SingaporeCountry_SpainCountry_SwedenCountry_SwitzerlandCountry_United_KingdomCountry_United_StatesState_Successful
02009-04-21 21:02:482009-05-3110006253039FalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
12009-04-23 00:07:532009-07-208000022387FalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
22009-04-24 21:52:032009-05-03203538FalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseTrueTrue
32009-04-25 17:36:212009-07-14991452579FalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseTrueTrue
42009-04-27 14:10:392009-05-2619003871028FalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
\n", + "

5 rows × 200 columns

\n", + "
" + ], + "text/plain": [ + " Launched Deadline Goal Pledged Backers Duration \n", + "0 2009-04-21 21:02:48 2009-05-31 1000 625 30 39 \\\n", + "1 2009-04-23 00:07:53 2009-07-20 80000 22 3 87 \n", + "2 2009-04-24 21:52:03 2009-05-03 20 35 3 8 \n", + "3 2009-04-25 17:36:21 2009-07-14 99 145 25 79 \n", + "4 2009-04-27 14:10:39 2009-05-26 1900 387 10 28 \n", + "\n", + " Category_Comics Category_Crafts Category_Dance Category_Design ... \n", + "0 False False False False ... \\\n", + "1 False False False False ... \n", + "2 False False False False ... \n", + "3 False False False False ... \n", + "4 False False False False ... \n", + "\n", + " Country_Netherlands Country_New_Zealand Country_Norway \n", + "0 False False False \\\n", + "1 False False False \n", + "2 False False False \n", + "3 False False False \n", + "4 False False False \n", + "\n", + " Country_Singapore Country_Spain Country_Sweden Country_Switzerland \n", + "0 False False False False \\\n", + "1 False False False False \n", + "2 False False False False \n", + "3 False False False False \n", + "4 False False False False \n", + "\n", + " Country_United_Kingdom Country_United_States State_Successful \n", + "0 False True False \n", + "1 False True False \n", + "2 False True True \n", + "3 False True True \n", + "4 False True False \n", + "\n", + "[5 rows x 200 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(331462, 200)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define target & features" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from 
sklearn.model_selection import StratifiedKFold, GridSearchCV\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import f1_score, make_scorer\n", + "\n", + "# Separate features and target; the raw datetime columns are dropped here and re-encoded as year features afterwards\n", + "X = df.drop(columns=['State_Successful', 'Launched', 'Deadline', 'Pledged']) # Drop datetime columns + Pledged (data leakage)\n", + "y = df['State_Successful'] # Target column\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract features from datetime columns (here only the year to avoid too many features)\n", + "X['year_launched'] = df['Launched'].dt.year\n", + "# X['month_launched'] = df['Launched'].dt.month\n", + "# X['day_launched'] = df['Launched'].dt.day\n", + "\n", + "X['year_deadline'] = df['Deadline'].dt.year\n", + "# X['month_deadline'] = df['Deadline'].dt.month\n", + "# X['day_deadline'] = df['Deadline'].dt.day" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split the data into training and test sets" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scale the data" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Scale numerical data using StandardScaler:\n", + "\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "# Identify numerical columns (excluding date-time and timedelta types)\n", + "numerical_cols = X.select_dtypes(include=['number']).columns.tolist()\n", + "\n", + "# Initialize StandardScaler\n", + "scaler = StandardScaler()\n", + "\n", + "# Fit the scaler on the numerical columns of the training data only and transform them\n", +
"X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])\n", + "\n", + "# Use the same scaler to transform the test data\n", + "X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Resample the data for imbalanced target classes\n", + "Oversampling with SMOTE from the imbalanced-learn library completely crashed the local machine (MacBook Air M2 2022, 8GB, Sonoma 14.5). Randomly under-sampling the majority target class is computationally much cheaper, but risks losing meaningful information, so there doesn't seem to be a clear optimal solution yet. Within our limited computational means, random under-sampling of the majority class of the target variable is the simplest, cheapest and only feasible resampling method, so we apply it to the training set:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Apply random undersampling to the training set\n", + "from imblearn.under_sampling import RandomUnderSampler\n", + "undersampler = RandomUnderSampler(random_state=42)\n", + "X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare the cross-validation strategy\n", + "- Stratified K-fold for better representation of classes\n", + "- Saga solver for faster convergence times\n", + "- Small number of max.
iterations due to computational limitations\n", + "- Logistic regression with balanced class weights for additional class balance\n", + "- Ridge (L2) and Lasso (L1) regularization, with Lasso providing feature selection insights\n", + "- Small values of C (i.e. stronger regularization) for more aggressive feature selection, due to the high dimensionality of the dummy-encoded data" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import StratifiedKFold, GridSearchCV\n", + "from sklearn.metrics import make_scorer, f1_score\n", + "\n", + "# Define the StratifiedKFold cross-validator\n", + "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n", + "\n", + "# Define F1 scorer\n", + "f1_scorer = make_scorer(f1_score, average='weighted')\n", + "\n", + "# Define the logistic regression model with class_weight='balanced'\n", + "logistic_regression = LogisticRegression(solver='saga', max_iter=100, class_weight='balanced')\n", + "\n", + "# Define the parameter grid for Logistic Regression\n", + "param_grid_lr = {\n", + " 'penalty': ['l1', 'l2'],\n", + " 'C': [0.01, 0.1, 1, 10]\n", + " }\n", + "\n", + "# GridSearchCV for Logistic Regression\n", + "grid_search_lr = GridSearchCV(logistic_regression, param_grid_lr, cv=skf, scoring=f1_scorer, n_jobs=-1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Apply stratified k-fold cross validation gridsearch\n", + "Fitted to the resampled training data."
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + 
"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + 
"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + 
"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + 
"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + 
"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n",
+       "             estimator=LogisticRegression(class_weight='balanced',\n",
+       "                                          solver='saga'),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'C': [0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2']},\n",
+       "             scoring=make_scorer(f1_score, average=weighted))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n", + " estimator=LogisticRegression(class_weight='balanced',\n", + " solver='saga'),\n", + " n_jobs=-1,\n", + " param_grid={'C': [0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2']},\n", + " scoring=make_scorer(f1_score, average=weighted))" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_search_lr.fit(X_train_resampled, y_train_resampled)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Logistic Regression Model: LogisticRegression(C=0.01, class_weight='balanced', penalty='l1', solver='saga')\n", + "Logistic Regression F1 Score: 0.7975915850439066\n", + " Feature Coefficient\n", + "1 Backers 1.015463e+01\n", + "0 Goal -2.289651e+00\n", + "84 Subcategory_HipHop -9.202204e-01\n", + "16 Category_Theater 8.521464e-01\n", + "164 Subcategory_Video_Games -7.783138e-01\n", + ".. ... 
...\n", + "56 Subcategory_Drama 1.381033e-02\n", + "55 Subcategory_Documentary -1.449379e-03\n", + "115 Subcategory_Performances 9.369830e-04\n", + "74 Subcategory_Food 3.724464e-08\n", + "106 Subcategory_Music -3.476722e-09\n", + "\n", + "[61 rows x 2 columns]\n" + ] + } + ], + "source": [ + "# Best Logistic Regression Model\n", + "best_lr = grid_search_lr.best_estimator_\n", + "\n", + "# Output the best model and its corresponding hyperparameters\n", + "print(\"Best Logistic Regression Model:\", best_lr)\n", + "print(\"Logistic Regression F1 Score:\", f1_score(y_test, best_lr.predict(X_test), average='weighted'))\n", + "\n", + "# Output the best hyperparameters\n", + "\n", + "coefficients = best_lr.coef_.flatten()\n", + "feature_names = X_test.columns\n", + "coeff_df = pd.DataFrame({\n", + " 'Feature': feature_names,\n", + " 'Coefficient': coefficients\n", + "})\n", + "selected_features = coeff_df[coeff_df['Coefficient'] != 0]\n", + "selected_features = selected_features.reindex(selected_features['Coefficient'].abs().sort_values(ascending=False).index)\n", + "print(selected_features)\n", + "\n", + "# Best Logistic Regression Model: LogisticRegression(C=0.01, class_weight='balanced', penalty='l1', solver='saga')\n", + "# Logistic Regression F1 Score: 0.7975915850439066\n", + "# 61 features selected" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Feature Pre-Selection: \n", + "Since the categorical features include a large number of unique values and the data set is of a 6-figure order, we might need to perform a more selective lasso regression prior to the modelling for this to be computationally feasible. 
This means decreasing C, which in scikit-learn is the inverse of the regularization strength, so that the L1 penalty becomes stronger:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of selected features: 10\n", + "Selected feature indices: [ 0 1 2 9 12 15 16 145 164 197]\n", + "Selected feature names: ['Goal' 'Backers' 'Duration' 'Category_Food' 'Category_Music'\n", + " 'Category_Technology' 'Category_Theater' 'Subcategory_Shorts'\n", + " 'Subcategory_Video_Games' 'year_deadline']\n", + "Shape of the reduced feature set: (214162, 10)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.feature_selection import SelectFromModel\n", + "\n", + "# Step 1: Define a more selective Logistic Regression model by decreasing C\n", + "# Smaller C values increase regularization strength\n", + "logreg = LogisticRegression(penalty=\"l1\", solver=\"saga\", max_iter=100, C=0.001)\n", + "\n", + "# Step 2: Fit the feature selector model with the more selective logistic regression\n", + "selector = SelectFromModel(logreg, threshold=\"mean\")\n", + "selector.fit(X_train_resampled, y_train_resampled)\n", + "\n", + "# Step 3: Apply the selector to reduce dimensionality\n", + "X_train_reduced = selector.transform(X_train_resampled)\n", + "X_test_reduced = selector.transform(X_test)\n", + "\n", + "# Step 4: Retrieve the indices and names of selected features\n", + "selected_feature_indices = selector.get_support(indices=True)\n", + "selected_feature_names = np.array(feature_names)[selected_feature_indices]\n", + "\n", + "print(\"Number of selected features:\",
len(selected_feature_indices))\n", + "print(\"Selected feature indices:\", selected_feature_indices)\n", + "print(\"Selected feature names:\", selected_feature_names)\n", + "print(\"Shape of the reduced feature set:\", X_train_reduced.shape)\n", + "\n", + "# C=0.0001, 0.001 (feature selection with L1 regularization reached saturation)\n", + "# Number of selected features: 10\n", + "# Selected feature indices: [ 0 1 2 9 12 15 16 145 164 197]\n", + "# Selected feature names: ['Goal' 'Backers' 'Duration' 'Category_Food' 'Category_Music'\n", + "# 'Category_Technology' 'Category_Theater' 'Subcategory_Shorts'\n", + "# 'Subcategory_Video_Games' 'year_deadline']\n", + "# Shape of the reduced feature set: (214162, 10)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: 
ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The 
max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:378: FitFailedWarning: \n", + "50 fits failed out of a total of 75.\n", + "The score on these train-test partitions for these parameters will be set to nan.\n", + "If these failures are not expected, you can try to debug them by setting error_score='raise'.\n", + "\n", + "Below are more details about the failures:\n", + "--------------------------------------------------------------------------------\n", + "9 fits failed with the following error:\n", + "Traceback (most recent call last):\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n", + " estimator.fit(X_train, y_train, **fit_params)\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1160, in fit\n", + " self._validate_params()\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/base.py\", line 600, in _validate_params\n", + " validate_parameter_constraints(\n", + " File 
\"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py\", line 97, in validate_parameter_constraints\n", + " raise InvalidParameterError(\n", + "sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'none' (deprecated), 'elasticnet', 'l2', 'l1'} or None. Got 'None' instead.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "1 fits failed with the following error:\n", + "Traceback (most recent call last):\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n", + " estimator.fit(X_train, y_train, **fit_params)\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1160, in fit\n", + " self._validate_params()\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/base.py\", line 600, in _validate_params\n", + " validate_parameter_constraints(\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py\", line 97, in validate_parameter_constraints\n", + " raise InvalidParameterError(\n", + "sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'none' (deprecated), 'l2', 'l1', 'elasticnet'} or None. 
Got 'None' instead.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "25 fits failed with the following error:\n", + "Traceback (most recent call last):\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n", + " estimator.fit(X_train, y_train, **fit_params)\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1291, in fit\n", + " fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/parallel.py\", line 63, in __call__\n", + " return super().__call__(iterable_with_config)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/joblib/parallel.py\", line 1918, in __call__\n", + " return output if self.return_generator else list(output)\n", + " ^^^^^^^^^^^^\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/joblib/parallel.py\", line 1847, in _get_sequential_output\n", + " res = func(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/parallel.py\", line 123, in __call__\n", + " return self.function(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 521, in _logistic_regression_path\n", + " alpha = (1.0 / C) * (1 - 
l1_ratio)\n", + " ~~^~~~~~~~~~\n", + "TypeError: unsupported operand type(s) for -: 'int' and 'NoneType'\n", + "\n", + "--------------------------------------------------------------------------------\n", + "3 fits failed with the following error:\n", + "Traceback (most recent call last):\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n", + " estimator.fit(X_train, y_train, **fit_params)\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1160, in fit\n", + " self._validate_params()\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/base.py\", line 600, in _validate_params\n", + " validate_parameter_constraints(\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py\", line 97, in validate_parameter_constraints\n", + " raise InvalidParameterError(\n", + "sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l1', 'l2', 'elasticnet', 'none' (deprecated)} or None. 
Got 'None' instead.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2 fits failed with the following error:\n", + "Traceback (most recent call last):\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n", + " estimator.fit(X_train, y_train, **fit_params)\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1160, in fit\n", + " self._validate_params()\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/base.py\", line 600, in _validate_params\n", + " validate_parameter_constraints(\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py\", line 97, in validate_parameter_constraints\n", + " raise InvalidParameterError(\n", + "sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l1', 'elasticnet', 'l2', 'none' (deprecated)} or None. 
Got 'None' instead.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "5 fits failed with the following error:\n", + "Traceback (most recent call last):\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n", + " estimator.fit(X_train, y_train, **fit_params)\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1160, in fit\n", + " self._validate_params()\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/base.py\", line 600, in _validate_params\n", + " validate_parameter_constraints(\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py\", line 97, in validate_parameter_constraints\n", + " raise InvalidParameterError(\n", + "sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l1', 'none' (deprecated), 'l2', 'elasticnet'} or None. 
Got 'None' instead.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "5 fits failed with the following error:\n", + "Traceback (most recent call last):\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n", + " estimator.fit(X_train, y_train, **fit_params)\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1160, in fit\n", + " self._validate_params()\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/base.py\", line 600, in _validate_params\n", + " validate_parameter_constraints(\n", + " File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py\", line 97, in validate_parameter_constraints\n", + " raise InvalidParameterError(\n", + "sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l2', 'none' (deprecated), 'elasticnet', 'l1'} or None. Got 'None' instead.\n", + "\n", + " warnings.warn(some_fits_failed_message, FitFailedWarning)\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py:952: UserWarning: One or more of the test scores are non-finite: [0.77022697 nan nan 0.82605299 nan nan\n", + " 0.84994351 nan nan 0.85304871 nan nan\n", + " 0.85323814 nan nan]\n", + " warnings.warn(\n", + "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n",
+       "             estimator=LogisticRegression(class_weight='balanced',\n",
+       "                                          max_iter=1000, solver='saga'),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'C': [0.01, 0.1, 1, 10, 100],\n",
+       "                         'penalty': ['l2', 'None', 'elasticnet']},\n",
+       "             scoring=make_scorer(f1_score, average=weighted))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n", + " estimator=LogisticRegression(class_weight='balanced',\n", + " max_iter=1000, solver='saga'),\n", + " n_jobs=-1,\n", + " param_grid={'C': [0.01, 0.1, 1, 10, 100],\n", + " 'penalty': ['l2', 'None', 'elasticnet']},\n", + " scoring=make_scorer(f1_score, average=weighted))" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define the logistic regression model with class_weight='balanced'\n", + "logistic_regression = LogisticRegression(solver='saga', max_iter=1000, class_weight='balanced')\n", + "\n", + "# One sub-grid per penalty: elasticnet needs l1_ratio, and penalty=None (not the string 'None') ignores C\n", + "param_grid_lr = [\n", + "    {'penalty': ['l2'], 'C': [0.01, 0.1, 1, 10, 100]},\n", + "    {'penalty': ['elasticnet'], 'l1_ratio': [0.5], 'C': [0.01, 0.1, 1, 10, 100]},\n", + "    {'penalty': [None]}]\n", + "grid_search_lr = GridSearchCV(logistic_regression, param_grid_lr, cv=skf, scoring=f1_scorer, n_jobs=-1)\n", + "grid_search_lr.fit(X_train_reduced, y_train_resampled)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Logistic Regression Model: LogisticRegression(C=100, class_weight='balanced', max_iter=1000, solver='saga')\n", + "Logistic Regression F1 Score: 0.8561574779080967\n" + ] + } + ], + "source": [ + "# Best Logistic Regression Model\n", + "best_lr = grid_search_lr.best_estimator_\n", + "\n", + "# Output the best model and its held-out F1 score on the reduced test set\n", + "print(\"Best Logistic Regression Model:\", best_lr)\n", + "print(\"Logistic Regression F1 Score:\", f1_score(y_test, best_lr.predict(X_test_reduced), average='weighted'))\n", + "\n", + "# Earlier run (search fitted on the test split): LogisticRegression(C=100, class_weight='balanced', max_iter=1000, solver='saga')\n", + "# Earlier run F1 (scored on the data it was fitted on; rerun to refresh): 0.856157477908096" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", 
+ "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}