From 2dfe16c7950678efab4d2433abab2928642282a7 Mon Sep 17 00:00:00 2001
From: Layla Nyrabia <lnyrabia@gmail.com>
Date: Mon, 12 Aug 2024 10:25:45 +0200
Subject: [PATCH] scaled, resampled for class imbalance, stratified k-fold,
 feature selection, logreg

---
 dummies-stratified-scaled-kfold-logreg.ipynb | 1268 ++++++++++++++++++
 1 file changed, 1268 insertions(+)
 create mode 100644 dummies-stratified-scaled-kfold-logreg.ipynb
diff --git a/dummies-stratified-scaled-kfold-logreg.ipynb b/dummies-stratified-scaled-kfold-logreg.ipynb
new file mode 100644
index 0000000..d7706b3
--- /dev/null
+++ b/dummies-stratified-scaled-kfold-logreg.ipynb
@@ -0,0 +1,1268 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%run base.ipynb"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Import original Data "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = get_original_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Category</th>\n",
+       "      <th>Subcategory</th>\n",
+       "      <th>Country</th>\n",
+       "      <th>Launched</th>\n",
+       "      <th>Deadline</th>\n",
+       "      <th>Goal</th>\n",
+       "      <th>Pledged</th>\n",
+       "      <th>Backers</th>\n",
+       "      <th>State</th>\n",
+       "      <th>Duration</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Fashion</td>\n",
+       "      <td>Fashion</td>\n",
+       "      <td>United_States</td>\n",
+       "      <td>2009-04-21 21:02:48</td>\n",
+       "      <td>2009-05-31</td>\n",
+       "      <td>1000</td>\n",
+       "      <td>625</td>\n",
+       "      <td>30</td>\n",
+       "      <td>Failed</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Film__Video</td>\n",
+       "      <td>Shorts</td>\n",
+       "      <td>United_States</td>\n",
+       "      <td>2009-04-23 00:07:53</td>\n",
+       "      <td>2009-07-20</td>\n",
+       "      <td>80000</td>\n",
+       "      <td>22</td>\n",
+       "      <td>3</td>\n",
+       "      <td>Failed</td>\n",
+       "      <td>87</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Art</td>\n",
+       "      <td>Illustration</td>\n",
+       "      <td>United_States</td>\n",
+       "      <td>2009-04-24 21:52:03</td>\n",
+       "      <td>2009-05-03</td>\n",
+       "      <td>20</td>\n",
+       "      <td>35</td>\n",
+       "      <td>3</td>\n",
+       "      <td>Successful</td>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Technology</td>\n",
+       "      <td>Software</td>\n",
+       "      <td>United_States</td>\n",
+       "      <td>2009-04-25 17:36:21</td>\n",
+       "      <td>2009-07-14</td>\n",
+       "      <td>99</td>\n",
+       "      <td>145</td>\n",
+       "      <td>25</td>\n",
+       "      <td>Successful</td>\n",
+       "      <td>79</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Fashion</td>\n",
+       "      <td>Fashion</td>\n",
+       "      <td>United_States</td>\n",
+       "      <td>2009-04-27 14:10:39</td>\n",
+       "      <td>2009-05-26</td>\n",
+       "      <td>1900</td>\n",
+       "      <td>387</td>\n",
+       "      <td>10</td>\n",
+       "      <td>Failed</td>\n",
+       "      <td>28</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      Category   Subcategory        Country            Launched   Deadline   \n",
+       "0      Fashion       Fashion  United_States 2009-04-21 21:02:48 2009-05-31  \\\n",
+       "1  Film__Video        Shorts  United_States 2009-04-23 00:07:53 2009-07-20   \n",
+       "2          Art  Illustration  United_States 2009-04-24 21:52:03 2009-05-03   \n",
+       "3   Technology      Software  United_States 2009-04-25 17:36:21 2009-07-14   \n",
+       "4      Fashion       Fashion  United_States 2009-04-27 14:10:39 2009-05-26   \n",
+       "\n",
+       "    Goal  Pledged  Backers       State  Duration  \n",
+       "0   1000      625       30      Failed        39  \n",
+       "1  80000       22        3      Failed        87  \n",
+       "2     20       35        3  Successful         8  \n",
+       "3     99      145       25  Successful        79  \n",
+       "4   1900      387       10      Failed        28  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Clean categorical strings from spaces and troublesome special characters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "def clean_string(s):\n",
+    "    \"\"\"Strip `s`, turn each whitespace run into '_' and drop all remaining non-word characters.\"\"\"\n",
+    "    s = re.sub(r'\\s+', '_', s.strip())\n",
+    "    # No whitespace is left at this point, so only letters, digits and '_' survive the next step;\n",
+    "    # e.g. 'Film & Video' ends up as 'Film__Video' (both underscores are kept).\n",
+    "    return re.sub(r'[^\\w\\s]', '', s)\n",
+    "\n",
+    "# Strings only live in the object columns, so clean just those instead of visiting every cell;\n",
+    "# per-column Series.map also avoids DataFrame.applymap, which newer pandas releases deprecate.\n",
+    "text_cols = df.select_dtypes(include=['object']).columns\n",
+    "cleaned = {col: df[col].map(lambda v: clean_string(v) if isinstance(v, str) else v) for col in text_cols}\n",
+    "df = df.assign(**cleaned)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Category</th>\n",
+       "      <th>Subcategory</th>\n",
+       "      <th>Country</th>\n",
+       "      <th>Launched</th>\n",
+       "      <th>Deadline</th>\n",
+       "      <th>Goal</th>\n",
+       "      <th>Pledged</th>\n",
+       "      <th>Backers</th>\n",
+       "      <th>State</th>\n",
+       "      <th>Duration</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Fashion</td>\n",
+       "      <td>Fashion</td>\n",
+       "      <td>United_States</td>\n",
+       "      <td>2009-04-21 21:02:48</td>\n",
+       "      <td>2009-05-31</td>\n",
+       "      <td>1000</td>\n",
+       "      <td>625</td>\n",
+       "      <td>30</td>\n",
+       "      <td>Failed</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Film__Video</td>\n",
+       "      <td>Shorts</td>\n",
+       "      <td>United_States</td>\n",
+       "      <td>2009-04-23 00:07:53</td>\n",
+       "      <td>2009-07-20</td>\n",
+       "      <td>80000</td>\n",
+       "      <td>22</td>\n",
+       "      <td>3</td>\n",
+       "      <td>Failed</td>\n",
+       "      <td>87</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Art</td>\n",
+       "      <td>Illustration</td>\n",
+       "      <td>United_States</td>\n",
+       "      <td>2009-04-24 21:52:03</td>\n",
+       "      <td>2009-05-03</td>\n",
+       "      <td>20</td>\n",
+       "      <td>35</td>\n",
+       "      <td>3</td>\n",
+       "      <td>Successful</td>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Technology</td>\n",
+       "      <td>Software</td>\n",
+       "      <td>United_States</td>\n",
+       "      <td>2009-04-25 17:36:21</td>\n",
+       "      <td>2009-07-14</td>\n",
+       "      <td>99</td>\n",
+       "      <td>145</td>\n",
+       "      <td>25</td>\n",
+       "      <td>Successful</td>\n",
+       "      <td>79</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Fashion</td>\n",
+       "      <td>Fashion</td>\n",
+       "      <td>United_States</td>\n",
+       "      <td>2009-04-27 14:10:39</td>\n",
+       "      <td>2009-05-26</td>\n",
+       "      <td>1900</td>\n",
+       "      <td>387</td>\n",
+       "      <td>10</td>\n",
+       "      <td>Failed</td>\n",
+       "      <td>28</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      Category   Subcategory        Country            Launched   Deadline   \n",
+       "0      Fashion       Fashion  United_States 2009-04-21 21:02:48 2009-05-31  \\\n",
+       "1  Film__Video        Shorts  United_States 2009-04-23 00:07:53 2009-07-20   \n",
+       "2          Art  Illustration  United_States 2009-04-24 21:52:03 2009-05-03   \n",
+       "3   Technology      Software  United_States 2009-04-25 17:36:21 2009-07-14   \n",
+       "4      Fashion       Fashion  United_States 2009-04-27 14:10:39 2009-05-26   \n",
+       "\n",
+       "    Goal  Pledged  Backers       State  Duration  \n",
+       "0   1000      625       30      Failed        39  \n",
+       "1  80000       22        3      Failed        87  \n",
+       "2     20       35        3  Successful         8  \n",
+       "3     99      145       25  Successful        79  \n",
+       "4   1900      387       10      Failed        28  "
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 331462 entries, 0 to 374605\n",
+      "Data columns (total 10 columns):\n",
+      " #   Column       Non-Null Count   Dtype         \n",
+      "---  ------       --------------   -----         \n",
+      " 0   Category     331462 non-null  object        \n",
+      " 1   Subcategory  331462 non-null  object        \n",
+      " 2   Country      331462 non-null  object        \n",
+      " 3   Launched     331462 non-null  datetime64[ns]\n",
+      " 4   Deadline     331462 non-null  datetime64[ns]\n",
+      " 5   Goal         331462 non-null  int64         \n",
+      " 6   Pledged      331462 non-null  int64         \n",
+      " 7   Backers      331462 non-null  int64         \n",
+      " 8   State        331462 non-null  object        \n",
+      " 9   Duration     331462 non-null  int64         \n",
+      "dtypes: datetime64[ns](2), int64(4), object(4)\n",
+      "memory usage: 27.8+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Encode categorical features as dummy (binary) variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.get_dummies(df, drop_first=True, columns=df.select_dtypes(include=['object']).columns)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Launched</th>\n",
+       "      <th>Deadline</th>\n",
+       "      <th>Goal</th>\n",
+       "      <th>Pledged</th>\n",
+       "      <th>Backers</th>\n",
+       "      <th>Duration</th>\n",
+       "      <th>Category_Comics</th>\n",
+       "      <th>Category_Crafts</th>\n",
+       "      <th>Category_Dance</th>\n",
+       "      <th>Category_Design</th>\n",
+       "      <th>...</th>\n",
+       "      <th>Country_Netherlands</th>\n",
+       "      <th>Country_New_Zealand</th>\n",
+       "      <th>Country_Norway</th>\n",
+       "      <th>Country_Singapore</th>\n",
+       "      <th>Country_Spain</th>\n",
+       "      <th>Country_Sweden</th>\n",
+       "      <th>Country_Switzerland</th>\n",
+       "      <th>Country_United_Kingdom</th>\n",
+       "      <th>Country_United_States</th>\n",
+       "      <th>State_Successful</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2009-04-21 21:02:48</td>\n",
+       "      <td>2009-05-31</td>\n",
+       "      <td>1000</td>\n",
+       "      <td>625</td>\n",
+       "      <td>30</td>\n",
+       "      <td>39</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2009-04-23 00:07:53</td>\n",
+       "      <td>2009-07-20</td>\n",
+       "      <td>80000</td>\n",
+       "      <td>22</td>\n",
+       "      <td>3</td>\n",
+       "      <td>87</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2009-04-24 21:52:03</td>\n",
+       "      <td>2009-05-03</td>\n",
+       "      <td>20</td>\n",
+       "      <td>35</td>\n",
+       "      <td>3</td>\n",
+       "      <td>8</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2009-04-25 17:36:21</td>\n",
+       "      <td>2009-07-14</td>\n",
+       "      <td>99</td>\n",
+       "      <td>145</td>\n",
+       "      <td>25</td>\n",
+       "      <td>79</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>2009-04-27 14:10:39</td>\n",
+       "      <td>2009-05-26</td>\n",
+       "      <td>1900</td>\n",
+       "      <td>387</td>\n",
+       "      <td>10</td>\n",
+       "      <td>28</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 200 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             Launched   Deadline   Goal  Pledged  Backers  Duration   \n",
+       "0 2009-04-21 21:02:48 2009-05-31   1000      625       30        39  \\\n",
+       "1 2009-04-23 00:07:53 2009-07-20  80000       22        3        87   \n",
+       "2 2009-04-24 21:52:03 2009-05-03     20       35        3         8   \n",
+       "3 2009-04-25 17:36:21 2009-07-14     99      145       25        79   \n",
+       "4 2009-04-27 14:10:39 2009-05-26   1900      387       10        28   \n",
+       "\n",
+       "   Category_Comics  Category_Crafts  Category_Dance  Category_Design  ...   \n",
+       "0            False            False           False            False  ...  \\\n",
+       "1            False            False           False            False  ...   \n",
+       "2            False            False           False            False  ...   \n",
+       "3            False            False           False            False  ...   \n",
+       "4            False            False           False            False  ...   \n",
+       "\n",
+       "   Country_Netherlands  Country_New_Zealand  Country_Norway   \n",
+       "0                False                False           False  \\\n",
+       "1                False                False           False   \n",
+       "2                False                False           False   \n",
+       "3                False                False           False   \n",
+       "4                False                False           False   \n",
+       "\n",
+       "   Country_Singapore  Country_Spain  Country_Sweden  Country_Switzerland   \n",
+       "0              False          False           False                False  \\\n",
+       "1              False          False           False                False   \n",
+       "2              False          False           False                False   \n",
+       "3              False          False           False                False   \n",
+       "4              False          False           False                False   \n",
+       "\n",
+       "   Country_United_Kingdom  Country_United_States  State_Successful  \n",
+       "0                   False                   True             False  \n",
+       "1                   False                   True             False  \n",
+       "2                   False                   True              True  \n",
+       "3                   False                   True              True  \n",
+       "4                   False                   True             False  \n",
+       "\n",
+       "[5 rows x 200 columns]"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(331462, 200)"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Define target & features"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.model_selection import StratifiedKFold, GridSearchCV\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import f1_score, make_scorer\n",
+    "\n",
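+    "# Features: drop the target, the raw datetime columns (their year parts are re-added in the next cell) and Pledged (data leakage)\n",
+    "X = df.drop(columns=['State_Successful', 'Launched', 'Deadline', 'Pledged'])  # NOTE(review): Backers may leak the outcome just like Pledged - confirm\n",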
+    "y = df['State_Successful']  # Target column\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extract features from datetime columns (here only the year to avoid too many features)\n",
+    "X['year_launched'] = df['Launched'].dt.year\n",
+    "# X['month_launched'] = df['Launched'].dt.month\n",
+    "# X['day_launched'] = df['Launched'].dt.day\n",
+    "\n",
+    "X['year_deadline'] = df['Deadline'].dt.year\n",
+    "# X['month_deadline'] = df['Deadline'].dt.month\n",
+    "# X['day_deadline'] = df['Deadline'].dt.day"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Split the data into training and test sets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Scale the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Scale numerical data using StandardScaler:\n",
+    "\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "\n",
+    "# Identify numerical columns (excluding date-time and timedelta types)\n",
+    "numerical_cols = X.select_dtypes(include=['number']).columns.tolist()\n",
+    "\n",
+    "# Initialize StandardScaler\n",
+    "scaler = StandardScaler()\n",
+    "\n",
+    "# Fit the scaler on the numerical columns of the training data and transform them\n",
+    "X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])\n",
+    "\n",
+    "# Use the same scaler to transform the test data\n",
+    "X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Resample the data for imbalanced target classes\n",
+    "Oversampling with SMOTE from the imbalanced-learn library completely crashed the local machine (MacBook Air M2 2022, 8GB, Sonoma 14.5), so it is not an option here. Randomly under-sampling the majority target class is computationally cheaper, but it risks losing meaningful information, so there is no clearly optimal solution yet. Within our limited computational means, random under-sampling of the majority class is the simplest, cheapest and only feasible resampling method for the imbalanced target classes, so we apply it to the training set:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Apply random undersampling to the training set\n",
+    "from imblearn.under_sampling import RandomUnderSampler\n",
+    "undersampler = RandomUnderSampler(random_state=42)\n",
+    "X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Prepare the cross-validation strategy\n",
+    "- Stratified K-fold for better representation of classes\n",
+    "- Saga solver for faster convergence times\n",
+    "- Small maximum number of iterations (`max_iter=100`) because of computational limitations\n",
+    "- Logistic regression with balanced class weights (`class_weight='balanced'`) for additional class balance\n",
+    "- Ridge (L2) and Lasso (L1) regularization for feature selection insights\n",
+    "- Small values of `C` (i.e. stronger regularization) in the grid for more aggressive feature selection, given the high dimensionality of the dummy-encoded data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import StratifiedKFold, GridSearchCV\n",
+    "from sklearn.metrics import make_scorer, f1_score\n",
+    "\n",
+    "# 5-fold stratified CV: each fold keeps the class ratio of the data it is fitted on\n",
+    "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
+    "\n",
+    "# Weighted F1: per-class F1 scores averaged by class support\n",
+    "f1_scorer = make_scorer(f1_score, average='weighted')\n",
+    "\n",
+    "# saga handles both the l1 and the l2 penalty, but it shuffles the data, so it is seeded like the other steps.\n",
+    "# max_iter stays small on purpose (limited compute); fits may stop early with a ConvergenceWarning.\n",
+    "logistic_regression = LogisticRegression(solver='saga', max_iter=100, class_weight='balanced', random_state=42)\n",
+    "\n",
+    "# Grid over the penalty type and the inverse regularization strength C (smaller C = stronger regularization)\n",
+    "param_grid_lr = {\n",
+    "    'penalty': ['l1', 'l2'],\n",
+    "    'C': [0.01, 0.1, 1, 10],\n",
+    "}\n",
+    "\n",
+    "grid_search_lr = GridSearchCV(logistic_regression, param_grid_lr, cv=skf, scoring=f1_scorer, n_jobs=-1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Apply stratified k-fold cross validation gridsearch\n",
+    "Fitted to the resampled training data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 
div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n",
+       "             estimator=LogisticRegression(class_weight=&#x27;balanced&#x27;,\n",
+       "                                          solver=&#x27;saga&#x27;),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={&#x27;C&#x27;: [0.01, 0.1, 1, 10], &#x27;penalty&#x27;: [&#x27;l1&#x27;, &#x27;l2&#x27;]},\n",
+       "             scoring=make_scorer(f1_score, average=weighted))</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" ><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">GridSearchCV</label><div class=\"sk-toggleable__content\"><pre>GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n",
+       "             estimator=LogisticRegression(class_weight=&#x27;balanced&#x27;,\n",
+       "                                          solver=&#x27;saga&#x27;),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={&#x27;C&#x27;: [0.01, 0.1, 1, 10], &#x27;penalty&#x27;: [&#x27;l1&#x27;, &#x27;l2&#x27;]},\n",
+       "             scoring=make_scorer(f1_score, average=weighted))</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">estimator: LogisticRegression</label><div class=\"sk-toggleable__content\"><pre>LogisticRegression(class_weight=&#x27;balanced&#x27;, solver=&#x27;saga&#x27;)</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LogisticRegression</label><div class=\"sk-toggleable__content\"><pre>LogisticRegression(class_weight=&#x27;balanced&#x27;, solver=&#x27;saga&#x27;)</pre></div></div></div></div></div></div></div></div></div></div>"
+      ],
+      "text/plain": [
+       "GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n",
+       "             estimator=LogisticRegression(class_weight='balanced',\n",
+       "                                          solver='saga'),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'C': [0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2']},\n",
+       "             scoring=make_scorer(f1_score, average=weighted))"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "grid_search_lr.fit(X_train_resampled, y_train_resampled)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Best Logistic Regression Model: LogisticRegression(C=0.01, class_weight='balanced', penalty='l1', solver='saga')\n",
+      "Logistic Regression F1 Score: 0.7975915850439066\n",
+      "                      Feature   Coefficient\n",
+      "1                     Backers  1.015463e+01\n",
+      "0                        Goal -2.289651e+00\n",
+      "84         Subcategory_HipHop -9.202204e-01\n",
+      "16           Category_Theater  8.521464e-01\n",
+      "164   Subcategory_Video_Games -7.783138e-01\n",
+      "..                        ...           ...\n",
+      "56          Subcategory_Drama  1.381033e-02\n",
+      "55    Subcategory_Documentary -1.449379e-03\n",
+      "115  Subcategory_Performances  9.369830e-04\n",
+      "74           Subcategory_Food  3.724464e-08\n",
+      "106         Subcategory_Music -3.476722e-09\n",
+      "\n",
+      "[61 rows x 2 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Winner of the grid search, as exposed by GridSearchCV.best_estimator_\n",
+    "best_lr = grid_search_lr.best_estimator_\n",
+    "\n",
+    "# Show the chosen estimator (with its hyperparameters) and its weighted F1 on X_test / y_test\n",
+    "print(\"Best Logistic Regression Model:\", best_lr)\n",
+    "test_f1 = f1_score(y_test, best_lr.predict(X_test), average='weighted')\n",
+    "print(\"Logistic Regression F1 Score:\", test_f1)\n",
+    "\n",
+    "# One row per feature column of X_test, paired with its fitted coefficient\n",
+    "feature_names = X_test.columns\n",
+    "coefficients = best_lr.coef_.flatten()\n",
+    "coeff_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})\n",
+    "\n",
+    "# Keep only features with a non-zero coefficient, ordered by absolute coefficient (largest first)\n",
+    "nonzero_coeffs = coeff_df[coeff_df['Coefficient'] != 0]\n",
+    "by_magnitude = nonzero_coeffs['Coefficient'].abs().sort_values(ascending=False).index\n",
+    "selected_features = nonzero_coeffs.reindex(by_magnitude)\n",
+    "print(selected_features)\n",
+    "\n",
+    "# Best Logistic Regression Model: LogisticRegression(C=0.01, class_weight='balanced', penalty='l1', solver='saga')\n",
+    "# Logistic Regression F1 Score: 0.7975915850439066\n",
+    "# 61 features selected"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Feature Pre-Selection: \n",
+    "Since the categorical features have a large number of unique values and the data set has a six-figure number of rows, we might need to run a more selective lasso (L1-regularized) logistic regression before modelling to keep it computationally feasible. This means decreasing `C`, the inverse regularization strength, so that the penalty is stronger and fewer features are kept:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of selected features: 10\n",
+      "Selected feature indices: [  0   1   2   9  12  15  16 145 164 197]\n",
+      "Selected feature names: ['Goal' 'Backers' 'Duration' 'Category_Food' 'Category_Music'\n",
+      " 'Category_Technology' 'Category_Theater' 'Subcategory_Shorts'\n",
+      " 'Subcategory_Video_Games' 'year_deadline']\n",
+      "Shape of the reduced feature set: (214162, 10)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.feature_selection import SelectFromModel\n",
+    "\n",
+    "# Step 1: L1-penalized logistic regression used for feature selection; C is the inverse regularization strength, so a smaller C regularizes more strongly.\n",
+    "# max_iter is raised because saga stopped at 100 iterations with a ConvergenceWarning; random_state fixes saga's data shuffling so the selection is reproducible.\n",
+    "logreg = LogisticRegression(penalty=\"l1\", solver=\"saga\", max_iter=1000, C=0.001, random_state=42)\n",
+    "\n",
+    "# Step 2: Fit the feature selector; threshold=\"mean\" keeps features whose absolute coefficient is at least the mean absolute coefficient\n",
+    "selector = SelectFromModel(logreg, threshold=\"mean\")\n",
+    "selector.fit(X_train_resampled, y_train_resampled)\n",
+    "\n",
+    "# Step 3: Apply the selector to reduce dimensionality\n",
+    "X_train_reduced = selector.transform(X_train_resampled)\n",
+    "X_test_reduced = selector.transform(X_test)\n",
+    "\n",
+    "# Step 4: Retrieve the indices of the selected features and look their names up in X_test.columns\n",
+    "selected_feature_indices = selector.get_support(indices=True)\n",
+    "selected_feature_names = np.array(X_test.columns)[selected_feature_indices]\n",
+    "\n",
+    "print(\"Number of selected features:\", len(selected_feature_indices))\n",
+    "print(\"Selected feature indices:\", selected_feature_indices)\n",
+    "print(\"Selected feature names:\", selected_feature_names)\n",
+    "print(\"Shape of the reduced feature set:\", X_train_reduced.shape)\n",
+    "\n",
+    "# Recorded with max_iter=100 (saga had not converged) for C=0.0001 and C=0.001 (feature selection with L1 regularization reached saturation):\n",
+    "# Number of selected features: 10\n",
+    "# Selected feature indices: [  0   1   2   9  12  15  16 145 164 197]\n",
+    "# Selected feature names: ['Goal' 'Backers' 'Duration' 'Category_Food' 'Category_Music'\n",
+    "# 'Category_Technology' 'Category_Theater' 'Subcategory_Shorts'\n",
+    "# 'Subcategory_Video_Games' 'year_deadline']\n",
+    "# Shape of the reduced feature set: (214162, 10)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:378: FitFailedWarning: \n",
+      "50 fits failed out of a total of 75.\n",
+      "The score on these train-test partitions for these parameters will be set to nan.\n",
+      "If these failures are not expected, you can try to debug them by setting error_score='raise'.\n",
+      "\n",
+      "Below are more details about the failures:\n",
+      "--------------------------------------------------------------------------------\n",
+      "9 fits failed with the following error:\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n",
+      "    estimator.fit(X_train, y_train, **fit_params)\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1160, in fit\n",
+      "    self._validate_params()\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/base.py\", line 600, in _validate_params\n",
+      "    validate_parameter_constraints(\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py\", line 97, in validate_parameter_constraints\n",
+      "    raise InvalidParameterError(\n",
+      "sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'none' (deprecated), 'elasticnet', 'l2', 'l1'} or None. Got 'None' instead.\n",
+      "\n",
+      "--------------------------------------------------------------------------------\n",
+      "1 fits failed with the following error:\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n",
+      "    estimator.fit(X_train, y_train, **fit_params)\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1160, in fit\n",
+      "    self._validate_params()\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/base.py\", line 600, in _validate_params\n",
+      "    validate_parameter_constraints(\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py\", line 97, in validate_parameter_constraints\n",
+      "    raise InvalidParameterError(\n",
+      "sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'none' (deprecated), 'l2', 'l1', 'elasticnet'} or None. Got 'None' instead.\n",
+      "\n",
+      "--------------------------------------------------------------------------------\n",
+      "25 fits failed with the following error:\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n",
+      "    estimator.fit(X_train, y_train, **fit_params)\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1291, in fit\n",
+      "    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)(\n",
+      "                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/parallel.py\", line 63, in __call__\n",
+      "    return super().__call__(iterable_with_config)\n",
+      "           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/joblib/parallel.py\", line 1918, in __call__\n",
+      "    return output if self.return_generator else list(output)\n",
+      "                                                ^^^^^^^^^^^^\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/joblib/parallel.py\", line 1847, in _get_sequential_output\n",
+      "    res = func(*args, **kwargs)\n",
+      "          ^^^^^^^^^^^^^^^^^^^^^\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/parallel.py\", line 123, in __call__\n",
+      "    return self.function(*args, **kwargs)\n",
+      "           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 521, in _logistic_regression_path\n",
+      "    alpha = (1.0 / C) * (1 - l1_ratio)\n",
+      "                         ~~^~~~~~~~~~\n",
+      "TypeError: unsupported operand type(s) for -: 'int' and 'NoneType'\n",
+      "\n",
+      "--------------------------------------------------------------------------------\n",
+      "3 fits failed with the following error:\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n",
+      "    estimator.fit(X_train, y_train, **fit_params)\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1160, in fit\n",
+      "    self._validate_params()\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/base.py\", line 600, in _validate_params\n",
+      "    validate_parameter_constraints(\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py\", line 97, in validate_parameter_constraints\n",
+      "    raise InvalidParameterError(\n",
+      "sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l1', 'l2', 'elasticnet', 'none' (deprecated)} or None. Got 'None' instead.\n",
+      "\n",
+      "--------------------------------------------------------------------------------\n",
+      "2 fits failed with the following error:\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n",
+      "    estimator.fit(X_train, y_train, **fit_params)\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1160, in fit\n",
+      "    self._validate_params()\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/base.py\", line 600, in _validate_params\n",
+      "    validate_parameter_constraints(\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py\", line 97, in validate_parameter_constraints\n",
+      "    raise InvalidParameterError(\n",
+      "sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l1', 'elasticnet', 'l2', 'none' (deprecated)} or None. Got 'None' instead.\n",
+      "\n",
+      "--------------------------------------------------------------------------------\n",
+      "5 fits failed with the following error:\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n",
+      "    estimator.fit(X_train, y_train, **fit_params)\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1160, in fit\n",
+      "    self._validate_params()\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/base.py\", line 600, in _validate_params\n",
+      "    validate_parameter_constraints(\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py\", line 97, in validate_parameter_constraints\n",
+      "    raise InvalidParameterError(\n",
+      "sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l1', 'none' (deprecated), 'l2', 'elasticnet'} or None. Got 'None' instead.\n",
+      "\n",
+      "--------------------------------------------------------------------------------\n",
+      "5 fits failed with the following error:\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py\", line 686, in _fit_and_score\n",
+      "    estimator.fit(X_train, y_train, **fit_params)\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py\", line 1160, in fit\n",
+      "    self._validate_params()\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/base.py\", line 600, in _validate_params\n",
+      "    validate_parameter_constraints(\n",
+      "  File \"/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py\", line 97, in validate_parameter_constraints\n",
+      "    raise InvalidParameterError(\n",
+      "sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l2', 'none' (deprecated), 'elasticnet', 'l1'} or None. Got 'None' instead.\n",
+      "\n",
+      "  warnings.warn(some_fits_failed_message, FitFailedWarning)\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py:952: UserWarning: One or more of the test scores are non-finite: [0.77022697        nan        nan 0.82605299        nan        nan\n",
+      " 0.84994351        nan        nan 0.85304871        nan        nan\n",
+      " 0.85323814        nan        nan]\n",
+      "  warnings.warn(\n",
+      "/Users/laylanyrabia/neuefische/kickstarter/project_kickstarter/.venv/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<style>#sk-container-id-2 {color: black;background-color: white;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 
div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n",
+       "             estimator=LogisticRegression(class_weight=&#x27;balanced&#x27;,\n",
+       "                                          max_iter=1000, solver=&#x27;saga&#x27;),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={&#x27;C&#x27;: [0.01, 0.1, 1, 10, 100],\n",
+       "                         &#x27;penalty&#x27;: [&#x27;l2&#x27;, &#x27;None&#x27;, &#x27;elasticnet&#x27;]},\n",
+       "             scoring=make_scorer(f1_score, average=weighted))</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">GridSearchCV</label><div class=\"sk-toggleable__content\"><pre>GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n",
+       "             estimator=LogisticRegression(class_weight=&#x27;balanced&#x27;,\n",
+       "                                          max_iter=1000, solver=&#x27;saga&#x27;),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={&#x27;C&#x27;: [0.01, 0.1, 1, 10, 100],\n",
+       "                         &#x27;penalty&#x27;: [&#x27;l2&#x27;, &#x27;None&#x27;, &#x27;elasticnet&#x27;]},\n",
+       "             scoring=make_scorer(f1_score, average=weighted))</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" ><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">estimator: LogisticRegression</label><div class=\"sk-toggleable__content\"><pre>LogisticRegression(class_weight=&#x27;balanced&#x27;, max_iter=1000, solver=&#x27;saga&#x27;)</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-6\" type=\"checkbox\" ><label for=\"sk-estimator-id-6\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LogisticRegression</label><div class=\"sk-toggleable__content\"><pre>LogisticRegression(class_weight=&#x27;balanced&#x27;, max_iter=1000, solver=&#x27;saga&#x27;)</pre></div></div></div></div></div></div></div></div></div></div>"
+      ],
+      "text/plain": [
+       "GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n",
+       "             estimator=LogisticRegression(class_weight='balanced',\n",
+       "                                          max_iter=1000, solver='saga'),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'C': [0.01, 0.1, 1, 10, 100],\n",
+       "                         'penalty': ['l2', 'None', 'elasticnet']},\n",
+       "             scoring=make_scorer(f1_score, average=weighted))"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Logistic regression with class_weight='balanced'; the saga solver accepts every penalty searched below.\n",
+    "logistic_regression = LogisticRegression(solver='saga', max_iter=1000, class_weight='balanced')\n",
+    "# One sub-grid per penalty: None must be the object (the string 'None' is rejected) and elasticnet needs l1_ratio.\n",
+    "param_grid_lr = [\n",
+    "    {'penalty': ['l2'], 'C': [0.01, 0.1, 1, 10, 100]},\n",
+    "    {'penalty': [None]},  # an unpenalized fit ignores C, so it is not crossed with the C values\n",
+    "    {'penalty': ['elasticnet'], 'C': [0.01, 0.1, 1, 10, 100], 'l1_ratio': [0.5]},  # add more l1_ratio values to tune the L1/L2 mix\n",
+    "]\n",
+    "grid_search_lr = GridSearchCV(logistic_regression, param_grid_lr, cv=skf, scoring=f1_scorer, n_jobs=-1)\n",
+    "grid_search_lr.fit(X_test_reduced, y_test)  # NOTE(review): the search is fit on the *test* split - confirm this is intended; tuning normally uses the training split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Best Logistic Regression Model: LogisticRegression(C=100, class_weight='balanced', max_iter=1000, solver='saga')\n",
+      "Logistic Regression F1 Score: 0.8561574779080967\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Retrieve the winning estimator selected by the grid search\n",
+    "best_lr = grid_search_lr.best_estimator_\n",
+    "print(\"Best Logistic Regression Model:\", best_lr)\n",
+    "\n",
+    "# Weighted F1 of that estimator's predictions for X_test_reduced, scored against y_test\n",
+    "lr_predictions = best_lr.predict(X_test_reduced)\n",
+    "print(\"Logistic Regression F1 Score:\", f1_score(y_test, lr_predictions, average='weighted'))\n",
+    "# Output recorded from an earlier run: Best Logistic Regression Model: LogisticRegression(C=100, class_weight='balanced', max_iter=1000, solver='saga')\n",
+    "# Output recorded from an earlier run: Logistic Regression F1 Score: 0.856157477908096"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

	Category	Subcategory	Country	Launched	Deadline	Goal	Pledged	Backers	State	Duration
0	Fashion	Fashion	United_States	2009-04-21 21:02:48	2009-05-31	1000	625	30	Failed	39
1	Film__Video	Shorts	United_States	2009-04-23 00:07:53	2009-07-20	80000	22	3	Failed	87
2	Art	Illustration	United_States	2009-04-24 21:52:03	2009-05-03	20	35	3	Successful	8
3	Technology	Software	United_States	2009-04-25 17:36:21	2009-07-14	99	145	25	Successful	79
4	Fashion	Fashion	United_States	2009-04-27 14:10:39	2009-05-26	1900	387	10	Failed	28