From a69bc06912c43366aa177d0601e3a9da341c3274 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobias=20K=C3=A4mmerling?= <Tobiaskaemmerling86@gmail.com>
Date: Fri, 9 Aug 2024 10:25:41 +0200
Subject: [PATCH 1/3] added xgboost to the requirements

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index f82b4b8..a2fcb07 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,5 @@ jupyterlab-dash==0.1.0a3
 scikit-learn==1.2.2
 statsmodels==0.13.5
 pytest==7.3.1
-import-ipynb
\ No newline at end of file
+xgboost==2.1.1
+import-ipynb

From 0985cf628ec8dcbaa3c8db4af0d82397a1b626b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobias=20K=C3=A4mmerling?= <Tobiaskaemmerling86@gmail.com>
Date: Fri, 9 Aug 2024 12:21:32 +0200
Subject: [PATCH 2/3] Added decision tree and xgboost; tried out grid
 search and the machine learning algorithms to get our first model

---
 base.ipynb | 618 +++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 462 insertions(+), 156 deletions(-)

diff --git a/base.ipynb b/base.ipynb
index 8173696..048ce07 100644
--- a/base.ipynb
+++ b/base.ipynb
@@ -15,7 +15,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -74,7 +74,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -86,6 +86,7 @@
     "    data = pd.read_csv('data/kickstarter_projects.csv')\n",
     "    # transform the categorical values to numerical values\n",
     "\n",
+    "    data = data.drop(\"ID\",axis=1)\n",
     "    data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n",
     "    data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n",
     "    data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n",
@@ -107,18 +108,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 83,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "Series([], Name: State, dtype: int64)"
-      ]
-     },
-     "execution_count": 83,
-     "metadata": {},
-     "output_type": "execute_result"
+     "ename": "KeyError",
+     "evalue": "\"['ID'] not found in axis\"",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[5], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m data, transform_data\u001b[38;5;241m=\u001b[39m \u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m      2\u001b[0m data\u001b[38;5;241m.\u001b[39mdrop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mName\u001b[39m\u001b[38;5;124m\"\u001b[39m, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, inplace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m      3\u001b[0m data\u001b[38;5;241m.\u001b[39mhead(\u001b[38;5;241m20\u001b[39m)\n",
+      "Cell \u001b[1;32mIn[4], line 9\u001b[0m, in \u001b[0;36mget_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m      6\u001b[0m data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/kickstarter_projects.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m      7\u001b[0m \u001b[38;5;66;03m# transform the categorical values to numerical values\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mID\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m     10\u001b[0m data \u001b[38;5;241m=\u001b[39m data[(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSuccessful\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m|\u001b[39m (data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n\u001b[0;32m     11\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mto_datetime(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m],\u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY-\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm-\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n",
+      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\frame.py:5268\u001b[0m, in \u001b[0;36mDataFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m   5120\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdrop\u001b[39m(\n\u001b[0;32m   5121\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m   5122\u001b[0m     labels: IndexLabel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   5129\u001b[0m     errors: IgnoreRaise \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m   5130\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m   5131\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   5132\u001b[0m \u001b[38;5;124;03m    Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[0;32m   5133\u001b[0m \n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   5266\u001b[0m \u001b[38;5;124;03m            weight  1.0     0.8\u001b[39;00m\n\u001b[0;32m   5267\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m-> 5268\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   5269\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5270\u001b[0m \u001b[43m        \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5271\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5272\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5273\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5274\u001b[0m \u001b[43m        \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5275\u001b[0m \u001b[43m        \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5276\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4549\u001b[0m, in \u001b[0;36mNDFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m   4547\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m axis, labels \u001b[38;5;129;01min\u001b[39;00m axes\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m   4548\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 4549\u001b[0m         obj \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_drop_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   4551\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inplace:\n\u001b[0;32m   4552\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_inplace(obj)\n",
+      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4591\u001b[0m, in \u001b[0;36mNDFrame._drop_axis\u001b[1;34m(self, labels, axis, level, errors, only_slice)\u001b[0m\n\u001b[0;32m   4589\u001b[0m         new_axis \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mdrop(labels, level\u001b[38;5;241m=\u001b[39mlevel, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m   4590\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 4591\u001b[0m         new_axis \u001b[38;5;241m=\u001b[39m \u001b[43maxis\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   4592\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mget_indexer(new_axis)\n\u001b[0;32m   4594\u001b[0m \u001b[38;5;66;03m# Case for non-unique axis\u001b[39;00m\n\u001b[0;32m   4595\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
+      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:6696\u001b[0m, in \u001b[0;36mIndex.drop\u001b[1;34m(self, labels, errors)\u001b[0m\n\u001b[0;32m   6694\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mask\u001b[38;5;241m.\u001b[39many():\n\u001b[0;32m   6695\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m-> 6696\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(labels[mask])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not found in axis\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m   6697\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m indexer[\u001b[38;5;241m~\u001b[39mmask]\n\u001b[0;32m   6698\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdelete(indexer)\n",
+      "\u001b[1;31mKeyError\u001b[0m: \"['ID'] not found in axis\""
+     ]
     }
    ],
    "source": [
@@ -127,12 +134,12 @@
     "data.head(20)\n",
     "\n",
     "data = transform_numerical_to_string(data,transform_data)\n",
-    "data[\"State\"].groupby(data[\"State\"]).count()"
+    "data[\"State\"].groupby(data[\"State\"]).count()\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -143,7 +150,7 @@
     "    # read in the data from the csv file\n",
     "    data = pd.read_csv('data/kickstarter_projects.csv')\n",
     "\n",
-    "\n",
+    "    data = data.drop(\"ID\",axis=1)\n",
     "    data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n",
     "    data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n",
     "    data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n",
@@ -162,67 +169,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 85,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>ID</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Category</th>\n",
-       "      <th>Subcategory</th>\n",
-       "      <th>Country</th>\n",
-       "      <th>Launched</th>\n",
-       "      <th>Deadline</th>\n",
-       "      <th>Goal</th>\n",
-       "      <th>Pledged</th>\n",
-       "      <th>Backers</th>\n",
-       "      <th>State</th>\n",
-       "      <th>Duration</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "Empty DataFrame\n",
-       "Columns: [ID, Name, Category, Subcategory, Country, Launched, Deadline, Goal, Pledged, Backers, State, Duration]\n",
-       "Index: []"
-      ]
-     },
-     "execution_count": 85,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "get_original_data().head(20)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 86,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -247,7 +194,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -268,7 +222,7 @@
     "        C : The regularization parameter for the logistic regression\n",
     "\n",
     "    Returns:\n",
-    "        metric : The number of the metric specified in the arguments\n",
+    "        metric_value : The value of the metric specified in the arguments\n",
     "    \"\"\"\n",
     "\n",
     "    # create a logistic regression model\n",
@@ -300,7 +254,58 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "KeyError",
+     "evalue": "\"['ID'] not found in axis\"",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[6], line 3\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[0;32m      2\u001b[0m \u001b[38;5;66;03m# test out the functions\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m data,transform_data \u001b[38;5;241m=\u001b[39m \u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m      4\u001b[0m \u001b[38;5;28mprint\u001b[39m(data\u001b[38;5;241m.\u001b[39mhead())\n\u001b[0;32m      6\u001b[0m y \u001b[38;5;241m=\u001b[39m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m'\u001b[39m]\n",
+      "Cell \u001b[1;32mIn[4], line 9\u001b[0m, in \u001b[0;36mget_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m      6\u001b[0m data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/kickstarter_projects.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m      7\u001b[0m \u001b[38;5;66;03m# transform the categorical values to numerical values\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mID\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m     10\u001b[0m data \u001b[38;5;241m=\u001b[39m data[(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSuccessful\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m|\u001b[39m (data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n\u001b[0;32m     11\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mto_datetime(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m],\u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY-\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm-\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n",
+      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\frame.py:5268\u001b[0m, in \u001b[0;36mDataFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m   5120\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdrop\u001b[39m(\n\u001b[0;32m   5121\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m   5122\u001b[0m     labels: IndexLabel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   5129\u001b[0m     errors: IgnoreRaise \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m   5130\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m   5131\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   5132\u001b[0m \u001b[38;5;124;03m    Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[0;32m   5133\u001b[0m \n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   5266\u001b[0m \u001b[38;5;124;03m            weight  1.0     0.8\u001b[39;00m\n\u001b[0;32m   5267\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m-> 5268\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   5269\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5270\u001b[0m \u001b[43m        \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5271\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5272\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5273\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5274\u001b[0m \u001b[43m        \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5275\u001b[0m \u001b[43m        \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5276\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4549\u001b[0m, in \u001b[0;36mNDFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m   4547\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m axis, labels \u001b[38;5;129;01min\u001b[39;00m axes\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m   4548\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 4549\u001b[0m         obj \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_drop_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   4551\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inplace:\n\u001b[0;32m   4552\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_inplace(obj)\n",
+      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4591\u001b[0m, in \u001b[0;36mNDFrame._drop_axis\u001b[1;34m(self, labels, axis, level, errors, only_slice)\u001b[0m\n\u001b[0;32m   4589\u001b[0m         new_axis \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mdrop(labels, level\u001b[38;5;241m=\u001b[39mlevel, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m   4590\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 4591\u001b[0m         new_axis \u001b[38;5;241m=\u001b[39m \u001b[43maxis\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   4592\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mget_indexer(new_axis)\n\u001b[0;32m   4594\u001b[0m \u001b[38;5;66;03m# Case for non-unique axis\u001b[39;00m\n\u001b[0;32m   4595\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
+      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:6696\u001b[0m, in \u001b[0;36mIndex.drop\u001b[1;34m(self, labels, errors)\u001b[0m\n\u001b[0;32m   6694\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mask\u001b[38;5;241m.\u001b[39many():\n\u001b[0;32m   6695\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m-> 6696\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(labels[mask])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not found in axis\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m   6697\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m indexer[\u001b[38;5;241m~\u001b[39mmask]\n\u001b[0;32m   6698\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdelete(indexer)\n",
+      "\u001b[1;31mKeyError\u001b[0m: \"['ID'] not found in axis\""
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "# test out the functions\n",
+    "data,transform_data = get_data()\n",
+    "print(data.head())\n",
+    "\n",
+    "y = data['State']\n",
+    "X = data.drop(['State','Launched','Deadline'], axis=1)\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n",
+    "X_train.shape, X_test.shape, y_train.shape, y_test.shape\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#logistic_regression(X_train,X_test,y_train,y_test,metric=\"f1\",verbose=True,norm=\"l1\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -351,7 +356,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -379,7 +384,7 @@
        "((265169, 9), (66293, 9), (265169,), (66293,))"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -399,63 +404,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
-      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
-      "\n",
-      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
-      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
-      "Please also refer to the documentation for alternative solver options:\n",
-      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
-      "  n_iter_i = _check_optimize_result(\n",
-      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
-      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
-      "\n",
-      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
-      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
-      "Please also refer to the documentation for alternative solver options:\n",
-      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
-      "  n_iter_i = _check_optimize_result(\n",
-      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
-      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
-      "\n",
-      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
-      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
-      "Please also refer to the documentation for alternative solver options:\n",
-      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
-      "  n_iter_i = _check_optimize_result(\n",
-      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
-      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
-      "\n",
-      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
-      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
-      "Please also refer to the documentation for alternative solver options:\n",
-      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
-      "  n_iter_i = _check_optimize_result(\n",
-      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
-      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
-      "\n",
-      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
-      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
-      "Please also refer to the documentation for alternative solver options:\n",
-      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
-      "  n_iter_i = _check_optimize_result(\n",
-      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
-      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
-      "\n",
-      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
-      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
-      "Please also refer to the documentation for alternative solver options:\n",
-      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
-      "  n_iter_i = _check_optimize_result(\n"
-     ]
-    },
     {
      "data": {
       "text/html": [
@@ -478,49 +429,49 @@
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>Parameters</th>\n",
-       "      <th>accuracy</th>\n",
+       "      <th>f1</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>{'C': 10.0, 'max_iter': 1000}</td>\n",
-       "      <td>0.999110</td>\n",
+       "      <td>{'eta': 0.3, 'max_depth': 3}</td>\n",
+       "      <td>0.999261</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>{'C': 0.1, 'max_iter': 1000}</td>\n",
-       "      <td>0.999095</td>\n",
+       "      <td>{'eta': 0.3, 'max_depth': 6}</td>\n",
+       "      <td>0.999261</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>{'C': 1.0, 'max_iter': 1000}</td>\n",
-       "      <td>0.999095</td>\n",
+       "      <td>{'eta': 0.3, 'max_depth': 12}</td>\n",
+       "      <td>0.999261</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>{'C': 10.0, 'max_iter': 100}</td>\n",
-       "      <td>0.998989</td>\n",
+       "      <td>{'eta': 0.3, 'max_depth': 24}</td>\n",
+       "      <td>0.999261</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>{'C': 0.1, 'max_iter': 100}</td>\n",
-       "      <td>0.998884</td>\n",
+       "      <td>{'eta': 1, 'max_depth': 3}</td>\n",
+       "      <td>0.999125</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                      Parameters  accuracy\n",
-       "0  {'C': 10.0, 'max_iter': 1000}  0.999110\n",
-       "1   {'C': 0.1, 'max_iter': 1000}  0.999095\n",
-       "2   {'C': 1.0, 'max_iter': 1000}  0.999095\n",
-       "3   {'C': 10.0, 'max_iter': 100}  0.998989\n",
-       "4    {'C': 0.1, 'max_iter': 100}  0.998884"
+       "                      Parameters        f1\n",
+       "0   {'eta': 0.3, 'max_depth': 3}  0.999261\n",
+       "1   {'eta': 0.3, 'max_depth': 6}  0.999261\n",
+       "2  {'eta': 0.3, 'max_depth': 12}  0.999261\n",
+       "3  {'eta': 0.3, 'max_depth': 24}  0.999261\n",
+       "4     {'eta': 1, 'max_depth': 3}  0.999125"
       ]
      },
-     "execution_count": 24,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -530,11 +481,366 @@
     "#\n",
     "# warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n",
     "\n",
-    "hyperparameters = {\"C\":[0.1,1.0,10.0],\"max_iter\":[10,100,1000]}\n",
-    "results = grid_search(logistic_regression,hyperparameters,\"accuracy\",X_train,X_test,y_train,y_test)\n",
-    "\n",
+    "hyperparameters = {\"eta\":[0.1,0.3,1],\"max_depth\":[3,6,12,24]}\n",
+    "results = grid_search(xgb,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n",
     "results.head()\n"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Parameters</th>\n",
+       "      <th>f1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>{'C': 0.1, 'max_iter': 100, 'verbose': 'True'}</td>\n",
+       "      <td>0.651894</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>{'C': 0.1, 'max_iter': 500, 'verbose': 'True'}</td>\n",
+       "      <td>0.651894</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>{'C': 0.1, 'max_iter': 1000, 'verbose': 'True'}</td>\n",
+       "      <td>0.651894</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>{'C': 1, 'max_iter': 100, 'verbose': 'True'}</td>\n",
+       "      <td>0.651894</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>{'C': 1, 'max_iter': 500, 'verbose': 'True'}</td>\n",
+       "      <td>0.651894</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                        Parameters        f1\n",
+       "0   {'C': 0.1, 'max_iter': 100, 'verbose': 'True'}  0.651894\n",
+       "1   {'C': 0.1, 'max_iter': 500, 'verbose': 'True'}  0.651894\n",
+       "2  {'C': 0.1, 'max_iter': 1000, 'verbose': 'True'}  0.651894\n",
+       "3     {'C': 1, 'max_iter': 100, 'verbose': 'True'}  0.651894\n",
+       "4     {'C': 1, 'max_iter': 500, 'verbose': 'True'}  0.651894"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "hyperparameters = {\"C\":[0.1,1,10],\"max_iter\":[100,500,1000]}\n",
+    "results = grid_search(logistic_regression,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n",
+    "results.head()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model Coefficients: [[-9.01102837e-11 -1.12392780e-06 -9.64314847e-10 -1.74546747e-09\n",
+      "  -4.85736444e-10 -1.49496674e-05  1.96343898e-05  2.60577944e-07\n",
+      "  -7.16729454e-09]]\n",
+      "Model Intercept: [-8.14858205e-11]\n",
+      "Model Score: [0 0 0 ... 0 0 0]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0.2483796371690063"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "logistic_regression(X_train,X_test,y_train,y_test,metric=\"f1\",verbose=True,C=0.1,max_iter=100)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ID</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Category</th>\n",
+       "      <th>Subcategory</th>\n",
+       "      <th>Country</th>\n",
+       "      <th>Goal</th>\n",
+       "      <th>Pledged</th>\n",
+       "      <th>Backers</th>\n",
+       "      <th>Duration</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>164774</th>\n",
+       "      <td>1912850706</td>\n",
+       "      <td>16071</td>\n",
+       "      <td>3</td>\n",
+       "      <td>99</td>\n",
+       "      <td>21</td>\n",
+       "      <td>3500</td>\n",
+       "      <td>3501</td>\n",
+       "      <td>19</td>\n",
+       "      <td>29</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>74178</th>\n",
+       "      <td>1913882316</td>\n",
+       "      <td>208878</td>\n",
+       "      <td>10</td>\n",
+       "      <td>90</td>\n",
+       "      <td>20</td>\n",
+       "      <td>320</td>\n",
+       "      <td>567</td>\n",
+       "      <td>27</td>\n",
+       "      <td>29</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>296198</th>\n",
+       "      <td>687737038</td>\n",
+       "      <td>135398</td>\n",
+       "      <td>13</td>\n",
+       "      <td>138</td>\n",
+       "      <td>21</td>\n",
+       "      <td>250000</td>\n",
+       "      <td>275</td>\n",
+       "      <td>2</td>\n",
+       "      <td>29</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>92665</th>\n",
+       "      <td>1614457395</td>\n",
+       "      <td>208763</td>\n",
+       "      <td>6</td>\n",
+       "      <td>129</td>\n",
+       "      <td>21</td>\n",
+       "      <td>7000</td>\n",
+       "      <td>528</td>\n",
+       "      <td>23</td>\n",
+       "      <td>38</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>191647</th>\n",
+       "      <td>906726512</td>\n",
+       "      <td>300984</td>\n",
+       "      <td>12</td>\n",
+       "      <td>95</td>\n",
+       "      <td>21</td>\n",
+       "      <td>2000</td>\n",
+       "      <td>80</td>\n",
+       "      <td>3</td>\n",
+       "      <td>14</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                ID    Name  Category  Subcategory  Country    Goal  Pledged   \n",
+       "164774  1912850706   16071         3           99       21    3500     3501  \\\n",
+       "74178   1913882316  208878        10           90       20     320      567   \n",
+       "296198   687737038  135398        13          138       21  250000      275   \n",
+       "92665   1614457395  208763         6          129       21    7000      528   \n",
+       "191647   906726512  300984        12           95       21    2000       80   \n",
+       "\n",
+       "        Backers  Duration  \n",
+       "164774       19        29  \n",
+       "74178        27        29  \n",
+       "296198        2        29  \n",
+       "92665        23        38  \n",
+       "191647        3        14  "
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_train.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def decision_tree(X_train,X_test,y_train,y_test,metric=\"accuracy\",verbose=False):\n",
+    "    \"\"\"\n",
+    "    This function trains a decision tree classifier and returns the chosen metric on the test data\n",
+    "    Necessary Arguments:\n",
+    "        X_train : The training data\n",
+    "        X_test : The test data\n",
+    "        y_train : The target values for the training data\n",
+    "        y_test : The target values for the test data\n",
+    "\n",
+    "    Optional Arguments:\n",
+    "        metric : The metric to calculate the model performance, Options: \"accuracy\", \"precision\", \"recall\", \"f1\"\n",
+    "        verbose : If True, the function prints the predictions and plots the confusion matrix\n",
+    "\n",
+    "    Returns:\n",
+    "        metric_value : The value of the metric specified in the arguments (0 for an unknown metric name)\n",
+    "    \"\"\"\n",
+    "    from sklearn.tree import DecisionTreeClassifier\n",
+    "    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
+    "\n",
+    "    # create a decision tree model\n",
+    "    model = DecisionTreeClassifier()\n",
+    "\n",
+    "    # fit the model to the training data\n",
+    "    model.fit(X_train, y_train)\n",
+    "\n",
+    "    # predict the target values for the test data\n",
+    "    y_pred = model.predict(X_test)\n",
+    "    # labels come from the fitted model; the notebook-level `data` DataFrame has no target_names attribute\n",
+    "    if verbose==True:\n",
+    "        print(f\"Model Score: {y_pred}\")\n",
+    "        cm = confusion_matrix(y_test, y_pred, labels=model.classes_)\n",
+    "        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)\n",
+    "        disp.plot()\n",
+    "\n",
+    "    metric_value = 0\n",
+    "    # calculate the metric of the model (the options are mutually exclusive)\n",
+    "    if metric == \"accuracy\":\n",
+    "        metric_value = accuracy_score(y_test, y_pred)\n",
+    "    elif metric == \"precision\":\n",
+    "        metric_value = precision_score(y_test, y_pred)\n",
+    "    elif metric == \"recall\":\n",
+    "        metric_value = recall_score(y_test, y_pred)\n",
+    "    elif metric == \"f1\":\n",
+    "        metric_value = f1_score(y_test, y_pred)\n",
+    "    return metric_value"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def xgb(X_train,X_test,y_train,y_test,metric=\"accuracy\",booster=\"gbtree\",eta=0.3,max_depth=6, reg_lambda=1, verbose=False):\n",
+    "    \"\"\"\n",
+    "    This function trains an xgboost classifier and returns the chosen metric on the test data\n",
+    "    Necessary Arguments:\n",
+    "        X_train : The training data\n",
+    "        X_test : The test data\n",
+    "        y_train : The target values for the training data\n",
+    "        y_test : The target values for the test data\n",
+    "\n",
+    "    Optional Arguments:\n",
+    "        metric : The metric to calculate the model performance, Options: \"accuracy\", \"precision\", \"recall\", \"f1\"\n",
+    "        verbose : If True, the function prints the predictions and plots the confusion matrix\n",
+    "        booster : The type of booster to use, Options: \"gbtree\", \"gblinear\", \"dart\"\n",
+    "        eta : The learning rate of the model, between [0,1]\n",
+    "        max_depth : The maximum depth of the trees, default is 6 to avoid overfitting\n",
+    "        reg_lambda : The L2 regularization term on the weights\n",
+    "\n",
+    "    Returns:\n",
+    "        metric_value : The value of the metric specified in the arguments (0 for an unknown metric name)\n",
+    "    \"\"\"\n",
+    "    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
+    "    from xgboost import XGBClassifier\n",
+    "    # create an xgboost model; forward every hyperparameter so the grid search really varies them\n",
+    "    model = XGBClassifier(booster=booster,eta=eta,max_depth=max_depth,reg_lambda=reg_lambda)\n",
+    "\n",
+    "    # fit the model to the training data\n",
+    "    model.fit(X_train, y_train)\n",
+    "\n",
+    "    # predict the target values for the test data\n",
+    "    y_pred = model.predict(X_test)\n",
+    "    # labels come from the fitted model; the notebook-level `data` DataFrame has no target_names attribute\n",
+    "    if verbose==True:\n",
+    "        print(f\"Model Score: {y_pred}\")\n",
+    "        cm = confusion_matrix(y_test, y_pred, labels=model.classes_)\n",
+    "        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)\n",
+    "        disp.plot()\n",
+    "\n",
+    "\n",
+    "    metric_value = 0\n",
+    "    # calculate the metric of the model (the options are mutually exclusive)\n",
+    "    if metric == \"accuracy\":\n",
+    "        metric_value = accuracy_score(y_test, y_pred)\n",
+    "    elif metric == \"precision\":\n",
+    "        metric_value = precision_score(y_test, y_pred)\n",
+    "    elif metric == \"recall\":\n",
+    "        metric_value = recall_score(y_test, y_pred)\n",
+    "    elif metric == \"f1\":\n",
+    "        metric_value = f1_score(y_test, y_pred)\n",
+    "    return metric_value"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

From 470f1359bbfe1a55da3828bb3abf9c024ca28869 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobias=20K=C3=A4mmerling?= <Tobiaskaemmerling86@gmail.com>
Date: Fri, 9 Aug 2024 13:49:29 +0200
Subject: [PATCH 3/3] Added the code to start from importing the data to the
 final prediction into the test.ipynb to look for errors, didn't find any

---
 base.ipynb | 507 +++++++++++++++++++++++++------------------------
 test.ipynb | 541 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 791 insertions(+), 257 deletions(-)

diff --git a/base.ipynb b/base.ipynb
index 048ce07..e43207d 100644
--- a/base.ipynb
+++ b/base.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -10,7 +10,8 @@
     "import numpy as np\n",
     "from sklearn.preprocessing import LabelEncoder\n",
     "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n",
-    "from sklearn.linear_model import LogisticRegression"
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay"
    ]
   },
   {
@@ -74,7 +75,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -86,7 +87,7 @@
     "    data = pd.read_csv('data/kickstarter_projects.csv')\n",
     "    # transform the categorical values to numerical values\n",
     "\n",
-    "    data = data.drop(\"ID\",axis=1)\n",
+    "    data = data.drop([\"ID\",\"Name\"],axis=1)\n",
     "    data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n",
     "    data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n",
     "    data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n",
@@ -110,37 +111,6 @@
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "KeyError",
-     "evalue": "\"['ID'] not found in axis\"",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
-      "Cell \u001b[1;32mIn[5], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m data, transform_data\u001b[38;5;241m=\u001b[39m \u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m      2\u001b[0m data\u001b[38;5;241m.\u001b[39mdrop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mName\u001b[39m\u001b[38;5;124m\"\u001b[39m, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, inplace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m      3\u001b[0m data\u001b[38;5;241m.\u001b[39mhead(\u001b[38;5;241m20\u001b[39m)\n",
-      "Cell \u001b[1;32mIn[4], line 9\u001b[0m, in \u001b[0;36mget_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m      6\u001b[0m data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/kickstarter_projects.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m      7\u001b[0m \u001b[38;5;66;03m# transform the categorical values to numerical values\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mID\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m     10\u001b[0m data \u001b[38;5;241m=\u001b[39m data[(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSuccessful\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m|\u001b[39m (data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n\u001b[0;32m     11\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mto_datetime(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m],\u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY-\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm-\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n",
-      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\frame.py:5268\u001b[0m, in \u001b[0;36mDataFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m   5120\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdrop\u001b[39m(\n\u001b[0;32m   5121\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m   5122\u001b[0m     labels: IndexLabel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   5129\u001b[0m     errors: IgnoreRaise \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m   5130\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m   5131\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   5132\u001b[0m \u001b[38;5;124;03m    Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[0;32m   5133\u001b[0m \n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   5266\u001b[0m \u001b[38;5;124;03m            weight  1.0     0.8\u001b[39;00m\n\u001b[0;32m   5267\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m-> 5268\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   5269\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5270\u001b[0m \u001b[43m        \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5271\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5272\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5273\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5274\u001b[0m \u001b[43m        \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5275\u001b[0m \u001b[43m        \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5276\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4549\u001b[0m, in \u001b[0;36mNDFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m   4547\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m axis, labels \u001b[38;5;129;01min\u001b[39;00m axes\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m   4548\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 4549\u001b[0m         obj \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_drop_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   4551\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inplace:\n\u001b[0;32m   4552\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_inplace(obj)\n",
-      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4591\u001b[0m, in \u001b[0;36mNDFrame._drop_axis\u001b[1;34m(self, labels, axis, level, errors, only_slice)\u001b[0m\n\u001b[0;32m   4589\u001b[0m         new_axis \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mdrop(labels, level\u001b[38;5;241m=\u001b[39mlevel, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m   4590\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 4591\u001b[0m         new_axis \u001b[38;5;241m=\u001b[39m \u001b[43maxis\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   4592\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mget_indexer(new_axis)\n\u001b[0;32m   4594\u001b[0m \u001b[38;5;66;03m# Case for non-unique axis\u001b[39;00m\n\u001b[0;32m   4595\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
-      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:6696\u001b[0m, in \u001b[0;36mIndex.drop\u001b[1;34m(self, labels, errors)\u001b[0m\n\u001b[0;32m   6694\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mask\u001b[38;5;241m.\u001b[39many():\n\u001b[0;32m   6695\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m-> 6696\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(labels[mask])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not found in axis\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m   6697\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m indexer[\u001b[38;5;241m~\u001b[39mmask]\n\u001b[0;32m   6698\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdelete(indexer)\n",
-      "\u001b[1;31mKeyError\u001b[0m: \"['ID'] not found in axis\""
-     ]
-    }
-   ],
-   "source": [
-    "data, transform_data= get_data()\n",
-    "data.drop(\"Name\", axis=1, inplace=True)\n",
-    "data.head(20)\n",
-    "\n",
-    "data = transform_numerical_to_string(data,transform_data)\n",
-    "data[\"State\"].groupby(data[\"State\"]).count()\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
    "outputs": [],
    "source": [
     "def get_original_data():\n",
@@ -150,7 +120,7 @@
     "    # read in the data from the csv file\n",
     "    data = pd.read_csv('data/kickstarter_projects.csv')\n",
     "\n",
-    "    data = data.drop(\"ID\",axis=1)\n",
+    "    data = data.drop([\"ID\",\"Name\"],axis=1)\n",
     "    data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n",
     "    data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n",
     "    data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n",
@@ -169,7 +139,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -201,11 +171,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
     "def logistic_regression(X_train,X_test,y_train,y_test,metric=\"accuracy\",verbose=False,norm=\"l2\",max_iter=1000,C=1.0):\n",
+    "    from sklearn.metrics import confusion_matrix\n",
     "    \"\"\"\n",
     "    This function performs logistic regression on the data and returns the accuracy of the model\n",
     "    Necessary Arguments:\n",
@@ -238,6 +209,8 @@
     "        print(f\"Model Coefficients: {model.coef_}\")\n",
     "        print(f\"Model Intercept: {model.intercept_}\")\n",
     "        print(f\"Model Score: {y_pred}\")\n",
+    "        print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')\n",
+    "\n",
     "\n",
     "    metric_value = 0\n",
     "    # calculate the metric of the model\n",
@@ -254,31 +227,37 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
-     "ename": "KeyError",
-     "evalue": "\"['ID'] not found in axis\"",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
-      "Cell \u001b[1;32mIn[6], line 3\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[0;32m      2\u001b[0m \u001b[38;5;66;03m# test out the functions\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m data,transform_data \u001b[38;5;241m=\u001b[39m \u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m      4\u001b[0m \u001b[38;5;28mprint\u001b[39m(data\u001b[38;5;241m.\u001b[39mhead())\n\u001b[0;32m      6\u001b[0m y \u001b[38;5;241m=\u001b[39m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m'\u001b[39m]\n",
-      "Cell \u001b[1;32mIn[4], line 9\u001b[0m, in \u001b[0;36mget_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m      6\u001b[0m data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/kickstarter_projects.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m      7\u001b[0m \u001b[38;5;66;03m# transform the categorical values to numerical values\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mID\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m     10\u001b[0m data \u001b[38;5;241m=\u001b[39m data[(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSuccessful\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m|\u001b[39m (data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n\u001b[0;32m     11\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mto_datetime(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m],\u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY-\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm-\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n",
-      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\frame.py:5268\u001b[0m, in \u001b[0;36mDataFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m   5120\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdrop\u001b[39m(\n\u001b[0;32m   5121\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m   5122\u001b[0m     labels: IndexLabel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   5129\u001b[0m     errors: IgnoreRaise \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m   5130\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m   5131\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   5132\u001b[0m \u001b[38;5;124;03m    Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[0;32m   5133\u001b[0m \n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   5266\u001b[0m \u001b[38;5;124;03m            weight  1.0     0.8\u001b[39;00m\n\u001b[0;32m   5267\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m-> 5268\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   5269\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5270\u001b[0m \u001b[43m        \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5271\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5272\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5273\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5274\u001b[0m \u001b[43m        \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5275\u001b[0m \u001b[43m        \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   5276\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4549\u001b[0m, in \u001b[0;36mNDFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m   4547\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m axis, labels \u001b[38;5;129;01min\u001b[39;00m axes\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m   4548\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 4549\u001b[0m         obj \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_drop_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   4551\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inplace:\n\u001b[0;32m   4552\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_inplace(obj)\n",
-      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4591\u001b[0m, in \u001b[0;36mNDFrame._drop_axis\u001b[1;34m(self, labels, axis, level, errors, only_slice)\u001b[0m\n\u001b[0;32m   4589\u001b[0m         new_axis \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mdrop(labels, level\u001b[38;5;241m=\u001b[39mlevel, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m   4590\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 4591\u001b[0m         new_axis \u001b[38;5;241m=\u001b[39m \u001b[43maxis\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   4592\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mget_indexer(new_axis)\n\u001b[0;32m   4594\u001b[0m \u001b[38;5;66;03m# Case for non-unique axis\u001b[39;00m\n\u001b[0;32m   4595\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
-      "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:6696\u001b[0m, in \u001b[0;36mIndex.drop\u001b[1;34m(self, labels, errors)\u001b[0m\n\u001b[0;32m   6694\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mask\u001b[38;5;241m.\u001b[39many():\n\u001b[0;32m   6695\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m-> 6696\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(labels[mask])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not found in axis\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m   6697\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m indexer[\u001b[38;5;241m~\u001b[39mmask]\n\u001b[0;32m   6698\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdelete(indexer)\n",
-      "\u001b[1;31mKeyError\u001b[0m: \"['ID'] not found in axis\""
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   Category  Subcategory  Country            Launched   Deadline   Goal   \n",
+      "0         5           52       21 2009-04-21 21:02:48 2009-05-31   1000  \\\n",
+      "1         6          129       21 2009-04-23 00:07:53 2009-07-20  80000   \n",
+      "2         0           70       21 2009-04-24 21:52:03 2009-05-03     20   \n",
+      "3        13          131       21 2009-04-25 17:36:21 2009-07-14     99   \n",
+      "4         5           52       21 2009-04-27 14:10:39 2009-05-26   1900   \n",
+      "\n",
+      "   Pledged  Backers  State  Duration  \n",
+      "0      625       30      0        39  \n",
+      "1       22        3      0        87  \n",
+      "2       35        3      1         8  \n",
+      "3      145       25      1        79  \n",
+      "4      387       10      0        28  \n"
      ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "((265169, 7), (66293, 7), (265169,), (66293,))"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -291,21 +270,12 @@
     "X = data.drop(['State','Launched','Deadline'], axis=1)\n",
     "\n",
     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n",
-    "X_train.shape, X_test.shape, y_train.shape, y_test.shape\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#logistic_regression(X_train,X_test,y_train,y_test,metric=\"f1\",verbose=True,norm=\"l1\")"
+    "X_train.shape, X_test.shape, y_train.shape, y_test.shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -348,63 +318,12 @@
     "\n",
     "    return results\n",
     "\n",
-    "\n",
-    "\n",
-    "\n",
     "    "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "           ID    Name  Category  Subcategory  Country            Launched   \n",
-      "0  1860890148  115746         5           52       21 2009-04-21 21:02:48  \\\n",
-      "1   709707365   56298         6          129       21 2009-04-23 00:07:53   \n",
-      "2  1703704063  323842         0           70       21 2009-04-24 21:52:03   \n",
-      "3      727286  192565        13          131       21 2009-04-25 17:36:21   \n",
-      "4  1622952265  200031         5           52       21 2009-04-27 14:10:39   \n",
-      "\n",
-      "    Deadline   Goal  Pledged  Backers  State  Duration  \n",
-      "0 2009-05-31   1000      625       30      0        39  \n",
-      "1 2009-07-20  80000       22        3      0        87  \n",
-      "2 2009-05-03     20       35        3      1         8  \n",
-      "3 2009-07-14     99      145       25      1        79  \n",
-      "4 2009-05-26   1900      387       10      0        28  \n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "((265169, 9), (66293, 9), (265169,), (66293,))"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from sklearn.model_selection import train_test_split\n",
-    "# test out the functions\n",
-    "data,transform_data = get_data()\n",
-    "print(data.head())\n",
-    "\n",
-    "y = data['State']\n",
-    "X = data.drop(['State','Launched','Deadline'], axis=1)\n",
-    "\n",
-    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n",
-    "X_train.shape, X_test.shape, y_train.shape, y_test.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -483,12 +402,12 @@
     "\n",
     "hyperparameters = {\"eta\":[0.1,0.3,1],\"max_depth\":[3,6,12,24]}\n",
     "results = grid_search(xgb,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n",
-    "results.head()\n"
+    "results.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -512,97 +431,123 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>Parameters</th>\n",
-       "      <th>f1</th>\n",
+       "      <th>Category</th>\n",
+       "      <th>Subcategory</th>\n",
+       "      <th>Country</th>\n",
+       "      <th>Goal</th>\n",
+       "      <th>Pledged</th>\n",
+       "      <th>Backers</th>\n",
+       "      <th>Duration</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>{'C': 0.1, 'max_iter': 100, 'verbose': 'True'}</td>\n",
-       "      <td>0.651894</td>\n",
+       "      <th>164774</th>\n",
+       "      <td>3</td>\n",
+       "      <td>99</td>\n",
+       "      <td>21</td>\n",
+       "      <td>3500</td>\n",
+       "      <td>3501</td>\n",
+       "      <td>19</td>\n",
+       "      <td>29</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>{'C': 0.1, 'max_iter': 500, 'verbose': 'True'}</td>\n",
-       "      <td>0.651894</td>\n",
+       "      <th>74178</th>\n",
+       "      <td>10</td>\n",
+       "      <td>90</td>\n",
+       "      <td>20</td>\n",
+       "      <td>320</td>\n",
+       "      <td>567</td>\n",
+       "      <td>27</td>\n",
+       "      <td>29</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>{'C': 0.1, 'max_iter': 1000, 'verbose': 'True'}</td>\n",
-       "      <td>0.651894</td>\n",
+       "      <th>296198</th>\n",
+       "      <td>13</td>\n",
+       "      <td>138</td>\n",
+       "      <td>21</td>\n",
+       "      <td>250000</td>\n",
+       "      <td>275</td>\n",
+       "      <td>2</td>\n",
+       "      <td>29</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>{'C': 1, 'max_iter': 100, 'verbose': 'True'}</td>\n",
-       "      <td>0.651894</td>\n",
+       "      <th>92665</th>\n",
+       "      <td>6</td>\n",
+       "      <td>129</td>\n",
+       "      <td>21</td>\n",
+       "      <td>7000</td>\n",
+       "      <td>528</td>\n",
+       "      <td>23</td>\n",
+       "      <td>38</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>{'C': 1, 'max_iter': 500, 'verbose': 'True'}</td>\n",
-       "      <td>0.651894</td>\n",
+       "      <th>191647</th>\n",
+       "      <td>12</td>\n",
+       "      <td>95</td>\n",
+       "      <td>21</td>\n",
+       "      <td>2000</td>\n",
+       "      <td>80</td>\n",
+       "      <td>3</td>\n",
+       "      <td>14</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                        Parameters        f1\n",
-       "0   {'C': 0.1, 'max_iter': 100, 'verbose': 'True'}  0.651894\n",
-       "1   {'C': 0.1, 'max_iter': 500, 'verbose': 'True'}  0.651894\n",
-       "2  {'C': 0.1, 'max_iter': 1000, 'verbose': 'True'}  0.651894\n",
-       "3     {'C': 1, 'max_iter': 100, 'verbose': 'True'}  0.651894\n",
-       "4     {'C': 1, 'max_iter': 500, 'verbose': 'True'}  0.651894"
+       "        Category  Subcategory  Country    Goal  Pledged  Backers  Duration\n",
+       "164774         3           99       21    3500     3501       19        29\n",
+       "74178         10           90       20     320      567       27        29\n",
+       "296198        13          138       21  250000      275        2        29\n",
+       "92665          6          129       21    7000      528       23        38\n",
+       "191647        12           95       21    2000       80        3        14"
       ]
      },
-     "execution_count": 25,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "hyperparameters = {\"C\":[0.1,1,10],\"max_iter\":[100,500,1000]}\n",
-    "results = grid_search(logistic_regression,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n",
-    "results.head()\n",
-    "\n"
+    "X_train.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Model Coefficients: [[-9.01102837e-11 -1.12392780e-06 -9.64314847e-10 -1.74546747e-09\n",
-      "  -4.85736444e-10 -1.49496674e-05  1.96343898e-05  2.60577944e-07\n",
-      "  -7.16729454e-09]]\n",
-      "Model Intercept: [-8.14858205e-11]\n",
-      "Model Score: [0 0 0 ... 0 0 0]\n"
+      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
+      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
+      "\n",
+      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
+      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
+      "Please also refer to the documentation for alternative solver options:\n",
+      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
+      "  n_iter_i = _check_optimize_result(\n",
+      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
+      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
+      "\n",
+      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
+      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
+      "Please also refer to the documentation for alternative solver options:\n",
+      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
+      "  n_iter_i = _check_optimize_result(\n",
+      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
+      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
+      "\n",
+      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
+      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
+      "Please also refer to the documentation for alternative solver options:\n",
+      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
+      "  n_iter_i = _check_optimize_result(\n"
      ]
     },
-    {
-     "data": {
-      "text/plain": [
-       "0.2483796371690063"
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "logistic_regression(X_train,X_test,y_train,y_test,metric=\"f1\",verbose=True,C=0.1,max_iter=100)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {},
-   "outputs": [
     {
      "data": {
       "text/html": [
@@ -624,105 +569,90 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>ID</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Category</th>\n",
-       "      <th>Subcategory</th>\n",
-       "      <th>Country</th>\n",
-       "      <th>Goal</th>\n",
-       "      <th>Pledged</th>\n",
-       "      <th>Backers</th>\n",
-       "      <th>Duration</th>\n",
+       "      <th>Parameters</th>\n",
+       "      <th>f1</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>164774</th>\n",
-       "      <td>1912850706</td>\n",
-       "      <td>16071</td>\n",
-       "      <td>3</td>\n",
-       "      <td>99</td>\n",
-       "      <td>21</td>\n",
-       "      <td>3500</td>\n",
-       "      <td>3501</td>\n",
-       "      <td>19</td>\n",
-       "      <td>29</td>\n",
+       "      <th>0</th>\n",
+       "      <td>{'C': 0.1, 'max_iter': 100}</td>\n",
+       "      <td>0.999155</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>74178</th>\n",
-       "      <td>1913882316</td>\n",
-       "      <td>208878</td>\n",
-       "      <td>10</td>\n",
-       "      <td>90</td>\n",
-       "      <td>20</td>\n",
-       "      <td>320</td>\n",
-       "      <td>567</td>\n",
-       "      <td>27</td>\n",
-       "      <td>29</td>\n",
+       "      <th>1</th>\n",
+       "      <td>{'C': 0.1, 'max_iter': 500}</td>\n",
+       "      <td>0.999155</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>296198</th>\n",
-       "      <td>687737038</td>\n",
-       "      <td>135398</td>\n",
-       "      <td>13</td>\n",
-       "      <td>138</td>\n",
-       "      <td>21</td>\n",
-       "      <td>250000</td>\n",
-       "      <td>275</td>\n",
-       "      <td>2</td>\n",
-       "      <td>29</td>\n",
+       "      <th>2</th>\n",
+       "      <td>{'C': 0.1, 'max_iter': 1000}</td>\n",
+       "      <td>0.999155</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>92665</th>\n",
-       "      <td>1614457395</td>\n",
-       "      <td>208763</td>\n",
-       "      <td>6</td>\n",
-       "      <td>129</td>\n",
-       "      <td>21</td>\n",
-       "      <td>7000</td>\n",
-       "      <td>528</td>\n",
-       "      <td>23</td>\n",
-       "      <td>38</td>\n",
+       "      <th>3</th>\n",
+       "      <td>{'C': 1, 'max_iter': 100}</td>\n",
+       "      <td>0.999155</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>191647</th>\n",
-       "      <td>906726512</td>\n",
-       "      <td>300984</td>\n",
-       "      <td>12</td>\n",
-       "      <td>95</td>\n",
-       "      <td>21</td>\n",
-       "      <td>2000</td>\n",
-       "      <td>80</td>\n",
-       "      <td>3</td>\n",
-       "      <td>14</td>\n",
+       "      <th>4</th>\n",
+       "      <td>{'C': 1, 'max_iter': 500}</td>\n",
+       "      <td>0.999155</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                ID    Name  Category  Subcategory  Country    Goal  Pledged   \n",
-       "164774  1912850706   16071         3           99       21    3500     3501  \\\n",
-       "74178   1913882316  208878        10           90       20     320      567   \n",
-       "296198   687737038  135398        13          138       21  250000      275   \n",
-       "92665   1614457395  208763         6          129       21    7000      528   \n",
-       "191647   906726512  300984        12           95       21    2000       80   \n",
-       "\n",
-       "        Backers  Duration  \n",
-       "164774       19        29  \n",
-       "74178        27        29  \n",
-       "296198        2        29  \n",
-       "92665        23        38  \n",
-       "191647        3        14  "
+       "                     Parameters        f1\n",
+       "0   {'C': 0.1, 'max_iter': 100}  0.999155\n",
+       "1   {'C': 0.1, 'max_iter': 500}  0.999155\n",
+       "2  {'C': 0.1, 'max_iter': 1000}  0.999155\n",
+       "3     {'C': 1, 'max_iter': 100}  0.999155\n",
+       "4     {'C': 1, 'max_iter': 500}  0.999155"
       ]
      },
-     "execution_count": 27,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "X_train.head()"
+    "hyperparameters = {\"C\":[0.1,1,10],\"max_iter\":[100,500,1000]}\n",
+    "results = grid_search(logistic_regression,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n",
+    "results.head()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model Coefficients: [[ 0.02520122  0.00417009  0.04143354 -0.10483769  0.10516741  0.08317194\n",
+      "   0.00119811]]\n",
+      "Model Intercept: [0.00554605]\n",
+      "Model Score: [0 0 0 ... 0 1 0]\n",
+      "Confusion Matrix: [[39351    56]\n",
+      " [    0 26886]]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0.9989596492531768"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "logistic_regression(X_train,X_test,y_train,y_test,metric=\"f1\",verbose=True,C=0.1,max_iter=500)"
    ]
   },
   {
@@ -733,7 +663,6 @@
    "source": [
     "def decision_tree(X_train,X_test,y_train,y_test,metric=\"accuracy\",verbose=False):\n",
     "    from sklearn.tree import DecisionTreeClassifier\n",
-    "    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
     "    \"\"\"\n",
     "    This function performs logistic regression on the data and returns the accuracy of the model\n",
     "    Necessary Arguments:\n",
@@ -780,12 +709,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
     "def xgb(X_train,X_test,y_train,y_test,metric=\"accuracy\",booster=\"gbtree\",eta=0.3,max_depth=6, reg_lambda=1, verbose=False):\n",
-    "    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
+    "\n",
     "    \"\"\"\n",
     "    This function performs xgboost on the data and returns the accuracy of the model\n",
     "    Necessary Arguments:\n",
@@ -835,12 +764,76 @@
     "    return metric_value"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data,transform_data = get_data()\n",
+    "data.head()\n",
+    "data.groupby(\"State\").count()\n",
+    "\n",
+    "y = data['State']\n",
+    "X = data.drop(['State','Launched','Deadline'], axis=1)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "data = pd.read_csv('data/kickstarter_projects.csv')\n",
+    "\n",
+    "\n",
+    "data = data.drop([\"ID\",\"Name\"],axis=1)\n",
+    "data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n",
+    "data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n",
+    "data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n",
+    "data[\"Duration\"] = (data[\"Deadline\"] - data[\"Launched\"]).dt.days\n",
+    "\n",
+    "data, transform_data = transform_strings_to_numerical(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model Coefficients: [[ 0.02520122  0.00417009  0.04143354 -0.10483769  0.10516741  0.08317194\n",
+      "   0.00119811]]\n",
+      "Model Intercept: [0.00554605]\n",
+      "Model Score: [0 0 0 ... 0 1 0]\n",
+      "Confusion Matrix: [[39351    56]\n",
+      " [    0 26886]]\n",
+      "0.9989596492531768\n"
+     ]
+    }
+   ],
+   "source": [
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n",
+    "X_train.shape, X_test.shape, y_train.shape, y_test.shape\n",
+    "\n",
+    "model = LogisticRegression(max_iter=1000,penalty=\"l2\",C=0.1)\n",
+    "# fit the model to the training data\n",
+    "model.fit(X_train, y_train)\n",
+    "\n",
+    "# predict the target values for the test data\n",
+    "y_pred = model.predict(X_test)\n",
+    "\n",
+    "print(f\"Model Coefficients: {model.coef_}\")\n",
+    "print(f\"Model Intercept: {model.intercept_}\")\n",
+    "print(f\"Model Score: {y_pred}\")\n",
+    "print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')\n",
+    "\n",
+    "metric_value = f1_score(y_test, y_pred)\n",
+    "print(metric_value)"
+   ]
   }
  ],
  "metadata": {
diff --git a/test.ipynb b/test.ipynb
index e69de29..6039a6c 100644
--- a/test.ipynb
+++ b/test.ipynb
@@ -0,0 +1,541 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "data = pd.read_csv('data/kickstarter_projects.csv')\n",
+    "\n",
+    "data = data.drop([\"ID\",\"Name\"],axis=1)\n",
+    "data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n",
+    "data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n",
+    "data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n",
+    "data[\"Duration\"] = (data[\"Deadline\"] - data[\"Launched\"]).dt.days\n",
+    "\n",
+    "for column in data.columns:\n",
+    "        # If data type is an object, for example a string, we want to convert the column to numerical values\n",
+    "        if data[column].dtype == 'object' and data[column].dtype != 'datetime64[ns]':\n",
+    "            le = LabelEncoder()\n",
+    "            data[column] = le.fit_transform(data[column])\n",
+    "\n",
+    "#data, transform_data = transform_strings_to_numerical(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y = data['State']\n",
+    "X = data.drop(['State','Launched','Deadline'], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Category</th>\n",
+       "      <th>Subcategory</th>\n",
+       "      <th>Country</th>\n",
+       "      <th>Goal</th>\n",
+       "      <th>Pledged</th>\n",
+       "      <th>Backers</th>\n",
+       "      <th>Duration</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>5</td>\n",
+       "      <td>52</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1000</td>\n",
+       "      <td>625</td>\n",
+       "      <td>30</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>6</td>\n",
+       "      <td>129</td>\n",
+       "      <td>21</td>\n",
+       "      <td>80000</td>\n",
+       "      <td>22</td>\n",
+       "      <td>3</td>\n",
+       "      <td>87</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "      <td>70</td>\n",
+       "      <td>21</td>\n",
+       "      <td>20</td>\n",
+       "      <td>35</td>\n",
+       "      <td>3</td>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>13</td>\n",
+       "      <td>131</td>\n",
+       "      <td>21</td>\n",
+       "      <td>99</td>\n",
+       "      <td>145</td>\n",
+       "      <td>25</td>\n",
+       "      <td>79</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>52</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1900</td>\n",
+       "      <td>387</td>\n",
+       "      <td>10</td>\n",
+       "      <td>28</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>9</td>\n",
+       "      <td>77</td>\n",
+       "      <td>21</td>\n",
+       "      <td>3000</td>\n",
+       "      <td>3329</td>\n",
+       "      <td>110</td>\n",
+       "      <td>17</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>6</td>\n",
+       "      <td>129</td>\n",
+       "      <td>21</td>\n",
+       "      <td>200</td>\n",
+       "      <td>41</td>\n",
+       "      <td>3</td>\n",
+       "      <td>29</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>12</td>\n",
+       "      <td>54</td>\n",
+       "      <td>21</td>\n",
+       "      <td>500</td>\n",
+       "      <td>563</td>\n",
+       "      <td>18</td>\n",
+       "      <td>29</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>10</td>\n",
+       "      <td>125</td>\n",
+       "      <td>21</td>\n",
+       "      <td>300</td>\n",
+       "      <td>15</td>\n",
+       "      <td>2</td>\n",
+       "      <td>16</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>11</td>\n",
+       "      <td>104</td>\n",
+       "      <td>21</td>\n",
+       "      <td>350</td>\n",
+       "      <td>1630</td>\n",
+       "      <td>31</td>\n",
+       "      <td>48</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    Category  Subcategory  Country   Goal  Pledged  Backers  Duration\n",
+       "0          5           52       21   1000      625       30        39\n",
+       "1          6          129       21  80000       22        3        87\n",
+       "2          0           70       21     20       35        3         8\n",
+       "3         13          131       21     99      145       25        79\n",
+       "4          5           52       21   1900      387       10        28\n",
+       "5          9           77       21   3000     3329      110        17\n",
+       "6          6          129       21    200       41        3        29\n",
+       "7         12           54       21    500      563       18        29\n",
+       "9         10          125       21    300       15        2        16\n",
+       "10        11          104       21    350     1630       31        48"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0     0\n",
+       "1     0\n",
+       "2     1\n",
+       "3     1\n",
+       "4     0\n",
+       "5     1\n",
+       "6     0\n",
+       "7     1\n",
+       "9     0\n",
+       "10    1\n",
+       "Name: State, dtype: int32"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "((265169, 7), (66293, 7), (265169,), (66293,))"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
+    "X_train.shape, X_test.shape, y_train.shape, y_test.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Category</th>\n",
+       "      <th>Subcategory</th>\n",
+       "      <th>Country</th>\n",
+       "      <th>Goal</th>\n",
+       "      <th>Pledged</th>\n",
+       "      <th>Backers</th>\n",
+       "      <th>Duration</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>322439</th>\n",
+       "      <td>10</td>\n",
+       "      <td>125</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1500</td>\n",
+       "      <td>1825</td>\n",
+       "      <td>39</td>\n",
+       "      <td>34</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>334349</th>\n",
+       "      <td>4</td>\n",
+       "      <td>37</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1100</td>\n",
+       "      <td>6027</td>\n",
+       "      <td>419</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>219576</th>\n",
+       "      <td>4</td>\n",
+       "      <td>113</td>\n",
+       "      <td>0</td>\n",
+       "      <td>116629</td>\n",
+       "      <td>1622</td>\n",
+       "      <td>9</td>\n",
+       "      <td>59</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>128525</th>\n",
+       "      <td>12</td>\n",
+       "      <td>95</td>\n",
+       "      <td>21</td>\n",
+       "      <td>5000</td>\n",
+       "      <td>5050</td>\n",
+       "      <td>31</td>\n",
+       "      <td>29</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13392</th>\n",
+       "      <td>12</td>\n",
+       "      <td>95</td>\n",
+       "      <td>21</td>\n",
+       "      <td>5000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>44</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        Category  Subcategory  Country    Goal  Pledged  Backers  Duration\n",
+       "322439        10          125       21    1500     1825       39        34\n",
+       "334349         4           37       21    1100     6027      419         6\n",
+       "219576         4          113        0  116629     1622        9        59\n",
+       "128525        12           95       21    5000     5050       31        29\n",
+       "13392         12           95       21    5000        0        0        44"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_train.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Category</th>\n",
+       "      <th>Subcategory</th>\n",
+       "      <th>Country</th>\n",
+       "      <th>Goal</th>\n",
+       "      <th>Pledged</th>\n",
+       "      <th>Backers</th>\n",
+       "      <th>Duration</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>214728</th>\n",
+       "      <td>12</td>\n",
+       "      <td>19</td>\n",
+       "      <td>13</td>\n",
+       "      <td>5582</td>\n",
+       "      <td>1743</td>\n",
+       "      <td>27</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>224796</th>\n",
+       "      <td>4</td>\n",
+       "      <td>65</td>\n",
+       "      <td>21</td>\n",
+       "      <td>700</td>\n",
+       "      <td>60</td>\n",
+       "      <td>2</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>158299</th>\n",
+       "      <td>13</td>\n",
+       "      <td>131</td>\n",
+       "      <td>21</td>\n",
+       "      <td>800</td>\n",
+       "      <td>57</td>\n",
+       "      <td>4</td>\n",
+       "      <td>29</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>329487</th>\n",
+       "      <td>10</td>\n",
+       "      <td>47</td>\n",
+       "      <td>21</td>\n",
+       "      <td>8000</td>\n",
+       "      <td>9179</td>\n",
+       "      <td>108</td>\n",
+       "      <td>27</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>162625</th>\n",
+       "      <td>0</td>\n",
+       "      <td>98</td>\n",
+       "      <td>21</td>\n",
+       "      <td>25000</td>\n",
+       "      <td>51</td>\n",
+       "      <td>3</td>\n",
+       "      <td>44</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        Category  Subcategory  Country   Goal  Pledged  Backers  Duration\n",
+       "214728        12           19       13   5582     1743       27        39\n",
+       "224796         4           65       21    700       60        2        39\n",
+       "158299        13          131       21    800       57        4        29\n",
+       "329487        10           47       21   8000     9179      108        27\n",
+       "162625         0           98       21  25000       51        3        44"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_test.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "model = LogisticRegression(max_iter=1000,penalty=\"l2\",C=0.1)\n",
+    "# fit the model to the training data\n",
+    "model.fit(X_train, y_train)\n",
+    "\n",
+    "# predict the target values for the test data\n",
+    "y_pred = model.predict(X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model Coefficients: [[ 0.01684149  0.0032329   0.03237224 -0.1055174   0.10587912  0.07303369\n",
+      "  -0.00594841]]\n",
+      "Model Intercept: [0.58553995]\n",
+      "Model Score: [0 0 0 ... 1 0 0]\n",
+      "Confusion Matrix: [[39342    76]\n",
+      " [    0 26875]]\n",
+      "0.9985880429532197\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"Model Coefficients: {model.coef_}\")\n",
+    "print(f\"Model Intercept: {model.intercept_}\")\n",
+    "print(f\"Model Score: {y_pred}\")\n",
+    "print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')\n",
+    "\n",
+    "metric_value = f1_score(y_test, y_pred)\n",
+    "print(metric_value)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}