From 2912cb6005e18cffc983c441bbb87b013086fc98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobias=20K=C3=A4mmerling?= <Tobiaskaemmerling86@gmail.com>
Date: Thu, 8 Aug 2024 16:42:36 +0200
Subject: [PATCH] Convert Deadline and Launched columns to datetime, add
 Duration column, drop all rows whose State isn't Failed or Successful

---
 base.ipynb | 564 +++++++++++++++++++++++++++++------------------------
 1 file changed, 313 insertions(+), 251 deletions(-)

diff --git a/base.ipynb b/base.ipynb
index b08c1e4..8173696 100644
--- a/base.ipynb
+++ b/base.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -15,7 +15,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -35,7 +35,7 @@
     "    transform_data = {}\n",
     "    for column in data.columns:\n",
     "        # If data type is an object, for example a string, we want to convert the column to numerical values\n",
-    "        if data[column].dtype == 'object':\n",
+    "        if data[column].dtype == 'object' and data[column].dtype != 'datetime64[ns]':\n",
     "            le = LabelEncoder()\n",
     "            data[column] = le.fit_transform(data[column])\n",
     "            # Save the mapping in a dictionary\n",
@@ -45,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -74,7 +74,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -85,9 +85,21 @@
     "    # read in the data from the csv file\n",
     "    data = pd.read_csv('data/kickstarter_projects.csv')\n",
     "    # transform the categorical values to numerical values\n",
+    "\n",
+    "    data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n",
+    "    data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n",
+    "    data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n",
+    "    data[\"Duration\"] = (data[\"Deadline\"] - data[\"Launched\"]).dt.days\n",
+    "\n",
+    "\n",
     "    data, transform_data = transform_strings_to_numerical(data)\n",
+    "\n",
+    "\n",
+    "\n",
     "    \"\"\"\n",
-    "    If we want to universally modify the data in any other way, we can do it here\n",
+    "    We have converted Deadline and Launched to DateTime objects and calculated the duration in days\n",
+    "    We also, at least for now, drop all live or suspended or canceled projects\n",
+    "    \n",
     "    \"\"\"\n",
     "    #return the data and the transformation_data in case we want to transform the data back\n",
     "    return data, transform_data"
@@ -95,7 +107,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Series([], Name: State, dtype: int64)"
+      ]
+     },
+     "execution_count": 83,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data, transform_data= get_data()\n",
+    "data.drop(\"Name\", axis=1, inplace=True)\n",
+    "data.head(20)\n",
+    "\n",
+    "data = transform_numerical_to_string(data,transform_data)\n",
+    "data[\"State\"].groupby(data[\"State\"]).count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -105,8 +142,19 @@
     "    \"\"\"\n",
     "    # read in the data from the csv file\n",
     "    data = pd.read_csv('data/kickstarter_projects.csv')\n",
+    "\n",
+    "\n",
+    "    data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n",
+    "    data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n",
+    "    data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n",
+    "    data[\"Duration\"] = (data[\"Deadline\"] - data[\"Launched\"]).dt.days\n",
+    "\n",
+    "\n",
+    "\n",
     "    \"\"\"\n",
-    "    If we want to universally modify the data in any other way, we can do it here\n",
+    "    We have converted Deadline and Launched to DateTime objects and calculated the duration in days\n",
+    "    We also, at least for now, drop all live or suspended or canceled projects\n",
+    "    \n",
     "    \"\"\"\n",
     "    #return the data\n",
     "    return data"
@@ -114,7 +162,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 85,
    "metadata": {},
    "outputs": [
     {
@@ -149,125 +197,263 @@
        "      <th>Pledged</th>\n",
        "      <th>Backers</th>\n",
        "      <th>State</th>\n",
+       "      <th>Duration</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1860890148</td>\n",
-       "      <td>130454</td>\n",
-       "      <td>5</td>\n",
-       "      <td>52</td>\n",
-       "      <td>21</td>\n",
-       "      <td>0</td>\n",
-       "      <td>6</td>\n",
-       "      <td>1000</td>\n",
-       "      <td>625</td>\n",
-       "      <td>30</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>709707365</td>\n",
-       "      <td>63196</td>\n",
-       "      <td>6</td>\n",
-       "      <td>129</td>\n",
-       "      <td>21</td>\n",
-       "      <td>1</td>\n",
-       "      <td>34</td>\n",
-       "      <td>80000</td>\n",
-       "      <td>22</td>\n",
-       "      <td>3</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1703704063</td>\n",
-       "      <td>365635</td>\n",
-       "      <td>0</td>\n",
-       "      <td>70</td>\n",
-       "      <td>21</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0</td>\n",
-       "      <td>20</td>\n",
-       "      <td>35</td>\n",
-       "      <td>3</td>\n",
-       "      <td>3</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>727286</td>\n",
-       "      <td>217100</td>\n",
-       "      <td>13</td>\n",
-       "      <td>131</td>\n",
-       "      <td>21</td>\n",
-       "      <td>3</td>\n",
-       "      <td>31</td>\n",
-       "      <td>99</td>\n",
-       "      <td>145</td>\n",
-       "      <td>25</td>\n",
-       "      <td>3</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>1622952265</td>\n",
-       "      <td>225555</td>\n",
-       "      <td>5</td>\n",
-       "      <td>52</td>\n",
-       "      <td>21</td>\n",
-       "      <td>4</td>\n",
-       "      <td>4</td>\n",
-       "      <td>1900</td>\n",
-       "      <td>387</td>\n",
-       "      <td>10</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "           ID    Name  Category  Subcategory  Country  Launched  Deadline   \n",
-       "0  1860890148  130454         5           52       21         0         6  \\\n",
-       "1   709707365   63196         6          129       21         1        34   \n",
-       "2  1703704063  365635         0           70       21         2         0   \n",
-       "3      727286  217100        13          131       21         3        31   \n",
-       "4  1622952265  225555         5           52       21         4         4   \n",
-       "\n",
-       "    Goal  Pledged  Backers  State  \n",
-       "0   1000      625       30      1  \n",
-       "1  80000       22        3      1  \n",
-       "2     20       35        3      3  \n",
-       "3     99      145       25      3  \n",
-       "4   1900      387       10      1  "
+       "Empty DataFrame\n",
+       "Columns: [ID, Name, Category, Subcategory, Country, Launched, Deadline, Goal, Pledged, Backers, State, Duration]\n",
+       "Index: []"
+      ]
+     },
+     "execution_count": 85,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_original_data().head(20)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def remove_outlier(data,columns,threshold=3):\n",
+    "    \"\"\"\n",
+    "    This function removes outliers from the data based on the threshold\n",
+    "    Args:\n",
+    "        data : Our dataframe which we want to modify\n",
+    "        columns : The columns which we want to check for outliers\n",
+    "        threshold : The threshold which we use to determine if a value is an outlier\n",
+    "        Multiplied by the standard deviation of the column to determine the range of values which are not outliers\n",
+    "        I advise setting the threshold to 3\n",
+    "\n",
+    "    Returns:\n",
+    "        data : Our modified dataframe\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    for column in columns:\n",
+    "        data = data[np.abs(data[column]-data[column].mean()) <= (threshold*data[column].std())]\n",
+    "    return data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def logistic_regression(X_train,X_test,y_train,y_test,metric=\"accuracy\",verbose=False,norm=\"l2\",max_iter=1000,C=1.0):\n",
+    "    \"\"\"\n",
+    "    This function performs logistic regression on the data and returns the requested metric of the model\n",
+    "    Necessary Arguments:\n",
+    "        X_train : The training data\n",
+    "        X_test : The test data\n",
+    "        y_train : The target values for the training data\n",
+    "        y_test : The target values for the test data\n",
+    "\n",
+    "    Optional Arguments:\n",
+    "        metric : The metric to calculate the model performance, Options: \"accuracy\", \"precision\", \"recall\", \"f1\"\n",
+    "        verbose : If True, the function will print the model coefficients, the intercept and the predictions\n",
+    "        norm : The norm to use for the logistic regression\n",
+    "        max_iter : The maximum number of iterations for the logistic regression\n",
+    "        C : The regularization parameter for the logistic regression\n",
+    "\n",
+    "    Returns:\n",
+    "        metric_value : The value of the metric specified in the arguments (0 if the metric name is not recognized)\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # create a logistic regression model\n",
+    "    model = LogisticRegression(max_iter=max_iter,penalty=norm,C=C)\n",
+    "\n",
+    "    # fit the model to the training data\n",
+    "    model.fit(X_train, y_train)\n",
+    "\n",
+    "    # predict the target values for the test data\n",
+    "    y_pred = model.predict(X_test)\n",
+    "\n",
+    "    if verbose==True:\n",
+    "        print(f\"Model Coefficients: {model.coef_}\")\n",
+    "        print(f\"Model Intercept: {model.intercept_}\")\n",
+    "        print(f\"Model Score: {y_pred}\")\n",
+    "\n",
+    "    metric_value = 0\n",
+    "    # calculate the metric of the model\n",
+    "    if metric == \"accuracy\":\n",
+    "        metric_value = accuracy_score(y_test, y_pred)\n",
+    "    if metric == \"precision\":\n",
+    "        metric_value = precision_score(y_test, y_pred)\n",
+    "    if metric == \"recall\":\n",
+    "        metric_value = recall_score(y_test, y_pred)\n",
+    "    if metric == \"f1\":\n",
+    "        metric_value = f1_score(y_test, y_pred)\n",
+    "    return metric_value"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "def grid_search(model,parameters,metric,X_train,X_test,y_train,y_test):\n",
+    "    from itertools import product\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    Perform grid search for the given machine learning model and hyperparameters to find the best hyperparameters\n",
+    "\n",
+    "    Parameters:\n",
+    "    model - The machine learning model function to use\n",
+    "    parameters - The hyperparameters to test, given as a dictionary mapping each parameter name to a list of values\n",
+    "    metric - A string specifying the metric to use for evaluation\n",
+    "    X_train, X_test, y_train, y_test - The training and test data\n",
+    "\n",
+    "    Returns: A pandas Dataframe containing the hyperparameters and the corresponding metric value, \n",
+    "    sorted by the metric value in descending order\n",
+    "\n",
+    "    \"\"\"\n",
+    "    # Create all possible combinations (Cartesian product) of the hyperparameters, so if a=[1,2] and b=[3,4] we get [{a:1,b:3},{a:1,b:4},{a:2,b:3},{a:2,b:4}]\n",
+    "    keys, values = zip(*parameters.items())\n",
+    "    permutations = [dict(zip(keys, v)) for v in product(*values)]\n",
+    "\n",
+    "    # Create a list to store the results\n",
+    "    results = []\n",
+    "\n",
+    "    for params in permutations:\n",
+    "        # feed the model with the hyperparameters\n",
+    "        # ** unpacks the dictionary into the form dict[key]=value -> key = value\n",
+    "        metric_value = model(X_train,X_test,y_train,y_test,**params)\n",
+    "\n",
+    "        # Append the results to the list\n",
+    "        results.append((params, metric_value))\n",
+    "\n",
+    "    # After the loop is done, we sort the results by the metric value\n",
+    "    results.sort(key=lambda x: x[1], reverse=True)\n",
+    "\n",
+    "    results = pd.DataFrame(results, columns=['Parameters', metric])\n",
+    "\n",
+    "    return results\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "           ID    Name  Category  Subcategory  Country            Launched   \n",
+      "0  1860890148  115746         5           52       21 2009-04-21 21:02:48  \\\n",
+      "1   709707365   56298         6          129       21 2009-04-23 00:07:53   \n",
+      "2  1703704063  323842         0           70       21 2009-04-24 21:52:03   \n",
+      "3      727286  192565        13          131       21 2009-04-25 17:36:21   \n",
+      "4  1622952265  200031         5           52       21 2009-04-27 14:10:39   \n",
+      "\n",
+      "    Deadline   Goal  Pledged  Backers  State  Duration  \n",
+      "0 2009-05-31   1000      625       30      0        39  \n",
+      "1 2009-07-20  80000       22        3      0        87  \n",
+      "2 2009-05-03     20       35        3      1         8  \n",
+      "3 2009-07-14     99      145       25      1        79  \n",
+      "4 2009-05-26   1900      387       10      0        28  \n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "((265169, 9), (66293, 9), (265169,), (66293,))"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "#tests:\n",
-    "data, transform_data = get_data()\n",
-    "data.head()\n"
+    "from sklearn.model_selection import train_test_split\n",
+    "# test out the functions\n",
+    "data,transform_data = get_data()\n",
+    "print(data.head())\n",
+    "\n",
+    "y = data['State']\n",
+    "X = data.drop(['State','Launched','Deadline'], axis=1)\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n",
+    "X_train.shape, X_test.shape, y_train.shape, y_test.shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "C:\\Users\\tobia\\AppData\\Local\\Temp\\ipykernel_14408\\914317090.py:21: SettingWithCopyWarning: \n",
-      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
-      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
+      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
+      "\n",
+      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
+      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
+      "Please also refer to the documentation for alternative solver options:\n",
+      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
+      "  n_iter_i = _check_optimize_result(\n",
+      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
+      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
+      "\n",
+      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
+      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
+      "Please also refer to the documentation for alternative solver options:\n",
+      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
+      "  n_iter_i = _check_optimize_result(\n",
+      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
+      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
+      "\n",
+      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
+      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
+      "Please also refer to the documentation for alternative solver options:\n",
+      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
+      "  n_iter_i = _check_optimize_result(\n",
+      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
+      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
       "\n",
-      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
-      "  data[column] = data[column].map(reverse_mapping)\n"
+      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
+      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
+      "Please also refer to the documentation for alternative solver options:\n",
+      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
+      "  n_iter_i = _check_optimize_result(\n",
+      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
+      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
+      "\n",
+      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
+      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
+      "Please also refer to the documentation for alternative solver options:\n",
+      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
+      "  n_iter_i = _check_optimize_result(\n",
+      "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
+      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
+      "\n",
+      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
+      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
+      "Please also refer to the documentation for alternative solver options:\n",
+      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
+      "  n_iter_i = _check_optimize_result(\n"
      ]
     },
     {
@@ -291,188 +477,64 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>ID</th>\n",
-       "      <th>Name</th>\n",
-       "      <th>Category</th>\n",
-       "      <th>Subcategory</th>\n",
-       "      <th>Country</th>\n",
-       "      <th>Launched</th>\n",
-       "      <th>Deadline</th>\n",
-       "      <th>Goal</th>\n",
-       "      <th>Pledged</th>\n",
-       "      <th>Backers</th>\n",
-       "      <th>State</th>\n",
+       "      <th>Parameters</th>\n",
+       "      <th>accuracy</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>1860890148</td>\n",
-       "      <td>Grace Jones Does Not Give A F$#% T-Shirt (limi...</td>\n",
-       "      <td>Fashion</td>\n",
-       "      <td>Fashion</td>\n",
-       "      <td>United States</td>\n",
-       "      <td>2009-04-21 21:02:48</td>\n",
-       "      <td>2009-05-31</td>\n",
-       "      <td>1000</td>\n",
-       "      <td>625</td>\n",
-       "      <td>30</td>\n",
-       "      <td>Failed</td>\n",
+       "      <td>{'C': 10.0, 'max_iter': 1000}</td>\n",
+       "      <td>0.999110</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>709707365</td>\n",
-       "      <td>CRYSTAL ANTLERS UNTITLED MOVIE</td>\n",
-       "      <td>Film &amp; Video</td>\n",
-       "      <td>Shorts</td>\n",
-       "      <td>United States</td>\n",
-       "      <td>2009-04-23 00:07:53</td>\n",
-       "      <td>2009-07-20</td>\n",
-       "      <td>80000</td>\n",
-       "      <td>22</td>\n",
-       "      <td>3</td>\n",
-       "      <td>Failed</td>\n",
+       "      <td>{'C': 0.1, 'max_iter': 1000}</td>\n",
+       "      <td>0.999095</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>1703704063</td>\n",
-       "      <td>drawing for dollars</td>\n",
-       "      <td>Art</td>\n",
-       "      <td>Illustration</td>\n",
-       "      <td>United States</td>\n",
-       "      <td>2009-04-24 21:52:03</td>\n",
-       "      <td>2009-05-03</td>\n",
-       "      <td>20</td>\n",
-       "      <td>35</td>\n",
-       "      <td>3</td>\n",
-       "      <td>Successful</td>\n",
+       "      <td>{'C': 1.0, 'max_iter': 1000}</td>\n",
+       "      <td>0.999095</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>727286</td>\n",
-       "      <td>Offline Wikipedia iPhone app</td>\n",
-       "      <td>Technology</td>\n",
-       "      <td>Software</td>\n",
-       "      <td>United States</td>\n",
-       "      <td>2009-04-25 17:36:21</td>\n",
-       "      <td>2009-07-14</td>\n",
-       "      <td>99</td>\n",
-       "      <td>145</td>\n",
-       "      <td>25</td>\n",
-       "      <td>Successful</td>\n",
+       "      <td>{'C': 10.0, 'max_iter': 100}</td>\n",
+       "      <td>0.998989</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>1622952265</td>\n",
-       "      <td>Pantshirts</td>\n",
-       "      <td>Fashion</td>\n",
-       "      <td>Fashion</td>\n",
-       "      <td>United States</td>\n",
-       "      <td>2009-04-27 14:10:39</td>\n",
-       "      <td>2009-05-26</td>\n",
-       "      <td>1900</td>\n",
-       "      <td>387</td>\n",
-       "      <td>10</td>\n",
-       "      <td>Failed</td>\n",
+       "      <td>{'C': 0.1, 'max_iter': 100}</td>\n",
+       "      <td>0.998884</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "           ID                                               Name   \n",
-       "0  1860890148  Grace Jones Does Not Give A F$#% T-Shirt (limi...  \\\n",
-       "1   709707365                     CRYSTAL ANTLERS UNTITLED MOVIE   \n",
-       "2  1703704063                                drawing for dollars   \n",
-       "3      727286                       Offline Wikipedia iPhone app   \n",
-       "4  1622952265                                         Pantshirts   \n",
-       "\n",
-       "       Category   Subcategory        Country             Launched    Deadline   \n",
-       "0       Fashion       Fashion  United States  2009-04-21 21:02:48  2009-05-31  \\\n",
-       "1  Film & Video        Shorts  United States  2009-04-23 00:07:53  2009-07-20   \n",
-       "2           Art  Illustration  United States  2009-04-24 21:52:03  2009-05-03   \n",
-       "3    Technology      Software  United States  2009-04-25 17:36:21  2009-07-14   \n",
-       "4       Fashion       Fashion  United States  2009-04-27 14:10:39  2009-05-26   \n",
-       "\n",
-       "    Goal  Pledged  Backers       State  \n",
-       "0   1000      625       30      Failed  \n",
-       "1  80000       22        3      Failed  \n",
-       "2     20       35        3  Successful  \n",
-       "3     99      145       25  Successful  \n",
-       "4   1900      387       10      Failed  "
+       "                      Parameters  accuracy\n",
+       "0  {'C': 10.0, 'max_iter': 1000}  0.999110\n",
+       "1   {'C': 0.1, 'max_iter': 1000}  0.999095\n",
+       "2   {'C': 1.0, 'max_iter': 1000}  0.999095\n",
+       "3   {'C': 10.0, 'max_iter': 100}  0.998989\n",
+       "4    {'C': 0.1, 'max_iter': 100}  0.998884"
       ]
      },
-     "execution_count": 22,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "data = transform_numerical_to_string(data.head(),transform_data)\n",
-    "data.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def logistic_regression(X_train,X_test,y,y_train,metric=\"accuracy\",verbose=False,norm=\"l2\",max_iter=1000,C=1.0):\n",
-    "    \"\"\"\n",
-    "    This function performs logistic regression on the data and returns the accuracy of the model\n",
-    "    Necessary Arguments:\n",
-    "        X_train : The training data\n",
-    "        X_test : The test data\n",
-    "        y : The target values\n",
-    "        y_train : The target values for the training data\n",
-    "\n",
-    "    Optional Arguments:\n",
-    "        metric : The metric to calculate the model performance, Options: \"accuracy\", \"precision\", \"recall\", \"f1\"\n",
-    "        verbose : If True, the function will print the metric of the model\n",
-    "        norm : The norm to use for the logistic regression\n",
-    "        max_iter : The maximum number of iterations for the logistic regression\n",
-    "        C : The regularization parameter for the logistic regression\n",
-    "\n",
-    "    Returns:\n",
-    "        metric : The number of the metric specified in the arguments\n",
-    "    \"\"\"\n",
-    "\n",
-    "\n",
-    "    # create a logistic regression model\n",
-    "    model = LogisticRegression(max_iter=max_iter,penalty=norm,C=C)\n",
-    "\n",
-    "    # fit the model to the training data\n",
-    "    model.fit(X_train, y_train)\n",
-    "\n",
-    "\n",
-    "\n",
-    "    # predict the target values for the test data\n",
-    "    y_pred = model.predict(X_test)\n",
+    "#import warnings\n",
+    "#\n",
+    "# warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n",
     "\n",
-    "    if verbose==True:\n",
-    "        print(f\"Model Coefficients: {model.coef_}\")\n",
-    "        print(f\"Model Intercept: {model.intercept_}\")\n",
-    "        print(f\"Model Score: {y_pred}\")\n",
+    "hyperparameters = {\"C\":[0.1,1.0,10.0],\"max_iter\":[10,100,1000]}\n",
+    "results = grid_search(logistic_regression,hyperparameters,\"accuracy\",X_train,X_test,y_train,y_test)\n",
     "\n",
-    "    # calculate the metric of the model\n",
-    "    if metric == \"accuracy\":\n",
-    "        accuracy = accuracy_score(y, y_pred)\n",
-    "    if metric == \"precision\":\n",
-    "        accuracy = precision_score(y, y_pred)\n",
-    "    if metric == \"recall\":\n",
-    "        accuracy = recall_score(y, y_pred)\n",
-    "    if metric == \"f1\":\n",
-    "        accuracy = f1_score(y, y_pred)\n",
-    "    return accuracy"
+    "results.head()\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {