From 2912cb6005e18cffc983c441bbb87b013086fc98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20K=C3=A4mmerling?= Date: Thu, 8 Aug 2024 16:42:36 +0200 Subject: [PATCH] Change columns Deadline,launched to dt and added column duration, dropped all data which isnt failed or successful --- base.ipynb | 564 +++++++++++++++++++++++++++++------------------------ 1 file changed, 313 insertions(+), 251 deletions(-) diff --git a/base.ipynb b/base.ipynb index b08c1e4..8173696 100644 --- a/base.ipynb +++ b/base.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 23, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -35,7 +35,7 @@ " transform_data = {}\n", " for column in data.columns:\n", " # If data type is an object, for example a string, we want to convert the column to numerical values\n", - " if data[column].dtype == 'object':\n", + " if data[column].dtype == 'object' and data[column].dtype != 'datetime64[ns]':\n", " le = LabelEncoder()\n", " data[column] = le.fit_transform(data[column])\n", " # Save the mapping in a dictionary\n", @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -85,9 +85,21 @@ " # read in the data from the csv file\n", " data = pd.read_csv('data/kickstarter_projects.csv')\n", " # transform the categorical values to numerical values\n", + "\n", + " data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n", + " data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n", + " data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n", + " data[\"Duration\"] = (data[\"Deadline\"] - data[\"Launched\"]).dt.days\n", + "\n", + "\n", " data, transform_data = transform_strings_to_numerical(data)\n", + "\n", + "\n", + "\n", " \"\"\"\n", - " If we want to universally modify the data in any other way, we can do it here\n", + " We have converted Deadline and Launched to DateTime objects and calculated the duration in days\n", + " We also, at least for now, drop all live or suspended or canceled projects\n", + " \n", " \"\"\"\n", " #return the data and the transformation_data in case we want to transform the data back\n", " return data, transform_data" @@ -95,7 +107,32 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Series([], Name: State, dtype: int64)" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data, transform_data= get_data()\n", + "data.drop(\"Name\", axis=1, inplace=True)\n", + "data.head(20)\n", + "\n", + "data = transform_numerical_to_string(data,transform_data)\n", + "data[\"State\"].groupby(data[\"State\"]).count()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -105,8 +142,19 @@ " \"\"\"\n", " # read in the data from the csv file\n", " data = pd.read_csv('data/kickstarter_projects.csv')\n", + "\n", + "\n", + " data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n", + " data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n", + " data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n", + " data[\"Duration\"] = (data[\"Deadline\"] - data[\"Launched\"]).dt.days\n", + "\n", + "\n", + "\n", " \"\"\"\n", - " If we want to universally modify the data in any other way, we can do it here\n", + " We have converted Deadline and Launched to DateTime objects and calculated the duration in days\n", + " We also, at least for now, drop all live or suspended or canceled projects\n", + " \n", " \"\"\"\n", " #return the data\n", " return data" @@ -114,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 85, "metadata": {}, "outputs": [ { @@ -149,125 +197,263 @@ " Pledged\n", " Backers\n", " State\n", + " Duration\n", " \n", " \n", " \n", - " \n", - " 0\n", - " 1860890148\n", - " 130454\n", - " 5\n", - " 52\n", - " 21\n", - " 0\n", - " 6\n", - " 1000\n", - " 625\n", - " 30\n", - " 1\n", - " \n", - " \n", - " 1\n", - " 709707365\n", - " 63196\n", - " 6\n", - " 129\n", - " 21\n", - " 1\n", - " 34\n", - " 80000\n", - " 22\n", - " 3\n", - " 1\n", - " \n", - " \n", - " 2\n", - " 1703704063\n", - " 365635\n", - " 0\n", - " 70\n", - " 21\n", - " 2\n", - " 0\n", - " 20\n", - " 35\n", - " 3\n", - " 3\n", - " \n", - " \n", - " 3\n", - " 727286\n", - " 217100\n", - " 13\n", - " 131\n", - " 21\n", - " 3\n", - " 31\n", - " 99\n", - " 145\n", - " 25\n", - " 3\n", - " \n", - " \n", - " 4\n", - " 1622952265\n", - " 225555\n", - " 5\n", - " 52\n", - " 21\n", - " 4\n", - " 4\n", - " 1900\n", - " 387\n", - " 10\n", - " 1\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " ID Name Category Subcategory Country Launched Deadline \n", - "0 1860890148 130454 5 52 21 0 6 \\\n", - "1 709707365 63196 6 129 21 1 34 \n", - "2 1703704063 365635 0 70 21 2 0 \n", - "3 727286 217100 13 131 21 3 31 \n", - "4 1622952265 225555 5 52 21 4 4 \n", - "\n", - " Goal Pledged Backers State \n", - "0 1000 625 30 1 \n", - "1 80000 22 3 1 \n", - "2 20 35 3 3 \n", - "3 99 145 25 3 \n", - "4 1900 387 10 1 " + "Empty DataFrame\n", + "Columns: [ID, Name, Category, Subcategory, Country, Launched, Deadline, Goal, Pledged, Backers, State, Duration]\n", + "Index: []" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_original_data().head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "def remove_outlier(data,columns,threshold=3):\n", + " \"\"\"\n", + " This function removes outliers from the data based on the threshold\n", + " Args:\n", + " data : Our dataframe which we want to modify\n", + " columns : The columns which we want to check for outliers\n", + " threshold : The threshold which we use to determine if a value is an outlier\n", + " Multiplied by the standard deviation of the column to determine the range of values which are not outliers\n", + " I advise setting the threshold to 3\n", + "\n", + " Returns:\n", + " data : Our modified dataframe\n", + " \"\"\"\n", + " \n", + " for column in columns:\n", + " data = data[np.abs(data[column]-data[column].mean()) <= (threshold*data[column].std())]\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def logistic_regression(X_train,X_test,y_train,y_test,metric=\"accuracy\",verbose=False,norm=\"l2\",max_iter=1000,C=1.0):\n", + " \"\"\"\n", + " This function performs logistic regression on the data and returns the accuracy of the model\n", + " Necessary Arguments:\n", + " X_train : The training data\n", + " X_test : The test data\n", + " y : The target values\n", + " y_train : The target values for the training data\n", + "\n", + " Optional Arguments:\n", + " metric : The metric to calculate the model performance, Options: \"accuracy\", \"precision\", \"recall\", \"f1\"\n", + " verbose : If True, the function will print the metric of the model\n", + " norm : The norm to use for the logistic regression\n", + " max_iter : The maximum number of iterations for the logistic regression\n", + " C : The regularization parameter for the logistic regression\n", + "\n", + " Returns:\n", + " metric : The number of the metric specified in the arguments\n", + " \"\"\"\n", + "\n", + " # create a logistic regression model\n", + " model = LogisticRegression(max_iter=max_iter,penalty=norm,C=C)\n", + "\n", + " # fit the model to the training data\n", + " model.fit(X_train, y_train)\n", + "\n", + " # predict the target values for the test data\n", + " y_pred = model.predict(X_test)\n", + "\n", + " if verbose==True:\n", + " print(f\"Model Coefficients: {model.coef_}\")\n", + " print(f\"Model Intercept: {model.intercept_}\")\n", + " print(f\"Model Score: {y_pred}\")\n", + "\n", + " metric_value = 0\n", + " # calculate the metric of the model\n", + " if metric == \"accuracy\":\n", + " metric_value = accuracy_score(y_test, y_pred)\n", + " if metric == \"precision\":\n", + " metric_value = precision_score(y_test, y_pred)\n", + " if metric == \"recall\":\n", + " metric_value = recall_score(y_test, y_pred)\n", + " if metric == \"f1\":\n", + " metric_value = f1_score(y_test, y_pred)\n", + " return metric_value" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def grid_search(model,parameters,metric,X_train,X_test,y_train,y_test):\n", + " from itertools import product\n", + " \"\"\"\n", + " \n", + " Perform grid search for the given machine learning model and hyperparameters to find the best hyperparameters\n", + "\n", + " Parameters:\n", + " model - The machine learning model function to use\n", + " param_grid - The hyperparameters to test, given in form of a list of dictionaries\n", + " metric - A string specifying the metric to use for evaluation\n", + " X_train, X_test, y_train, y_test - The training and test data\n", + "\n", + " Returns: A pandas Dataframe containing the hyperparameters and the corresponding metric value, \n", + " sorted by the metric value in descending order\n", + "\n", + " \"\"\"\n", + " # Create all possible permutations of the hyperparameters, so if a={1,2} and b={3,4} we get [{1,3},{1,4},{2,3},{2,4}]\n", + " keys, values = zip(*parameters.items())\n", + " permutations = [dict(zip(keys, v)) for v in product(*values)]\n", + "\n", + " # Create a list to store the results\n", + " results = []\n", + "\n", + " for params in permutations:\n", + " # feed the model with the hyperparameters\n", + " # ** unpacks the dictionary into the form dict[key]=value -> key = value\n", + " metric_value = model(X_train,X_test,y_train,y_test,**params)\n", + "\n", + " # Append the results to the list\n", + " results.append((params, metric_value))\n", + "\n", + " # After the loop is done, we sort the results by the metric value\n", + " results.sort(key=lambda x: x[1], reverse=True)\n", + "\n", + " results = pd.DataFrame(results, columns=['Parameters', metric])\n", + "\n", + " return results\n", + "\n", + "\n", + "\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ID Name Category Subcategory Country Launched \n", + "0 1860890148 115746 5 52 21 2009-04-21 21:02:48 \\\n", + "1 709707365 56298 6 129 21 2009-04-23 00:07:53 \n", + "2 1703704063 323842 0 70 21 2009-04-24 21:52:03 \n", + "3 727286 192565 13 131 21 2009-04-25 17:36:21 \n", + "4 1622952265 200031 5 52 21 2009-04-27 14:10:39 \n", + "\n", + " Deadline Goal Pledged Backers State Duration \n", + "0 2009-05-31 1000 625 30 0 39 \n", + "1 2009-07-20 80000 22 3 0 87 \n", + "2 2009-05-03 20 35 3 1 8 \n", + "3 2009-07-14 99 145 25 1 79 \n", + "4 2009-05-26 1900 387 10 0 28 \n" + ] + }, + { + "data": { + "text/plain": [ + "((265169, 9), (66293, 9), (265169,), (66293,))" ] }, - "execution_count": 21, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "#tests:\n", - "data, transform_data = get_data()\n", - "data.head()\n" + "from sklearn.model_selection import train_test_split\n", + "# test out the functions\n", + "data,transform_data = get_data()\n", + "print(data.head())\n", + "\n", + "y = data['State']\n", + "X = data.drop(['State','Launched','Deadline'], axis=1)\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\tobia\\AppData\\Local\\Temp\\ipykernel_14408\\914317090.py:21: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", + "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n", + "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n", + "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n", + "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " data[column] = data[column].map(reverse_mapping)\n" + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n", + "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n", + "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" ] }, { @@ -291,188 +477,64 @@ " \n", " \n", " \n", - " ID\n", - " Name\n", - " Category\n", - " Subcategory\n", - " Country\n", - " Launched\n", - " Deadline\n", - " Goal\n", - " Pledged\n", - " Backers\n", - " State\n", + " Parameters\n", + " accuracy\n", " \n", " \n", " \n", " \n", " 0\n", - " 1860890148\n", - " Grace Jones Does Not Give A F$#% T-Shirt (limi...\n", - " Fashion\n", - " Fashion\n", - " United States\n", - " 2009-04-21 21:02:48\n", - " 2009-05-31\n", - " 1000\n", - " 625\n", - " 30\n", - " Failed\n", + " {'C': 10.0, 'max_iter': 1000}\n", + " 0.999110\n", " \n", " \n", " 1\n", - " 709707365\n", - " CRYSTAL ANTLERS UNTITLED MOVIE\n", - " Film & Video\n", - " Shorts\n", - " United States\n", - " 2009-04-23 00:07:53\n", - " 2009-07-20\n", - " 80000\n", - " 22\n", - " 3\n", - " Failed\n", + " {'C': 0.1, 'max_iter': 1000}\n", + " 0.999095\n", " \n", " \n", " 2\n", - " 1703704063\n", - " drawing for dollars\n", - " Art\n", - " Illustration\n", - " United States\n", - " 2009-04-24 21:52:03\n", - " 2009-05-03\n", - " 20\n", - " 35\n", - " 3\n", - " Successful\n", + " {'C': 1.0, 'max_iter': 1000}\n", + " 0.999095\n", " \n", " \n", " 3\n", - " 727286\n", - " Offline Wikipedia iPhone app\n", - " Technology\n", - " Software\n", - " United States\n", - " 2009-04-25 17:36:21\n", - " 2009-07-14\n", - " 99\n", - " 145\n", - " 25\n", - " Successful\n", + " {'C': 10.0, 'max_iter': 100}\n", + " 0.998989\n", " \n", " \n", " 4\n", - " 1622952265\n", - " Pantshirts\n", - " Fashion\n", - " Fashion\n", - " United States\n", - " 2009-04-27 14:10:39\n", - " 2009-05-26\n", - " 1900\n", - " 387\n", - " 10\n", - " Failed\n", + " {'C': 0.1, 'max_iter': 100}\n", + " 0.998884\n", " \n", " \n", "\n", "" ], "text/plain": [ - " ID Name \n", - "0 1860890148 Grace Jones Does Not Give A F$#% T-Shirt (limi... \\\n", - "1 709707365 CRYSTAL ANTLERS UNTITLED MOVIE \n", - "2 1703704063 drawing for dollars \n", - "3 727286 Offline Wikipedia iPhone app \n", - "4 1622952265 Pantshirts \n", - "\n", - " Category Subcategory Country Launched Deadline \n", - "0 Fashion Fashion United States 2009-04-21 21:02:48 2009-05-31 \\\n", - "1 Film & Video Shorts United States 2009-04-23 00:07:53 2009-07-20 \n", - "2 Art Illustration United States 2009-04-24 21:52:03 2009-05-03 \n", - "3 Technology Software United States 2009-04-25 17:36:21 2009-07-14 \n", - "4 Fashion Fashion United States 2009-04-27 14:10:39 2009-05-26 \n", - "\n", - " Goal Pledged Backers State \n", - "0 1000 625 30 Failed \n", - "1 80000 22 3 Failed \n", - "2 20 35 3 Successful \n", - "3 99 145 25 Successful \n", - "4 1900 387 10 Failed " + " Parameters accuracy\n", + "0 {'C': 10.0, 'max_iter': 1000} 0.999110\n", + "1 {'C': 0.1, 'max_iter': 1000} 0.999095\n", + "2 {'C': 1.0, 'max_iter': 1000} 0.999095\n", + "3 {'C': 10.0, 'max_iter': 100} 0.998989\n", + "4 {'C': 0.1, 'max_iter': 100} 0.998884" ] }, - "execution_count": 22, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data = transform_numerical_to_string(data.head(),transform_data)\n", - "data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def logistic_regression(X_train,X_test,y,y_train,metric=\"accuracy\",verbose=False,norm=\"l2\",max_iter=1000,C=1.0):\n", - " \"\"\"\n", - " This function performs logistic regression on the data and returns the accuracy of the model\n", - " Necessary Arguments:\n", - " X_train : The training data\n", - " X_test : The test data\n", - " y : The target values\n", - " y_train : The target values for the training data\n", - "\n", - " Optional Arguments:\n", - " metric : The metric to calculate the model performance, Options: \"accuracy\", \"precision\", \"recall\", \"f1\"\n", - " verbose : If True, the function will print the metric of the model\n", - " norm : The norm to use for the logistic regression\n", - " max_iter : The maximum number of iterations for the logistic regression\n", - " C : The regularization parameter for the logistic regression\n", - "\n", - " Returns:\n", - " metric : The number of the metric specified in the arguments\n", - " \"\"\"\n", - "\n", - "\n", - " # create a logistic regression model\n", - " model = LogisticRegression(max_iter=max_iter,penalty=norm,C=C)\n", - "\n", - " # fit the model to the training data\n", - " model.fit(X_train, y_train)\n", - "\n", - "\n", - "\n", - " # predict the target values for the test data\n", - " y_pred = model.predict(X_test)\n", + "#import warnings\n", + "#\n", + "# warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n", "\n", - " if verbose==True:\n", - " print(f\"Model Coefficients: {model.coef_}\")\n", - " print(f\"Model Intercept: {model.intercept_}\")\n", - " print(f\"Model Score: {y_pred}\")\n", + "hyperparameters = {\"C\":[0.1,1.0,10.0],\"max_iter\":[10,100,1000]}\n", + "results = grid_search(logistic_regression,hyperparameters,\"accuracy\",X_train,X_test,y_train,y_test)\n", "\n", - " # calculate the metric of the model\n", - " if metric == \"accuracy\":\n", - " accuracy = accuracy_score(y, y_pred)\n", - " if metric == \"precision\":\n", - " accuracy = precision_score(y, y_pred)\n", - " if metric == \"recall\":\n", - " accuracy = recall_score(y, y_pred)\n", - " if metric == \"f1\":\n", - " accuracy = f1_score(y, y_pred)\n", - " return accuracy" + "results.head()\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {