diff --git a/base.ipynb b/base.ipynb index 8173696..e43207d 100644 --- a/base.ipynb +++ b/base.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -10,12 +10,13 @@ "import numpy as np\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n", - "from sklearn.linear_model import LogisticRegression" + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -74,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -86,6 +87,7 @@ " data = pd.read_csv('data/kickstarter_projects.csv')\n", " # transform the categorical values to numerical values\n", "\n", + " data = data.drop([\"ID\",\"Name\"],axis=1)\n", " data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n", " data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n", " data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n", @@ -107,32 +109,7 @@ }, { "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Series([], Name: State, dtype: int64)" - ] - }, - "execution_count": 83, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data, transform_data= get_data()\n", - "data.drop(\"Name\", axis=1, inplace=True)\n", - "data.head(20)\n", - "\n", - "data = transform_numerical_to_string(data,transform_data)\n", - "data[\"State\"].groupby(data[\"State\"]).count()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -143,7 +120,7 @@ " # read in the data from the csv file\n", " data = pd.read_csv('data/kickstarter_projects.csv')\n", "\n", - "\n", + " data = data.drop([\"ID\",\"Name\"],axis=1)\n", " data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n", " data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n", " data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n", @@ -162,67 +139,7 @@ }, { "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
IDNameCategorySubcategoryCountryLaunchedDeadlineGoalPledgedBackersStateDuration
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [ID, Name, Category, Subcategory, Country, Launched, Deadline, Goal, Pledged, Backers, State, Duration]\n", - "Index: []" - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "get_original_data().head(20)" - ] - }, - { - "cell_type": "code", - "execution_count": 86, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -247,11 +164,19 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def logistic_regression(X_train,X_test,y_train,y_test,metric=\"accuracy\",verbose=False,norm=\"l2\",max_iter=1000,C=1.0):\n", + " from sklearn.metrics import confusion_matrix\n", " \"\"\"\n", " This function performs logistic regression on the data and returns the accuracy of the model\n", " Necessary Arguments:\n", @@ -268,7 +193,7 @@ " C : The regularization parameter for the logistic regression\n", "\n", " Returns:\n", - " metric : The number of the metric specified in the arguments\n", + " metric_value : The number of the metric specified in the arguments\n", " \"\"\"\n", "\n", " # create a logistic regression model\n", @@ -284,6 +209,8 @@ " print(f\"Model Coefficients: {model.coef_}\")\n", " print(f\"Model Intercept: {model.intercept_}\")\n", " print(f\"Model Score: {y_pred}\")\n", + " print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')\n", + "\n", "\n", " metric_value = 0\n", " # calculate the metric of the model\n", @@ -300,7 +227,55 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Category Subcategory Country Launched Deadline Goal \n", + "0 5 52 21 2009-04-21 21:02:48 2009-05-31 1000 \\\n", + "1 6 129 21 2009-04-23 00:07:53 2009-07-20 80000 \n", + "2 0 70 21 2009-04-24 21:52:03 2009-05-03 20 \n", + "3 13 131 21 2009-04-25 17:36:21 2009-07-14 99 \n", + "4 5 52 21 2009-04-27 14:10:39 2009-05-26 1900 \n", + "\n", + " Pledged Backers State Duration \n", + "0 625 30 0 39 \n", + "1 22 3 0 87 \n", + "2 35 3 1 8 \n", + "3 145 25 1 79 \n", + "4 387 10 0 28 \n" + ] + }, + { + "data": { + "text/plain": [ + "((265169, 7), (66293, 7), (265169,), (66293,))" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "# test out the functions\n", + "data,transform_data = get_data()\n", + "print(data.head())\n", + "\n", + "y = data['State']\n", + "X = data.drop(['State','Launched','Deadline'], axis=1)\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -343,40 +318,76 @@ "\n", " return results\n", "\n", - "\n", - "\n", - "\n", " " ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " ID Name Category Subcategory Country Launched \n", - "0 1860890148 115746 5 52 21 2009-04-21 21:02:48 \\\n", - "1 709707365 56298 6 129 21 2009-04-23 00:07:53 \n", - "2 1703704063 323842 0 70 21 2009-04-24 21:52:03 \n", - "3 727286 192565 
13 131 21 2009-04-25 17:36:21 \n", - "4 1622952265 200031 5 52 21 2009-04-27 14:10:39 \n", - "\n", - " Deadline Goal Pledged Backers State Duration \n", - "0 2009-05-31 1000 625 30 0 39 \n", - "1 2009-07-20 80000 22 3 0 87 \n", - "2 2009-05-03 20 35 3 1 8 \n", - "3 2009-07-14 99 145 25 1 79 \n", - "4 2009-05-26 1900 387 10 0 28 \n" - ] - }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Parametersf1
0{'eta': 0.3, 'max_depth': 3}0.999261
1{'eta': 0.3, 'max_depth': 6}0.999261
2{'eta': 0.3, 'max_depth': 12}0.999261
3{'eta': 0.3, 'max_depth': 24}0.999261
4{'eta': 1, 'max_depth': 3}0.999125
\n", + "
" + ], "text/plain": [ - "((265169, 9), (66293, 9), (265169,), (66293,))" + " Parameters f1\n", + "0 {'eta': 0.3, 'max_depth': 3} 0.999261\n", + "1 {'eta': 0.3, 'max_depth': 6} 0.999261\n", + "2 {'eta': 0.3, 'max_depth': 12} 0.999261\n", + "3 {'eta': 0.3, 'max_depth': 24} 0.999261\n", + "4 {'eta': 1, 'max_depth': 3} 0.999125" ] }, "execution_count": 23, @@ -385,51 +396,132 @@ } ], "source": [ - "from sklearn.model_selection import train_test_split\n", - "# test out the functions\n", - "data,transform_data = get_data()\n", - "print(data.head())\n", - "\n", - "y = data['State']\n", - "X = data.drop(['State','Launched','Deadline'], axis=1)\n", + "#import warnings\n", + "#\n", + "# warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n", "\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n", - "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + "hyperparameters = {\"eta\":[0.1,0.3,1],\"max_depth\":[3,6,12,24]}\n", + "results = grid_search(xgb,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n", + "results.head()" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategorySubcategoryCountryGoalPledgedBackersDuration
16477439921350035011929
741781090203205672729
2961981313821250000275229
9266561292170005282338
191647129521200080314
\n", + "
" + ], + "text/plain": [ + " Category Subcategory Country Goal Pledged Backers Duration\n", + "164774 3 99 21 3500 3501 19 29\n", + "74178 10 90 20 320 567 27 29\n", + "296198 13 138 21 250000 275 2 29\n", + "92665 6 129 21 7000 528 23 38\n", + "191647 12 95 21 2000 80 3 14" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n", "\n", @@ -478,62 +570,269 @@ " \n", " \n", " Parameters\n", - " accuracy\n", + " f1\n", " \n", " \n", " \n", " \n", " 0\n", - " {'C': 10.0, 'max_iter': 1000}\n", - " 0.999110\n", + " {'C': 0.1, 'max_iter': 100}\n", + " 0.999155\n", " \n", " \n", " 1\n", - " {'C': 0.1, 'max_iter': 1000}\n", - " 0.999095\n", + " {'C': 0.1, 'max_iter': 500}\n", + " 0.999155\n", " \n", " \n", " 2\n", - " {'C': 1.0, 'max_iter': 1000}\n", - " 0.999095\n", + " {'C': 0.1, 'max_iter': 1000}\n", + " 0.999155\n", " \n", " \n", " 3\n", - " {'C': 10.0, 'max_iter': 100}\n", - " 0.998989\n", + " {'C': 1, 'max_iter': 100}\n", + " 0.999155\n", " \n", " \n", " 4\n", - " {'C': 0.1, 'max_iter': 100}\n", - " 0.998884\n", + " {'C': 1, 'max_iter': 500}\n", + " 0.999155\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Parameters accuracy\n", - "0 {'C': 10.0, 'max_iter': 1000} 0.999110\n", - "1 {'C': 0.1, 'max_iter': 1000} 0.999095\n", - "2 {'C': 1.0, 'max_iter': 1000} 0.999095\n", - "3 {'C': 10.0, 'max_iter': 100} 0.998989\n", - "4 {'C': 0.1, 'max_iter': 100} 0.998884" + " Parameters f1\n", + "0 {'C': 0.1, 'max_iter': 100} 0.999155\n", + "1 {'C': 0.1, 'max_iter': 500} 0.999155\n", + "2 {'C': 0.1, 'max_iter': 1000} 0.999155\n", + "3 {'C': 1, 'max_iter': 100} 0.999155\n", + "4 {'C': 1, 'max_iter': 500} 0.999155" ] }, - "execution_count": 24, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "#import warnings\n", - "#\n", - "# warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n", + "hyperparameters = {\"C\":[0.1,1,10],\"max_iter\":[100,500,1000]}\n", + "results = grid_search(logistic_regression,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n", + "results.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Coefficients: [[ 0.02520122 0.00417009 0.04143354 -0.10483769 0.10516741 0.08317194\n", + " 0.00119811]]\n", + "Model Intercept: [0.00554605]\n", + "Model Score: [0 0 0 ... 
0 1 0]\n", + "Confusion Matrix: [[39351 56]\n", + " [ 0 26886]]\n" + ] + }, + { + "data": { + "text/plain": [ + "0.9989596492531768" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logistic_regression(X_train,X_test,y_train,y_test,metric=\"f1\",verbose=True,C=0.1,max_iter=500)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def decision_tree(X_train,X_test,y_train,y_test,metric=\"accuracy\",verbose=False):\n", + " from sklearn.tree import DecisionTreeClassifier\n", + " \"\"\"\n", + " This function performs logistic regression on the data and returns the accuracy of the model\n", + " Necessary Arguments:\n", + " X_train : The training data\n", + " X_test : The test data\n", + " y_train : The target values\n", + " y_test : The target values for the training data\n", "\n", - "hyperparameters = {\"C\":[0.1,1.0,10.0],\"max_iter\":[10,100,1000]}\n", - "results = grid_search(logistic_regression,hyperparameters,\"accuracy\",X_train,X_test,y_train,y_test)\n", + " Optional Arguments:\n", + " metric : The metric to calculate the model performance, Options: \"accuracy\", \"precision\", \"recall\", \"f1\"\n", + " verbose : If True, the function will print the metric of the model\n", "\n", - "results.head()\n" + " Returns:\n", + " metric_value : The number of the metric specified in the arguments\n", + " \"\"\"\n", + "\n", + " # create a decision tree model\n", + " model = DecisionTreeClassifier()\n", + "\n", + " # fit the model to the training data\n", + " model.fit(X_train, y_train)\n", + "\n", + " # predict the target values for the test data\n", + " y_pred = model.predict(X_test)\n", + "\n", + " if verbose==True:\n", + " print(f\"Model Score: {y_pred}\")\n", + " cm = confusion_matrix(y_test, y_pred)\n", + " disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=data.target_names)\n", + " disp.plot()\n", + "\n", + " metric_value = 0\n", + " # calculate the metric of the model\n", + " if metric == \"accuracy\":\n", + " metric_value = accuracy_score(y_test, y_pred)\n", + " if metric == \"precision\":\n", + " metric_value = precision_score(y_test, y_pred)\n", + " if metric == \"recall\":\n", + " metric_value = recall_score(y_test, y_pred)\n", + " if metric == \"f1\":\n", + " metric_value = f1_score(y_test, y_pred)\n", + " return metric_value" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def xgb(X_train,X_test,y_train,y_test,metric=\"accuracy\",booster=\"gbtree\",eta=0.3,max_depth=6, reg_lambda=1, verbose=False):\n", + "\n", + " \"\"\"\n", + " This function performs xgboost on the data and returns the accuracy of the model\n", + " Necessary Arguments:\n", + " X_train : The training data\n", + " X_test : The test data\n", + " y_train : The target values\n", + " y_test : The target values for the training data\n", + "\n", + " Optional Arguments:\n", + " metric : The metric to calculate the model performance, Options: \"accuracy\", \"precision\", \"recall\", \"f1\"\n", + " verbose : If True, the function will print the metric of the model\n", + " booster : The type of booster to use, Options: \"gbtree\", \"gblinear\", \"dart\"\n", + " eta : The learning rate of the model, between [0,1]\n", + " max_depth : The maximum depth of the trees, default is 6 to avoid overfitting\n", + " reg_lambda : The regularization parameter of the model\n", + "\n", + " Returns:\n", + " metric_value : The number of the 
metric specified in the arguments\n", + " \"\"\"\n", + " from xgboost import XGBClassifier\n", + " # create a logistic regression model\n", + " model = XGBClassifier(booster=booster,eta=eta)\n", + "\n", + " # fit the model to the training data\n", + " model.fit(X_train, y_train)\n", + "\n", + " # predict the target values for the test data\n", + " y_pred = model.predict(X_test)\n", + "\n", + " if verbose==True:\n", + " print(f\"Model Score: {y_pred}\")\n", + " cm = confusion_matrix(y_test, y_pred)\n", + " disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=data.target_names)\n", + " disp.plot()\n", + " \n", + "\n", + " metric_value = 0\n", + " # calculate the metric of the model\n", + " if metric == \"accuracy\":\n", + " metric_value = accuracy_score(y_test, y_pred)\n", + " if metric == \"precision\":\n", + " metric_value = precision_score(y_test, y_pred)\n", + " if metric == \"recall\":\n", + " metric_value = recall_score(y_test, y_pred)\n", + " if metric == \"f1\":\n", + " metric_value = f1_score(y_test, y_pred)\n", + " return metric_value" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "data,transform_data = get_data()\n", + "data.head()\n", + "data.groupby(\"State\").count()\n", + "\n", + "y = data['State']\n", + "X = data.drop(['State','Launched','Deadline'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv('data/kickstarter_projects.csv')\n", + "\n", + "\n", + "data = data.drop([\"ID\",\"Name\"],axis=1)\n", + "data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n", + "data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n", + "data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n", + "data[\"Duration\"] = (data[\"Deadline\"] - data[\"Launched\"]).dt.days\n", + "\n", + "data, transform_data = transform_strings_to_numerical(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Coefficients: [[ 0.02520122 0.00417009 0.04143354 -0.10483769 0.10516741 0.08317194\n", + " 0.00119811]]\n", + "Model Intercept: [0.00554605]\n", + "Model Score: [0 0 0 ... 
0 1 0]\n", + "Confusion Matrix: [[39351 56]\n", + " [ 0 26886]]\n", + "0.9989596492531768\n" + ] + } + ], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape\n", + "\n", + "model = LogisticRegression(max_iter=1000,penalty=\"l2\",C=0.1)\n", + "# fit the model to the training data\n", + "model.fit(X_train, y_train)\n", + "\n", + "# predict the target values for the test data\n", + "y_pred = model.predict(X_test)\n", + "\n", + "print(f\"Model Coefficients: {model.coef_}\")\n", + "print(f\"Model Intercept: {model.intercept_}\")\n", + "print(f\"Model Score: {y_pred}\")\n", + "print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')\n", + "\n", + "metric_value = f1_score(y_test, y_pred)\n", + "print(metric_value)" ] } ], diff --git a/requirements.txt b/requirements.txt index f82b4b8..a2fcb07 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,5 @@ jupyterlab-dash==0.1.0a3 scikit-learn==1.2.2 statsmodels==0.13.5 pytest==7.3.1 -import-ipynb \ No newline at end of file +xgboost==2.1.1 +import-ipynb diff --git a/test.ipynb b/test.ipynb index e69de29..6039a6c 100644 --- a/test.ipynb +++ b/test.ipynb @@ -0,0 +1,541 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "data = pd.read_csv('data/kickstarter_projects.csv')\n", + "\n", + "data = data.drop([\"ID\",\"Name\"],axis=1)\n", + "data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n", + "data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n", + "data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n", + "data[\"Duration\"] = (data[\"Deadline\"] - data[\"Launched\"]).dt.days\n", + "\n", + "for column in data.columns:\n", + " # If data type is an object, for example a string, we want to convert the column to numerical values\n", + " if data[column].dtype == 'object' and data[column].dtype != 'datetime64[ns]':\n", + " le = LabelEncoder()\n", + " data[column] = le.fit_transform(data[column])\n", + "\n", + "#data, transform_data = transform_strings_to_numerical(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "y = data['State']\n", + "X = data.drop(['State','Launched','Deadline'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategorySubcategoryCountryGoalPledgedBackersDuration
05522110006253039
16129218000022387
207021203538
31313121991452579
45522119003871028
5977213000332911017
661292120041329
71254215005631829
9101252130015216
10111042135016303148
\n", + "
" + ], + "text/plain": [ + " Category Subcategory Country Goal Pledged Backers Duration\n", + "0 5 52 21 1000 625 30 39\n", + "1 6 129 21 80000 22 3 87\n", + "2 0 70 21 20 35 3 8\n", + "3 13 131 21 99 145 25 79\n", + "4 5 52 21 1900 387 10 28\n", + "5 9 77 21 3000 3329 110 17\n", + "6 6 129 21 200 41 3 29\n", + "7 12 54 21 500 563 18 29\n", + "9 10 125 21 300 15 2 16\n", + "10 11 104 21 350 1630 31 48" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0\n", + "1 0\n", + "2 1\n", + "3 1\n", + "4 0\n", + "5 1\n", + "6 0\n", + "7 1\n", + "9 0\n", + "10 1\n", + "Name: State, dtype: int32" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((265169, 7), (66293, 7), (265169,), (66293,))" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategorySubcategoryCountryGoalPledgedBackersDuration
3224391012521150018253934
33434943721110060274196
219576411301166291622959
128525129521500050503129
1339212952150000044
\n", + "
" + ], + "text/plain": [ + " Category Subcategory Country Goal Pledged Backers Duration\n", + "322439 10 125 21 1500 1825 39 34\n", + "334349 4 37 21 1100 6027 419 6\n", + "219576 4 113 0 116629 1622 9 59\n", + "128525 12 95 21 5000 5050 31 29\n", + "13392 12 95 21 5000 0 0 44" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategorySubcategoryCountryGoalPledgedBackersDuration
214728121913558217432739
2247964652170060239
158299131312180057429
3294871047218000917910827
162625098212500051344
\n", + "
" + ], + "text/plain": [ + " Category Subcategory Country Goal Pledged Backers Duration\n", + "214728 12 19 13 5582 1743 27 39\n", + "224796 4 65 21 700 60 2 39\n", + "158299 13 131 21 800 57 4 29\n", + "329487 10 47 21 8000 9179 108 27\n", + "162625 0 98 21 25000 51 3 44" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "model = LogisticRegression(max_iter=1000,penalty=\"l2\",C=0.1)\n", + "# fit the model to the training data\n", + "model.fit(X_train, y_train)\n", + "\n", + "# predict the target values for the test data\n", + "y_pred = model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Coefficients: [[ 0.01684149 0.0032329 0.03237224 -0.1055174 0.10587912 0.07303369\n", + " -0.00594841]]\n", + "Model Intercept: [0.58553995]\n", + "Model Score: [0 0 0 ... 1 0 0]\n", + "Confusion Matrix: [[39342 76]\n", + " [ 0 26875]]\n", + "0.9985880429532197\n" + ] + } + ], + "source": [ + "print(f\"Model Coefficients: {model.coef_}\")\n", + "print(f\"Model Intercept: {model.intercept_}\")\n", + "print(f\"Model Score: {y_pred}\")\n", + "print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')\n", + "\n", + "metric_value = f1_score(y_test, y_pred)\n", + "print(metric_value)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}