diff --git a/base.ipynb b/base.ipynb
index 8173696..e43207d 100644
--- a/base.ipynb
+++ b/base.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -10,12 +10,13 @@
"import numpy as np\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n",
- "from sklearn.linear_model import LogisticRegression"
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -74,7 +75,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -86,6 +87,7 @@
" data = pd.read_csv('data/kickstarter_projects.csv')\n",
" # transform the categorical values to numerical values\n",
"\n",
+ " data = data.drop([\"ID\",\"Name\"],axis=1)\n",
" data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n",
" data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n",
" data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n",
@@ -107,32 +109,7 @@
},
{
"cell_type": "code",
- "execution_count": 83,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Series([], Name: State, dtype: int64)"
- ]
- },
- "execution_count": 83,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "data, transform_data= get_data()\n",
- "data.drop(\"Name\", axis=1, inplace=True)\n",
- "data.head(20)\n",
- "\n",
- "data = transform_numerical_to_string(data,transform_data)\n",
- "data[\"State\"].groupby(data[\"State\"]).count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -143,7 +120,7 @@
" # read in the data from the csv file\n",
" data = pd.read_csv('data/kickstarter_projects.csv')\n",
"\n",
- "\n",
+ " data = data.drop([\"ID\",\"Name\"],axis=1)\n",
" data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n",
" data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n",
" data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n",
@@ -162,67 +139,7 @@
},
{
"cell_type": "code",
- "execution_count": 85,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " ID | \n",
- " Name | \n",
- " Category | \n",
- " Subcategory | \n",
- " Country | \n",
- " Launched | \n",
- " Deadline | \n",
- " Goal | \n",
- " Pledged | \n",
- " Backers | \n",
- " State | \n",
- " Duration | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: [ID, Name, Category, Subcategory, Country, Launched, Deadline, Goal, Pledged, Backers, State, Duration]\n",
- "Index: []"
- ]
- },
- "execution_count": 85,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "get_original_data().head(20)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 86,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -247,11 +164,19 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def logistic_regression(X_train,X_test,y_train,y_test,metric=\"accuracy\",verbose=False,norm=\"l2\",max_iter=1000,C=1.0):\n",
+ " from sklearn.metrics import confusion_matrix\n",
" \"\"\"\n",
" This function performs logistic regression on the data and returns the accuracy of the model\n",
" Necessary Arguments:\n",
@@ -268,7 +193,7 @@
" C : The regularization parameter for the logistic regression\n",
"\n",
" Returns:\n",
- " metric : The number of the metric specified in the arguments\n",
+ " metric_value : The number of the metric specified in the arguments\n",
" \"\"\"\n",
"\n",
" # create a logistic regression model\n",
@@ -284,6 +209,8 @@
" print(f\"Model Coefficients: {model.coef_}\")\n",
" print(f\"Model Intercept: {model.intercept_}\")\n",
" print(f\"Model Score: {y_pred}\")\n",
+ " print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')\n",
+ "\n",
"\n",
" metric_value = 0\n",
" # calculate the metric of the model\n",
@@ -300,7 +227,55 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Category Subcategory Country Launched Deadline Goal \n",
+ "0 5 52 21 2009-04-21 21:02:48 2009-05-31 1000 \\\n",
+ "1 6 129 21 2009-04-23 00:07:53 2009-07-20 80000 \n",
+ "2 0 70 21 2009-04-24 21:52:03 2009-05-03 20 \n",
+ "3 13 131 21 2009-04-25 17:36:21 2009-07-14 99 \n",
+ "4 5 52 21 2009-04-27 14:10:39 2009-05-26 1900 \n",
+ "\n",
+ " Pledged Backers State Duration \n",
+ "0 625 30 0 39 \n",
+ "1 22 3 0 87 \n",
+ "2 35 3 1 8 \n",
+ "3 145 25 1 79 \n",
+ "4 387 10 0 28 \n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "((265169, 7), (66293, 7), (265169,), (66293,))"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "# test out the functions\n",
+ "data,transform_data = get_data()\n",
+ "print(data.head())\n",
+ "\n",
+ "y = data['State']\n",
+ "X = data.drop(['State','Launched','Deadline'], axis=1)\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n",
+ "X_train.shape, X_test.shape, y_train.shape, y_test.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -343,40 +318,76 @@
"\n",
" return results\n",
"\n",
- "\n",
- "\n",
- "\n",
" "
]
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": null,
"metadata": {},
"outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " ID Name Category Subcategory Country Launched \n",
- "0 1860890148 115746 5 52 21 2009-04-21 21:02:48 \\\n",
- "1 709707365 56298 6 129 21 2009-04-23 00:07:53 \n",
- "2 1703704063 323842 0 70 21 2009-04-24 21:52:03 \n",
- "3 727286 192565 13 131 21 2009-04-25 17:36:21 \n",
- "4 1622952265 200031 5 52 21 2009-04-27 14:10:39 \n",
- "\n",
- " Deadline Goal Pledged Backers State Duration \n",
- "0 2009-05-31 1000 625 30 0 39 \n",
- "1 2009-07-20 80000 22 3 0 87 \n",
- "2 2009-05-03 20 35 3 1 8 \n",
- "3 2009-07-14 99 145 25 1 79 \n",
- "4 2009-05-26 1900 387 10 0 28 \n"
- ]
- },
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Parameters | \n",
+ " f1 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " {'eta': 0.3, 'max_depth': 3} | \n",
+ " 0.999261 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " {'eta': 0.3, 'max_depth': 6} | \n",
+ " 0.999261 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " {'eta': 0.3, 'max_depth': 12} | \n",
+ " 0.999261 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " {'eta': 0.3, 'max_depth': 24} | \n",
+ " 0.999261 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " {'eta': 1, 'max_depth': 3} | \n",
+ " 0.999125 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
"text/plain": [
- "((265169, 9), (66293, 9), (265169,), (66293,))"
+ " Parameters f1\n",
+ "0 {'eta': 0.3, 'max_depth': 3} 0.999261\n",
+ "1 {'eta': 0.3, 'max_depth': 6} 0.999261\n",
+ "2 {'eta': 0.3, 'max_depth': 12} 0.999261\n",
+ "3 {'eta': 0.3, 'max_depth': 24} 0.999261\n",
+ "4 {'eta': 1, 'max_depth': 3} 0.999125"
]
},
"execution_count": 23,
@@ -385,51 +396,132 @@
}
],
"source": [
- "from sklearn.model_selection import train_test_split\n",
- "# test out the functions\n",
- "data,transform_data = get_data()\n",
- "print(data.head())\n",
- "\n",
- "y = data['State']\n",
- "X = data.drop(['State','Launched','Deadline'], axis=1)\n",
+ "#import warnings\n",
+ "#\n",
+ "# warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n",
"\n",
- "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n",
- "X_train.shape, X_test.shape, y_train.shape, y_test.shape"
+ "hyperparameters = {\"eta\":[0.1,0.3,1],\"max_depth\":[3,6,12,24]}\n",
+ "results = grid_search(xgb,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n",
+ "results.head()"
]
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Category | \n",
+ " Subcategory | \n",
+ " Country | \n",
+ " Goal | \n",
+ " Pledged | \n",
+ " Backers | \n",
+ " Duration | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 164774 | \n",
+ " 3 | \n",
+ " 99 | \n",
+ " 21 | \n",
+ " 3500 | \n",
+ " 3501 | \n",
+ " 19 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " 74178 | \n",
+ " 10 | \n",
+ " 90 | \n",
+ " 20 | \n",
+ " 320 | \n",
+ " 567 | \n",
+ " 27 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " 296198 | \n",
+ " 13 | \n",
+ " 138 | \n",
+ " 21 | \n",
+ " 250000 | \n",
+ " 275 | \n",
+ " 2 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " 92665 | \n",
+ " 6 | \n",
+ " 129 | \n",
+ " 21 | \n",
+ " 7000 | \n",
+ " 528 | \n",
+ " 23 | \n",
+ " 38 | \n",
+ "
\n",
+ " \n",
+ " 191647 | \n",
+ " 12 | \n",
+ " 95 | \n",
+ " 21 | \n",
+ " 2000 | \n",
+ " 80 | \n",
+ " 3 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Category Subcategory Country Goal Pledged Backers Duration\n",
+ "164774 3 99 21 3500 3501 19 29\n",
+ "74178 10 90 20 320 567 27 29\n",
+ "296198 13 138 21 250000 275 2 29\n",
+ "92665 6 129 21 7000 528 23 38\n",
+ "191647 12 95 21 2000 80 3 14"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
- "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
- "\n",
- "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
- " https://scikit-learn.org/stable/modules/preprocessing.html\n",
- "Please also refer to the documentation for alternative solver options:\n",
- " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
- " n_iter_i = _check_optimize_result(\n",
- "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
- "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
- "\n",
- "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
- " https://scikit-learn.org/stable/modules/preprocessing.html\n",
- "Please also refer to the documentation for alternative solver options:\n",
- " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
- " n_iter_i = _check_optimize_result(\n",
- "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
- "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
- "\n",
- "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
- " https://scikit-learn.org/stable/modules/preprocessing.html\n",
- "Please also refer to the documentation for alternative solver options:\n",
- " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
- " n_iter_i = _check_optimize_result(\n",
"c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
@@ -478,62 +570,269 @@
" \n",
" | \n",
" Parameters | \n",
- " accuracy | \n",
+ " f1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
- " {'C': 10.0, 'max_iter': 1000} | \n",
- " 0.999110 | \n",
+ " {'C': 0.1, 'max_iter': 100} | \n",
+ " 0.999155 | \n",
"
\n",
" \n",
" 1 | \n",
- " {'C': 0.1, 'max_iter': 1000} | \n",
- " 0.999095 | \n",
+ " {'C': 0.1, 'max_iter': 500} | \n",
+ " 0.999155 | \n",
"
\n",
" \n",
" 2 | \n",
- " {'C': 1.0, 'max_iter': 1000} | \n",
- " 0.999095 | \n",
+ " {'C': 0.1, 'max_iter': 1000} | \n",
+ " 0.999155 | \n",
"
\n",
" \n",
" 3 | \n",
- " {'C': 10.0, 'max_iter': 100} | \n",
- " 0.998989 | \n",
+ " {'C': 1, 'max_iter': 100} | \n",
+ " 0.999155 | \n",
"
\n",
" \n",
" 4 | \n",
- " {'C': 0.1, 'max_iter': 100} | \n",
- " 0.998884 | \n",
+ " {'C': 1, 'max_iter': 500} | \n",
+ " 0.999155 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " Parameters accuracy\n",
- "0 {'C': 10.0, 'max_iter': 1000} 0.999110\n",
- "1 {'C': 0.1, 'max_iter': 1000} 0.999095\n",
- "2 {'C': 1.0, 'max_iter': 1000} 0.999095\n",
- "3 {'C': 10.0, 'max_iter': 100} 0.998989\n",
- "4 {'C': 0.1, 'max_iter': 100} 0.998884"
+ " Parameters f1\n",
+ "0 {'C': 0.1, 'max_iter': 100} 0.999155\n",
+ "1 {'C': 0.1, 'max_iter': 500} 0.999155\n",
+ "2 {'C': 0.1, 'max_iter': 1000} 0.999155\n",
+ "3 {'C': 1, 'max_iter': 100} 0.999155\n",
+ "4 {'C': 1, 'max_iter': 500} 0.999155"
]
},
- "execution_count": 24,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "#import warnings\n",
- "#\n",
- "# warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n",
+ "hyperparameters = {\"C\":[0.1,1,10],\"max_iter\":[100,500,1000]}\n",
+ "results = grid_search(logistic_regression,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n",
+ "results.head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model Coefficients: [[ 0.02520122 0.00417009 0.04143354 -0.10483769 0.10516741 0.08317194\n",
+ " 0.00119811]]\n",
+ "Model Intercept: [0.00554605]\n",
+ "Model Score: [0 0 0 ... 0 1 0]\n",
+ "Confusion Matrix: [[39351 56]\n",
+ " [ 0 26886]]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.9989596492531768"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "logistic_regression(X_train,X_test,y_train,y_test,metric=\"f1\",verbose=True,C=0.1,max_iter=500)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def decision_tree(X_train,X_test,y_train,y_test,metric=\"accuracy\",verbose=False):\n",
+ "    \"\"\"\n",
+ "    Fit a decision tree classifier on the training data and score it on the test data.\n",
+ "\n",
+ "    Necessary Arguments:\n",
+ "        X_train : The features used to fit the model\n",
+ "        X_test : The features used to evaluate the model\n",
+ "        y_train : The target values belonging to X_train\n",
+ "        y_test : The target values belonging to X_test\n",
"\n",
- "hyperparameters = {\"C\":[0.1,1.0,10.0],\"max_iter\":[10,100,1000]}\n",
- "results = grid_search(logistic_regression,hyperparameters,\"accuracy\",X_train,X_test,y_train,y_test)\n",
+ "    Optional Arguments:\n",
+ "        metric : The metric to calculate the model performance, Options: \"accuracy\", \"precision\", \"recall\", \"f1\"\n",
+ "        verbose : If True, print the predictions and plot the confusion matrix\n",
"\n",
- "results.head()\n"
+ "    Returns:\n",
+ "        metric_value : The value of the requested metric (0 if the metric name is not one of the options)\n",
+ "    \"\"\"\n",
+ "    from sklearn.tree import DecisionTreeClassifier\n",
+ "\n",
+ "    # create a decision tree model with default settings and fit it to the training data\n",
+ "    model = DecisionTreeClassifier()\n",
+ "    model.fit(X_train, y_train)\n",
+ "\n",
+ "    # predict the target values for the test data\n",
+ "    y_pred = model.predict(X_test)\n",
+ "\n",
+ "    if verbose:\n",
+ "        print(f\"Model Score: {y_pred}\")\n",
+ "        # order the matrix rows/columns by the classes the fitted model knows and label the plot with them\n",
+ "        cm = confusion_matrix(y_test, y_pred, labels=model.classes_)\n",
+ "        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)\n",
+ "        disp.plot()\n",
+ "\n",
+ "    # calculate the metric of the model; an unknown metric name leaves the value at 0\n",
+ "    metric_value = 0\n",
+ "    if metric == \"accuracy\":\n",
+ "        metric_value = accuracy_score(y_test, y_pred)\n",
+ "    elif metric == \"precision\":\n",
+ "        metric_value = precision_score(y_test, y_pred)\n",
+ "    elif metric == \"recall\":\n",
+ "        metric_value = recall_score(y_test, y_pred)\n",
+ "    elif metric == \"f1\":\n",
+ "        metric_value = f1_score(y_test, y_pred)\n",
+ "    return metric_value"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def xgb(X_train,X_test,y_train,y_test,metric=\"accuracy\",booster=\"gbtree\",eta=0.3,max_depth=6, reg_lambda=1, verbose=False):\n",
+ "    \"\"\"\n",
+ "    Fit an XGBoost classifier on the training data and score it on the test data.\n",
+ "\n",
+ "    Necessary Arguments:\n",
+ "        X_train : The features used to fit the model\n",
+ "        X_test : The features used to evaluate the model\n",
+ "        y_train : The target values belonging to X_train\n",
+ "        y_test : The target values belonging to X_test\n",
+ "\n",
+ "    Optional Arguments:\n",
+ "        metric : The metric to calculate the model performance, Options: \"accuracy\", \"precision\", \"recall\", \"f1\"\n",
+ "        verbose : If True, print the predictions and plot the confusion matrix\n",
+ "        booster : The type of booster to use, Options: \"gbtree\", \"gblinear\", \"dart\"\n",
+ "        eta : The learning rate of the model, between [0,1]\n",
+ "        max_depth : The maximum depth of the trees, default is 6 to avoid overfitting\n",
+ "        reg_lambda : The regularization parameter of the model\n",
+ "\n",
+ "    Returns:\n",
+ "        metric_value : The value of the requested metric (0 if the metric name is not one of the options)\n",
+ "    \"\"\"\n",
+ "    from xgboost import XGBClassifier\n",
+ "\n",
+ "    # create the classifier; every tunable argument is forwarded so a grid search over it has an effect\n",
+ "    model = XGBClassifier(\n",
+ "        booster=booster, eta=eta, max_depth=max_depth, reg_lambda=reg_lambda\n",
+ "    )\n",
+ "    model.fit(X_train, y_train)\n",
+ "\n",
+ "    # predict the target values for the test data\n",
+ "    y_pred = model.predict(X_test)\n",
+ "\n",
+ "    if verbose:\n",
+ "        print(f\"Model Score: {y_pred}\")\n",
+ "        # order the matrix rows/columns by the classes the fitted model knows and label the plot with them\n",
+ "        cm = confusion_matrix(y_test, y_pred, labels=model.classes_)\n",
+ "        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)\n",
+ "        disp.plot()\n",
+ "\n",
+ "    # calculate the metric of the model; an unknown metric name leaves the value at 0\n",
+ "    metric_value = 0\n",
+ "    if metric == \"accuracy\":\n",
+ "        metric_value = accuracy_score(y_test, y_pred)\n",
+ "    elif metric == \"precision\":\n",
+ "        metric_value = precision_score(y_test, y_pred)\n",
+ "    elif metric == \"recall\":\n",
+ "        metric_value = recall_score(y_test, y_pred)\n",
+ "    elif metric == \"f1\":\n",
+ "        metric_value = f1_score(y_test, y_pred)\n",
+ "    return metric_value"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data,transform_data = get_data()\n",
+ "data.head()\n",
+ "data.groupby(\"State\").count()\n",
+ "\n",
+ "y = data['State']\n",
+ "X = data.drop(['State','Launched','Deadline'], axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = pd.read_csv('data/kickstarter_projects.csv')\n",
+ "\n",
+ "\n",
+ "data = data.drop([\"ID\",\"Name\"],axis=1)\n",
+ "data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n",
+ "data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n",
+ "data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n",
+ "data[\"Duration\"] = (data[\"Deadline\"] - data[\"Launched\"]).dt.days\n",
+ "\n",
+ "data, transform_data = transform_strings_to_numerical(data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model Coefficients: [[ 0.02520122 0.00417009 0.04143354 -0.10483769 0.10516741 0.08317194\n",
+ " 0.00119811]]\n",
+ "Model Intercept: [0.00554605]\n",
+ "Model Score: [0 0 0 ... 0 1 0]\n",
+ "Confusion Matrix: [[39351 56]\n",
+ " [ 0 26886]]\n",
+ "0.9989596492531768\n"
+ ]
+ }
+ ],
+ "source": [
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n",
+ "X_train.shape, X_test.shape, y_train.shape, y_test.shape\n",
+ "\n",
+ "model = LogisticRegression(max_iter=1000,penalty=\"l2\",C=0.1)\n",
+ "# fit the model to the training data\n",
+ "model.fit(X_train, y_train)\n",
+ "\n",
+ "# predict the target values for the test data\n",
+ "y_pred = model.predict(X_test)\n",
+ "\n",
+ "print(f\"Model Coefficients: {model.coef_}\")\n",
+ "print(f\"Model Intercept: {model.intercept_}\")\n",
+ "print(f\"Model Score: {y_pred}\")\n",
+ "print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')\n",
+ "\n",
+ "metric_value = f1_score(y_test, y_pred)\n",
+ "print(metric_value)"
]
}
],
diff --git a/requirements.txt b/requirements.txt
index f82b4b8..a2fcb07 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,5 @@ jupyterlab-dash==0.1.0a3
scikit-learn==1.2.2
statsmodels==0.13.5
pytest==7.3.1
-import-ipynb
\ No newline at end of file
+xgboost==2.1.1
+import-ipynb
diff --git a/test.ipynb b/test.ipynb
index e69de29..6039a6c 100644
--- a/test.ipynb
+++ b/test.ipynb
@@ -0,0 +1,541 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "data = pd.read_csv('data/kickstarter_projects.csv')\n",
+ "\n",
+ "data = data.drop([\"ID\",\"Name\"],axis=1)\n",
+ "# keep only finished projects; .copy() so the column assignments below do not raise SettingWithCopyWarning\n",
+ "data = data[data[\"State\"].isin([\"Successful\", \"Failed\"])].copy()\n",
+ "data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n",
+ "data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n",
+ "data[\"Duration\"] = (data[\"Deadline\"] - data[\"Launched\"]).dt.days\n",
+ "\n",
+ "# label-encode every string (object dtype) column; the datetime columns are not object dtype and stay untouched\n",
+ "for column in data.select_dtypes(include=\"object\").columns:\n",
+ "    le = LabelEncoder()\n",
+ "    data[column] = le.fit_transform(data[column])\n",
+ "\n",
+ "#data, transform_data = transform_strings_to_numerical(data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y = data['State']\n",
+ "X = data.drop(['State','Launched','Deadline'], axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Category | \n",
+ " Subcategory | \n",
+ " Country | \n",
+ " Goal | \n",
+ " Pledged | \n",
+ " Backers | \n",
+ " Duration | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 52 | \n",
+ " 21 | \n",
+ " 1000 | \n",
+ " 625 | \n",
+ " 30 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 6 | \n",
+ " 129 | \n",
+ " 21 | \n",
+ " 80000 | \n",
+ " 22 | \n",
+ " 3 | \n",
+ " 87 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 70 | \n",
+ " 21 | \n",
+ " 20 | \n",
+ " 35 | \n",
+ " 3 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 13 | \n",
+ " 131 | \n",
+ " 21 | \n",
+ " 99 | \n",
+ " 145 | \n",
+ " 25 | \n",
+ " 79 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 52 | \n",
+ " 21 | \n",
+ " 1900 | \n",
+ " 387 | \n",
+ " 10 | \n",
+ " 28 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 9 | \n",
+ " 77 | \n",
+ " 21 | \n",
+ " 3000 | \n",
+ " 3329 | \n",
+ " 110 | \n",
+ " 17 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 129 | \n",
+ " 21 | \n",
+ " 200 | \n",
+ " 41 | \n",
+ " 3 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 12 | \n",
+ " 54 | \n",
+ " 21 | \n",
+ " 500 | \n",
+ " 563 | \n",
+ " 18 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 10 | \n",
+ " 125 | \n",
+ " 21 | \n",
+ " 300 | \n",
+ " 15 | \n",
+ " 2 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 11 | \n",
+ " 104 | \n",
+ " 21 | \n",
+ " 350 | \n",
+ " 1630 | \n",
+ " 31 | \n",
+ " 48 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Category Subcategory Country Goal Pledged Backers Duration\n",
+ "0 5 52 21 1000 625 30 39\n",
+ "1 6 129 21 80000 22 3 87\n",
+ "2 0 70 21 20 35 3 8\n",
+ "3 13 131 21 99 145 25 79\n",
+ "4 5 52 21 1900 387 10 28\n",
+ "5 9 77 21 3000 3329 110 17\n",
+ "6 6 129 21 200 41 3 29\n",
+ "7 12 54 21 500 563 18 29\n",
+ "9 10 125 21 300 15 2 16\n",
+ "10 11 104 21 350 1630 31 48"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 0\n",
+ "1 0\n",
+ "2 1\n",
+ "3 1\n",
+ "4 0\n",
+ "5 1\n",
+ "6 0\n",
+ "7 1\n",
+ "9 0\n",
+ "10 1\n",
+ "Name: State, dtype: int32"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "((265169, 7), (66293, 7), (265169,), (66293,))"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)  # fixed seed keeps the split reproducible\n",
+ "X_train.shape, X_test.shape, y_train.shape, y_test.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Category | \n",
+ " Subcategory | \n",
+ " Country | \n",
+ " Goal | \n",
+ " Pledged | \n",
+ " Backers | \n",
+ " Duration | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 322439 | \n",
+ " 10 | \n",
+ " 125 | \n",
+ " 21 | \n",
+ " 1500 | \n",
+ " 1825 | \n",
+ " 39 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " 334349 | \n",
+ " 4 | \n",
+ " 37 | \n",
+ " 21 | \n",
+ " 1100 | \n",
+ " 6027 | \n",
+ " 419 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 219576 | \n",
+ " 4 | \n",
+ " 113 | \n",
+ " 0 | \n",
+ " 116629 | \n",
+ " 1622 | \n",
+ " 9 | \n",
+ " 59 | \n",
+ "
\n",
+ " \n",
+ " 128525 | \n",
+ " 12 | \n",
+ " 95 | \n",
+ " 21 | \n",
+ " 5000 | \n",
+ " 5050 | \n",
+ " 31 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " 13392 | \n",
+ " 12 | \n",
+ " 95 | \n",
+ " 21 | \n",
+ " 5000 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 44 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Category Subcategory Country Goal Pledged Backers Duration\n",
+ "322439 10 125 21 1500 1825 39 34\n",
+ "334349 4 37 21 1100 6027 419 6\n",
+ "219576 4 113 0 116629 1622 9 59\n",
+ "128525 12 95 21 5000 5050 31 29\n",
+ "13392 12 95 21 5000 0 0 44"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Category | \n",
+ " Subcategory | \n",
+ " Country | \n",
+ " Goal | \n",
+ " Pledged | \n",
+ " Backers | \n",
+ " Duration | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 214728 | \n",
+ " 12 | \n",
+ " 19 | \n",
+ " 13 | \n",
+ " 5582 | \n",
+ " 1743 | \n",
+ " 27 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " 224796 | \n",
+ " 4 | \n",
+ " 65 | \n",
+ " 21 | \n",
+ " 700 | \n",
+ " 60 | \n",
+ " 2 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " 158299 | \n",
+ " 13 | \n",
+ " 131 | \n",
+ " 21 | \n",
+ " 800 | \n",
+ " 57 | \n",
+ " 4 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " 329487 | \n",
+ " 10 | \n",
+ " 47 | \n",
+ " 21 | \n",
+ " 8000 | \n",
+ " 9179 | \n",
+ " 108 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " 162625 | \n",
+ " 0 | \n",
+ " 98 | \n",
+ " 21 | \n",
+ " 25000 | \n",
+ " 51 | \n",
+ " 3 | \n",
+ " 44 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Category Subcategory Country Goal Pledged Backers Duration\n",
+ "214728 12 19 13 5582 1743 27 39\n",
+ "224796 4 65 21 700 60 2 39\n",
+ "158299 13 131 21 800 57 4 29\n",
+ "329487 10 47 21 8000 9179 108 27\n",
+ "162625 0 98 21 25000 51 3 44"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "\n",
+ "model = LogisticRegression(max_iter=1000,penalty=\"l2\",C=0.1)\n",
+ "# fit the model to the training data\n",
+ "model.fit(X_train, y_train)\n",
+ "\n",
+ "# predict the target values for the test data\n",
+ "y_pred = model.predict(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model Coefficients: [[ 0.01684149 0.0032329 0.03237224 -0.1055174 0.10587912 0.07303369\n",
+ " -0.00594841]]\n",
+ "Model Intercept: [0.58553995]\n",
+ "Model Score: [0 0 0 ... 1 0 0]\n",
+ "Confusion Matrix: [[39342 76]\n",
+ " [ 0 26875]]\n",
+ "0.9985880429532197\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"Model Coefficients: {model.coef_}\")\n",
+ "print(f\"Model Intercept: {model.intercept_}\")\n",
+ "print(f\"Model Score: {y_pred}\")\n",
+ "print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')\n",
+ "\n",
+ "metric_value = f1_score(y_test, y_pred)\n",
+ "print(metric_value)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "env",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}