From a69bc06912c43366aa177d0601e3a9da341c3274 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20K=C3=A4mmerling?= Date: Fri, 9 Aug 2024 10:25:41 +0200 Subject: [PATCH 1/3] added xgboost to the requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f82b4b8..a2fcb07 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,5 @@ jupyterlab-dash==0.1.0a3 scikit-learn==1.2.2 statsmodels==0.13.5 pytest==7.3.1 -import-ipynb \ No newline at end of file +xgboost==2.1.1 +import-ipynb From 0985cf628ec8dcbaa3c8db4af0d82397a1b626b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20K=C3=A4mmerling?= Date: Fri, 9 Aug 2024 12:21:32 +0200 Subject: [PATCH 2/3] Added decision tree, xgboost and tried out the grid search and machine learning algorithms to get our first model --- base.ipynb | 618 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 462 insertions(+), 156 deletions(-) diff --git a/base.ipynb b/base.ipynb index 8173696..048ce07 100644 --- a/base.ipynb +++ b/base.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -86,6 +86,7 @@ " data = pd.read_csv('data/kickstarter_projects.csv')\n", " # transform the categorical values to numerical values\n", "\n", + " data = data.drop(\"ID\",axis=1)\n", " data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n", " data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n", " data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n", @@ -107,18 +108,24 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 5, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "Series([], Name: State, dtype: int64)" - ] - }, - "execution_count": 83, - "metadata": {}, - "output_type": "execute_result" + "ename": "KeyError", + "evalue": "\"['ID'] not found in axis\"", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[5], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m data, transform_data\u001b[38;5;241m=\u001b[39m \u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m data\u001b[38;5;241m.\u001b[39mdrop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mName\u001b[39m\u001b[38;5;124m\"\u001b[39m, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, inplace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 3\u001b[0m data\u001b[38;5;241m.\u001b[39mhead(\u001b[38;5;241m20\u001b[39m)\n", + "Cell \u001b[1;32mIn[4], line 9\u001b[0m, in \u001b[0;36mget_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m 6\u001b[0m data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/kickstarter_projects.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 7\u001b[0m \u001b[38;5;66;03m# transform the categorical values to numerical values\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mID\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 10\u001b[0m data \u001b[38;5;241m=\u001b[39m data[(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSuccessful\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m|\u001b[39m (data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n\u001b[0;32m 11\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mto_datetime(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m],\u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY-\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm-\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\frame.py:5268\u001b[0m, in \u001b[0;36mDataFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m 5120\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdrop\u001b[39m(\n\u001b[0;32m 5121\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 5122\u001b[0m labels: IndexLabel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 5129\u001b[0m errors: IgnoreRaise \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 5130\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 5131\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 5132\u001b[0m \u001b[38;5;124;03m Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[0;32m 5133\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 5266\u001b[0m \u001b[38;5;124;03m weight 1.0 0.8\u001b[39;00m\n\u001b[0;32m 5267\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 5268\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 5269\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5270\u001b[0m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5271\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5272\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5273\u001b[0m \u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5274\u001b[0m \u001b[43m \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5275\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5276\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4549\u001b[0m, in \u001b[0;36mNDFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m 4547\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m axis, labels \u001b[38;5;129;01min\u001b[39;00m axes\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m 4548\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 4549\u001b[0m obj \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_drop_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4551\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inplace:\n\u001b[0;32m 4552\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_inplace(obj)\n", + "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4591\u001b[0m, in \u001b[0;36mNDFrame._drop_axis\u001b[1;34m(self, labels, axis, level, errors, only_slice)\u001b[0m\n\u001b[0;32m 4589\u001b[0m new_axis \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mdrop(labels, level\u001b[38;5;241m=\u001b[39mlevel, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m 4590\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 4591\u001b[0m new_axis \u001b[38;5;241m=\u001b[39m \u001b[43maxis\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4592\u001b[0m indexer \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mget_indexer(new_axis)\n\u001b[0;32m 4594\u001b[0m \u001b[38;5;66;03m# Case for non-unique axis\u001b[39;00m\n\u001b[0;32m 4595\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:6696\u001b[0m, in \u001b[0;36mIndex.drop\u001b[1;34m(self, labels, errors)\u001b[0m\n\u001b[0;32m 6694\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mask\u001b[38;5;241m.\u001b[39many():\n\u001b[0;32m 6695\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m-> 6696\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(labels[mask])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not found in axis\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 6697\u001b[0m indexer \u001b[38;5;241m=\u001b[39m indexer[\u001b[38;5;241m~\u001b[39mmask]\n\u001b[0;32m 6698\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdelete(indexer)\n", + "\u001b[1;31mKeyError\u001b[0m: \"['ID'] not found in axis\"" + ] } ], "source": [ @@ -127,12 +134,12 @@ "data.head(20)\n", "\n", "data = transform_numerical_to_string(data,transform_data)\n", - "data[\"State\"].groupby(data[\"State\"]).count()" + "data[\"State\"].groupby(data[\"State\"]).count()\n" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -143,7 +150,7 @@ " # read in the data from the csv file\n", " data = pd.read_csv('data/kickstarter_projects.csv')\n", "\n", - "\n", + " data = data.drop(\"ID\",axis=1)\n", " data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n", " data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n", " data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n", @@ -162,67 +169,7 @@ }, { "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
IDNameCategorySubcategoryCountryLaunchedDeadlineGoalPledgedBackersStateDuration
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [ID, Name, Category, Subcategory, Country, Launched, Deadline, Goal, Pledged, Backers, State, Duration]\n", - "Index: []" - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "get_original_data().head(20)" - ] - }, - { - "cell_type": "code", - "execution_count": 86, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -247,7 +194,14 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -268,7 +222,7 @@ " C : The regularization parameter for the logistic regression\n", "\n", " Returns:\n", - " metric : The number of the metric specified in the arguments\n", + " metric_value : The number of the metric specified in the arguments\n", " \"\"\"\n", "\n", " # create a logistic regression model\n", @@ -300,7 +254,58 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "\"['ID'] not found in axis\"", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[6], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# test out the functions\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m data,transform_data \u001b[38;5;241m=\u001b[39m \u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(data\u001b[38;5;241m.\u001b[39mhead())\n\u001b[0;32m 6\u001b[0m y \u001b[38;5;241m=\u001b[39m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m'\u001b[39m]\n", + "Cell \u001b[1;32mIn[4], line 9\u001b[0m, in \u001b[0;36mget_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m 6\u001b[0m data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/kickstarter_projects.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 7\u001b[0m \u001b[38;5;66;03m# transform the categorical values to numerical values\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mID\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 10\u001b[0m data \u001b[38;5;241m=\u001b[39m data[(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSuccessful\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m|\u001b[39m (data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n\u001b[0;32m 11\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mto_datetime(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m],\u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY-\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm-\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\frame.py:5268\u001b[0m, in \u001b[0;36mDataFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m 5120\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdrop\u001b[39m(\n\u001b[0;32m 5121\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 5122\u001b[0m labels: IndexLabel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 5129\u001b[0m errors: IgnoreRaise \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 5130\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 5131\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 5132\u001b[0m \u001b[38;5;124;03m Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[0;32m 5133\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 5266\u001b[0m \u001b[38;5;124;03m weight 1.0 0.8\u001b[39;00m\n\u001b[0;32m 5267\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 5268\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 5269\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5270\u001b[0m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5271\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5272\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5273\u001b[0m \u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5274\u001b[0m \u001b[43m \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5275\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5276\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4549\u001b[0m, in \u001b[0;36mNDFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m 4547\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m axis, labels \u001b[38;5;129;01min\u001b[39;00m axes\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m 4548\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 4549\u001b[0m obj \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_drop_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4551\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inplace:\n\u001b[0;32m 4552\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_inplace(obj)\n", + "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4591\u001b[0m, in \u001b[0;36mNDFrame._drop_axis\u001b[1;34m(self, labels, axis, level, errors, only_slice)\u001b[0m\n\u001b[0;32m 4589\u001b[0m new_axis \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mdrop(labels, level\u001b[38;5;241m=\u001b[39mlevel, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m 4590\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 4591\u001b[0m new_axis \u001b[38;5;241m=\u001b[39m \u001b[43maxis\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4592\u001b[0m indexer \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mget_indexer(new_axis)\n\u001b[0;32m 4594\u001b[0m \u001b[38;5;66;03m# Case for non-unique axis\u001b[39;00m\n\u001b[0;32m 4595\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:6696\u001b[0m, in \u001b[0;36mIndex.drop\u001b[1;34m(self, labels, errors)\u001b[0m\n\u001b[0;32m 6694\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mask\u001b[38;5;241m.\u001b[39many():\n\u001b[0;32m 6695\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m-> 6696\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(labels[mask])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not found in axis\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 6697\u001b[0m indexer \u001b[38;5;241m=\u001b[39m indexer[\u001b[38;5;241m~\u001b[39mmask]\n\u001b[0;32m 6698\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdelete(indexer)\n", + "\u001b[1;31mKeyError\u001b[0m: \"['ID'] not found in axis\"" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "# test out the functions\n", + "data,transform_data = get_data()\n", + "print(data.head())\n", + "\n", + "y = data['State']\n", + "X = data.drop(['State','Launched','Deadline'], axis=1)\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "#logistic_regression(X_train,X_test,y_train,y_test,metric=\"f1\",verbose=True,norm=\"l1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -351,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -379,7 +384,7 @@ "((265169, 9), (66293, 9), (265169,), (66293,))" ] }, - "execution_count": 23, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -399,63 +404,9 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n" - ] - }, { "data": { "text/html": [ @@ -478,49 +429,49 @@ " \n", " \n", " Parameters\n", - " accuracy\n", + " f1\n", " \n", " \n", " \n", " \n", " 0\n", - " {'C': 10.0, 'max_iter': 1000}\n", - " 0.999110\n", + " {'eta': 0.3, 'max_depth': 3}\n", + " 0.999261\n", " \n", " \n", " 1\n", - " {'C': 0.1, 'max_iter': 1000}\n", - " 0.999095\n", + " {'eta': 0.3, 'max_depth': 6}\n", + " 0.999261\n", " \n", " \n", " 2\n", - " {'C': 1.0, 'max_iter': 1000}\n", - " 0.999095\n", + " {'eta': 0.3, 'max_depth': 12}\n", + " 0.999261\n", " \n", " \n", " 3\n", - " {'C': 10.0, 'max_iter': 100}\n", - " 0.998989\n", + " {'eta': 0.3, 'max_depth': 24}\n", + " 0.999261\n", " \n", " \n", " 4\n", - " {'C': 0.1, 'max_iter': 100}\n", - " 0.998884\n", + " {'eta': 1, 'max_depth': 3}\n", + " 0.999125\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Parameters accuracy\n", - "0 {'C': 10.0, 'max_iter': 1000} 0.999110\n", - "1 {'C': 0.1, 'max_iter': 1000} 0.999095\n", - "2 {'C': 1.0, 'max_iter': 1000} 0.999095\n", - "3 {'C': 10.0, 'max_iter': 100} 0.998989\n", - "4 {'C': 0.1, 'max_iter': 100} 0.998884" + " Parameters f1\n", + "0 {'eta': 0.3, 'max_depth': 3} 0.999261\n", + "1 {'eta': 0.3, 'max_depth': 6} 0.999261\n", + "2 {'eta': 0.3, 'max_depth': 12} 0.999261\n", + "3 {'eta': 0.3, 'max_depth': 24} 0.999261\n", + "4 {'eta': 1, 'max_depth': 3} 0.999125" ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -530,11 +481,366 @@ "#\n", "# warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n", "\n", - "hyperparameters = {\"C\":[0.1,1.0,10.0],\"max_iter\":[10,100,1000]}\n", - "results = grid_search(logistic_regression,hyperparameters,\"accuracy\",X_train,X_test,y_train,y_test)\n", - "\n", + "hyperparameters = {\"eta\":[0.1,0.3,1],\"max_depth\":[3,6,12,24]}\n", + "results = grid_search(xgb,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n", "results.head()\n" ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Parametersf1
0{'C': 0.1, 'max_iter': 100, 'verbose': 'True'}0.651894
1{'C': 0.1, 'max_iter': 500, 'verbose': 'True'}0.651894
2{'C': 0.1, 'max_iter': 1000, 'verbose': 'True'}0.651894
3{'C': 1, 'max_iter': 100, 'verbose': 'True'}0.651894
4{'C': 1, 'max_iter': 500, 'verbose': 'True'}0.651894
\n", + "
" + ], + "text/plain": [ + " Parameters f1\n", + "0 {'C': 0.1, 'max_iter': 100, 'verbose': 'True'} 0.651894\n", + "1 {'C': 0.1, 'max_iter': 500, 'verbose': 'True'} 0.651894\n", + "2 {'C': 0.1, 'max_iter': 1000, 'verbose': 'True'} 0.651894\n", + "3 {'C': 1, 'max_iter': 100, 'verbose': 'True'} 0.651894\n", + "4 {'C': 1, 'max_iter': 500, 'verbose': 'True'} 0.651894" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hyperparameters = {\"C\":[0.1,1,10],\"max_iter\":[100,500,1000]}\n", + "results = grid_search(logistic_regression,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n", + "results.head()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Coefficients: [[-9.01102837e-11 -1.12392780e-06 -9.64314847e-10 -1.74546747e-09\n", + " -4.85736444e-10 -1.49496674e-05 1.96343898e-05 2.60577944e-07\n", + " -7.16729454e-09]]\n", + "Model Intercept: [-8.14858205e-11]\n", + "Model Score: [0 0 0 ... 0 0 0]\n" + ] + }, + { + "data": { + "text/plain": [ + "0.2483796371690063" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logistic_regression(X_train,X_test,y_train,y_test,metric=\"f1\",verbose=True,C=0.1,max_iter=100)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDNameCategorySubcategoryCountryGoalPledgedBackersDuration
16477419128507061607139921350035011929
7417819138823162088781090203205672729
2961986877370381353981313821250000275229
92665161445739520876361292170005282338
191647906726512300984129521200080314
\n", + "
" + ], + "text/plain": [ + " ID Name Category Subcategory Country Goal Pledged \n", + "164774 1912850706 16071 3 99 21 3500 3501 \\\n", + "74178 1913882316 208878 10 90 20 320 567 \n", + "296198 687737038 135398 13 138 21 250000 275 \n", + "92665 1614457395 208763 6 129 21 7000 528 \n", + "191647 906726512 300984 12 95 21 2000 80 \n", + "\n", + " Backers Duration \n", + "164774 19 29 \n", + "74178 27 29 \n", + "296198 2 29 \n", + "92665 23 38 \n", + "191647 3 14 " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def decision_tree(X_train,X_test,y_train,y_test,metric=\"accuracy\",verbose=False):\n", + " from sklearn.tree import DecisionTreeClassifier\n", + " from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", + " \"\"\"\n", + " This function performs logistic regression on the data and returns the accuracy of the model\n", + " Necessary Arguments:\n", + " X_train : The training data\n", + " X_test : The test data\n", + " y_train : The target values\n", + " y_test : The target values for the training data\n", + "\n", + " Optional Arguments:\n", + " metric : The metric to calculate the model performance, Options: \"accuracy\", \"precision\", \"recall\", \"f1\"\n", + " verbose : If True, the function will print the metric of the model\n", + "\n", + " Returns:\n", + " metric_value : The number of the metric specified in the arguments\n", + " \"\"\"\n", + "\n", + " # create a decision tree model\n", + " model = DecisionTreeClassifier()\n", + "\n", + " # fit the model to the training data\n", + " model.fit(X_train, y_train)\n", + "\n", + " # predict the target values for the test data\n", + " y_pred = model.predict(X_test)\n", + "\n", + " if verbose==True:\n", + " print(f\"Model Score: {y_pred}\")\n", + " cm = confusion_matrix(y_test, y_pred)\n", + " disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=data.target_names)\n", + " disp.plot()\n", + "\n", + " metric_value = 0\n", + " # calculate the metric of the model\n", + " if metric == \"accuracy\":\n", + " metric_value = accuracy_score(y_test, y_pred)\n", + " if metric == \"precision\":\n", + " metric_value = precision_score(y_test, y_pred)\n", + " if metric == \"recall\":\n", + " metric_value = recall_score(y_test, y_pred)\n", + " if metric == \"f1\":\n", + " metric_value = f1_score(y_test, y_pred)\n", + " return metric_value" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "def xgb(X_train,X_test,y_train,y_test,metric=\"accuracy\",booster=\"gbtree\",eta=0.3,max_depth=6, reg_lambda=1, verbose=False):\n", + " from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", + " \"\"\"\n", + " This function performs xgboost on the data and returns the accuracy of the model\n", + " Necessary Arguments:\n", + " X_train : The training data\n", + " X_test : The test data\n", + " y_train : The target values\n", + " y_test : The target values for the training data\n", + "\n", + " Optional Arguments:\n", + " metric : The metric to calculate the model performance, Options: \"accuracy\", \"precision\", \"recall\", \"f1\"\n", + " verbose : If True, the function will print the metric of the model\n", + " booster : The type of booster to use, Options: \"gbtree\", \"gblinear\", \"dart\"\n", + " eta : The learning rate of the model, between [0,1]\n", + " max_depth : The maximum depth of the trees, default is 6 to avoid overfitting\n", + " reg_lambda : The regularization parameter of the model\n", + "\n", + " Returns:\n", + " metric_value : The number of the metric specified in the arguments\n", + " \"\"\"\n", + " from xgboost import XGBClassifier\n", + " # create a logistic regression model\n", + " model = XGBClassifier(booster=booster,eta=eta)\n", + "\n", + " # fit the model to the training data\n", + " model.fit(X_train, y_train)\n", + "\n", + " # predict the target values for the test data\n", + " y_pred = model.predict(X_test)\n", + "\n", + " if verbose==True:\n", + " print(f\"Model Score: {y_pred}\")\n", + " cm = confusion_matrix(y_test, y_pred)\n", + " disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=data.target_names)\n", + " disp.plot()\n", + " \n", + "\n", + " metric_value = 0\n", + " # calculate the metric of the model\n", + " if metric == \"accuracy\":\n", + " metric_value = accuracy_score(y_test, y_pred)\n", + " if metric == \"precision\":\n", + " metric_value = precision_score(y_test, y_pred)\n", + " if metric == \"recall\":\n", + " metric_value = recall_score(y_test, y_pred)\n", + " if metric == \"f1\":\n", + " metric_value = f1_score(y_test, y_pred)\n", + " return metric_value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 470f1359bbfe1a55da3828bb3abf9c024ca28869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20K=C3=A4mmerling?= Date: Fri, 9 Aug 2024 13:49:29 +0200 Subject: [PATCH 3/3] Added the code to start from importing the data to the final prediction into the test.ipynb to look for errors, didn't find any --- base.ipynb | 507 +++++++++++++++++++++++++------------------------ test.ipynb | 541 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 791 insertions(+), 257 deletions(-) diff --git a/base.ipynb b/base.ipynb index 048ce07..e43207d 100644 --- a/base.ipynb +++ b/base.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -10,7 +10,8 @@ "import numpy as np\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n", - "from sklearn.linear_model import LogisticRegression" + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay" ] }, { @@ -74,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -86,7 +87,7 @@ " data = pd.read_csv('data/kickstarter_projects.csv')\n", " # transform the categorical values to numerical values\n", "\n", - " data = data.drop(\"ID\",axis=1)\n", + " data = data.drop([\"ID\",\"Name\"],axis=1)\n", " data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n", " data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n", " data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n", @@ -110,37 +111,6 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "\"['ID'] not found in axis\"", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[5], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m data, transform_data\u001b[38;5;241m=\u001b[39m \u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m data\u001b[38;5;241m.\u001b[39mdrop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mName\u001b[39m\u001b[38;5;124m\"\u001b[39m, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, inplace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 3\u001b[0m data\u001b[38;5;241m.\u001b[39mhead(\u001b[38;5;241m20\u001b[39m)\n", - "Cell \u001b[1;32mIn[4], line 9\u001b[0m, in \u001b[0;36mget_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m 6\u001b[0m data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/kickstarter_projects.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 7\u001b[0m \u001b[38;5;66;03m# transform the categorical values to numerical values\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mID\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 10\u001b[0m data \u001b[38;5;241m=\u001b[39m data[(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSuccessful\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m|\u001b[39m (data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n\u001b[0;32m 11\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mto_datetime(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m],\u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY-\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm-\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", - "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\frame.py:5268\u001b[0m, in \u001b[0;36mDataFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m 5120\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdrop\u001b[39m(\n\u001b[0;32m 5121\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 5122\u001b[0m labels: IndexLabel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 5129\u001b[0m errors: IgnoreRaise \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 5130\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 5131\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 5132\u001b[0m \u001b[38;5;124;03m Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[0;32m 5133\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 5266\u001b[0m \u001b[38;5;124;03m weight 1.0 0.8\u001b[39;00m\n\u001b[0;32m 5267\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 5268\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 5269\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5270\u001b[0m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5271\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5272\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5273\u001b[0m \u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5274\u001b[0m \u001b[43m \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5275\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5276\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4549\u001b[0m, in \u001b[0;36mNDFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m 4547\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m axis, labels \u001b[38;5;129;01min\u001b[39;00m axes\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m 4548\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 4549\u001b[0m obj \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_drop_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4551\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inplace:\n\u001b[0;32m 4552\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_inplace(obj)\n", - "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4591\u001b[0m, in \u001b[0;36mNDFrame._drop_axis\u001b[1;34m(self, labels, axis, level, errors, only_slice)\u001b[0m\n\u001b[0;32m 4589\u001b[0m new_axis \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mdrop(labels, level\u001b[38;5;241m=\u001b[39mlevel, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m 4590\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 4591\u001b[0m new_axis \u001b[38;5;241m=\u001b[39m \u001b[43maxis\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4592\u001b[0m indexer \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mget_indexer(new_axis)\n\u001b[0;32m 4594\u001b[0m \u001b[38;5;66;03m# Case for non-unique axis\u001b[39;00m\n\u001b[0;32m 4595\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", - "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:6696\u001b[0m, in \u001b[0;36mIndex.drop\u001b[1;34m(self, labels, errors)\u001b[0m\n\u001b[0;32m 6694\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mask\u001b[38;5;241m.\u001b[39many():\n\u001b[0;32m 6695\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m-> 6696\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(labels[mask])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not found in axis\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 6697\u001b[0m indexer \u001b[38;5;241m=\u001b[39m indexer[\u001b[38;5;241m~\u001b[39mmask]\n\u001b[0;32m 6698\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdelete(indexer)\n", - "\u001b[1;31mKeyError\u001b[0m: \"['ID'] not found in axis\"" - ] - } - ], - "source": [ - "data, transform_data= get_data()\n", - "data.drop(\"Name\", axis=1, inplace=True)\n", - "data.head(20)\n", - "\n", - "data = transform_numerical_to_string(data,transform_data)\n", - "data[\"State\"].groupby(data[\"State\"]).count()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, "outputs": [], "source": [ "def get_original_data():\n", @@ -150,7 +120,7 @@ " # read in the data from the csv file\n", " data = pd.read_csv('data/kickstarter_projects.csv')\n", "\n", - " data = data.drop(\"ID\",axis=1)\n", + " data = data.drop([\"ID\",\"Name\"],axis=1)\n", " data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n", " data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n", " data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n", @@ -169,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -201,11 +171,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def logistic_regression(X_train,X_test,y_train,y_test,metric=\"accuracy\",verbose=False,norm=\"l2\",max_iter=1000,C=1.0):\n", + " from sklearn.metrics import confusion_matrix\n", " \"\"\"\n", " This function performs logistic regression on the data and returns the accuracy of the model\n", " Necessary Arguments:\n", @@ -238,6 +209,8 @@ " print(f\"Model Coefficients: {model.coef_}\")\n", " print(f\"Model Intercept: {model.intercept_}\")\n", " print(f\"Model Score: {y_pred}\")\n", + " print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')\n", + "\n", "\n", " metric_value = 0\n", " # calculate the metric of the model\n", @@ -254,31 +227,37 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { - "ename": "KeyError", - "evalue": "\"['ID'] not found in axis\"", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[6], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# test out the functions\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m data,transform_data \u001b[38;5;241m=\u001b[39m \u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(data\u001b[38;5;241m.\u001b[39mhead())\n\u001b[0;32m 6\u001b[0m y \u001b[38;5;241m=\u001b[39m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m'\u001b[39m]\n", - "Cell \u001b[1;32mIn[4], line 9\u001b[0m, in \u001b[0;36mget_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m 6\u001b[0m data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/kickstarter_projects.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 7\u001b[0m \u001b[38;5;66;03m# transform the categorical values to numerical values\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mID\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 10\u001b[0m data \u001b[38;5;241m=\u001b[39m data[(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSuccessful\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m|\u001b[39m (data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mState\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n\u001b[0;32m 11\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mto_datetime(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDeadline\u001b[39m\u001b[38;5;124m\"\u001b[39m],\u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mY-\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124mm-\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", - "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\frame.py:5268\u001b[0m, in \u001b[0;36mDataFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m 5120\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdrop\u001b[39m(\n\u001b[0;32m 5121\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 5122\u001b[0m labels: IndexLabel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 5129\u001b[0m errors: IgnoreRaise \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 5130\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 5131\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 5132\u001b[0m \u001b[38;5;124;03m Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[0;32m 5133\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 5266\u001b[0m \u001b[38;5;124;03m weight 1.0 0.8\u001b[39;00m\n\u001b[0;32m 5267\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 5268\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 5269\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5270\u001b[0m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5271\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5272\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5273\u001b[0m \u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5274\u001b[0m \u001b[43m \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5275\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 5276\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4549\u001b[0m, in \u001b[0;36mNDFrame.drop\u001b[1;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[0;32m 4547\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m axis, labels \u001b[38;5;129;01min\u001b[39;00m axes\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m 4548\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 4549\u001b[0m obj \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_drop_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4551\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inplace:\n\u001b[0;32m 4552\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_inplace(obj)\n", - "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\generic.py:4591\u001b[0m, in \u001b[0;36mNDFrame._drop_axis\u001b[1;34m(self, labels, axis, level, errors, only_slice)\u001b[0m\n\u001b[0;32m 4589\u001b[0m new_axis \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mdrop(labels, level\u001b[38;5;241m=\u001b[39mlevel, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m 4590\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 4591\u001b[0m new_axis \u001b[38;5;241m=\u001b[39m \u001b[43maxis\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4592\u001b[0m indexer \u001b[38;5;241m=\u001b[39m axis\u001b[38;5;241m.\u001b[39mget_indexer(new_axis)\n\u001b[0;32m 4594\u001b[0m \u001b[38;5;66;03m# Case for non-unique axis\u001b[39;00m\n\u001b[0;32m 4595\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", - "File \u001b[1;32mc:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:6696\u001b[0m, in \u001b[0;36mIndex.drop\u001b[1;34m(self, labels, errors)\u001b[0m\n\u001b[0;32m 6694\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mask\u001b[38;5;241m.\u001b[39many():\n\u001b[0;32m 6695\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m-> 6696\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(labels[mask])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not found in axis\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 6697\u001b[0m indexer \u001b[38;5;241m=\u001b[39m indexer[\u001b[38;5;241m~\u001b[39mmask]\n\u001b[0;32m 6698\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdelete(indexer)\n", - "\u001b[1;31mKeyError\u001b[0m: \"['ID'] not found in axis\"" + "name": "stdout", + "output_type": "stream", + "text": [ + " Category Subcategory Country Launched Deadline Goal \n", + "0 5 52 21 2009-04-21 21:02:48 2009-05-31 1000 \\\n", + "1 6 129 21 2009-04-23 00:07:53 2009-07-20 80000 \n", + "2 0 70 21 2009-04-24 21:52:03 2009-05-03 20 \n", + "3 13 131 21 2009-04-25 17:36:21 2009-07-14 99 \n", + "4 5 52 21 2009-04-27 14:10:39 2009-05-26 1900 \n", + "\n", + " Pledged Backers State Duration \n", + "0 625 30 0 39 \n", + "1 22 3 0 87 \n", + "2 35 3 1 8 \n", + "3 145 25 1 79 \n", + "4 387 10 0 28 \n" ] + }, + { + "data": { + "text/plain": [ + "((265169, 7), (66293, 7), (265169,), (66293,))" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -291,21 +270,12 @@ "X = data.drop(['State','Launched','Deadline'], axis=1)\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n", - "X_train.shape, X_test.shape, y_train.shape, y_test.shape\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "#logistic_regression(X_train,X_test,y_train,y_test,metric=\"f1\",verbose=True,norm=\"l1\")" + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -348,63 +318,12 @@ "\n", " return results\n", "\n", - "\n", - "\n", - "\n", " " ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " ID Name Category Subcategory Country Launched \n", - "0 1860890148 115746 5 52 21 2009-04-21 21:02:48 \\\n", - "1 709707365 56298 6 129 21 2009-04-23 00:07:53 \n", - "2 1703704063 323842 0 70 21 2009-04-24 21:52:03 \n", - "3 727286 192565 13 131 21 2009-04-25 17:36:21 \n", - "4 1622952265 200031 5 52 21 2009-04-27 14:10:39 \n", - "\n", - " Deadline Goal Pledged Backers State Duration \n", - "0 2009-05-31 1000 625 30 0 39 \n", - "1 2009-07-20 80000 22 3 0 87 \n", - "2 2009-05-03 20 35 3 1 8 \n", - "3 2009-07-14 99 145 25 1 79 \n", - "4 2009-05-26 1900 387 10 0 28 \n" - ] - }, - { - "data": { - "text/plain": [ - "((265169, 9), (66293, 9), (265169,), (66293,))" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "# test out the functions\n", - "data,transform_data = get_data()\n", - "print(data.head())\n", - "\n", - "y = data['State']\n", - "X = data.drop(['State','Launched','Deadline'], axis=1)\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n", - "X_train.shape, X_test.shape, y_train.shape, y_test.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -483,12 +402,12 @@ "\n", "hyperparameters = {\"eta\":[0.1,0.3,1],\"max_depth\":[3,6,12,24]}\n", "results = grid_search(xgb,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n", - "results.head()\n" + "results.head()" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -512,97 +431,123 @@ " \n", " \n", " \n", - " Parameters\n", - " f1\n", + " Category\n", + " Subcategory\n", + " Country\n", + " Goal\n", + " Pledged\n", + " Backers\n", + " Duration\n", " \n", " \n", " \n", " \n", - " 0\n", - " {'C': 0.1, 'max_iter': 100, 'verbose': 'True'}\n", - " 0.651894\n", + " 164774\n", + " 3\n", + " 99\n", + " 21\n", + " 3500\n", + " 3501\n", + " 19\n", + " 29\n", " \n", " \n", - " 1\n", - " {'C': 0.1, 'max_iter': 500, 'verbose': 'True'}\n", - " 0.651894\n", + " 74178\n", + " 10\n", + " 90\n", + " 20\n", + " 320\n", + " 567\n", + " 27\n", + " 29\n", " \n", " \n", - " 2\n", - " {'C': 0.1, 'max_iter': 1000, 'verbose': 'True'}\n", - " 0.651894\n", + " 296198\n", + " 13\n", + " 138\n", + " 21\n", + " 250000\n", + " 275\n", + " 2\n", + " 29\n", " \n", " \n", - " 3\n", - " {'C': 1, 'max_iter': 100, 'verbose': 'True'}\n", - " 0.651894\n", + " 92665\n", + " 6\n", + " 129\n", + " 21\n", + " 7000\n", + " 528\n", + " 23\n", + " 38\n", " \n", " \n", - " 4\n", - " {'C': 1, 'max_iter': 500, 'verbose': 'True'}\n", - " 0.651894\n", + " 191647\n", + " 12\n", + " 95\n", + " 21\n", + " 2000\n", + " 80\n", + " 3\n", + " 14\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Parameters f1\n", - "0 {'C': 0.1, 'max_iter': 100, 'verbose': 'True'} 0.651894\n", - "1 {'C': 0.1, 'max_iter': 500, 'verbose': 'True'} 0.651894\n", - "2 {'C': 0.1, 'max_iter': 1000, 'verbose': 'True'} 0.651894\n", - "3 {'C': 1, 'max_iter': 100, 'verbose': 'True'} 0.651894\n", - "4 {'C': 1, 'max_iter': 500, 'verbose': 'True'} 0.651894" + " Category Subcategory Country Goal Pledged Backers Duration\n", + "164774 3 99 21 3500 3501 19 29\n", + "74178 10 90 20 320 567 27 29\n", + "296198 13 138 21 250000 275 2 29\n", + "92665 6 129 21 7000 528 23 38\n", + "191647 12 95 21 2000 80 3 14" ] }, - "execution_count": 25, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "hyperparameters = {\"C\":[0.1,1,10],\"max_iter\":[100,500,1000]}\n", - "results = grid_search(logistic_regression,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n", - "results.head()\n", - "\n" + "X_train.head()" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 12, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "Model Coefficients: [[-9.01102837e-11 -1.12392780e-06 -9.64314847e-10 -1.74546747e-09\n", - " -4.85736444e-10 -1.49496674e-05 1.96343898e-05 2.60577944e-07\n", - " -7.16729454e-09]]\n", - "Model Intercept: [-8.14858205e-11]\n", - "Model Score: [0 0 0 ... 0 0 0]\n" + "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n", + "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n", + "c:\\Users\\tobia\\anaconda3\\envs\\env\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" ] }, - { - "data": { - "text/plain": [ - "0.2483796371690063" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logistic_regression(X_train,X_test,y_train,y_test,metric=\"f1\",verbose=True,C=0.1,max_iter=100)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ { "data": { "text/html": [ @@ -624,105 +569,90 @@ " \n", " \n", " \n", - " ID\n", - " Name\n", - " Category\n", - " Subcategory\n", - " Country\n", - " Goal\n", - " Pledged\n", - " Backers\n", - " Duration\n", + " Parameters\n", + " f1\n", " \n", " \n", " \n", " \n", - " 164774\n", - " 1912850706\n", - " 16071\n", - " 3\n", - " 99\n", - " 21\n", - " 3500\n", - " 3501\n", - " 19\n", - " 29\n", + " 0\n", + " {'C': 0.1, 'max_iter': 100}\n", + " 0.999155\n", " \n", " \n", - " 74178\n", - " 1913882316\n", - " 208878\n", - " 10\n", - " 90\n", - " 20\n", - " 320\n", - " 567\n", - " 27\n", - " 29\n", + " 1\n", + " {'C': 0.1, 'max_iter': 500}\n", + " 0.999155\n", " \n", " \n", - " 296198\n", - " 687737038\n", - " 135398\n", - " 13\n", - " 138\n", - " 21\n", - " 250000\n", - " 275\n", - " 2\n", - " 29\n", + " 2\n", + " {'C': 0.1, 'max_iter': 1000}\n", + " 0.999155\n", " \n", " \n", - " 92665\n", - " 1614457395\n", - " 208763\n", - " 6\n", - " 129\n", - " 21\n", - " 7000\n", - " 528\n", - " 23\n", - " 38\n", + " 3\n", + " {'C': 1, 'max_iter': 100}\n", + " 0.999155\n", " \n", " \n", - " 191647\n", - " 906726512\n", - " 300984\n", - " 12\n", - " 95\n", - " 21\n", - " 2000\n", - " 80\n", - " 3\n", - " 14\n", + " 4\n", + " {'C': 1, 'max_iter': 500}\n", + " 0.999155\n", " \n", " \n", "\n", "" ], "text/plain": [ - " ID Name Category Subcategory Country Goal Pledged \n", - "164774 1912850706 16071 3 99 21 3500 3501 \\\n", - "74178 1913882316 208878 10 90 20 320 567 \n", - "296198 687737038 135398 13 138 21 250000 275 \n", - "92665 1614457395 208763 6 129 21 7000 528 \n", - "191647 906726512 300984 12 95 21 2000 80 \n", - "\n", - " Backers Duration \n", - "164774 19 29 \n", - "74178 27 29 \n", - "296198 2 29 \n", - "92665 23 38 \n", - "191647 3 14 " + " Parameters f1\n", + "0 {'C': 0.1, 'max_iter': 100} 0.999155\n", + "1 {'C': 0.1, 'max_iter': 500} 0.999155\n", + "2 {'C': 0.1, 'max_iter': 1000} 0.999155\n", + "3 {'C': 1, 'max_iter': 100} 0.999155\n", + "4 {'C': 1, 'max_iter': 500} 0.999155" ] }, - "execution_count": 27, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "X_train.head()" + "hyperparameters = {\"C\":[0.1,1,10],\"max_iter\":[100,500,1000]}\n", + "results = grid_search(logistic_regression,hyperparameters,\"f1\",X_train,X_test,y_train,y_test)\n", + "results.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Coefficients: [[ 0.02520122 0.00417009 0.04143354 -0.10483769 0.10516741 0.08317194\n", + " 0.00119811]]\n", + "Model Intercept: [0.00554605]\n", + "Model Score: [0 0 0 ... 0 1 0]\n", + "Confusion Matrix: [[39351 56]\n", + " [ 0 26886]]\n" + ] + }, + { + "data": { + "text/plain": [ + "0.9989596492531768" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logistic_regression(X_train,X_test,y_train,y_test,metric=\"f1\",verbose=True,C=0.1,max_iter=500)" ] }, { @@ -733,7 +663,6 @@ "source": [ "def decision_tree(X_train,X_test,y_train,y_test,metric=\"accuracy\",verbose=False):\n", " from sklearn.tree import DecisionTreeClassifier\n", - " from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", " \"\"\"\n", " This function performs logistic regression on the data and returns the accuracy of the model\n", " Necessary Arguments:\n", @@ -780,12 +709,12 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "def xgb(X_train,X_test,y_train,y_test,metric=\"accuracy\",booster=\"gbtree\",eta=0.3,max_depth=6, reg_lambda=1, verbose=False):\n", - " from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", + "\n", " \"\"\"\n", " This function performs xgboost on the data and returns the accuracy of the model\n", " Necessary Arguments:\n", @@ -835,12 +764,76 @@ " return metric_value" ] }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "data,transform_data = get_data()\n", + "data.head()\n", + "data.groupby(\"State\").count()\n", + "\n", + "y = data['State']\n", + "X = data.drop(['State','Launched','Deadline'], axis=1)" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data = pd.read_csv('data/kickstarter_projects.csv')\n", + "\n", + "\n", + "data = data.drop([\"ID\",\"Name\"],axis=1)\n", + "data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n", + "data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n", + "data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n", + "data[\"Duration\"] = (data[\"Deadline\"] - data[\"Launched\"]).dt.days\n", + "\n", + "data, transform_data = transform_strings_to_numerical(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Coefficients: [[ 0.02520122 0.00417009 0.04143354 -0.10483769 0.10516741 0.08317194\n", + " 0.00119811]]\n", + "Model Intercept: [0.00554605]\n", + "Model Score: [0 0 0 ... 0 1 0]\n", + "Confusion Matrix: [[39351 56]\n", + " [ 0 26886]]\n", + "0.9989596492531768\n" + ] + } + ], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape\n", + "\n", + "model = LogisticRegression(max_iter=1000,penalty=\"l2\",C=0.1)\n", + "# fit the model to the training data\n", + "model.fit(X_train, y_train)\n", + "\n", + "# predict the target values for the test data\n", + "y_pred = model.predict(X_test)\n", + "\n", + "print(f\"Model Coefficients: {model.coef_}\")\n", + "print(f\"Model Intercept: {model.intercept_}\")\n", + "print(f\"Model Score: {y_pred}\")\n", + "print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')\n", + "\n", + "metric_value = f1_score(y_test, y_pred)\n", + "print(metric_value)" + ] } ], "metadata": { diff --git a/test.ipynb b/test.ipynb index e69de29..6039a6c 100644 --- a/test.ipynb +++ b/test.ipynb @@ -0,0 +1,541 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "data = pd.read_csv('data/kickstarter_projects.csv')\n", + "\n", + "data = data.drop([\"ID\",\"Name\"],axis=1)\n", + "data = data[(data[\"State\"] == \"Successful\") | (data[\"State\"] == \"Failed\")]\n", + "data[\"Deadline\"] = pd.to_datetime(data[\"Deadline\"],format='%Y-%m-%d')\n", + "data[\"Launched\"] = pd.to_datetime(data[\"Launched\"],format='%Y-%m-%d %H:%M:%S')\n", + "data[\"Duration\"] = (data[\"Deadline\"] - data[\"Launched\"]).dt.days\n", + "\n", + "for column in data.columns:\n", + " # If data type is an object, for example a string, we want to convert the column to numerical values\n", + " if data[column].dtype == 'object' and data[column].dtype != 'datetime64[ns]':\n", + " le = LabelEncoder()\n", + " data[column] = le.fit_transform(data[column])\n", + "\n", + "#data, transform_data = transform_strings_to_numerical(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "y = data['State']\n", + "X = data.drop(['State','Launched','Deadline'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategorySubcategoryCountryGoalPledgedBackersDuration
05522110006253039
16129218000022387
207021203538
31313121991452579
45522119003871028
5977213000332911017
661292120041329
71254215005631829
9101252130015216
10111042135016303148
\n", + "
" + ], + "text/plain": [ + " Category Subcategory Country Goal Pledged Backers Duration\n", + "0 5 52 21 1000 625 30 39\n", + "1 6 129 21 80000 22 3 87\n", + "2 0 70 21 20 35 3 8\n", + "3 13 131 21 99 145 25 79\n", + "4 5 52 21 1900 387 10 28\n", + "5 9 77 21 3000 3329 110 17\n", + "6 6 129 21 200 41 3 29\n", + "7 12 54 21 500 563 18 29\n", + "9 10 125 21 300 15 2 16\n", + "10 11 104 21 350 1630 31 48" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0\n", + "1 0\n", + "2 1\n", + "3 1\n", + "4 0\n", + "5 1\n", + "6 0\n", + "7 1\n", + "9 0\n", + "10 1\n", + "Name: State, dtype: int32" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((265169, 7), (66293, 7), (265169,), (66293,))" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategorySubcategoryCountryGoalPledgedBackersDuration
3224391012521150018253934
33434943721110060274196
219576411301166291622959
128525129521500050503129
1339212952150000044
\n", + "
" + ], + "text/plain": [ + " Category Subcategory Country Goal Pledged Backers Duration\n", + "322439 10 125 21 1500 1825 39 34\n", + "334349 4 37 21 1100 6027 419 6\n", + "219576 4 113 0 116629 1622 9 59\n", + "128525 12 95 21 5000 5050 31 29\n", + "13392 12 95 21 5000 0 0 44" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategorySubcategoryCountryGoalPledgedBackersDuration
214728121913558217432739
2247964652170060239
158299131312180057429
3294871047218000917910827
162625098212500051344
\n", + "
" + ], + "text/plain": [ + " Category Subcategory Country Goal Pledged Backers Duration\n", + "214728 12 19 13 5582 1743 27 39\n", + "224796 4 65 21 700 60 2 39\n", + "158299 13 131 21 800 57 4 29\n", + "329487 10 47 21 8000 9179 108 27\n", + "162625 0 98 21 25000 51 3 44" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "model = LogisticRegression(max_iter=1000,penalty=\"l2\",C=0.1)\n", + "# fit the model to the training data\n", + "model.fit(X_train, y_train)\n", + "\n", + "# predict the target values for the test data\n", + "y_pred = model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Coefficients: [[ 0.01684149 0.0032329 0.03237224 -0.1055174 0.10587912 0.07303369\n", + " -0.00594841]]\n", + "Model Intercept: [0.58553995]\n", + "Model Score: [0 0 0 ... 1 0 0]\n", + "Confusion Matrix: [[39342 76]\n", + " [ 0 26875]]\n", + "0.9985880429532197\n" + ] + } + ], + "source": [ + "print(f\"Model Coefficients: {model.coef_}\")\n", + "print(f\"Model Intercept: {model.intercept_}\")\n", + "print(f\"Model Score: {y_pred}\")\n", + "print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')\n", + "\n", + "metric_value = f1_score(y_test, y_pred)\n", + "print(metric_value)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}