From 8dc776e023cdab604335ce11930698d23a0397c0 Mon Sep 17 00:00:00 2001 From: AyeshaK <124094029+Arrowhorse@users.noreply.github.com> Date: Mon, 20 Oct 2025 09:46:04 -0400 Subject: [PATCH] Add Chicago Crime Data Analysis notebook --- Chicago Crime Data Analysis.ipynb | 522 ++++++++++++++++++++++++++++++ 1 file changed, 522 insertions(+) create mode 100644 Chicago Crime Data Analysis.ipynb diff --git a/Chicago Crime Data Analysis.ipynb b/Chicago Crime Data Analysis.ipynb new file mode 100644 index 0000000..1895222 --- /dev/null +++ b/Chicago Crime Data Analysis.ipynb @@ -0,0 +1,522 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d4061a35-58cc-4116-bd50-39149059348d", + "metadata": {}, + "outputs": [], + "source": [ + "# Data I'm using:\n", + "# City of Chicago's Public Crime Data: https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-Present/ijzp-q8t2/about_data\n", + "# There are about 8.34 million records from 2001 to last week. The report is updated every day. 
\n", + "\n", + "# Variables: ID, Case Number, Date, Block, IUCR (The Illinois Uniform Crime Reporting code), Primary Type, Description, Location Description, Arrest\n", + "# Variables Cont: Domestic, Beat, District, Ward, Community Area, FBI Code, X Coordinate, Y Coordinate, Year, Updated On, Latitude, Longitude, Location\n", + "# Types: Number, Text, Floating Timestamp, Checkbox\n", + "\n", + "# Data Preparation: Removed irrelevant columns and columns with missing or null values.\n", + "# Data Prep Part 2: Converted Date to datetime for later analysis and saved time features for trend analysis\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import time\n", + "\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.tree import DecisionTreeClassifier, plot_tree\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import SVC\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay\n", + "\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.metrics import silhouette_score\n", + "from yellowbrick.cluster import KElbowVisualizer\n", + "from sklearn.decomposition import PCA" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3568c835-7e2d-4a36-a0ff-953c63dfcef5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 8335148 entries, 0 to 8335147\n", + "Data columns (total 22 columns):\n", + " # Column Dtype \n", + "--- ------ ----- \n", + " 0 ID int64 \n", + " 1 Case Number object \n", + " 2 Date object \n", + " 3 Block object \n", + " 4 IUCR object \n", + " 5 Primary Type object \n", + " 6 Description object \n", + 
" 7 Location Description object \n", + " 8 Arrest bool \n", + " 9 Domestic bool \n", + " 10 Beat int64 \n", + " 11 District float64\n", + " 12 Ward float64\n", + " 13 Community Area float64\n", + " 14 FBI Code object \n", + " 15 X Coordinate float64\n", + " 16 Y Coordinate float64\n", + " 17 Year int64 \n", + " 18 Updated On object \n", + " 19 Latitude float64\n", + " 20 Longitude float64\n", + " 21 Location object \n", + "dtypes: bool(2), float64(7), int64(3), object(10)\n", + "memory usage: 1.3+ GB\n" + ] + } + ], + "source": [ + "df = pd.read_csv(\"C:/Users/Ayesh/Downloads/chicago_crime.csv\")\n", + "\n", + "df.shape\n", + "df.head()\n", + "df.columns\n", + "df.info()\n", + "df.describe(include='all')\n", + "\n", + "df.isnull().sum()\n", + "\n", + "df = df[['Date', 'Primary Type', 'Location Description', 'Arrest', 'Community Area', 'Latitude', 'Longitude']]\n", + "\n", + "df['Date'] = pd.to_datetime(df['Date'], errors='coerce')\n", + "df = df.dropna(subset=['Primary Type', 'Date', 'Community Area', 'Latitude', 'Longitude'])\n", + "\n", + "df['Hour'] = df['Date'].dt.hour\n", + "df['DayOfWeek'] = df['Date'].dt.day_name()\n", + "df['Month'] = df['Date'].dt.month\n", + "df['Year'] = df['Date'].dt.year" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "42330976-5d59-4c96-8df8-f1f3e38954e1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Ayesh\\AppData\\Local\\Temp\\ipykernel_6484\\1729047733.py:5: FutureWarning: \n", + "\n", + "Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.\n", + "\n", + " sns.barplot(x=top_crimes.values, y=top_crimes.index, palette=\"magma\")\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot top 10 highest types of crime\n", + "df.head()\n", + "top_crimes = df['Primary Type'].value_counts().head(10)\n", + "plt.figure(figsize=(10, 6))\n", + "# Colour by the y variable with the legend off: same look, without seaborn's palette-without-hue deprecation\n", + "sns.barplot(x=top_crimes.values, y=top_crimes.index, hue=top_crimes.index, palette=\"magma\", legend=False)\n", + "plt.title(\"Top 10 Most Common Crimes in Chicago\")\n", + "plt.xlabel(\"Number of Reports\")\n", + "plt.ylabel(\"Crime Type\")\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "top5 = ['Theft', 'Battery', 'Criminal Damage', 'Narcotics', 'Assault']\n", + "# isin() is an exact, case-sensitive match and the names above are title-cased, so match on a\n", + "# title-cased key; otherwise a case mismatch with the raw labels silently filters out every row.\n", + "# .copy() so the columns added afterwards are written to an independent frame, not a slice.\n", + "df = df[df['Primary Type'].str.title().isin(top5)].copy()\n", + "# Store the normalised labels so later exact-name lookups on 'Primary Type' also match\n", + "df['Primary Type'] = df['Primary Type'].str.title()\n", + "\n", + "# Results of plot: \n", + "# The top 10 most common crime types are: Theft, Battery, Criminal Damage, Narcotics, Assault, Other Offense, Burglary, \n", + "# Motor Vehicle Theft, Deceptive Practice, and Robbery. I'm only keeping the top 5" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ca5c4a11-b4df-4b02-bfcd-7e60ca494e85", + "metadata": {}, + "outputs": [], + "source": [ + "df['Hour'] = df['Date'].dt.hour\n", + "df['DayOfWeek'] = df['Date'].dt.day_name()\n", + "df['Month'] = df['Date'].dt.month\n", + "\n", + "# Define season\n", + "def get_season(month):\n", + "    \"\"\"Map a month number to 'Winter' (12, 1, 2), 'Spring' (3-5), 'Summer' (6-8) or 'Fall' (anything else).\"\"\"\n", + "    if month in [12, 1, 2]:\n", + "        return 'Winter'\n", + "    elif month in [3, 4, 5]:\n", + "        return 'Spring'\n", + "    elif month in [6, 7, 8]:\n", + "        return 'Summer'\n", + "    else:\n", + "        return 'Fall'\n", + "df['Season'] = df['Month'].apply(get_season)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5f205cd6-948d-4813-b940-deb1c9ede721", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAqYAAAHnCAYAAABql5/LAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjMsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvZiW1igAAAAlwSFlzAAAPYQAAD2EBqD+naQAAKGlJREFUeJzt3Qmc1OV9P/BnkXLFoBxKE61Qr4iIitAmbXg1bQwGrYbDYEVTSBQ1TUVbTbRgFbyioE2rktYjJSUNbSP1IDFKFI09PBsUKVroems1BhWqlUvK/l/f5/Wf7e5yuCPL7sPO+/16jbvz29/sPDPPLH7m+xxT19DQ0JAAAKCDdenoBgAAQBBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTKHG/fu//3v6xje+kX77t387HX744elzn/tcuvjii9Mrr7zygbd97LHH0ic+8Yn8tSPcfvvt+f63dbnssss6pF27op3Zl7//+7+fL9sS93vDDTe0+f0Cu56uHd0AoOPMnz8/ffOb30yf/OQn0/nnn5/23nvv9NJLL6W//uu/Tvfee2+aN29eOuSQQ7Z5+yFDhqQf/OAH6cADD0wdac6cOWmvvfba4nj//v07pD0AfDiCKdSoJUuWpCuvvDKdeuqp6aKLLmo8HiE1qqZjx45N06dPz1XJbdl9993TkUcemTra4MGD07777tvRzQBgBxnKhxoVVdGPfvSj6bzzztviZ3379k1/8id/ko4++ui0du3afOyzn/1srq5Onjw5D/lHmG05/BvDsaNHj0733XdfOv7449PQoUPTmDFj0pNPPpmWLl2aJkyYkG8bP3vkkUea3ed//ud/prPOOisdddRR+fKHf/iHrZpO0BrRrlGjRuXK6q//+q+nkSNHpv/+7//OP1uwYEH63d/93XTYYYfl6Qxx7v/+7/82u/2iRYvSCSeckNs+bty4/HgOPfTQxtBemVLw6quvNrtdPGfxPFZs3rw53XzzzbktcX+f//zn09/+7d82u00MecdzG+dFe+I5PPnkk9OyZcuanRfP52mnnZafq0996lO5H9944420adOm/PiiAt7SMccck/70T/90u8/Vs88+m0455ZR8v9HOpu0755xz0m/91m/lx9FUtDceS1vZsGFD+va3v51fS9GOaHc8H03vt+Vzu7V+2F6/A2VSMYUa1NDQkP71X/81/8+9Z8+eWz3nuOOO2+rQ/1e+8pV0xhlnpI985CNp48aNW5zz85//PF199dXpj//4j1OvXr3S5ZdfngPNL/3SL6WvfvWr6WMf+1jjzx988MHUo0eP9MILL+Twtf/++6dZs2blcPVXf/VXaeLEiWnhwoWpX79+2308EVjiNk116dIlXypee+219E//9E/pz//8z9OaNWvSHnvskW666aZ8/Utf+lKaNm1a+o//+I8cZl5//fUcwsP999+fzj333BymYy7u8uXL0x/8wR9sEc5aY+bMmTk8RQAfNmxY+rd/+7d8P++8804O4hU/+clP0gEHHJBDZPRVPCdTp05NDzzwQNptt93SM888k9t8xBFHpNmzZ+cg/Wd/9mfp9NNPT3feeWeudkeg/J//+Z9c1a5UyGOaRjz323PVVVelSZMmpa997Wv5/q644or8WOMNyRe/+MXctngj8hu/8Rv5/PXr1+fgHq+J7YnH0bKPtnVevE4ieJ999tl5Kknc31/8xV/kNyrxeqrG1vodKJdgCjVo9erVuSpV7fD3xz/+8fT1r3+98frWFsqsW7cuzZgxI1fWKhW4CE0xbSCCTYgqbITVCKQxDB8VrQjIf/M3f9MYpCL4xJSC73znO+nCCy/cbruiKtZSVMeiKlwRoSh+z4gRI/L1d999N/3lX/5l+r3f+73GKmLcZs8998zXI4AfdNBBuXIX1c14DCEeV11dXQ5K1YjHeuutt+bK5plnntl4f/G7IiBHlbJPnz6NbY22V56L9957L7c9gnO
05cYbb8ztnDt3burevXs+J+YHR5W0vr4+nXjiiemWW27JITK+DxFYBw0alCus23PSSSelCy64oLF9UYWN9kUlN67/8i//cv5dlWAa1fHozwjD2xMhPOYkf5B//ud/Tg8//HD61re+lSvZ4dOf/nR+A3Pdddfl0Bz90lot+x0om6F8qEFRdQsth6w/SITI1mgafioLkKK6VxGhKkSlMDz66KN5qDXCRwSJuEQoizARIeWDRHX1H//xH5tdYmeB7bU/huOj2hdV48p9xiWuh4ceeiiH7KeffjpPaWjqC1/4QqpWPMaoBm7t/uJNQlQ0K2IxWSWUhgEDBuSv0Z4Q50ZAroTSEBXYqHDGY/zVX/3VNHz48FxtDvE477nnnjR+/PgPbGfLSnmE/rfeeis9//zzuQIdUxliYVylLXfccUf6zd/8zRxYtydCacs+qlyaevzxx1PXrl3zMP7WnvP4ebVa+7oFOp6KKdSgGM6MofgY5tyWqIK9//77zYY+Y2i+NZqGqoptTRkIMcR6991358vW5rt+kIMPPrhV1d94zE3vM1Sqly394he/aAzOLdtQCYrVqNxfpQrYUlQmt/VcVaYkVKYPxO/6oOkNUZ2OxWsxLSGCbFRdP6iqubWdDCr3U5mbGRXYqNhGOI25rTFX+Nprr23Vcx/zRT9I3E9Ujitvnioquy5EpbtaTfsdKJtgCjUqhmVjKD6qdU0rbxUx7BxzG6Oi1Zoh2B0Ri7Ci6hbD5y1F9Wxn6N27d/4aoSqGuLcW0OKcCIVvvvnmVkNmRQzHh5bzTiMMtry/2IJra0EppklU83y9/fbbWxyPuZRRHYxh/ag4xvzQmP/5s5/9LA+HtyZQt1wcVHnslYD6K7/yK7m6HRXYeB7iTUhMuWgr8UYopppENb9pOI03CqEy3WFrFf/KQj1g12UoH2pUrOiOYLG1uZKrVq3K8xdjSHlnh9IQQSfmokaoiqpaXGIuZcw5jTmMO0NMLYgFWVGprNxnXCIIx/zGWNkdlcsYIo/qYAzDV/z0pz/daoU4Fn5VPPfcc80CbGWOY4SupvcXATPmTrYMu9sTvyumGjRdfBYLoqL6G1MPKtXtGJa/66678rmtGcYPsSCtqR//+Md5wdrAgQObVWNjikX87riPrb2x2ZHXQkxxiEDd1A9/+MP8NaYoVJ7zps93aDodAtg1qZhCjYr9R2O1eQTTCFExzBvVqFg8EwtvopJa7QKfDytWgMeq/FitHivxI+jExv2LFy9O119//U65z3isU6ZMyaEwVq/H/q0RUuN6VEArHywQi5W+/OUv51XzsVDq5Zdfzuc0FbeN+bGx4j2e06iURrsrc2lDbGMU8yRj7ut//dd/5eAdC6JitXhMQ9ha1XZ7z1e0JZ6vWAwUc0ijr2I7q6iMNg2QcV5UIVtb1YzV/FHRje2wIpT+y7/8S175X6kKh9gaKlbHxxZWW5vLuyNi7mw8n7EALfoj+iHmlcZirpjfWvkwh9/5nd/Ji7LiEm8yYn5tzOMFdm2CKdSw2PYoAkjlE6BiGDeqY7F/ZmVrp/YQ4SPaECEtVoRHdTLmjcaK+JYLj9rSH/3RH+W5i3/3d3+XV/9HgIvV5hFGY7i8Up2MoH7NNdfk7Yv222+/vMq76X6gMUwf20zFyv0IsPvss08+N1avt9yKKYLUP/zDP+RqXwyPR8Ux2tFyTuX2RJ9FgIz7i9tG9fAzn/lM3jGhW7duzd58RDiO+2h6fHti+D+eiwi6MWzfdHV8RbxxiPmlsSAqwnBbquxSEME+KuZRUY7gHn3SdKpHhPL4WfRNzIWO12zs/BCvaWDXVdfQdHwKgA8Uw/wRmCNotnaIvCM89dRTefunWJ2/vY+WrVZUaCMIR+U29jcFaCsqpgCdTCxqi0tUbGORW1uF0piCENtDxfzSqGxW9kgFaCsWPwF0MrHA6rvf/W7eWSCG5ttK7FAQUwhiGkJMu9jatmAAHTKUH6tBYwgrJr7HRPWtiVWi8Qk
w8RnYMWH90ksvzRP+AQCgTSqmsVo3JqLH6t1tif3kYuuSWDgQnw0dW67EZHX7zAEA0CbBNPYajMn0sWXK9sQnuMTKzVhhe8ABB6SLLroob0HScm86AAD4UME09pOLofvYY/CDVoPGRsiVve/ia3x+9tKlSz3zAADs+Kr8U045pVXnxSfHVDZCrog9+7Y3/N/Uk08+mfcyjE9mAQCgPLGPcBQfY8pm0dtFrVu3bosNneN604/Q254IpXFp7fkAAOzadlowjfmlLUNlXI+P7WuNqJTG+fExffF51XRu8UbmxRdf1N81Qn/XFv1dW/R3bamvr89byRUfTAcMGJDefPPNZsfi+t57713V74kXda9evdq4dZRKf9cW/V1b9Hdt0d+1oe7/ryUqfoP9I444onGeaIivTzzxRD4OAAA7NZjGgqf4DOUwevTo9M4776Qrr7wybzEVX6O8f+yxx7blXQIA0Em0aTCNz2SO/UtDfFTdTTfdlJYsWZI/ISq2j7r55puV9QEAaPs5pitXrtzu9cMPPzzdcccdO3IXAADUiJ02xxQAAKohmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAHbNYLphw4Y0ffr0NGLEiDRy5Mg0d+7cbZ573333pWOPPTYNGzYsTZw4MT399NM72l4AADqpqoPp7Nmz0/Lly9O8efPSjBkz0pw5c9KiRYu2OK++vj6df/756ayzzkoLFy5MgwcPzt+vW7eurdoOAECtBtO1a9emBQsWpIsuuigNGTIkjRo1Kk2ZMiXNnz9/i3MfeuihdOCBB6axY8em/fbbL5133nlp1apV6dlnn23L9gMA0El0rebkFStWpE2bNuWh+Yrhw4enG2+8MW3evDl16fJ/OXfPPffMIXTJkiX5/Ntvvz3tvvvuOaRWQ4W1NlT6WX/XBv1dW/R3bdHftaWhoSHV1dV1TDCNimefPn1St27dGo/1798/zztds2ZN6tu3b+Px4447Lj3wwAPplFNOSbvttlsOrTfddFPaY489qmrgiy++WNX57Nr0d23R37VFf9cW/V07ujXJhe0aTOPdT8s7r1zfuHFjs+OrV6/OQfaSSy5JRxxxRPr7v//7NG3atHTHHXekfv36tfo+Bw0alHr27FlNM9kFxWsr/hHT37VBf9cW/V1b9Hdtqa+vb9PfV1Uw7d69+xYBtHK9R48ezY5fe+216eCDD06nnnpqvn755ZfnFfq33XZbOvPMM1t9n/Gi7tWrVzXNZBemv2uL/q4t+ru26O/aUNeGw/hVL34aMGBAroTGPNOKqIpGKO3du3ezc2NrqEMOOeT/7qhLl3z9tddea4t2AwDQyVQVTGPLp65du6alS5c2HovFTUOHDm228Cnsvffe6bnnnmt27IUXXkj77rvvjrYZAIBaD6ZRlo/tn2bOnJmWLVuWFi9enDfYnzRpUmP1dP369fn7k046Kd16663pzjvvTC+99FIe2o9q6bhx43bOIwEAYJdW1RzTEAuYIphOnjw5b/80derUdMw
xx+SfxSdBXXXVVWn8+PF5Vf57772XV+L//Oc/z9XW2JS/moVPAADUjqqDaVRNZ82alS8trVy5stn1CRMm5AsAALT5R5ICAMDOIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgDArhlMN2zYkKZPn55GjBiRRo4cmebOnbvNc1euXJkmTpyYDj/88HTCCSekRx99dEfbCwBAJ1V1MJ09e3Zavnx5mjdvXpoxY0aaM2dOWrRo0Rbnvfvuu+m0005LBx54YPrRj36URo0alc4+++z01ltvtVXbAQCo1WC6du3atGDBgnTRRRelIUOG5LA5ZcqUNH/+/C3OveOOO1KvXr3SzJkz08CBA9M555yTv0aoBQCAlrqmKqxYsSJt2rQpDRs2rPHY8OHD04033pg2b96cunT5v5z7+OOPp6OPPjrttttujcduu+22au4OAIAaUlUwXbVqVerTp0/q1q1b47H+/fvneadr1qxJffv2bTz+yiuv5LmlF198cXrggQfSPvvsky688MIcZKuxbt26qs5n11TpZ/1dG/R3bdHftUV/15aGhoZUV1fXMcE0XmRNQ2moXN+4ceMWw/4333xzmjRpUrrlllvSj3/843T66aene+65J33sYx9r9X2++OKL1TSRXZz+ri36u7bo79qiv2tHtxbZsN2Caffu3bcIoJXrPXr0aHY8hvAHDx6c55aGQw89ND300ENp4cKF6atf/Wqr73PQoEGpZ8+e1TSTXVC86Yl/xPR3bdDftUV/1xb9XVvq6+vb9PdVFUwHDBiQVq9eneeZdu3atXF4P0Jp7969m5271157pf3337/ZsXiRvv7661U1MF7UsYiK2qC/a4v+ri36u7bo79pQ14bD+FWvyo8KaATSpUuXNh5bsmRJGjp0aLOFT+HII4/M+5g29fzzz+e5pgAAsEPBNN79jB07Nm8BtWzZsrR48eK8wX7MI61UT9evX5+/P/nkk3MwveGGG9JLL72UrrvuurwgasyYMdXcJQAANaLqDfanTZuW9zCdPHlyuvTSS9PUqVPTMccck38WnwR199135++jMvqd73wn/fSnP03HH398/hqLoWI6AAAA7NAc00rVdNasWfnSUsuh+9ga6vbbb6/2LgAAqEFVV0wBAGBnEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABR
BMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAAC7ZjDdsGFDmj59ehoxYkQaOXJkmjt37gfe5tVXX03Dhg1Ljz322IdtJwAAnVzXam8we/bstHz58jRv3rz02muvpQsvvDB9/OMfT6NHj97mbWbOnJnWrl27o20FAKATqyqYRrhcsGBBuuWWW9KQIUPypb6+Ps2fP3+bwfSHP/xheu+999qqvQAAdFJVDeWvWLEibdq0KQ/LVwwfPjw99dRTafPmzVucv3r16nTNNdekyy67rG1aCwBAp1VVxXTVqlWpT58+qVu3bo3H+vfvn+edrlmzJvXt27fZ+VdffXUaN25cOuiggz50A9etW/ehb8uuo9LP+rs26O/aor9ri/6uLQ0NDamurq5jgmm8yJqG0lC5vnHjxmbHH3744bRkyZJ011137VADX3zxxR26PbsW/V1b9Hdt0d+1RX/Xjm4tsmG7BdPu3btvEUAr13v06NF4bP369emSSy5JM2bMaHb8wxg0aFDq2bPnDv0OyhdveuIfMf1dG/R3bdHftUV/15b6+vo2/X1VBdMBAwbkeaMxz7Rr166Nw/sRPnv37t143rJly9Irr7ySzjnnnGa3P+OMM9LYsWOrmnMaL+pevXpV00x2Yfq7tujv2qK/a4v+rg11bTiMX3UwHTx4cA6kS5cuzfuYhhiuHzp0aOrS5f/WUR1++OHp3nvvbXbbY445Jl1xxRXp05/+dFu1HQCATqRrte9+ouIZ+5J+85vfTL/4xS/yBvtXXXVVY/X0ox/9aK6gDhw4cKsV1379+rVd6wEAqN1Pfpo2bVrev3Ty5Mnp0ksvTVOnTs3V0BCfBHX33XfvjHYCANDJVf3JT1E1nTVrVr60tHLlym3ebns/AwCAqiumAACwMwimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAXTOYbtiwIU2fPj2NGDEijRw5Ms2dO3eb5z744INpzJgxadiwYemEE05I999//462FwCATqrqYDp79uy0fPnyNG/evDRjxow0Z86ctGjRoi3OW7FiRTr77LPTiSeemO6888508sknp3PPPTcfBwCAlrqmKqxduzYtWLAg3XLLLWnIkCH5Ul9fn+bPn59Gjx7d7Ny77rorfepTn0qTJk3K1wcOHJgeeOCBdM8996RDDjmkmrsFAKAGVBVMo9q5adOmPDRfMXz48HTjjTemzZs3py5d/q8AO27cuPT+++9v8TvefffdHW0zAAC1HkxXrVqV+vTpk7p169Z4rH///nne6Zo1a1Lfvn0bjx9wwAHNbhuV1UceeSQP6Vdj3bp1VZ3PrqnSz/q7Nujv2qK/a4v+ri0NDQ2prq6uY4JpvMiahtJQub5x48Zt3u7tt99OU6dOTUcddVQ6+uijq2rgiy++WNX57Nr0d23R37VFf9cW/V07urX
Ihu0WTLt3775FAK1c79Gjx1Zv8+abb6avfOUrOVFff/31zYb7W2PQoEGpZ8+eVd2GXU+86Yl/xPR3bdDftUV/1xb9XVvq6+vb9PdVFUwHDBiQVq9eneeZdu3atXF4P0Jp7969tzj/jTfeaFz89L3vfa/ZUH9rxYu6V69eVd+OXZP+ri36u7bo79qiv2tDXRsO44eqypeDBw/OgXTp0qWNx5YsWZKGDh26RSU0VvBPmTIlH//+97+fQy0AALRJMI13P2PHjk0zZ85My5YtS4sXL84b7FeqolE9Xb9+ff7+pptuSi+//HKaNWtW48/iYlU+AAA7PJQfpk2bloPp5MmT0+67754XNR1zzDH5Z/FJUFdddVUaP358+slPfpJD6oQJE5rdPraRuvrqq6u9WwAAOrmqg2lUTaMKWqmENrVy5crG77f2aVAAANBmH0kKAAA7g2AKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFAKAIgikAAEUQTAEAKIJgCgBAEQRTAACKIJgCAFAEwRQAgCIIpgAAFEEwBQCgCIIpAABFEEwBACiCYAoAQBEEUwAAiiCYAgBQBMEUAIAiCKYAABRBMAUAoAiCKQAAu2Yw3bBhQ5o+fXoaMWJEGjlyZJo7d+42z33mmWfShAkT0hFHHJFOPPHEtHz58h1tLwAAnVTVwXT27Nk5YM6bNy/NmDEjzZkzJy1atGiL89auXZvOPPPMHGBvv/32NGzYsHTWWWfl4wAAsEPBNELlggUL0kUXXZSGDBmSRo0alaZMmZLmz5+/xbl333136t69e7rgggvSAQcckG/zkY98ZKshFgAAqgqmK1asSJs2bcrVz4rhw4enp556Km3evLnZuXEsflZXV5evx9ejjjoqLV26tK3aDgBAJ9K1mpNXrVqV+vTpk7p169Z4rH///nne6Zo1a1Lfvn2bnXvggQc2u32/fv1SfX19q+7r/fffz1/j/Eq4pfNqaGjIX/V3bdDftUV/1xb9XVvef//9Nu3nqoLpunXrmoXSULm+cePGVp3b8rxtqTzILl1sHFALor9bvl7ovPR3bdHftUV/115/13VUMI05oy2DZeV6jx49WnVuy/O2pel0AQAAOr+qypEDBgxIq1evzvNMmw7ZR9js3bv3Fue++eabzY7F9b333ntH2wwAQK0H08GDB6euXbs2W8C0ZMmSNHTo0C2G3GPv0ieffLJxrkl8feKJJ/JxAADYoWDas2fPNHbs2DRz5sy0bNmytHjx4rzB/qRJkxqrp+vXr8/fjx49Or3zzjvpyiuvTM8++2z+GvNOjz322GruEgCAGlHXUClptlKEywim9957b9p9993T6aefnr785S/nn33iE59IV111VRo/fny+HuE1NuF/7rnn8s8uvfTSdOihh+6cRwIAQG0FUwAA2BnsxQQAQBEEUwAAiiCYAgBQBMEUAIAidGgw3bBhQ5o+fXoaMWJEGjlyZN56alueeeaZNGHChLwP6oknnpiWL1/erm2lffv7wQcfTGPGjMmfAHbCCSek+++/v13bSvv2d8Wrr76a+/yxxx5rlzbSMf29cuXKNHHixHT44Yfnv+9HH320XdtK+/b3fffdl7eKjL/t6Penn366XdtK24lP8Dz++OO3+2/0jua1Dg2ms2f
Pzg2eN29e3lZqzpw5adGiRVuct3bt2nTmmWfmP4Dbb789v7jPOuusfJxdR2v7e8WKFenss8/OL+g777wznXzyyencc8/Nx+l8/d1UbEXn77pz9/e7776bTjvttHTggQemH/3oR2nUqFH57/2tt97qkHazc/u7vr4+nX/++fn/2QsXLswf1BPfx9aT7HpvRs4777zcp9vSJnmtoYO89957DUOHDm149NFHG499+9vfbvjSl760xbkLFixo+OxnP9uwefPmfD2+jho1quG2225r1zbTPv19zTXXNJx++unNjp122mkN3/rWt9qlrbRvf1csXLiw4eSTT244+OCDm92OztXf8+bNa/jc5z7XsGnTpsZj48ePb3jwwQfbrb20X39/97vfbRg3blzj9XfffTf/jS9btqzd2suOq6+vb/jCF77QcMIJJ2z33+i2yGsdVjGN6temTZtymq4YPnx4euqpp9LmzZubnRvH4md1dXX5enw96qijmn00KmWrpr/HjRuXvv71r2+10kLn6++wevXqdM0116TLLrusnVtKe/f3448/no4++ui02267NR677bbb0mc+85l2bTPt09977rln/vTH+Pjy+FlU0eLDefbbb78OaDkfVvzdfvKTn0w/+MEPtnteW+S1rqmDxMeX9unTJ3Xr1q3xWP/+/XOpeM2aNalv377Nzo1hn6b69eu33XIyZammvw844IBmt41+fuSRR/KQPp2vv8PVV1+d35AcdNBBHdBa2rO/X3nllTy39OKLL04PPPBA2meffdKFF16Y/2dG5+vv4447LvfzKaeckt+MdOnSJd10001pjz326KDW82FE/7VGW+S1DquYxvySpi/qULkek2tbc27L8yhXNf3d1Ntvv52mTp2a33FFlYXO198PP/xwrqZ87Wtfa9c20jH9HXPNbr755rTXXnulW265Jf3ar/1a/mjr119/vV3bTPv0d4yGRFi55JJL0q233poXtU6bNs2c4k5qXRvktQ4Lpt27d9+ioZXrPXr0aNW5Lc+jXNX0d8Wbb76ZJk+eHPOg0/XXX5/fadO5+nv9+vX5f1ixeMLfc238fUfVLBbAnHPOOenQQw9N3/jGN9KgQYPywhg6X39fe+216eCDD06nnnpqOuyww9Lll1+eevbsmadv0Pm0RV7rsP/TDxgwIL+TinkqFfGuKhrfu3fvLc6NkNJUXN97773brb20X3+HN954I/9DFi/o733ve1sM/dI5+nvZsmV5aDdCSsxXq8xZO+OMM3JgpfP9fUeldP/99292LIKpimnn7O/YGuqQQw5pvB4Fhrj+2muvtWubaR9tkdc6LJjGO+auXbs2mxAbw3lDhw7dojIWe2E9+eSTuXIW4usTTzyRj7NrqKa/Y6hvypQp+fj3v//9/EKnc/Z3zDW8995787ZglUu44oor8hZhdL6/7yOPPDLvY9rU888/n+ea0vn6OwLJc8891+zYCy+8kPbdd992ay/tpy3yWocF0yjljx07Nu9bGFWTxYsX5w16J02a1PjuK4b5wujRo9M777yTrrzyyry6L77GPIbYsJddQzX9HRPjX3755TRr1qzGn8XFqvzO199RYRk4cGCzS4g3IzFhns739x2LGCOY3nDDDemll15K1113Xa6ax9xDOl9/n3TSSXluabzpjP6Oof2olsZiRzqHVW2d1xo60Nq1axsuuOCChiOPPLJh5MiReb+zitgnq+m+V0899VTD2LFj895pX/ziFxuefvrpDmo1O7u/P//5z+frLS8XXnhhB7aenfn33ZR9TDt/f//sZz/Le1sedthhDWPGjGl4/PHHO6jVtEd/33rrrQ2jR4/O506cOLFh+fLlHdRq2kLLf6PbOq/VxX92Xo4GAIDWscwZAIAiCKYAABRBMAUAoAiCKQAARRBMAQAogmAKAEARBFMAAIogmAIAUATBFACAIgimAAAUQTAFACCV4P8Bd9lYu1PqWOYAAAAASUVORK5CYII=", + 
"text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.countplot(data=df, x='Hour', palette='coolwarm')\n", + "plt.title(\"Crime Frequency by Hour\")\n", + "plt.show()\n", + "\n", + "sns.countplot(data=df, x='DayOfWeek', order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], palette='pastel')\n", + "plt.title(\"Crime Frequency by Day of Week\")\n", + "plt.xticks(rotation=45)\n", + "plt.show()\n", + "\n", + "sns.countplot(data=df, x='Season', palette='Set2')\n", + "plt.title(\"Crime Frequency by Season\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cad0760d-91df-4dc0-8356-3f41f3ad4722", + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mValueError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 18\u001b[39m\n\u001b[32m 15\u001b[39m X = df[features]\n\u001b[32m 16\u001b[39m y = df[\u001b[33m'\u001b[39m\u001b[33mCrime_Code\u001b[39m\u001b[33m'\u001b[39m]\n\u001b[32m---> \u001b[39m\u001b[32m18\u001b[39m X_train, X_test, y_train, y_test = \u001b[43mtrain_test_split\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m42\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.2\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify\u001b[49m\u001b[43m=\u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 
20\u001b[39m scaler = StandardScaler()\n\u001b[32m 21\u001b[39m X_train = scaler.fit_transform(X_train)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\sklearn\\utils\\_param_validation.py:218\u001b[39m, in \u001b[36mvalidate_params..decorator..wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 212\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 213\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[32m 214\u001b[39m skip_parameter_validation=(\n\u001b[32m 215\u001b[39m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[32m 216\u001b[39m )\n\u001b[32m 217\u001b[39m ):\n\u001b[32m--> \u001b[39m\u001b[32m218\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 219\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 220\u001b[39m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[32m 221\u001b[39m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[32m 222\u001b[39m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[32m 223\u001b[39m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[32m 224\u001b[39m msg = re.sub(\n\u001b[32m 225\u001b[39m \u001b[33mr\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mparameter of \u001b[39m\u001b[33m\\\u001b[39m\u001b[33mw+ must be\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 226\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mparameter of 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc.\u001b[34m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m must be\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 227\u001b[39m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[32m 228\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\sklearn\\model_selection\\_split.py:2919\u001b[39m, in \u001b[36mtrain_test_split\u001b[39m\u001b[34m(test_size, train_size, random_state, shuffle, stratify, *arrays)\u001b[39m\n\u001b[32m 2916\u001b[39m arrays = indexable(*arrays)\n\u001b[32m 2918\u001b[39m n_samples = _num_samples(arrays[\u001b[32m0\u001b[39m])\n\u001b[32m-> \u001b[39m\u001b[32m2919\u001b[39m n_train, n_test = \u001b[43m_validate_shuffle_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2920\u001b[39m \u001b[43m \u001b[49m\u001b[43mn_samples\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdefault_test_size\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.25\u001b[39;49m\n\u001b[32m 2921\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2923\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m shuffle \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m:\n\u001b[32m 2924\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m stratify \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\sklearn\\model_selection\\_split.py:2499\u001b[39m, in \u001b[36m_validate_shuffle_split\u001b[39m\u001b[34m(n_samples, test_size, train_size, default_test_size)\u001b[39m\n\u001b[32m 2496\u001b[39m n_train, n_test = \u001b[38;5;28mint\u001b[39m(n_train), \u001b[38;5;28mint\u001b[39m(n_test)\n\u001b[32m 2498\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m n_train == 
\u001b[32m0\u001b[39m:\n\u001b[32m-> \u001b[39m\u001b[32m2499\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 2500\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mWith n_samples=\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[33m, test_size=\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[33m and train_size=\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[33m, the \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2501\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mresulting train set will be empty. Adjust any of the \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2502\u001b[39m \u001b[33m\"\u001b[39m\u001b[33maforementioned parameters.\u001b[39m\u001b[33m\"\u001b[39m.format(n_samples, test_size, train_size)\n\u001b[32m 2503\u001b[39m )\n\u001b[32m 2505\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m n_train, n_test\n", + "\u001b[31mValueError\u001b[39m: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters." 
+ ] + } + ], + "source": [ + "df['Crime_Code'] = df['Primary Type'].replace({\n", + " 'Theft': 0,\n", + " 'Battery': 1,\n", + " 'Criminal Damage': 2,\n", + " 'Narcotics': 3,\n", + " 'Assault': 4\n", + "})\n", + "\n", + "# Encode input features \n", + "df['DayOfWeek_Code'] = df['DayOfWeek'].astype('category').cat.codes\n", + "df['Season_Code'] = df['Season'].astype('category').cat.codes\n", + "\n", + "# Define X and y \n", + "features = ['Hour', 'DayOfWeek_Code', 'Season_Code', 'Community Area', 'Latitude', 'Longitude']\n", + "X = df[features]\n", + "y = df['Crime_Code']\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)\n", + "\n", + "scaler = StandardScaler()\n", + "X_train = scaler.fit_transform(X_train)\n", + "X_test = scaler.transform(X_test)\n", + "\n", + "# Train Decision Tree model\n", + "clf = DecisionTreeClassifier(max_depth=None, random_state=42)\n", + "clf.fit(X_train, y_train)\n", + "\n", + "# Predict and evaluate\n", + "y_pred = clf.predict(X_test)\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "print(\"Accuracy:\", accuracy)\n", + "\n", + "conf_matrix = confusion_matrix(y_test, y_pred)\n", + "print(\"Confusion Matrix:\\n\", conf_matrix)\n", + "\n", + "print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc67ca2b-2d92-470a-8fa7-805c4048321b", + "metadata": {}, + "outputs": [], + "source": [ + "print(X_train.shape)\n", + "print(X_test.shape)\n", + "\n", + "# Step 3: Grid search like in demo\n", + "start = time.time()\n", + "\n", + "svc = SVC(random_state=2021)\n", + "\n", + "param_grid = [\n", + " {'kernel': ['linear'], 'C': [0.1, 1, 10, 100, 1000]},\n", + " {'kernel': ['poly'], 'degree': [2, 3, 5], 'C': [0.1, 1, 10, 100, 1000]},\n", + " {'kernel': ['rbf'], 'gamma': ['auto', 'scale'], 'C': [0.1, 1, 10, 100, 1000]},\n", + " {'kernel': ['sigmoid'], 'gamma': ['auto', 'scale'], 'C': 
[0.1, 1, 10, 100, 1000]},\n", + "]\n", + "\n", + "scores = ['precision', 'recall']\n", + "\n", + "for score in scores:\n", + " print(f'# Tuning hyper-parameters for {score}\\n')\n", + "\n", + " CV_svc = GridSearchCV(\n", + " estimator=svc,\n", + " param_grid=param_grid,\n", + " scoring=f'{score}_macro',\n", + " cv=10,\n", + " verbose=1,\n", + " n_jobs=-1\n", + " )\n", + " CV_svc.fit(X_train, y_train.values.ravel())\n", + "\n", + " print(\"Best parameters set found on development set:\\n\")\n", + " print(CV_svc.best_params_)\n", + " print(\"\\nGrid scores on development set:\\n\")\n", + " means = CV_svc.cv_results_['mean_test_score']\n", + " stds = CV_svc.cv_results_['std_test_score']\n", + "\n", + " for mean, std, params in zip(means, stds, CV_svc.cv_results_['params']):\n", + " print(f\"{mean:.3f} (+/-{std * 2:.03f}) for {params}\")\n", + " print()\n", + "\n", + " print(\"Detailed classification report:\\n\")\n", + " y_true, y_pred = y_test, CV_svc.predict(X_test)\n", + " print(classification_report(y_true, y_pred))\n", + " print()\n", + "\n", + "print(\"Total time:\", round(time.time() - start, 2), \"seconds\")\n", + "\n", + "# Fitting the best model and evaluate on test set like in demo\n", + "best_svc = CV_svc.best_estimator_\n", + "best_svc.fit(X_train, y_train.values.ravel())\n", + "\n", + "y_test_pred = best_svc.predict(X_test)\n", + "\n", + "# Step 5: Printing \n", + "print(confusion_matrix(y_test, y_test_pred))\n", + "print(classification_report(y_test, y_test_pred))\n", + "\n", + "# Plotting\n", + "ConfusionMatrixDisplay.from_estimator(best_svc, X_test, y_test)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9b48024-a5cf-40bd-a4bb-4d48a7505965", + "metadata": {}, + "outputs": [], + "source": [ + "scaler = StandardScaler()\n", + "X_scaled = scaler.fit_transform(X)\n", + "X_scaled_df = pd.DataFrame(X_scaled, columns=features)\n", + "\n", + "pca = PCA(random_state=2021, n_components=None)\n", + 
"pca.fit(X_scaled_df)\n", + "X_pca = pca.transform(X_scaled_df)\n", + "\n", + "exp_var_ratio = pca.explained_variance_ratio_\n", + "\n", + "def explained_variance_ratio_plot(exp_var_ratio):\n", + "    \"\"\"Plot each component's explained-variance ratio as bars, with the cumulative ratio as a step line.\"\"\"\n", + "    x_axis = range(1, len(exp_var_ratio)+1)\n", + "    plt.bar(x_axis, exp_var_ratio, align='center', label='Individual EVR')\n", + "    plt.step(x_axis, np.cumsum(exp_var_ratio), where='mid', color='red', label='Cumulative EVR')\n", + "    plt.ylim(0, 1.1)\n", + "    plt.xticks(x_axis)\n", + "    plt.xlabel('Principal Components')\n", + "    plt.ylabel('Explained Variance Ratio')\n", + "    plt.grid()\n", + "    plt.legend()\n", + "    plt.title(\"Explained Variance by Principal Component\")\n", + "    plt.show()\n", + "\n", + "explained_variance_ratio_plot(exp_var_ratio)\n", + "\n", + "# pca.components_ has one row per fitted component, so take the count from the fitted model;\n", + "# a hard-coded count that differs makes the DataFrame index length mismatch the loadings matrix.\n", + "num_pc = pca.n_components_\n", + "loadings = pca.components_\n", + "pc_list = [\"PC\"+str(i) for i in list(range(1, num_pc+1))]\n", + "\n", + "loadings_df = pd.DataFrame(loadings, columns=features, index=pc_list)\n", + "sns.heatmap(loadings_df.T, annot=True, cmap='Spectral')\n", + "plt.title(\"PCA Component Loadings (Crime Features)\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99d7770e-168a-46e9-a63f-d1705300fc97", + "metadata": {}, + "outputs": [], + "source": [ + "# PCA scores, one column per component: the frame the clustering cells read PC1/PC2 from\n", + "scores_df = pd.DataFrame(X_pca, columns=pc_list)\n", + "df_pc12 = scores_df[['PC1', 'PC2']]\n", + "state_st = StandardScaler().fit_transform(df_pc12)\n", + "\n", + "scores = []\n", + "cluster_results = {}\n", + "\n", + "# Fixed seed so the silhouette curve is reproducible between runs\n", + "kmeans = KMeans(random_state=42)\n", + "\n", + "for n in range(2, 14):\n", + "    kmeans.set_params(n_clusters=n)\n", + "    labels = kmeans.fit_predict(state_st)\n", + "    cluster_results[n] = labels\n", + "    # Silhouette needs pairwise distances; score a fixed random sample so it stays tractable on large data\n", + "    score = silhouette_score(state_st, labels, sample_size=10000, random_state=42)\n", + "    scores.append(score)\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "plt.title(\"Silhouette Score for Crime Clusters (PC1 & PC2)\")\n", + "plt.xlabel(\"Number of Clusters\")\n", + "plt.ylabel(\"Silhouette Score\")\n", + "plt.plot(range(2, 14), scores, marker='s')\n", + "plt.grid()\n", + "plt.show()" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "e28dc33d-2b85-408a-b3bc-7869a150e870", + "metadata": {}, + "outputs": [], + "source": [ + "# Elbow sweep over candidate cluster counts; seeded like the final model so the curve is reproducible\n", + "kmeans = KMeans(random_state=42)\n", + "elbow = KElbowVisualizer(kmeans, k=(2, 20))\n", + "elbow.fit(state_st)\n", + "elbow.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4bbc76f-65f8-4937-9524-08da082113d2", + "metadata": {}, + "outputs": [], + "source": [ + "# Final model: 5 clusters on the standardized PC1/PC2 scores\n", + "kmeans = KMeans(n_clusters=5, random_state=42)\n", + "labels = kmeans.fit_predict(state_st)\n", + "\n", + "# Store the cluster ids next to the PC scores, shifted by one (labels + 1)\n", + "scores_df['cluster'] = labels + 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "762ce5e6-7d03-40e4-94fd-d016b04ae441", + "metadata": {}, + "outputs": [], + "source": [ + "# Scatter of the standardized PC1/PC2 points coloured by cluster, with the fitted centroids on top\n", + "plt.figure(figsize=(8, 6))\n", + "plt.scatter(state_st[:, 0], state_st[:, 1], c=labels, cmap=\"viridis\", s=50)\n", + "plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],\n", + " c=\"red\", s=200, alpha=0.8, label=\"Centroids\")\n", + "plt.title(\"K-Means Clusters (Crime PCA)\")\n", + "plt.xlabel(\"PC1\")\n", + "plt.ylabel(\"PC2\")\n", + "plt.legend()\n", + "plt.grid()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17b2e9a1-d5f7-49f5-b274-fc949e3a8de3", + "metadata": {}, + "outputs": [], + "source": [ + "# Mean PC1/PC2 score of each cluster\n", + "cluster_summary = scores_df.groupby('cluster')[['PC1', 'PC2']].mean()\n", + "print(cluster_summary)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5113bea6-4e93-4f54-9cae-d0a1f4f05dce", + "metadata": {}, + "outputs": [], + "source": [ + "# Academic Integrity Statement: I did not use artificial intelligence to help me with this assignment, and I am aware that if I did, \n", + "# I would have to provide the link to the transcript of the session. \n", + "# Also I submitted a downloaded version of the code. 
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}