From 585937d88bddde581abb8d596eda7233356c9ecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maha=20Alshehri=C2=A0?= Date: Sun, 21 Sep 2025 20:51:32 +1000 Subject: [PATCH] implement user segmentation clustering model Built K-Means clustering using RFM features to group users for segmentation-based recommendations. --- .gitignore | 3 + .vscode/settings.json | 5 + ...rediction_based_preferences_features.ipynb | 512 ++++++++++++++++++ 3 files changed, 520 insertions(+) create mode 100644 .vscode/settings.json create mode 100644 ML/Smart Cart Maha/smart_cart_prediction_based_preferences_features.ipynb diff --git a/.gitignore b/.gitignore index e823f743..7ab8e872 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ Backend/.env Frontend/node_modules Frontend/.expo Scrapping/Australia_GroceriesScraper/configuration.ini +*.csv +*.png +*.py diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..a8c20032 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "python-envs.defaultEnvManager": "ms-python.python:conda", + "python-envs.defaultPackageManager": "ms-python.python:conda", + "python-envs.pythonProjects": [] +} \ No newline at end of file diff --git a/ML/Smart Cart Maha/smart_cart_prediction_based_preferences_features.ipynb b/ML/Smart Cart Maha/smart_cart_prediction_based_preferences_features.ipynb new file mode 100644 index 00000000..6881a980 --- /dev/null +++ b/ML/Smart Cart Maha/smart_cart_prediction_based_preferences_features.ipynb @@ -0,0 +1,512 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1f5fe132", + "metadata": {}, + "source": [ + "## Add Smart Cart Features \n", + " Objective: \n", + "
\n", + "Compute recency, frequency, and budget-alignment features, behavioral features, and product relationships." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3726c2be", + "metadata": {}, + "outputs": [], + "source": [ + "# Feature Engineering\n", + "import pandas as pd\n", + "import numpy as np\n", + "from datetime import datetime" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "b1b6c357", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('data/all_features_preference_features.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e88e686f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 1338619 entries, 0 to 1338618\n", + "Data columns (total 25 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Unnamed: 0.2 1338619 non-null int64 \n", + " 1 Unnamed: 0.1 1338619 non-null int64 \n", + " 2 Unnamed: 0 1338619 non-null int64 \n", + " 3 transaction_id 1338619 non-null object \n", + " 4 user_id 1338619 non-null object \n", + " 5 product_code 1338619 non-null int64 \n", + " 6 category 1338619 non-null object \n", + " 7 item_name 1338619 non-null object \n", + " 8 discount_percentage 1338619 non-null float64\n", + " 9 transaction_date 1338619 non-null object \n", + " 10 transaction_price 1338619 non-null float64\n", + " 11 age_group 1338619 non-null object \n", + " 12 gender 1338619 non-null object \n", + " 13 income_bracket 1338619 non-null object \n", + " 14 customer_type 1338619 non-null object \n", + " 15 state 1338619 non-null object \n", + " 16 month 1338619 non-null int64 \n", + " 17 seasonal_factor 1338619 non-null float64\n", + " 18 adjusted_spend 1338619 non-null float64\n", + " 19 promotion_applied 1338619 non-null int64 \n", + " 20 discount_amount 1338619 non-null float64\n", + " 21 final_spend 1338619 non-null float64\n", + " 22 recency_days 1338619 non-null int64 
\n", + " 23 freq_30d 1338619 non-null float64\n", + " 24 budget_alignment 1338619 non-null float64\n", + "dtypes: float64(8), int64(7), object(10)\n", + "memory usage: 255.3+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "635b2286", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1338619" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "a9239589", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total transactions: 1338619\n", + "Training transactions: 1070877\n", + "Testing transactions: 267742\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.decomposition import TruncatedSVD\n", + "from scipy.sparse import csr_matrix\n", + "\n", + "# Assuming your DataFrame is named 'df' and is already loaded.\n", + "# It should have columns: 'user_id', 'product_code', and a value column for interactions.\n", + "# If 'transaction_price' exists, use it. 
Otherwise, a simple 'interaction' column of 1s is fine.\n", + "# For example: df['interaction'] = 1.0\n", + "\n", + "# Split data into training and testing for each user\n", + "train_df = pd.DataFrame(columns=df.columns)\n", + "test_df = pd.DataFrame(columns=df.columns)\n", + "\n", + "for user in df['user_id'].unique():\n", + " user_data = df[df['user_id'] == user].copy()\n", + " split_point = int(len(user_data) * 0.8) # 80% for training\n", + " train_df = pd.concat([train_df, user_data.iloc[:split_point]])\n", + " test_df = pd.concat([test_df, user_data.iloc[split_point:]])\n", + "\n", + "# Create the user-item matrix from the training data\n", + "# This matrix will be the input for our recommendation model.\n", + "train_matrix = train_df.pivot_table(\n", + " index='user_id',\n", + " columns='product_code',\n", + " values=df.columns[-1], # Use the last column as the interaction value\n", + " aggfunc='sum'\n", + ").fillna(0)\n", + "\n", + "print(f\"Total transactions: {len(df)}\")\n", + "print(f\"Training transactions: {len(train_df)}\")\n", + "print(f\"Testing transactions: {len(test_df)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "76b7f75f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
TruncatedSVD(n_components=50, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "TruncatedSVD(n_components=50, random_state=42)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Align columns to ensure the model sees all products from the original dataset\n", + "full_product_codes = df['product_code'].unique()\n", + "train_matrix = train_matrix.reindex(columns=full_product_codes, fill_value=0)\n", + "train_matrix_sparse = csr_matrix(train_matrix.values)\n", + "\n", + "# Initialize and train the SVD model\n", + "svd = TruncatedSVD(n_components=50, random_state=42) # n_components is a tunable hyperparameter\n", + "svd.fit(train_matrix_sparse)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "d5867912", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Precision@5 for User 'user_1': 0.00 (0%)\n", + "Precision@5 for User 'user_10': 0.00 (0%)\n", + "Precision@5 for User 'user_11': 0.00 (0%)\n", + "Precision@5 for User 'user_12': 0.00 (0%)\n", + "Precision@5 for User 'user_13': 0.00 (0%)\n", + "Precision@5 for User 'user_14': 0.00 (0%)\n", + "Precision@5 for User 'user_15': 0.00 (0%)\n", + "Precision@5 for User 'user_16': 0.00 (0%)\n", + "Precision@5 for User 'user_17': 0.00 (0%)\n", + "Precision@5 for User 'user_18': 0.00 (0%)\n", + "Precision@5 for User 'user_19': 0.00 (0%)\n", + "Precision@5 for User 'user_2': 0.00 (0%)\n", + "Precision@5 for User 'user_20': 0.00 (0%)\n", + "Precision@5 for User 'user_21': 0.00 (0%)\n", + "Precision@5 for User 'user_22': 0.00 (0%)\n", + "Precision@5 for User 'user_23': 0.00 (0%)\n", + "Precision@5 for User 'user_24': 0.00 (0%)\n", + "Precision@5 for User 'user_25': 0.00 (0%)\n", + "Precision@5 for User 'user_26': 0.40 (40%)\n", + "Precision@5 for User 'user_27': 0.00 (0%)\n", + "Precision@5 for User 'user_28': 0.00 (0%)\n", + "Precision@5 for User 'user_29': 0.00 (0%)\n", + "Precision@5 for User 'user_3': 0.00 (0%)\n", + "Precision@5 for User 'user_30': 
0.00 (0%)\n", + "Precision@5 for User 'user_31': 0.00 (0%)\n", + "Precision@5 for User 'user_32': 0.00 (0%)\n", + "Precision@5 for User 'user_33': 0.00 (0%)\n", + "Precision@5 for User 'user_34': 0.20 (20%)\n", + "Precision@5 for User 'user_35': 0.00 (0%)\n", + "Precision@5 for User 'user_36': 0.00 (0%)\n", + "Precision@5 for User 'user_37': 0.00 (0%)\n", + "Precision@5 for User 'user_38': 0.00 (0%)\n", + "Precision@5 for User 'user_39': 0.00 (0%)\n", + "Precision@5 for User 'user_4': 0.00 (0%)\n", + "Precision@5 for User 'user_40': 0.00 (0%)\n", + "Precision@5 for User 'user_41': 0.00 (0%)\n", + "Precision@5 for User 'user_42': 0.00 (0%)\n", + "Precision@5 for User 'user_43': 0.00 (0%)\n", + "Precision@5 for User 'user_44': 0.00 (0%)\n", + "Precision@5 for User 'user_45': 0.00 (0%)\n", + "Precision@5 for User 'user_46': 0.00 (0%)\n", + "Precision@5 for User 'user_47': 0.00 (0%)\n", + "Precision@5 for User 'user_48': 0.00 (0%)\n", + "Precision@5 for User 'user_49': 0.00 (0%)\n", + "Precision@5 for User 'user_5': 0.20 (20%)\n", + "Precision@5 for User 'user_50': 0.00 (0%)\n", + "Precision@5 for User 'user_6': 0.00 (0%)\n", + "Precision@5 for User 'user_7': 0.20 (20%)\n", + "Precision@5 for User 'user_8': 0.00 (0%)\n", + "Precision@5 for User 'user_9': 0.00 (0%)\n", + "\n", + "Overall Mean Precision@5: 0.02 (2%)\n" + ] + } + ], + "source": [ + "def precision_at_k(recommended_items, relevant_items, k):\n", + " \"\"\"Calculates Precision@K.\"\"\"\n", + " if not relevant_items:\n", + " return 0.0\n", + " \n", + " recommended_set = set(recommended_items[:k])\n", + " relevant_set = set(relevant_items)\n", + " \n", + " hits = len(recommended_set.intersection(relevant_set))\n", + " return hits / k if k > 0 else 0.0\n", + "\n", + "k = 5 # Number of top recommendations to consider\n", + "user_precision_scores = {}\n", + "relevant_items_per_user = test_df.groupby('user_id')['product_code'].apply(list).to_dict()\n", + "\n", + "for user_id in relevant_items_per_user:\n", + " # 
Skip users not present in the training data\n", + " if user_id not in train_matrix.index:\n", + " continue\n", + "\n", + " # Get the user's vector from the training matrix\n", + " user_row = train_matrix.loc[user_id].values.reshape(1, -1)\n", + " \n", + " # Make predictions for this user\n", + " predicted_scores_user = svd.inverse_transform(svd.transform(user_row))\n", + " predicted_scores_series = pd.Series(\n", + " predicted_scores_user.flatten(),\n", + " index=train_matrix.columns\n", + " )\n", + " \n", + " # Exclude items the user has already seen to avoid recommending them again\n", + " seen_items = list(train_df[train_df['user_id'] == user_id]['product_code'].unique())\n", + " predicted_scores_filtered = predicted_scores_series[~predicted_scores_series.index.isin(seen_items)]\n", + " \n", + " # Get the top K recommended items\n", + " recommended_items = predicted_scores_filtered.sort_values(ascending=False).head(k).index.tolist()\n", + " \n", + " # Get the actual items the user interacted with in the test set (the ground truth)\n", + " relevant_items = relevant_items_per_user[user_id]\n", + " \n", + " # Calculate and store the precision score for this user\n", + " score = precision_at_k(recommended_items, relevant_items, k)\n", + " user_precision_scores[user_id] = score\n", + " print(f\"Precision@{k} for User '{user_id}': {score:.2f} ({score*100:.0f}%)\")\n", + "\n", + "# Calculate and print the overall average precision\n", + "if user_precision_scores:\n", + " overall_precision = np.mean(list(user_precision_scores.values()))\n", + " print(f\"\\nOverall Mean Precision@{k}: {overall_precision:.2f} ({overall_precision*100:.0f}%)\")\n", + "else:\n", + " print(\"No users found in the test set to evaluate.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "9e1e7986", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"c:\\Users\\rayed\\anaconda3\\envs\\dolfin\\Lib\\site-packages\\threadpoolctl.py:1010: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using the elbow method plot, choose a K where the decrease in inertia slows down.\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.metrics import silhouette_score\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Assuming your DataFrame is named 'df' and contains the new columns.\n", + "# --- 1. Prepare the Data ---\n", + "# Select the RFM-like features\n", + "features = ['recency_days', 'freq_30d', 'budget_alignment']\n", + "X = df[features]\n", + "\n", + "# Standardize the features\n", + "scaler = StandardScaler()\n", + "X_scaled = scaler.fit_transform(X)\n", + "\n", + "# --- 2. Determine the Optimal Number of Clusters (K) ---\n", + "# Use the Elbow Method to find the best K\n", + "inertia = []\n", + "for k in range(1, 11):\n", + " kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')\n", + " kmeans.fit(X_scaled)\n", + " inertia.append(kmeans.inertia_)\n", + "\n", + "plt.plot(range(1, 11), inertia, marker='o')\n", + "plt.title('Elbow Method')\n", + "plt.xlabel('Number of Clusters (K)')\n", + "plt.ylabel('Inertia')\n", + "plt.show()\n", + "print(\"Using the elbow method plot, choose a K where the decrease in inertia slows down.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "497c2fd7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "K-Means Model Trained. 
Users are now assigned to a cluster.\n", + " Unnamed: 0.2 Unnamed: 0.1 Unnamed: 0 transaction_id user_id \\\n", + "0 0 0 0 452a413df6 user_1 \n", + "1 1 1 1 452a413df6 user_1 \n", + "2 2 2 2 452a413df6 user_1 \n", + "3 3 3 3 dd22f95cce user_1 \n", + "4 4 4 4 dd22f95cce user_1 \n", + "\n", + " product_code category \\\n", + "0 5355182 MENS DEOS & GROOMING \n", + "1 9050664 SNACKS \n", + "2 5055940 INFANT FOOD \n", + "3 3994635 BAKING MIXES \n", + "4 4842440 DENTAL HEALTH \n", + "\n", + " item_name discount_percentage \\\n", + "0 Deo Roll On Men Intense Protection Fresh 0.500000 \n", + "1 Original Multipack Potato Chips 0.416667 \n", + "2 Puffcorn BBQ 0.157895 \n", + "3 Deluxe Chocolate Layer Cake Mix 0.309091 \n", + "4 Advanced Whitening Charcoal Toothpaste 0.500000 \n", + "\n", + " transaction_date ... month seasonal_factor adjusted_spend \\\n", + "0 2023-08-21 ... 8 1.0 5.168301 \n", + "1 2023-08-21 ... 8 1.0 6.082251 \n", + "2 2023-08-21 ... 8 1.0 4.819962 \n", + "3 2023-07-22 ... 7 1.0 5.577268 \n", + "4 2023-07-22 ... 7 1.0 10.193634 \n", + "\n", + " promotion_applied discount_amount final_spend recency_days freq_30d \\\n", + "0 0 0.0 5.168301 91 0.0 \n", + "1 0 0.0 6.082251 102 0.0 \n", + "2 0 0.0 4.819962 84 0.0 \n", + "3 0 0.0 5.577268 64 0.0 \n", + "4 0 0.0 10.193634 471 0.0 \n", + "\n", + " budget_alignment cluster \n", + "0 0.403699 3 \n", + "1 0.476220 1 \n", + "2 0.380002 3 \n", + "3 0.436140 3 \n", + "4 0.787659 2 \n", + "\n", + "[5 rows x 26 columns]\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[23], line 20\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39mhead())\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# --- 4. 
Analyze Cluster Profiles for Recommendations ---\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# Here you would link the clusters to product data\u001b[39;00m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;66;03m# For example: df_with_products.groupby('cluster')['product_code'].value_counts()\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 18\u001b[0m \u001b[38;5;66;03m# For clustering, the primary evaluation is **internal validation**.\u001b[39;00m\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# It measures how well-defined the clusters are, not predictive accuracy.\u001b[39;00m\n\u001b[1;32m---> 20\u001b[0m silhouette_avg \u001b[38;5;241m=\u001b[39m \u001b[43msilhouette_score\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_scaled\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mcluster\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mSilhouette Score: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msilhouette_avg\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m.2f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# A silhouette score close to +1 indicates well-separated clusters.\u001b[39;00m\n\u001b[0;32m 24\u001b[0m \u001b[38;5;66;03m# A score near 0 indicates overlapping clusters.\u001b[39;00m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# A score near -1 indicates bad clustering.\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\rayed\\anaconda3\\envs\\dolfin\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:211\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 206\u001b[0m 
\u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 207\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 208\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 209\u001b[0m )\n\u001b[0;32m 210\u001b[0m ):\n\u001b[1;32m--> 211\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 212\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 213\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 217\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 218\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 219\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 220\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 221\u001b[0m )\n", + "File 
\u001b[1;32mc:\\Users\\rayed\\anaconda3\\envs\\dolfin\\Lib\\site-packages\\sklearn\\metrics\\cluster\\_unsupervised.py:131\u001b[0m, in \u001b[0;36msilhouette_score\u001b[1;34m(X, labels, metric, sample_size, random_state, **kwds)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 130\u001b[0m X, labels \u001b[38;5;241m=\u001b[39m X[indices], labels[indices]\n\u001b[1;32m--> 131\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m np\u001b[38;5;241m.\u001b[39mmean(\u001b[43msilhouette_samples\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetric\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetric\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m)\n", + "File \u001b[1;32mc:\\Users\\rayed\\anaconda3\\envs\\dolfin\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:184\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 182\u001b[0m global_skip_validation \u001b[38;5;241m=\u001b[39m get_config()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mskip_parameter_validation\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m global_skip_validation:\n\u001b[1;32m--> 184\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 186\u001b[0m func_sig \u001b[38;5;241m=\u001b[39m signature(func)\n\u001b[0;32m 188\u001b[0m \u001b[38;5;66;03m# Map *args/**kwargs to the function signature\u001b[39;00m\n", + "File 
\u001b[1;32mc:\\Users\\rayed\\anaconda3\\envs\\dolfin\\Lib\\site-packages\\sklearn\\metrics\\cluster\\_unsupervised.py:283\u001b[0m, in \u001b[0;36msilhouette_samples\u001b[1;34m(X, labels, metric, **kwds)\u001b[0m\n\u001b[0;32m 279\u001b[0m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetric\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m metric\n\u001b[0;32m 280\u001b[0m reduce_func \u001b[38;5;241m=\u001b[39m functools\u001b[38;5;241m.\u001b[39mpartial(\n\u001b[0;32m 281\u001b[0m _silhouette_reduce, labels\u001b[38;5;241m=\u001b[39mlabels, label_freqs\u001b[38;5;241m=\u001b[39mlabel_freqs\n\u001b[0;32m 282\u001b[0m )\n\u001b[1;32m--> 283\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mzip\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mpairwise_distances_chunked\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreduce_func\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreduce_func\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 284\u001b[0m intra_clust_dists, inter_clust_dists \u001b[38;5;241m=\u001b[39m results\n\u001b[0;32m 285\u001b[0m intra_clust_dists \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mconcatenate(intra_clust_dists)\n", + "File \u001b[1;32mc:\\Users\\rayed\\anaconda3\\envs\\dolfin\\Lib\\site-packages\\sklearn\\metrics\\pairwise.py:2017\u001b[0m, in \u001b[0;36mpairwise_distances_chunked\u001b[1;34m(X, Y, reduce_func, metric, n_jobs, working_memory, **kwds)\u001b[0m\n\u001b[0;32m 2015\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 2016\u001b[0m X_chunk \u001b[38;5;241m=\u001b[39m X[sl]\n\u001b[1;32m-> 2017\u001b[0m D_chunk \u001b[38;5;241m=\u001b[39m 
\u001b[43mpairwise_distances\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_chunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetric\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetric\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2018\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (X \u001b[38;5;129;01mis\u001b[39;00m Y \u001b[38;5;129;01mor\u001b[39;00m Y \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;129;01mand\u001b[39;00m PAIRWISE_DISTANCE_FUNCTIONS\u001b[38;5;241m.\u001b[39mget(\n\u001b[0;32m 2019\u001b[0m metric, \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 2020\u001b[0m ) \u001b[38;5;129;01mis\u001b[39;00m euclidean_distances:\n\u001b[0;32m 2021\u001b[0m \u001b[38;5;66;03m# zeroing diagonal, taking care of aliases of \"euclidean\",\u001b[39;00m\n\u001b[0;32m 2022\u001b[0m \u001b[38;5;66;03m# i.e. 
\"l2\"\u001b[39;00m\n\u001b[0;32m 2023\u001b[0m D_chunk\u001b[38;5;241m.\u001b[39mflat[sl\u001b[38;5;241m.\u001b[39mstart :: _num_samples(X) \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n", + "File \u001b[1;32mc:\\Users\\rayed\\anaconda3\\envs\\dolfin\\Lib\\site-packages\\sklearn\\metrics\\pairwise.py:2195\u001b[0m, in \u001b[0;36mpairwise_distances\u001b[1;34m(X, Y, metric, n_jobs, force_all_finite, **kwds)\u001b[0m\n\u001b[0;32m 2192\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m distance\u001b[38;5;241m.\u001b[39msquareform(distance\u001b[38;5;241m.\u001b[39mpdist(X, metric\u001b[38;5;241m=\u001b[39mmetric, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds))\n\u001b[0;32m 2193\u001b[0m func \u001b[38;5;241m=\u001b[39m partial(distance\u001b[38;5;241m.\u001b[39mcdist, metric\u001b[38;5;241m=\u001b[39mmetric, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds)\n\u001b[1;32m-> 2195\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_parallel_pairwise\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\rayed\\anaconda3\\envs\\dolfin\\Lib\\site-packages\\sklearn\\metrics\\pairwise.py:1765\u001b[0m, in \u001b[0;36m_parallel_pairwise\u001b[1;34m(X, Y, func, n_jobs, **kwds)\u001b[0m\n\u001b[0;32m 1762\u001b[0m X, Y, dtype \u001b[38;5;241m=\u001b[39m _return_float_dtype(X, Y)\n\u001b[0;32m 1764\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m effective_n_jobs(n_jobs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m-> 1765\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1767\u001b[0m \u001b[38;5;66;03m# enforce a threading backend to prevent data communication overhead\u001b[39;00m\n\u001b[0;32m 1768\u001b[0m fd \u001b[38;5;241m=\u001b[39m delayed(_dist_wrapper)\n", + "File \u001b[1;32mc:\\Users\\rayed\\anaconda3\\envs\\dolfin\\Lib\\site-packages\\sklearn\\metrics\\pairwise.py:338\u001b[0m, in \u001b[0;36meuclidean_distances\u001b[1;34m(X, Y, Y_norm_squared, squared, X_norm_squared)\u001b[0m\n\u001b[0;32m 332\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m Y_norm_squared\u001b[38;5;241m.\u001b[39mshape \u001b[38;5;241m!=\u001b[39m (\u001b[38;5;241m1\u001b[39m, Y\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m]):\n\u001b[0;32m 333\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 334\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIncompatible dimensions for Y of shape \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mY\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m and \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 335\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mY_norm_squared of shape \u001b[39m\u001b[38;5;132;01m{\u001b[39;00moriginal_shape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 336\u001b[0m )\n\u001b[1;32m--> 338\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_euclidean_distances\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_norm_squared\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mY_norm_squared\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msquared\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\rayed\\anaconda3\\envs\\dolfin\\Lib\\site-packages\\sklearn\\metrics\\pairwise.py:382\u001b[0m, in \u001b[0;36m_euclidean_distances\u001b[1;34m(X, Y, X_norm_squared, Y_norm_squared, squared)\u001b[0m\n\u001b[0;32m 380\u001b[0m distances \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m XX\n\u001b[0;32m 381\u001b[0m distances \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m YY\n\u001b[1;32m--> 382\u001b[0m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmaximum\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdistances\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdistances\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 384\u001b[0m \u001b[38;5;66;03m# Ensure that distances between vectors and themselves are set to 0.0.\u001b[39;00m\n\u001b[0;32m 385\u001b[0m \u001b[38;5;66;03m# This may not be the case due to floating point rounding errors.\u001b[39;00m\n\u001b[0;32m 386\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m X \u001b[38;5;129;01mis\u001b[39;00m Y:\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# In a real scenario, you would inspect the plot and choose an appropriate K.\n", + "# Let's assume we choose K=4 for this example.\n", + "optimal_k = 4\n", + "\n", + "# --- 3. Train the K-Means Model ---\n", + "kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init='auto')\n", + "df['cluster'] = kmeans.fit_predict(X_scaled)\n", + "\n", + "print(\"\\nK-Means Model Trained. Users are now assigned to a cluster.\")\n", + "print(df.head())\n", + "\n", + "# --- 4. 
Analyze Cluster Profiles for Recommendations ---\n", + "# Here you would link the clusters to product data\n", + "# For example: df_with_products.groupby('cluster')['product_code'].value_counts()\n", + "# This allows you to find the most popular items within each cluster.\n", + "\n", + "# --- 5. Evaluation ---\n", + "# For clustering, the primary evaluation is **internal validation**.\n", + "# It measures how well-defined the clusters are, not predictive accuracy. NOTE: silhouette_score works on pairwise distances over every row, and the recorded run on this ~1.3M-row frame was interrupted before finishing (see the cell output); consider its sample_size / random_state parameters to score a subsample.\n", + "silhouette_avg = silhouette_score(X_scaled, df['cluster'])\n", + "print(f\"\\nSilhouette Score: {silhouette_avg:.2f}\")\n", + "\n", + "# A silhouette score close to +1 indicates well-separated clusters.\n", + "# A score near 0 indicates overlapping clusters.\n", + "# A score near -1 indicates samples that are likely assigned to the wrong cluster." + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "dolfin", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}