diff --git a/TabDDPMAnalysis.ipynb b/TabDDPMAnalysis.ipynb new file mode 100644 index 00000000..870993c5 --- /dev/null +++ b/TabDDPMAnalysis.ipynb @@ -0,0 +1,213 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 771 + }, + "id": "_8vQ99lZBSgv", + "outputId": "b1932d63-768a-40e0-bdaa-6b74b088e2fb" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Training shape: (4435, 37)\n", + "Testing shape: (2000, 37)\n", + "\n", + "Training TabDDPM...\n", + "Epoch 1, Loss: 0.7926\n", + "Epoch 2, Loss: 0.3590\n", + "Epoch 3, Loss: 0.2392\n", + "Epoch 4, Loss: 0.2303\n", + "Epoch 5, Loss: 0.2164\n", + "Epoch 6, Loss: 0.2065\n", + "Epoch 7, Loss: 0.2035\n", + "Epoch 8, Loss: 0.2030\n", + "Epoch 9, Loss: 0.2012\n", + "Epoch 10, Loss: 0.2020\n", + "\n", + "Evaluating model on test set...\n", + "\n", + "Denoising Evaluation Metrics:\n", + "- MSE: 0.1998\n", + "- R² Score: 0.7980\n", + "\n", + "Generating synthetic samples...\n", + "Synthetic data saved to synthetic_data.csv\n", + "\n", + "Plotting heatmaps...\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics import mean_squared_error, r2_score\n", + "from torch.utils.data import DataLoader, TensorDataset\n", + "\n", + "# MLP for denoising network\n", + "def get_mlp(in_dim, out_dim):\n", + " return nn.Sequential(\n", + " nn.Linear(in_dim + 1, 128),\n", + " nn.ReLU(),\n", + " nn.Linear(128, 128),\n", + " nn.ReLU(),\n", + " nn.Linear(128, out_dim),\n", + " )\n", + "\n", + "class TabDDPM(nn.Module):\n", + " def __init__(self, input_dim, timesteps=1000):\n", + " super(TabDDPM, self).__init__()\n", + " self.input_dim = input_dim\n", + " self.timesteps = timesteps\n", + " self.model = get_mlp(input_dim, input_dim)\n", + "\n", + " betas = torch.linspace(1e-4, 0.02, timesteps)\n", + " self.register_buffer('betas', betas)\n", + " self.register_buffer('alphas', 1. - betas)\n", + " self.register_buffer('alphas_cumprod', torch.cumprod(1. 
- betas, dim=0))\n", + "\n", + " def forward(self, x, t):\n", + " t_embed = t.float().unsqueeze(1) / self.timesteps\n", + " x_in = torch.cat([x, t_embed], dim=1)\n", + " return self.model(x_in)\n", + "\n", + " def q_sample(self, x0, t):\n", + " noise = torch.randn_like(x0)\n", + " sqrt_alpha_cumprod = self.alphas_cumprod[t].sqrt().unsqueeze(1)\n", + " sqrt_one_minus_alpha_cumprod = (1 - self.alphas_cumprod[t]).sqrt().unsqueeze(1)\n", + " return sqrt_alpha_cumprod * x0 + sqrt_one_minus_alpha_cumprod * noise, noise\n", + "\n", + " def sample(self, num_samples):\n", + " x = torch.randn(num_samples, self.input_dim, device=self.betas.device) # allocate on the same device as the registered buffers\n", + " for t in reversed(range(self.timesteps)):\n", + " t_tensor = torch.full((num_samples,), t, dtype=torch.long, device=x.device)\n", + " pred_noise = self.forward(x, t_tensor)\n", + " beta = self.betas[t]\n", + " alpha = self.alphas[t]\n", + " alpha_hat = self.alphas_cumprod[t]\n", + " coef = 1 / alpha.sqrt()\n", + " noise = torch.randn_like(x) if t > 0 else 0\n", + " x = coef * (x - beta / (1 - alpha_hat).sqrt() * pred_noise) + beta.sqrt() * noise\n", + " return x\n", + "\n", + "# Load and preprocess your data\n", + "train_path = \"/content/sat_train.csv\"\n", + "test_path = \"/content/sat_test.csv\"\n", + "\n", + "# Load CSVs\n", + "train_df = pd.read_csv(train_path)\n", + "test_df = pd.read_csv(test_path)\n", + "\n", + "# Select numeric columns only\n", + "train_df = train_df.select_dtypes(include=[np.number]).dropna()\n", + "test_df = test_df.select_dtypes(include=[np.number]).dropna()\n", + "\n", + "print(f\"Training shape: {train_df.shape}\")\n", + "print(f\"Testing shape: {test_df.shape}\")\n", + "\n", + "# Normalize data\n", + "scaler = StandardScaler()\n", + "X_train = torch.tensor(scaler.fit_transform(train_df.values), dtype=torch.float32)\n", + "X_test = torch.tensor(scaler.transform(test_df.values), dtype=torch.float32)\n", + "\n", + "# Dataloader\n", + "train_loader = DataLoader(TensorDataset(X_train), batch_size=64, shuffle=True)\n", + "\n", + "# 
Initialize model\n", + "input_dim = X_train.shape[1]\n", + "model = TabDDPM(input_dim=input_dim)\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)\n", + "loss_fn = nn.MSELoss()\n", + "\n", + "# === TRAINING LOOP ===\n", + "print(\"\\nTraining TabDDPM...\")\n", + "model.train()\n", + "for epoch in range(10):\n", + " total_loss = 0\n", + " for batch in train_loader:\n", + " x0 = batch[0]\n", + " t = torch.randint(0, model.timesteps, (x0.size(0),), dtype=torch.long)\n", + " xt, noise = model.q_sample(x0, t)\n", + " pred_noise = model(xt, t)\n", + " loss = loss_fn(pred_noise, noise)\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " total_loss += loss.item()\n", + " print(f\"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}\")\n", + "\n", + "# === EVALUATION ===\n", + "print(\"\\nEvaluating model on test set...\")\n", + "model.eval()\n", + "with torch.no_grad():\n", + " t_test = torch.randint(0, model.timesteps, (X_test.size(0),), dtype=torch.long)\n", + " xt, true_noise = model.q_sample(X_test, t_test)\n", + " pred_noise = model(xt, t_test)\n", + "\n", + " mse = mean_squared_error(true_noise.numpy(), pred_noise.numpy())\n", + " r2 = r2_score(true_noise.numpy(), pred_noise.numpy())\n", + " print(f\"\\nDenoising Evaluation Metrics:\\n- MSE: {mse:.4f}\\n- R² Score: {r2:.4f}\")\n", + "\n", + "# === GENERATE SYNTHETIC DATA ===\n", + "print(\"\\nGenerating synthetic samples...\")\n", + "with torch.no_grad():\n", + " samples = model.sample(100).numpy()\n", + " samples_rescaled = scaler.inverse_transform(samples)\n", + " df_synth = pd.DataFrame(samples_rescaled, columns=train_df.columns)\n", + " df_synth.to_csv(\"synthetic_data.csv\", index=False)\n", + " print(\"Synthetic data saved to synthetic_data.csv\")\n", + "\n", + "# === HEATMAPS ===\n", + "print(\"\\nPlotting heatmaps...\")\n", + "fig, axes = plt.subplots(1, 3, figsize=(18, 5))\n", + "\n", + "sns.heatmap(train_df.corr(), ax=axes[0], cmap=\"coolwarm\", 
annot=False)\n", + "axes[0].set_title(\"Train Data Correlation\")\n", + "\n", + "sns.heatmap(test_df.corr(), ax=axes[1], cmap=\"coolwarm\", annot=False)\n", + "axes[1].set_title(\"Test Data Correlation\")\n", + "\n", + "sns.heatmap(df_synth.corr(), ax=axes[2], cmap=\"coolwarm\", annot=False)\n", + "axes[2].set_title(\"Synthetic Data Correlation\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + } + ] +} \ No newline at end of file diff --git a/TabDDPM_Guide.pdf b/TabDDPM_Guide.pdf new file mode 100644 index 00000000..036481fa Binary files /dev/null and b/TabDDPM_Guide.pdf differ diff --git a/TabPFNAnalysis.ipynb b/TabPFNAnalysis.ipynb new file mode 100644 index 00000000..3465f48a --- /dev/null +++ b/TabPFNAnalysis.ipynb @@ -0,0 +1,433 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "54da7a65-9787-4e26-b6fe-d798ec27e0e1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: tabpfn in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (2.0.8)\n", + "Requirement already satisfied: torch<3,>=2.1 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from tabpfn) (2.6.0)\n", + "Requirement already satisfied: scikit-learn<1.7,>=1.2.0 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from tabpfn) (1.6.1)\n", + "Requirement already satisfied: typing_extensions>=4.4.0 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from tabpfn) (4.13.1)\n", + "Requirement already satisfied: scipy<2,>=1.11.1 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from tabpfn) (1.13.1)\n", + "Requirement already satisfied: pandas<3,>=1.4.0 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from tabpfn) (2.2.3)\n", + "Requirement already satisfied: einops<0.9,>=0.2.0 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from tabpfn) (0.8.1)\n", + "Requirement already 
satisfied: huggingface-hub<1,>=0.0.1 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from tabpfn) (0.30.1)\n", + "Requirement already satisfied: filelock in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from huggingface-hub<1,>=0.0.1->tabpfn) (3.18.0)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from huggingface-hub<1,>=0.0.1->tabpfn) (2025.3.2)\n", + "Requirement already satisfied: packaging>=20.9 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from huggingface-hub<1,>=0.0.1->tabpfn) (24.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from huggingface-hub<1,>=0.0.1->tabpfn) (5.4.1)\n", + "Requirement already satisfied: requests in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from huggingface-hub<1,>=0.0.1->tabpfn) (2.32.3)\n", + "Requirement already satisfied: tqdm>=4.42.1 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from huggingface-hub<1,>=0.0.1->tabpfn) (4.62.3)\n", + "Requirement already satisfied: numpy>=1.22.4 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from pandas<3,>=1.4.0->tabpfn) (2.0.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from pandas<3,>=1.4.0->tabpfn) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from pandas<3,>=1.4.0->tabpfn) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from pandas<3,>=1.4.0->tabpfn) (2025.2)\n", + "Requirement already satisfied: joblib>=1.2.0 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from scikit-learn<1.7,>=1.2.0->tabpfn) (1.4.2)\n", + "Requirement already satisfied: 
threadpoolctl>=3.1.0 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from scikit-learn<1.7,>=1.2.0->tabpfn) (3.6.0)\n", + "Requirement already satisfied: networkx in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from torch<3,>=2.1->tabpfn) (3.2.1)\n", + "Requirement already satisfied: jinja2 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from torch<3,>=2.1->tabpfn) (3.1.6)\n", + "Requirement already satisfied: sympy==1.13.1 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from torch<3,>=2.1->tabpfn) (1.13.1)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from sympy==1.13.1->torch<3,>=2.1->tabpfn) (1.3.0)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from python-dateutil>=2.8.2->pandas<3,>=1.4.0->tabpfn) (1.17.0)\n", + "Requirement already satisfied: colorama in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from tqdm>=4.42.1->huggingface-hub<1,>=0.0.1->tabpfn) (0.4.6)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from jinja2->torch<3,>=2.1->tabpfn) (3.0.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from requests->huggingface-hub<1,>=0.0.1->tabpfn) (3.4.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from requests->huggingface-hub<1,>=0.0.1->tabpfn) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from requests->huggingface-hub<1,>=0.0.1->tabpfn) (2.3.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\r.parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages (from 
requests->huggingface-hub<1,>=0.0.1->tabpfn) (2025.1.31)\n" + ] + } + ], + "source": [ + "!pip3 install tabpfn" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "78678943-2bec-4967-9308-79ec88e50267", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.model_selection import train_test_split\n", + "import torch\n", + "\n", + "# Load your dataset\n", + "df = pd.read_csv(\"C:/Users/R.Parsad/Downloads/TabPFN/Car/car.csv\")\n", + "\n", + "# Separate features and target\n", + "X = df.drop(\"Class\", axis=1)\n", + "y = df[\"Class\"]\n", + "\n", + "# Encode categorical features (simple label encoding)\n", + "for col in X.select_dtypes(include=['object', 'category']).columns:\n", + " X[col] = LabelEncoder().fit_transform(X[col])\n", + "\n", + "# Encode target if it's categorical\n", + "if y.dtype == \"object\" or str(y.dtype).startswith(\"category\"):\n", + " y = LabelEncoder().fit_transform(y)\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.2, random_state=42)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "71e4ce51-efb2-4ad2-a187-6442736308e4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9855491329479769\n" + ] + } + ], + "source": [ + "from tabpfn import TabPFNClassifier\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "import os\n", + "os.environ[\"TABPFN_ALLOW_CPU_LARGE_DATASET\"] = \"1\"\n", + "\n", + "# Initialize model\n", + "clf = TabPFNClassifier(device='cuda' if torch.cuda.is_available() else 'cpu')\n", + "\n", + "# Fit on training data\n", + "clf.fit(X_train, y_train)\n", + "\n", + "# Predict\n", + "y_pred = clf.predict(X_test)\n", + "\n", + "# Evaluate\n", + "print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": 
"c82731dd-527f-4b66-a1ac-8d74c61345e3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9855491329479769\n", + "Precision (macro): 0.9426482571516221\n", + "Recall (macro): 0.9732459248759744\n", + "F1 Score (macro): 0.9562612870946512\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.99 0.95 0.97 83\n", + " 1 0.85 1.00 0.92 11\n", + " 2 1.00 1.00 1.00 235\n", + " 3 0.94 0.94 0.94 17\n", + "\n", + " accuracy 0.99 346\n", + " macro avg 0.94 0.97 0.96 346\n", + "weighted avg 0.99 0.99 0.99 346\n", + "\n", + "Confusion Matrix:\n", + " [[ 79 2 1 1]\n", + " [ 0 11 0 0]\n", + " [ 0 0 235 0]\n", + " [ 1 0 0 16]]\n" + ] + } + ], + "source": [ + "from sklearn.metrics import (\n", + " accuracy_score, precision_score, recall_score,\n", + " f1_score, classification_report, confusion_matrix\n", + ")\n", + "\n", + "# Predictions\n", + "y_pred = clf.predict(X_test)\n", + "\n", + "# Metrics\n", + "print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n", + "print(\"Precision (macro):\", precision_score(y_test, y_pred, average='macro'))\n", + "print(\"Recall (macro):\", recall_score(y_test, y_pred, average='macro'))\n", + "print(\"F1 Score (macro):\", f1_score(y_test, y_pred, average='macro'))\n", + "print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred))\n", + "print(\"Confusion Matrix:\\n\", confusion_matrix(y_test, y_pred))\n" + ] + }, + { + "cell_type": "markdown", + "id": "d9efaa0f-d474-4a43-93a3-ee9cf5649690", + "metadata": {}, + "source": [ + "## Testing a custom neural network\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d05b3c1e-8808-4464-bb30-d5006eeba7fd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/20 - Loss: 1.2180\n", + "Epoch 2/20 - Loss: 0.9277\n", + "Epoch 3/20 - Loss: 0.7940\n", + "Epoch 4/20 - Loss: 0.7315\n", + "Epoch 5/20 - Loss: 0.7117\n", 
+ "Epoch 6/20 - Loss: 0.6826\n", + "Epoch 7/20 - Loss: 0.6696\n", + "Epoch 8/20 - Loss: 0.6537\n", + "Epoch 9/20 - Loss: 0.6395\n", + "Epoch 10/20 - Loss: 0.6202\n", + "Epoch 11/20 - Loss: 0.6068\n", + "Epoch 12/20 - Loss: 0.5867\n", + "Epoch 13/20 - Loss: 0.5679\n", + "Epoch 14/20 - Loss: 0.5472\n", + "Epoch 15/20 - Loss: 0.5256\n", + "Epoch 16/20 - Loss: 0.5187\n", + "Epoch 17/20 - Loss: 0.4980\n", + "Epoch 18/20 - Loss: 0.4788\n", + "Epoch 19/20 - Loss: 0.4665\n", + "Epoch 20/20 - Loss: 0.4462\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.62 0.58 0.60 83\n", + " 1 0.00 0.00 0.00 11\n", + " 2 0.87 0.98 0.92 235\n", + " 3 1.00 0.24 0.38 17\n", + "\n", + " accuracy 0.82 346\n", + " macro avg 0.62 0.45 0.48 346\n", + "weighted avg 0.79 0.82 0.79 346\n", + "\n", + "Confusion Matrix:\n", + " [[ 48 0 35 0]\n", + " [ 11 0 0 0]\n", + " [ 5 0 230 0]\n", + " [ 13 0 0 4]]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\R.Parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "c:\\Users\\R.Parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "c:\\Users\\R.Parsad\\anaconda3\\envs\\tddpm\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n" + ] + } + ], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "from torch.utils.data import DataLoader, TensorDataset\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler, LabelEncoder\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Load your dataset\n", + "df = pd.read_csv(\"C:/Users/R.Parsad/Downloads/TabPFN/Car/car.csv\")\n", + "X = df.drop(\"Class\", axis=1)\n", + "y = df[\"Class\"]\n", + "\n", + "# Encode categorical features\n", + "for col in X.select_dtypes(include=[\"object\", \"category\"]).columns:\n", + " X[col] = LabelEncoder().fit_transform(X[col])\n", + "if y.dtype == \"object\" or str(y.dtype).startswith(\"category\"):\n", + " y = LabelEncoder().fit_transform(y)\n", + "\n", + "# Train-test split first, so the scaler below is never fitted on test rows\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "# Normalize features with statistics from the training split only\n", + "scaler = StandardScaler()\n", + "X_train = scaler.fit_transform(X_train)\n", + "X_test = scaler.transform(X_test)\n", + "\n", + "# Convert to PyTorch tensors\n", + "X_train_tensor = torch.tensor(X_train, dtype=torch.float32)\n", + "y_train_tensor = torch.tensor(y_train, dtype=torch.long)\n", + "X_test_tensor = torch.tensor(X_test, dtype=torch.float32)\n", + "y_test_tensor = torch.tensor(y_test, dtype=torch.long)\n", + "\n", + "train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=64, shuffle=True)\n", + "\n", + "# Define the neural network\n", + "class SimpleNN(nn.Module):\n", + " def __init__(self, input_dim, num_classes):\n", + " super(SimpleNN, self).__init__()\n", + " self.model = nn.Sequential(\n", + " nn.Linear(input_dim, 64),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.3),\n", + " nn.Linear(64, 
32),\n", + " nn.ReLU(),\n", + " nn.Linear(32, num_classes)\n", + " )\n", + "\n", + " def forward(self, x):\n", + " return self.model(x)\n", + "\n", + "model = SimpleNN(input_dim=X.shape[1], num_classes=len(np.unique(y)))\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "model.to(device)\n", + "\n", + "# Training\n", + "criterion = nn.CrossEntropyLoss()\n", + "optimizer = optim.Adam(model.parameters(), lr=0.001)\n", + "\n", + "epochs = 20\n", + "model.train()\n", + "for epoch in range(epochs):\n", + " running_loss = 0.0\n", + " for xb, yb in train_loader:\n", + " xb, yb = xb.to(device), yb.to(device)\n", + " optimizer.zero_grad()\n", + " outputs = model(xb)\n", + " loss = criterion(outputs, yb)\n", + " loss.backward()\n", + " optimizer.step()\n", + " running_loss += loss.item()\n", + " print(f\"Epoch {epoch+1}/{epochs} - Loss: {running_loss/len(train_loader):.4f}\")\n", + "\n", + "# Evaluation\n", + "model.eval()\n", + "with torch.no_grad():\n", + " outputs = model(X_test_tensor.to(device))\n", + " _, y_pred = torch.max(outputs, 1)\n", + " y_pred = y_pred.cpu().numpy()\n", + "\n", + "# Metrics\n", + "print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred))\n", + "print(\"Confusion Matrix:\\n\", confusion_matrix(y_test, y_pred))\n" + ] + }, + { + "cell_type": "markdown", + "id": "e120ef1f-a4dc-4af6-a54d-fe12eaf97621", + "metadata": {}, + "source": [ + "### Testing SVM on the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "26186a50-4f20-4d40-815a-ef4e7ec2d306", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.79 0.81 0.80 83\n", + " 1 0.44 0.36 0.40 11\n", + " 2 0.95 0.96 0.96 235\n", + " 3 0.94 0.88 0.91 17\n", + "\n", + " accuracy 0.90 346\n", + " macro avg 0.78 0.75 0.77 346\n", + "weighted avg 0.90 0.90 0.90 346\n", + "\n", + "Confusion 
Matrix:\n", + " [[ 67 5 11 0]\n", + " [ 6 4 0 1]\n", + " [ 10 0 225 0]\n", + " [ 2 0 0 15]]\n" + ] + } + ], + "source": [ + "from sklearn.svm import SVC\n", + "from sklearn.preprocessing import StandardScaler, LabelEncoder\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "import pandas as pd\n", + "\n", + "# Load dataset\n", + "df = pd.read_csv(\"C:/Users/R.Parsad/Downloads/TabPFN/Car/car.csv\")\n", + "X = df.drop(\"Class\", axis=1)\n", + "y = df[\"Class\"]\n", + "\n", + "# Encode categorical features\n", + "for col in X.select_dtypes(include=[\"object\", \"category\"]).columns:\n", + " X[col] = LabelEncoder().fit_transform(X[col])\n", + "\n", + "# Encode target if necessary\n", + "if y.dtype == \"object\" or str(y.dtype).startswith(\"category\"):\n", + " y = LabelEncoder().fit_transform(y)\n", + "\n", + "# Train-test split first, so the scaler below is never fitted on test rows\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "# Scale features with statistics from the training split only\n", + "scaler = StandardScaler()\n", + "X_train = scaler.fit_transform(X_train)\n", + "X_test = scaler.transform(X_test)\n", + "\n", + "# Initialize and train SVM\n", + "svm_clf = SVC(kernel='rbf', C=1.0, gamma='scale') # try kernel='linear' or 'poly' too\n", + "svm_clf.fit(X_train, y_train)\n", + "\n", + "# Predict\n", + "y_pred = svm_clf.predict(X_test)\n", + "\n", + "# Metrics\n", + "print(\"Classification Report:\\n\", classification_report(y_test, y_pred))\n", + "print(\"Confusion Matrix:\\n\", confusion_matrix(y_test, y_pred))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0be30d5a-455a-4495-847d-8386f1f3054f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tddpm", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/TabPFN_Guide.pdf b/TabPFN_Guide.pdf new file mode 100644 index 00000000..cc191cbd Binary files /dev/null and b/TabPFN_Guide.pdf differ