embeddings script

luyang-ai4med · luyang-ai4med · commit 2035055b9b7d · 2023-01-10T23:14:24.000-08:00
diff --git a/Demo.ipynb b/Demo.ipynb
@@ -15,7 +15,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 2,
+   "id": "ebef5221",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'/oak/stanford/groups/rbaltman/luyang/software/anaconda3/envs/popdx/bin/python'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "sys.executable"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
    "id": "da1ee854",
    "metadata": {},
    "outputs": [
@@ -39,7 +61,7 @@
       "  -lr LEARNING_RATE, --learning_rate LEARNING_RATE\r\n",
       "                        Default learning rate is 0.0001\r\n",
       "  -wd WEIGHT_DECAY, --weight_decay WEIGHT_DECAY\r\n",
-      "                        Default learning rate is 0\r\n"
+      "                        Default weight decay is 0\r\n"
      ]
     }
    ],
@@ -49,7 +71,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "id": "cc5504ac",
    "metadata": {
     "scrolled": true
@@ -60,68 +82,7 @@
      "output_type": "stream",
      "text": [
       "Namespace(hidden_size=150, learning_rate=0.0001, save_dir='./save/POPDx_train', use_gpu=True, weight_decay=0.0)\n",
-      "starting epoch 0\n",
-      "1702it [00:34, 48.71it/s]\n",
-      "[1] Training Loss: 0.017\n",
-      "674it [00:09, 71.74it/s]\n",
-      "[1] Validation Loss: 0.053\n",
-      "0.046625131951197316 saved\n",
-      "starting epoch 1\n",
-      "1702it [00:31, 54.65it/s]\n",
-      "[2] Training Loss: 0.005\n",
-      "674it [00:08, 76.77it/s]\n",
-      "[2] Validation Loss: 0.045\n",
-      "0.045230048241574616 saved\n",
-      "starting epoch 2\n",
-      "1702it [00:31, 54.25it/s]\n",
-      "[3] Training Loss: 0.007\n",
-      "674it [00:08, 76.03it/s]\n",
-      "[3] Validation Loss: 0.040\n",
-      "0.043666935014574394 saved\n",
-      "starting epoch 3\n",
-      "1702it [00:31, 53.67it/s]\n",
-      "[4] Training Loss: 0.023\n",
-      "674it [00:08, 75.01it/s]\n",
-      "[4] Validation Loss: 0.040\n",
-      "0.042381427325418865 saved\n",
-      "starting epoch 4\n",
-      "1702it [00:32, 52.17it/s]\n",
-      "[5] Training Loss: 0.010\n",
-      "674it [00:09, 74.67it/s]\n",
-      "[5] Validation Loss: 0.037\n",
-      "0.04178215405221155 saved\n",
-      "starting epoch 5\n",
-      "1702it [00:31, 54.41it/s]\n",
-      "[6] Training Loss: 0.018\n",
-      "674it [00:09, 74.64it/s]\n",
-      "[6] Validation Loss: 0.043\n",
-      "Validation loss has increased: 1 / 5.\n",
-      "starting epoch 6\n",
-      "1702it [00:30, 55.90it/s]\n",
-      "[7] Training Loss: 0.019\n",
-      "674it [00:09, 74.79it/s]\n",
-      "[7] Validation Loss: 0.043\n",
-      "Validation loss has increased: 2 / 5.\n",
-      "starting epoch 7\n",
-      "1702it [00:31, 53.83it/s]\n",
-      "[8] Training Loss: 0.013\n",
-      "674it [00:09, 74.71it/s]\n",
-      "[8] Validation Loss: 0.043\n",
-      "Validation loss has increased: 3 / 5.\n",
-      "starting epoch 8\n",
-      "1702it [00:32, 51.60it/s]\n",
-      "[9] Training Loss: 0.014\n",
-      "674it [00:12, 52.28it/s]\n",
-      "[9] Validation Loss: 0.047\n",
-      "Validation loss has increased: 4 / 5.\n",
-      "starting epoch 9\n",
-      "1702it [00:54, 31.09it/s]\n",
-      "[10] Training Loss: 0.035\n",
-      "674it [00:11, 58.34it/s]\n",
-      "[10] Validation Loss: 0.044\n",
-      "Validation loss has increased: 5 / 5.\n",
-      "Maximum waiting reached. Break the training.\n",
-      "Time used 639.6560032367706\n"
+      "^C\n"
      ]
     }
    ],
@@ -204,9 +165,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "POPDx",
+   "display_name": "popdx_env",
    "language": "python",
-   "name": "popdx"
+   "name": "popdx_env"
   },
   "language_info": {
    "codemirror_mode": {
@@ -218,7 +179,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.8.13"
   }
  },
  "nbformat": 4,
diff --git a/code/create_label_embeddings.ipynb b/code/create_label_embeddings.ipynb
@@ -0,0 +1,258 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d58208cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.append('.')  # embeddings.py sits next to this notebook in code/; avoids a machine-specific absolute path\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e59669e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from embeddings import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "79da5160",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "3.2.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "import transformers\n",
+    "print(transformers.__version__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "c3c88a94",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "l2i, i2l, onto_embeddings = ontology_emb(dim=500, ICD_network_file='../data/19.csv', save_dir='./embeddings/', use_pretrain=True)  # l2i is used below to map a label to its row in onto_embeddings; i2l is presumably the inverse map (confirm in embeddings.py)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "704fd1db",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "cuda:0\n",
+      "Batch # 1\n",
+      "Batch # 2\n",
+      "Batch # 3\n",
+      "Batch # 4\n",
+      "Batch # 5\n",
+      "Batch # 6\n",
+      "Batch # 7\n",
+      "Batch # 8\n",
+      "Batch # 9\n",
+      "Batch # 10\n",
+      "Batch # 11\n",
+      "Batch # 12\n",
+      "Batch # 13\n",
+      "Batch # 14\n",
+      "Batch # 15\n",
+      "Batch # 16\n",
+      "Batch # 17\n",
+      "Batch # 18\n",
+      "Batch # 19\n",
+      "Batch # 20\n",
+      "Batch # 21\n",
+      "Batch # 22\n",
+      "Batch # 23\n",
+      "Batch # 24\n",
+      "Batch # 25\n",
+      "Batch # 26\n",
+      "Batch # 27\n",
+      "Batch # 28\n",
+      "Batch # 29\n",
+      "Batch # 30\n",
+      "Batch # 31\n",
+      "Batch # 32\n",
+      "Batch # 33\n",
+      "Batch # 34\n",
+      "Batch # 35\n",
+      "Batch # 36\n",
+      "Batch # 37\n",
+      "Batch # 38\n",
+      "Batch # 39\n",
+      "Batch # 40\n",
+      "Batch # 41\n",
+      "Batch # 42\n",
+      "Batch # 43\n",
+      "Batch # 44\n",
+      "Batch # 45\n",
+      "Batch # 46\n",
+      "Batch # 47\n",
+      "Batch # 48\n",
+      "Batch # 49\n",
+      "Batch # 50\n",
+      "Batch # 51\n",
+      "Batch # 52\n",
+      "Batch # 53\n",
+      "Batch # 54\n",
+      "Batch # 55\n",
+      "Batch # 56\n",
+      "Batch # 57\n",
+      "Batch # 58\n",
+      "Batch # 59\n",
+      "Batch # 60\n",
+      "Batch # 61\n",
+      "Batch # 62\n",
+      "Batch # 63\n",
+      "Batch # 64\n",
+      "Batch # 65\n",
+      "Batch # 66\n",
+      "Batch # 67\n",
+      "Batch # 68\n",
+      "Batch # 69\n",
+      "Batch # 70\n",
+      "Batch # 71\n",
+      "Batch # 72\n",
+      "Batch # 73\n",
+      "Batch # 74\n",
+      "Batch # 75\n",
+      "Batch # 76\n",
+      "Batch # 77\n",
+      "Batch # 78\n",
+      "Batch # 79\n",
+      "Batch # 80\n",
+      "Batch # 81\n",
+      "Batch # 82\n",
+      "Batch # 83\n",
+      "Batch # 84\n",
+      "Batch # 85\n",
+      "Batch # 86\n",
+      "Batch # 87\n",
+      "Batch # 88\n",
+      "Batch # 89\n",
+      "Batch # 90\n",
+      "Batch # 91\n",
+      "Batch # 92\n",
+      "Batch # 93\n",
+      "Batch # 94\n",
+      "Batch # 95\n",
+      "Batch # 96\n",
+      "Batch # 97\n",
+      "Batch # 98\n",
+      "Batch # 99\n",
+      "Batch # 100\n",
+      "Batch # 101\n"
+     ]
+    }
+   ],
+   "source": [
+    "biboert_embeddings, biboert_embeddings_dict = run_bert(use_pretrain = False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "86a1736d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(12803, 500) (12803, 768)\n"
+     ]
+    }
+   ],
+   "source": [
+    "with open('../data/mc_icd10_labels.txt','r') as f:\n",
+    "    labels = f.readlines()\n",
+    "labels = [x.strip() for x in labels] \n",
+    "labels_idx = [l2i[l] for l in labels]\n",
+    "onto_embeddings = onto_embeddings[labels_idx, :]\n",
+    "print(onto_embeddings.shape,  biboert_embeddings.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "80037ccc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(12803, 1268)\n"
+     ]
+    }
+   ],
+   "source": [
+    "Y_emb_concat = np.concatenate((onto_embeddings, biboert_embeddings), axis=1)\n",
+    "print(Y_emb_concat.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "7c88d9fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np.save('../data/icd10_label_embed.npy', Y_emb_concat)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3af6ac59",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "popdx_env",
+   "language": "python",
+   "name": "popdx_env"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/code/embeddings.py b/code/embeddings.py