Skip to content

Commit 2035055

Browse files
committed
embeddings script
1 parent 4331ad6 commit 2035055

File tree

3 files changed

+439
-68
lines changed

3 files changed

+439
-68
lines changed

Demo.ipynb

Lines changed: 29 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,29 @@
1515
},
1616
{
1717
"cell_type": "code",
18-
"execution_count": 6,
18+
"execution_count": 2,
19+
"id": "ebef5221",
20+
"metadata": {},
21+
"outputs": [
22+
{
23+
"data": {
24+
"text/plain": [
25+
"'/oak/stanford/groups/rbaltman/luyang/software/anaconda3/envs/popdx/bin/python'"
26+
]
27+
},
28+
"execution_count": 2,
29+
"metadata": {},
30+
"output_type": "execute_result"
31+
}
32+
],
33+
"source": [
34+
"import sys\n",
35+
"sys.executable"
36+
]
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": 3,
1941
"id": "da1ee854",
2042
"metadata": {},
2143
"outputs": [
@@ -39,7 +61,7 @@
3961
" -lr LEARNING_RATE, --learning_rate LEARNING_RATE\r\n",
4062
" Default learning rate is 0.0001\r\n",
4163
" -wd WEIGHT_DECAY, --weight_decay WEIGHT_DECAY\r\n",
42-
" Default learning rate is 0\r\n"
64+
" Default weight decay is 0\r\n"
4365
]
4466
}
4567
],
@@ -49,7 +71,7 @@
4971
},
5072
{
5173
"cell_type": "code",
52-
"execution_count": 7,
74+
"execution_count": 4,
5375
"id": "cc5504ac",
5476
"metadata": {
5577
"scrolled": true
@@ -60,68 +82,7 @@
6082
"output_type": "stream",
6183
"text": [
6284
"Namespace(hidden_size=150, learning_rate=0.0001, save_dir='./save/POPDx_train', use_gpu=True, weight_decay=0.0)\n",
63-
"starting epoch 0\n",
64-
"1702it [00:34, 48.71it/s]\n",
65-
"[1] Training Loss: 0.017\n",
66-
"674it [00:09, 71.74it/s]\n",
67-
"[1] Validation Loss: 0.053\n",
68-
"0.046625131951197316 saved\n",
69-
"starting epoch 1\n",
70-
"1702it [00:31, 54.65it/s]\n",
71-
"[2] Training Loss: 0.005\n",
72-
"674it [00:08, 76.77it/s]\n",
73-
"[2] Validation Loss: 0.045\n",
74-
"0.045230048241574616 saved\n",
75-
"starting epoch 2\n",
76-
"1702it [00:31, 54.25it/s]\n",
77-
"[3] Training Loss: 0.007\n",
78-
"674it [00:08, 76.03it/s]\n",
79-
"[3] Validation Loss: 0.040\n",
80-
"0.043666935014574394 saved\n",
81-
"starting epoch 3\n",
82-
"1702it [00:31, 53.67it/s]\n",
83-
"[4] Training Loss: 0.023\n",
84-
"674it [00:08, 75.01it/s]\n",
85-
"[4] Validation Loss: 0.040\n",
86-
"0.042381427325418865 saved\n",
87-
"starting epoch 4\n",
88-
"1702it [00:32, 52.17it/s]\n",
89-
"[5] Training Loss: 0.010\n",
90-
"674it [00:09, 74.67it/s]\n",
91-
"[5] Validation Loss: 0.037\n",
92-
"0.04178215405221155 saved\n",
93-
"starting epoch 5\n",
94-
"1702it [00:31, 54.41it/s]\n",
95-
"[6] Training Loss: 0.018\n",
96-
"674it [00:09, 74.64it/s]\n",
97-
"[6] Validation Loss: 0.043\n",
98-
"Validation loss has increased: 1 / 5.\n",
99-
"starting epoch 6\n",
100-
"1702it [00:30, 55.90it/s]\n",
101-
"[7] Training Loss: 0.019\n",
102-
"674it [00:09, 74.79it/s]\n",
103-
"[7] Validation Loss: 0.043\n",
104-
"Validation loss has increased: 2 / 5.\n",
105-
"starting epoch 7\n",
106-
"1702it [00:31, 53.83it/s]\n",
107-
"[8] Training Loss: 0.013\n",
108-
"674it [00:09, 74.71it/s]\n",
109-
"[8] Validation Loss: 0.043\n",
110-
"Validation loss has increased: 3 / 5.\n",
111-
"starting epoch 8\n",
112-
"1702it [00:32, 51.60it/s]\n",
113-
"[9] Training Loss: 0.014\n",
114-
"674it [00:12, 52.28it/s]\n",
115-
"[9] Validation Loss: 0.047\n",
116-
"Validation loss has increased: 4 / 5.\n",
117-
"starting epoch 9\n",
118-
"1702it [00:54, 31.09it/s]\n",
119-
"[10] Training Loss: 0.035\n",
120-
"674it [00:11, 58.34it/s]\n",
121-
"[10] Validation Loss: 0.044\n",
122-
"Validation loss has increased: 5 / 5.\n",
123-
"Maximum waiting reached. Break the training.\n",
124-
"Time used 639.6560032367706\n"
85+
"^C\n"
12586
]
12687
}
12788
],
@@ -204,9 +165,9 @@
204165
],
205166
"metadata": {
206167
"kernelspec": {
207-
"display_name": "POPDx",
168+
"display_name": "popdx_env",
208169
"language": "python",
209-
"name": "popdx"
170+
"name": "popdx_env"
210171
},
211172
"language_info": {
212173
"codemirror_mode": {
@@ -218,7 +179,7 @@
218179
"name": "python",
219180
"nbconvert_exporter": "python",
220181
"pygments_lexer": "ipython3",
221-
"version": "3.8.8"
182+
"version": "3.8.13"
222183
}
223184
},
224185
"nbformat": 4,

code/create_label_embeddings.ipynb

Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"id": "d58208cb",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import sys\n",
11+
"sys.path.append('/oak/stanford/groups/rbaltman/luyang/scripts/POPDx/code/')\n",
12+
"%load_ext autoreload\n",
13+
"%autoreload 2"
14+
]
15+
},
16+
{
17+
"cell_type": "code",
18+
"execution_count": 2,
19+
"id": "e59669e8",
20+
"metadata": {},
21+
"outputs": [],
22+
"source": [
23+
"from embeddings import *"
24+
]
25+
},
26+
{
27+
"cell_type": "code",
28+
"execution_count": 3,
29+
"id": "79da5160",
30+
"metadata": {},
31+
"outputs": [
32+
{
33+
"name": "stdout",
34+
"output_type": "stream",
35+
"text": [
36+
"3.2.0\n"
37+
]
38+
}
39+
],
40+
"source": [
41+
"import transformers\n",
42+
"print(transformers.__version__)"
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": 4,
48+
"id": "c3c88a94",
49+
"metadata": {},
50+
"outputs": [],
51+
"source": [
52+
"l2i, i2l, onto_embeddings = ontology_emb(dim=500, ICD_network_file = '../data/19.csv', save_dir = './embeddings/', use_pretrain = True)"
53+
]
54+
},
55+
{
56+
"cell_type": "code",
57+
"execution_count": 6,
58+
"id": "704fd1db",
59+
"metadata": {
60+
"scrolled": true
61+
},
62+
"outputs": [
63+
{
64+
"name": "stdout",
65+
"output_type": "stream",
66+
"text": [
67+
"cuda:0\n",
68+
"Batch # 1\n",
69+
"Batch # 2\n",
70+
"Batch # 3\n",
71+
"Batch # 4\n",
72+
"Batch # 5\n",
73+
"Batch # 6\n",
74+
"Batch # 7\n",
75+
"Batch # 8\n",
76+
"Batch # 9\n",
77+
"Batch # 10\n",
78+
"Batch # 11\n",
79+
"Batch # 12\n",
80+
"Batch # 13\n",
81+
"Batch # 14\n",
82+
"Batch # 15\n",
83+
"Batch # 16\n",
84+
"Batch # 17\n",
85+
"Batch # 18\n",
86+
"Batch # 19\n",
87+
"Batch # 20\n",
88+
"Batch # 21\n",
89+
"Batch # 22\n",
90+
"Batch # 23\n",
91+
"Batch # 24\n",
92+
"Batch # 25\n",
93+
"Batch # 26\n",
94+
"Batch # 27\n",
95+
"Batch # 28\n",
96+
"Batch # 29\n",
97+
"Batch # 30\n",
98+
"Batch # 31\n",
99+
"Batch # 32\n",
100+
"Batch # 33\n",
101+
"Batch # 34\n",
102+
"Batch # 35\n",
103+
"Batch # 36\n",
104+
"Batch # 37\n",
105+
"Batch # 38\n",
106+
"Batch # 39\n",
107+
"Batch # 40\n",
108+
"Batch # 41\n",
109+
"Batch # 42\n",
110+
"Batch # 43\n",
111+
"Batch # 44\n",
112+
"Batch # 45\n",
113+
"Batch # 46\n",
114+
"Batch # 47\n",
115+
"Batch # 48\n",
116+
"Batch # 49\n",
117+
"Batch # 50\n",
118+
"Batch # 51\n",
119+
"Batch # 52\n",
120+
"Batch # 53\n",
121+
"Batch # 54\n",
122+
"Batch # 55\n",
123+
"Batch # 56\n",
124+
"Batch # 57\n",
125+
"Batch # 58\n",
126+
"Batch # 59\n",
127+
"Batch # 60\n",
128+
"Batch # 61\n",
129+
"Batch # 62\n",
130+
"Batch # 63\n",
131+
"Batch # 64\n",
132+
"Batch # 65\n",
133+
"Batch # 66\n",
134+
"Batch # 67\n",
135+
"Batch # 68\n",
136+
"Batch # 69\n",
137+
"Batch # 70\n",
138+
"Batch # 71\n",
139+
"Batch # 72\n",
140+
"Batch # 73\n",
141+
"Batch # 74\n",
142+
"Batch # 75\n",
143+
"Batch # 76\n",
144+
"Batch # 77\n",
145+
"Batch # 78\n",
146+
"Batch # 79\n",
147+
"Batch # 80\n",
148+
"Batch # 81\n",
149+
"Batch # 82\n",
150+
"Batch # 83\n",
151+
"Batch # 84\n",
152+
"Batch # 85\n",
153+
"Batch # 86\n",
154+
"Batch # 87\n",
155+
"Batch # 88\n",
156+
"Batch # 89\n",
157+
"Batch # 90\n",
158+
"Batch # 91\n",
159+
"Batch # 92\n",
160+
"Batch # 93\n",
161+
"Batch # 94\n",
162+
"Batch # 95\n",
163+
"Batch # 96\n",
164+
"Batch # 97\n",
165+
"Batch # 98\n",
166+
"Batch # 99\n",
167+
"Batch # 100\n",
168+
"Batch # 101\n"
169+
]
170+
}
171+
],
172+
"source": [
173+
"biboert_embeddings, biboert_embeddings_dict = run_bert(use_pretrain = False)"
174+
]
175+
},
176+
{
177+
"cell_type": "code",
178+
"execution_count": 7,
179+
"id": "86a1736d",
180+
"metadata": {},
181+
"outputs": [
182+
{
183+
"name": "stdout",
184+
"output_type": "stream",
185+
"text": [
186+
"(12803, 500) (12803, 768)\n"
187+
]
188+
}
189+
],
190+
"source": [
191+
"with open('../data/mc_icd10_labels.txt','r') as f:\n",
192+
" labels = f.readlines()\n",
193+
"labels = [x.strip() for x in labels] \n",
194+
"labels_idx = [l2i[l] for l in labels]\n",
195+
"onto_embeddings = onto_embeddings[labels_idx, :]\n",
196+
"print(onto_embeddings.shape, biboert_embeddings.shape)"
197+
]
198+
},
199+
{
200+
"cell_type": "code",
201+
"execution_count": 8,
202+
"id": "80037ccc",
203+
"metadata": {},
204+
"outputs": [
205+
{
206+
"name": "stdout",
207+
"output_type": "stream",
208+
"text": [
209+
"(12803, 1268)\n"
210+
]
211+
}
212+
],
213+
"source": [
214+
"Y_emb_concat = np.concatenate((onto_embeddings, biboert_embeddings), axis=1)\n",
215+
"print(Y_emb_concat.shape)"
216+
]
217+
},
218+
{
219+
"cell_type": "code",
220+
"execution_count": 9,
221+
"id": "7c88d9fe",
222+
"metadata": {},
223+
"outputs": [],
224+
"source": [
225+
"np.save('../data/icd10_label_embed.npy', Y_emb_concat)"
226+
]
227+
},
228+
{
229+
"cell_type": "code",
230+
"execution_count": null,
231+
"id": "3af6ac59",
232+
"metadata": {},
233+
"outputs": [],
234+
"source": []
235+
}
236+
],
237+
"metadata": {
238+
"kernelspec": {
239+
"display_name": "popdx_env",
240+
"language": "python",
241+
"name": "popdx_env"
242+
},
243+
"language_info": {
244+
"codemirror_mode": {
245+
"name": "ipython",
246+
"version": 3
247+
},
248+
"file_extension": ".py",
249+
"mimetype": "text/x-python",
250+
"name": "python",
251+
"nbconvert_exporter": "python",
252+
"pygments_lexer": "ipython3",
253+
"version": "3.8.13"
254+
}
255+
},
256+
"nbformat": 4,
257+
"nbformat_minor": 5
258+
}

0 commit comments

Comments
 (0)