-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmake_mimic_notes.py
159 lines (129 loc) · 5.21 KB
/
make_mimic_notes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import numpy as np
import pandas as pd
import psycopg2
from scipy.stats import ks_2samp
import os
import random
random.seed(22891)
# create a database connection
sqluser = 'iychen'
dbname = 'mimic'
schema_name = 'mimiciii'
# Connect to local postgres version of mimic
con = psycopg2.connect(dbname=dbname, user=sqluser)
cur = con.cursor()
cur.execute('SET search_path to ' + schema_name)
#========helper function for imputing missing values
def replace(group):
"""
takes in a pandas group, and replaces the
null value with the mean of the none null
values of the same group
"""
mask = group.isnull()
group[mask] = group[~mask].mean()
return group
#========get the icu details
# this query extracts the following:
# Unique ids for the admission, patient and icu stay
# Patient gender
# admission & discharge times
# length of stay
# age
# ethnicity
# admission type
# in hospital death?
# in icu death?
# one year from admission death?
# first hospital stay
# icu intime, icu outime
# los in icu
# first icu stay?
denquery = \
"""
-- This query extracts useful demographic/administrative information for patient ICU stays
--DROP MATERIALIZED VIEW IF EXISTS icustay_detail CASCADE;
--CREATE MATERIALIZED VIEW icustay_detail as
--ie is the icustays table
--adm is the admissions table
SELECT ie.subject_id, ie.hadm_id, ie.icustay_id
, pat.gender
, adm.admittime, adm.dischtime, adm.diagnosis
, ROUND( (CAST(adm.dischtime AS DATE) - CAST(adm.admittime AS DATE)) , 4) AS los_hospital
, ROUND( (CAST(adm.admittime AS DATE) - CAST(pat.dob AS DATE)) / 365, 4) AS age
, adm.ethnicity, adm.ADMISSION_TYPE
--, adm.hospital_expire_flag
, adm.insurance
, CASE when adm.deathtime between adm.admittime and adm.dischtime THEN 1 ELSE 0 END AS mort_hosp
, CASE when adm.deathtime between ie.intime and ie.outtime THEN 1 ELSE 0 END AS mort_icu
, CASE when adm.deathtime between adm.admittime and adm.admittime + interval '365' day THEN 1 ELSE 0 END AS mort_oneyr
, DENSE_RANK() OVER (PARTITION BY adm.subject_id ORDER BY adm.admittime) AS hospstay_seq
, CASE
WHEN DENSE_RANK() OVER (PARTITION BY adm.subject_id ORDER BY adm.admittime) = 1 THEN 1
ELSE 0 END AS first_hosp_stay
-- icu level factors
, ie.intime, ie.outtime
, ie.FIRST_CAREUNIT
, ROUND( (CAST(ie.outtime AS DATE) - CAST(ie.intime AS DATE)) , 4) AS los_icu
, DENSE_RANK() OVER (PARTITION BY ie.hadm_id ORDER BY ie.intime) AS icustay_seq
-- first ICU stay *for the current hospitalization*
, CASE
WHEN DENSE_RANK() OVER (PARTITION BY ie.hadm_id ORDER BY ie.intime) = 1 THEN 1
ELSE 0 END AS first_icu_stay
FROM icustays ie
INNER JOIN admissions adm
ON ie.hadm_id = adm.hadm_id
INNER JOIN patients pat
ON ie.subject_id = pat.subject_id
WHERE adm.has_chartevents_data = 1
ORDER BY ie.subject_id, adm.admittime, ie.intime;
"""
den = pd.read_sql_query(denquery,con)
#----drop patients with less than 48 hour
den['los_icu_hr'] = (den.outtime - den.intime).astype('timedelta64[h]')
den = den[(den.los_icu_hr >= 48)]
den = den[(den.age<300)]
den.drop('los_icu_hr', 1, inplace = True)
#----clean up
# micu --> medical
# csru --> cardiac surgery recovery unit
# sicu --> surgical icu
# tsicu --> Trauma Surgical Intensive Care Unit
# NICU --> Neonatal
den['adult_icu'] = np.where(den['first_careunit'].isin(['PICU', 'NICU']), 0, 1)
den['gender'] = np.where(den['gender']=="M", 1, 0)
# no need to yell
den.ethnicity = den.ethnicity.str.lower()
den.ethnicity.loc[(den.ethnicity.str.contains('^white'))] = 'white'
den.ethnicity.loc[(den.ethnicity.str.contains('^black'))] = 'black'
den.ethnicity.loc[(den.ethnicity.str.contains('^hisp')) | (den.ethnicity.str.contains('^latin'))] = 'hispanic'
den.ethnicity.loc[(den.ethnicity.str.contains('^asia'))] = 'asian'
den.ethnicity.loc[~(den.ethnicity.str.contains('|'.join(['white', 'black', 'hispanic', 'asian'])))] = 'other'
den = pd.concat([den, pd.get_dummies(den['ethnicity'], prefix='eth')], 1)
den = pd.concat([den, pd.get_dummies(den['admission_type'], prefix='admType')], 1)
den.drop(['diagnosis', 'hospstay_seq', 'los_icu','icustay_seq', 'admittime', 'dischtime','los_hospital', 'intime', 'outtime', 'ethnicity', 'admission_type', 'first_careunit'], 1, inplace =True)
den = den[(den['adult_icu']==1)].dropna()
notesquery = \
"""
SELECT fin.subject_id, fin.hadm_id, fin.icustay_id, string_agg(fin.text, ' ') as chartext
FROM (
select ie.subject_id, ie.hadm_id, ie.icustay_id, ne.text
from icustays ie
left join noteevents ne
on ie.subject_id = ne.subject_id and ie.hadm_id = ne.hadm_id
and ne.charttime between ie.intime and ie.intime + interval '48' hour
--and ne.iserror != '1'
where ne.category != 'Discharge summary'
) fin
group by fin.subject_id, fin.hadm_id, fin.icustay_id
order by fin.subject_id, fin.hadm_id, fin.icustay_id;
"""
notes48 = pd.read_sql_query(notesquery,con)
output_df = notes48.merge(den,on=['subject_id', 'hadm_id', 'icustay_id'], how='inner')
print('notes48:', len(notes48))
print('demographics:', len(den))
print('merged:', len(output_df))
output_df.to_csv('data/patients_notes.csv', index=False)
cols_except_notes = [i for i in output_df.columns if i != 'chartext']
output_df[cols_except_notes].to_csv('data/patients_info.csv', index=False)
print('saved to data/')