diff --git a/Devika/Analysis_Dizziness_AbdPain_Apr2023-Copy2.ipynb b/Devika/Analysis_Dizziness_AbdPain_Apr2023-Copy2.ipynb new file mode 100644 index 0000000..866afcc --- /dev/null +++ b/Devika/Analysis_Dizziness_AbdPain_Apr2023-Copy2.ipynb @@ -0,0 +1,1823 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "brutal-royal", + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "from B00_util import *\n", + "%reload_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "id": "auburn-accident", + "metadata": {}, + "source": [ + "# Get trigger positive data for ML4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "angry-analysis", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = extractDataset(\"B00_ML4TrgPos_Y2016\", { \"AllMed\", \"HF\",\"NonVAMed\", \"DispensedDrug\",\"Only10daysPrior30DaysAfter\"})" + ] + }, + { + "cell_type": "markdown", + "id": "hawaiian-saturn", + "metadata": {}, + "source": [ + "# Cohorts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sustainable-connecticut", + "metadata": {}, + "outputs": [], + "source": [ + "cohorts = dataset['cohort']\n", + "dizzy_cohort_df, abdpain_cohort_df = separate_cohorts(cohorts)" + ] + }, + { + "cell_type": "markdown", + "id": "possible-council", + "metadata": {}, + "source": [ + "# Demographic data\n", + "- for dizzy df, there are four labeled patients for which we do not have demo or cohort records" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "turned-fitting", + "metadata": {}, + "outputs": [], + "source": [ + "demog = dataset['Demorgraphics']\n", + "dizzy_demo_coded, abdpain_demo_coded = separate_demog(demog,dizzy_cohort_df,abdpain_cohort_df)\n", + "print('Dizzy demo df = ', dizzy_demo_coded.shape, ' Abdpain demo df = ', abdpain_demo_coded.shape) " + ] + }, + { + "cell_type": "markdown", + "id": "accessory-myanmar", + "metadata": {}, + "source": [ + 
"# ED vitals\n", + "- vitals during ED visit\n", + " - systolic, diastolic (per visit)\n", + " - pulse, respiration, pulse oximetry, pain, temperature (count, min, max, first for multiple readings)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "institutional-township", + "metadata": {}, + "outputs": [], + "source": [ + "vitals = dataset['Vital']\n", + "\n", + "dizzy_EDvitals_df,abdpain_EDvitals_df = separate_cohorts_EDvitals(vitals,dizzy_cohort_df,abdpain_cohort_df)\n", + "print(dizzy_EDvitals_df.shape,abdpain_EDvitals_df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "radio-venue", + "metadata": {}, + "outputs": [], + "source": [ + "print(dizzy_EDvitals_df.isna().sum())" + ] + }, + { + "cell_type": "markdown", + "id": "living-oasis", + "metadata": {}, + "source": [ + "# Vitals during the hospitalizations subsequent to ED visits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dated-fiction", + "metadata": {}, + "outputs": [], + "source": [ + "# get vitals from hospital visits\n", + "dizzy_hosp_vitals_df,abdpain_hosp_vitals_df = separate_cohorts_hosp_vitals(vitals,dizzy_cohort_df,abdpain_cohort_df)\n", + "print(dizzy_hosp_vitals_df.shape,abdpain_hosp_vitals_df.shape)\n", + "dizzy_hosp_vitals_df.isna().sum()" + ] + }, + { + "cell_type": "markdown", + "id": "confidential-companion", + "metadata": {}, + "source": [ + "# Consults ordered during ED visit\n", + "- count up top three consult depts for dizzy\n", + "- count up top seven consult depts for abdpain\n", + "\n", + "Cardiology consult count useful for dizzy\n", + "EKG consult count useful for abdpain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "unable-technique", + "metadata": {}, + "outputs": [], + "source": [ + "consults = dataset['Consult']\n", + "dizzy_consults, abdpain_consults = separate_cohorts_consults(consults,dizzy_cohort_df,abdpain_cohort_df)\n", + "\n", + "# look at which departments are 
being consulted (pick top N)\n", + "dizzy_topN_consult_counts = topN_consult_counts(dizzy_consults,dizzy_demo_coded,3)\n", + "abdpain_topN_consult_counts = topN_consult_counts(abdpain_consults,abdpain_demo_coded,7)\n", + "print(dizzy_topN_consult_counts.shape, abdpain_topN_consult_counts.shape)\n", + "print(dizzy_topN_consult_counts.isna().sum())" + ] + }, + { + "cell_type": "markdown", + "id": "touched-geography", + "metadata": {}, + "source": [ + "# Imaging features\n", + "- ct images: - how many ordered, how many w/contrast, how many abnormal\n", + "- xr images: how many ordered, how many abnormal\n", + "- us images: how many ordered, how many abnormal\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "focal-albania", + "metadata": {}, + "outputs": [], + "source": [ + "# get imaging records for dizzy and abdpain\n", + "images = dataset['Rad']\n", + "\n", + "dizzy_images, abdpain_images = separate_cohorts_images(images,dizzy_cohort_df,abdpain_cohort_df)\n", + "print(dizzy_images.shape, abdpain_images.shape)\n", + "print(dizzy_images.isna().sum())" + ] + }, + { + "cell_type": "markdown", + "id": "quality-syndrome", + "metadata": {}, + "source": [ + "\n", + "# Labs: ED visit only\n", + "- for select labs, get count, min, max, abnormal_count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sitting-cricket", + "metadata": {}, + "outputs": [], + "source": [ + "labs = dataset['Lab']\n", + "dizzy_labs, abdpain_labs = separate_cohorts_labs(labs,dizzy_cohort_df,abdpain_cohort_df)\n", + "print(dizzy_labs.shape,abdpain_labs.shape)\n", + "\n", + "labs = ['WBC','glucose','albumin','potassium','calcium','lact','chloride','bun','creat','troponin','CO2','ast','alt',\n", + " 'alkphos','lipase','amylase','hgb']\n", + "lab_fns = ['matches_' + lab + '_loinc' for lab in labs]\n", + "\n", + "# collect all labs and then filter later on\n", + "dizzy_lab_dict = {}\n", + "abdpain_lab_dict = {}\n", + "for i in range(len(labs)):\n", + " lab, 
labfn = labs[i], lab_fns[i]\n", + " dizzy_lab_dict[lab] = get_labs_data(dizzy_labs,lab,eval(labfn),dizzy_demo_coded)\n", + " abdpain_lab_dict[lab] = get_labs_data(abdpain_labs,lab,eval(labfn),abdpain_demo_coded)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "painful-fashion", + "metadata": {}, + "outputs": [], + "source": [ + "# merge lab dataframes \n", + "\n", + "dizzy_merged_labs = pd.DataFrame(dizzy_cohort_df.PtSSN.unique(),columns=['PtSSN'])\n", + "abdpain_merged_labs = pd.DataFrame(abdpain_cohort_df.PtSSN.unique(),columns=['PtSSN'])\n", + "for lab in labs:\n", + " dizzy_merged_labs = pd.merge(dizzy_merged_labs,dizzy_lab_dict[lab],on='PtSSN')\n", + " abdpain_merged_labs = pd.merge(abdpain_merged_labs,abdpain_lab_dict[lab],on='PtSSN')\n", + "print(dizzy_merged_labs.shape, abdpain_merged_labs.shape)\n", + "\n", + "# drop cols with more than 10% NA\n", + "def check_nas(df,cols,thresh):\n", + "    \"\"\"Return the columns in cols whose percentage of NA values in df exceeds thresh.\"\"\"\n", + "    drop_cols = []\n", + "    for col in cols:\n", + "        pct_na = df[col].isna().sum()/df.shape[0] * 100\n", + "        if pct_na > thresh:\n", + "            drop_cols.append(col)\n", + "    return drop_cols\n", + "\n", + "def clean_lab_df(merged_df,cols,thresh):\n", + "    \"\"\"Return merged_df without the columns among cols that are more than thresh percent NA; prints the shape before and after dropna.\"\"\"\n", + "    drop_labs = set(check_nas(merged_df,cols,thresh))\n", + "    # keep the original column order: a set difference scrambles it, and later cells\n", + "    # slice these frames positionally (columns[1:-1]), which assumes PtSSN stays first\n", + "    lab_retain = [col for col in merged_df.columns if col not in drop_labs]\n", + "    print(merged_df[lab_retain].shape,merged_df[lab_retain].dropna().shape)\n", + "    return merged_df[lab_retain]\n", + "\n", + "dizzy_clean_labs = clean_lab_df(dizzy_merged_labs,dizzy_merged_labs.columns[1:],10)\n", + "abdpain_clean_labs = clean_lab_df(abdpain_merged_labs,abdpain_merged_labs.columns[1:],10)\n", + "print(dizzy_clean_labs.shape,abdpain_clean_labs.shape)\n", + "print(dizzy_clean_labs.isna().sum())" + ] + }, + { + "cell_type": "markdown", + "id": "heard-marsh", + "metadata": {}, + "source": [ + "# History\n", + "- for dizzy, use Viral's ICD list\n", + "- for abdpain, use 
Adel's ICD list refined by Andy Z" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "confidential-saturday", + "metadata": {}, + "outputs": [], + "source": [ + "icds = dataset['ICD']\n", + "dizzy_icds,abdpain_icds = separate_cohorts_icds(icds,dizzy_cohort_df,abdpain_cohort_df)\n", + "print(dizzy_icds.shape,abdpain_icds.shape)\n", + "\n", + "# get dizzy risk factors\n", + "dizzy_rf_df = get_dizzy_rf(dizzy_icds,dizzy_cohort_df)\n", + "bool_dizzy_rf_df = pd.concat([dizzy_rf_df.PtSSN, dizzy_rf_df.iloc[:,1:].astype(bool).astype(int)],axis=1 )\n", + "print(dizzy_rf_df.shape, bool_dizzy_rf_df.shape)\n", + "\n", + "# get abdpain risk factors\n", + "abdpain_rf_df = get_abdpain_rf(abdpain_icds,abdpain_cohort_df)\n", + "bool_abdpain_rf_df = pd.concat([abdpain_rf_df.PtSSN, abdpain_rf_df.iloc[:,1:].astype(bool).astype(int)],axis=1 )\n", + "print(abdpain_rf_df.shape, bool_abdpain_rf_df.shape)\n", + "\n", + "# add a column which is the number of risk factors\n", + "bool_dizzy_rf_df['total_rf'] = bool_dizzy_rf_df.iloc[:,1:].sum(axis=1)\n", + "bool_abdpain_rf_df['total_rf'] = bool_abdpain_rf_df.iloc[:,1:].sum(axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "frequent-schedule", + "metadata": {}, + "source": [ + "# Get labeled data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "tutorial-enclosure", + "metadata": {}, + "outputs": [], + "source": [ + "dizzy_df, dizzy_labels_df = retrieveLabels_dizzy(dizzy_fname)\n", + "abdpain_df, abdpain_labels_df = retrieveLabels_abdpain(abdpain_fname)\n", + "print(dizzy_labels_df.shape,abdpain_labels_df.shape)\n", + "\n", + "print(dizzy_labels_df.label.value_counts())\n", + "print(abdpain_labels_df.label.value_counts())\n", + "print('\\nPPV for dizzy = ', dizzy_labels_df[dizzy_labels_df.label=='MOD'].shape[0]/dizzy_labels_df.shape[0])\n", + "print('PPV for abdpain = ', np.round(abdpain_labels_df[abdpain_labels_df.label=='MOD'].shape[0]/abdpain_labels_df.shape[0],3))" + ] + }, + { + 
"cell_type": "markdown", + "id": "caring-accuracy", + "metadata": {}, + "source": [ + "# Data for Table 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "stuffed-newman", + "metadata": {}, + "outputs": [], + "source": [ + "dizzy_cohort_df_labeled = pd.merge(dizzy_cohort_df,dizzy_labels_df,on='PtSSN')\n", + "dizzy_demo_coded_labeled = pd.merge(dizzy_demo_coded,dizzy_labels_df,on='PtSSN')\n", + "print(dizzy_demo_coded_labeled.columns)\n", + "dizzy_subset = dizzy_demo_coded_labeled[dizzy_demo_coded_labeled.label.isin(['MOD','NoMOD'])]\n", + "\n", + "# age analysis\n", + "display(dizzy_subset['age_at_index_visit'].describe().T.loc[[\"mean\",\"std\"]])\n", + "display(dizzy_subset.groupby('label')['age_at_index_visit'].describe().T.loc[[\"mean\",\"std\"],:])\n", + "\n", + "# gender analysis\n", + "display(dizzy_subset.groupby('label')['Gender'].value_counts())\n", + "\n", + "from scipy.stats import fisher_exact\n", + "# from scipy.stats.contingency import crosstab \n", + " \n", + "\n", + "ttest_fields(dizzy_subset,\n", + " dizzy_subset.columns[1:-1],\n", + " ['c','d','d','d','d','d','d','d'],show=True)" + ] + }, + { + "cell_type": "markdown", + "id": "touched-morris", + "metadata": {}, + "source": [ + "# Labeled versions of all dataframes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "textile-separation", + "metadata": {}, + "outputs": [], + "source": [ + "# labeled versions of the cohort datasets\n", + "dizzy_cohort_df_labeled = pd.merge(dizzy_cohort_df,dizzy_labels_df,on='PtSSN')\n", + "abdpain_cohort_df_labeled = pd.merge(abdpain_cohort_df,abdpain_labels_df,on='PtSSN')\n", + "print('Cohort:',dizzy_cohort_df_labeled.shape, abdpain_cohort_df_labeled.shape)\n", + "\n", + "# labeled versions of demog datasets\n", + "dizzy_demo_coded_labeled = pd.merge(dizzy_demo_coded,dizzy_labels_df,on='PtSSN')\n", + "abdpain_demo_coded_labeled = pd.merge(abdpain_demo_coded,abdpain_labels_df,on='PtSSN')\n", + "print('Demo:', 
dizzy_demo_coded_labeled.shape,abdpain_demo_coded_labeled.shape)\n", + "\n", + "# get labeled version of ED vitals\n", + "dizzy_EDvitals_labeled = pd.merge(dizzy_EDvitals_df,dizzy_labels_df,on='PtSSN')\n", + "abdpain_EDvitals_labeled = pd.merge(abdpain_EDvitals_df,abdpain_labels_df,on='PtSSN')\n", + "print('EDVitals:', dizzy_EDvitals_labeled.shape,abdpain_EDvitals_labeled.shape)\n", + "\n", + "# get labeled versions of hosp vitals\n", + "dizzy_hosp_vitals_labeled = pd.merge(dizzy_hosp_vitals_df,dizzy_labels_df,on='PtSSN')\n", + "abdpain_hosp_vitals_labeled = pd.merge(abdpain_hosp_vitals_df,abdpain_labels_df,on='PtSSN')\n", + "print('Hosp vitals:', dizzy_hosp_vitals_labeled.shape,abdpain_hosp_vitals_labeled.shape)\n", + "\n", + "# get labeled versions of consults\n", + "dizzy_topN_consult_counts_labeled = pd.merge(dizzy_topN_consult_counts,dizzy_labels_df,on='PtSSN').fillna(0)\n", + "abdpain_topN_consult_counts_labeled = pd.merge(abdpain_topN_consult_counts,abdpain_labels_df,on='PtSSN').fillna(0)\n", + "print('Consults: ', dizzy_topN_consult_counts_labeled.shape,abdpain_topN_consult_counts_labeled.shape)\n", + "\n", + "# get labeled version of images\n", + "dizzy_images_labeled = pd.merge(dizzy_images,dizzy_labels_df,on='PtSSN')\n", + "abdpain_images_labeled = pd.merge(abdpain_images,abdpain_labels_df,on='PtSSN')\n", + "print('Imaging:', dizzy_images_labeled.shape,abdpain_images_labeled.shape)\n", + "\n", + "# get labeled versions of risk factors\n", + "dizzy_rf_df_labeled = pd.merge(dizzy_rf_df,dizzy_labels_df,on='PtSSN')\n", + "abdpain_rf_df_labeled = pd.merge(abdpain_rf_df,abdpain_labels_df,on='PtSSN')\n", + "print('Risk Factors:', dizzy_rf_df_labeled.shape,abdpain_rf_df_labeled.shape)\n", + "bool_dizzy_rf_df_labeled = pd.merge(bool_dizzy_rf_df,dizzy_labels_df,on='PtSSN')\n", + "bool_abdpain_rf_df_labeled = pd.merge(bool_abdpain_rf_df,abdpain_labels_df,on='PtSSN')\n", + "print('Boolean Risk Factors:', bool_dizzy_rf_df_labeled.shape,bool_abdpain_rf_df_labeled.shape)\n", + 
"\n", + "# get labeled versions of clean labs\n", + "dizzy_clean_labs_labeled = pd.merge(dizzy_clean_labs,dizzy_labels_df,on='PtSSN')\n", + "abdpain_clean_labs_labeled = pd.merge(abdpain_clean_labs,abdpain_labels_df,on='PtSSN')\n", + "print('Labs: ', dizzy_clean_labs_labeled.shape, abdpain_clean_labs_labeled.shape)\n" + ] + }, + { + "cell_type": "markdown", + "id": "informal-cleaning", + "metadata": {}, + "source": [ + "# Check if any of the features are useful for MOD prediction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "talented-technology", + "metadata": {}, + "outputs": [], + "source": [ + "# cohort fields ttest\n", + "cohort_fields = ['ed_duration', 'ed_first_inp_delta','sum_hosp_stay', 'num_hosp','num_ED_visits']\n", + "cohort_ftypes = ['c','c','c','c','c']\n", + "dizzy_cohort_sig = ttest_fields(dizzy_cohort_df_labeled[dizzy_cohort_df_labeled.label.isin(['MOD','NoMOD'])],cohort_fields,cohort_ftypes)\n", + "print('Dizziness cohort:', dizzy_cohort_sig)\n", + "abdpain_cohort_sig = ttest_fields(abdpain_cohort_df_labeled,cohort_fields,cohort_ftypes)\n", + "print('Abdpain cohort:',abdpain_cohort_sig)\n", + "\n", + "# demo field ttest\n", + "demo_fields = dizzy_demo_coded.columns[1:]\n", + "demo_ftypes = ['c','d','d','d','d','d','d','d']\n", + "dizzy_demo_sig = ttest_fields(dizzy_demo_coded_labeled[dizzy_demo_coded_labeled.label.isin(['MOD','NoMOD'])],demo_fields,demo_ftypes,show=False)\n", + "print('Dizziness demo:',dizzy_demo_sig)\n", + "abdpain_demo_sig = ttest_fields(abdpain_demo_coded_labeled,demo_fields,demo_ftypes,show=False)\n", + "print('Abdpain demo:',abdpain_demo_sig)\n", + "\n", + "# ED Vitals ttest\n", + "# do a ttest with all fields with respect to MOD\n", + "ED_vitals_fields= dizzy_EDvitals_labeled.columns[3:-1]\n", + "ED_vitals_ftypes = len(ED_vitals_fields)*['c']\n", + "dizzy_vitals_sig = ttest_fields(dizzy_EDvitals_labeled[dizzy_EDvitals_labeled.label.isin(['MOD','NoMOD'])],\n", + " 
ED_vitals_fields,ED_vitals_ftypes)\n", + "print('Dizziness ED vitals:',dizzy_vitals_sig)\n", + "abdpain_vitals_sig = ttest_fields(abdpain_EDvitals_labeled,ED_vitals_fields,ED_vitals_ftypes)\n", + "print('Abdpain ED vitals:',abdpain_vitals_sig)\n", + "\n", + "# Hosp vitals ttest\n", + "# do a ttest with all fields with respect to MOD\n", + "hosp_vitals_fields = dizzy_hosp_vitals_df.columns[3:]\n", + "hosp_vitals_ftypes = len(hosp_vitals_fields) * ['c']\n", + "dizzy_hosp_vitals_sig = ttest_fields(dizzy_hosp_vitals_labeled[dizzy_hosp_vitals_labeled.label.isin(['MOD','NoMOD'])],\n", + " hosp_vitals_fields,hosp_vitals_ftypes)\n", + "print('Dizziness hosp vitals:',dizzy_hosp_vitals_sig)\n", + "abdpain_hosp_vitals_sig = ttest_fields(abdpain_hosp_vitals_labeled,hosp_vitals_fields,hosp_vitals_ftypes)\n", + "print('Abdpain hosp vitals:',abdpain_hosp_vitals_sig)\n", + "\n", + "# Consult ttest\n", + "dizzy_ccounts_fields = dizzy_topN_consult_counts_labeled.columns[1:-1]\n", + "dizzy_ccounts_ftypes = ['d','d','d']\n", + "dizzy_consults_sig = ttest_fields(dizzy_topN_consult_counts_labeled,dizzy_ccounts_fields,dizzy_ccounts_ftypes)\n", + "print('Dizziness consults: ',dizzy_consults_sig)\n", + "abdpain_ccounts_fields = abdpain_topN_consult_counts_labeled.columns[1:-1]\n", + "abdpain_ccounts_ftypes = len(abdpain_ccounts_fields) * ['d']\n", + "abdpain_consults_sig = ttest_fields(abdpain_topN_consult_counts_labeled,abdpain_ccounts_fields,abdpain_ccounts_ftypes)\n", + "print('Abdpain consults: ',abdpain_consults_sig)\n", + "\n", + "# Imaging ttest\n", + "dizzy_images_fields = dizzy_images_labeled.columns[2:-1]\n", + "dizzy_images_ftypes = len(dizzy_images_fields) * ['c']\n", + "dizzy_images_sig = ttest_fields(dizzy_images_labeled[dizzy_images_labeled.label.isin(['MOD','NoMOD'])],dizzy_images_fields,dizzy_images_ftypes)\n", + "print('Dizziness imaging: ', dizzy_images_sig)\n", + "abdpain_images_fields = abdpain_images_labeled.columns[2:-1]\n", + "abdpain_images_ftypes = 
len(abdpain_images_fields) * ['c']\n", + "abdpain_images_sig = ttest_fields(abdpain_images_labeled,abdpain_images_fields,abdpain_images_ftypes)\n", + "print('Abdpain imaging: ', abdpain_images_sig)\n", + "\n", + "# ttest for risk factors\n", + "dizzy_rf_sig = ttest_fields(bool_dizzy_rf_df_labeled,bool_dizzy_rf_df.columns[1:],len(bool_dizzy_rf_df.columns[1:])*['d'],show=False)\n", + "print('Dizzy RF: ', dizzy_rf_sig)\n", + "abdpain_rf_sig = ttest_fields(bool_abdpain_rf_df_labeled,bool_abdpain_rf_df.columns[1:],len(bool_abdpain_rf_df.columns[1:])*['d'],show=False)\n", + "print('Abdpain RF: ', abdpain_rf_sig)\n", + "\n", + "# ttest for labs (one field type per tested field, so both slices are [1:-1])\n", + "dizzy_labs_sig = ttest_fields(dizzy_clean_labs_labeled,dizzy_clean_labs_labeled.columns[1:-1],\n", + " len(dizzy_clean_labs_labeled.columns[1:-1])*['d'],show=False)\n", + "abdpain_labs_sig = ttest_fields(abdpain_clean_labs_labeled,abdpain_clean_labs_labeled.columns[1:-1],\n", + " len(abdpain_clean_labs_labeled.columns[1:-1])*['d'],show=False)\n", + "\n", + "\n", + " \n", + "print('Dizzy Lab: ',dizzy_labs_sig)\n", + "print('Abdpain Lab: ',abdpain_labs_sig)\n", + "\n", + "dizzy_good_cols = (dizzy_cohort_sig + dizzy_demo_sig + dizzy_vitals_sig + dizzy_hosp_vitals_sig + dizzy_consults_sig\n", + "                   + dizzy_images_sig + dizzy_rf_sig + dizzy_labs_sig)  # parenthesized so the expression can span two lines" + ] + }, + { + "cell_type": "markdown", + "id": "mexican-friend", + "metadata": {}, + "source": [ + "# Merge different dataframes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "invisible-fundamentals", + "metadata": {}, + "outputs": [], + "source": [ + "# start with cohort, then demo, then vitals, then consults, imaging, labs, history\n", + "\n", + "dizzy_cohort_cols = ['TriggerType', 'PtSSN', 'EDStartDateTime', 'EDEndDateTime',\n", + " 'EDVisitReason', 'AdmitDateTime', 'DischargeDateTime','hosp_stay',\n", + " 'ed_duration', 'FirstAdmission', 'ed_first_inp_delta', 'num_ED_visits',\n", + " 'num_hosp', 'sum_hosp_stay']\n", + "\n", + "dizzy_all = 
pd.DataFrame(dizzy_demo_coded.PtSSN,columns=['PtSSN'])\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_cohort_df[dizzy_cohort_cols],on='PtSSN')\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_demo_coded,on='PtSSN')\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_EDvitals_df,on=['PtSSN','EDStartDateTime'],how='left')\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_hosp_vitals_df,on=['PtSSN','AdmitDateTime'],how='left')\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_topN_consult_counts,on=['PtSSN'])\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_images,on=['PtSSN','EDStartDateTime'])\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_clean_labs,on=['PtSSN'],how='left')\n", + "dizzy_all = pd.merge(dizzy_all,bool_dizzy_rf_df,on='PtSSN')\n", + "print(dizzy_all.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "incorporate-hunger", + "metadata": {}, + "outputs": [], + "source": [ + "# handle missing values\n", + "\n", + "cols_with_missing = []\n", + "for col in dizzy_all.columns:\n", + " missing = dizzy_all[col].isna().sum()\n", + " if missing > 0:\n", + " cols_with_missing.append(col)\n", + " print(col,missing)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "moved-blair", + "metadata": {}, + "outputs": [], + "source": [ + "combo_label_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "victorian-electric", + "metadata": {}, + "outputs": [], + "source": [ + "# find a way to plot the labeled data in the combined dataframe\n", + "\n", + "\n", + "combo_label_df = pd.merge(dizzy_all,dizzy_labels_df,on='PtSSN',how='left')\n", + "combo_label_df.loc[combo_label_df.label.isna(),'label'] = 'unknown'\n", + "useful_cols = list(set(combo_label_df.columns[7:-1]).difference(['FirstAdmission']))\n", + "reduced_combo = combo_label_df[combo_label_df.label.isin(['MOD','NoMOD','unknown'])].dropna()\n", + "X = reduced_combo[useful_cols]\n", + "y = reduced_combo['label']\n", + "print(X.shape,y.shape)\n", + "label_list = 
list(reduced_combo.label.value_counts().index)\n", + "u = umap_plot_label(X,y,1,3,label_list,'upper left');\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "physical-mineral", + "metadata": {}, + "outputs": [], + "source": [ + "# analyze the resulting clusters\n", + "clust = cluster_umap(u,5,reduced_combo[useful_cols+['label']],'upper left')\n", + "analyze_clusters(clust,reduced_combo[useful_cols+['label']],useful_cols) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "organizational-resort", + "metadata": {}, + "outputs": [], + "source": [ + "imp_cols = ['ed_first_inp_delta','age_at_index_visit','new_race_WHITE','new_race_BLACK OR AFRICAN AMERICAN','glucose_count',\n", + " 'glucose_min','glucose_max','glucose_abnormal_count','CO2_count','hgb_abnormal_count','Systolic_max','Systolic_first',\n", + " 'Diastolic_max','Diastolic_first','PULSE_min','PULSE_first','HOSP_Systolic_max','HOSP_Diastolic_max','HOSP_PULSE_min',\n", + " 'HOSP_PULSE_first','ct_count','ct_abnormal_count','Diabetes','Hypertension','Coronary artery disease (CAD)']\n", + "\n", + "X = reduced_combo[imp_cols]\n", + "y = reduced_combo['label']\n", + "label_list = list(reduced_combo.label.value_counts().index)\n", + "u = umap_plot_label(X,y,1,7,label_list,'upper right');\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hybrid-postage", + "metadata": {}, + "outputs": [], + "source": [ + "# analyze the resulting clusters\n", + "clust = cluster_umap(u,3,reduced_combo[imp_cols+['label']],'upper right')\n", + "analyze_clusters(clust,reduced_combo[imp_cols+['label']],imp_cols) " + ] + }, + { + "cell_type": "markdown", + "id": "photographic-crowd", + "metadata": {}, + "source": [ + "# Need to get red flags to recreate Paarth classifier on our data\n", + "- headache\n", + "- diplopia\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "entertaining-catering", + "metadata": {}, + "outputs": [], + "source": [ + 
"notes = dataset['withRole']\n", + "notes.PatientSSN = notes.PatientSSN.astype('int64')\n", + "notes.EntryDateTime = pd.to_datetime(notes.EntryDateTime)\n", + "ed_notes = notes[notes.TIUStandardTitle=='EMERGENCY DEPT NOTE'].copy()\n", + "ed_notes.rename(columns={'PatientSSN':'PtSSN'},inplace=True)\n", + "ed_notes.PtSSN = ed_notes.PtSSN.astype('int64')\n", + "\n", + "\n", + "ed_notes_mod = pd.merge(ed_notes,dizzy_labels_df,on='PtSSN')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "popular-africa", + "metadata": {}, + "outputs": [], + "source": [ + "notes.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "equal-myanmar", + "metadata": {}, + "outputs": [], + "source": [ + "ptssn = ed_notes_mod.iloc[0].PtSSN\n", + "#print(ed_notes_mod.ReportText.iloc[0])\n", + "display(dizzy_df[dizzy_df.PtSSN==ptssn].CaseSummaryER.values)\n", + "display(combo_label_df[combo_label_df.PtSSN==ptssn])  # NOTE(review): was combo_df, which no visible cell defines; presumably combo_label_df - confirm\n", + "dizzy_cohort_df[dizzy_cohort_df.PtSSN==ptssn]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eleven-discipline", + "metadata": {}, + "outputs": [], + "source": [ + "tmp = notes[notes.PatientSSN==ptssn]\n", + "start_time, end_time = dizzy_cohort_df[dizzy_cohort_df.PtSSN==ptssn].EDStartDateTime.values[0],dizzy_cohort_df[dizzy_cohort_df.PtSSN==ptssn].DischargeDateTime.values[0]\n", + "tmp1 = tmp[(tmp.EntryDateTime >= start_time) & (tmp.EntryDateTime <= end_time)]\n", + "for i in range(tmp1.shape[0]):\n", + " print(i,'---------------------------------------------')\n", + " print(tmp1.TIUStandardTitle.iloc[i])\n", + " print(tmp1.ReportText.iloc[i])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sudden-albuquerque", + "metadata": {}, + "outputs": [], + "source": [ + "# UMAP the lab dataframe for abdpain\n", + "\n", + "XX = abdpain_clean_labs.dropna()\n", + "X = XX.drop(columns=['PtSSN'])  # a set is not a valid indexer in pandas>=2.0 and has no stable column order\n", + "u = umap_plot_nolabel(X,1,7)\n", + "\n", + "# analyze the resulting clusters\n", + "clust = 
cluster_umap_nolabel(u,3,X,'upper left')\n", + "analyze_clusters_nolabel(clust,X,X.columns)\n", + "\n", + "# create a tapestry plot to visualize the clusters according to median values\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "veterinary-walnut", + "metadata": {}, + "outputs": [], + "source": [ + "# start merging all dataframes and UMAP them for dizzy and abdpain\n" + ] + }, + { + "cell_type": "markdown", + "id": "covered-nashville", + "metadata": {}, + "source": [ + "# Make predictive model for dizzy with all the ttest relevant fields" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "broad-child", + "metadata": {}, + "outputs": [], + "source": [ + "dizzy_cohort_tmp = dizzy_cohort_df_labeled[['PtSSN'] + dizzy_cohort_sig]\n", + "dizzy_demo_tmp = dizzy_demo_coded_labeled[['PtSSN'] + dizzy_demo_sig]\n", + "dizzy_vitals_tmp = dizzy_EDvitals_labeled[['PtSSN'] + dizzy_vitals_sig]\n", + "dizzy_hosp_vitals_tmp = dizzy_hosp_vitals_labeled[['PtSSN'] + dizzy_hosp_vitals_sig]\n", + "#dizzy_consults_tmp = dizzy_topN_consult_counts_labeled[['PtSSN'] + dizzy_consults_sig]\n", + "\n", + "# images\n", + "dizzy_images_xr_count_labeled = pd.merge(dizzy_images_xr_count,dizzy_labels_df[dizzy_labels_df.label.isin(['MOD','NoMOD'])],on='PtSSN')\n", + "dizzy_images_ct_count_labeled = pd.merge(dizzy_images_ct_count,dizzy_labels_df[dizzy_labels_df.label.isin(['MOD','NoMOD'])],on='PtSSN')\n", + "dizzy_images_ct_abnormal_count_labeled = pd.merge(dizzy_images_ct_abnormal_count,dizzy_labels_df[dizzy_labels_df.label.isin(['MOD','NoMOD'])],on='PtSSN')\n", + "\n", + "# convert image count fields into int\n", + "dizzy_images_xr_count_labeled.xr_count = dizzy_images_xr_count_labeled.xr_count.astype(int)\n", + "dizzy_images_ct_count_labeled.ct_count = dizzy_images_ct_count_labeled.ct_count.astype(int)\n", + "dizzy_images_ct_abnormal_count_labeled.ct_abnormal_count = dizzy_images_ct_abnormal_count_labeled.ct_abnormal_count.astype(int)\n", + "\n", 
+ "dizzy_images_xr_count_tmp = dizzy_images_xr_count_labeled[['PtSSN','xr_count']].drop_duplicates()\n", + "dizzy_images_ct_count_tmp = dizzy_images_ct_count_labeled[['PtSSN','ct_count']].drop_duplicates()\n", + "dizzy_images_ct_abnormal_count_tmp = dizzy_images_ct_abnormal_count_labeled[['PtSSN','ct_abnormal_count']].drop_duplicates()\n", + "\n", + "\n", + "# labs is rel_dizzy_labs\n", + "# risk factors is dizzy_rf_df_all_labeled\n", + "\n", + "# merge them all\n", + "\n", + "labels_tmp = dizzy_labels_df[dizzy_labels_df.label.isin(['MOD','NoMOD'])][['PtSSN','label']]\n", + "dizzy_all = pd.merge(labels_tmp,dizzy_cohort_tmp,on='PtSSN')\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_demo_tmp,on='PtSSN')\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_vitals_tmp,on='PtSSN')\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_hosp_vitals_tmp,on='PtSSN')\n", + "#dizzy_all = pd.merge(dizzy_all,dizzy_consults_tmp,on='PtSSN')  # dizzy_consults_tmp is commented out above; consults are likewise skipped for all_dizzy_all\n", + "dizzy_all = pd.merge(dizzy_all,rel_dizzy_labs,on='PtSSN')\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_images_xr_count_tmp,on='PtSSN')\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_images_ct_count_tmp,on='PtSSN')\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_images_ct_abnormal_count_tmp,on='PtSSN')\n", + "dizzy_all = pd.merge(dizzy_all,dizzy_rf_df_all_labeled[['PtSSN','Hx aneurysm']],on=['PtSSN'])\n", + "print(dizzy_all.shape)\n", + "set(dizzy_all.columns).difference(set(all_dizzy_all.columns))" + ] + }, + { + "cell_type": "markdown", + "id": "authentic-afghanistan", + "metadata": {}, + "source": [ + "# make feature dataframe for entire data set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "light-reference", + "metadata": {}, + "outputs": [], + "source": [ + "# make dataset for all of dizzy (not just the labeled data)\n", + "all_dizzy_cohort_tmp = dizzy_cohort_df[['PtSSN'] + dizzy_cohort_sig]\n", + "#print(all_dizzy_cohort_tmp.shape,all_dizzy_cohort_tmp.columns)\n", + "all_dizzy_demo_tmp = dizzy_demo_coded[['PtSSN'] + dizzy_demo_sig]\n", 
# %%
# Assemble all_dizzy_all -- this is the tail of a cell that starts above this section.
# NOTE(review): all_dizzy_cohort_tmp and all_dizzy_demo_tmp are expected to exist
# already (they are built in the part of this cell that precedes these lines).

# Fixed seed so the tree / forest fits and the train/test split repeat across runs.
RANDOM_STATE = 0

all_dizzy_vitals_tmp = dizzy_EDvitals_df[['PtSSN'] + dizzy_vitals_sig]
all_dizzy_hosp_vitals_tmp = dizzy_hosp_vitals_df[['PtSSN'] + dizzy_hosp_vitals_sig]
# Consult counts (dizzy_topN_consult_counts / dizzy_consults_sig) are not merged
# into the dizzy feature table.

# Image counts: cast to int on a selected copy instead of mutating the upstream
# frames, so re-running this cell never changes its own inputs.
all_dizzy_images_xr_count_tmp = (
    dizzy_images_xr_count[['PtSSN', 'xr_count']]
    .astype({'xr_count': int})
    .drop_duplicates()
)
all_dizzy_images_ct_count_tmp = (
    dizzy_images_ct_count[['PtSSN', 'ct_count']]
    .astype({'ct_count': int})
    .drop_duplicates()
)
all_dizzy_images_ct_abnormal_count_tmp = (
    dizzy_images_ct_abnormal_count[['PtSSN', 'ct_abnormal_count']]
    .astype({'ct_abnormal_count': int})
    .drop_duplicates()
)

# Inner-join every feature table onto the cohort + demographics frame by PtSSN.
all_dizzy_all = pd.merge(all_dizzy_cohort_tmp, all_dizzy_demo_tmp, on='PtSSN')
for feature_frame in (
    all_dizzy_vitals_tmp,
    all_dizzy_hosp_vitals_tmp,
    rel_dizzy_labs,                                # labs
    all_dizzy_images_xr_count_tmp,
    all_dizzy_images_ct_count_tmp,
    all_dizzy_images_ct_abnormal_count_tmp,
    dizzy_rf_df_all[['PtSSN', 'Hx aneurysm']],     # risk factors
):
    all_dizzy_all = pd.merge(all_dizzy_all, feature_frame, on='PtSSN')
print(all_dizzy_all.shape)
print(all_dizzy_all.columns)

# %%
# Labeled subset: which columns still have NAs once count columns default to 0?
limit_dizzy_all = dizzy_all[dizzy_all.label.isin(['MOD', 'NoMOD'])].copy()

# Count columns where a missing value is treated as "zero observations".
abcols = ['WBC_abnormal_count', 'glucose_count', 'glucose_abnormal_count',
          'albumin_abnormal_count', 'CO2_count',
          'alkphos_abnormal_count', 'hgb_abnormal_count']
limit_dizzy_all[abcols] = limit_dizzy_all[abcols].fillna(0)

for col, n_missing in limit_dizzy_all.iloc[:, 2:].isna().sum().items():
    if n_missing > 0:
        print(col, n_missing)

print(limit_dizzy_all.shape)

# %%
# Class-specific medians for glucose_min / glucose_max (used below for imputation).
glucose_cols = ['glucose_min', 'glucose_max']
glucose_class_medians = limit_dizzy_all.groupby('label')[glucose_cols].median()
for gcol in glucose_cols:
    print(gcol,
          glucose_class_medians.loc['MOD', gcol],
          glucose_class_medians.loc['NoMOD', gcol])

# %%
# Inspect feature correlations; highly correlated columns are candidates to drop
# before fitting the logistic regression (nothing is dropped in this cell).
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(limit_dizzy_all.iloc[:, 2:].corr(), ax=ax, annot=False)
ax.set_title('Feature correlation (labeled dizziness records)');

# %%
# Complete-case training set and hyper-parameter search for the L1 model.
tmp = limit_dizzy_all.dropna()
print(tmp.shape)
X = tmp.iloc[:, 2:]
yy = tmp.label
y = (yy == 'MOD').astype(int).to_numpy()   # 1 = MOD, 0 = NoMOD
print(X.shape, y.shape)
clf = tune_model(X, y)
print('Best C = ',clf.best_params_['C'])

# %%
# NOTE(review): the regularisation strength is pinned by hand and overrides the
# tuned clf.best_params_['C'] printed above -- confirm this is still intended.
DIZZY_L1_C = 0.0045
clf, select_feats = build_L1_model(X, y, DIZZY_L1_C)
visualize_model(clf, select_feats)

# %%
# umap it all
u = umap_plot_label(X, yy, 1, 7, ['NoMOD', 'MOD'], 'upper right')

# %%
c = cluster_umap(u, 4, tmp, 'upper right')

# %%
analyze_clusters(c, tmp, select_feats)

# %%
# Labeled rows that were excluded from training because they contain nulls.
null_val_rows = limit_dizzy_all[limit_dizzy_all.isnull().any(axis=1)].copy()
null_val_rows.shape

# %%
# Impute glucose_min / glucose_max with the class-specific medians computed above.
# Only missing cells are filled; observed values are never overwritten, and the
# medians come from the data instead of hand-copied constants.
# NOTE(review): this imputation uses each row's true label, so predictions on
# null_val_rows below are optimistic -- the label leaks into the features.
for gcol in glucose_cols:
    for cls in ('MOD', 'NoMOD'):
        fill_rows = (null_val_rows.label == cls) & null_val_rows[gcol].isna()
        null_val_rows.loc[fill_rows, gcol] = glucose_class_medians.loc[cls, gcol]

# %%
# predict on null_val_rows
print(clf.predict(null_val_rows.iloc[:, 2:]))
print(null_val_rows['label'])

# %%
# Build tmp1: every dizzy record (labeled or not) with no remaining nulls.
print(all_dizzy_all.shape)
all_dizzy_all[abcols] = all_dizzy_all[abcols].fillna(0)

for col, n_missing in all_dizzy_all.iloc[:, 1:].isna().sum().items():
    if n_missing > 0:
        print(col, n_missing)

print(all_dizzy_all.shape)

# find those rows in all_dizzy_all that have null
all_null_val_rows = all_dizzy_all[all_dizzy_all.isnull().any(axis=1)].copy()
print(all_null_val_rows.shape)

# Labels are unknown for most of these rows, so use medians across the entire set.
glucose_overall_medians = all_dizzy_all[glucose_cols].median()
for gcol in glucose_cols:
    print(gcol, glucose_overall_medians[gcol])
all_null_val_rows[glucose_cols] = (
    all_null_val_rows[glucose_cols].fillna(glucose_overall_medians)
)

tmp1 = pd.concat([all_dizzy_all.dropna(), all_null_val_rows])
print(tmp1.iloc[:, 1:].shape)
for col, n_missing in tmp1.iloc[:, 1:].isna().sum().items():
    if n_missing > 0:
        print(col, n_missing)

# %%
# True-label breakdown of the records the L1 model assigns to each class.
print('Logistic regression prediction:')
ypred1 = clf.predict(tmp1.iloc[:, 1:])
for predicted_class in (1, 0):
    predicted_ids = tmp1.loc[ypred1 == predicted_class, ['PtSSN']]
    display(pd.merge(predicted_ids, dizzy_labels_df, on='PtSSN')['label'].value_counts())

# %%
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=7, criterion='entropy', random_state=RANDOM_STATE)
scores = cross_val_score(dt, X, y)
print(np.mean(scores), np.std(scores))

# %%
# Train and test on all complete labeled records (75 per the original note).
# This is a resubstitution confusion matrix, not a held-out estimate.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=RANDOM_STATE)
rf.fit(X, y)
ypred = rf.predict(X)
metrics.confusion_matrix(y, ypred)

# %%
# Random-forest counterpart of the logistic-regression breakdown above. It lives
# here, after `rf` is fitted, so the notebook survives Restart & Run All.
# NOTE(review): this uses the forest trained on all labeled rows; confirm that is
# the intended model rather than the train/test-split forest fitted further down.
print('Random forest prediction:')
ypred1 = rf.predict(tmp1.iloc[:, 1:])
for predicted_class in (1, 0):
    predicted_ids = tmp1.loc[ypred1 == predicted_class, ['PtSSN']]
    display(pd.merge(predicted_ids, dizzy_labels_df, on='PtSSN')['label'].value_counts())

# %%
rf.predict(null_val_rows.iloc[:, 2:])

# %%
featimp = pd.Series(rf.feature_importances_, index=tmp.columns[2:]).sort_values(ascending=False)

plt.figure(figsize=(8, 6))
# x / y must be passed by keyword: seaborn >= 0.12 rejects positional data vectors.
myplot = sns.barplot(x=featimp.index, y=featimp.values)
myplot.set(xlabel='feature', ylabel='importance',
           title='Random forest feature importance (all labeled rows)')
myplot.set_xticklabels(myplot.get_xticklabels(), rotation=90);

# %%
# Stratified hold-out evaluation. The split is 90/10 (test_size=0.1).
# NOTE(review): the original comment said "80-20"; confirm which split is intended.
TEST_FRACTION = 0.1
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, stratify=y, test_size=TEST_FRACTION, random_state=RANDOM_STATE
)
print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)
rf = RandomForestClassifier(n_estimators=7, max_depth=4, random_state=RANDOM_STATE)
rf.fit(Xtrain, ytrain)
ypred = rf.predict(Xtest)
# ROC AUC needs a ranking score; hard 0/1 predictions collapse the curve to one point.
ytest_prob = rf.predict_proba(Xtest)[:, 1]
print(metrics.confusion_matrix(ytest, ypred))
print('AUC = ',metrics.roc_auc_score(ytest, ytest_prob))
print('F1 = ',metrics.f1_score(ytest, ypred))
print('Accuracy = ',metrics.accuracy_score(ytest, ypred))

# %%
# Confusion matrix over every labeled row (complete rows + imputed null rows).
# Note that this includes the rows the forest was trained on.
labeled_rows = pd.concat([tmp, null_val_rows])
ypred_all = rf.predict(labeled_rows.iloc[:, 2:])
y_all = (labeled_rows.label == 'MOD').astype(int).tolist()
metrics.confusion_matrix(y_all, ypred_all)

# %%
featimp = pd.Series(rf.feature_importances_, index=tmp.columns[2:]).sort_values(ascending=False)

plt.figure(figsize=(8, 6))
myplot = sns.barplot(x=featimp.index, y=featimp.values)
myplot.set(xlabel='feature', ylabel='importance',
           title='Random forest feature importance (train/test split model)')
myplot.set_xticklabels(myplot.get_xticklabels(), rotation=90);

# %% [markdown]
# # Classify unlabeled records in dizzy

# %%
tmp1.shape

# %%
for col, n_missing in tmp1.iloc[:, 1:].isna().sum().items():
    if n_missing > 0:
        print(col, n_missing)

# %%
# merge to create temp labels
tmp2 = pd.merge(tmp1, dizzy_labels_df, on='PtSSN', how='left')
print(tmp2.shape)
# Number of labeled patients that have no row in the feature table.
print(len(set(dizzy_labels_df.PtSSN).difference(tmp2.PtSSN)))
tmp2['label'] = tmp2['label'].fillna('unknown')
tmp3 = tmp2[tmp2.label.isin(['MOD', 'NoMOD', 'unknown'])]

# Features are every column between PtSSN (first) and label (last).
u = umap_plot_label(tmp3.iloc[:, 1:-1], tmp3.label, 4, 7, ['NoMOD', 'unknown', 'MOD'], 'upper right');

# %%
# Shown after the merge above: tmp2 does not exist before that cell runs.
tmp2.columns

# %%
c = cluster_umap(u, 3, tmp3, 'upper right')

# %%
analyze_clusters(c, tmp3, list(featimp.index)[:10])

# %%
# TODO: plot the predicted labels for the unknowns
# %% [markdown]
# # Assemble the abdpain_all dataframe

# %%
# One row per patient from each labeled source table.
# The column slices drop the leading id column(s) and the trailing label column.
abdpain_cohort_tmp = abdpain_cohort_df_labeled[['PtSSN'] + cohort_fields].drop_duplicates(subset=['PtSSN'])
abdpain_demo_tmp = abdpain_demo_coded_labeled[
    ['PtSSN'] + list(abdpain_demo_coded_labeled.columns[1:-1])
].drop_duplicates(subset=['PtSSN'])
abdpain_vitals_tmp = abdpain_EDvitals_labeled[
    ['PtSSN'] + list(abdpain_EDvitals_labeled.columns[2:-1])
].drop_duplicates(subset=['PtSSN'])
abdpain_hosp_vitals_tmp = abdpain_hosp_vitals_labeled[
    ['PtSSN'] + list(abdpain_hosp_vitals_labeled.columns[2:-1])
].drop_duplicates(subset=['PtSSN'])
# NOTE(review): the column list below comes from the *unlabeled* consult frame while
# the rows come from the labeled one; every sibling line slices the labeled frame.
# If the labeled frame is the unlabeled one plus a trailing label column, [1:-1] on
# the unlabeled frame silently drops the last consult feature -- please confirm.
abdpain_consults_tmp = abdpain_topN_consult_counts_labeled[
    ['PtSSN'] + list(abdpain_topN_consult_counts.columns[1:-1])
].drop_duplicates(subset=['PtSSN'])

# images: attach labels, keep one row per patient, store counts as int
abdpain_images_xr_count_labeled = (
    pd.merge(abdpain_images_xr_count, abdpain_labels_df, on='PtSSN')
    .drop_duplicates(subset=['PtSSN'])
    .astype({'xr_count': int})
)
abdpain_images_ct_count_labeled = (
    pd.merge(abdpain_images_ct_count, abdpain_labels_df, on='PtSSN')
    .drop_duplicates(subset=['PtSSN'])
    .astype({'ct_count': int})
)
abdpain_images_ct_abnormal_count_labeled = (
    pd.merge(abdpain_images_ct_abnormal_count, abdpain_labels_df, on='PtSSN')
    .drop_duplicates(subset=['PtSSN'])
    .astype({'ct_abnormal_count': int})
)

abdpain_images_xr_count_tmp = abdpain_images_xr_count_labeled[['PtSSN', 'xr_count']].drop_duplicates()
abdpain_images_ct_count_tmp = abdpain_images_ct_count_labeled[['PtSSN', 'ct_count']].drop_duplicates()
abdpain_images_ct_abnormal_count_tmp = abdpain_images_ct_abnormal_count_labeled[['PtSSN', 'ct_abnormal_count']].drop_duplicates()

# %%
# Merge them all. Labs come from rel_abdpain_labs, risk factors from
# abdpain_rf_df_all_labeled. ED / hospital vitals are LEFT-joined so patients
# without vitals stay in the table (their vitals are imputed further down).
abdpain_all = pd.merge(abdpain_labels_df, abdpain_cohort_tmp, on='PtSSN')
print(abdpain_all.shape)
merge_plan = [
    (abdpain_demo_tmp, {'on': 'PtSSN'}),
    (abdpain_consults_tmp, {'on': 'PtSSN'}),
    (rel_abdpain_labs, {'on': 'PtSSN'}),
    (abdpain_images_xr_count_tmp, {'on': 'PtSSN'}),
    (abdpain_images_ct_count_tmp, {'on': 'PtSSN'}),
    (abdpain_images_ct_abnormal_count_tmp, {'on': 'PtSSN'}),
    (abdpain_rf_df_all_labeled, {'on': ['PtSSN', 'label']}),
    (abdpain_vitals_tmp, {'on': 'PtSSN', 'how': 'left'}),
    (abdpain_hosp_vitals_tmp, {'on': 'PtSSN', 'how': 'left'}),
]
for feature_frame, merge_kwargs in merge_plan:
    abdpain_all = pd.merge(abdpain_all, feature_frame, **merge_kwargs)
    print(abdpain_all.shape)   # row count after each join shows where patients drop out

# %%
# who are the missing SSNs in abdpain_EDvitals?
# NOTE(review): this prints patient identifiers -- clear the output before sharing.
missing_ed_vitals = set(abdpain_cohort_df_labeled.PtSSN).difference(
    abdpain_EDvitals_labeled.PtSSN.dropna()
)
print(missing_ed_vitals)
abdpain_cohort_df_labeled[abdpain_cohort_df_labeled.PtSSN.isin(missing_ed_vitals)]

# %%
# who are the missing SSNs in abdpain_hosp_vitals?
missing_hosp_vitals = set(abdpain_cohort_df_labeled.PtSSN).difference(
    abdpain_hosp_vitals_labeled.PtSSN.dropna()
)
print(missing_hosp_vitals)
abdpain_cohort_df_labeled[abdpain_cohort_df_labeled.PtSSN.isin(missing_hosp_vitals)]

# %%
abdpain_all.label.value_counts()

# %%
abdpain_labels_df.label.value_counts()

# %%
for col, n_missing in abdpain_all.iloc[:, 2:].isna().sum().items():
    if n_missing > 0:
        print(col, n_missing)

# %%
abdpain_all.columns

# %%
# fill na on the counts to be zero
count_fields = ['bun_abnormal_count', 'lact_abnormal_count', 'amylase_abnormal_count']
abdpain_all[count_fields] = abdpain_all[count_fields].fillna(0)

print(abdpain_all.shape)

# %%
def _fill_class_median(frame, columns, indicator=None, label_col='label',
                       classes=('MOD', 'NoMOD')):
    """Fill missing cells of `columns` in `frame` (in place) with per-class medians.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding `label_col` and every name in `columns`; modified in place.
    columns : list of str
        Columns to impute.
    indicator : str, optional
        When given, only rows where this column is missing are imputed, and the
        medians are taken from the rows where it is present (the same rule for
        every column). When omitted, each column uses its own missing rows.
    label_col : str
        Column holding the class label.
    classes : tuple of str
        Labels that get imputed; rows with any other label are left untouched.

    For each column the medians are printed in the order of `classes`.
    Observed values are never overwritten.
    """
    # Freeze the row selection up front so filling one column cannot change which
    # rows count as "missing" (or feed the medians) for the next column.
    fixed_gap = frame[indicator].isna() if indicator is not None else None
    for col in columns:
        gap = fixed_gap if fixed_gap is not None else frame[col].isna()
        medians = frame.loc[~gap].groupby(label_col)[col].median()
        print(col, *(medians.get(cls) for cls in classes))
        for cls in classes:
            target = gap & frame[col].isna() & (frame[label_col] == cls)
            if target.any():
                frame.loc[target, col] = medians[cls]

# %%
# Impute ED vitals for patients with no ED vitals record, using MOD / NoMOD medians.
# NOTE(review): class-conditional imputation uses the true label, so the label
# leaks into the features of the imputed rows.
VITAL_SIGNS = ['Systolic', 'Diastolic', 'PULSE', 'RESPIRATION', 'PAIN', 'TEMPERATURE']
VITAL_STATS = ['count', 'max', 'min', 'first']
ed_vitals_cols = [f'{sign}_{stat}' for sign in VITAL_SIGNS for stat in VITAL_STATS]

_fill_class_median(abdpain_all, ed_vitals_cols, indicator='Systolic_count')

# %%
# Same treatment for the hospital vitals (HOSP_-prefixed columns).
hosp_vitals_cols = ['HOSP_' + x for x in ed_vitals_cols]

_fill_class_median(abdpain_all, hosp_vitals_cols, indicator='HOSP_Systolic_count')

# %%
for col, n_missing in abdpain_all.iloc[:, 2:].isna().sum().items():
    if n_missing > 0:
        print(col, n_missing)

# %%
# fix the lab value columns for the missing based on class
labval_cols = ['WBC_max', 'potassium_max', 'potassium_min', 'chloride_max', 'amylase_max']

_fill_class_median(abdpain_all, labval_cols)

# %%
for col, n_missing in abdpain_all.iloc[:, 2:].isna().sum().items():
    if n_missing > 0:
        print(col, n_missing)

# %%
# Predictor columns in their original table order. A list (not a set) keeps the
# feature order identical from one kernel session to the next and remains a valid
# indexer on pandas >= 2.0, which rejects sets.
pred_cols = [col for col in abdpain_all.columns if col not in ('PtSSN', 'label')]
abdpain_all_clean = abdpain_all[['PtSSN', 'label'] + pred_cols].copy()
abdpain_all_clean.columns

# %%
# make count and boolean fields integers
cfields = ['bun_abnormal_count', 'lact_abnormal_count', 'amylase_abnormal_count',
           'cirrhosis', 'crohns_uc', 'diverticulitis', 'appendicitis', 'gallbladder']
abdpain_all_clean[cfields] = abdpain_all_clean[cfields].astype(int)

abdpain_all_clean.dtypes

# %%
# Complete-case training set and hyper-parameter search for the L1 model.
tmp = abdpain_all_clean.dropna()
X = tmp.loc[:, pred_cols]
yy = tmp.label
y = (yy == 'MOD').astype(int).to_numpy()   # 1 = MOD, 0 = NoMOD
print(X.shape, y.shape)
clf = tune_model(X, y, 5)

# %%
# NOTE(review): C is pinned to 1 by hand instead of using clf.best_params_['C'].
clf, select_feats = build_L1_model(X, y, 1)
visualize_model(clf, select_feats)

# %%
# Random forest fitted and scored on the same rows (resubstitution, not held-out).
from sklearn.ensemble import RandomForestClassifier

RANDOM_STATE = 0   # fixed seed so the forest is identical on every run
rf = RandomForestClassifier(n_estimators=4, max_depth=5, random_state=RANDOM_STATE)
rf.fit(tmp.iloc[:, 2:], y)
ypred = rf.predict(tmp.iloc[:, 2:])
metrics.confusion_matrix(y, ypred)

# %%
featimp = pd.Series(rf.feature_importances_, index=tmp.columns[2:]).sort_values(ascending=False)
featimp_nz = featimp[featimp > 0]
import seaborn as sns

plt.figure(figsize=(8, 6))
# x / y must be passed by keyword: seaborn >= 0.12 rejects positional data vectors.
myplot = sns.barplot(x=featimp_nz.index, y=featimp_nz.values)
myplot.set(xlabel='feature', ylabel='importance',
           title='Random forest feature importance (abdominal pain, non-zero only)')
myplot.set_xticklabels(myplot.get_xticklabels(), rotation=90);

# %%
# Features whose importance is at least the mean importance.
good_feats = featimp[featimp >= featimp.mean()].index

# %%
# Rows of abdpain_all_clean that still contain nulls (i.e. the rows not in tmp).
# NOTE(review): this displays patient identifiers -- clear the output before sharing.
null_val_rows = abdpain_all_clean[abdpain_all_clean.isnull().any(axis=1)].copy()
null_val_rows

# %%
# Fill potassium_max, chloride_max and WBC_max with class-dependent medians.
# Only missing cells are filled; observed values are never overwritten.
# NOTE(review): the original note also listed amylase_min / amylase_max, but they
# are not imputed here; any nulls left in other columns reach rf.predict below.
# NOTE(review): imputing by true label leaks the label into these rows' features.
imputed_lab_cols = ['potassium_max', 'chloride_max', 'WBC_max']
lab_class_medians = abdpain_all_clean.groupby('label')[imputed_lab_cols].median()

impval_dict = {}
for gcol in imputed_lab_cols:
    mod_val = lab_class_medians.loc['MOD', gcol]
    nomod_val = lab_class_medians.loc['NoMOD', gcol]
    print(gcol, mod_val, nomod_val)
    impval_dict[gcol] = {'MOD': mod_val, 'NoMOD': nomod_val}

for gcol in imputed_lab_cols:
    for cls in ('MOD', 'NoMOD'):
        fill_rows = (null_val_rows.label == cls) & null_val_rows[gcol].isna()
        null_val_rows.loc[fill_rows, gcol] = impval_dict[gcol][cls]

# %%
# Confusion matrix on the imputed rows only (these were never used to fit rf).
ytest = [1 if x == 'MOD' else 0 for x in null_val_rows.label]
metrics.confusion_matrix(ytest, rf.predict(null_val_rows.iloc[:, 2:]))

# %%
# Confusion matrix over every labeled row; this includes the rows rf was trained on.
all_feature_rows = pd.concat([tmp.iloc[:, 2:], null_val_rows.iloc[:, 2:]])
ypred_all = rf.predict(all_feature_rows)
y_all = [1 if x == 'MOD' else 0 for x in tmp.label] + [1 if x == 'MOD' else 0 for x in null_val_rows.label]
metrics.confusion_matrix(y_all, ypred_all)

# %%
# Ranking metrics use the predicted probability of MOD; threshold metrics use the
# hard predictions. AUPRC is the average precision over the precision-recall curve;
# the single-threshold precision is reported under its own name.
ypred_all_prob = rf.predict_proba(all_feature_rows)[:, 1]
print('AUC = ',metrics.roc_auc_score(y_all, ypred_all_prob))
print('F1 = ',metrics.f1_score(y_all, ypred_all))
print('Precision = ', metrics.precision_score(y_all, ypred_all))
print('AUPRC = ', metrics.average_precision_score(y_all, ypred_all_prob))

# %%
# umap it all
u = umap_plot_label(X, yy, 1, 5, ['NoMOD', 'MOD'], 'upper left')

# %%
c = cluster_umap(u, 3, tmp, 'upper right')

# %%
# Summarise clusters on the 25 most important random-forest features.
stat_cols = featimp.iloc[:25].index
analyze_clusters(c, tmp, stat_cols)

# Notebook metadata from the original .ipynb: kernel "Python 3 (ipykernel)",
# language python 3.11.5, nbformat 4.5.