diff --git a/Devika/Analysis_Dizziness_AbdPain_Apr2023-Copy2.ipynb b/Devika/Analysis_Dizziness_AbdPain_Apr2023-Copy2.ipynb
new file mode 100644
index 0000000..866afcc
--- /dev/null
+++ b/Devika/Analysis_Dizziness_AbdPain_Apr2023-Copy2.ipynb
@@ -0,0 +1,1823 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "brutal-royal",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "# Configure autoreload before importing project code so edits to B00_util are picked up.\n",
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "# pd and np are used directly throughout this notebook; import them explicitly instead of\n",
+    "# relying on whatever the star import below happens to re-export.\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Project helpers (presumably extractDataset, separate_*, ttest_fields, umap_*, ...).\n",
+    "# Kept last so any names B00_util itself exports still take precedence, exactly as before.\n",
+    "from B00_util import *"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "auburn-accident",
+   "metadata": {},
+   "source": [
+    "# Get trigger-positive data for ML4"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "angry-analysis",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The second argument is a set of string flags; their meaning is defined by extractDataset (not shown here).\n",
+    "dataset = extractDataset(\n",
+    "    \"B00_ML4TrgPos_Y2016\",\n",
+    "    {\"AllMed\", \"HF\", \"NonVAMed\", \"DispensedDrug\", \"Only10daysPrior30DaysAfter\"},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "hawaiian-saturn",
+   "metadata": {},
+   "source": [
+    "#  Cohorts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "sustainable-connecticut",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split the extract's cohort table into the dizziness and abdominal-pain cohort dataframes.\n",
+    "cohorts = dataset[\"cohort\"]\n",
+    "dizzy_cohort_df, abdpain_cohort_df = separate_cohorts(cohorts)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "possible-council",
+   "metadata": {},
+   "source": [
+    "#  Demographic data\n",
+    "- For the dizzy cohort, there are four labeled patients for whom we have no demographic or cohort records."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "turned-fitting",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 'Demorgraphics' (sic) is the key under which the extract stores the demographics table.\n",
+    "demog = dataset['Demorgraphics']\n",
+    "dizzy_demo_coded, abdpain_demo_coded = separate_demog(\n",
+    "    demog, dizzy_cohort_df, abdpain_cohort_df\n",
+    ")\n",
+    "print(\n",
+    "    'Dizzy demo df = ', dizzy_demo_coded.shape,\n",
+    "    ' Abdpain demo df = ', abdpain_demo_coded.shape,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "accessory-myanmar",
+   "metadata": {},
+   "source": [
+    "#  ED vitals\n",
+    "- vitals during ED visit\n",
+    "    - systolic, diastolic (per visit)\n",
+    "    - pulse, respiration, pulse oximetry, pain, temperature (count, min, max, first for multiple readings)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "institutional-township",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vitals = dataset['Vital']\n",
+    "\n",
+    "# Per-cohort ED-visit vitals, as built by separate_cohorts_EDvitals.\n",
+    "dizzy_EDvitals_df, abdpain_EDvitals_df = separate_cohorts_EDvitals(\n",
+    "    vitals, dizzy_cohort_df, abdpain_cohort_df\n",
+    ")\n",
+    "print(dizzy_EDvitals_df.shape, abdpain_EDvitals_df.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "radio-venue",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Missing-value count for every column of the dizziness ED-vitals frame.\n",
+    "dizzy_EDvitals_na_counts = dizzy_EDvitals_df.isna().sum()\n",
+    "print(dizzy_EDvitals_na_counts)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "living-oasis",
+   "metadata": {},
+   "source": [
+    "# Vitals during the hospitalizations subsequent to ED visits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dated-fiction",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get vitals from hospital visits\n",
+    "dizzy_hosp_vitals_df, abdpain_hosp_vitals_df = separate_cohorts_hosp_vitals(\n",
+    "    vitals, dizzy_cohort_df, abdpain_cohort_df\n",
+    ")\n",
+    "print(dizzy_hosp_vitals_df.shape, abdpain_hosp_vitals_df.shape)\n",
+    "\n",
+    "# Last expression of the cell: per-column missing-value counts are shown as the cell output.\n",
+    "dizzy_hosp_vitals_df.isna().sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "confidential-companion",
+   "metadata": {},
+   "source": [
+    "# Consults  ordered during ED visit\n",
+    "- count up top three consult depts for dizzy\n",
+    "- count up top seven consult depts for abdpain\n",
+    "\n",
+    "- Cardiology consult count is useful for dizzy\n",
+    "- EKG consult count is useful for abdpain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "unable-technique",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "consults = dataset['Consult']\n",
+    "dizzy_consults, abdpain_consults = separate_cohorts_consults(\n",
+    "    consults, dizzy_cohort_df, abdpain_cohort_df\n",
+    ")\n",
+    "\n",
+    "# look at which departments are being consulted (pick top N):\n",
+    "# top 3 for dizzy, top 7 for abdpain, matching the markdown cell above\n",
+    "dizzy_topN_consult_counts = topN_consult_counts(dizzy_consults, dizzy_demo_coded, 3)\n",
+    "abdpain_topN_consult_counts = topN_consult_counts(abdpain_consults, abdpain_demo_coded, 7)\n",
+    "print(dizzy_topN_consult_counts.shape, abdpain_topN_consult_counts.shape)\n",
+    "\n",
+    "dizzy_consult_na_counts = dizzy_topN_consult_counts.isna().sum()\n",
+    "print(dizzy_consult_na_counts)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "touched-geography",
+   "metadata": {},
+   "source": [
+    "# Imaging features\n",
+    "- ct images: - how many ordered, how many w/contrast, how many abnormal\n",
+    "- xr images: how many ordered, how many abnormal\n",
+    "- us images: how many ordered, how many abnormal\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "focal-albania",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get imaging records for dizzy and abdpain\n",
+    "images = dataset['Rad']\n",
+    "\n",
+    "dizzy_images, abdpain_images = separate_cohorts_images(\n",
+    "    images, dizzy_cohort_df, abdpain_cohort_df\n",
+    ")\n",
+    "print(dizzy_images.shape, abdpain_images.shape)\n",
+    "\n",
+    "dizzy_images_na_counts = dizzy_images.isna().sum()\n",
+    "print(dizzy_images_na_counts)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "quality-syndrome",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# Labs: ED visit only\n",
+    "- for select labs, get count, min, max, abnormal_count"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "sitting-cricket",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Raw lab table, kept under its own name so that `labs` only ever means the list of lab names below.\n",
+    "lab_records = dataset['Lab']\n",
+    "dizzy_labs, abdpain_labs = separate_cohorts_labs(lab_records, dizzy_cohort_df, abdpain_cohort_df)\n",
+    "print(dizzy_labs.shape, abdpain_labs.shape)\n",
+    "\n",
+    "# Short lab names; each one has a matcher function named matches_<lab>_loinc\n",
+    "# (presumably provided by the B00_util star import).\n",
+    "labs = ['WBC', 'glucose', 'albumin', 'potassium', 'calcium', 'lact', 'chloride', 'bun', 'creat',\n",
+    "        'troponin', 'CO2', 'ast', 'alt', 'alkphos', 'lipase', 'amylase', 'hgb']\n",
+    "lab_fns = ['matches_' + lab + '_loinc' for lab in labs]\n",
+    "\n",
+    "# collect all labs and then filter later on\n",
+    "dizzy_lab_dict = {}\n",
+    "abdpain_lab_dict = {}\n",
+    "for lab, labfn in zip(labs, lab_fns):\n",
+    "    # Look the matcher up by name in the notebook namespace instead of eval()-ing a built string.\n",
+    "    matcher = globals()[labfn]\n",
+    "    dizzy_lab_dict[lab] = get_labs_data(dizzy_labs, lab, matcher, dizzy_demo_coded)\n",
+    "    abdpain_lab_dict[lab] = get_labs_data(abdpain_labs, lab, matcher, abdpain_demo_coded)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "painful-fashion",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# merge lab dataframes\n",
+    "# Start from one row per patient, then inner-join (pd.merge default) each lab's summary frame on PtSSN.\n",
+    "dizzy_merged_labs = pd.DataFrame(dizzy_cohort_df.PtSSN.unique(), columns=['PtSSN'])\n",
+    "abdpain_merged_labs = pd.DataFrame(abdpain_cohort_df.PtSSN.unique(), columns=['PtSSN'])\n",
+    "for lab in labs:\n",
+    "    dizzy_merged_labs = pd.merge(dizzy_merged_labs, dizzy_lab_dict[lab], on='PtSSN')\n",
+    "    abdpain_merged_labs = pd.merge(abdpain_merged_labs, abdpain_lab_dict[lab], on='PtSSN')\n",
+    "print(dizzy_merged_labs.shape, abdpain_merged_labs.shape)\n",
+    "\n",
+    "\n",
+    "def check_nas(df, cols, thresh):\n",
+    "    \"\"\"Return the columns among `cols` that are more than `thresh` percent NA in `df`.\n",
+    "\n",
+    "    `thresh` is a percentage (0-100), not a fraction.\n",
+    "    \"\"\"\n",
+    "    n_rows = df.shape[0]\n",
+    "    drop_cols = []\n",
+    "    for col in cols:\n",
+    "        pct_na = df[col].isna().sum() / n_rows * 100\n",
+    "        if pct_na > thresh:\n",
+    "            drop_cols.append(col)\n",
+    "    return drop_cols\n",
+    "\n",
+    "\n",
+    "def clean_lab_df(merged_df, cols, thresh):\n",
+    "    \"\"\"Drop the columns among `cols` that are more than `thresh` percent NA.\n",
+    "\n",
+    "    Columns not listed in `cols` are always kept. Surviving columns keep their original\n",
+    "    order, so PtSSN stays first; the t-test cell slices this frame by position\n",
+    "    (columns[1:-1]) and relies on that. Prints the shape of the result with and\n",
+    "    without rows that contain any NA, then returns the reduced frame.\n",
+    "    \"\"\"\n",
+    "    drop_labs = set(check_nas(merged_df, cols, thresh))\n",
+    "    # Filter in original order; a set difference here would shuffle the columns arbitrarily.\n",
+    "    lab_retain = [col for col in merged_df.columns if col not in drop_labs]\n",
+    "    cleaned = merged_df[lab_retain]\n",
+    "    print(cleaned.shape, cleaned.dropna().shape)\n",
+    "    return cleaned\n",
+    "\n",
+    "\n",
+    "# drop cols with more than 10% NA (PtSSN, the first column, is never a drop candidate)\n",
+    "dizzy_clean_labs = clean_lab_df(dizzy_merged_labs, dizzy_merged_labs.columns[1:], 10)\n",
+    "abdpain_clean_labs = clean_lab_df(abdpain_merged_labs, abdpain_merged_labs.columns[1:], 10)\n",
+    "print(dizzy_clean_labs.shape, abdpain_clean_labs.shape)\n",
+    "print(dizzy_clean_labs.isna().sum())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "heard-marsh",
+   "metadata": {},
+   "source": [
+    "# History\n",
+    "- for dizzy, use Viral's ICD list\n",
+    "- for abdpain, use Adel's ICD list refined by Andy Z"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "confidential-saturday",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "icds = dataset['ICD']\n",
+    "dizzy_icds, abdpain_icds = separate_cohorts_icds(icds, dizzy_cohort_df, abdpain_cohort_df)\n",
+    "print(dizzy_icds.shape, abdpain_icds.shape)\n",
+    "\n",
+    "# get dizzy risk factors; the boolean version turns every non-PtSSN column into 0/1\n",
+    "dizzy_rf_df = get_dizzy_rf(dizzy_icds, dizzy_cohort_df)\n",
+    "dizzy_rf_flags = dizzy_rf_df.iloc[:, 1:].astype(bool).astype(int)\n",
+    "bool_dizzy_rf_df = pd.concat([dizzy_rf_df.PtSSN, dizzy_rf_flags], axis=1)\n",
+    "print(dizzy_rf_df.shape, bool_dizzy_rf_df.shape)\n",
+    "\n",
+    "# get abdpain risk factors, same treatment\n",
+    "abdpain_rf_df = get_abdpain_rf(abdpain_icds, abdpain_cohort_df)\n",
+    "abdpain_rf_flags = abdpain_rf_df.iloc[:, 1:].astype(bool).astype(int)\n",
+    "bool_abdpain_rf_df = pd.concat([abdpain_rf_df.PtSSN, abdpain_rf_flags], axis=1)\n",
+    "print(abdpain_rf_df.shape, bool_abdpain_rf_df.shape)\n",
+    "\n",
+    "# add a column which is the number of risk factors (row-wise sum of the 0/1 flags)\n",
+    "bool_dizzy_rf_df['total_rf'] = bool_dizzy_rf_df.iloc[:, 1:].sum(axis=1)\n",
+    "bool_abdpain_rf_df['total_rf'] = bool_abdpain_rf_df.iloc[:, 1:].sum(axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "frequent-schedule",
+   "metadata": {},
+   "source": [
+    "# Get labeled data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "tutorial-enclosure",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dizzy_df, dizzy_labels_df = retrieveLabels_dizzy(dizzy_fname)\n",
+    "abdpain_df, abdpain_labels_df = retrieveLabels_abdpain(abdpain_fname)\n",
+    "print(dizzy_labels_df.shape, abdpain_labels_df.shape)\n",
+    "\n",
+    "print(dizzy_labels_df.label.value_counts())\n",
+    "print(abdpain_labels_df.label.value_counts())\n",
+    "\n",
+    "# PPV here = fraction of labeled records whose label is 'MOD'.\n",
+    "dizzy_mod_rows = dizzy_labels_df[dizzy_labels_df.label == 'MOD']\n",
+    "dizzy_ppv = dizzy_mod_rows.shape[0] / dizzy_labels_df.shape[0]\n",
+    "print('\\nPPV for dizzy = ', dizzy_ppv)\n",
+    "\n",
+    "abdpain_mod_rows = abdpain_labels_df[abdpain_labels_df.label == 'MOD']\n",
+    "abdpain_ppv = np.round(abdpain_mod_rows.shape[0] / abdpain_labels_df.shape[0], 3)\n",
+    "print('PPV for abdpain = ', abdpain_ppv)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "caring-accuracy",
+   "metadata": {},
+   "source": [
+    "# Data for Table 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "stuffed-newman",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dizzy_cohort_df_labeled = pd.merge(dizzy_cohort_df, dizzy_labels_df, on='PtSSN')\n",
+    "dizzy_demo_coded_labeled = pd.merge(dizzy_demo_coded, dizzy_labels_df, on='PtSSN')\n",
+    "print(dizzy_demo_coded_labeled.columns)\n",
+    "\n",
+    "# Table 1 only uses patients labeled MOD or NoMOD.\n",
+    "is_mod_or_nomod = dizzy_demo_coded_labeled.label.isin(['MOD', 'NoMOD'])\n",
+    "dizzy_subset = dizzy_demo_coded_labeled[is_mod_or_nomod]\n",
+    "\n",
+    "# age analysis: overall, then per label\n",
+    "age_stats = [\"mean\", \"std\"]\n",
+    "display(dizzy_subset['age_at_index_visit'].describe().T.loc[age_stats])\n",
+    "display(dizzy_subset.groupby('label')['age_at_index_visit'].describe().T.loc[age_stats, :])\n",
+    "\n",
+    "# gender analysis\n",
+    "display(dizzy_subset.groupby('label')['Gender'].value_counts())\n",
+    "\n",
+    "from scipy.stats import fisher_exact\n",
+    "# from scipy.stats.contingency import crosstab\n",
+    "\n",
+    "# Test every column between PtSSN (first) and label (last); one field type per column.\n",
+    "table1_fields = dizzy_subset.columns[1:-1]\n",
+    "table1_ftypes = ['c', 'd', 'd', 'd', 'd', 'd', 'd', 'd']\n",
+    "ttest_fields(dizzy_subset, table1_fields, table1_ftypes, show=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "touched-morris",
+   "metadata": {},
+   "source": [
+    "# Labeled versions of all dataframes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "textile-separation",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Every merge below is an inner join on PtSSN (pd.merge default), so each *_labeled frame\n",
+    "# contains only the patients that appear in the corresponding label table.\n",
+    "\n",
+    "# labeled versions of the cohort datasets\n",
+    "dizzy_cohort_df_labeled = pd.merge(dizzy_cohort_df, dizzy_labels_df, on='PtSSN')\n",
+    "abdpain_cohort_df_labeled = pd.merge(abdpain_cohort_df, abdpain_labels_df, on='PtSSN')\n",
+    "print('Cohort:', dizzy_cohort_df_labeled.shape, abdpain_cohort_df_labeled.shape)\n",
+    "\n",
+    "# labeled versions of demog datasets\n",
+    "dizzy_demo_coded_labeled = pd.merge(dizzy_demo_coded, dizzy_labels_df, on='PtSSN')\n",
+    "abdpain_demo_coded_labeled = pd.merge(abdpain_demo_coded, abdpain_labels_df, on='PtSSN')\n",
+    "print('Demo:', dizzy_demo_coded_labeled.shape, abdpain_demo_coded_labeled.shape)\n",
+    "\n",
+    "# get labeled version of ED vitals\n",
+    "dizzy_EDvitals_labeled = pd.merge(dizzy_EDvitals_df, dizzy_labels_df, on='PtSSN')\n",
+    "abdpain_EDvitals_labeled = pd.merge(abdpain_EDvitals_df, abdpain_labels_df, on='PtSSN')\n",
+    "print('EDVitals:', dizzy_EDvitals_labeled.shape, abdpain_EDvitals_labeled.shape)\n",
+    "\n",
+    "# get labeled versions of hosp vitals\n",
+    "dizzy_hosp_vitals_labeled = pd.merge(dizzy_hosp_vitals_df, dizzy_labels_df, on='PtSSN')\n",
+    "abdpain_hosp_vitals_labeled = pd.merge(abdpain_hosp_vitals_df, abdpain_labels_df, on='PtSSN')\n",
+    "print('Hosp vitals:', dizzy_hosp_vitals_labeled.shape, abdpain_hosp_vitals_labeled.shape)\n",
+    "\n",
+    "# get labeled versions of consults; missing values are filled with 0 after the merge\n",
+    "dizzy_topN_consult_counts_labeled = pd.merge(dizzy_topN_consult_counts, dizzy_labels_df, on='PtSSN').fillna(0)\n",
+    "abdpain_topN_consult_counts_labeled = pd.merge(abdpain_topN_consult_counts, abdpain_labels_df, on='PtSSN').fillna(0)\n",
+    "print('Consults: ', dizzy_topN_consult_counts_labeled.shape, abdpain_topN_consult_counts_labeled.shape)\n",
+    "\n",
+    "# get labeled version of images\n",
+    "dizzy_images_labeled = pd.merge(dizzy_images, dizzy_labels_df, on='PtSSN')\n",
+    "abdpain_images_labeled = pd.merge(abdpain_images, abdpain_labels_df, on='PtSSN')\n",
+    "print('Imaging:', dizzy_images_labeled.shape, abdpain_images_labeled.shape)\n",
+    "\n",
+    "# get labeled versions of risk factors (raw counts, then the 0/1 boolean version)\n",
+    "dizzy_rf_df_labeled = pd.merge(dizzy_rf_df, dizzy_labels_df, on='PtSSN')\n",
+    "abdpain_rf_df_labeled = pd.merge(abdpain_rf_df, abdpain_labels_df, on='PtSSN')\n",
+    "print('Risk Factors:', dizzy_rf_df_labeled.shape, abdpain_rf_df_labeled.shape)\n",
+    "bool_dizzy_rf_df_labeled = pd.merge(bool_dizzy_rf_df, dizzy_labels_df, on='PtSSN')\n",
+    "bool_abdpain_rf_df_labeled = pd.merge(bool_abdpain_rf_df, abdpain_labels_df, on='PtSSN')\n",
+    "# Report the boolean frames here (the original printed the non-boolean shapes a second time).\n",
+    "print('Boolean Risk Factors:', bool_dizzy_rf_df_labeled.shape, bool_abdpain_rf_df_labeled.shape)\n",
+    "\n",
+    "# get labeled versions of clean labs\n",
+    "dizzy_clean_labs_labeled = pd.merge(dizzy_clean_labs, dizzy_labels_df, on='PtSSN')\n",
+    "abdpain_clean_labs_labeled = pd.merge(abdpain_clean_labs, abdpain_labels_df, on='PtSSN')\n",
+    "print('Labs: ', dizzy_clean_labs_labeled.shape, abdpain_clean_labs_labeled.shape)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "informal-cleaning",
+   "metadata": {},
+   "source": [
+    "# Check if any of the features  are useful for MOD prediction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "talented-technology",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run ttest_fields on every feature group. Each call receives a labeled frame, the field names\n",
+    "# to test and one field-type code per field ('c' / 'd' as understood by ttest_fields);\n",
+    "# the results are concatenated as lists at the end of the cell.\n",
+    "# Dizziness frames are restricted to MOD / NoMOD labels where the original did so.\n",
+    "\n",
+    "# cohort fields ttest\n",
+    "cohort_fields = ['ed_duration', 'ed_first_inp_delta', 'sum_hosp_stay', 'num_hosp', 'num_ED_visits']\n",
+    "cohort_ftypes = ['c', 'c', 'c', 'c', 'c']\n",
+    "dizzy_cohort_sig = ttest_fields(\n",
+    "    dizzy_cohort_df_labeled[dizzy_cohort_df_labeled.label.isin(['MOD', 'NoMOD'])],\n",
+    "    cohort_fields, cohort_ftypes)\n",
+    "print('Dizziness cohort:', dizzy_cohort_sig)\n",
+    "abdpain_cohort_sig = ttest_fields(abdpain_cohort_df_labeled, cohort_fields, cohort_ftypes)\n",
+    "print('Abdpain cohort:', abdpain_cohort_sig)\n",
+    "\n",
+    "# demo field ttest\n",
+    "demo_fields = dizzy_demo_coded.columns[1:]\n",
+    "demo_ftypes = ['c', 'd', 'd', 'd', 'd', 'd', 'd', 'd']\n",
+    "dizzy_demo_sig = ttest_fields(\n",
+    "    dizzy_demo_coded_labeled[dizzy_demo_coded_labeled.label.isin(['MOD', 'NoMOD'])],\n",
+    "    demo_fields, demo_ftypes, show=False)\n",
+    "print('Dizziness demo:', dizzy_demo_sig)\n",
+    "abdpain_demo_sig = ttest_fields(abdpain_demo_coded_labeled, demo_fields, demo_ftypes, show=False)\n",
+    "print('Abdpain demo:', abdpain_demo_sig)\n",
+    "\n",
+    "# ED Vitals ttest\n",
+    "# do a ttest with all fields with respect to MOD\n",
+    "ED_vitals_fields = dizzy_EDvitals_labeled.columns[3:-1]\n",
+    "ED_vitals_ftypes = len(ED_vitals_fields) * ['c']\n",
+    "dizzy_vitals_sig = ttest_fields(\n",
+    "    dizzy_EDvitals_labeled[dizzy_EDvitals_labeled.label.isin(['MOD', 'NoMOD'])],\n",
+    "    ED_vitals_fields, ED_vitals_ftypes)\n",
+    "print('Dizziness ED vitals:', dizzy_vitals_sig)\n",
+    "abdpain_vitals_sig = ttest_fields(abdpain_EDvitals_labeled, ED_vitals_fields, ED_vitals_ftypes)\n",
+    "print('Abdpain ED vitals:', abdpain_vitals_sig)\n",
+    "\n",
+    "# Hosp vitals ttest\n",
+    "# do a ttest with all fields with respect to MOD\n",
+    "hosp_vitals_fields = dizzy_hosp_vitals_df.columns[3:]\n",
+    "hosp_vitals_ftypes = len(hosp_vitals_fields) * ['c']\n",
+    "dizzy_hosp_vitals_sig = ttest_fields(\n",
+    "    dizzy_hosp_vitals_labeled[dizzy_hosp_vitals_labeled.label.isin(['MOD', 'NoMOD'])],\n",
+    "    hosp_vitals_fields, hosp_vitals_ftypes)\n",
+    "print('Dizziness hosp vitals:', dizzy_hosp_vitals_sig)\n",
+    "abdpain_hosp_vitals_sig = ttest_fields(abdpain_hosp_vitals_labeled, hosp_vitals_fields, hosp_vitals_ftypes)\n",
+    "print('Abdpain hosp vitals:', abdpain_hosp_vitals_sig)\n",
+    "\n",
+    "# Consult ttest\n",
+    "dizzy_ccounts_fields = dizzy_topN_consult_counts_labeled.columns[1:-1]\n",
+    "# Derived from the field count (as for abdpain) so it follows the top-N chosen earlier.\n",
+    "dizzy_ccounts_ftypes = len(dizzy_ccounts_fields) * ['d']\n",
+    "dizzy_consults_sig = ttest_fields(dizzy_topN_consult_counts_labeled, dizzy_ccounts_fields, dizzy_ccounts_ftypes)\n",
+    "print('Dizziness consults: ', dizzy_consults_sig)\n",
+    "abdpain_ccounts_fields = abdpain_topN_consult_counts_labeled.columns[1:-1]\n",
+    "abdpain_ccounts_ftypes = len(abdpain_ccounts_fields) * ['d']\n",
+    "abdpain_consults_sig = ttest_fields(abdpain_topN_consult_counts_labeled, abdpain_ccounts_fields, abdpain_ccounts_ftypes)\n",
+    "print('Abdpain consults: ', abdpain_consults_sig)\n",
+    "\n",
+    "# Imaging ttest\n",
+    "dizzy_images_fields = dizzy_images_labeled.columns[2:-1]\n",
+    "dizzy_images_ftypes = len(dizzy_images_fields) * ['c']\n",
+    "dizzy_images_sig = ttest_fields(\n",
+    "    dizzy_images_labeled[dizzy_images_labeled.label.isin(['MOD', 'NoMOD'])],\n",
+    "    dizzy_images_fields, dizzy_images_ftypes)\n",
+    "print('Dizziness imaging: ', dizzy_images_sig)\n",
+    "abdpain_images_fields = abdpain_images_labeled.columns[2:-1]\n",
+    "abdpain_images_ftypes = len(abdpain_images_fields) * ['c']\n",
+    "abdpain_images_sig = ttest_fields(abdpain_images_labeled, abdpain_images_fields, abdpain_images_ftypes)\n",
+    "print('Abdpain imaging: ', abdpain_images_sig)\n",
+    "\n",
+    "# ttest for risk factors\n",
+    "dizzy_rf_fields = bool_dizzy_rf_df.columns[1:]\n",
+    "dizzy_rf_sig = ttest_fields(bool_dizzy_rf_df_labeled, dizzy_rf_fields, len(dizzy_rf_fields) * ['d'], show=False)\n",
+    "print('Dizzy RF: ', dizzy_rf_sig)\n",
+    "abdpain_rf_fields = bool_abdpain_rf_df.columns[1:]\n",
+    "abdpain_rf_sig = ttest_fields(bool_abdpain_rf_df_labeled, abdpain_rf_fields, len(abdpain_rf_fields) * ['d'], show=False)\n",
+    "print('Abdpain RF: ', abdpain_rf_sig)\n",
+    "\n",
+    "# ttest for labs\n",
+    "# The type list is sized from the field list itself; it used to be one element longer\n",
+    "# (columns[1:] vs columns[1:-1]).\n",
+    "# NOTE(review): lab columns are passed as 'd' although the other measurement groups use 'c' - confirm.\n",
+    "dizzy_labs_fields = dizzy_clean_labs_labeled.columns[1:-1]\n",
+    "dizzy_labs_sig = ttest_fields(dizzy_clean_labs_labeled, dizzy_labs_fields,\n",
+    "                              len(dizzy_labs_fields) * ['d'], show=False)\n",
+    "abdpain_labs_fields = abdpain_clean_labs_labeled.columns[1:-1]\n",
+    "abdpain_labs_sig = ttest_fields(abdpain_clean_labs_labeled, abdpain_labs_fields,\n",
+    "                                len(abdpain_labs_fields) * ['d'], show=False)\n",
+    "\n",
+    "print('Dizzy Lab: ', dizzy_labs_sig)\n",
+    "print('Abdpain Lab: ', abdpain_labs_sig)\n",
+    "\n",
+    "# Parenthesised so the expression can span two lines; the bare indented continuation used\n",
+    "# before was an IndentationError that stopped the whole cell from running.\n",
+    "dizzy_good_cols = (\n",
+    "    dizzy_cohort_sig + dizzy_demo_sig + dizzy_vitals_sig + dizzy_hosp_vitals_sig\n",
+    "    + dizzy_consults_sig + dizzy_images_sig + dizzy_rf_sig + dizzy_labs_sig\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "mexican-friend",
+   "metadata": {},
+   "source": [
+    "# Merge different dataframes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "invisible-fundamentals",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# start with cohort, then demo, then vitals, then consults, imaging, labs, history\n",
+    "\n",
+    "dizzy_cohort_cols = [\n",
+    "    'TriggerType', 'PtSSN', 'EDStartDateTime', 'EDEndDateTime',\n",
+    "    'EDVisitReason', 'AdmitDateTime', 'DischargeDateTime', 'hosp_stay',\n",
+    "    'ed_duration', 'FirstAdmission', 'ed_first_inp_delta', 'num_ED_visits',\n",
+    "    'num_hosp', 'sum_hosp_stay',\n",
+    "]\n",
+    "\n",
+    "# Merges without `how` are inner joins; vitals and labs are left joins, so patients\n",
+    "# lacking those records are kept with NaNs.\n",
+    "dizzy_all = pd.DataFrame(dizzy_demo_coded.PtSSN, columns=['PtSSN'])\n",
+    "dizzy_all = dizzy_all.merge(dizzy_cohort_df[dizzy_cohort_cols], on='PtSSN')\n",
+    "dizzy_all = dizzy_all.merge(dizzy_demo_coded, on='PtSSN')\n",
+    "dizzy_all = dizzy_all.merge(dizzy_EDvitals_df, on=['PtSSN', 'EDStartDateTime'], how='left')\n",
+    "dizzy_all = dizzy_all.merge(dizzy_hosp_vitals_df, on=['PtSSN', 'AdmitDateTime'], how='left')\n",
+    "dizzy_all = dizzy_all.merge(dizzy_topN_consult_counts, on=['PtSSN'])\n",
+    "dizzy_all = dizzy_all.merge(dizzy_images, on=['PtSSN', 'EDStartDateTime'])\n",
+    "dizzy_all = dizzy_all.merge(dizzy_clean_labs, on=['PtSSN'], how='left')\n",
+    "dizzy_all = dizzy_all.merge(bool_dizzy_rf_df, on='PtSSN')\n",
+    "print(dizzy_all.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "incorporate-hunger",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# handle missing values\n",
+    "# List every column of dizzy_all that has at least one NA, printing its NA count.\n",
+    "\n",
+    "cols_with_missing = []\n",
+    "for col, missing in dizzy_all.isna().sum().items():\n",
+    "    if missing > 0:\n",
+    "        cols_with_missing.append(col)\n",
+    "        print(col, missing)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "victorian-electric",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# find a way to plot the labeled data in the combined dataframe\n",
+    "# (this cell now comes before the one that inspects combo_label_df.columns, so the\n",
+    "#  notebook runs top to bottom on a fresh kernel)\n",
+    "\n",
+    "# Left join keeps unlabeled patients; their missing label becomes 'unknown'.\n",
+    "combo_label_df = pd.merge(dizzy_all,dizzy_labels_df,on='PtSSN',how='left')\n",
+    "combo_label_df.loc[combo_label_df.label.isna(),'label'] = 'unknown'\n",
+    "# Feature columns in their original order (a set difference would order them arbitrarily,\n",
+    "# making the feature matrix differ from run to run).\n",
+    "useful_cols = [col for col in combo_label_df.columns[7:-1] if col != 'FirstAdmission']\n",
+    "# dropna() removes every row that has an NA in any column.\n",
+    "reduced_combo = combo_label_df[combo_label_df.label.isin(['MOD','NoMOD','unknown'])].dropna()\n",
+    "X = reduced_combo[useful_cols]\n",
+    "y = reduced_combo['label']\n",
+    "print(X.shape,y.shape)\n",
+    "label_list = list(reduced_combo.label.value_counts().index)\n",
+    "u = umap_plot_label(X,y,1,3,label_list,'upper left');\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "moved-blair",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "combo_label_df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "physical-mineral",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# analyze the resulting clusters\n",
+    "# (each call gets its own column selection, as before)\n",
+    "clust = cluster_umap(\n",
+    "    u, 5, reduced_combo[useful_cols + ['label']], 'upper left'\n",
+    ")\n",
+    "analyze_clusters(\n",
+    "    clust, reduced_combo[useful_cols + ['label']], useful_cols\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "organizational-resort",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hand-picked subset of feature columns, grouped by source table for readability.\n",
+    "imp_cols = [\n",
+    "    'ed_first_inp_delta', 'age_at_index_visit',\n",
+    "    'new_race_WHITE', 'new_race_BLACK OR AFRICAN AMERICAN',\n",
+    "    'glucose_count', 'glucose_min', 'glucose_max', 'glucose_abnormal_count',\n",
+    "    'CO2_count', 'hgb_abnormal_count',\n",
+    "    'Systolic_max', 'Systolic_first', 'Diastolic_max', 'Diastolic_first',\n",
+    "    'PULSE_min', 'PULSE_first',\n",
+    "    'HOSP_Systolic_max', 'HOSP_Diastolic_max', 'HOSP_PULSE_min', 'HOSP_PULSE_first',\n",
+    "    'ct_count', 'ct_abnormal_count',\n",
+    "    'Diabetes', 'Hypertension', 'Coronary artery disease (CAD)',\n",
+    "]\n",
+    "\n",
+    "X = reduced_combo[imp_cols]\n",
+    "y = reduced_combo['label']\n",
+    "label_list = list(reduced_combo.label.value_counts().index)\n",
+    "u = umap_plot_label(X, y, 1, 7, label_list, 'upper right');\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "hybrid-postage",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# analyze the resulting clusters\n",
+    "# (each call gets its own column selection, as before)\n",
+    "clust = cluster_umap(\n",
+    "    u, 3, reduced_combo[imp_cols + ['label']], 'upper right'\n",
+    ")\n",
+    "analyze_clusters(\n",
+    "    clust, reduced_combo[imp_cols + ['label']], imp_cols\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "photographic-crowd",
+   "metadata": {},
+   "source": [
+    "# Need red-flag features to recreate Paarth's classifier on our data\n",
+    "- headache\n",
+    "- diplopia\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "popular-africa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the notes table first: the cells below read `notes`, so it must be defined before them\n",
+    "# for the notebook to run top to bottom on a fresh kernel.\n",
+    "notes = dataset['withRole']\n",
+    "notes.PatientSSN = notes.PatientSSN.astype('int64')\n",
+    "notes.EntryDateTime = pd.to_datetime(notes.EntryDateTime)\n",
+    "# Keep only ED notes, renamed so they join to the label table on PtSSN.\n",
+    "ed_notes = notes[notes.TIUStandardTitle=='EMERGENCY DEPT NOTE'].copy()\n",
+    "ed_notes.rename(columns={'PatientSSN':'PtSSN'},inplace=True)\n",
+    "ed_notes.PtSSN = ed_notes.PtSSN.astype('int64')\n",
+    "\n",
+    "\n",
+    "ed_notes_mod = pd.merge(ed_notes,dizzy_labels_df,on='PtSSN')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "entertaining-catering",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "notes.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "equal-myanmar",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pick the first labeled ED-note patient as the example; the next cell prints that patient's notes.\n",
+    "ptssn = ed_notes_mod.iloc[0].PtSSN\n",
+    "#print(ed_notes_mod.ReportText.iloc[0])\n",
+    "display(dizzy_df[dizzy_df.PtSSN==ptssn].CaseSummaryER.values)\n",
+    "# NOTE(review): combo_df is not defined anywhere in the visible part of this notebook, so this\n",
+    "# line raises NameError unless it is defined elsewhere - possibly combo_label_df was intended.\n",
+    "# As a bare mid-cell expression it also displays nothing.\n",
+    "combo_df[combo_df.PtSSN==ptssn]\n",
+    "dizzy_cohort_df[dizzy_cohort_df.PtSSN==ptssn]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eleven-discipline",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Print every note for the example patient written between ED start and hospital discharge\n",
+    "# (both taken from the patient's first row in dizzy_cohort_df).\n",
+    "tmp = notes[notes.PatientSSN==ptssn]\n",
+    "start_time, end_time = dizzy_cohort_df[dizzy_cohort_df.PtSSN==ptssn].EDStartDateTime.values[0],dizzy_cohort_df[dizzy_cohort_df.PtSSN==ptssn].DischargeDateTime.values[0]\n",
+    "tmp1 = tmp[(tmp.EntryDateTime >= start_time) & (tmp.EntryDateTime <= end_time)]\n",
+    "for i in range(tmp1.shape[0]):\n",
+    "    print(i,'---------------------------------------------')\n",
+    "    print(tmp1.TIUStandardTitle.iloc[i])\n",
+    "    print(tmp1.ReportText.iloc[i])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "sudden-albuquerque",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# UMAP the lab dataframe for abdpain\n",
+    "\n",
+    "# Only rows with no missing lab values are used.\n",
+    "XX = abdpain_clean_labs.dropna()\n",
+    "# Select the feature columns with a list: pandas >= 2.0 raises TypeError when a DataFrame is\n",
+    "# indexed with a set, and a set would also put the columns in arbitrary order.\n",
+    "X = XX[[col for col in XX.columns if col != 'PtSSN']]\n",
+    "u = umap_plot_nolabel(X, 1, 7)\n",
+    "\n",
+    "# analyze the resulting clusters\n",
+    "clust = cluster_umap_nolabel(u, 3, X, 'upper left')\n",
+    "analyze_clusters_nolabel(clust, X, X.columns)\n",
+    "\n",
+    "# create a tapestry plot to visualize the clusters according to median values\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "veterinary-walnut",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# start merging all dataframes and UMAP them for dizzy and abdpain\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "covered-nashville",
+   "metadata": {},
+   "source": [
+    "# Make predictive model for dizzy with all the ttest relevant fields"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "broad-child",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Per-source frames restricted to PtSSN plus the columns the t-tests flagged.\n",
+    "dizzy_cohort_tmp = dizzy_cohort_df_labeled[['PtSSN'] + dizzy_cohort_sig]\n",
+    "dizzy_demo_tmp = dizzy_demo_coded_labeled[['PtSSN'] + dizzy_demo_sig]\n",
+    "dizzy_vitals_tmp = dizzy_EDvitals_labeled[['PtSSN'] + dizzy_vitals_sig]\n",
+    "dizzy_hosp_vitals_tmp = dizzy_hosp_vitals_labeled[['PtSSN'] + dizzy_hosp_vitals_sig]\n",
+    "#dizzy_consults_tmp = dizzy_topN_consult_counts_labeled[['PtSSN'] + dizzy_consults_sig]\n",
+    "\n",
+    "# NOTE(review): dizzy_images_xr_count, dizzy_images_ct_count, dizzy_images_ct_abnormal_count,\n",
+    "# rel_dizzy_labs and dizzy_rf_df_all_labeled are not defined in the visible part of this\n",
+    "# notebook - confirm where they come from before relying on a fresh-kernel run.\n",
+    "\n",
+    "# images\n",
+    "mod_nomod_labels = dizzy_labels_df[dizzy_labels_df.label.isin(['MOD', 'NoMOD'])]\n",
+    "dizzy_images_xr_count_labeled = pd.merge(dizzy_images_xr_count, mod_nomod_labels, on='PtSSN')\n",
+    "dizzy_images_ct_count_labeled = pd.merge(dizzy_images_ct_count, mod_nomod_labels, on='PtSSN')\n",
+    "dizzy_images_ct_abnormal_count_labeled = pd.merge(dizzy_images_ct_abnormal_count, mod_nomod_labels, on='PtSSN')\n",
+    "\n",
+    "# convert image count fields into int\n",
+    "dizzy_images_xr_count_labeled.xr_count = dizzy_images_xr_count_labeled.xr_count.astype(int)\n",
+    "dizzy_images_ct_count_labeled.ct_count = dizzy_images_ct_count_labeled.ct_count.astype(int)\n",
+    "dizzy_images_ct_abnormal_count_labeled.ct_abnormal_count = dizzy_images_ct_abnormal_count_labeled.ct_abnormal_count.astype(int)\n",
+    "\n",
+    "# one row per (patient, count) pair\n",
+    "dizzy_images_xr_count_tmp = dizzy_images_xr_count_labeled[['PtSSN', 'xr_count']].drop_duplicates()\n",
+    "dizzy_images_ct_count_tmp = dizzy_images_ct_count_labeled[['PtSSN', 'ct_count']].drop_duplicates()\n",
+    "dizzy_images_ct_abnormal_count_tmp = dizzy_images_ct_abnormal_count_labeled[['PtSSN', 'ct_abnormal_count']].drop_duplicates()\n",
+    "\n",
+    "\n",
+    "# labs is rel_dizzy_labs\n",
+    "# risk factors is dizzy_rf_df_all_labeled\n",
+    "\n",
+    "# merge them all (inner joins on PtSSN, starting from the MOD / NoMOD labels)\n",
+    "\n",
+    "labels_tmp = mod_nomod_labels[['PtSSN', 'label']]\n",
+    "dizzy_all = labels_tmp.merge(dizzy_cohort_tmp, on='PtSSN')\n",
+    "dizzy_all = dizzy_all.merge(dizzy_demo_tmp, on='PtSSN')\n",
+    "dizzy_all = dizzy_all.merge(dizzy_vitals_tmp, on='PtSSN')\n",
+    "dizzy_all = dizzy_all.merge(dizzy_hosp_vitals_tmp, on='PtSSN')\n",
+    "# NOTE(review): dizzy_consults_tmp's definition is commented out above, so this merge only\n",
+    "# works if the name survives from an earlier kernel session - confirm whether consults belong here.\n",
+    "dizzy_all = dizzy_all.merge(dizzy_consults_tmp, on='PtSSN')\n",
+    "dizzy_all = dizzy_all.merge(rel_dizzy_labs, on='PtSSN')\n",
+    "dizzy_all = dizzy_all.merge(dizzy_images_xr_count_tmp, on='PtSSN')\n",
+    "dizzy_all = dizzy_all.merge(dizzy_images_ct_count_tmp, on='PtSSN')\n",
+    "dizzy_all = dizzy_all.merge(dizzy_images_ct_abnormal_count_tmp, on='PtSSN')\n",
+    "dizzy_all = dizzy_all.merge(dizzy_rf_df_all_labeled[['PtSSN', 'Hx aneurysm']], on=['PtSSN'])\n",
+    "print(dizzy_all.shape)\n",
+    "\n",
+    "# NOTE(review): all_dizzy_all is built in the next cell, so this comparison needs that cell to have run.\n",
+    "set(dizzy_all.columns).difference(set(all_dizzy_all.columns))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "authentic-afghanistan",
+   "metadata": {},
+   "source": [
+    "# Make feature dataframe for the entire data set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "light-reference",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Feature table for the whole dizzy cohort (labeled and unlabeled records).\n",
+    "# Each source frame is reduced to PtSSN plus the columns named in its *_sig list.\n",
+    "all_dizzy_cohort_tmp = dizzy_cohort_df[['PtSSN'] + dizzy_cohort_sig]\n",
+    "all_dizzy_demo_tmp = dizzy_demo_coded[['PtSSN'] + dizzy_demo_sig]\n",
+    "all_dizzy_vitals_tmp = dizzy_EDvitals_df[['PtSSN'] + dizzy_vitals_sig]\n",
+    "all_dizzy_hosp_vitals_tmp = dizzy_hosp_vitals_df[['PtSSN'] + dizzy_hosp_vitals_sig]\n",
+    "# consult counts (dizzy_topN_consult_counts / dizzy_consults_sig) are not merged into this table\n",
+    "\n",
+    "# cast the image count columns to int, in place on the source frames\n",
+    "image_count_sources = [\n",
+    "    (dizzy_images_xr_count, 'xr_count'),\n",
+    "    (dizzy_images_ct_count, 'ct_count'),\n",
+    "    (dizzy_images_ct_abnormal_count, 'ct_abnormal_count'),\n",
+    "]\n",
+    "for image_df, count_col in image_count_sources:\n",
+    "    image_df[count_col] = image_df[count_col].astype(int)\n",
+    "\n",
+    "all_dizzy_images_xr_count_tmp = dizzy_images_xr_count[['PtSSN','xr_count']].drop_duplicates()\n",
+    "all_dizzy_images_ct_count_tmp = dizzy_images_ct_count[['PtSSN','ct_count']].drop_duplicates()\n",
+    "all_dizzy_images_ct_abnormal_count_tmp = dizzy_images_ct_abnormal_count[['PtSSN','ct_abnormal_count']].drop_duplicates()\n",
+    "\n",
+    "# labs come from rel_dizzy_labs; the 'Hx aneurysm' column comes from dizzy_rf_df_all\n",
+    "# inner-join everything on PtSSN, starting from the cohort frame, in this order\n",
+    "feature_frames = [\n",
+    "    all_dizzy_demo_tmp,\n",
+    "    all_dizzy_vitals_tmp,\n",
+    "    all_dizzy_hosp_vitals_tmp,\n",
+    "    rel_dizzy_labs,\n",
+    "    all_dizzy_images_xr_count_tmp,\n",
+    "    all_dizzy_images_ct_count_tmp,\n",
+    "    all_dizzy_images_ct_abnormal_count_tmp,\n",
+    "    dizzy_rf_df_all[['PtSSN','Hx aneurysm']],\n",
+    "]\n",
+    "all_dizzy_all = all_dizzy_cohort_tmp\n",
+    "for feature_df in feature_frames:\n",
+    "    all_dizzy_all = pd.merge(all_dizzy_all, feature_df, on='PtSSN')\n",
+    "\n",
+    "print(all_dizzy_all.shape)\n",
+    "print(all_dizzy_all.columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "suspended-afternoon",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only the MOD / NoMOD records, default the count columns to 0,\n",
+    "# then list which of the remaining columns still contain NaNs.\n",
+    "\n",
+    "limit_dizzy_all = dizzy_all[dizzy_all.label.isin(['MOD','NoMOD'])].copy()\n",
+    "\n",
+    "abcols = ['WBC_abnormal_count','glucose_count','glucose_abnormal_count','albumin_abnormal_count','CO2_count',\n",
+    "        'alkphos_abnormal_count','hgb_abnormal_count']\n",
+    "limit_dizzy_all[abcols] = limit_dizzy_all[abcols].fillna(0)\n",
+    "\n",
+    "# NaN count per column, skipping the first two columns\n",
+    "na_counts = limit_dizzy_all.iloc[:,2:].isna().sum()\n",
+    "for col, v in na_counts[na_counts > 0].items():\n",
+    "    print(col, v)\n",
+    "\n",
+    "print(limit_dizzy_all.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "needed-prairie",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# class-specific medians (MOD vs NoMOD) of glucose_min and glucose_max\n",
+    "\n",
+    "glucose_cols = ['glucose_min','glucose_max']\n",
+    "for gcol in glucose_cols:\n",
+    "    # the '50%' entry of describe() is the median within each label group\n",
+    "    class_medians = limit_dizzy_all.groupby('label')[gcol].describe()['50%']\n",
+    "    mod_val = class_medians.loc['MOD']\n",
+    "    nomod_val = class_medians.loc['NoMOD']\n",
+    "    print(mod_val, nomod_val)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "mediterranean-bennett",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Correlation heatmap of the feature columns; this cell only plots, it does not drop anything.\n",
+    "# (Intended use per the original note: spot highly correlated columns before logistic regression.)\n",
+    "fig, ax = plt.subplots(figsize=(10,10))\n",
+    "feature_corr = limit_dizzy_all.iloc[:,2:].corr()\n",
+    "sns.heatmap(feature_corr, annot=False, ax=ax)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "forced-karma",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Complete-case training set: drop every row that still has a NaN, then tune the model.\n",
+    "tmp = limit_dizzy_all.dropna()\n",
+    "print(tmp.shape)\n",
+    "\n",
+    "# features are everything after the first two columns; target is 1 for MOD, 0 otherwise\n",
+    "X = tmp.iloc[:,2:]\n",
+    "yy = tmp.label\n",
+    "y = np.where(yy == 'MOD', 1, 0)\n",
+    "print(X.shape, y.shape)\n",
+    "\n",
+    "clf = tune_model(X, y)\n",
+    "best_C = clf.best_params_['C']\n",
+    "print('Best C = ',best_C)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "internal-costa",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "#clf,select_feats = build_L1_model(X,y,clf.best_params_['C'])\n",
+    "clf,select_feats = build_L1_model(X,y,0.0045)\n",
+    "visualize_model(clf,select_feats)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "funded-estimate",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# umap it all\n",
+    "u = umap_plot_label(X,yy,1,7,['NoMOD','MOD'],'upper right')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dental-camcorder",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "c = cluster_umap(u,4,tmp,'upper right')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ranging-fraud",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#stat_cols = featimp.iloc[:5].index\n",
+    "analyze_clusters(c,tmp,select_feats)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ordered-bathroom",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# find those rows in limit_dizzy_all that have nulls\n",
+    "null_val_rows = limit_dizzy_all[limit_dizzy_all.isnull().any(axis=1)].copy()\n",
+    "null_val_rows.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "hired-forum",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print column name, MOD median and NoMOD median for the two glucose columns\n",
+    "for gcol in ['glucose_min','glucose_max']:\n",
+    "    class_medians = limit_dizzy_all.groupby('label')[gcol].describe()['50%']\n",
+    "    mod_val = class_medians.loc['MOD']\n",
+    "    nomod_val = class_medians.loc['NoMOD']\n",
+    "    print(gcol, mod_val, nomod_val)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "buried-symposium",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Impute glucose_min / glucose_max in null_val_rows with one fixed value per class.\n",
+    "# The constants are presumably the class medians printed by the preceding cell -- TODO confirm\n",
+    "# they still match if the data is refreshed.\n",
+    "glucose_fill = {\n",
+    "    'MOD':   {'glucose_min': 156, 'glucose_max': 171},\n",
+    "    'NoMOD': {'glucose_min': 126, 'glucose_max': 126},\n",
+    "}\n",
+    "for cls, fills in glucose_fill.items():\n",
+    "    for gcol, fill_val in fills.items():\n",
+    "        # Only touch entries that are actually missing: null_val_rows holds every row with a null in\n",
+    "        # ANY column, so a blanket assignment would overwrite measured glucose values as well.\n",
+    "        to_fill = (null_val_rows.label == cls) & null_val_rows[gcol].isna()\n",
+    "        null_val_rows.loc[to_fill, gcol] = fill_val"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "revised-category",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# predict on null_val_rows\n",
+    "print(clf.predict(null_val_rows.iloc[:,2:]))\n",
+    "print(null_val_rows['label'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "studied-tiffany",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prepare the full dizzy cohort table for prediction and compare predictions with known labels.\n",
+    "print(all_dizzy_all.shape)\n",
+    "\n",
+    "# make all count columns default to 0 (done in place on all_dizzy_all)\n",
+    "abcols = ['WBC_abnormal_count','glucose_count','glucose_abnormal_count','albumin_abnormal_count','CO2_count',\n",
+    "        'alkphos_abnormal_count','hgb_abnormal_count']\n",
+    "for abcol in abcols:\n",
+    "    all_dizzy_all[abcol] = all_dizzy_all[abcol].fillna(0)\n",
+    "\n",
+    "# report the columns (after PtSSN) that still contain NaNs\n",
+    "for col in all_dizzy_all.columns[1:]:\n",
+    "    v = all_dizzy_all[col].isna().sum()\n",
+    "    if v > 0:\n",
+    "        print(col,v )\n",
+    "\n",
+    "print(all_dizzy_all.shape)\n",
+    "\n",
+    "# find those rows in all_dizzy_all that have null\n",
+    "all_null_val_rows = all_dizzy_all[all_dizzy_all.isnull().any(axis=1)].copy()\n",
+    "print(all_null_val_rows.shape)\n",
+    "\n",
+    "# print the median of each glucose column over the whole table\n",
+    "for gcol in ['glucose_min','glucose_max']:\n",
+    "    vals = all_dizzy_all[gcol].describe()[['50%']]\n",
+    "    g_val = vals['50%']\n",
+    "\n",
+    "    print(gcol,g_val)\n",
+    "\n",
+    "# Fill missing glucose values with fixed numbers (presumably the medians printed above -- TODO confirm).\n",
+    "# fillna() is used so that only missing entries change; a plain assignment would also overwrite\n",
+    "# measured glucose values in rows that are here because a different column is null.\n",
+    "all_null_val_rows['glucose_min'] = all_null_val_rows['glucose_min'].fillna(134)\n",
+    "all_null_val_rows['glucose_max'] = all_null_val_rows['glucose_max'].fillna(135)\n",
+    "\n",
+    "# complete rows plus the imputed rows\n",
+    "tmp1 = pd.concat([all_dizzy_all.dropna(),all_null_val_rows])\n",
+    "print(tmp1.iloc[:,1:].shape)\n",
+    "for col in tmp1.columns[1:]:\n",
+    "    v = tmp1[col].isna().sum()\n",
+    "    if v > 0:\n",
+    "        print(col,v )\n",
+    "\n",
+    "# For each predicted class, show the distribution of the known labels among those patients.\n",
+    "print('Logistic regression prediction:')\n",
+    "ypred1 = clf.predict(tmp1.iloc[:,1:])\n",
+    "display(pd.merge(tmp1[ypred1==1].PtSSN,dizzy_labels_df,on='PtSSN')['label'].value_counts())\n",
+    "display(pd.merge(tmp1[ypred1==0].PtSSN,dizzy_labels_df,on='PtSSN')['label'].value_counts())\n",
+    "\n",
+    "# NOTE(review): in this part of the notebook rf is first created in a later cell\n",
+    "# (RandomForestClassifier). Unless it is also defined further up, this fails on a fresh\n",
+    "# Restart & Run All -- confirm and move this section below the cell that fits rf.\n",
+    "print('Random forest prediction:')\n",
+    "ypred1 = rf.predict(tmp1.iloc[:,1:])\n",
+    "display(pd.merge(tmp1[ypred1==1].PtSSN,dizzy_labels_df,on='PtSSN')['label'].value_counts())\n",
+    "display(pd.merge(tmp1[ypred1==0].PtSSN,dizzy_labels_df,on='PtSSN')['label'].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "sapphire-caribbean",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ypred1 =rf.predict(tmp1.iloc[:,1:])\n",
+    "display(pd.merge(tmp1[ypred1==1].PtSSN,dizzy_labels_df,on='PtSSN')['label'].value_counts())\n",
+    "display(pd.merge(tmp1[ypred1==0].PtSSN,dizzy_labels_df,on='PtSSN')['label'].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "lucky-stanford",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "\n",
+    "\n",
+    "dt = DecisionTreeClassifier(max_depth=7,criterion='entropy')\n",
+    "scores = cross_val_score(dt,tmp.iloc[:,2:],y)\n",
+    "print(np.mean(scores),np.std(scores))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "peaceful-fortune",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# train and test on all 75 labeled records\n",
+    "# (fit and evaluate on the same rows, so the matrix below is a resubstitution result)\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "y = np.where(tmp.label == 'MOD', 1, 0)\n",
+    "rf = RandomForestClassifier(max_depth=3, n_estimators=10)\n",
+    "rf.fit(tmp.iloc[:,2:], y)\n",
+    "ypred = rf.predict(tmp.iloc[:,2:])\n",
+    "metrics.confusion_matrix(y, ypred)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "official-letter",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rf.predict(null_val_rows.iloc[:,2:])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "smaller-hybrid",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import seaborn as sns\n",
+    "\n",
+    "# random-forest feature importances, largest first, labeled with the feature column names\n",
+    "featimp = pd.Series(rf.feature_importances_,index=tmp.columns[2:]).sort_values(ascending=False)\n",
+    "\n",
+    "plt.figure(figsize=(8,6))\n",
+    "# x / y must be passed by keyword: seaborn 0.12+ no longer accepts them positionally\n",
+    "myplot = sns.barplot(x=featimp.index,y=featimp.values)\n",
+    "myplot.set_xticklabels(myplot.get_xticklabels(),rotation = 90);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "working-suspension",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Stratified train/test split, then fit and evaluate a random forest on the held-out part.\n",
+    "# NOTE(review): the original comment said 80-20, but test_size=0.1 holds out 10% -- confirm which is intended.\n",
+    "# NOTE(review): no random_state is set on the split or the forest, so results vary from run to run.\n",
+    "Xtrain,Xtest,ytrain,ytest = train_test_split(tmp.iloc[:,2:],y,stratify=y,test_size=0.1)\n",
+    "print(Xtrain.shape,Xtest.shape,ytrain.shape,ytest.shape)\n",
+    "rf = RandomForestClassifier(n_estimators=7,max_depth=4)\n",
+    "rf.fit(Xtrain,ytrain)\n",
+    "ypred = rf.predict(Xtest)\n",
+    "# ROC AUC is a ranking metric: score it on the class-1 (MOD) probability, not on the hard 0/1 labels\n",
+    "yprob = rf.predict_proba(Xtest)[:,1]\n",
+    "print(metrics.confusion_matrix(ytest,ypred))\n",
+    "print('AUC = ',metrics.roc_auc_score(ytest,yprob))\n",
+    "print('F1 = ',metrics.f1_score(ytest,ypred))\n",
+    "print('Accuracy = ',metrics.accuracy_score(ytest,ypred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "modified-refund",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ypred_all = rf.predict(pd.concat([tmp.iloc[:,2:],null_val_rows.iloc[:,2:]]))\n",
+    "y_all = [1 if x =='MOD' else 0 for x in tmp.label] + [1 if x=='MOD' else 0 for x in null_val_rows.label]\n",
+    "metrics.confusion_matrix(y_all,ypred_all)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "corrected-patient",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import seaborn as sns\n",
+    "\n",
+    "# feature importances of the most recently fitted rf, largest first\n",
+    "featimp = pd.Series(rf.feature_importances_,index=tmp.columns[2:]).sort_values(ascending=False)\n",
+    "\n",
+    "plt.figure(figsize=(8,6))\n",
+    "# x / y must be passed by keyword: seaborn 0.12+ no longer accepts them positionally\n",
+    "myplot = sns.barplot(x=featimp.index,y=featimp.values)\n",
+    "myplot.set_xticklabels(myplot.get_xticklabels(),rotation = 90);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "occupational-shark",
+   "metadata": {},
+   "source": [
+    "# Classify unlabeled records in dizzy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "determined-agency",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tmp1.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dependent-hamburg",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# list every column of tmp1 (after the first) that still has missing values, with its NaN count\n",
+    "na_counts = tmp1.iloc[:,1:].isna().sum()\n",
+    "for col, v in na_counts[na_counts > 0].items():\n",
+    "    print(col, v)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "associate-friend",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOTE(review): tmp2 is assigned in the FOLLOWING cell (merge of tmp1 with dizzy_labels_df).\n",
+    "# Unless it is also defined further up, this line raises NameError on a fresh\n",
+    "# Restart & Run All -- confirm, and move this cell below that merge.\n",
+    "tmp2.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "detailed-layout",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# merge to create temp labels: keep every row of tmp1, attach a label where one exists\n",
+    "tmp2 = pd.merge(tmp1,dizzy_labels_df,on='PtSSN',how='left')\n",
+    "print(tmp2.shape)\n",
+    "\n",
+    "# number of labeled patients that have no row in tmp2 (previously computed but never shown)\n",
+    "print(len(set(dizzy_labels_df.PtSSN).difference(tmp2.PtSSN)))\n",
+    "\n",
+    "# rows without a label become 'unknown'; rows with any other label value are dropped\n",
+    "tmp2.label = tmp2.label.fillna('unknown')\n",
+    "tmp3 = tmp2[tmp2.label.isin(['MOD','NoMOD','unknown'])]\n",
+    "\n",
+    "# NOTE(review): iloc[:,1:-1] assumes PtSSN is the first column and label is the only (last)\n",
+    "# column added by the merge -- verify against the columns of dizzy_labels_df.\n",
+    "u = umap_plot_label(tmp3.iloc[:,1:-1],tmp3.label,4,7,['NoMOD','unknown','MOD'],'upper right');"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "accepted-globe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "c = cluster_umap(u,3,tmp3,'upper right')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "limiting-alarm",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "analyze_clusters(c,tmp3,list(featimp.index)[:10])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "musical-silicon",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot the predicted labels for the unknowns\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "flying-conservation",
+   "metadata": {},
+   "source": [
+    "# Assemble the abdpain_all dataframe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "drawn-bulletin",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Per-patient feature frames for the labeled abdominal-pain records.\n",
+    "# Each frame keeps PtSSN plus a slice of columns and is reduced to one row per PtSSN.\n",
+    "# NOTE(review): the positional slices ([1:-1], [2:-1]) presumably skip leading id columns and the\n",
+    "# trailing label column of each *_labeled frame -- confirm against the actual column order.\n",
+    "abdpain_cohort_tmp = abdpain_cohort_df_labeled[['PtSSN'] + cohort_fields].drop_duplicates(subset=['PtSSN'])\n",
+    "abdpain_demo_tmp = abdpain_demo_coded_labeled[['PtSSN'] + list(abdpain_demo_coded_labeled.columns[1:-1])].drop_duplicates(subset=['PtSSN'])\n",
+    "abdpain_vitals_tmp = abdpain_EDvitals_labeled[['PtSSN'] + list(abdpain_EDvitals_labeled.columns[2:-1])].drop_duplicates(subset=['PtSSN'])\n",
+    "abdpain_hosp_vitals_tmp = abdpain_hosp_vitals_labeled[['PtSSN'] + list(abdpain_hosp_vitals_labeled.columns[2:-1])].drop_duplicates(subset=['PtSSN'])\n",
+    "# NOTE(review): the column list below is taken from abdpain_topN_consult_counts (not the *_labeled\n",
+    "# frame being indexed), unlike the lines above -- confirm that [1:-1] selects the intended columns.\n",
+    "abdpain_consults_tmp = abdpain_topN_consult_counts_labeled[['PtSSN'] + list(abdpain_topN_consult_counts.columns[1:-1])].drop_duplicates(subset=['PtSSN'])\n",
+    "\n",
+    "# images: attach labels (inner join on PtSSN) and keep one row per PtSSN\n",
+    "abdpain_images_xr_count_labeled = pd.merge(abdpain_images_xr_count,abdpain_labels_df,on='PtSSN').drop_duplicates(subset=['PtSSN'])\n",
+    "abdpain_images_ct_count_labeled = pd.merge(abdpain_images_ct_count,abdpain_labels_df,on='PtSSN').drop_duplicates(subset=['PtSSN'])\n",
+    "abdpain_images_ct_abnormal_count_labeled = pd.merge(abdpain_images_ct_abnormal_count,abdpain_labels_df,on='PtSSN').drop_duplicates(subset=['PtSSN'])\n",
+    "\n",
+    "# convert image count fields into int\n",
+    "abdpain_images_xr_count_labeled.xr_count = abdpain_images_xr_count_labeled.xr_count.astype(int)\n",
+    "abdpain_images_ct_count_labeled.ct_count = abdpain_images_ct_count_labeled.ct_count.astype(int)\n",
+    "abdpain_images_ct_abnormal_count_labeled.ct_abnormal_count = abdpain_images_ct_abnormal_count_labeled.ct_abnormal_count.astype(int)\n",
+    "\n",
+    "# keep only PtSSN and the count column for the later merges\n",
+    "abdpain_images_xr_count_tmp = abdpain_images_xr_count_labeled[['PtSSN','xr_count']].drop_duplicates()\n",
+    "abdpain_images_ct_count_tmp = abdpain_images_ct_count_labeled[['PtSSN','ct_count']].drop_duplicates()\n",
+    "abdpain_images_ct_abnormal_count_tmp = abdpain_images_ct_abnormal_count_labeled[['PtSSN','ct_abnormal_count']].drop_duplicates()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "blond-transparency",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Assemble abdpain_all: start from the labels and join one feature frame at a time,\n",
+    "# printing the shape after every join so dropped rows are visible.\n",
+    "# labs come from rel_abdpain_labs, risk factors from abdpain_rf_df_all_labeled\n",
+    "\n",
+    "# (frame, join key(s), join type) in the order they are applied\n",
+    "merge_plan = [\n",
+    "    (abdpain_cohort_tmp, 'PtSSN', 'inner'),\n",
+    "    (abdpain_demo_tmp, 'PtSSN', 'inner'),\n",
+    "    (abdpain_consults_tmp, 'PtSSN', 'inner'),\n",
+    "    (rel_abdpain_labs, 'PtSSN', 'inner'),\n",
+    "    (abdpain_images_xr_count_tmp, 'PtSSN', 'inner'),\n",
+    "    (abdpain_images_ct_count_tmp, 'PtSSN', 'inner'),\n",
+    "    (abdpain_images_ct_abnormal_count_tmp, 'PtSSN', 'inner'),\n",
+    "    (abdpain_rf_df_all_labeled, ['PtSSN','label'], 'inner'),\n",
+    "    # vitals are left-joined: patients without vitals keep their row and get NaNs\n",
+    "    (abdpain_vitals_tmp, 'PtSSN', 'left'),\n",
+    "    (abdpain_hosp_vitals_tmp, 'PtSSN', 'left'),\n",
+    "]\n",
+    "\n",
+    "abdpain_all = abdpain_labels_df\n",
+    "for right_df, join_keys, join_kind in merge_plan:\n",
+    "    abdpain_all = pd.merge(abdpain_all, right_df, on=join_keys, how=join_kind)\n",
+    "    print(abdpain_all.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "decreased-hours",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# who are the missing SSNs in abdpain_EDvitals?\n",
+    "tmp1 = abdpain_EDvitals_labeled.groupby('PtSSN')['PtSSN'].agg('count')\n",
+    "missing = set(abdpain_cohort_df_labeled.PtSSN).difference(set(tmp1.index))\n",
+    "print(missing)\n",
+    "abdpain_cohort_df_labeled[abdpain_cohort_df_labeled.PtSSN.isin(missing)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "secure-fetish",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# who are the missing SSNs in abdpain_hosp_vitals?\n",
+    "tmp2 = abdpain_hosp_vitals_labeled.groupby('PtSSN')['PtSSN'].agg('count')\n",
+    "missing1 = set(abdpain_cohort_df_labeled.PtSSN).difference(set(tmp2.index))\n",
+    "print(missing1)\n",
+    "abdpain_cohort_df_labeled[abdpain_cohort_df_labeled.PtSSN.isin(missing1)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "considerable-manhattan",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abdpain_all.label.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "spare-plastic",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abdpain_labels_df.label.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "alpha-irish",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# list every column of abdpain_all (from the third onward) that has missing values, with its NaN count\n",
+    "na_counts = abdpain_all.iloc[:,2:].isna().sum()\n",
+    "for col, v in na_counts[na_counts > 0].items():\n",
+    "    print(col, v)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "exceptional-craps",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abdpain_all.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "sealed-standard",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# missing abnormal-result counts are set to zero\n",
+    "count_fields = ['bun_abnormal_count','lact_abnormal_count','amylase_abnormal_count']\n",
+    "abdpain_all[count_fields] = abdpain_all[count_fields].fillna(0)\n",
+    "\n",
+    "print(abdpain_all.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fitting-swiss",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abdpain_tmp_clean = abdpain_all.dropna(subset=['Systolic_count'])\n",
+    "missing = list(set(abdpain_all.PtSSN).difference(set(abdpain_tmp_clean.PtSSN)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "temporal-shopper",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Impute the ED vitals of the patients listed in `missing` with the median of their own class\n",
+    "# (MOD or NoMOD), computed from abdpain_tmp_clean. Both names come from the previous cell.\n",
+    "\n",
+    "vital_names = ['Systolic', 'Diastolic', 'PULSE', 'RESPIRATION', 'PAIN', 'TEMPERATURE']\n",
+    "vital_stats = ['_count', '_max', '_min', '_first']\n",
+    "ed_vitals_cols = [name + stat for name in vital_names for stat in vital_stats]\n",
+    "\n",
+    "# column -> {'MOD': median, 'NoMOD': median}\n",
+    "impval_dict = {}\n",
+    "for gcol in ed_vitals_cols:\n",
+    "    class_medians = abdpain_tmp_clean.groupby('label')[gcol].describe()['50%']\n",
+    "    mod_val = class_medians.loc['MOD']\n",
+    "    nomod_val = class_medians.loc['NoMOD']\n",
+    "    print(gcol,mod_val,nomod_val) \n",
+    "    impval_dict[gcol] = {'MOD': mod_val, 'NoMOD': nomod_val}\n",
+    "\n",
+    "for PtSSN in missing:\n",
+    "    # the patient's rows and label do not change while the vitals are written, so look them up once\n",
+    "    pt_rows = abdpain_all.PtSSN == PtSSN\n",
+    "    pt_label = abdpain_all[pt_rows].label.values[0]\n",
+    "    if pt_label in ('MOD', 'NoMOD'):\n",
+    "        for gcol in ed_vitals_cols:\n",
+    "            abdpain_all.loc[pt_rows, gcol] = impval_dict[gcol][pt_label]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "saved-revelation",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Impute the hospital vitals of patients whose HOSP_Systolic_count is missing,\n",
+    "# using the median of their own class (MOD or NoMOD) among the patients that have it.\n",
+    "abdpain_tmp_clean = abdpain_all.dropna(subset=['HOSP_Systolic_count'])\n",
+    "missing = list(set(abdpain_all.PtSSN).difference(set(abdpain_tmp_clean.PtSSN)))\n",
+    "\n",
+    "# same vitals as the ED list, with the HOSP_ prefix\n",
+    "hosp_vitals_cols = [f'HOSP_{name}' for name in ed_vitals_cols]\n",
+    "\n",
+    "# column -> {'MOD': median, 'NoMOD': median}\n",
+    "impval_dict = {}\n",
+    "for gcol in hosp_vitals_cols:\n",
+    "    class_medians = abdpain_tmp_clean.groupby('label')[gcol].describe()['50%']\n",
+    "    mod_val = class_medians.loc['MOD']\n",
+    "    nomod_val = class_medians.loc['NoMOD']\n",
+    "    print(gcol,mod_val,nomod_val) \n",
+    "    impval_dict[gcol] = {'MOD': mod_val, 'NoMOD': nomod_val}\n",
+    "\n",
+    "for PtSSN in missing:\n",
+    "    # the patient's rows and label do not change while the vitals are written, so look them up once\n",
+    "    pt_rows = abdpain_all.PtSSN == PtSSN\n",
+    "    pt_label = abdpain_all[pt_rows].label.values[0]\n",
+    "    if pt_label in ('MOD', 'NoMOD'):\n",
+    "        for gcol in hosp_vitals_cols:\n",
+    "            abdpain_all.loc[pt_rows, gcol] = impval_dict[gcol][pt_label]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "brave-detroit",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# after the vitals imputation: which columns of abdpain_all (from the third onward) still have NaNs?\n",
+    "na_counts = abdpain_all.iloc[:,2:].isna().sum()\n",
+    "for col, v in na_counts[na_counts > 0].items():\n",
+    "    print(col, v)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "contained-render",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fill missing lab values with the median of the patient's own class (MOD or NoMOD).\n",
+    "labval_cols = ['WBC_max','potassium_max','potassium_min','chloride_max','amylase_max']\n",
+    "\n",
+    "for col in labval_cols:\n",
+    "    # patients with no non-null value for this column\n",
+    "    abdpain_tmp_clean = abdpain_all.dropna(subset=[col])\n",
+    "    missing = list(set(abdpain_all.PtSSN).difference(set(abdpain_tmp_clean.PtSSN)))\n",
+    "\n",
+    "    class_medians = abdpain_tmp_clean.groupby('label')[col].describe()['50%']\n",
+    "    mod_val = class_medians.loc['MOD']\n",
+    "    nomod_val = class_medians.loc['NoMOD']\n",
+    "    print(col,mod_val,nomod_val) \n",
+    "    fill_for = {'MOD': mod_val, 'NoMOD': nomod_val}\n",
+    "\n",
+    "    for PtSSN in missing:\n",
+    "        pt_rows = abdpain_all.PtSSN == PtSSN\n",
+    "        pt_label = abdpain_all[pt_rows].label.values[0]\n",
+    "        if pt_label in fill_for:\n",
+    "            abdpain_all.loc[pt_rows, col] = fill_for[pt_label]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "grand-consumer",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# after the lab imputation: which columns of abdpain_all (from the third onward) still have NaNs?\n",
+    "na_counts = abdpain_all.iloc[:,2:].isna().sum()\n",
+    "for col, v in na_counts[na_counts > 0].items():\n",
+    "    print(col, v)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "verified-devices",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Predictor columns = every column except the id and the label.\n",
+    "# Built as a list in the frame's own column order: a set of strings has no guaranteed order\n",
+    "# across kernel sessions, which would shuffle the feature order from one run to the next.\n",
+    "pred_cols = [c for c in abdpain_all.columns if c not in ('PtSSN','label')]\n",
+    "\n",
+    "# id and label first, predictors after, so iloc[:,2:] selects exactly the predictors\n",
+    "abdpain_all_clean = abdpain_all[['PtSSN','label'] + pred_cols].copy()\n",
+    "abdpain_all_clean.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "wicked-peninsula",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# make all count and boolean fields integers\n",
+    "# The list literal below is evaluated and discarded (it is not assigned to a name); it appears to be\n",
+    "# a reference list of count/boolean columns. Only the columns named in cfields are actually cast.\n",
+    "# A comma was missing after 'GI_ENDOSCOPY', which silently glued it to 'HOSP_PAIN_count'.\n",
+    "['GI_ENDOSCOPY', 'HOSP_PAIN_count',\n",
+    "       'diverticulitis', 'HOSP_Systolic_count', 'GASTROENTEROLOGY',\n",
+    "       'appendicitis', 'HOSP_RESPIRATION_count',  'pancreatitis',\n",
+    "        'PAIN_count', \n",
+    "       'cholecystisis',  'RESPIRATION_count',\n",
+    "       'amylase_abnormal_count', \n",
+    "       'PULSE_count',\n",
+    "        'xr_count',\n",
+    "       'GENERAL SURGERY', 'HOSP_TEMPERATURE_count', \n",
+    "        'HOSP_PULSE_count', \n",
+    "       'ct_abnormal_count', \n",
+    "       'bun_abnormal_count', 'HOSP_Diastolic_count', 'Systolic_count',\n",
+    "       'IB',  'diverticulosis',\n",
+    "       'num_ED_visits', 'EKG', \n",
+    "       'TEMPERATURE_count', \n",
+    "       'num_hosp',\n",
+    "        'Diastolic_count', 'cirrhosis',\n",
+    "       'cholelithiasis', 'ct_count',\n",
+    "       'CHART CONSULT',  'lact_abnormal_count',\n",
+    "       'TEMPERATURE_min', 'TEMPERATURE_first', 'ed_duration']\n",
+    "\n",
+    "# columns that are cast to int\n",
+    "cfields = ['bun_abnormal_count','lact_abnormal_count','amylase_abnormal_count',\n",
+    "           'cirrhosis','crohns_uc','diverticulitis','appendicitis','gallbladder']\n",
+    "for col in cfields:\n",
+    "    abdpain_all_clean[col] = abdpain_all_clean[col].astype(int)\n",
+    "    \n",
+    "abdpain_all_clean.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "palestinian-development",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Complete-case training set for abdominal pain, then tune the model.\n",
+    "tmp = abdpain_all_clean.dropna()\n",
+    "# .loc needs a list-like of labels: indexing with a set is rejected by pandas 2.x\n",
+    "X = tmp.loc[:,list(pred_cols)]\n",
+    "yy = tmp.label\n",
+    "# target: 1 for MOD, 0 otherwise\n",
+    "y = np.array([1 if (x=='MOD') else 0 for x in yy])\n",
+    "print(X.shape,y.shape)\n",
+    "clf = tune_model(X,y,5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aerial-somalia",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#clf,select_feats = build_L1_model(X,y,clf.best_params_['C'])\n",
+    "clf,select_feats = build_L1_model(X,y,1)\n",
+    "visualize_model(clf,select_feats)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "armed-induction",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fit a small random forest on the complete-case rows and score it on those same rows\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "y = np.where(tmp.label == 'MOD', 1, 0)\n",
+    "rf = RandomForestClassifier(max_depth=5, n_estimators=4)\n",
+    "rf.fit(tmp.iloc[:,2:], y)\n",
+    "ypred = rf.predict(tmp.iloc[:,2:])\n",
+    "metrics.confusion_matrix(y, ypred)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "supposed-decision",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import seaborn as sns\n",
+    "\n",
+    "# random-forest feature importances, largest first; only the non-zero ones are plotted\n",
+    "featimp = pd.Series(rf.feature_importances_,index=tmp.columns[2:]).sort_values(ascending=False)\n",
+    "featimp_nz = featimp[featimp > 0]\n",
+    "\n",
+    "plt.figure(figsize=(8,6))\n",
+    "# x / y must be passed by keyword: seaborn 0.12+ no longer accepts them positionally\n",
+    "myplot = sns.barplot(x=featimp_nz.index,y=featimp_nz.values)\n",
+    "myplot.set_xticklabels(myplot.get_xticklabels(),rotation = 90);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "vanilla-american",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "good_feats = featimp[featimp >= np.mean(featimp)].index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "familiar-heavy",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# find those rows in abdpain_clean_all that are not in tmp\n",
+    "null_val_rows = abdpain_all_clean[abdpain_all_clean.isnull().any(axis=1)].copy()\n",
+    "null_val_rows"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "soviet-freeze",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# potassium_max, chloride_max and WBC_max in null_val_rows are filled with class-dependent medians\n",
+    "# (medians are computed per label from abdpain_all_clean; amylase columns are not handled here)\n",
+    "lab_fill_cols = ['potassium_max','chloride_max','WBC_max']\n",
+    "\n",
+    "# column -> {'MOD': median, 'NoMOD': median}\n",
+    "impval_dict = {}\n",
+    "for gcol in lab_fill_cols:\n",
+    "    vals = abdpain_all_clean.groupby('label')[gcol].describe()[['50%']].reset_index()\n",
+    "    mod_val = vals[vals.label=='MOD']['50%'].values[0]\n",
+    "    nomod_val = vals[vals.label=='NoMOD']['50%'].values[0]\n",
+    "    print(gcol,mod_val,nomod_val) \n",
+    "    impval_dict[gcol]={'MOD':mod_val,'NoMOD':nomod_val}\n",
+    "\n",
+    "for gcol in lab_fill_cols:\n",
+    "    for cls, fill_val in impval_dict[gcol].items():\n",
+    "        # Only fill entries that are actually missing: null_val_rows holds every row with a null in\n",
+    "        # ANY column, so a blanket assignment would overwrite measured lab values as well.\n",
+    "        to_fill = (null_val_rows.label == cls) & null_val_rows[gcol].isna()\n",
+    "        null_val_rows.loc[to_fill, gcol] = fill_val"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "combined-credits",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Binary ground truth for the imputed rows (1 = 'MOD', 0 = anything else), then the\n",
+    "# confusion matrix of the forest's predictions on those rows' columns from position 2 onward.\n",
+    "ytest = [int(lbl == 'MOD') for lbl in null_val_rows.label]\n",
+    "metrics.confusion_matrix(\n",
+    "    ytest,\n",
+    "    rf.predict(null_val_rows.iloc[:,2:]),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "located-steps",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Predict on tmp followed by the imputed rows (same row order as y_all below) and\n",
+    "# show the combined confusion matrix.\n",
+    "ypred_all = rf.predict(\n",
+    "    pd.concat([tmp.iloc[:,2:],null_val_rows.iloc[:,2:]])\n",
+    ")\n",
+    "y_all = [int(lbl == 'MOD') for lbl in list(tmp.label) + list(null_val_rows.label)]\n",
+    "metrics.confusion_matrix(y_all,ypred_all)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "southwest-calvin",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Summary scores over tmp plus the imputed rows.\n",
+    "# NOTE(review): metrics is presumably sklearn.metrics, and tmp looks like the data rf was\n",
+    "# fitted on, in which case these scores are optimistic -- confirm both.\n",
+    "# Column 1 of predict_proba is taken as the score of the positive class (label 1 in y_all).\n",
+    "ypred_all_prob = rf.predict_proba(pd.concat([tmp.iloc[:,2:],null_val_rows.iloc[:,2:]]))[:,1]\n",
+    "print('AUC = ',metrics.roc_auc_score(y_all,ypred_all_prob))\n",
+    "print('F1 = ',metrics.f1_score(y_all,ypred_all))\n",
+    "# AUPRC is summarised with average precision over the predicted probabilities; the previous\n",
+    "# code printed precision_score of the hard predictions under this label.\n",
+    "print('AUPRC = ', metrics.average_precision_score(y_all,ypred_all_prob))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "numeric-helmet",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# umap it all\n",
+    "# Calls the umap_plot_label helper on X with labels yy and keeps its return value in u,\n",
+    "# which the clustering cell consumes.\n",
+    "# NOTE(review): the meaning of the positional arguments 1 and 5 is not visible here;\n",
+    "# ['NoMOD','MOD'] and 'upper left' are presumably the legend entries and legend position -- confirm in the helper.\n",
+    "u = umap_plot_label(X,yy,1,5,['NoMOD','MOD'],'upper left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "optimum-chess",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pass the UMAP result u and the tmp frame to the cluster_umap helper; the result c is\n",
+    "# consumed by analyze_clusters.\n",
+    "# NOTE(review): 3 is presumably the number of clusters and 'upper right' the legend position -- confirm in the helper.\n",
+    "c = cluster_umap(u,3,tmp,'upper right')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fixed-temperature",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hand the first 25 entries of featimp (the 25 highest importances, given its descending sort)\n",
+    "# to analyze_clusters together with the cluster result c and the tmp frame.\n",
+    "stat_cols = featimp.head(25).index\n",
+    "analyze_clusters(c,tmp,stat_cols)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "significant-reservoir",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}