commit cdc3ee1
Showing 8 changed files with 24,587 additions and 0 deletions.
@@ -0,0 +1,40 @@
Machine Learning Nanodegree Program

Capstone Project Data

This text file explains how to obtain the data for the nanodegree capstone project. The project's goal is to predict how long a patient stays hospitalized, based on BMI (body mass index) and additional features drawn from previous medical studies of the "obesity paradox".

We will use the MIMIC-III database to extract the project dataset. MIMIC-III ('Medical Information Mart for Intensive Care') is a large, single-center database comprising information relating to patients admitted to critical care units at a large tertiary care hospital. Data includes vital signs, medications, laboratory measurements, observations and notes charted by care providers, fluid balance, procedure codes, diagnostic codes, imaging reports, hospital length of stay, survival data, and more. Because the database is not open source, it cannot be accessed or shared freely; to get access to the database's records, training must be completed and access requested. The database can be obtained at https://mimic.physionet.org/.

The features, and the tables they are drawn from, are shown below.

FEATURES

Body Mass Index: CHARTEVENTS table
Sub-set of medical conditions: DIAGNOSES_ICD table
Age: PATIENTS table
Sex: PATIENTS table

LABEL
Length of stay (discharge time minus admission time): ADMISSIONS table

NOTE: The body mass index has to be calculated from the height and weight values in the CHARTEVENTS table; the MIMIC-III database does not store it directly.
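A minimal sketch of both derived quantities, assuming weight in kilograms and height in centimetres from CHARTEVENTS, and datetime admission/discharge columns from ADMISSIONS (the DataFrame and column names here are illustrative, not project code):

    import pandas as pd

    # Illustrative rows; real values come from CHARTEVENTS and ADMISSIONS
    df = pd.DataFrame({
        "weight_kg": [80.0, 95.5],
        "height_cm": [175.0, 160.0],
        "admittime": pd.to_datetime(["2130-01-01", "2130-03-10"]),
        "dischtime": pd.to_datetime(["2130-01-09", "2130-03-14"]),
    })

    # BMI = weight (kg) / height (m)^2; height is stored in cm
    df["bmi"] = df["weight_kg"] / (df["height_cm"] / 100.0) ** 2

    # Label: hospital length of stay in days (discharge time minus admission time)
    df["length_of_stay"] = (df["dischtime"] - df["admittime"]).dt.days
    print(df[["bmi", "length_of_stay"]])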
INSTRUCTIONS TO CREATE THE DATASET

1. Install the Eclipse BIRT report designer (http://download.eclipse.org/birt/downloads/).
2. Gain access to the MIMIC-III database by following the steps at https://mimic.physionet.org/gettingstarted/access/.
3. Rebuild the database using the instructions at https://mimic.physionet.org/tutorials/install-mimic-locally-ubuntu/.
4. Once the database is up and running, find the file called eclipse-birt-mimiciii-dataset-report.zip in this package. It contains an Eclipse BIRT report design file that can be imported into an Eclipse BIRT installation to recreate the dataset used in this project. You will have to make one minor modification to the imported report: point its data source to your instance of the MIMIC-III database. After that, simply run it and it will create the dataset used in this project.
5. I replaced some records' values, changing 300 to 89, in Excel after exporting the BIRT report to a CSV file. You can do the same in BIRT before exporting the data, or afterwards with Python code (see the sketch below) - just don't forget to do it.
6. Finally, run the Jupyter notebook included in this package.
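For step 5, a minimal pandas sketch of the value replacement (the CSV file name is an assumption for illustration; adjust it, and restrict the replacement to the relevant column if needed):

    import pandas as pd

    df = pd.read_csv("mimic_dataset.csv")  # hypothetical name for the BIRT-exported CSV
    df = df.replace(300, 89)               # swap the 300 values for 89, as described in step 5
    df.to_csv("mimic_dataset.csv", index=False)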
NOTE: Once the file has been imported, you will have to modify the report's "DataSource" object to point to the PostgreSQL installation where the MIMIC-III database resides.
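For reference, a typical PostgreSQL JDBC data source for a local MIMIC-III install looks like the following (the database name and credentials are assumptions; use your own):

    Driver Class: org.postgresql.Driver
    Database URL: jdbc:postgresql://localhost:5432/mimic
    User Name:    postgres
    Password:     (your password)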
Binary file not shown.
@@ -0,0 +1,43 @@
# ICD-9 diagnosis code groups (as integers) used to label the medical
# conditions pulled from the DIAGNOSES_ICD table.

# Neoplasia codes
neoplasia_codes = [179, 185, 193, 220, 226, 23699]
neoplasia_codes.extend(range(1400, 2398))
neoplasia_codes.extend(range(17300, 17400))
neoplasia_codes.extend(range(19881, 19890))
neoplasia_codes.extend(range(20382, 22390))
neoplasia_codes.extend(range(23690, 23692))
neoplasia_codes.extend(range(25802, 25804))

# Hypertension codes
hypertension_codes = list(range(4010, 4021))
hypertension_codes.extend(range(40501, 40600))

# Diabetes codes
diabetes_code = [2535, 2537]
diabetes_code.extend(range(25000, 25003))

# Lipid disorders
lipid_disorders = list(range(2721, 2730))

# Cardiovascular disorders
cardiovascular_disorders = list(range(42511, 42519))
cardiovascular_disorders.extend(range(4252, 4294))
cardiovascular_disorders.extend(range(3900, 4600))
cardiovascular_disorders.extend(range(42700, 42900))


# Function to assign a condition-group label to a numeric ICD-9 code
def assign_label(x):
    if x in neoplasia_codes:
        return "neoplasia"
    elif x in hypertension_codes:
        return "hypertension"
    elif x in diabetes_code:
        return "diabetes"
    elif x in lipid_disorders:
        return "lipid_disorders"
    elif x in cardiovascular_disorders:
        return "cardiovascular_disorders"
    return None
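A quick usage sketch for the labeling helper, assuming the diagnoses have been loaded into a pandas DataFrame with a numeric ICD-9 column (the DataFrame and column names are illustrative):

    import pandas as pd

    diagnoses = pd.DataFrame({"icd9_code": [185, 4015, 25001, 2725, 42731, 99999]})
    # Codes outside every group map to None
    diagnoses["condition"] = diagnoses["icd9_code"].apply(assign_label)
    print(diagnoses)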
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,218 @@
###########################################
# Suppress matplotlib user warnings
# Necessary for newer versions of matplotlib
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
#
# Display inline matplotlib plots with IPython
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
###########################################

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np
# Pre-0.18 scikit-learn module paths (newer releases moved these to sklearn.model_selection)
import sklearn.learning_curve as curves
from sklearn import linear_model
from sklearn.cross_validation import ShuffleSplit

def pca_results(good_data, pca):
    '''
    Create a DataFrame of the PCA results
    Includes dimension feature weights and explained variance
    Visualizes the PCA results
    '''

    # Dimension indexing
    dimensions = ['Dimension {}'.format(i) for i in range(1, len(pca.components_) + 1)]

    # PCA components
    components = pd.DataFrame(np.round(pca.components_, 4), columns=good_data.keys())
    components.index = dimensions

    # PCA explained variance
    ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1)
    variance_ratios = pd.DataFrame(np.round(ratios, 4), columns=['Explained Variance'])
    variance_ratios.index = dimensions

    # Create a bar plot visualization
    fig, ax = plt.subplots(figsize=(14, 8))

    # Plot the feature weights as a function of the components
    components.plot(ax=ax, kind='bar')
    ax.set_ylabel("Feature Weights")
    ax.set_xticklabels(dimensions, rotation=0)

    # Display the explained variance ratios
    for i, ev in enumerate(pca.explained_variance_ratio_):
        ax.text(i - 0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n %.4f" % (ev))

    # Return a concatenated DataFrame
    return pd.concat([variance_ratios, components], axis=1)
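

# Example usage (illustrative names, an assumption rather than project code):
# given a preprocessed DataFrame `good_data` and a fitted
# sklearn.decomposition.PCA object `pca`:
#   pca_df = pca_results(good_data, pca)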


def cluster_results(reduced_data, preds, centers, pca_samples):
    '''
    Visualizes the PCA-reduced cluster data in two dimensions
    Adds cues for cluster centers and student-selected sample data
    '''

    predictions = pd.DataFrame(preds, columns=['Cluster'])
    plot_data = pd.concat([predictions, reduced_data], axis=1)

    # Generate the cluster plot
    fig, ax = plt.subplots(figsize=(14, 8))

    # Color map
    cmap = cm.get_cmap('gist_rainbow')

    # Color the points based on assigned cluster
    for i, cluster in plot_data.groupby('Cluster'):
        cluster.plot(ax=ax, kind='scatter', x='Dimension 1', y='Dimension 2',
                     color=cmap(i * 1.0 / (len(centers) - 1)), label='Cluster %i' % (i), s=30)

    # Plot centers with indicators
    for i, c in enumerate(centers):
        ax.scatter(x=c[0], y=c[1], color='white', edgecolors='black',
                   alpha=1, linewidth=2, marker='o', s=200)
        ax.scatter(x=c[0], y=c[1], marker='$%d$' % (i), alpha=1, s=100)

    # Plot transformed sample points
    ax.scatter(x=pca_samples[:, 0], y=pca_samples[:, 1],
               s=150, linewidth=4, color='black', marker='x')

    # Set plot title
    ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids Marked by Number\nTransformed Sample Data Marked by Black Cross")

def biplot(good_data, reduced_data, pca):
    '''
    Produce a biplot that shows a scatterplot of the reduced
    data and the projections of the original features.

    good_data: original data, before transformation.
               Needs to be a pandas DataFrame with valid column names
    reduced_data: the reduced data (the first two dimensions are plotted)
    pca: pca object that contains the components_ attribute

    return: a matplotlib AxesSubplot object (for any additional customization)

    This procedure is inspired by the script:
    https://github.com/teddyroland/python-biplot
    '''

    fig, ax = plt.subplots(figsize=(14, 8))

    # Scatterplot of the reduced data
    ax.scatter(x=reduced_data.loc[:, 'Dimension 1'], y=reduced_data.loc[:, 'Dimension 2'],
               facecolors='b', edgecolors='b', s=70, alpha=0.5)

    feature_vectors = pca.components_.T

    # Use scaling factors to make the arrows easier to see
    arrow_size, text_pos = 7.0, 8.0

    # Projections of the original features
    for i, v in enumerate(feature_vectors):
        ax.arrow(0, 0, arrow_size * v[0], arrow_size * v[1],
                 head_width=0.2, head_length=0.2, linewidth=2, color='red')
        ax.text(v[0] * text_pos, v[1] * text_pos, good_data.columns[i], color='black',
                ha='center', va='center', fontsize=18)

    ax.set_xlabel("Dimension 1", fontsize=14)
    ax.set_ylabel("Dimension 2", fontsize=14)
    ax.set_title("PC plane with original feature projections.", fontsize=16)
    return ax

def channel_results(reduced_data, outliers, pca_samples):
    '''
    Visualizes the PCA-reduced cluster data in two dimensions using the full dataset
    Data is labeled by "Channel" and cues added for student-selected sample data
    '''

    # Check that the dataset is loadable
    try:
        full_data = pd.read_csv("customers.csv")
    except Exception:
        print("Dataset could not be loaded. Is the file missing?")
        return False

    # Create the Channel DataFrame
    channel = pd.DataFrame(full_data['Channel'], columns=['Channel'])
    channel = channel.drop(channel.index[outliers]).reset_index(drop=True)
    labeled = pd.concat([reduced_data, channel], axis=1)

    # Generate the cluster plot
    fig, ax = plt.subplots(figsize=(14, 8))

    # Color map
    cmap = cm.get_cmap('gist_rainbow')

    # Color the points based on assigned Channel
    labels = ['Hotel/Restaurant/Cafe', 'Retailer']
    grouped = labeled.groupby('Channel')
    for i, channel in grouped:
        channel.plot(ax=ax, kind='scatter', x='Dimension 1', y='Dimension 2',
                     color=cmap((i - 1) * 1.0 / 2), label=labels[i - 1], s=30)

    # Plot transformed sample points
    for i, sample in enumerate(pca_samples):
        ax.scatter(x=sample[0], y=sample[1],
                   s=200, linewidth=3, color='black', marker='o', facecolors='none')
        ax.scatter(x=sample[0] + 0.25, y=sample[1] + 0.3, marker='$%d$' % (i), alpha=1, s=125)

    # Set plot title
    ax.set_title("PCA-Reduced Data Labeled by 'Channel'\nTransformed Sample Data Circled")

def ModelLearning(X, y):
    """ Calculates the performance of several models with varying sizes of training data.
    The learning and testing scores for each model are then plotted. """

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.2, random_state=0)

    # Generate the training set sizes, increasing up to 80% of the data
    train_sizes = np.rint(np.linspace(1, X.shape[0] * 0.8 - 1, 9)).astype(int)

    # Create the figure window
    fig = plt.figure(figsize=(10, 7))

    # Create two linear models: with and without feature normalization
    for k, norm in enumerate([True, False]):

        # Create a linear regression model with normalize = norm
        regressor = linear_model.LinearRegression(normalize=norm)

        # Calculate the training and testing scores
        sizes, train_scores, test_scores = curves.learning_curve(regressor, X, y,
            cv=cv, train_sizes=train_sizes, scoring='r2')

        # Find the mean and standard deviation for smoothing
        train_std = np.std(train_scores, axis=1)
        train_mean = np.mean(train_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)

        # Subplot the learning curve
        ax = fig.add_subplot(2, 2, k + 1)
        ax.plot(sizes, train_mean, 'o-', color='r', label='Training Score')
        ax.plot(sizes, test_mean, 'o-', color='g', label='Testing Score')
        ax.fill_between(sizes, train_mean - train_std,
                        train_mean + train_std, alpha=0.15, color='r')
        ax.fill_between(sizes, test_mean - test_std,
                        test_mean + test_std, alpha=0.15, color='g')

        # Labels
        ax.set_title('normalize = %s' % (norm))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('Score')
        ax.set_xlim([0, X.shape[0] * 0.8])
        ax.set_ylim([-0.05, 1.05])

    # Visual aesthetics
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad=0.)
    fig.suptitle('Linear Regressor Learning Performances', fontsize=16, y=1.03)
    fig.tight_layout()
    fig.show()
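

# ---------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the original
# project): expects the BIRT-exported dataset as a CSV with a
# 'length_of_stay' column; the file and column names below are
# illustrative.
# ---------------------------------------------------------------
if __name__ == '__main__':
    data = pd.read_csv("mimic_dataset.csv")
    y = data['length_of_stay'].values
    X = data.drop('length_of_stay', axis=1).values
    ModelLearning(X, y)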