# Data Analysis for Metabolomics in Alzheimer's Disease
## Setting up the notebook
##### Importing required packages
import math
import matplotlib as mpl
import matplotlib.dates as md
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import missingno as msno
import networkx as nx
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import re
import seaborn as sns
import sklearn as sk
import statsmodels.api as sm
import statsmodels.stats.outliers_influence as smo
import statsmodels.tools.tools as smt
from ipywidgets import Layout, widgets
from matplotlib.pyplot import figure
from mpl_toolkits import mplot3d
from patsy import dmatrices
from plotly.subplots import make_subplots
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA as skPCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import logit
from statsmodels.regression.linear_model import OLS
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.features import PCA
from yellowbrick.style import set_palette
%config InlineBackend.figure_format='retina'
%matplotlib inline
plt.style.use('ggplot')
set_palette('sns_colorblind')
##### Setting seed for reproducibility
np.random.seed(0)
##### Setting the Working Directory
os.getcwd()
os.chdir("") ##### path elided - set to your local data directory
##### Importing the Alzheimer’s Disease Data ("Identification of Altered Metabolic Pathways in Plasma and CSF in Mild Cognitive Impairment and Alzheimer’s Disease Using Metabolomics" - plasma dataset ST000046)
_The paper can be found here:_
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3658985/
_The data can be found here:_
https://www.metabolomicsworkbench.org/data/DRCCMetadata.php?Mode=Study&StudyID=ST000046
_Note that the paper's title varies slightly between the two links and in the PDF version; despite the three different titles, it is the same paper._
AllAlzData = pd.read_csv("ADMSPositiveIonModeC18Data (Additional Document 4).csv")
## Data Cleaning and Pre-Processing
##### Checking the Contents of the Data
AllAlzData.head()
##### Checking the Dimensions of the Data
_With 15 participants in each of the three clinical groups plus a column of metabolite names, we expect 46 columns_
print(AllAlzData.shape)
##### Setting the Index
AlzData1 = AllAlzData.set_index("Unnamed: 0", drop = True)
AlzData1
##### Rotating the Data so that observations (samples) become the rows and features (metabolites) become the columns
AlzData = AlzData1.transpose()
AlzData;
##### Checking missing values
missing_values_count = AlzData.isnull().sum()
total_cells = np.prod(AlzData.shape) ##### np.product is deprecated in favour of np.prod
total_missing = missing_values_count.sum()
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)
##### Visualising missing value locations - by metabolite
print(AlzData.isnull().sum())
##### Checking Percentage Missing to eliminate any samples/metabolites with >70% missing values as per Lee and Styczynski (2018)
percent_missing = AlzData.isnull().sum() * 100 / len(AlzData)
missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True, ascending=False)
print(missing_value_df)
percent_missing = AlzData1.isnull().sum() * 100 / len(AlzData1)
missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True, ascending=False)
print(missing_value_df)
_According to Lee and Styczynski (2018) it is standard practice to remove any samples or metabolites with >70% missing values. In this dataset no samples or metabolites exceed the 70% threshold; the greatest quantities of missing values come from __sample 17__ and the metabolites __10_12_15-octadecatrienoicacid__ and __D-Glucose__._
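##### Illustrative filter for the 70% threshold
_Had any samples or metabolites exceeded the 70% threshold, they could be dropped along the following lines (a minimal sketch using the `AlzData` frame from above; nothing is actually removed in this dataset):_
AlzDataFiltered = AlzData.loc[:, AlzData.isnull().mean() <= 0.70] ##### drop metabolites (columns) with >70% missing
AlzDataFiltered = AlzDataFiltered.loc[AlzDataFiltered.isnull().mean(axis=1) <= 0.70] ##### drop samples (rows) with >70% missing
print(AlzDataFiltered.shape)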
##### Missingno Matrix Plot Visualizing the Sparsity of Data (excluding cognitive status column)
_Due to the dimensions of the dataset, this matrix plot is only readable with the data transposed back to its original orientation (metabolites as rows)_
msno.matrix(AlzData.iloc[:,1:].T, fontsize=18)
gray_patch = mpatches.Patch(color='gray', label='Data Present')
white_patch = mpatches.Patch(color='white', label='Data Absent')
plt.legend(fontsize=18, bbox_to_anchor=(1.0, 0.7), borderaxespad=0.2, handles=[gray_patch, white_patch])
mpl.rcParams["legend.frameon"]=True
mpl.rcParams["legend.facecolor"]='#C0C0C0'
plt.title('Matrix Plot Showing Sparsity of Data', fontsize=30)
mpl.rcParams["axes.titlepad"]=20
plt.ylabel('Metabolites', fontsize=22)
plt.xlabel('Samples', fontsize=22)
plt.show()
##### Building a dendrogram to view relationships of missing values between samples (excluding cognitive status column)
_(Data also not transposed here for the purposes of visibility and size)_
msno.dendrogram(AlzData.iloc[0:,1:].T)
plt.title('Dendrogram showing Relationships of Missing Values Between Samples', fontsize=30)
mpl.rcParams["axes.titlepad"]=20
plt.ylabel('Nullity Correlation Level', fontsize=22)
plt.xlabel('Samples', fontsize=22)
plt.show()
_The dendrogram does not link any samples together below level 6, suggesting that no samples can be classed as 'very similar' and therefore likely to have a significant influence on each other._
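##### Inspecting the linkage distances numerically
_The merge distances behind the dendrogram can also be checked directly (a rough sketch clustering the sample nullity patterns with scipy; missingno's exact linkage settings may differ):_
import scipy.cluster.hierarchy as sch
nullity = AlzData.iloc[0:,1:].isnull().astype(int) ##### rows = samples, 1 = missing value
linkage = sch.linkage(nullity.values, method='average')
print(linkage[:, 2].min()) ##### smallest merge distance between any two samples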
### Imputation
##### Using NS-kNN Imputation _(Lee and Styczynski 2018)_ to resolve missing values
##### Converting the Data to Numeric
nAlzData = AlzData.iloc[0:,1:1910].astype(float)
nAlzData;
##### Separating out Cognitive Status Column to be added back after numeric processing
CogStat = AlzData.iloc[0:,0:1]
CogStat;
##### 'Autoscaling' prior to Imputation
scaler = StandardScaler().fit(nAlzData)
AlzScSK = scaler.transform(nAlzData)
AlzScSK = pd.DataFrame(AlzScSK, columns=nAlzData.columns, index=nAlzData.index)
AlzScSK;
##### KNN Imputation Method (sklearn's KNNImputer performs standard kNN imputation, used here as a stand-in for NS-kNN)
imputer = KNNImputer(n_neighbors=2)
After_imputation = imputer.fit_transform(AlzScSK)
AlzIm = pd.DataFrame(After_imputation, columns=AlzScSK.columns, index=AlzScSK.index)
AlzIm;
##### Un-scaling and then log transforming the data
AlzZunSc = scaler.inverse_transform(AlzIm)
AlzZunSc = pd.DataFrame(AlzZunSc, columns=nAlzData.columns, index=nAlzData.index)
AlzZ = np.log(AlzZunSc)
AlzZ;
##### Plot matrix to visualise the data during the various pre-processing stages
##### Defining plot grids
fig, axs = plt.subplots(nrows=2, ncols=2)
fig.tight_layout(h_pad=3)
fig.suptitle('Plot Matrix Showing Scaling, Processing and Normalizing of Data', fontsize = 30)
plt.subplots_adjust(top=0.93)
plt.rcParams['figure.figsize'] = [20, 20]
##### Histogram Creation
axs[0, 0].hist(nAlzData, edgecolor='black')
axs[0, 1].hist(AlzScSK, edgecolor='black')
axs[1, 0].hist(AlzZunSc, edgecolor='black')
axs[1, 1].hist(AlzZ, edgecolor='black')
##### Assigning Histogram Titles
axs[0, 0].set_title('Raw Data', fontsize = 20)
axs[0, 1].set_title('Pre kNN Scaling', fontsize = 20)
axs[1, 0].set_title('Post kNN UnScaling', fontsize = 20)
axs[1, 1].set_title('Log-Transformed, Clean Data', fontsize = 20)
AlzZ.insert(0,'Cog_Status', CogStat)
AlzZ;
tAlzZ = AlzZ.T
new_header = tAlzZ.iloc[0]
tAlzZ = tAlzZ[1:]
tAlzZ.columns = new_header
tAlzZ;
##### Encoding the AD, MCI and CN values as labels
label_encoder = LabelEncoder()
AlzZ['Cog_Status'] = label_encoder.fit_transform(AlzZ['Cog_Status'])
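_LabelEncoder assigns integer codes in alphabetical order, so here AD = 0, CN = 1 and MCI = 2; this ordering matters for the +1/-1 recoding used in the Logistic Regression sections below:_
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))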
##### Making a dictionary of metabolites
metabolites1 = AlzIm.columns.get_level_values('Unnamed: 0')
cogstats = AlzZ["Cog_Status"].values
metabolites1
tempnames = list(range(1, 1909 + 1)) ##### integer labels 1..1909, one per metabolite
tempnamesdict = dict(zip(metabolites1, tempnames))
##### Creating a newly labelled dataset so the metabolite names don't interfere with the code
AlzZtmp = AlzZ.copy()
AlzZtmp.columns = AlzZtmp.columns.map(tempnamesdict)
AlzZtmp.iloc[0:,1:1910];
AlzZtmp.insert(0,'Cog_Status', CogStat)
AlzZtmp = AlzZtmp.drop(AlzZtmp.columns[[1]], axis=1)
AlzZtmp;
tAlzZtmp = AlzZtmp.T
##### Defining Independent Variables
ivar = AlzZtmp.iloc[0:,1:1910]
ivar.head()
tivar = ivar.T
tivar;
## Exploring the Data
### Testing for Multicollinearity
#### Visualising Multicollinearity
##### Creating a Correlation Matrix
ivarnmd = AlzZ.iloc[0:,1:1910]
cor = ivarnmd.corr()
cor
##### Building a clustermap
sns.clustermap(cor, cmap="bwr", figsize=(50,50), metric='euclidean', yticklabels=False).ax_col_dendrogram.set_title(
'Euclidean Based Clustermap Showing Highly Correlated Metabolites', fontsize=50)
##### Building an interactive network graph to narrow down the information based on the correlation value threshold
cog = tAlzZ.columns.get_level_values('Cog_Status')
colormapMCLN = {
'AD': '#949494', ##### Colourblind friendly grey
'CN': '#0173b2', ##### Colourblind friendly blue
'MCI': '#cc78bc', ##### Colourblind friendly pink
}
colorlistMCLN = [colormapMCLN[c] for c in tAlzZ.columns.get_level_values('Cog_Status')]
threshold_choice = widgets.FloatSlider(description="Correlation Threshold", value=0.8, min=0.1, max=1, step=0.05,
continuous_update=False, orientation='horizontal', layout=Layout(width='500px'),
style=dict(description_width= 'initial'))
network = go.FigureWidget(data=[go.Scatter(x=[], y=[], mode='lines', text=[], line=dict(color='#cc78bc', width=2.5),
marker=dict(size=16, line_width=5,line=dict(color='#cc78bc',width=2))),
go.Scatter(x=[], y=[],mode='markers+text', textposition="top center", text=[],
hoverinfo='text',textfont_size=12, marker=dict(size=10, color=[],line_width=1))],
layout=go.Layout(showlegend=False, annotations=[], margin=dict(t=40, b=0, l=0, r=0),
width=950, height=800, title='Network Graph Showing Correlated Metabolites at Adjustable Correlation Thresholds'))
correlation_matrix = cor.to_numpy(copy=True) ##### copy so the threshold callback never mutates cor itself
def plot_corr_graph(change):
    threshold = change.new
    ##### Zero the upper triangle so each metabolite pair is only counted once
    tr_ind = np.triu_indices(correlation_matrix.shape[0])
    correlation_matrix[tr_ind] = 0
    ##### from_numpy_matrix was renamed from_numpy_array in NetworkX 3.x
    G = nx.from_numpy_array(correlation_matrix)
    ##### Node labels come from the correlation matrix columns (metabolites only)
    G = nx.relabel_nodes(G, lambda x: cor.columns.tolist()[x])
    ##### Remove edges that are NaN or below the chosen threshold
    remove = []
    for col1, col2, weight in G.edges(data=True):
        if math.isnan(weight["weight"]):
            remove.append((col1, col2))
        if abs(weight["weight"]) < threshold:
            remove.append((col1, col2))
    G.remove_edges_from(remove)
    ##### Remove nodes left without any edges
    remove = []
    edges = list(sum(G.edges, ()))
    for node in G.nodes:
        if node not in edges:
            remove.append(node)
    G.remove_nodes_from(remove)
    mst = nx.maximum_spanning_tree(G)
    def assign_color(col):
        return colorlistMCLN
    def assign_color_edge(correlation):
        if correlation < 0:
            return "#BF0603" ##### red for negative correlations
        else:
            return "#00CC66" ##### green for positive correlations
    edge_colors = []
    node_colors = []
    for key, value in nx.get_edge_attributes(mst, 'weight').items():
        edge_colors.append(assign_color_edge(value))
    for key, value in dict(mst.degree).items():
        node_colors.append(assign_color(key))
    labels = {n: n for n in mst.nodes()}
    node_x = []
    node_y = []
    tree = nx.fruchterman_reingold_layout(mst, k=0.25).items()
    for node, (x_, y_) in tree:
        node_x.append(x_)
        node_y.append(y_)
    def get_dim_of_node(name):
        for node, (x, y) in tree:
            if node == name:
                return x, y
    edge_x = []
    edge_y = []
    weights = []
    for node1, node2, w in mst.edges(data=True):
        x0, y0 = get_dim_of_node(node1)
        x1, y1 = get_dim_of_node(node2)
        edge_x.append(x0)
        edge_x.append(x1)
        edge_x.append(None)
        edge_y.append(y0)
        edge_y.append(y1)
        edge_y.append(None)
        weights.append((round(w["weight"], 1), (x0 + x1) / 2, (y0 + y1) / 2))
    with network.batch_update():
        network.data[1].x = node_x
        network.data[1].y = node_y
        network.data[1].text = list(labels)
        network.data[1].marker.color = node_colors
        network.data[0].x = edge_x
        network.data[0].y = edge_y
        network.data[0].text = list(weights)
        network.update_layout(xaxis_zeroline=False, yaxis_zeroline=False, xaxis_showgrid=False, yaxis_showgrid=False,
                              plot_bgcolor='rgba(0,0,0,0)')
threshold_choice.observe(plot_corr_graph, names="value")
widgets.VBox([threshold_choice, network])
_Recommended threshold 0.95_
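##### Static list of highly correlated pairs
_For a non-interactive summary of the same information, the metabolite pairs above a chosen threshold can be listed directly from the correlation matrix (a minimal sketch, assuming the `cor` matrix from above):_
threshold = 0.95
upper = cor.where(np.triu(np.ones(cor.shape, dtype=bool), k=1)) ##### keep each pair once (upper triangle)
high_pairs = upper.stack()
print(high_pairs[high_pairs.abs() >= threshold].sort_values(ascending=False))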
#### Variance Inflation Factor (VIF) test
_Low to zero VIF levels mean these data would be a suitable candidate for logistic regression, a method recommended by Lee and Hu (2019) and matching studies by Proitsi et al. (2016) and Trushina et al. (2013)_
##### Adding a constant
ivarc = smt.add_constant(data=ivarnmd, prepend=False)
##### Calculating VIF for each feature
vif_info = pd.DataFrame()
vif_info['VIF'] = [variance_inflation_factor(ivarc.values, i) for i in range(ivarc.shape[1] - 1)] ##### use the constant-augmented matrix; the trailing constant column itself is skipped
vif_info['Column'] = ivarnmd.columns
vif_info.sort_values('VIF', ascending=False)
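_A common rule of thumb flags VIF values above roughly 5-10 as problematic multicollinearity; a quick filter of the results (assuming the `vif_info` frame from above):_
print(vif_info[vif_info['VIF'] > 10].sort_values('VIF', ascending=False))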
### Principal Component Analysis (PCA)
__PCA should only include the features (i.e. metabolites); category labels (such as Cognitive Status) are not included in the analysis itself, although they can be used to colour-code visualisations.__
##### Using the Sklearn package to perform the PCA so that we can extract numerical attributes of the model
skpca = skPCA(n_components=45, random_state = 2020)
skpca.fit(ivar)
skAlzZpca = skpca.transform(ivar)
##### Using Yellowbrick to perform PCA to produce a visualisation of the components by cognitive type
_It is clear that there are no separate clusters between the cognitive types - this matches other metabolomics studies of AD_
AlzZpcavs = PCA(random_state = 2020,
classes = ['AD','CN','MCI'], colors=['#949494', '#0173b2', '#cc78bc'])
AlzZpcavs.fit_transform(ivar, AlzZ.Cog_Status)
AlzZpcavs.show()
##### Checking that the percentage variance of the data is completely explained by all of the principal components
print("Variance explained by all 45 principle components =",
sum(skpca.explained_variance_ratio_*100))
##### Checking total variance explained by each of the selected components
np.cumsum(skpca.explained_variance_ratio_*100)
##### Visualising the explained variance by principal component using an elbow plot
plt.plot(np.cumsum(skpca.explained_variance_ratio_*100))
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance')
plt.title('Elbow Plot Showing the Optimum Number of Principal Components by Explained Variance')
plt.savefig('elbowplot.png', dpi=100)
##### Checking numerical values of explained variance by principal component
_A large number of principal components (40 of 45) are still needed to explain 95% of the variance_
print("Variance explained by the first principal component =",
np.cumsum(skpca.explained_variance_ratio_*100)[0])
print("Variance explained by the first 10 principal components =",
np.cumsum(skpca.explained_variance_ratio_*100)[9])
print("Variance explained by the first 30 principal components =",
np.cumsum(skpca.explained_variance_ratio_*100)[29])
print("Variance explained by the first 40 principal components =",
np.cumsum(skpca.explained_variance_ratio_*100)[39])
##### Selection of 95% variance principal components
skpca_95 = skPCA(n_components=0.95, random_state = 2020)
skpca_95.fit(ivar)
skAlzZpca_95 = skpca_95.transform(ivar)
skAlzZpca_95.shape
##### Visualisation of the newly transformed data
plt.figure(figsize=(10, 7))
plt.plot(skAlzZpca_95)
plt.xlabel('Observation')
plt.ylabel('Transformed Data')
plt.title('Transformed data by the principal components (95% Variability)',
pad = 15)
plt.savefig('plt.png')
##### Creating a new dataframe of the 95% variance explained principal components
b = 40
a = range(1, b + 1)
pcCol = ["PC" + str(i) for i in a]
pcadf = pd.DataFrame(skAlzZpca_95, columns = pcCol)
pcadf['label'] = AlzZ.Cog_Status
pcadf.head()
##### Saving the new dataframe to a csv
pcadf = pcadf.iloc[0:,0:40]
pcadf.insert(0,'Cog Status', CogStat)
pcadf;
pcadf.to_csv('AD_95.csv', index=False)
tpcadf = pcadf.T
tpcadf;
tpcadf = tpcadf[1:]
tpcadf.columns = CogStat
tpcadf;
### Partial Least Squares
#### Alzheimer's Disease against Cognitive Normal
##### Defining a dataset of specifically AD and CN data (Using the PCA 95% dataframe)
ADvsCN = tpcadf.iloc[0:,0:30] ##### all rows (PCs) for the 15 AD and 15 CN sample columns, matching the slices used for the other comparisons
ADvsCN;
##### Visually comparing the components by cognitive type
ADvsCN.columns.get_level_values(0)
##### Building a dictionary
colormap = {
('AD',): '#949494', ##### Colourblind friendly grey
('CN',): '#0173b2', ##### Colourblind friendly blue
}
colorlist = [colormap[c] for c in ADvsCN.columns.get_level_values(0)]
ADCNfig = ADvsCN.plot(kind='line', legend=False, figsize=(12,4), color=colorlist, title='Comparing AD Metabolite Data against CN Data')
plt.xlabel('Principal Components')
plt.ylabel('Metabolite Concentrations (ppm)')
ADCNfig.xaxis.set_major_formatter(plt.NullFormatter())
plt.xticks(np.arange(0, 50, 50))
greydot = mpatches.Patch(color='#949494', label='AD')
bluedot = mpatches.Patch(color='#0173b2', label='CN')
plt.legend(fontsize=12, bbox_to_anchor=(1.0, 1.005), borderaxespad=0.2, handles=[greydot, bluedot])
_There do not seem to be any visually obvious outliers between the two groups, although there are clear points on the right side of the plot where the CN data spikes higher._
##### Creating a pseudolinear Y value (1 = CN, 0 = AD) against which to correlate the samples
y = [g == ('CN',) for g in ADvsCN.columns.get_level_values(0)]
y;
##### Conversion of boolean values to numerical
y = np.array(y, dtype=int)
y
##### Fitting a PLS-DA model to the data
from sklearn.cross_decomposition import PLSRegression
plsr = PLSRegression(n_components=2, scale=False)
plsr.fit(ADvsCN.values.T, y)
##### Viewing the model scores and building a dataframe to be used for plotting
##### (Scores describe the position of each sample on each latent variable (LV))
plsr.x_scores_;
plsr.x_weights_.shape
scores = pd.DataFrame(plsr.x_scores_)
scores.index=ADvsCN.columns
scores;
##### Making a separate version of the scores dataframe, rounded to two decimal places, for labelling the points without shifting the plotting locations
scorelab = scores.round(decimals = 2)
scorelab;
##### Plotting the PLS Regression scores
ax = scores.plot(x=0, y=1, kind='scatter', s=50, alpha=0.7,
c=colorlist, figsize=(6,6))
ax.set_xlabel('Scores on LV 1')
ax.set_ylabel('Scores on LV 2')
ax.set_title('PLS Scores of AD versus CN Metabolite Data')
greydot = mpatches.Patch(color='#949494', label='AD')
bluedot = mpatches.Patch(color='#0173b2', label='CN')
plt.legend(fontsize=12, bbox_to_anchor=(1.2, 1.005), borderaxespad=0.2, handles=[greydot, bluedot])
for n, (x, y) in enumerate(scores.values):
label = scorelab.values[n][0]
ax.text(x,y,label)
_There is separation between AD and CN values on Latent Variable 1_
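##### Rough cross-validated check of the separation
_As a sanity check on this apparent separation, the PLS-DA model can be scored by cross-validated prediction, thresholding the continuous PLS output at 0.5 (a sketch; the label vector is rebuilt here because the plotting loop above reuses the name `y`):_
from sklearn.model_selection import cross_val_predict
y_bin = np.array([g == ('CN',) for g in ADvsCN.columns.get_level_values(0)], dtype=int)
y_cv = cross_val_predict(PLSRegression(n_components=2, scale=False), ADvsCN.values.T, y_bin, cv=5)
print('CV accuracy:', ((y_cv.ravel() > 0.5).astype(int) == y_bin).mean())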
#### Mild Cognitive Impairment against Cognitive Normal
##### Defining a dataset of specifically MCI and CN data (Using the PCA 95% dataframe)
MCIvsCN = tpcadf.iloc[0:,15:]
MCIvsCN;
##### Visually comparing the components by cognitive type
MCIvsCN.columns.get_level_values(0)
##### Building a dictionary
colormap2 = {
('MCI',): '#cc78bc', ##### Colourblind friendly pink
('CN',): '#0173b2', ##### Colourblind friendly blue
}
colorlist2 = [colormap2[c] for c in MCIvsCN.columns.get_level_values(0)]
MCICNfig = MCIvsCN.plot(kind='line', legend=False, figsize=(12,4), color=colorlist2, title='Comparing MCI Metabolite Data against CN Data')
plt.xlabel('Principal Components')
plt.ylabel('Metabolite Concentrations (ppm)')
MCICNfig.xaxis.set_major_formatter(plt.NullFormatter())
plt.xticks(np.arange(0, 50, 50))
pinkdot = mpatches.Patch(color='#cc78bc', label='MCI')
bluedot = mpatches.Patch(color='#0173b2', label='CN')
plt.legend(fontsize=12, bbox_to_anchor=(1.0, 1.005), borderaxespad=0.2, handles=[pinkdot, bluedot])
_There seem to be potential outliers in the CN data, visible as the large blue spike on the left-hand side of the plot and the two blue spikes on the right-hand side._
##### Creating a pseudolinear Y value (1 = CN, 0 = MCI) against which to correlate the samples
y2 = [g == ('CN',) for g in MCIvsCN.columns.get_level_values(0)]
y2;
##### Conversion of boolean values to numerical
y2 = np.array(y2, dtype=int)
y2
##### Fitting a PLS-DA model to the data
plsr2 = PLSRegression(n_components=2, scale=False)
plsr2.fit(MCIvsCN.values.T, y2)
##### Viewing the model scores and building a dataframe to be used for plotting
plsr2.x_scores_;
plsr2.x_weights_.shape
scores2 = pd.DataFrame(plsr2.x_scores_)
scores2.index=MCIvsCN.columns
scores2;
##### Making a separate version of the scores dataframe, rounded to two decimal places, for labelling the points without shifting the plotting locations
score2lab = scores2.round(decimals = 2)
score2lab;
##### Plotting the PLS Regression scores
ax = scores2.plot(x=0, y=1, kind='scatter', s=50, alpha=0.7,
c=colorlist2, figsize=(6,6))
ax.set_xlabel('Scores on LV 1')
ax.set_ylabel('Scores on LV 2')
ax.set_title('PLS Scores of MCI versus CN Metabolite Data')
pinkdot = mpatches.Patch(color='#cc78bc', label='MCI')
bluedot = mpatches.Patch(color='#0173b2', label='CN')
plt.legend(fontsize=12, bbox_to_anchor=(1.21, 1.005), borderaxespad=0.2, handles=[pinkdot, bluedot])
for n, (x, y) in enumerate(scores2.values):
label = score2lab.values[n][0]
ax.text(x,y,label)
_There appears to be separation between MCI and CN values on Latent Variable 1_
#### Alzheimer's Disease against Mild Cognitive Impairment
##### Defining a dataset of specifically AD and MCI data (Using the PCA 95% dataframe)
AD = tpcadf.iloc[0:,0:15]
AD;
MCI = tpcadf.iloc[0:,30:]
MCI;
ADvsMCI = AD.join(MCI)
ADvsMCI;
##### Visually comparing the components by cognitive type
ADvsMCI.columns.get_level_values(0)
##### Building a dictionary
colormap3 = {
('MCI',): '#cc78bc', ##### Colourblind friendly pink
('AD',): '#949494', ##### Colourblind friendly grey
}
colorlist3 = [colormap3[c] for c in ADvsMCI.columns.get_level_values(0)]
MCIADfig = ADvsMCI.plot(kind='line', legend=False, figsize=(12,4), color=colorlist3, title='Comparing MCI Metabolite Data against AD Data')
plt.xlabel('Principal Components')
plt.ylabel('Metabolite Concentrations (ppm)')
MCIADfig.xaxis.set_major_formatter(plt.NullFormatter())
plt.xticks(np.arange(0, 50, 50))
pinkdot = mpatches.Patch(color='#cc78bc', label='MCI')
greydot = mpatches.Patch(color='#949494', label='AD')
plt.legend(fontsize=12, bbox_to_anchor=(1.0, 1.005), borderaxespad=0.2, handles=[pinkdot, greydot])
_There appear to be some potential outliers in the AD data on the left side of the plot, where the grey lines spike higher._
##### Creating a pseudolinear y value (1 = AD, 0 = MCI) against which to correlate the samples
y3 = [g == ('AD',) for g in ADvsMCI.columns.get_level_values(0)]
y3;
##### Conversion of boolean values to numerical
y3 = np.array(y3, dtype=int)
y3
##### Fitting a PLS-DA model to the data
plsr3 = PLSRegression(n_components=2, scale=False)
plsr3.fit(ADvsMCI.values.T, y3)
##### Viewing the model scores and building a dataframe to be used for plotting
plsr3.x_scores_;
plsr3.x_weights_.shape
scores3 = pd.DataFrame(plsr3.x_scores_)
scores3.index=ADvsMCI.columns
scores3;
##### Making a separate version of the scores dataframe, rounded to two decimal places, for labelling the points without shifting the plotting locations
score3lab = scores3.round(decimals = 2)
score3lab;
##### Plotting the PLS Regression scores
ax = scores3.plot(x=0, y=1, kind='scatter', s=50, alpha=0.7,
c=colorlist3, figsize=(6,6))
ax.set_xlabel('Scores on LV 1')
ax.set_ylabel('Scores on LV 2')
ax.set_title('PLS Scores of MCI versus AD Metabolite Data')
pinkdot = mpatches.Patch(color='#cc78bc', label='MCI')
greydot = mpatches.Patch(color='#949494', label='AD')
plt.legend(fontsize=12, bbox_to_anchor=(1.21, 1.005), borderaxespad=0.2, handles=[pinkdot, greydot])
for n, (x, y) in enumerate(scores3.values):
label = score3lab.values[n][0]
ax.text(x,y,label)
_There appears to be clear separation between MCI and AD values on Latent Variable 1_
### Logistic Regression
#### AD vs CN
##### Creating a dataframe of only AD and CN Values
_Values of 1 correspond to AD and values of 0 to CN_
AD_log = AlzZ.iloc[0:15,:].copy() ##### .copy() avoids SettingWithCopyWarning from the in-place eval below
AD_log.eval("Cog_Status = Cog_Status + 1", inplace=True)
AD_log;
CN_log = AlzZ.iloc[15:30,:].copy()
CN_log.eval("Cog_Status = Cog_Status - 1", inplace=True)
CN_log;
ADvsCN_log = pd.concat([AD_log, CN_log])
ADvsCN_log
##### Creating a features array
X1 = ADvsCN_log.iloc[:,1:]
X1;
##### Creating an output array
Y1 = ADvsCN_log.iloc[:,0]
Y1;
##### Splitting the data
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, random_state=0)
##### Building the classifier
logclassifier1 = LogisticRegression(solver='lbfgs',random_state=0)
logclassifier1.fit(X_train1, Y_train1)
##### Testing the classifier
predicted_y1 = logclassifier1.predict(X_test1)
predicted_y1
##### Testing the accuracy of the classifier
print('Accuracy: {:.2f}'.format(logclassifier1.score(X_test1, Y_test1)))
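##### Confusion matrix for the held-out samples
_A confusion matrix shows where the misclassifications fall (a quick check using the `metrics` module imported at the top):_
print(metrics.confusion_matrix(Y_test1, predicted_y1))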
##### Cross Validation testing
scores_log1 = cross_val_score(logclassifier1, X1, Y1, cv=10)
print('Cross-Validation Accuracy Scores', scores_log1)
scores_log1 = pd.Series(scores_log1)
scores_log1.min(), scores_log1.mean(), scores_log1.max()
_Accuracy ranges between 0.34 and 1.0, averaging 0.57_
##### Extracting coefficients of specific metabolites
##### Create a list of original variable names from the training DataFrame
original_variables1 = X1.columns
##### Extract the coefficients of the logistic regression estimator
model_coefficients1 = logclassifier1.coef_[0]
##### Create a dataframe of the variables and coefficients & print it out
coefficient_df1 = pd.DataFrame({"Metabolite" : original_variables1, "Coefficient": model_coefficients1})
print(coefficient_df1)
##### Print out the top 3 positive variables
top_three_df1 = coefficient_df1.sort_values(by="Coefficient", axis=0, ascending=False)[0:3]
print(top_three_df1)
##### Print out the bottom 3 (most negative) variables
bottom_three_df1 = coefficient_df1.sort_values(by="Coefficient", axis=0, ascending=True)[0:3]
print(bottom_three_df1)
##### Feature Selection to improve the model
model1 = ExtraTreesClassifier()
model1.fit(X1,Y1)
print(model1.feature_importances_)
##### Plot the graph of feature importances
figure(figsize=(20,20))
feat_importances1 = pd.Series(model1.feature_importances_, index=X1.columns)
feat_importances1.nlargest(50).plot(kind='barh')
plt.rc('font', size=20)
plt.xlabel('Feature Importance Score')
plt.ylabel('Metabolite')
plt.title('Plot Showing Scores of 50 "Most Important" features for the AD vs CN model')
plt.show()
feat_importances1
someimport1 = feat_importances1[feat_importances1!=0]
someimport1 = someimport1.sort_values(ascending=False)
someimport1
##### Creating a list of the most important metabolites for AD vs CN
top50_1 = (someimport1[:50])
top50_1
ls1 = list(top50_1.index.values)
ls1;
##### Using the list of metabolites (i.e., column names) to build a new X1 dataframe to be used in the logistic regression model
X1_2 = X1[ls1]
X1_2;
##### Splitting the data for the second model
X_train1_2, X_test1_2, Y_train1_2, Y_test1_2 = train_test_split(X1_2, Y1, random_state=0)
##### Building the classifier
logclassifier1_2 = LogisticRegression(solver='lbfgs',random_state=0)
result1_2=logclassifier1_2.fit(X_train1_2, Y_train1_2)
##### Testing the classifier
predicted_y1_2 = logclassifier1_2.predict(X_test1_2)
predicted_y1_2
##### Testing the accuracy of the classifier
print('Accuracy: {:.2f}'.format(logclassifier1_2.score(X_test1_2, Y_test1_2)))
##### Cross Validation testing
scores_log1_2 = cross_val_score(logclassifier1_2, X1_2, Y1, cv=15)
print('Cross-Validation Accuracy Scores', scores_log1_2)
scores_log1_2 = pd.Series(scores_log1_2)
scores_log1_2.min(), scores_log1_2.mean(), scores_log1_2.max()
_Accuracy now ranges between 0.5 and 1.0, averaging 0.94_
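_One caveat: the ExtraTrees feature selection above was fitted on all samples before cross-validation, which can make this estimate optimistic. A leakage-free alternative wraps the selection inside each CV fold (a sketch using sklearn's Pipeline and SelectFromModel, assuming `X1` and `Y1` from above; `pipe1` is a hypothetical name):_
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
pipe1 = Pipeline([('select', SelectFromModel(ExtraTreesClassifier(random_state=0), threshold=-np.inf, max_features=50)),
                  ('clf', LogisticRegression(solver='lbfgs', random_state=0))])
print(pd.Series(cross_val_score(pipe1, X1, Y1, cv=15)).agg(['min', 'mean', 'max']))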
##### Extracting coefficients of specific metabolites
##### Create a list of original variable names from the training DataFrame
original_variables1_2 = X1_2.columns
##### Extract the coefficients of the logistic regression estimator
model_coefficients1_2 = logclassifier1_2.coef_[0]
##### Create a dataframe of the variables and coefficients & print it out
coefficient_df1_2 = pd.DataFrame({"Metabolite" : original_variables1_2, "Coefficient": model_coefficients1_2})
print(coefficient_df1_2)
##### Print out the top 3 positive variables
top_three_df1_2 = coefficient_df1_2.sort_values(by="Coefficient", axis=0, ascending=False)[0:3]
print(top_three_df1_2)
##### Print out the bottom 3 (most negative) variables
bottom_three_df1_2 = coefficient_df1_2.sort_values(by="Coefficient", axis=0, ascending=True)[0:3]
print(bottom_three_df1_2)
#### MCI vs CN
##### Creating a dataframe of only MCI and CN Values
_Values of 1 correspond to MCI and values of 0 to CN_
MCI_log = AlzZ.iloc[30:,:].copy()
MCI_log.eval("Cog_Status = Cog_Status - 1", inplace=True)
MCI_log;
MCIvsCN_log = pd.concat([MCI_log, CN_log])
MCIvsCN_log;
##### Creating a features array
X2 = MCIvsCN_log.iloc[:,1:]
X2;
##### Creating an output array
Y2 = MCIvsCN_log.iloc[:,0]
Y2;
##### Splitting the data
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, random_state=0)
##### Building the classifier
logclassifier2 = LogisticRegression(solver='lbfgs',random_state=0)
result2=logclassifier2.fit(X_train2, Y_train2)
##### Testing the classifier
predicted_y2 = logclassifier2.predict(X_test2)
predicted_y2
##### Testing the accuracy of the classifier
print('Accuracy: {:.2f}'.format(logclassifier2.score(X_test2, Y_test2)))
##### Cross Validation testing
scores_log2 = cross_val_score(logclassifier2, X2, Y2, cv=10)
print('Cross-Validation Accuracy Scores', scores_log2)
scores_log2 = pd.Series(scores_log2)
scores_log2.min(), scores_log2.mean(), scores_log2.max()
_Accuracy ranges between 0.34 and 1.0, averaging 0.54_
##### Extracting coefficients of specific metabolites
##### Create a list of original variable names from the training DataFrame
original_variables2 = X2.columns
##### Extract the coefficients of the logistic regression estimator
model_coefficients2 = logclassifier2.coef_[0]
##### Create a dataframe of the variables and coefficients & print it out
coefficient_df2 = pd.DataFrame({"Metabolite" : original_variables2, "Coefficient": model_coefficients2})
print(coefficient_df2)
##### Print out the top 3 positive variables
top_three_df2 = coefficient_df2.sort_values(by="Coefficient", axis=0, ascending=False)[0:3]
print(top_three_df2)
##### Print out the bottom 3 (most negative) variables
bottom_three_df2 = coefficient_df2.sort_values(by="Coefficient", axis=0, ascending=True)[0:3]
print(bottom_three_df2)
##### Feature Selection to improve the model
model2 = ExtraTreesClassifier()
model2.fit(X2,Y2)
print(model2.feature_importances_)
##### Plot the graph of feature importances
figure(figsize=(20,20))
feat_importances2 = pd.Series(model2.feature_importances_, index=X2.columns)
feat_importances2.nlargest(50).plot(kind='barh')
plt.rc('font', size=20)
plt.xlabel('Feature Importance Score')
plt.ylabel('Metabolite')
plt.title('Plot Showing Scores of 50 "Most Important" features for the MCI vs CN model')
plt.show()
feat_importances2
someimport2 = feat_importances2[feat_importances2!=0]
someimport2 = someimport2.sort_values(ascending=False)
someimport2
##### Creating a list of the most important metabolites for MCI vs CN
top50_2 = (someimport2[:50])
top50_2
ls2 = list(top50_2.index.values)
ls2;
##### Using the list of metabolites (i.e., column names) to build a new X2 dataframe to be used in the logistic regression model
X2_2 = X2[ls2]
X2_2;
##### Splitting the data for the second model
X_train2_2, X_test2_2, Y_train2_2, Y_test2_2 = train_test_split(X2_2, Y2, random_state=0)
##### Building the classifier
logclassifier2_2 = LogisticRegression(solver='lbfgs',random_state=0)
result2_2=logclassifier2_2.fit(X_train2_2, Y_train2_2)
##### Testing the classifier
predicted_y2_2 = logclassifier2_2.predict(X_test2_2)
predicted_y2_2
##### Testing the accuracy of the classifier
print('Accuracy: {:.2f}'.format(logclassifier2_2.score(X_test2_2, Y_test2_2)))
##### Cross Validation testing
scores_log2_2 = cross_val_score(logclassifier2_2, X2_2, Y2, cv=15)
print('Cross-Validation Accuracy Scores', scores_log2_2)
scores_log2_2 = pd.Series(scores_log2_2)
scores_log2_2.min(), scores_log2_2.mean(), scores_log2_2.max()
_Accuracy now ranges between 0.5 and 1.0, averaging 0.9_
##### Extracting coefficients of specific metabolites
##### Create a list of original variable names from the training DataFrame
original_variables2_2 = X2_2.columns
##### Extract the coefficients of the logistic regression estimator
model_coefficients2_2 = logclassifier2_2.coef_[0]
##### Create a dataframe of the variables and coefficients & print it out
coefficient_df2_2 = pd.DataFrame({"Metabolite" : original_variables2_2, "Coefficient": model_coefficients2_2})
print(coefficient_df2_2)
##### Print out the top 3 positive variables
top_three_df2_2 = coefficient_df2_2.sort_values(by="Coefficient", axis=0, ascending=False)[0:3]
print(top_three_df2_2)
##### Print out the bottom 3 (most negative) variables
bottom_three_df2_2 = coefficient_df2_2.sort_values(by="Coefficient", axis=0, ascending=True)[0:3]
print(bottom_three_df2_2)