# Data Analysis for Metabolomics in Alzheimer's Disease
## Setting up the notebook
##### Importing required packages
import math
import matplotlib as mpl
import matplotlib.dates as md
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import missingno as msno
import networkx as nx
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import re
import seaborn as sns
import sklearn as sk
import statsmodels.api as sm
import statsmodels.stats.outliers_influence as smo
import statsmodels.tools.tools as smt
from ipywidgets import Layout, widgets
from matplotlib.pyplot import figure
from mpl_toolkits import mplot3d
from patsy import dmatrices
from plotly.subplots import make_subplots
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA as skPCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import logit
from statsmodels.regression.linear_model import OLS
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.features import PCA
from yellowbrick.style import set_palette
%config InlineBackend.figure_format='retina'
%matplotlib inline
plt.style.use('ggplot')
set_palette('sns_colorblind')
##### Setting seed for reproducibility
np.random.seed(0)
##### Setting the Working Directory
os.getcwd()
os.chdir("") ##### path elided - set to your local data directory
##### Importing the Alzheimer’s Disease Data ("Identification of Altered Metabolic Pathways in Plasma and CSF in Mild Cognitive Impairment and Alzheimer’s Disease Using Metabolomics" - plasma dataset ST000046)
_The paper can be found here:_
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3658985/
_The data can be found here:_
https://www.metabolomicsworkbench.org/data/DRCCMetadata.php?Mode=Study&StudyID=ST000046
_Note that the paper's title varies slightly between the two links and in the PDF version; despite the three different titles, it is the same paper._
AllAlzData = pd.read_csv("ADMSPositiveIonModeC18Data (Additional Document 4).csv")
## Data Cleaning and Pre-Processing
##### Checking the Contents of the Data
AllAlzData.head()
##### Checking the Dimensions of the Data
_With 15 participants in each of the three clinical groups plus a column of metabolite names, we expect 46 columns_
print(AllAlzData.shape)
##### Setting the Index
AlzData1 = AllAlzData.set_index("Unnamed: 0", drop = True)
AlzData1
##### Rotating the Data so that observations (samples) become the rows and features (metabolites) become the columns
AlzData = AlzData1.transpose()
AlzData;
##### Checking missing values
missing_values_count = AlzData.isnull().sum()
total_cells = np.prod(AlzData.shape) ##### np.product is deprecated in favour of np.prod
total_missing = missing_values_count.sum()
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)
##### Visualising missing value locations - by metabolite
print(AlzData.isnull().sum())
##### Checking Percentage Missing to eliminate any samples/metabolites with >70% missing values as per Lee and Styczynski (2018)
percent_missing = AlzData.isnull().sum() * 100 / len(AlzData)
missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True, ascending=False)
print(missing_value_df)
percent_missing = AlzData1.isnull().sum() * 100 / len(AlzData1)
missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True, ascending=False)
print(missing_value_df)
_According to Lee and Styczynski (2018) it is standard practice to remove any samples or metabolites with >70% missing values. In this dataset no samples or metabolites exceed the 70% threshold; the greatest quantities of missing values come from __sample 17__ and the metabolites __10_12_15-octadecatrienoicacid__ and __D-Glucose__._
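##### Illustrative filter for the 70% threshold
_Had any samples or metabolites exceeded the 70% threshold, they could be dropped along the following lines (a minimal sketch using the `AlzData` frame from above; nothing is actually removed in this dataset):_
AlzDataFiltered = AlzData.loc[:, AlzData.isnull().mean() <= 0.70] ##### drop metabolites (columns) with >70% missing
AlzDataFiltered = AlzDataFiltered.loc[AlzDataFiltered.isnull().mean(axis=1) <= 0.70] ##### drop samples (rows) with >70% missing
print(AlzDataFiltered.shape)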
##### Missingno Matrix Plot Visualizing the Sparsity of Data (excluding cognitive status column)
_Due to the dimensions of the dataset, this matrix plot is only readable with the data transposed back to its original orientation (metabolites as rows)_
msno.matrix(AlzData.iloc[:,1:].T, fontsize=18)
gray_patch = mpatches.Patch(color='gray', label='Data Present')
white_patch = mpatches.Patch(color='white', label='Data Absent')
plt.legend(fontsize=18, bbox_to_anchor=(1.0, 0.7), borderaxespad=0.2, handles=[gray_patch, white_patch])
mpl.rcParams["legend.frameon"]=True
mpl.rcParams["legend.facecolor"]='#C0C0C0'
plt.title('Matrix Plot Showing Sparsity of Data', fontsize=30)
mpl.rcParams["axes.titlepad"]=20
plt.ylabel('Metabolites', fontsize=22)
plt.xlabel('Samples', fontsize=22)
plt.show()
##### Building a dendrogram to view relationships of missing values between samples (excluding cognitive status column)
_(Data also not transposed here for the purposes of visibility and size)_
msno.dendrogram(AlzData.iloc[0:,1:].T)
plt.title('Dendrogram showing Relationships of Missing Values Between Samples', fontsize=30)
mpl.rcParams["axes.titlepad"]=20
plt.ylabel('Nullity Correlation Level', fontsize=22)
plt.xlabel('Samples', fontsize=22)
plt.show()
_The dendrogram does not link any samples together below level 6, suggesting that no samples can be classed as 'very similar' and therefore likely to have a significant influence on each other._
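##### Inspecting the linkage distances numerically
_The merge distances behind the dendrogram can also be checked directly (a rough sketch clustering the sample nullity patterns with scipy; missingno's exact linkage settings may differ):_
import scipy.cluster.hierarchy as sch
nullity = AlzData.iloc[0:,1:].isnull().astype(int) ##### rows = samples, 1 = missing value
linkage = sch.linkage(nullity.values, method='average')
print(linkage[:, 2].min()) ##### smallest merge distance between any two samples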
### Imputation
##### Using NS-kNN Imputation _(Lee and Styczynski 2018)_ to resolve missing values
##### Converting the Data to Numeric
nAlzData = AlzData.iloc[0:,1:1910].astype(float)
nAlzData;
##### Separating out Cognitive Status Column to be added back after numeric processing
CogStat = AlzData.iloc[0:,0:1]
CogStat;
##### 'Autoscaling' prior to Imputation
scaler = StandardScaler().fit(nAlzData)
AlzScSK = scaler.transform(nAlzData)
AlzScSK = pd.DataFrame(AlzScSK, columns=nAlzData.columns, index=nAlzData.index)
AlzScSK;
##### KNN Imputation Method (sklearn's KNNImputer performs standard kNN imputation, used here as a stand-in for NS-kNN)
imputer = KNNImputer(n_neighbors=2)
After_imputation = imputer.fit_transform(AlzScSK)
AlzIm = pd.DataFrame(After_imputation, columns=AlzScSK.columns, index=AlzScSK.index)
AlzIm;
##### Un-scaling and then log transforming the data
AlzZunSc = scaler.inverse_transform(AlzIm)
AlzZunSc = pd.DataFrame(AlzZunSc, columns=nAlzData.columns, index=nAlzData.index)
AlzZ = np.log(AlzZunSc)
AlzZ;
##### Plot matrix to visualise the data during the various pre-processing stages
##### Defining plot grids
fig, axs = plt.subplots(nrows=2, ncols=2)
fig.tight_layout(h_pad=3)
fig.suptitle('Plot Matrix Showing Scaling, Processing and Normalizing of Data', fontsize = 30)
plt.subplots_adjust(top=0.93)
plt.rcParams['figure.figsize'] = [20, 20]
##### Histogram Creation
axs[0, 0].hist(nAlzData, edgecolor='black')
axs[0, 1].hist(AlzScSK, edgecolor='black')
axs[1, 0].hist(AlzZunSc, edgecolor='black')
axs[1, 1].hist(AlzZ, edgecolor='black')
##### Assigning Histogram Titles
axs[0, 0].set_title('Raw Data', fontsize = 20)
axs[0, 1].set_title('Pre kNN Scaling', fontsize = 20)
axs[1, 0].set_title('Post kNN UnScaling', fontsize = 20)
axs[1, 1].set_title('Log-Transformed, Clean Data', fontsize = 20)
AlzZ.insert(0,'Cog_Status', CogStat)
AlzZ;
tAlzZ = AlzZ.T
new_header = tAlzZ.iloc[0]
tAlzZ = tAlzZ[1:]
tAlzZ.columns = new_header
tAlzZ;
##### Encoding the AD, MCI and CN values as labels
label_encoder = LabelEncoder()
AlzZ['Cog_Status'] = label_encoder.fit_transform(AlzZ['Cog_Status'])
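_LabelEncoder assigns integer codes in alphabetical order, so here AD = 0, CN = 1 and MCI = 2; this ordering matters for the +1/-1 recoding used in the Logistic Regression sections below:_
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))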
##### Making a dictionary of metabolites
metabolites1 = AlzIm.columns.get_level_values('Unnamed: 0')
cogstats = AlzZ["Cog_Status"].values
metabolites1
tempnames = list(range(1, 1909 + 1)) ##### integer labels 1..1909, one per metabolite
tempnamesdict = dict(zip(metabolites1, tempnames))
##### Creating a newly labelled dataset so the metabolite names don't interfere with the code
AlzZtmp = AlzZ.copy()
AlzZtmp.columns = AlzZtmp.columns.map(tempnamesdict)
AlzZtmp.iloc[0:,1:1910];
AlzZtmp.insert(0,'Cog_Status', CogStat)
AlzZtmp = AlzZtmp.drop(AlzZtmp.columns[[1]], axis=1)
AlzZtmp;
tAlzZtmp = AlzZtmp.T
##### Defining Independent Variables
ivar = AlzZtmp.iloc[0:,1:1910]
ivar.head()
tivar = ivar.T
tivar;
## Exploring the Data
### Testing for Multicollinearity
#### Visualising Multicollinearity
##### Creating a Correlation Matrix
ivarnmd = AlzZ.iloc[0:,1:1910]
cor = ivarnmd.corr()
cor
##### Building a clustermap
sns.clustermap(cor, cmap="bwr", figsize=(50,50), metric='euclidean', yticklabels=False).ax_col_dendrogram.set_title(
'Euclidean Based Clustermap Showing Highly Correlated Metabolites', fontsize=50)
##### Building an interactive network graph to narrow down the information based on the correlation value threshold
cog = tAlzZ.columns.get_level_values('Cog_Status')
colormapMCLN = {
'AD': '#949494', ##### Colourblind friendly grey
'CN': '#0173b2', ##### Colourblind friendly blue
'MCI': '#cc78bc', ##### Colourblind friendly pink
}
colorlistMCLN = [colormapMCLN[c] for c in tAlzZ.columns.get_level_values('Cog_Status')]
threshold_choice = widgets.FloatSlider(description="Correlation Threshold", value=0.8, min=0.1, max=1, step=0.05,
continuous_update=False, orientation='horizontal', layout=Layout(width='500px'),
style=dict(description_width= 'initial'))
network = go.FigureWidget(data=[go.Scatter(x=[], y=[], mode='lines', text=[], line=dict(color='#cc78bc', width=2.5),
marker=dict(size=16, line_width=5,line=dict(color='#cc78bc',width=2))),
go.Scatter(x=[], y=[],mode='markers+text', textposition="top center", text=[],
hoverinfo='text',textfont_size=12, marker=dict(size=10, color=[],line_width=1))],
layout=go.Layout(showlegend=False, annotations=[], margin=dict(t=40, b=0, l=0, r=0),
width=950, height=800, title='Network Graph Showing Correlated Metabolites at Adjustable Correlation Thresholds'))
correlation_matrix = cor.to_numpy(copy=True) ##### copy so the threshold callback never mutates cor itself
def plot_corr_graph(change):
    threshold = change.new
    ##### Zero the upper triangle so each metabolite pair is only counted once
    tr_ind = np.triu_indices(correlation_matrix.shape[0])
    correlation_matrix[tr_ind] = 0
    ##### from_numpy_matrix was renamed from_numpy_array in NetworkX 3.x
    G = nx.from_numpy_array(correlation_matrix)
    ##### Node labels come from the correlation matrix columns (metabolites only)
    G = nx.relabel_nodes(G, lambda x: cor.columns.tolist()[x])
    ##### Remove edges that are NaN or below the chosen threshold
    remove = []
    for col1, col2, weight in G.edges(data=True):
        if math.isnan(weight["weight"]):
            remove.append((col1, col2))
        if abs(weight["weight"]) < threshold:
            remove.append((col1, col2))
    G.remove_edges_from(remove)
    ##### Remove nodes left without any edges
    remove = []
    edges = list(sum(G.edges, ()))
    for node in G.nodes:
        if node not in edges:
            remove.append(node)
    G.remove_nodes_from(remove)
    mst = nx.maximum_spanning_tree(G)
    def assign_color(col):
        return colorlistMCLN
    def assign_color_edge(correlation):
        if correlation < 0:
            return "#BF0603" ##### red for negative correlations
        else:
            return "#00CC66" ##### green for positive correlations
    edge_colors = []
    node_colors = []
    for key, value in nx.get_edge_attributes(mst, 'weight').items():
        edge_colors.append(assign_color_edge(value))
    for key, value in dict(mst.degree).items():
        node_colors.append(assign_color(key))
    labels = {n: n for n in mst.nodes()}
    node_x = []
    node_y = []
    tree = nx.fruchterman_reingold_layout(mst, k=0.25).items()
    for node, (x_, y_) in tree:
        node_x.append(x_)
        node_y.append(y_)
    def get_dim_of_node(name):
        for node, (x, y) in tree:
            if node == name:
                return x, y
    edge_x = []
    edge_y = []
    weights = []
    for node1, node2, w in mst.edges(data=True):
        x0, y0 = get_dim_of_node(node1)
        x1, y1 = get_dim_of_node(node2)
        edge_x.append(x0)
        edge_x.append(x1)
        edge_x.append(None)
        edge_y.append(y0)
        edge_y.append(y1)
        edge_y.append(None)
        weights.append((round(w["weight"], 1), (x0 + x1) / 2, (y0 + y1) / 2))
    with network.batch_update():
        network.data[1].x = node_x
        network.data[1].y = node_y
        network.data[1].text = list(labels)
        network.data[1].marker.color = node_colors
        network.data[0].x = edge_x
        network.data[0].y = edge_y
        network.data[0].text = list(weights)
        network.update_layout(xaxis_zeroline=False, yaxis_zeroline=False, xaxis_showgrid=False, yaxis_showgrid=False,
                              plot_bgcolor='rgba(0,0,0,0)')
threshold_choice.observe(plot_corr_graph, names="value")
widgets.VBox([threshold_choice, network])
_Recommended threshold 0.95_
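##### Static list of highly correlated pairs
_For a non-interactive summary of the same information, the metabolite pairs above a chosen threshold can be listed directly from the correlation matrix (a minimal sketch, assuming the `cor` matrix from above):_
threshold = 0.95
upper = cor.where(np.triu(np.ones(cor.shape, dtype=bool), k=1)) ##### keep each pair once (upper triangle)
high_pairs = upper.stack()
print(high_pairs[high_pairs.abs() >= threshold].sort_values(ascending=False))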
#### Variance Inflation Factor (VIF) test
_Low to zero VIF levels mean these data would be a suitable candidate for logistic regression, a method recommended by Lee and Hu (2019) and matching studies by Proitsi et al. (2016) and Trushina et al. (2013)_
##### Adding a constant
ivarc = smt.add_constant(data=ivarnmd, prepend=False)
##### Calculating VIF for each feature
vif_info = pd.DataFrame()
vif_info['VIF'] = [variance_inflation_factor(ivarc.values, i) for i in range(ivarc.shape[1] - 1)] ##### use the constant-augmented matrix; the trailing constant column itself is skipped
vif_info['Column'] = ivarnmd.columns
vif_info.sort_values('VIF', ascending=False)
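_A common rule of thumb flags VIF values above roughly 5-10 as problematic multicollinearity; a quick filter of the results (assuming the `vif_info` frame from above):_
print(vif_info[vif_info['VIF'] > 10].sort_values('VIF', ascending=False))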
### Principal Component Analysis (PCA)
__PCA should only include the features (i.e. metabolites); category labels (such as Cognitive Status) are not included in the analysis itself, although they can be used to colour-code visualisations.__
##### Using the Sklearn package to perform the PCA so that we can extract numerical attributes of the model
skpca = skPCA(n_components=45, random_state = 2020)
skpca.fit(ivar)
skAlzZpca = skpca.transform(ivar)
##### Using Yellowbrick to perform PCA to produce a visualisation of the components by cognitive type
_It is clear that there are no separate clusters between the cognitive types - this matches other metabolomics studies of AD_
AlzZpcavs = PCA(random_state = 2020,
classes = ['AD','CN','MCI'], colors=['#949494', '#0173b2', '#cc78bc'])
AlzZpcavs.fit_transform(ivar, AlzZ.Cog_Status)
AlzZpcavs.show()
##### Checking that the percentage variance of the data is completely explained by all of the principal components
print("Variance explained by all 45 principle components =",
sum(skpca.explained_variance_ratio_*100))
##### Checking total variance explained by each of the selected components
np.cumsum(skpca.explained_variance_ratio_*100)
##### Visualising the explained variance by principal component using an elbow plot
plt.plot(np.cumsum(skpca.explained_variance_ratio_*100))
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance')
plt.title('Elbow Plot Showing the Optimum Number of Principal Components by Explained Variance')
plt.savefig('elbowplot.png', dpi=100)
##### Checking numerical values of explained variance by principal component
_A large number of principal components (40 of 45) are still needed to explain 95% of the variance_
print("Variance explained by the first principal component =",
np.cumsum(skpca.explained_variance_ratio_*100)[0])
print("Variance explained by the first 10 principal components =",
np.cumsum(skpca.explained_variance_ratio_*100)[9])
print("Variance explained by the first 30 principal components =",
np.cumsum(skpca.explained_variance_ratio_*100)[29])
print("Variance explained by the first 40 principal components =",
np.cumsum(skpca.explained_variance_ratio_*100)[39])
##### Selection of 95% variance principal components
skpca_95 = skPCA(n_components=0.95, random_state = 2020)
skpca_95.fit(ivar)
skAlzZpca_95 = skpca_95.transform(ivar)
skAlzZpca_95.shape
##### Visualisation of the newly transformed data
plt.figure(figsize=(10, 7))
plt.plot(skAlzZpca_95)
plt.xlabel('Observation')
plt.ylabel('Transformed Data')
plt.title('Transformed data by the principal components (95% Variability)',
pad = 15)
plt.savefig('plt.png')
##### Creating a new dataframe of the 95% variance explained principal components
b = 40
a = range(1, b + 1)
pcCol = ["PC" + str(i) for i in a]
pcadf = pd.DataFrame(skAlzZpca_95, columns = pcCol)
pcadf['label'] = AlzZ.Cog_Status
pcadf.head()
##### Saving the new dataframe to a csv
pcadf = pcadf.iloc[0:,0:40]
pcadf.insert(0,'Cog Status', CogStat)
pcadf;
pcadf.to_csv('AD_95.csv', index=False)
tpcadf = pcadf.T
tpcadf;
tpcadf = tpcadf[1:]
tpcadf.columns = CogStat
tpcadf;
### Partial Least Squares
#### Alzheimer's Disease against Cognitive Normal
##### Defining a dataset of specifically AD and CN data (Using the PCA 95% dataframe)
ADvsCN = tpcadf.iloc[0:,0:30] ##### all rows (PCs) for the 15 AD and 15 CN sample columns, matching the slices used for the other comparisons
ADvsCN;
##### Visually comparing the components by cognitive type
ADvsCN.columns.get_level_values(0)
##### Building a dictionary
colormap = {
('AD',): '#949494', ##### Colourblind friendly grey
('CN',): '#0173b2', ##### Colourblind friendly blue
}
colorlist = [colormap[c] for c in ADvsCN.columns.get_level_values(0)]
ADCNfig = ADvsCN.plot(kind='line', legend=False, figsize=(12,4), color=colorlist, title='Comparing AD Metabolite Data against CN Data')
plt.xlabel('Principal Components')
plt.ylabel('Metabolite Concentrations (ppm)')
ADCNfig.xaxis.set_major_formatter(plt.NullFormatter())
plt.xticks(np.arange(0, 50, 50))
greydot = mpatches.Patch(color='#949494', label='AD')
bluedot = mpatches.Patch(color='#0173b2', label='CN')
plt.legend(fontsize=12, bbox_to_anchor=(1.0, 1.005), borderaxespad=0.2, handles=[greydot, bluedot])
_There do not seem to be any visually obvious outliers between the two groups, although there are clear points on the right side of the plot where the CN data spikes higher._
##### Creating a pseudolinear Y value (1 = CN, 0 = AD) against which to correlate the samples
y = [g == ('CN',) for g in ADvsCN.columns.get_level_values(0)]
y;
##### Conversion of boolean values to numerical
y = np.array(y, dtype=int)
y
##### Fitting a PLS-DA model to the data
from sklearn.cross_decomposition import PLSRegression
plsr = PLSRegression(n_components=2, scale=False)
plsr.fit(ADvsCN.values.T, y)
##### Viewing the model scores and building a dataframe to be used for plotting
##### (Scores describe the position of each sample on each latent variable (LV))
plsr.x_scores_;
plsr.x_weights_.shape
scores = pd.DataFrame(plsr.x_scores_)
scores.index=ADvsCN.columns
scores;
##### Making a separate version of the scores dataframe, rounded to two decimal places, for labelling the points without shifting the plotting locations
scorelab = scores.round(decimals = 2)
scorelab;
##### Plotting the PLS Regression scores
ax = scores.plot(x=0, y=1, kind='scatter', s=50, alpha=0.7,
c=colorlist, figsize=(6,6))
ax.set_xlabel('Scores on LV 1')
ax.set_ylabel('Scores on LV 2')
ax.set_title('PLS Scores of AD versus CN Metabolite Data')
greydot = mpatches.Patch(color='#949494', label='AD')
bluedot = mpatches.Patch(color='#0173b2', label='CN')
plt.legend(fontsize=12, bbox_to_anchor=(1.2, 1.005), borderaxespad=0.2, handles=[greydot, bluedot])
for n, (x, y) in enumerate(scores.values):
label = scorelab.values[n][0]
ax.text(x,y,label)
_There is separation between AD and CN values on Latent Variable 1_
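##### Rough cross-validated check of the separation
_As a sanity check on this apparent separation, the PLS-DA model can be scored by cross-validated prediction, thresholding the continuous PLS output at 0.5 (a sketch; the label vector is rebuilt here because the plotting loop above reuses the name `y`):_
from sklearn.model_selection import cross_val_predict
y_bin = np.array([g == ('CN',) for g in ADvsCN.columns.get_level_values(0)], dtype=int)
y_cv = cross_val_predict(PLSRegression(n_components=2, scale=False), ADvsCN.values.T, y_bin, cv=5)
print('CV accuracy:', ((y_cv.ravel() > 0.5).astype(int) == y_bin).mean())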
#### Mild Cognitive Impairment against Cognitive Normal
##### Defining a dataset of specifically MCI and CN data (Using the PCA 95% dataframe)
MCIvsCN = tpcadf.iloc[0:,15:]
MCIvsCN;
##### Visually comparing the components by cognitive type
MCIvsCN.columns.get_level_values(0)
##### Building a dictionary
colormap2 = {
('MCI',): '#cc78bc', ##### Colourblind friendly pink
('CN',): '#0173b2', ##### Colourblind friendly blue
}
colorlist2 = [colormap2[c] for c in MCIvsCN.columns.get_level_values(0)]
MCICNfig = MCIvsCN.plot(kind='line', legend=False, figsize=(12,4), color=colorlist2, title='Comparing MCI Metabolite Data against CN Data')
plt.xlabel('Principal Components')
plt.ylabel('Metabolite Concentrations (ppm)')
MCICNfig.xaxis.set_major_formatter(plt.NullFormatter())
plt.xticks(np.arange(0, 50, 50))
pinkdot = mpatches.Patch(color='#cc78bc', label='MCI')
bluedot = mpatches.Patch(color='#0173b2', label='CN')
plt.legend(fontsize=12, bbox_to_anchor=(1.0, 1.005), borderaxespad=0.2, handles=[pinkdot, bluedot])
_There seem to be potential outliers in the CN data, visible as the large blue spike on the left-hand side of the plot and the two blue spikes on the right-hand side._
##### Creating a pseudolinear Y value (1 = CN, 0 = MCI) against which to correlate the samples
y2 = [g == ('CN',) for g in MCIvsCN.columns.get_level_values(0)]
y2;
##### Conversion of boolean values to numerical
y2 = np.array(y2, dtype=int)
y2
##### Fitting a PLS-DA model to the data
plsr2 = PLSRegression(n_components=2, scale=False)
plsr2.fit(MCIvsCN.values.T, y2)
##### Viewing the model scores and building a dataframe to be used for plotting
plsr2.x_scores_;
plsr2.x_weights_.shape
scores2 = pd.DataFrame(plsr2.x_scores_)
scores2.index=MCIvsCN.columns
scores2;
##### Making a separate version of the scores dataframe, rounded to two decimal places, for labelling the points without shifting the plotting locations
score2lab = scores2.round(decimals = 2)
score2lab;
##### Plotting the PLS Regression scores
ax = scores2.plot(x=0, y=1, kind='scatter', s=50, alpha=0.7,
c=colorlist2, figsize=(6,6))
ax.set_xlabel('Scores on LV 1')
ax.set_ylabel('Scores on LV 2')
ax.set_title('PLS Scores of MCI versus CN Metabolite Data')
pinkdot = mpatches.Patch(color='#cc78bc', label='MCI')
bluedot = mpatches.Patch(color='#0173b2', label='CN')
plt.legend(fontsize=12, bbox_to_anchor=(1.21, 1.005), borderaxespad=0.2, handles=[pinkdot, bluedot])
for n, (x, y) in enumerate(scores2.values):
label = score2lab.values[n][0]
ax.text(x,y,label)
_There appears to be separation between MCI and CN values on Latent Variable 1_
#### Alzheimer's Disease against Mild Cognitive Impairment
##### Defining a dataset of specifically AD and MCI data (Using the PCA 95% dataframe)
AD = tpcadf.iloc[0:,0:15]
AD;
MCI = tpcadf.iloc[0:,30:]
MCI;
ADvsMCI = AD.join(MCI)
ADvsMCI;
##### Visually comparing the components by cognitive type
ADvsMCI.columns.get_level_values(0)
##### Building a dictionary
colormap3 = {
('MCI',): '#cc78bc', ##### Colourblind friendly pink
('AD',): '#949494', ##### Colourblind friendly grey
}
colorlist3 = [colormap3[c] for c in ADvsMCI.columns.get_level_values(0)]
MCIADfig = ADvsMCI.plot(kind='line', legend=False, figsize=(12,4), color=colorlist3, title='Comparing MCI Metabolite Data against AD Data')
plt.xlabel('Principal Components')
plt.ylabel('Metabolite Concentrations (ppm)')
MCIADfig.xaxis.set_major_formatter(plt.NullFormatter())
plt.xticks(np.arange(0, 50, 50))
pinkdot = mpatches.Patch(color='#cc78bc', label='MCI')
greydot = mpatches.Patch(color='#949494', label='AD')
plt.legend(fontsize=12, bbox_to_anchor=(1.0, 1.005), borderaxespad=0.2, handles=[pinkdot, greydot])
_There appear to be some potential outliers in the AD data on the left side of the plot, where the grey lines spike higher._
##### Creating a pseudolinear y value (1 = AD, 0 = MCI) against which to correlate the samples
y3 = [g == ('AD',) for g in ADvsMCI.columns.get_level_values(0)]
y3;
##### Conversion of boolean values to numerical
y3 = np.array(y3, dtype=int)
y3
##### Fitting a PLS-DA model to the data
plsr3 = PLSRegression(n_components=2, scale=False)
plsr3.fit(ADvsMCI.values.T, y3)
##### Viewing the model scores and building a dataframe to be used for plotting
plsr3.x_scores_;
plsr3.x_weights_.shape
scores3 = pd.DataFrame(plsr3.x_scores_)
scores3.index=ADvsMCI.columns
scores3;
##### Making a separate version of the scores dataframe, rounded to two decimal places, for labelling the points without shifting the plotting locations
score3lab = scores3.round(decimals = 2)
score3lab;
##### Plotting the PLS Regression scores
ax = scores3.plot(x=0, y=1, kind='scatter', s=50, alpha=0.7,
c=colorlist3, figsize=(6,6))
ax.set_xlabel('Scores on LV 1')
ax.set_ylabel('Scores on LV 2')
ax.set_title('PLS Scores of MCI versus AD Metabolite Data')
pinkdot = mpatches.Patch(color='#cc78bc', label='MCI')
greydot = mpatches.Patch(color='#949494', label='AD')
plt.legend(fontsize=12, bbox_to_anchor=(1.21, 1.005), borderaxespad=0.2, handles=[pinkdot, greydot])
for n, (x, y) in enumerate(scores3.values):
label = score3lab.values[n][0]
ax.text(x,y,label)
_There appears to be clear separation between MCI and AD values on Latent Variable 1_
### Logistic Regression
#### AD vs CN
##### Creating a dataframe of only AD and CN Values
_Values of 1 correspond to AD and values of 0 to CN_
AD_log = AlzZ.iloc[0:15,:].copy() ##### .copy() avoids SettingWithCopyWarning from the in-place eval below
AD_log.eval("Cog_Status = Cog_Status + 1", inplace=True)
AD_log;
CN_log = AlzZ.iloc[15:30,:].copy()
CN_log.eval("Cog_Status = Cog_Status - 1", inplace=True)
CN_log;
ADvsCN_log = pd.concat([AD_log, CN_log])
ADvsCN_log
##### Creating a features array
X1 = ADvsCN_log.iloc[:,1:]
X1;
##### Creating an output array
Y1 = ADvsCN_log.iloc[:,0]
Y1;
##### Splitting the data
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, random_state=0)
##### Building the classifier
logclassifier1 = LogisticRegression(solver='lbfgs',random_state=0)
logclassifier1.fit(X_train1, Y_train1)
##### Testing the classifier
predicted_y1 = logclassifier1.predict(X_test1)
predicted_y1
##### Testing the accuracy of the classifier
print('Accuracy: {:.2f}'.format(logclassifier1.score(X_test1, Y_test1)))
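##### Confusion matrix for the held-out samples
_A confusion matrix shows where the misclassifications fall (a quick check using the `metrics` module imported at the top):_
print(metrics.confusion_matrix(Y_test1, predicted_y1))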
##### Cross Validation testing
scores_log1 = cross_val_score(logclassifier1, X1, Y1, cv=10)
print('Cross-Validation Accuracy Scores', scores_log1)
scores_log1 = pd.Series(scores_log1)
scores_log1.min(), scores_log1.mean(), scores_log1.max()
_Accuracy ranges between 0.34 and 1.0, averaging 0.57_
##### Extracting coefficients of specific metabolites
##### Create a list of original variable names from the training DataFrame
original_variables1 = X1.columns
##### Extract the coefficients of the logistic regression estimator
model_coefficients1 = logclassifier1.coef_[0]
##### Create a dataframe of the variables and coefficients & print it out
coefficient_df1 = pd.DataFrame({"Metabolite" : original_variables1, "Coefficient": model_coefficients1})
print(coefficient_df1)
##### Print out the top 3 positive variables
top_three_df1 = coefficient_df1.sort_values(by="Coefficient", axis=0, ascending=False)[0:3]
print(top_three_df1)
##### Print out the bottom 3 (most negative) variables
bottom_three_df1 = coefficient_df1.sort_values(by="Coefficient", axis=0, ascending=True)[0:3]
print(bottom_three_df1)
##### Feature Selection to improve the model
model1 = ExtraTreesClassifier()
model1.fit(X1,Y1)
print(model1.feature_importances_)
##### Plot the graph of feature importances
figure(figsize=(20,20))
feat_importances1 = pd.Series(model1.feature_importances_, index=X1.columns)
feat_importances1.nlargest(50).plot(kind='barh')
plt.rc('font', size=20)
plt.xlabel('Feature Importance Score')
plt.ylabel('Metabolite')
plt.title('Plot Showing Scores of 50 "Most Important" features for the AD vs CN model')
plt.show()
feat_importances1
someimport1 = feat_importances1[feat_importances1!=0]
someimport1 = someimport1.sort_values(ascending=False)
someimport1
##### Creating a list of the most important metabolites for AD vs CN
top50_1 = (someimport1[:50])
top50_1
ls1 = list(top50_1.index.values)
ls1;
##### Using the list of metabolites (i.e., column names) to build a new X1 dataframe to be used in the logistic regression model
X1_2 = X1[ls1]
X1_2;
##### Splitting the data for the second model
X_train1_2, X_test1_2, Y_train1_2, Y_test1_2 = train_test_split(X1_2, Y1, random_state=0)
##### Building the classifier
logclassifier1_2 = LogisticRegression(solver='lbfgs',random_state=0)
result1_2=logclassifier1_2.fit(X_train1_2, Y_train1_2)
##### Testing the classifier
predicted_y1_2 = logclassifier1_2.predict(X_test1_2)
predicted_y1_2
##### Testing the accuracy of the classifier
print('Accuracy: {:.2f}'.format(logclassifier1_2.score(X_test1_2, Y_test1_2)))
##### Cross Validation testing
scores_log1_2 = cross_val_score(logclassifier1_2, X1_2, Y1, cv=15)
print('Cross-Validation Accuracy Scores', scores_log1_2)
scores_log1_2 = pd.Series(scores_log1_2)
scores_log1_2.min(), scores_log1_2.mean(), scores_log1_2.max()
_Accuracy now ranges between 0.5 and 1.0, averaging 0.94_
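_One caveat: the ExtraTrees feature selection above was fitted on all samples before cross-validation, which can make this estimate optimistic. A leakage-free alternative wraps the selection inside each CV fold (a sketch using sklearn's Pipeline and SelectFromModel, assuming `X1` and `Y1` from above; `pipe1` is a hypothetical name):_
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
pipe1 = Pipeline([('select', SelectFromModel(ExtraTreesClassifier(random_state=0), threshold=-np.inf, max_features=50)),
                  ('clf', LogisticRegression(solver='lbfgs', random_state=0))])
print(pd.Series(cross_val_score(pipe1, X1, Y1, cv=15)).agg(['min', 'mean', 'max']))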
##### Extracting coefficients of specific metabolites
##### Create a list of original variable names from the training DataFrame
original_variables1_2 = X1_2.columns
##### Extract the coefficients of the logistic regression estimator
model_coefficients1_2 = logclassifier1_2.coef_[0]
##### Create a dataframe of the variables and coefficients & print it out
coefficient_df1_2 = pd.DataFrame({"Metabolite" : original_variables1_2, "Coefficient": model_coefficients1_2})
print(coefficient_df1_2)
##### Print out the top 3 positive variables
top_three_df1_2 = coefficient_df1_2.sort_values(by="Coefficient", axis=0, ascending=False)[0:3]
print(top_three_df1_2)
##### Print out the bottom 3 (most negative) variables
bottom_three_df1_2 = coefficient_df1_2.sort_values(by="Coefficient", axis=0, ascending=True)[0:3]
print(bottom_three_df1_2)
#### MCI vs CN
##### Creating a dataframe of only MCI and CN Values
_Values of 1 correspond to MCI and values of 0 to CN_
MCI_log = AlzZ.iloc[30:,:].copy()
MCI_log.eval("Cog_Status = Cog_Status - 1", inplace=True)
MCI_log;
MCIvsCN_log = pd.concat([MCI_log, CN_log])
MCIvsCN_log;
##### Creating a features array
X2 = MCIvsCN_log.iloc[:,1:]
X2;
##### Creating an output array
Y2 = MCIvsCN_log.iloc[:,0]
Y2;
##### Splitting the data
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, random_state=0)
##### Building the classifier
logclassifier2 = LogisticRegression(solver='lbfgs',random_state=0)
result2=logclassifier2.fit(X_train2, Y_train2)
##### Testing the classifier
predicted_y2 = logclassifier2.predict(X_test2)
predicted_y2
##### Testing the accuracy of the classifier
print('Accuracy: {:.2f}'.format(logclassifier2.score(X_test2, Y_test2)))
##### Cross Validation testing
scores_log2 = cross_val_score(logclassifier2, X2, Y2, cv=10)
print('Cross-Validation Accuracy Scores', scores_log2)
scores_log2 = pd.Series(scores_log2)
scores_log2.min(), scores_log2.mean(), scores_log2.max()
_Accuracy ranges between 0.34 and 1.0, averaging 0.54_
##### Extracting coefficients of specific metabolites
##### Create a list of original variable names from the training DataFrame
original_variables2 = X2.columns
##### Extract the coefficients of the logistic regression estimator
model_coefficients2 = logclassifier2.coef_[0]
##### Create a dataframe of the variables and coefficients & print it out
coefficient_df2 = pd.DataFrame({"Metabolite" : original_variables2, "Coefficient": model_coefficients2})
print(coefficient_df2)
##### Print out the top 3 positive variables
top_three_df2 = coefficient_df2.sort_values(by="Coefficient", axis=0, ascending=False)[0:3]
print(top_three_df2)
##### Print out the bottom 3 (most negative) variables
bottom_three_df2 = coefficient_df2.sort_values(by="Coefficient", axis=0, ascending=True)[0:3]
print(bottom_three_df2)
##### Feature Selection to improve the model
model2 = ExtraTreesClassifier()
model2.fit(X2,Y2)
print(model2.feature_importances_)
##### Plot the graph of feature importances
figure(figsize=(20,20))
feat_importances2 = pd.Series(model2.feature_importances_, index=X2.columns)
feat_importances2.nlargest(50).plot(kind='barh')
plt.rc('font', size=20)
plt.xlabel('Feature Importance Score')
plt.ylabel('Metabolite')
plt.title('Plot Showing Scores of 50 "Most Important" features for the MCI vs CN model')
plt.show()
feat_importances2
someimport2 = feat_importances2[feat_importances2!=0]
someimport2 = someimport2.sort_values(ascending=False)
someimport2
##### Creating a list of the most important metabolites for MCI vs CN
top50_2 = (someimport2[:50])
top50_2
ls2 = list(top50_2.index.values)
ls2;
##### Using the list of metabolites (i.e., column names) to build a new X2 dataframe to be used in the logistic regression model
X2_2 = X2[ls2]
X2_2;
##### Splitting the data for the second model
X_train2_2, X_test2_2, Y_train2_2, Y_test2_2 = train_test_split(X2_2, Y2, random_state=0)
##### Building the classifier
logclassifier2_2 = LogisticRegression(solver='lbfgs',random_state=0)
result2_2=logclassifier2_2.fit(X_train2_2, Y_train2_2)
##### Testing the classifier
predicted_y2_2 = logclassifier2_2.predict(X_test2_2)
predicted_y2_2
##### Testing the accuracy of the classifier
print('Accuracy: {:.2f}'.format(logclassifier2_2.score(X_test2_2, Y_test2_2)))
##### Cross Validation testing
scores_log2_2 = cross_val_score(logclassifier2_2, X2_2, Y2, cv=15)
print('Cross-Validation Accuracy Scores', scores_log2_2)
scores_log2_2 = pd.Series(scores_log2_2)
scores_log2_2.min(), scores_log2_2.mean(), scores_log2_2.max()
_Accuracy now ranges between 0.5 and 1.0, averaging 0.9_
##### Extracting coefficients of specific metabolites
##### Create a list of original variable names from the training DataFrame
original_variables2_2 = X2_2.columns
##### Extract the coefficients of the logistic regression estimator
model_coefficients2_2 = logclassifier2_2.coef_[0]
##### Create a dataframe of the variables and coefficients & print it out
coefficient_df2_2 = pd.DataFrame({"Metabolite" : original_variables2_2, "Coefficient": model_coefficients2_2})
print(coefficient_df2_2)
##### Print out the top 3 positive variables
top_three_df2_2 = coefficient_df2_2.sort_values(by="Coefficient", axis=0, ascending=False)[0:3]
print(top_three_df2_2)
##### Print out the bottom 3 (most negative) variables
bottom_three_df2_2 = coefficient_df2_2.sort_values(by="Coefficient", axis=0, ascending=True)[0:3]
print(bottom_three_df2_2)