-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrandom_forest_using_qiime2.sh
164 lines (138 loc) · 6.54 KB
/
random_forest_using_qiime2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Supervised learning classifiers predict the categorical metadata classes of unlabeled samples by learning
# the composition of labeled training samples. For example, we may use a classifier to
# diagnose or predict disease susceptibility based on stool microbiome composition,
# or predict sample type as a function of the sequence variants, microbial taxa, or metabolites detected in a sample.
## USE THIS FOR RAREFIED TABLE ##
# Train and evaluate one random-forest classifier per metadata column on the
# feature table ggtable.qza. Each run writes its results to
# "<column>-classifier/" (diet-classifier/, LSdiet-classifier/).
#
# The LSdiet run is needed because later steps in this script read
# LSdiet-classifier/predictions.qza, probabilities.qza and
# feature_importance.qza, and no other command shown here produces them.
#
# Options shared by both runs:
#   --p-optimize-feature-selection  keep only the most predictive features
#   --p-parameter-tuning            tune the estimator's hyperparameters
#   --p-n-estimators 20             number of trees in the forest
#   --p-random-state 123            fixed seed so results are reproducible
for target_column in diet LSdiet; do
  qiime sample-classifier classify-samples \
    --i-table ggtable.qza \
    --m-metadata-file metadata_to_use.tsv \
    --m-metadata-column "${target_column}" \
    --p-optimize-feature-selection \
    --p-parameter-tuning \
    --p-estimator RandomForestClassifier \
    --p-n-estimators 20 \
    --p-random-state 123 \
    --output-dir "${target_column}-classifier"
done
# Species-level model: the same random-forest settings as above, but trained
# on the species-collapsed table (species-collapse.qza) to predict the LSdiet
# metadata column. Results are written to species-classifier/.
qiime sample-classifier classify-samples \
  --i-table 'species-collapse.qza' \
  --m-metadata-file 'metadata_to_use.tsv' \
  --m-metadata-column 'LSdiet' \
  --p-estimator 'RandomForestClassifier' \
  --p-n-estimators 20 \
  --p-random-state 123 \
  --p-parameter-tuning \
  --p-optimize-feature-selection \
  --output-dir 'species-classifier'
# First, let's check out accuracy_results.qzv in each classifier's output directory. It presents
# classification accuracy results in the form of a confusion matrix, as well as Receiver Operating
# Characteristic (ROC) curves.
# To view it, run e.g.: qiime tools view diet-classifier/accuracy_results.qzv
# This pipeline also reports the actual predictions made for each test sample
# in the predictions.qza output. This is a SampleData[ClassifierPredictions] artifact,
# which is viewable as metadata, so we can take a peek with metadata tabulate:
# Turn each model's per-sample predictions (predictions.qza) into a viewable
# table, written next to the artifact as predictions.qzv.
# Order: diet, LSdiet, species.
for model_dir in diet-classifier LSdiet-classifier species-classifier; do
  qiime metadata tabulate \
    --m-input-file "${model_dir}/predictions.qza" \
    --o-visualization "${model_dir}/predictions.qzv"
done
# In addition to the predicted class information, the model also reports the individual class probabilities
# in probabilities.qza. This is a SampleData[Probabilities] artifact, and is also viewable as metadata,
# so let's take a peek with metadata tabulate:
# Turn each model's per-class probabilities (probabilities.qza) into a
# viewable table, written next to the artifact as probabilities.qzv.
# Order: diet, LSdiet, species.
for model_dir in diet-classifier LSdiet-classifier species-classifier; do
  qiime metadata tabulate \
    --m-input-file "${model_dir}/probabilities.qza" \
    --o-visualization "${model_dir}/probabilities.qzv"
done
# Another really useful output of supervised learning methods is feature selection, i.e.,
# they report which features (e.g., ASVs or taxa) are most predictive.
# A list of all features, and their relative importances (or feature weights or model coefficients,
# depending on the learning model used), will be reported in feature_importance.qza
# Turn each model's feature importances (feature_importance.qza) into a
# viewable table, written next to the artifact as feature_importance.qzv.
# Order: diet, LSdiet, species.
for model_dir in diet-classifier LSdiet-classifier species-classifier; do
  qiime metadata tabulate \
    --m-input-file "${model_dir}/feature_importance.qza" \
    --o-visualization "${model_dir}/feature_importance.qzv"
done
# If --p-optimize-feature-selection is enabled, only the selected features (i.e., the most important features,
# which maximize model accuracy, as determined using recursive feature elimination) will be reported in this
# artifact, and all other results (e.g., model accuracy and predictions) that are output use the final,
# optimized model that utilizes this reduced feature set. This allows us to not only see which features
# are most important (and hence used by the model), but also use that information to filter out uninformative
# features from our feature table for other downstream analyses outside of q2-sample-classifier:
# Reduce each source table to the features listed in the matching model's
# feature_importance.qza, saving the result as
# "<model>-classifier/important-feature-table.qza".
# Order: diet, LSdiet, species.
for model_name in diet LSdiet species; do
  # The species model is filtered from the species-collapsed table; the diet
  # and LSdiet models are filtered from ggtable.qza.
  case "${model_name}" in
    species) source_table='species-collapse.qza' ;;
    *) source_table='ggtable.qza' ;;
  esac

  qiime feature-table filter-features \
    --i-table "${source_table}" \
    --m-metadata-file "${model_name}-classifier/feature_importance.qza" \
    --o-filtered-table "${model_name}-classifier/important-feature-table.qza"
done
# We can also use the heatmap pipeline to generate an abundance heatmap of the most important
# features in each sample or group. Let's make a heatmap of the abundance of the 30 most important
# features (--p-feature-count 30), with samples grouped by metadata class (--p-group-samples):
# diet model: abundance heatmap of the 30 most important features.
#   --p-group-samples      collapse samples into their "diet" groups
#   --p-cluster both       cluster both heatmap axes
#   feature metadata       label features with the Taxon column of
#                          new_taxonomy.tsv
# Also saves the 30-feature table alongside the heatmap.
qiime sample-classifier heatmap \
  --i-table 'ggtable.qza' \
  --i-importance 'diet-classifier/feature_importance.qza' \
  --m-sample-metadata-file 'metadata_to_use.tsv' \
  --m-sample-metadata-column 'diet' \
  --m-feature-metadata-file 'new_taxonomy.tsv' \
  --m-feature-metadata-column 'Taxon' \
  --p-feature-count 30 \
  --p-group-samples \
  --p-cluster both \
  --p-color-scheme RdYlBu_r \
  --o-filtered-table 'diet-classifier/important-feature-table-top-30.qza' \
  --o-heatmap 'diet-classifier/important-feature-heatmap.qzv'
# LSdiet model: abundance heatmap of the 30 most important features.
#   --p-group-samples      collapse samples into their "LSdiet" groups
#   --p-cluster both       cluster both heatmap axes
#   feature metadata       label features with the Taxon column of
#                          new_taxonomy.tsv
# Also saves the 30-feature table alongside the heatmap.
#
# Inputs are aligned with the rest of this script: the sample metadata is
# metadata_to_use.tsv (the file the classifiers read, and which the species
# run already uses for the LSdiet column) and the taxonomy is
# new_taxonomy.tsv (the file the diet heatmap pairs with ggtable.qza).
# This block previously pointed at metadata.tsv / taxonomy.tsv.
# NOTE(review): confirm those older files were not intended here.
qiime sample-classifier heatmap \
  --i-table ggtable.qza \
  --i-importance LSdiet-classifier/feature_importance.qza \
  --m-sample-metadata-file metadata_to_use.tsv \
  --m-sample-metadata-column LSdiet \
  --m-feature-metadata-file new_taxonomy.tsv \
  --m-feature-metadata-column Taxon \
  --p-cluster both \
  --p-group-samples \
  --p-feature-count 30 \
  --o-filtered-table LSdiet-classifier/important-feature-table-top-30.qza \
  --o-heatmap LSdiet-classifier/important-feature-heatmap.qzv \
  --p-color-scheme RdYlBu_r
# Species model: abundance heatmap of the 30 most important species-level
# features, with samples collapsed into their "LSdiet" groups.
# Unlike the ASV-level heatmaps, only the feature axis is clustered and no
# feature-metadata labels are supplied.
# NOTE(review): the filtered-table name has no "top-" infix, unlike the other
# heatmap outputs; it is kept unchanged so existing consumers of the file
# are unaffected.
qiime sample-classifier heatmap \
  --i-table 'species-collapse.qza' \
  --i-importance 'species-classifier/feature_importance.qza' \
  --m-sample-metadata-file 'metadata_to_use.tsv' \
  --m-sample-metadata-column 'LSdiet' \
  --p-feature-count 30 \
  --p-group-samples \
  --p-cluster features \
  --p-color-scheme RdYlBu_r \
  --o-filtered-table 'species-classifier/important-feature-table-30.qza' \
  --o-heatmap 'species-classifier/important-feature-heatmap.qzv'