-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMOFA.R
229 lines (202 loc) · 8.62 KB
/
MOFA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
library(data.table)
library(MOFA2)
library(ggplot2)
# import data as matrices
setwd("/media/saifeldeen/E43E57ED3E57B6F0/Research_Projects/Private/MultiOmics/Dr_Sally/DATA/merged/")
circRNA <- "/media/saifeldeen/E43E57ED3E57B6F0/Research_Projects/Private/MultiOmics/Dr_Sally/DATA/merged/circRNA.csv"
circRNA <- read.csv(circRNA, header = T, sep = ",", row.names = 1)
miRNA <- "/media/saifeldeen/E43E57ED3E57B6F0/Research_Projects/Private/MultiOmics/Dr_Sally/DATA/merged/miRNA.tsv"
miRNA <- read.csv(miRNA, header = T, sep = "\t", row.names = 1)
phosphorylation <- "/media/saifeldeen/E43E57ED3E57B6F0/Research_Projects/Private/MultiOmics/Dr_Sally/DATA/merged/phos.tsv"
phosphorylation <- read.csv(phosphorylation, header = T, sep = ",", row.names = 1)
proteome <- "/media/saifeldeen/E43E57ED3E57B6F0/Research_Projects/Private/MultiOmics/Dr_Sally/DATA/merged/prot.tsv"
proteome <- read.csv(proteome, header = T, sep = ",", row.names = 1)
transcriptome <- "/media/saifeldeen/E43E57ED3E57B6F0/Research_Projects/Private/MultiOmics/Dr_Sally/DATA/merged/Trans.tsv"
transcriptome <- read.csv(transcriptome, header = T, sep = ",", row.names = 1)
metaData <- "/media/saifeldeen/E43E57ED3E57B6F0/Research_Projects/Private/MultiOmics/Dr_Sally/DATA/merged/meta.csv"
metaData <- read.csv(metaData, header = T, sep = ",")
circRNA <- as.matrix(circRNA)
miRNA <- as.matrix(miRNA)
phosphorylation <- as.matrix(phosphorylation)
proteome <- as.matrix(proteome)
transcriptome <- as.matrix(transcriptome)
#data <- make_example_data(
# n_views = 2,
# n_samples = 200,
# n_features = 1000,
# n_factors = 10
#)[[1]]
#sample data
data <- list(circRNA,miRNA, phosphorylation, proteome,transcriptome)
#groups <- metaData$Category
multiOmocs <- create_mofa(data = data)
views_names(multiOmocs) <- c("circRNA","miRNA", "Phosphorylation", "Proteome","Transcriptome")
#if you haave groups (metaData)
# long data frame
#dt = fread("ftp://ftp.ebi.ac.uk/pub/databases/mofa/getting_started/data.txt.gz")
#head(dt)
# Let's ignore the grouping information to start
#dt[,group:=NULL]
#Create the MOFA object
#MOFAobject <- create_mofa(dt)
############################### Visualization #####################################
plot_data_overview(multiOmocs)
############################### Define options #######################################
# Define data options
#if groups/views have different ranges/variances, it is good practice to scale each group to unit variance. Default is FALSE
data_opts <- get_default_data_options(multiOmocs)
head(data_opts)
data_opts$scale_views <- TRUE
model_opts <- get_default_model_options(multiOmocs)
model_opts$num_factors <- 10
train_opts <- get_default_training_options(multiOmocs)
train_opts$convergence_mode <- "slow"
train_opts$maxiter <- 50000
################################### Build and train the MOFA object ##########################
#Prepare the MOFA object
MOFAobject <- prepare_mofa(
object = multiOmocs,
data_options = data_opts,
model_options = model_opts,
training_options = train_opts
)
#Train the MOFA model
outfile = file.path(getwd(),"model.hdf5")
MOFAobject.trained <- run_mofa(MOFAobject, outfile, use_basilisk = T)
#environment location: /home/saifeldeen/.cache/R/basilisk/1.10.2/0
# PREFIX=/home/saifeldeen/.cache/R/basilisk/1.10.2/0
################################ Down stream analysis ########################################
#load model
modelPath <- "model.hdf5"
model <- load_model(modelPath)
# overview
plot_data_overview(model)
# Add metadata to the model
#Nsamples = sum(model@dimensions$N)
#sample_metadata <- data.frame(
# sample = samples_names(model)[[1]],
# condition = sample(c("A","B"), size = Nsamples, replace = T),
# age = sample(1:100, size = Nsamples, replace = T)
#)
samples_metadata(model) <- metaData
head(model@samples_metadata, n=3)
slotNames(model) # Slots Name
names(MOFAobject@data)
dim(model@data)
names(model@expectations)
#################################### Plot Factor Correlation #################################
plot_factor_cor(model)
#################################### Variance decomposition ###################################
model@cache[["variance_explained"]] # variance in each view and group and factors
#plot varinace
plot_variance_explained(model, x="view", y="factor")
plot_variance_explained(model, x="group", y="factor", plot_total = T)
#Visualisation of single factors
plot_factor(model,
factor = 1:3,
color_by = "group1"
)
p <- plot_factor(model,
factors = c(1,2,3),
color_by = "group1",
dot_size = 3, # change dot size
dodge = T, # dodge points with different colors
legend = T, # remove legend
add_violin = T, # add violin plots,
violin_alpha = 0.25 # transparency of violin plots
)
# The output of plot_factor is a ggplot2 object that we can edit
p <- p +
scale_color_manual(values=c("Normal"="blue", "Tumor"="red")) +
scale_fill_manual(values=c("Normal"="blue", "Tumor"="red"))
print(p)
# Visualisation of combinations of factors
plot_factors(model,
factors = 1:3,
color_by = "group1"
)
# Visualisation of feature weights
plot_weights(model,
view = "Transcriptome",
factor = 1,
nfeatures = 10, # Number of features to highlight
scale = T, # Scale weights from -1 to 1
abs = F # Take the absolute value?
)
plot_top_weights(model,
view = "circRNA",
factor = 1,
nfeatures = 10
)
##################################### Visualisation of patterns in the input data ##########################
# heatmap
plot_data_heatmap(model,
view = "Proteome", # view of interest
factor = 1, # factor of interest
features = 20, # number of features to plot (they are selected by weight)
# extra arguments that are passed to the `pheatmap` function
cluster_rows = TRUE, cluster_cols = FALSE,
show_rownames = TRUE, show_colnames = FALSE
)
# Scatter plots
plot_data_scatter(model,
view = "Proteome", # view of interest
factor = 1, # factor of interest
features = 5, # number of features to plot (they are selected by weight)
add_lm = TRUE, # add linear regression
color_by = "group1"
)
# Non-linear dimensionality reduction
#Run UMAP and t-SNE
set.seed(42)
umap <- run_umap(model)
tsne <- run_tsne(model)
plot_dimred(model,
method = "TSNE", # method can be either "TSNE" or "UMAP"
color_by = "condition"
)
################################ Other functionalities ##########################
#Renaming dimensions
views_names(model) <- c("Transcriptomics", "Proteomics")
factors_names(model) <- paste("Factor", 1:model@dimensions[["K"]], sep=" ")
#Extracting data for downstream analysis
# "factors" is a list of matrices, one matrix per group with dimensions (nsamples, nfactors)
#Extract factors
factors <- get_factors(model, factors = "all")
lapply(factors,dim)
#Extract weights
# "weights" is a list of matrices, one matrix per view with dimensions (nfeatures, nfactors)
weights <- get_weights(model, views = "all", factors = "all")
lapply(weights,dim)
############################ Enrichment Analysis #####################################
library(data.table)
library(purrr)
library(ggplot2)
library(cowplot)
library(MOFA2)
library(MOFAdata)
#ReactomeGS (human):
data("reactomeGS")
head(rownames(reactomeGS), n=3)
#MSigDB 6.0 (human):
# C2: curated gene sets from online pathway databases, publications in PubMed, and knowledge of domain experts.
data("MSigDB_v6.0_C2_human")
# C5: extracted from the Gene Ontology data.base
data("MSigDB_v6.0_C5_human")
head(rownames(MSigDB_v6.0_C2_human), n=3)
#MSigDB 6.0 (mouse):
# C2: curated gene sets from online pathway databases, publications in PubMed, and knowledge of domain experts.
data("MSigDB_v6.0_C2_mouse")
# C5: extracted from the Gene Ontology data.base
data("MSigDB_v6.0_C5_mouse")
head(rownames(MSigDB_v6.0_C2_mouse), n=3)
model <- load_model("model.hdf5")
features_names(model)[["view_0"]] <- toupper(features_names(model)[["view_0"]])
head(features_names(model)[["view_0"]])
#runGSEA
enrichment.parametric <- run_enrichment(model,
view = "view_0", factors = 1:3,
feature.sets = MSigDB_v6.0_C5_mouse,
sign = "negative",
statistical.test = "parametric"
)