forked from jhawe/bggm
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathlib.R
1552 lines (1350 loc) · 48.8 KB
/
lib.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# ------------------------------------------------------------------------------
#' Method to quickly set a vector of default plotting colors
#' to be used in all generated plots for consistency
#'
#' TODO: parameter for number of colors to get. Then probably
#' the palettes have to be switched accordingly
#'
#' @author Johann Hawe
# ------------------------------------------------------------------------------
get_defaultcolors <- function(n=5, name=c("rcb")) {
library(RColorBrewer)
# default to rcolorbrewer palette
cols <- brewer.pal(n, "Set2")
return(cols)
}
# ------------------------------------------------------------------------------
#' Get nodes by type
#'
#' Convenience function to retrieve nodes of a certain type
#' @param g graphNEL locus graph object
#' @param type character indicating the node type
#'
#' @return character vector with the nodes of given type
# ------------------------------------------------------------------------------
get_nodes_by_type <- function(g, type) {
require(graph)
n = nodes(g)
return(n[unlist(nodeData(g, n, type))])
}
# ------------------------------------------------------------------------------
#' Gets trans-associated CpGs for a sentinel SNP
#'
#' Get all trans cpgs for an individual sentinel SNP using a "cosmo"
#' object.
#'
#' @param sentinel id of the sentinel to analyze
#' @param trans.meQTL the pruned table of trans-associations
#' @param cosmo the cosmo object containing the individual associations
#' @param cpgs the probe ranges of the cpgs to check
#' @param cosmo.idxs flag whether to return the indices in the cosmo object rather than the ones
#' for the cpg list probe ranges.
#'
#' @author Matthias Heinig
#'
# ------------------------------------------------------------------------------
get.trans.cpgs <- function(sentinel, trans.meQTL, cosmo, cpgs=NULL, cosmo.idxs=F) {
pairs = which(trans.meQTL[,"sentinel.snp"] == sentinel)
trans.snp.ranges = GRanges(seqnames=paste("chr", trans.meQTL[,"chr.snp"], sep=""),
ranges=IRanges(trans.meQTL[,"interval.start.snp"],
trans.meQTL[,"interval.end.snp"]))
trans.cpg.ranges = GRanges(seqnames=paste("chr", trans.meQTL[,"chr.cpg"], sep=""),
ranges=IRanges(trans.meQTL[,"interval.start.cpg"],
trans.meQTL[,"interval.end.cpg"]))
pair.snps = GRanges(seqnames=paste("chr", cosmo[,"snp.chr"], sep=""),
ranges=IRanges(cosmo[,"snp.pos"], width=1))
pair.cpgs = GRanges(seqnames=paste("chr", cosmo[,"cpg.chr"], sep=""),
ranges=IRanges(cosmo[,"cpg.pos"], width=2))
pairs = pair.snps %over% trans.snp.ranges[pairs] &
pair.cpgs %over% trans.cpg.ranges[pairs]
if(is.null(cpgs) | cosmo.idxs) {
return(pairs)
} else {
is.meQTL = cpgs %over% pair.cpgs[pairs]
return(is.meQTL)
}
}
# ------------------------------------------------------------------------------
#' Get human gene annotation with symbols.
#'
#' This will get the gene annotation annotated with respective gene SYMBOL from UCSC for the hg19 build
#' TODO: allow other builds as well, currently this method is not very flexible...
#'
#' @param drop.nas Flags whether to drop any annotations where the SYMBOL turns out to be NA
#'
#' @return GRanges object, where names(obj) are entrez-ids, and obj$SYMBOLS contains respective gene symbols
#'
#' DEPRECATED
#' Use the new load_gene_annotation() function instead
# ------------------------------------------------------------------------------
get.gene.annotation <- function(drop.nas=TRUE, version=19) {
warning("DEPRECATED: use file based load_gene_annotation() function instead")
if(version == 19 | version == 37) {
library(Homo.sapiens)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb = TxDb.Hsapiens.UCSC.hg19.knownGene
ucsc2symbol = AnnotationDbi::select(Homo.sapiens, keys=keys(Homo.sapiens, keytype="GENEID"),
keytype="GENEID", columns="SYMBOL")
ga = genes(txdb)
ga$SYMBOL <- ucsc2symbol[match(values(ga)[,"gene_id"],
ucsc2symbol[,"GENEID"]),"SYMBOL"]
if(drop.nas){
ga <- ga[!is.na(ga$SYMBOL)]
}
} else if(version == 38) {
library(annotables)
grch38 <- data.table(grch38)
# grch38 <- grch38[grch38$biotype=="protein_coding",]
ga <- with(grch38,
GRanges(paste0("chr", chr),
IRanges(start, end), strand=strand, SYMBOL=symbol))
} else {
stop("Genome annotation version not supported.")
}
return(ga)
}
# ------------------------------------------------------------------------------
#' Load gene annotation from a GFF file
#'
#' @param fgene_annot The gene annotation file (GFF format). The method expects
#' to find gene_id, gene_name and gene_biotype in the attributes as well as
#' a single row per gene
#'
#' @author Johann Hawe <johann.hawe@helmholtz-muenchen.de>
# ------------------------------------------------------------------------------
load_gene_annotation <- function(fgene_annot) {
require(data.table)
require(GenomicRanges)
# load gene annotation
ga <- fread(fgene_annot)
# file format is: chr origin type start stop U strand U add_info
colnames(ga) <- c("chr", "origin", "type", "start", "stop", "score", "strand",
"frame", "info")
# extract ranges
ra <- with(ga, GRanges(chr, IRanges(start, stop), strand))
# extract the additional attributes and merge with ranges object
attrs <- strsplit(ga$info, ";")
gene_id <- sapply(attrs, function(x) { sapply(strsplit(x[grepl("gene_id",x)], " "), "[[", 2) })
gene_name <- sapply(attrs, function(x) { sapply(strsplit(x[grepl("gene_name",x)], " "), "[[", 3) })
gene_biotype <- sapply(attrs, function(x) { sapply(strsplit(x[grepl("gene_type",x)], " "), "[[", 3) })
# remove any lingering quotes
gene_id <- gsub("\"", "", gene_id)
gene_name <- gsub("\"", "", gene_name)
gene_biotype <- gsub("\"", "", gene_biotype)
# add to ranges object
names(ra) <- gene_id
ra$SYMBOL <- gene_name
ra$BIOTYPE <- gene_biotype
# finally, filter out 'misc_RNA' types and unusual chromosomes
ra <- ra[ra$BIOTYPE != "misc_RNA"]
ra <- keepStandardChromosomes(ra)
return(ra)
}
# ------------------------------------------------------------------------------
#'
#' Define a convenience function to get linear model p-values
#'
#' @param modelobject The linear model for which to get the p-value
#'
#' @return The p-value to the given linear model
#'
# ------------------------------------------------------------------------------
lmp <- function (modelobject) {
if (class(modelobject) != "lm") stop("Not an object of class 'lm' ")
f <- summary(modelobject)$fstatistic
p <- pf(f[1],f[2],f[3],lower.tail=F)
attributes(p) <- NULL
return(p)
}
# ------------------------------------------------------------------------------
#' Gets a single snp range from the gtex snp database
#'
#' @author Johann Hawe
#'
# ------------------------------------------------------------------------------
get.snp.range <- function(snp){
library(data.table)
snps <- fread(paste0("results/current/gtex-snp-locations.txt"))
snps <- snps[RS_ID_dbSNP142_CHG37p13 %in% snp,]
if(nrow(snps)<1){
warning("SNP not found on gtex db.\n")
return(NULL)
} else {
r <- with(snps,
GRanges(paste0("chr", Chr),
IRanges(as.integer(Pos),width=1)))
names(r) <- snps$RS_ID_dbSNP142_CHG37p13
return(r)
}
}
#' -----------------------------------------------------------------------------
#' General method to handle either CpG or TSS TFBS context
#'
#' @param entities IDs of CpGs or names of genes for which to get TFBS contet
#' @param fcontext The file containing the precalculated TFBS context. Must
#' match the entity types!
#'
#' @author Johann Hawe <johann.hawe@helmholtz-muenchen.de>
#'
#' -----------------------------------------------------------------------------
get_tfbs_context <- function(entities, fcontext) {
cont <- readRDS(fcontext)
tfbs_ann <- cont[rownames(cont) %in% entities,,drop=F]
if(nrow(tfbs_ann) < 1) return(tfbs_ann)
entities <- entities[entities %in% rownames(cont)]
return(tfbs_ann[entities,,drop=F])
}
#' -----------------------------------------------------------------------------
#' Annotates a regulatory graph with appropriate node and
#' edge attributes
#'
#' @param g The graph to be annotated
#' @param ranges The ranges to be used for the annotation. Contains
#' information on which genes are TFs, what the CpG genes are etc.
#' @param ppi_db graphNEL object containing the used PPI database
#' @param fcontext The CpG context file (cpgs annotated with TFBS)
#'
#' @return The same graph instance as given, but annotated with specific
#' node and edge attributes
#'
#' @author Johann Hawe
#'
#' -----------------------------------------------------------------------------
annotate.graph <- function(g, ranges, ppi_db, fcontext){
if(is.null(g)) {
warning("Graph g is null.")
return(NULL)
}
# work on all graph nodes
gn <- nodes(g)
# check whether we have CpGs
if(ranges$seed == "meqtl") {
nodeDataDefaults(g,"cpg") <- F
nodeData(g, gn, "cpg") <- grepl("^cg", gn)
nodeDataDefaults(g,"cpg.gene") <- F
nodeData(g, gn, "cpg.gene") <- gn %in% ranges$cpg_genes$SYMBOL
} else {
# eQTLs, we have trans.genes instead of CpGs
nodeDataDefaults(g,"trans.gene") <- F
nodeData(g, gn, "trans.gene") <- gn %in% ranges$trans_genes$SYMBOL
}
# add common nodeData
nodeDataDefaults(g,"snp") <- F
nodeData(g, gn, "snp") <- grepl("^rs", gn)
nodeDataDefaults(g,"snp.gene") <- F
nodeData(g, gn, "snp.gene") <- gn %in% ranges$snp_genes$SYMBOL
nodeDataDefaults(g, "tf") <- F
nodeData(g, gn, "tf") <- gn %in% ranges$tfs$SYMBOL
nodeDataDefaults(g, "sp.gene") <- F
nodeData(g, gn, "sp.gene") <- gn %in% ranges$spath$SYMBOL
# edge data information
edgeDataDefaults(g, "isChipSeq") <- FALSE
edgeDataDefaults(g, "isPPI") <- FALSE
# add TFBS information
tfs <- ranges$tfs$SYMBOL
if(ranges$seed == "meqtl") {
context <- get_tfbs_context(names(ranges$cpgs), fcontext)
} else {
genes <- unique(c(ranges$trans_genes$SYMBOL, ranges$cis_gene$SYMBOL,
ranges$tfs$SYMBOL, ranges$snp_genes$SYMBOL,
ranges$spath$SYMBOL))
context <- get_tfbs_context(genes, fcontext)
}
em <- matrix(ncol=2,nrow=0)
# for all entities
for(ent in rownames(context)){
for(tf in tfs) {
# might be that the TF was measured in more than one cell line
if(any(context[ent,grepl(tf, colnames(context))]>0)) {
em <- rbind(em,c(ent,tf))
}
}
}
em <- filter.edge.matrix(g, em)
if(nrow(em) > 0){
edgeData(g, em[,1], em[,2], "isChipSeq") <- T
}
# ppi edgedata
# get subset of edges which are in our current graph
ss <- subGraph(intersect(nodes(ppi_db), gn), ppi_db)
edges <- t(edgeMatrix(ss))
edges <- cbind(gn[edges[,1]], gn[edges[,2]])
edges <- filter.edge.matrix(g, edges)
if(nrow(edges) > 0){
edgeData(g,edges[,1], edges[,2],"isPPI") <- T
}
return(g)
}
#' -----------------------------------------------------------------------------
#' Plot a GGM result graph
#'
#' Plots the a built graph (estimated from the sentinel data) using the
#' twopi-visualization. Can be used to retrieve only the plot graph/node/edge
#' attributes when using pdf.out=NULL, dot.out=NULL and plot.on.device=F
#'
#' @param graph: Graph to be plotted (graphNEL)
#'
#' @param id The Id of the sentinel
#' @param dot.out File to which to write the graph in dot format
#'
#' @return List of plot graph attributes and the underlying graph structure
#'
#' @author Johann Hawe
#'
#' -----------------------------------------------------------------------------
plot_ggm <- function(g, id, graph.title=id,
plot.on.device=T,
dot.out=NULL, ...){
require(igraph)
require(graph)
require(Rgraphviz)
# get some default colors to be used here
cols <- get_defaultcolors(n=8)
# remove any unconnected nodes (primarily for bdgraph result, since
# such nodes are already removed for genenet)
if(any(graph::degree(g) == 0)){
g <- removeNode(names(which(graph::degree(g) == 0)), g)
}
# we need an id...
if(is.null(id)){
stop("Sentinel ID must not be NULL.")
}
# add sentinel to network if it is not in there yet or it has been removed
if(!is.null(id) && !(id %in% nodes(g))) {
g <- graph::addNode(c(id), g)
}
n <- graph::nodes(g)
# now get the cluster which contains our sentinel
# ig = graph_from_graphnel(g)
# cl = clusters(ig)
# sentinel_cluster <- cl$membership[names(cl$membership) == id]
# keep = nodes(g)[cl$membership == sentinel_cluster]
# g = subGraph(keep, g)
# the remaining nodes
# n <- keep
# get trans and cpg gene symbols
snp.genes <- n[unlist(nodeData(g,n,"snp.gene"))]
# this is only a quick hack: we plot cpg.genes/trans.genes from the meqtl/eqtl
# based analyses the same
if("cpg.gene" %in% names(nodeData(g, nodes(g)[1])[[1]])) {
assoc_genes <- n[unlist(nodeData(g,n,"cpg.gene"))]
} else if("trans.gene" %in% names(nodeData(g, nodes(g)[1])[[1]])){
assoc_genes <- n[unlist(nodeData(g,n,"trans.gene"))]
}
tfs <- n[unlist(nodeData(g,n,"tf"))]
# prepare plot-layout
attrs <- list(node=list(fixedsize=TRUE, fontsize=14,
style="filled", fontname="helvetica"),
graph=list(overlap="false", root=id[1], outputorder="edgesfirst",
label=graph.title, labelloc="top", labeljust="right"))
shape = rep("ellipse", numNodes(g))
names(shape) = n
shape[grep("^cg", n)] = "box"
shape[grep(id, n)] = "box"
width = rep(0.8, numNodes(g))
names(width) = n
width[grep("cg", n)] = 0.4
height = rep(0.3, numNodes(g))
names(height) = n
height[grep("cg", n)] = 0.4
label = n
names(label) = n
label[grep("cg", n)] = ""
style <- rep("filled", numNodes(g))
names(style) <- n
col = rep("#ffffff", numNodes(g))
names(col) = n
col[grep(id, n)] = cols[1]
col[grep("^cg", n)] = cols[2]
col[snp.genes] = cols[4]
col[assoc_genes] = cols[5]
if(!is.null(tfs)){
col[tfs] = cols[3]
tf_cis <- intersect(tfs, snp.genes)
if(length(tf_cis) > 0) {
# set two colors since both TF and snp gene
col[tf_cis] <- cols[6]
}
}
penwidth = rep(1, numNodes(g))
names(penwidth) = n
penwidth[snp.genes] = 3
penwidth[assoc_genes] = 3
if(!is.null(tfs)){
penwidth[tfs] = 3
}
bordercol = rep("black", numNodes(g));
names(bordercol) = n;
bordercol[assoc_genes] = "#e4d7bc";
bordercol[id] = "#ffe30f";
nAttrs = list(shape=shape, label=label, style=style, width=width,
height=height, penwidth=penwidth, fillcolor=col,
color=bordercol)
# default color for edges: black
ecol = rep("black", numEdges(g))
names(ecol) = edgeNames(g)
for(i in snp.genes) {
# color any edge from a SNP to one of our snp genes red
ecol[grepl("^rs|~rs", names(ecol)) & grepl(i, names(ecol))] = "#b3cde2"
}
# set also color for cpgs
for(cg in assoc_genes){
# color any edge from a cpg to one of its cpg genes blue (proximity edges)
ecol[grepl("^cg|~cg", names(ecol)) & grepl(cg, names(ecol))] = "#b3cde2"
}
# check edgeData and add to colors
for(edge in names(ecol)){
n1 <- strsplit(edge,"~")[[1]][1]
n2 <- strsplit(edge,"~")[[1]][2]
if(unlist(graph::edgeData(g,n1,n2, "isPPI"))){
ecol[edge] <- "#decae3"
}
if(unlist(graph::edgeData(g,n1,n2, "isChipSeq"))){
ecol[edge] <- "#ccebc5"
}
}
dir = rep("none", numEdges(g))
names(dir) = edgeNames(g)
eAttrs = list(color=ecol, dir=dir)
if(numEdges(g)>60000){
warning("Skipping plotting on device due to large amount of edges")
} else if(plot.on.device) {
plot(g, "twopi", nodeAttrs=nAttrs, edgeAttrs=eAttrs, attrs=attrs, ...)
# start with empty plot, then add legend
plot(NULL ,xaxt='n',yaxt='n',bty='n',ylab='',xlab='', xlim=0:1, ylim=0:1)
legend("left", legend = c("SNP","SNP gene","TF","assoc gene", "SNP gene/TF"),
pch=16, pt.cex=3, cex=1.5, bty='n',
col = c(cols[1], cols[4], cols[3], cols[5], cols[6]), title="graph legend")
}
if(!is.null(dot.out)){
# output the dot-information
# toDot(graph, dot.out, nodeAttrs=nAttrs, edgeAttrs=eAttrs, attrs=attrs, subGList=sgs)
toDot(g, dot.out, nodeAttrs=nAttrs, edgeAttrs=eAttrs, attrs=attrs)
}
# return the list of created plotattributes and the possibly modified graph
# object
return(list(graph=g, nodeAttrs=nAttrs, edgeAttrs=eAttrs, attrs=attrs))
}
#' -----------------------------------------------------------------------------
#' Method to quickly filter an edge matrix for only those edges, which are within
#' a specified graph
#'
#' @author Johann Hawe
#'
#' @date 2017/03/13
#'
#' -----------------------------------------------------------------------------
filter.edge.matrix <- function(g, em){
if(!is.matrix(em)){
stop("Provided edge matrix is not an actual matrix.")
}
if(nrow(em)<1){
return(em)
}
e <- graph::edges(g)
out <- matrix(ncol=2,nrow=0)
for(i in 1:nrow(em)){
e1 <- em[i,1]
e2 <- em[i,2]
if(e1 %in% names(e)){
if(e2 %in% e[[e1]]){
out <- rbind(out,c(e1,e2))
}
}
}
return(out)
}
#' -----------------------------------------------------------------------------
#' Quantile normalization
#'
#' @param x ngenes x nsamples matrix to be normalized
#' @return quantile normalized matrix
#' @export
#' -----------------------------------------------------------------------------
normalize.quantile <- function(x) {
x = as.matrix(x)
o = apply(x, 2, order)
xsort = x
for (col in 1:ncol(x)) {
xsort[,col] = x[o[,col], col]
}
means = apply(xsort, 1, mean)
normalized = matrix(NA, nrow=nrow(x), ncol=ncol(x), dimnames=dimnames(x))
for (col in 1:ncol(x)) {
normalized[o[,col], col] = means
}
return(normalized)
}
#' Annotate positions with their epigenetic states
#'
#' @param cpg.ranges GRanges object with the positions to annotate
#' @param ids character vector with the roadmap epigenome ids to use
#' @param dir the directory where chromHMM files are stored
#'
#' @return character matrix with length(cpg.ranges) rows and length(ids) columns
#' with the state annotation of the range in each epigenome
#'
#' @export
chromHMM.annotation <- function(cpg.ranges,
ids,
dir="data/current/roadmap/chromHMM/15state/",
suffix="_15_coreMarks_mnemonics.bed.bgz") {
require(Rsamtools)
annotation = sapply(ids, function(id) {
file = paste0(dir, id, suffix)
avail = as.logical(seqnames(cpg.ranges) %in% seqnamesTabix(file))
avail.ann = scanTabix(file, param=cpg.ranges[avail])
avail.ann = sapply(avail.ann, function(x) strsplit(x, "\t")[[1]][4])
ann = rep(NA, length(cpg.ranges))
ann[avail] = avail.ann
return(ann)
})
colnames(annotation) = ids
rownames(annotation) = names(cpg.ranges)
return(annotation)
}
#' For a set of symbols, gets the emsembl-ids of the corresponding genes from
#' the AnnotationHub.
#' Note: Currently only implemented for human genes
#'
#' @return A data.frame containng the SYMBOL in one column and the ensemble
#' id in another column
#'
#' @author Johann Hawe
#'
#' @date 2017/03/27
#'
ensembl.from.symbol <- function(symbols, na.drop=T, org="hs"){
result <- NULL
if("hs" %in% org){
library(org.Hs.eg.db)
hs <- org.Hs.eg.db
result <- select(hs,
keytype="SYMBOL",
keys=c(symbols),
columns=c("SYMBOL", "ENSEMBL"))
} else {
warning("Organism not supported yet: ", org)
}
if(na.drop){
result <- result[!is.na(result$ENSEMBL),,drop=F]
}
return(result)
}
#' For a set of ensemble ids, gets the symbols of the corresponding genes from
#' the AnnotationHub.
#' Note: Currently only implemented for human genes
#'
#' @return A data.frame containng the SYMBOL in one column and the ENSEMBLE
#' id in another column
#'
#' @author Johann Hawe
#'
#' @date 2017/03/28
#'
symbol.from.ensembl <- function(ensemblIDs, na.drop=T, org="hs"){
result <- NULL
if("hs" %in% org){
library(org.Hs.eg.db)
hs <- org.Hs.eg.db
result <- AnnotationDbi::select(hs,
keytype="ENSEMBL",
keys=c(ensemblIDs),
columns=c("SYMBOL", "ENSEMBL"))
} else {
warning("Organism not supported yet: ", org)
}
if(na.drop){
result <- result[!is.na(result$ENSEMBL),,drop=F]
}
# only get the first match for each id
return(result[!duplicated(result$ENSEMBL),])
}
#' Plot a simple heatmap
#'
#' Uses 'pheatmap' package to plot a very simple heatmap.
#'
#' @param x The data matrix for which to plot the heatmap
#' @param cannot vector of column annotations (must have same names as colnames
#' of x)
#' @param cluster Flag whether to cluster the data in the heatmap or not. default:F
#' @param cors Flag whether correlations are contained in the input matrix x (to
#' appropriately choose the color palette)
#'
#' @return Grob object returned by pheatmap in the 4th position
#'
simple.heatmap <- function(x, cors=F, cannot=NA, cluster=F) {
library(pheatmap)
library(RColorBrewer)
if(cors){
breaksList = seq(-1, 1, by = 0.1)
} else {
breaksList = seq(min(x), max(x), by = 0.1)
}
cols <- rev(brewer.pal(n = 7, name = "RdYlBu"))
cols <- colorRampPalette(cols)(length(breaksList))
return(pheatmap(x, annotation_col=cannot,
cluster_rows=cluster,
cluster_cols=cluster,
cex=0.7, color = cols, breaks = breaksList)[[4]])
}
#' For a list of symbols, gets all annotated array ids from the
#' illuminaHumanV3.db
#'
#' @param symbols List of symbols for which to get ids
#' @param mapping Flag whether to return a mapping ID->SYMBOL (default: F)
#' @param as_list Flag whether to return result as list or vector in
#' case mapping=F. (default:F)
#'
#' @author Johann Hawe
#'
probes.from.symbols <- function(symbols, mapping=F, as_list=F, annot=NULL) {
if(is.null(annot)) {
library(illuminaHumanv3.db)
annot <- as.list(illuminaHumanv3SYMBOLREANNOTATED)
annot <- annot[!is.na(annot)]
}
annot <- annot[which(annot %in% symbols)]
annot <- unlist(annot)
if(length(annot) > 0){
if(mapping) {
probes <- names(annot)
uprobes <- unique(probes)
if(length(uprobes) != length(probes)){
dprobes <- unname(probes[duplicated(probes)])
warning(paste0("Caution: Some probes had more than one gene annotated,
using only one symbol for those probes:\n",
dprobes))
}
map <- data.matrix(cbind(names(annot), annot))
map <- map[!is.na(map[,2]),,drop=F]
colnames(map) <- c("id", "symbol")
dropped <- length(symbols) - nrow(map)
if(dropped>0) {
cat("Dropped", dropped, "symbols since they had no probe.id available.\n")
}
return(map)
}
if(as_list) {
tmp <- sapply(symbols, function(s) { names(annot[annot %in% s])})
names(tmp) <- symbols
return(tmp)
} else {
return(unique(names(annot[annot %in% symbols])))
}
} else {
return(NULL)
}
}
#' Creates the graph at which to start the BDgraph algorithm at
#'
#' Since it is an MCMC algorithm, starting with this graph will make it more
#' likely to start not too far off of the target distribution/graph.
#' The creation of the start graph is based on the STRING database.
#' Additionally, predefined edges will be added to the graph.
#'
#' @param nodes All nodes to be incorporated in the graph
#' @param ranges A set of ranges from which cpg and cpg.genes can inferred
#'
#' @return An incidence matrix of size length(nodes)^2 where 1s indicated edges
#' and 0s indicated absence of edges
#'
#' @author Johann Hawe
#'
get.g.start <- function(nodes, ranges){
library(graph)
genes <- nodes[!grepl("^cg|^rs", nodes)]
# create output matrix
p <- length(nodes)
out <- matrix(0, nrow=p, ncol=p)
rownames(out) <- colnames(out) <- nodes
# add STRING connections
STRING_SUB <- subGraph(intersect(nodes(STRING_DB), genes), STRING_DB)
sn <- nodes(STRING_SUB)
em <- t(edgeMatrix(STRING_SUB))
em <- cbind(sn[em[,1]],sn[em[,2]])
out[em[,1],em[,2]] <- 1
out[em[,2],em[,1]] <- 1
# create edge matrix for all cpg.gene-cpg edges to be added
# to g.start
em <- matrix(ncol=2,nrow=0)
cpg.genes <- ranges$cpg_genes
cpg.genes <- cpg.genes[cpg.genes$SYMBOL %in% nodes]
cpgs <- (ranges$cpgs)
cpgs <- cpgs[names(cpgs) %in% nodes]
for(i in 1:length(cpgs)){
cpg <- cpgs[i]
g <- get.nearby.ranges(cpgs[i],cpg.genes)[[1]]
for(row in 1:length(g)){
em <- rbind(em,
c(names(cpg),
g[row]$SYMBOL))
}
}
out[em[,1],em[,2]] <- 1
out[em[,2],em[,1]] <- 1
return(out)
}
#' Similar to get.g.start, but simply uses the prior matrix and extracts all
#' pairs with prior>0.5 to create the g.start
#'
#' @param priors Matrix of prior values for all possible node edges
#' @param scaled Do we use a scaled (between 0.5 and 1) version of the priors?
#' Default: FALSE
#'
#' @author Johann Hawe
#'
get_gstart_from_priors <- function(priors, scaled=FALSE){
out <- priors
if(scaled) {
idx <- out>0.75
} else {
idx <- out>0.5
}
out[idx] <- 1
out[!idx] <- 0
return(out)
}
#' -----------------------------------------------------------------------------
#' Normalize external (geuvadis, gtex) expression data
#'
#' @param data The expression matrix to normalize, expecting
#' log-transformed RPKM values with individuals in the rows and
#' probes/genes in the columns
#'
#' @return The normalized expression matrix
#'
#' @author Johann Hawe
#'
#' -----------------------------------------------------------------------------
normalize.expression <- function(data) {
library(preprocessCore)
library(sva)
# quantile normalize
scaled = normalize.quantiles.robust(t(data))
rownames(scaled) <- colnames(data)
colnames(scaled) <- rownames(data)
# transform the scaled counts to std normal per gene
stdnorm <- function(x) {
r = rank(x, ties.method="random")
qnorm(r / (length(x) + 1))
}
# gets p x n matrix
transformed <- apply(scaled, 1, stdnorm)
pca <- prcomp(transformed)$x[,1:10]
# remove first 10 pcs
corrected <- resid(lm(transformed ~ pca))
return(corrected)
}
#' Calculate peer factors for given data and covariates
#' Then get residual data matrix
#'
#' @param data nxg matrix (n=samples, g=genes/variables)
#' @param covariates nxc matrix (n=samples, c=covariates)
#' @param get.residuals Flag whether to directly return the residuals
#' calculated on the data matrix instead of the peer factors calculated
#' @param Nk Number of factors to estimate. Default: N/4
#'
#' @return Matrix of corrected expression data
#'
#' @author Johann Hawe
#'
#' @date 20170328
#'
correct.peer <- function(data,
covariates=NULL,
Nk=ceiling(nrow(data)*0.25)) {
# create model
model <- PEER();
# input has to be a matrix!
PEER_setPhenoMean(model, as.matrix(data));
# add the mean estimation as default since it is recommended in the tutorial
# of peer. will return Nk+1 factors
PEER_setAdd_mean(model, TRUE)
# set number of hidden factors to identify. If unknown, a good measure is N/4 (see howto)
PEER_setNk(model, Nk);
# should not be neccessary but increase anyways
PEER_setNmax_iterations(model, 5000);
if(!is.null(covariates)){
# set matrix of known and important covariates,
# since we want to acknowledge their effect
PEER_setCovariates(model, as.matrix(covariates));
}
# learn
PEER_update(model);
re <- PEER_getResiduals(model)
colnames(re) <- colnames(data)
rownames(re) <- rownames(data)
return(re)
}
#' Small helper for a list() function automatically setting
#' the variable names as the names of the created list
#'
#' compare: https://stackoverflow.com/a/21059868
#'
#' @author Johann Hawe
#'
listN <- function(...){
ln <- list(...)
names(ln) <- as.character(substitute(list(...)))[-1]
ln
}
#' Augment grahs
#'
#' Add TF to CpG edges and trans gene to SNP edges to graphs. This function
#' assumes that you have an object tfbs.ann in the environment. It is not
#' passed as argument (bad style) becuase it is too big (passing by value).
#'
#' @param graphs list of graph objects to add nodes and edges to
#' @param sentinel character id of the sentinel SNP
#' @param trans.genes character vector of genes in the trans locus
#' @param trans.cpgs character vector of CpGs with trans meQTLs
#' @param tfbs.ann logical indicator matrix with CpGs in the rows and
#' transcription factors as columns
#'
#' @return list of graph objects with the nodes and edges added
#'
#' @author Matthias Heinig
#'
#' @imports reshape
#' @imports graph
#' @export
add.to.graphs <- function(graphs, sentinel, trans.genes, trans.cpgs, tfbs.ann) {
require(reshape)
require(graph)
tf.edges = melt(tfbs.ann[trans.cpgs,])
colnames(tf.edges) = c("cpg", "condition", "adjacent")
tf.edges = tf.edges[tf.edges[,"adjacent"],]
for (i in 1:2) {
tf.edges[,i] = as.character(tf.edges[,i])
}
tf = as.character(sapply(strsplit(tf.edges[,"condition"], ".", fixed=T),
"[", 1))
tf.edges = data.frame(tf.edges, tf, stringsAsFactors=F)
tf.edges = tf.edges[!duplicated(paste(tf.edges[,"tf"], tf.edges[,"cpg"])),]
## put together the graphs
out = list()
for (graph.idx in 1:length(graphs)) {
locus.graph = graphs[[graph.idx]]
## filter for TFs that are in the graph already
use.tf.edges = tf.edges[tf.edges[,"tf"] %in% nodes(locus.graph),]
new.nodes = unique(c(use.tf.edges[,"tf"], trans.cpgs,
sentinel, trans.genes))
new.nodes = setdiff(new.nodes, nodes(locus.graph))
locus.graph = graph::addNode(new.nodes, locus.graph)
## also add some meta data (the type of nodes)
nodeDataDefaults(locus.graph, "tf") = FALSE
nodeData(locus.graph, unique(use.tf.edges[,"tf"]), "tf") = TRUE
nodeDataDefaults(locus.graph, "cpg") = FALSE
nodeData(locus.graph, trans.cpgs, "cpg") = TRUE
nodeDataDefaults(locus.graph, "snp") = FALSE
nodeData(locus.graph, sentinel, "snp") = TRUE
nodeDataDefaults(locus.graph, "trans.gene") = FALSE
nodeData(locus.graph, trans.genes, "trans.gene") = TRUE
## add edges for the tfbs
locus.graph = addEdge(use.tf.edges[,"tf"],
use.tf.edges[,"cpg"], locus.graph)
## add edges for the connection of the locus and its genes
locus.graph = addEdge(rep(sentinel, length(trans.genes)),
trans.genes, locus.graph)
out[[graph.idx]] = locus.graph
}
return(out)
}
## this function is from destiny and uses the arpack function
eig.decomp <- function(M, n.eigs, sym) {
n = nrow(M)
f <- function(x, extra=NULL) as.matrix(M %*% x)
wh <- if (sym) 'LA' else 'LM'
## constraints: n >= ncv > nev
ar <- arpack(f, sym = sym, options = list(
which = wh, n = n, ncv = min(n, 4*n.eigs), nev = n.eigs + 1))
if (!sym) {
ar$vectors <- Re(ar$vectors)
ar$values <- Re(ar$values)
}
## check for negative values and flip signs
neg = which(ar$values < 0)
for (n in neg) {
ar$values[n] = -ar$values[n]