forked from resendejss/PreProcSEQ
-
Notifications
You must be signed in to change notification settings - Fork 0
/
annotationTranscripts_kallisto_matrixTximport.R
67 lines (48 loc) · 2.28 KB
/
annotationTranscripts_kallisto_matrixTximport.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
library(AnnotationHub)
library(GenomicFeatures)
library(ensembldb)
library(SummarizedExperiment)
setwd("~/PreProcSEQ-main/6-annotationTranscripts")
ah <- AnnotationHub()
edb <- query(ah, pattern = c("Homo sapiens", "EnsDb",106))[[1]]
gns <- genes(edb)
EnsDbAnnotation <- as.data.frame(gns)
EnsDbAnnotation <- EnsDbAnnotation[,c("gene_id","symbol","gene_biotype","entrezid")]
dim(EnsDbAnnotation)
colnames(EnsDbAnnotation) <- c("ensemblid","symbol","gene_biotype","entrezid")
load("../5-expressionMatrix/tximport/matrix_kallisto_tximport.RData")
matrix_expr <- mat_gse$abundance
nrow(matrix_expr)
rownames(matrix_expr)
rownames(matrix_expr) <- stringr::str_replace(rownames(matrix_expr), "\\...$", "")
rownames(matrix_expr) <- stringr::str_replace(rownames(matrix_expr), "\\..$", "")
all(rownames(matrix_expr)%in%rownames(EnsDbAnnotation))
rowAnnotation <- EnsDbAnnotation[rownames(matrix_expr),]
rowAnnotation <- data.frame(rowAnnotation, stringsAsFactors = F)
rownames(rowAnnotation) <- rowAnnotation$gene_id
idx <- match(rownames(matrix_expr),gns$gene_id)
rowRanges <- GRanges(seqnames = gns@seqnames[idx],
ranges = gns@ranges[idx],
strand = gns@strand[idx])
rownames(mat_gse$abundance) <- rownames(matrix_expr)
rownames(mat_gse$counts) <- rownames(matrix_expr)
rownames(mat_gse$length) <- rownames(matrix_expr)
dirquant <- "~/PreProcSEQ-main/4-quantification/kallisto/quant_kallisto"
files <- list.files(dirquant)
filesnames <- gsub("_quant","",files[-1])
colnames(mat_gse$abundance) <- filesnames
colnames(mat_gse$counts) <- filesnames
colnames(mat_gse$length) <- filesnames
coldata <- readxl::read_xlsx("../metadata.xlsx")
coldata <- as.data.frame(coldata)
rownames(coldata) <- coldata$Sample_id
colnames(mat_gse$abundance) <- rownames(coldata)
colnames(mat_gse$counts) <- rownames(coldata)
colnames(mat_gse$length) <- rownames(coldata)
mat_annot <- SummarizedExperiment(assays = list(counts=mat_gse$counts,
abundance=mat_gse$abundance,
length=mat_gse$length),
rowRanges = rowRanges,
colData = coldata)
rowData(mat_annot) <- rowAnnotation
save(mat_annot, file = "matrix_gse_kallisto_tximport_noted.RData")