-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path2_Optimized _annotation_assembler.R
206 lines (151 loc) · 10.1 KB
/
2_Optimized _annotation_assembler.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
###############################################
#### 2_Automated genome reference assembly ####
###############################################
#### Input data and statement of purpose ####
#############################################
# 0. Load data and libraries
# - genome annotation file (.gtf) to be optimized
# - "overlapping_gene_list.csv" file specifying how to resolve gene overlap derived issues. "Delete" entries in $final_classification field mark genes for deletion. Transcript names in $transcripts_for_deletion mark specific transcripts for deletion.
# - "gene_extension_candidates.csv" specifying updated gene boundaries for incorporating intergenic reads
# - "rename_genes.csv" specifying gene names to be replaced and new names (under $old_names and $new_names fields, respectively)
# 1. Resolve "self-overlapping" gene (duplicate gene_ids) derived issues. Required for compatibility with multiomic datasets.
# 2. Create pre-mRNA genome annotation from input genome annotation. This step extracts all transcript entries from the genome annotation and defines them as full length exons with new transcript IDs and corresponding transcripts. This allows to capture many intronically mapped reads that otherwise get discarded.
# 3. Gene deletion step: Deletes all annotation entries for genes destined for deletion (has "Delete" entry in $final_classification field of "overlapping_gene_list.csv"
# 4. Transcript deletion step: Deletes all transcripts destined for deletion (transcript names listed in the "transcripts_for_deletion" column in ""overlapping_gene_list.csv"
# 5. Gene coordinate adjustment step: replace the left most or right most coordinate of the first exon of a gene in genome annotation if there is a coordinate in columns $new_left or $new_right in the "gene_extension_candidates.csv".
# 6. Add pre-mRNA reads to all genes not in the gene overlap list.
# 7. Rename genes to avoid discarding expression data with near perfect terminal exon overlap.
# 8. Save the optimized genome annotation in a new gtf file
#### 0. Load libraries and import data ####
###########################################
install.packages("BiocManager")
install.packages("gdata")
install.packages("stringr")
BiocManager::install("rtracklayer")
BiocManager::install("GenomicRanges")
library("rtracklayer")
library("gdata")
library("stringr")
library("GenomicRanges")
exonic_gtf<- import(con = 'genes.gtf', format = "gtf")
exonic_df = as.data.frame(exonic_gtf)
overlap_df = read.csv("overlapping_gene_list.csv", header=T)
boundary_fix = read.csv("gene_extension_candidates.csv", header=T)
rename_genes = read.csv("rename_genes.csv", header=T)
#### 1. Resolve "self-overlapping" gene (duplicate gene_ids) derived issues ####
################################################################################
length(unique(exonic_df$gene_name)) # number of all genes
length(unique(exonic_df$gene_id)) # number of all gene_id-s
## Which genes are self-overlapping
all_genes = unique(exonic_df$gene_name)
genes_of_interest = rep(0, length(all_genes))
for (i in 1:length(all_genes)){
genes_of_interest[i] = length(unique(exonic_df$gene_id[exonic_df$gene_name == all_genes[i]]))
}
selector = genes_of_interest>1
self_overlappers = all_genes[selector]
## Initiate the final dataframe for modification
new_df = exonic_df
rm(exonic_df)
#### 2. Create premRNA genome annotation from input gtf that defines transcripts as exons ####
###############################################################################################
# Note: several methods are available for recovering intronic reads from scRNA-seq data. These
# include the early approach of redefining all genes/transcripts as exons, which enables
# incorporation of intronic reads to downstream analysis. This approach works well but has
# several problems including poorer performance at mapping spliced reads (annotation loses
# splicing boundaries) and eliminates hundreds of genes due to gene overlaps. There are also
# specific modes in the latest versions of sequence aligners (e.g. --include-introns parameter
# in Cell Ranger 6) for incorporating intronic reads that work better. However, we found
# that the latter systematically fails to retrieve intronic reads for many genes that the
# traditional premrna approach performs well on. We therefore adopted a hybrid pre-mRNA
# reference approach where we supplement normal gene annotation entries by traditional
# pre-mRNA entries where transcripts have been redefined as exons and map in the
# --include-introns mode in Cell Ranger 6 or default mode in later ierations to retrieve
# most of available intronic reads.
transcripts_df = new_df[new_df$type == "transcript",]
exons_df = transcripts_df # Create new dataframe to contain premrna exons
exons_df$type = rep("exon", nrow(exons_df)) # rename "type" from transcripts to exon
premrna_df = interleave(transcripts_df, exons_df) # interleave transript entries with exon entries
premrna_df$transcript_id = gsub("000000", "100000", premrna_df$transcript_id)
premrna_df$transcript_id = gsub("000001", "110001", premrna_df$transcript_id)
premrna_df$transcript_id = gsub("000002", "110002", premrna_df$transcript_id)
premrna_df$transcript_id = gsub("000003", "110003", premrna_df$transcript_id)
premrna_df$transcript_id = gsub("000004", "110004", premrna_df$transcript_id)
premrna_df$transcript_id = gsub("000005", "110005", premrna_df$transcript_id)
premrna_df$transcript_id = gsub("000006", "110006", premrna_df$transcript_id)
premrna_df$transcript_id = gsub("000007", "110007", premrna_df$transcript_id)
premrna_df$transcript_id = gsub("000008", "110008", premrna_df$transcript_id)
premrna_df$transcript_id = gsub("000009", "110009", premrna_df$transcript_id)
#### 3. Delete select genes ####
#################################
genes_to_delete = overlap_df$genes[overlap_df$final_classification == "Delete"]
new_df = new_df[!new_df$gene_name %in% genes_to_delete,]
#### 4. Delete select transcripts ####
#######################################
transcripts_to_delete = overlap_df$transcripts_for_deletion
transcripts_to_delete <- transcripts_to_delete[transcripts_to_delete!=""]
transcripts_to_delete_final = transcripts_to_delete[!str_detect(transcripts_to_delete, ", ")]
for (i in 1:length(transcripts_to_delete)){
a = transcripts_to_delete[i]
if (str_detect(a, ", ")){
split_elements <- unlist(str_split(a, ", "))
transcripts_to_delete_final = c(transcripts_to_delete_final, split_elements)
}
}
transcripts_to_delete = transcripts_to_delete_final
new_df = new_df[!new_df$transcript_name %in% transcripts_to_delete,]
#### 5. Adjust gene coordinates ####
#####################################
left_genes = as.data.frame(cbind(boundary_fix$genes[!is.na(boundary_fix$update_start)], boundary_fix$update_start[!is.na(boundary_fix$update_start)]))
colnames(left_genes) = c("genes", "update_start")
left_genes$update_start = as.numeric(left_genes$update_start)
right_genes = as.data.frame(cbind(boundary_fix$genes[!is.na(boundary_fix$update_end)], boundary_fix$update_end[!is.na(boundary_fix$update_end)]))
colnames(right_genes) = c("genes", "update_end")
right_genes$update_end = as.numeric(right_genes$update_end)
left_exon_difs = rep(0, length(left_genes)) # for troubleshooting
right_exon_difs = rep(0, length(right_genes))
for (i in 1:dim(left_genes)[1]){
gene_entries = which(new_df$gene_name == left_genes[i, 1])
type_entries = new_df$type[gene_entries]
first_gene_exon = head(gene_entries[type_entries == "exon"], 1)
new_df[first_gene_exon, 2] = left_genes[i, 2]
left_exon_difs[i] = new_df[first_gene_exon, 3] - new_df[first_gene_exon, 2]
}
for (i in 1:dim(right_genes)[1]){
gene_entries = which(new_df$gene_name == right_genes[i, 1])
type_entries = new_df$type[gene_entries]
last_gene_exon = tail(gene_entries[type_entries == "exon"], 1)
new_df[last_gene_exon, 3] = right_genes[i, 2]
right_exon_difs[i] = new_df[last_gene_exon, 3] - new_df[last_gene_exon, 2]
}
#### 6. Add pre-mRNA transcripts to genes not in the gene overlap list ####
############################################################################
# Explanation: Cellranger --include-introns mode unfortunately does not pick up on many intronic reads (unclear why despite lengthy correspondence with their support). I can pick those up however if I add the pre-mRNA transcripts to respective genes as exons with new transcript_id values.
## Genes to modify
# Note that all genes in the overlap gene list (overlap_df$genes) will be excluded from this step to avoid the introduction of large exonic overlaps between genes. Also, excluding duplicate gene_id genes (self_overlappers) to avoid errors with assembling multiomic references.
genes_to_append = unique(new_df$gene_name)
genes_to_append = setdiff(genes_to_append, overlap_df$genes)
genes_to_append = setdiff(genes_to_append, self_overlappers)
## Reformat the final gtf dataframe such that we can add premrna data to it
new_df = new_df[, colnames(premrna_df)]
## Append premrna transcript to the end of the gene
genes_to_append = genes_to_append[1:(length(genes_to_append)-1)]
for (i in genes_to_append){
insert = premrna_df[premrna_df$gene_name %in% i,]
first_section = new_df[0:tail(which(new_df$gene_name == i), 1),]
last_section = new_df[(tail(which(new_df$gene_name == i), 1)+1):dim(new_df)[1],]
new_df = rbind(first_section, insert, last_section)
}
#### 7. Rename desired genes ####
#################################
# Rename desired genes
old_names = rename_genes$old_names
new_names = rename_genes$new_names
for (i in 1:length(old_names)){
new_df$transcript_name = str_replace_all(new_df$transcript_name, old_names[i], new_names[i])
new_df$gene_name = str_replace_all(new_df$gene_name, old_names[i], new_names[i])
}
#### 8. Save the optimized genome annotation in a new gtf file ####
###################################################################
new_gtf = makeGRangesFromDataFrame(new_df, keep.extra.columns=TRUE)
rtracklayer::export(new_gtf, "optimized_reference.gtf", format = "gtf")