-
Notifications
You must be signed in to change notification settings - Fork 1
/
.Rhistory
512 lines (512 loc) · 25.6 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
library(tidyr)
library(grDevices)
library(reshape2)
library(Rmisc)
library(ggpubr)
library(tibble)
library(hrbrthemes)
library(gridExtra)
library(tidyr)
library(zoo)
library(ComplexHeatmap)
library(circlize)
library(GSEABase)
library(data.table)
library(stringr)
path_out = 'C:/Users/samjg/Documents/Github_repositories/Airradians_multigen_OA/RAnalysis/Output/Transcriptomics/DESeq2'
# DE files - all genes and filtered raw counts
DE_F1s <- read.csv("/Output/Transcriptomics/DESeq2/F2_juveniles/F2_DESeq2results_F2s.csv", head = T) # write
knitr::opts_knit$set(root.dir = "~/Documents/Github_repositories/Airradians_multigen_OA/RAnalysis")
# DE files - all genes and filtered raw counts
DE_F1s <- read.csv("/Output/Transcriptomics/DESeq2/F2_juveniles/F2_DESeq2results_F2s.csv", head = T) # write
knitr::opts_knit$set(root.dir = "C:/Users/samjg/Documents/Github_repositories/Airradians_multigen_OA/RAnalysis")
# DE files - all genes and filtered raw counts
DE_F1s <- read.csv("/Output/Transcriptomics/DESeq2/F2_juveniles/F2_DESeq2results_F2s.csv", head = T) # write
knitr::opts_chunk$set(echo = TRUE,
warning = FALSE,
message = FALSE,
cache = TRUE)
knitr::opts_knit$set(root.dir = "C:/Users/samjg/Documents/Github_repositories/Airradians_multigen_OA/RAnalysis")
knitr::opts_knit$set(root.dir = "C:/Users/samjg/Documents/Github_repositories/Airradians_multigen_OA/RAnalysis")
# DE files - all genes and filtered raw counts
DE_F1s <- read.csv("/Output/Transcriptomics/DESeq2/F2_juveniles/F2_DESeq2results_F2s.csv", head = T) # write
### Panopea generosa - load .fna ('Geoduck_annotation') and foramt GO terms ('Geoduck_GOterms') and vectors
Airr_Cvirg_annotation <- read.csv(file="../../Output/Transcriptomics/Raw_counts_matrix/raw_count_matrix_WITH_ANNOTATION.csv",
sep = ',',
header = T) %>%
dplyr::select(c('Airradians_TranscriptID',
"blastxEval_CvirgTranscriptID",
"blastxEval_CvirgProteinID",
"blastxEval_CvirgGeneID",
"blastxEval_CvirgGOterms"))
### Panopea generosa - load .fna ('Geoduck_annotation') and foramt GO terms ('Geoduck_GOterms') and vectors
Airr_Cvirg_annotation <- read.csv(file="Output/Transcriptomics/Raw_counts_matrix/raw_count_matrix_WITH_ANNOTATION.csv",
sep = ',',
header = T) %>%
dplyr::select(c('Airradians_TranscriptID',
"blastxEval_CvirgTranscriptID",
"blastxEval_CvirgProteinID",
"blastxEval_CvirgGeneID",
"blastxEval_CvirgGOterms"))
# DE files - all genes and filtered raw counts
DE_F1s <- read.csv("Output/Transcriptomics/DESeq2/F2_juveniles/F2_DESeq2results_F2s.csv", head = T) # write
# DE files - all genes and filtered raw counts
DE_F1s <-read.csv("/Output/Transcriptomics/DESeq2/F1_juveniles/F1_DESeq2results_F1s.csv", head = T) # write
# DE files - all genes and filtered raw counts
DE_F1s <-read.csv("Output/Transcriptomics/DESeq2/F1_juveniles/F1_DESeq2results_F1s.csv", head = T) # write
DE_F2s <- read.csv("Output/Transcriptomics/DESeq2/F2_juveniles/F2_DESeq2results_F2s.csv", head = T) # write
# GOslim
slim <- getOBOCollection("http://current.geneontology.org/ontology/subsets/goslim_generic.obo") #get GO database - # call goslim_generic.obo terms as 'slim'
Airr_Cvirg_annotation
# create a referecne from the Geoduck annotation file to merge with the DE sumamry lists
ref <- Airr_Cvirg_annotation[,c(1,7)]
Airr_Cvirg_annotation
# create a referecne from the Geoduck annotation file to merge with the DE sumamry lists
ref <- Airr_Cvirg_annotation[,c(1,5)]
colnames(ref) <- c("gene.ID", "gene.annotation")
ref
DE_F1s
# create a referecne from the Geoduck annotation file to merge with the DE sumamry lists
ref <- Airr_Cvirg_annotation[,c(1,5)]
ref
colnames(ref) <- c("Airradians_TranscriptID", "gene.annotation")
ref
DE_F1s
# Day UP primary
DE_F1s_up <- DE_F1s %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down")
)
DE_F1s_up
# Day UP primary
DE_F1s_up <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj')
# Day UP primary
DE_F1s_up <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down")
)
DE_F1s_up
# Day UP primary
DE_F1s_up <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down"),
up =
case_when(dir == 'up', . = FALSE))
# Day UP primary
DE_F1s_up <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down"),
up =
case_when(dir = 'up', . = FALSE))
# Day UP primary
DE_F1s_up <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down"),
up =
case_when(dir = 'up' ~ TRUE, . == FALSE))
# Day UP primary
DE_F1s_up <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down"))
# Day UP primary
DE_F1s_up <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down")) %>%
dplyr::mutate(up =
case_when(dir = 'up' ~ TRUE, . == FALSE))
# Day UP primary
DE_F1s_up <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down")) %>%
dplyr::mutate(up =
case_when(dir == "up" ~ TRUE, . == FALSE))
# Day UP primary
DE_F1s_up <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down")) %>%
dplyr::mutate(up =
case_when(dir == "up" ~ TRUE))
DE_F1s_up
# Day UP primary
DE_F1s_up <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down")) %>%
dplyr::mutate(up =
case_when(dir == "up" ~ TRUE, .default = FALSE))
DE_F1s_up
# Day UP primary
DE_F1s_up <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down")) %>%
dplyr::mutate(up =
case_when(dir == "up" ~ TRUE, .default = FALSE),
down =
case_when(dir == "down" ~ TRUE, .default = FALSE))
# Day UP primary
DE_F1s_up <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down")) %>%
dplyr::mutate(up =
case_when(dir == "up" ~ TRUE, .default = FALSE),
down =
case_when(dir == "down" ~ TRUE, .default = FALSE)) %>%
dplyr::select(-c(padj, dir))
# Day UP primary
DE_F1s_dir <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down")) %>%
dplyr::mutate(up =
case_when(dir == "up" ~ TRUE, .default = FALSE),
down =
case_when(dir == "down" ~ TRUE, .default = FALSE)) %>%
dplyr::select(-c(padj, dir))
DE_F1s_dir
# F2s - call padj < 0.05 and assign boolean to logfold change direction
DE_F2s_dir <- DE_F2s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down")) %>%
dplyr::mutate(up =
case_when(dir == "up" ~ TRUE, .default = FALSE),
down =
case_when(dir == "down" ~ TRUE, .default = FALSE)) %>%
dplyr::select(-c(padj, dir))
# F1s - call padj < 0.05 and assign boolean to logfold change direction
DE_F1s_dir <- DE_F1s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down")) %>%
dplyr::mutate(up =
case_when(dir == "up" ~ TRUE, .default = FALSE),
down =
case_when(dir == "down" ~ TRUE, .default = FALSE)) %>%
dplyr::select(-c(padj, dir)) %>% mutate(Effect = "Generation_1")
# F2s - call padj < 0.05 and assign boolean to logfold change direction
DE_F2s_dir <- DE_F2s %>%
dplyr::select('Airradians_TranscriptID','log2FoldChange','padj') %>%
dplyr::filter(padj < 0.05) %>%
dplyr::mutate(dir =
case_when(log2FoldChange > 0 ~ "up",
log2FoldChange < 0 ~ "down")) %>%
dplyr::mutate(up =
case_when(dir == "up" ~ TRUE, .default = FALSE),
down =
case_when(dir == "down" ~ TRUE, .default = FALSE)) %>%
dplyr::select(-c(padj, dir)) %>% mutate(Effect = "Generation_2")
DE_Master <- rbind(DE_F1s_dir, DE_F2s_dir)
DE_Master_with_annotation <- merge(DE_Master, ref, by="Airradians_TranscriptID")
# sanity check - row numbers should be equal
nrow(DE_Master) == nrow(DE_Master_with_annotation)
write.csv(DE_Master_with_annotation, file = "Output/DESeq2/DE_All_Annotation.csv", row.names = FALSE)
write.csv(DE_Master_with_annotation, file = "Output/Transcriptomics/DESeq2/DE_All_Annotation.csv", row.names = FALSE)
DE_Master_with_annotation
# Objective here is to call the DE genes (up and down)
# DAY 0 ------------------------------------------- #
dim(DE_F1s_dir) # day0 - should be 14
# Objective here is to call the DE genes (up and down)
# Gen 1 ------------------------------------------- #
dim(DE_F1s_dir) # day0 - should be 14
DE_F1s_dir.UPREG <- DE_F1s_dir %>% dplyr::filter(up %in% TRUE) # UPREG call
DE_F1s_dir.DWNREG <- DE_F1s_dir %>% dplyr::filter(down %in% TRUE) # DWNREG call
DE_F2s_dir.UPREG <- DE_F2s_dir %>% dplyr::filter(up %in% TRUE) # UPREG call
DE_F2s_dir.DWNREG <- DE_F2s_dir %>% dplyr::filter(down %in% TRUE) # DWNREG call
DE_F2s_dir.UPREG
DE_F2s_dir.DWNREG
DE_F2s_dir.UPREG
DE_F1s_dir.DWNREG
DE_F1s_dir.UPREG
DE_F1s_dir.DWNREG <- DE_F1s_dir %>% dplyr::filter(down %in% TRUE) # DWNREG call
DE_F1s_dir.DWNREG
DE_F2s_dir.UPREG
DE_F2s_dir.DWNREG <- DE_F2s_dir %>% dplyr::filter(down %in% TRUE) # DWNREG call
DE_F2s_dir.DWNREG
Airr_Cvirg_annotation
# call the GO terms
AirrCgig_GOterms <- as.data.frame(Airr_Cvirg_annotation) %>% dplyr::select(c('Airradians_TranscriptID','blastxEval_CvirgGOterms'))
colnames(AirrCgig_GOterms)[1:2] <- c('transcript.ID', 'GO.terms') # call gene name and the GO terms - (Uniprot ID 'V5')
splitted <- strsplit(as.character(AirrCgig_GOterms$GO.terms), "; ") #slit into multiple GO ids by delimiter'; ' remember the space after ; is needed here! without this you will only call the first listed GO term for each gene!
GO.terms <- data.frame(v1 = rep.int(AirrCgig_GOterms$transcript.ID, sapply(splitted, length)),
v2 = unlist(splitted)) #list all genes with each of their GO terms in a single row
GO.terms
AirrCgig_GOterms
splitted
as.character(AirrCgig_GOterms$GO.terms)
GO.terms
AirrCgig_GOterms$GO.terms
splitted <- strsplit(as.character(AirrCgig_GOterms$GO.terms), ";") #slit into multiple GO ids by delimiter'; ' remember the space after ; is needed here! without this you will only call the first listed GO term for each gene!
splitted
GO.terms <- data.frame(v1 = rep.int(AirrCgig_GOterms$transcript.ID, sapply(splitted, length)),
v2 = unlist(splitted)) #list all genes with each of their GO terms in a single row
GO.terms
# Prepare dataframe(s) and vectors for goseq
# Format 'GO.term' for goseq from the P.generosa annotation .fna file 'Geoduck_annotation'
IDvector.F1 <- as.vector(unique(DE_F1s_dir$Airradians_TranscriptID)) # call unique genes (those filtered and used in DESEq2) on day0 - 'IDvector'
IDvector.F2 <- as.vector(unique(DE_F2s_dir$Airradians_TranscriptID)) # call unique genes (those filtered and used in DESEq2) on day7 - 'IDvector'
GO_unique.genes.all <- as.vector(unique(Airr_Cvirg_annotation$Airradians_TranscriptID)) # call all unique genes for GO analysis (goseq)
Airradians_TranscriptID
Airr_Cvirg_annotation
read.csv(file="Output/Transcriptomics/Raw_counts_matrix/raw_count_matrix_WITH_ANNOTATION.csv",
sep = ',',
header = T)
# read 'Cvirg_seqIDMASTER' output above
# file contains the Cvirgnica transcript ID, protein ID, gene ID and GO term annotation
Cvirg_seqID <- read.csv(file = "RAnalysis/Data/Transcriptomics/seq_id_Cvirginica_master.csv", header =T) %>%
dplyr::rename(Cvirginica_TranscriptID = TranscriptID)
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# SET WORKING DIRECTORY :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
setwd("C:/Users/samjg/Documents/Github_repositories/Airradians_multigen_OA/") # personal computer
# read 'Cvirg_seqIDMASTER' output above
# file contains the Cvirgnica transcript ID, protein ID, gene ID and GO term annotation
Cvirg_seqID <- read.csv(file = "RAnalysis/Data/Transcriptomics/seq_id_Cvirginica_master.csv", header =T) %>%
dplyr::rename(Cvirginica_TranscriptID = TranscriptID)
Cvirg_seqID
# count matrix from prepDE.py script
# NOTE: aligned to the Airradians draft and unannotated genome!
raw.countmatrix <- read.csv(file="HPC_Analysis/output/F1_TagSeq/Airradians_transcript_count_matrix.csv", header=T)
raw.countmatrix[is.na(raw.countmatrix)] <- 0 # replace all occurances of NA with 0 in the cell NOT THE WHOLE ROW!
nrow(raw.countmatrix) # 26595 total transcripts
# due to the lack of annotation in the Airraians draft genome..
# call the Cvirginica database of protein names and transcript ID calls
Cvirg_seqID <- as.data.table(read.delim2(file = "RAnalysis/Data/Transcriptomics/seq_id.txt", header =F)) %>%
`colnames<-`("fullID")
nrow(Cvirg_seqID) # 66625
Cvirg_GOterms <- read.csv(file = "RAnalysis/Data/Transcriptomics/Cviginiva_GOterms.csv", header =T) %>%
dplyr::select(c('GeneID','Annotation_GO_ID', 'length')) %>%
unique() # there are many redundnat rows here
read.csv(file = "RAnalysis/Data/Transcriptomics/Cviginiva_GOterms.csv", header =T)
Cvirg_GOterms <- read.csv(file = "RAnalysis/Data/Transcriptomics/Cviginiva_GOterms.csv", header =T) %>%
dplyr::select(c('GeneID','Annotation_GO_ID', 'Length')) %>%
unique() # there are many redundnat rows here
nrow(Cvirg_GOterms) #35106
Cvirg_GOterms
# diamond result to obtain accession IDs of annotated genes Cvirg and Cgigas for gene ID, GO, and KEGG ID information
#(1) Airradians protein database (...pep.fna file) with Cvirginica nucleotide query
blastx_Airr_Cvirg <- as.data.table(read.delim2(file="HPC_Analysis/output/F1_TagSeq/blastx/AirrProDB_CvirgNQuery/airradians_diamond_out", header=F)) %>%
`colnames<-`(c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"))
#(2) Cgigas protein database with Airradians nucleotide query
blastx_Airr_Cgig <- as.data.table(read.delim2(file="HPC_Analysis/output/F1_TagSeq/blastx/CgigProDB_AirrNQuery/cgigas_diamond_out", header=F)) %>%
`colnames<-`(c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"))
nrow(raw.countmatrix) # 26595 total unique transcrips calls in A irradians count matrix
# how many unique trnascript IDs of Airradians were covered by oyster blastx(s)?
# Cvirginica
length(unique(blastx_Airr_Cvirg$sseqid)) # 19042 - Airradians transcripts - in blast x Airradiads Prot database to Cvriginica nucleotide query
(length(unique(blastx_Airr_Cvirg$sseqid)) / nrow(raw.countmatrix))* 100 # 71.6% of genes!
# C gigas
length(unique(blastx_Airr_Cgig$qseqid)) # 7046 - Airradians transcripts - in Cgigas protein database to Airradians nucleotide query
(length(unique(blastx_Airr_Cgig$sseqid)) / nrow(raw.countmatrix))* 100 # 32.3% of genes!
nrow(Cvirg_seqID) # 66625
Cvirg_GOterms <- read.csv(file = "RAnalysis/Data/Transcriptomics/Cviginiva_GOterms.csv", header =T) %>%
dplyr::select(c('GeneID','Annotation_GO_ID', 'Length')) %>%
unique(GeneID) # there are many redundnat rows here
read.csv(file = "RAnalysis/Data/Transcriptomics/Cviginiva_GOterms.csv", header =T) %>%
dplyr::select(c('GeneID','Annotation_GO_ID', 'Length'))
read.csv(file = "RAnalysis/Data/Transcriptomics/Cviginiva_GOterms.csv", header =T) %>%
dplyr::select(c('GeneID','Annotation_GO_ID', 'Length')) %>%
dplyr::group_by('GeneID','Annotation_GO_ID') %>%
dplyr::summarise(
meanLength = mean(Length))
read.csv(file = "RAnalysis/Data/Transcriptomics/Cviginiva_GOterms.csv", header =T) %>%
dplyr::select(c('GeneID','Annotation_GO_ID', 'Length')) %>%
dplyr::group_by('GeneID','Annotation_GO_ID')
read.csv(file = "RAnalysis/Data/Transcriptomics/Cviginiva_GOterms.csv", header =T) %>%
dplyr::select(c('GeneID','Annotation_GO_ID', 'Length')) %>%
dplyr::group_by('GeneID','Annotation_GO_ID') %>%
dplyr::summarise(
meanLength = mean(Length))
read.csv(file = "RAnalysis/Data/Transcriptomics/Cviginiva_GOterms.csv", header =T) %>%
dplyr::select(c('GeneID','Annotation_GO_ID', 'Length')) %>%
dplyr::group_by('GeneID') %>%
dplyr::summarise(
meanLength = mean(Length))
read.csv(file = "RAnalysis/Data/Transcriptomics/Cviginiva_GOterms.csv", header =T) %>%
dplyr::select(c('GeneID','Annotation_GO_ID', 'Length')) %>%
dplyr::group_by(GeneID,Annotation_GO_ID) %>%
dplyr::summarise(
meanLength = mean(Length))
read.csv(file = "RAnalysis/Data/Transcriptomics/Cviginiva_GOterms.csv", header =T) %>%
dplyr::select(c('GeneID','Annotation_GO_ID', 'Length')) %>%
dplyr::group_by(GeneID,Annotation_GO_ID) %>%
dplyr::summarise(
meanLength = mean(Length))
unique()
read.csv(file = "RAnalysis/Data/Transcriptomics/Cviginiva_GOterms.csv", header =T) %>%
dplyr::select(c('GeneID','Annotation_GO_ID', 'Length')) %>%
dplyr::group_by(GeneID,Annotation_GO_ID) %>%
dplyr::summarise(
meanLength = mean(Length)) %>%
unique()
Cvirg_GOterms <- read.csv(file = "RAnalysis/Data/Transcriptomics/Cviginiva_GOterms.csv", header =T) %>%
dplyr::select(c('GeneID','Annotation_GO_ID', 'Length')) %>%
dplyr::group_by(GeneID,Annotation_GO_ID) %>%
dplyr::summarise(
meanLength = mean(Length)) %>%
unique() # there are many redundnat rows here
nrow(Cvirg_GOterms) #35106
Cvirg_GOterms
# diamond result to obtain accession IDs of annotated genes Cvirg and Cgigas for gene ID, GO, and KEGG ID information
#(1) Airradians protein database (...pep.fna file) with Cvirginica nucleotide query
blastx_Airr_Cvirg <- as.data.table(read.delim2(file="HPC_Analysis/output/F1_TagSeq/blastx/AirrProDB_CvirgNQuery/airradians_diamond_out", header=F)) %>%
`colnames<-`(c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"))
#(2) Cgigas protein database with Airradians nucleotide query
blastx_Airr_Cgig <- as.data.table(read.delim2(file="HPC_Analysis/output/F1_TagSeq/blastx/CgigProDB_AirrNQuery/cgigas_diamond_out", header=F)) %>%
`colnames<-`(c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"))
nrow(raw.countmatrix) # 26595 total unique transcrips calls in A irradians count matrix
# how many unique trnascript IDs of Airradians were covered by oyster blastx(s)?
# Cvirginica
length(unique(blastx_Airr_Cvirg$sseqid)) # 19042 - Airradians transcripts - in blast x Airradiads Prot database to Cvriginica nucleotide query
(length(unique(blastx_Airr_Cvirg$sseqid)) / nrow(raw.countmatrix))* 100 # 71.6% of genes!
# C gigas
length(unique(blastx_Airr_Cgig$qseqid)) # 7046 - Airradians transcripts - in Cgigas protein database to Airradians nucleotide query
(length(unique(blastx_Airr_Cgig$sseqid)) / nrow(raw.countmatrix))* 100 # 32.3% of genes!
# by bitscore (highest is the best hit) use 'which.max'
bybitscore <- blastx_Airr_Cvirg[,.SD[which.max(bitscore)],by=sseqid] # max bitscore
length(unique(bybitscore$sseqid)) # 19042
length(unique(bybitscore$sseqid)) == length(unique(blastx_Airr_Cvirg$sseqid))# TRUE
nrow(bybitscore %>% dplyr::filter(sseqid %in% raw.countmatrix$transcript_id)) # 18874
# by evalue (lowest is the best hit) - use 'which.min'
byevalue <- blastx_Airr_Cvirg[,.SD[which.min(evalue)],by=sseqid] # min evalue
length(unique(byevalue$sseqid)) # 19042
length(unique(byevalue$sseqid)) == length(unique(blastx_Airr_Cvirg$sseqid))# TRUE
nrow(byevalue %>% dplyr::filter(sseqid %in% raw.countmatrix$transcript_id)) # 18874
# calla dataframe for the two sseqids of blatx dataframes by e value and bitscore
# what does this do? if only one column output than the two are the exact same,
# if two than bitscore (highest) and evalue (lowest) call different transcript IDs
head(as.data.table(c(byevalue$sseqid, bybitscore$sseqid)), header=F) # one column meaning they are the exact same!
# lets go with evalue as the 'gold stnadard'
# 'byevalue' gives us the Airradians trnascript ID (i.e. evm.model.Contig....' alonside
# for each of the corresponding C virginica IDs (i.e. XM_....') to obtaitn KEGG and GO annotation based
# on sequence relatedness
head(byevalue)
# Now lets call the C virginica transcriptome and edit to fit our needs
# seq ID reference fr C virginica data
Cvirg_seqID_editted <- as.data.frame(Cvirg_seqID[Cvirg_seqID$fullID %like% "XM_", ] %>% # call all mRNA samples - accession always starts with XM
dplyr::mutate(TranscriptID = (str_match(fullID, ">\\s*(.*?)\\s* PREDICTED:")[,2])) %>% # remove excess ID information
dplyr::mutate(ProteinID = sub('.*Crassostrea virginica ', '',(gsub("\\s\\(LOC.*|\\sLOC111.*", "", perl=TRUE, fullID))) ) %>% # parse out the protein ID
dplyr::mutate(GeneID = paste('L', (gsub('),.*', '',(gsub(".*\\s\\(L", "", fullID)))), sep = '')) %>% # parse out the gene ID
dplyr::select(-fullID)) # remove the full ID
nrow(Cvirg_seqID_editted) # 60201
nrow(Cvirg_GOterms) # 35106
Cvirg_seqIDMASTER <- full_join(Cvirg_seqID_editted,Cvirg_GOterms, by = 'GeneID')
nrow(Cvirg_seqIDMASTER) # 62578
# write csv
write.csv(Cvirg_seqIDMASTER, file = "RAnalysis/Data/Transcriptomics/seq_id_Cvirginica_master.csv", row.names = FALSE)
# read 'Cvirg_seqIDMASTER' output above
# file contains the Cvirgnica transcript ID, protein ID, gene ID and GO term annotation
Cvirg_seqID <- read.csv(file = "RAnalysis/Data/Transcriptomics/seq_id_Cvirginica_master.csv", header =T) %>%
dplyr::rename(Cvirginica_TranscriptID = TranscriptID)
# # lern how many unique A irradians transcript IDs we have in the raw count matrix
Airr.ID <- as.data.frame(raw.countmatrix$transcript_id) %>%
`colnames<-`("Airradians_TranscriptID")
nrow(unique(Airr.ID)) == nrow(Airr.ID) # TRUE
nrow(Airr.ID) # 26595 - the number of transcripts TOTAL in the raw count matrix1
# merge the Cvirginica seIDs (all cvirginica IDs) with the blastx table we made contianing Airradians hits!
Cvirg_ID.evalue <- merge(Cvirg_seqID,
(byevalue %>%
dplyr::select(sseqid, qseqid) %>%
`colnames<-`(c("Airradians_TranscriptID", "Cvirginica_TranscriptID"))), by="Cvirginica_TranscriptID", all=T) %>%
`colnames<-`(c("blastxEval_CvirgTranscriptID",
"blastxEval_CvirgProteinID",
"blastxEval_CvirgGeneID",
"blastxEval_CvirgGOterms",
"Airradians_TranscriptID"))
Cvirg_ID.evalue
# merge the Cvirginica seIDs (all cvirginica IDs) with the blastx table we made contianing Airradians hits!
Cvirg_ID.evalue <- merge(Cvirg_seqID,
(byevalue %>%
dplyr::select(sseqid, qseqid) %>%
`colnames<-`(c("Airradians_TranscriptID", "Cvirginica_TranscriptID"))), by="Cvirginica_TranscriptID", all=T) %>%
`colnames<-`(c("blastxEval_CvirgTranscriptID",
"blastxEval_CvirgProteinID",
"blastxEval_CvirgGeneID",
"blastxEval_CvirgGOterms",
"meanLength",
"Airradians_TranscriptID"))
Cvirg_ID.evalue
# we can now do a final merge
# here was have all Airradians Transcript IDs that had the highest
# evalue hit to the Cvirginica protein database
# merged are the protein names, geneID, GOterms from the Cvirginica database
# to facilitate functional analsiss of DEGs in the Airradians data
Airr_Cvirg_master_seq_ID <- merge(Airr.ID,Cvirg_ID.evalue,by="Airradians_TranscriptID")
# merge2 <- merge(merge1, Cvirg_ID.bitsc,by="Airradians_TranscriptID", all=T)
nrow(Airr_Cvirg_master_seq_ID) # 19168
(nrow(Airr_Cvirg_master_seq_ID) / nrow(raw.countmatrix))*100 # 72.0737 % of genes in our count matrix are represented
Airr_Cvirg_master_seq_ID
# write csv
write.csv(Airr_Cvirg_master_seq_ID, file = "RAnalysis/Data/Transcriptomics/seq_id_AirrCvirg_MERGED_master.csv", row.names = FALSE)
### Panopea generosa - load .fna ('Geoduck_annotation') and foramt GO terms ('Geoduck_GOterms') and vectors
Airr_Cvirg_annotation <- read.csv(file="RAnalysis/Data/Transcriptomics/seq_id_AirrCvirg_MERGED_master.csv",
sep = ',',
header = T) %>%
dplyr::select(c('Airradians_TranscriptID',
"blastxEval_CvirgTranscriptID",
"blastxEval_CvirgProteinID",
"blastxEval_CvirgGeneID",
"blastxEval_CvirgGOterms",
"meanLength"))
path_out = 'C:/Users/samjg/Documents/Github_repositories/Airradians_multigen_OA/RAnalysis/Output/Transcriptomics/DESeq2'
knitr::opts_knit$set(root.dir = "C:/Users/samjg/Documents/Github_repositories/Airradians_multigen_OA/RAnalysis")
### Panopea generosa - load .fna ('Geoduck_annotation') and foramt GO terms ('Geoduck_GOterms') and vectors
Airr_Cvirg_annotation <- read.csv(file="RAnalysis/Data/Transcriptomics/seq_id_AirrCvirg_MERGED_master.csv",
sep = ',',
header = T) %>%
dplyr::select(c('Airradians_TranscriptID',
"blastxEval_CvirgTranscriptID",
"blastxEval_CvirgProteinID",
"blastxEval_CvirgGeneID",
"blastxEval_CvirgGOterms",
"meanLength"))
### Panopea generosa - load .fna ('Geoduck_annotation') and foramt GO terms ('Geoduck_GOterms') and vectors
Airr_Cvirg_annotation <- read.csv(file="Data/Transcriptomics/seq_id_AirrCvirg_MERGED_master.csv",
sep = ',',
header = T) %>%
dplyr::select(c('Airradians_TranscriptID',
"blastxEval_CvirgTranscriptID",
"blastxEval_CvirgProteinID",
"blastxEval_CvirgGeneID",
"blastxEval_CvirgGOterms",
"meanLength"))
Airr_Cvirg_annotation