-
Notifications
You must be signed in to change notification settings - Fork 4
/
aggregate_data_tables.R
executable file
·124 lines (107 loc) · 3.95 KB
/
aggregate_data_tables.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env Rscript
# This script is used to extract 1 column each from multiple files and report all columns
# in a single file.
# Example use case for this script is extracting raw counts from multiple per sample RSEM
# files and creating a single raw counts matrix
#
# Eg:
#
# Rscript aggregate_data_tables.R \
# -s KO1_resent_mutated,KO1_resent,KO1_resent_unmutated,KO1_slam_mutated \
# -l ./KO1_resent/KO1_resent_mutated.RSEM.genes.results,./KO1_resent/KO1_resent.RSEM.genes.results,./KO1_resent/KO1_resent_unmutated.RSEM.genes.results \
# -c expected_count \
# -i gene_id \
# -o rsem.raw_counts_matrix.tsv
#
suppressPackageStartupMessages(library("argparse"))
suppressPackageStartupMessages(library("sets"))
## functions
checkfile <- function(filename) {
if (file.access(filename) == -1) {
stop(sprintf("Specified file ( %s ) does not exist", filename))
}
}
readfile <- function(filename, indexcols, datacol, samplename) {
d <- read.csv(filename, header = TRUE, sep = "\t")
cols <- colnames(d)
reqdcols <- c(indexcols, datacol)
if (set_is_subset(as.set(reqdcols), as.set(cols))) {
d <- d[, reqdcols]
cols <- colnames(d)
cols <- gsub(pattern = datacol, replacement = samplename, cols)
colnames(d) <- cols
return(d)
} else {
stop(sprintf("Required columns: %s are missing in file %s", reqdcols, filename))
}
}
debug <- 0
# create parser object
parser <- ArgumentParser()
# specify our desired options
# by default ArgumentParser will add an help option
parser$add_argument("-l", "--filelist",
type = "character",
help = "comma separated list of files",
required = TRUE
)
parser$add_argument("-i", "--indexcols",
type = "character",
help = "comma separated list of columns to use as index while merging. These need to exist in all files provided in the -l option.",
required = TRUE
)
parser$add_argument("-c", "--datacol",
type = "character",
help = "column to be extracted from all samples and aggregated in the outputfile. Should exist in all files provided to -l. Its name will be replaced by corresponding value in -s argument.",
required = TRUE
)
parser$add_argument("-s", "--samplenames",
type = "character",
help = "comma separated list of sample names. Need to be unique. Will replace datacol (-c) when reported in the output file. ",
required = TRUE
)
parser$add_argument("-o", "--outfile",
type = "character",
help = "aggregated outfile",
required = TRUE
)
args <- parser$parse_args()
filelist <- unlist(strsplit(args$filelist, ","))
indexcols <- unlist(strsplit(args$indexcols, ","))
datacol <- args$datacol
samplenames <- unlist(strsplit(args$samplenames, ","))
outfile <- args$outfile
if (debug == 1) {
filelist <- unlist(strsplit("/Volumes/CCBR/projects/ccbr1060/Hg38_shRNA_hybrid/HGHY2DRXY_analysis/results/596-7-2/STAR/withChimericJunctions/a.tsv,/Volumes/CCBR/projects/ccbr1060/Hg38_shRNA_hybrid/HGHY2DRXY_analysis/results/596-7-2/STAR/withChimericJunctions/b.tsv", ","))
indexcols <- unlist(strsplit("ensemblID,gene_name,mRNA_length", ","))
datacol <- "tpm"
samplenames <- unlist(strsplit("A,B", ","))
outfile <- "/Volumes/CCBR/projects/ccbr1060/Hg38_shRNA_hybrid/HGHY2DRXY_analysis/results/596-7-2/STAR/withChimericJunctions/test.tpm.tsv"
}
# 2 or more files required
if (length(filelist) <= 1) {
stop(sprintf("Two are more comma separated files needed"))
}
# nfiles and nsamplenames should match
if (length(unique(filelist)) != length(unique(samplenames))) {
stop(sprintf("Number of files need to match number of samplenames. Duplicate files and samplenames not allowed!"))
}
for (f in filelist) {
checkfile(f)
}
m <- readfile(
filename = filelist[1],
indexcols = indexcols,
datacol = datacol,
samplename = samplenames[1]
)
for (i in 2:length(filelist)) {
n <- readfile(
filename = filelist[i],
indexcols = indexcols,
datacol = datacol,
samplename = samplenames[i]
)
m <- merge(m, n, by = indexcols)
}
write.table(m, file = outfile, quote = FALSE, sep = "\t", row.names = FALSE)