Skip to content

Commit

Permalink
Merge pull request #125 from umccr/umccrise
Browse files Browse the repository at this point in the history
umccrise: refactor cancer report table parsers
  • Loading branch information
pdiakumis authored Sep 3, 2024
2 parents 24f7da7 + 421cfe4 commit e701a69
Show file tree
Hide file tree
Showing 7 changed files with 345 additions and 532 deletions.
5 changes: 1 addition & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,7 @@ export(TsoSampleAnalysisResultsFile)
export(TsoTargetRegionCoverageFile)
export(TsoTmbFile)
export(TsoTmbTraceTsvFile)
export(UmChordTsvFile)
export(UmHrdetectTsvFile)
export(UmQcSumFile)
export(UmSigsSnvFile)
export(UmccriseCanRepTables)
export(VCMetricsFile)
export(Wf)
export(Wf_tso_ctdna_tumor_only)
Expand Down
258 changes: 113 additions & 145 deletions R/umccrise.R
Original file line number Diff line number Diff line change
@@ -1,27 +1,81 @@
#' UmChordTsvFile R6 Class
#' UmccriseCanRepTables R6 Class
#'
#' @description
#' Contains methods for reading and displaying contents of the
#' `chord.tsv.gz` file output from umccrise.
#' Reads and writes tidy versions of files within the `cancer_report_tables` directory
#' output from the `umccrise` workflow.
#'
#' @examples
#' \dontrun{
#' x <- "/path/to/chord.tsv.gz"
#' d <- UmChordTsvFile$new(x)
#' d_parsed <- d$read() # or read(d)
#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv")
#' p1 <- "~/icav1/g/production/analysis_data/SBJ01155/umccrise/202408300c218043"
#' p2 <- "L2101566__L2101565/SBJ01155__PRJ211091/cancer_report_tables"
#' p <- file.path(p1, p2)
#' obj <- UmccriseCanRepTables$new(p)
#' obj$path
#' obj$contents
#' d <- obj$read()
#' obj$write(d, out_dir = tempdir(), prefix = "sampleA", out_format = "tsv")
#' }
#'
#' @export
UmChordTsvFile <- R6::R6Class(
"UmChordTsvFile",
inherit = File,
UmccriseCanRepTables <- R6::R6Class(
"UmccriseCanRepTables",
public = list(
#' @description
#' Reads the `chord.tsv.gz` file output from umccrise.
#' @field path Path to the `cancer_report_tables` directory.
#' @field contents Tibble with file path, basename, and size.
path = NULL,
contents = NULL,
#' @description Create a new UmccriseCanRepTables object.
#' @param path Path to the `cancer_report_tables` directory.
initialize = function(path = NULL) {
stopifnot(is.character(path), length(path) == 1)
self$path <- normalizePath(path)
self$contents <- fs::dir_info(path, type = "file", recurse = TRUE) |>
dplyr::mutate(
bname = basename(.data$path),
size = as.character(trimws(.data$size))
) |>
dplyr::select("path", "bname", "size")
},
#' @description Print details about the cancer_report_tables directory.
#' @param ... (ignored).
print = function(...) {
bnames <- self$contents |>
dplyr::mutate(
low = tolower(.data$bname),
) |>
dplyr::arrange(.data$low) |>
dplyr::mutate(
n = dplyr::row_number(),
bn = glue("{.data$n}. {.data$bname} ({.data$size})")
) |>
dplyr::pull("bn")
cat("#--- UmccriseCanRepTables ---#\n")
cat(glue("Path: {self$path}"), "\n")
cat("Contents:\n")
cat(bnames, sep = "\n")
invisible(self)
},
#' @description Returns file with given pattern from the cancer_report_tables directory.
#' @param pat File pattern to look for.
grep_file = function(pat) {
x <- self$contents |>
dplyr::filter(grepl(pat, .data$path)) |>
dplyr::pull(.data$path)
if (length(x) > 1) {
fnames <- paste(x, collapse = ", ")
cli::cli_abort("More than 1 match found for {pat} ({fnames}). Aborting.")
}
if (length(x) == 0) {
return("") # file.exists("") returns FALSE
}
return(x)
},

#' @description Read `chord.tsv.gz` file output from umccrise.
#'
#' @return A tibble.
read = function() {
x <- self$path
#' @param x (`character(1)`)\cr
#' Path to `chord.tsv.gz` file.
read_chordtsv = function(x) {
ct <- readr::cols_only(
p_hrd = "d",
hr_status = "c",
Expand All @@ -31,49 +85,11 @@ UmChordTsvFile <- R6::R6Class(
)
read_tsvgz(x, col_types = ct)
},

#' @description
#' Writes a tidy version of the `chord.tsv.gz` file output from umccrise.
#'
#' @param d Parsed object from `self$read()`.
#' @param prefix Prefix of output file(s).
#' @param out_dir Output directory.
#' @param out_format Format of output file(s).
#' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`).
write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) {
if (!is.null(out_dir)) {
prefix <- file.path(out_dir, prefix)
}
# prefix2 <- glue("{prefix}_chord")
write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid)
}
)
)

#' UmHrdetectTsvFile R6 Class
#'
#' @description
#' Contains methods for reading and displaying contents of the
#' `hrdetect.tsv.gz` file output from umccrise.
#'
#' @examples
#' \dontrun{
#' x <- "/path/to/hrdetect.tsv.gz"
#' d <- UmHrdetectTsvFile$new(x)
#' d_parsed <- d$read() # or read(d)
#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv")
#' }
#' @export
UmHrdetectTsvFile <- R6::R6Class(
"UmHrdetectTsvFile",
inherit = File,
public = list(
#' @description
#' Reads the `hrdetect.tsv.gz` file output from umccrise.
#' @description Read `hrdetect.tsv.gz` file output from umccrise.
#'
#' @return A tibble.
read = function() {
x <- self$path
#' @param x (`character(1)`)\cr
#' Path to `hrdetect.tsv.gz` file.
read_hrdetecttsv = function(x) {
ct <- readr::cols(
.default = "d",
sample = "c"
Expand All @@ -82,98 +98,24 @@ UmHrdetectTsvFile <- R6::R6Class(
dplyr::select(-c("sample"))
},

#' @description
#' Writes a tidy version of the `hrdetect.tsv.gz` file output from umccrise.
#'
#' @param d Parsed object from `self$read()`.
#' @param prefix Prefix of output file(s).
#' @param out_dir Output directory.
#' @param out_format Format of output file(s).
#' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`).
write = function(d, out_dir, prefix, out_format = "tsv", drid = NULL) {
if (!is.null(out_dir)) {
prefix <- file.path(out_dir, prefix)
}
# prefix2 <- glue("{prefix}_hrdetect")
write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid)
}
)
)

#' UmSigsSnvFile R6 Class
#'
#' @description
#' Contains methods for reading and displaying contents of the
#' `snv_20XX.tsv.gz` file with SNV signatures output from umccrise.
#'
#' @examples
#' \dontrun{
#' x <- "/path/to/snv_2015.tsv.gz"
#' d <- UmSigsSnvFile$new(x)
#' d_parsed <- d$read() # or read(d)
#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv")
#' }
#' @export
UmSigsSnvFile <- R6::R6Class(
"UmSigsSnvFile",
inherit = File,
public = list(
#' @description
#' Reads the `snv.tsv.gz` file output from umccrise.
#' @description Read `snv_20XX.tsv.gz` file output from umccrise.
#'
#' @return A tibble.
read = function() {
x <- self$path
version <- dplyr::if_else(grepl("2015\\.tsv.\\gz", x), "2015", "2020")
#' @param x (`character(1)`)\cr
#' Path to `snv_20XX.tsv.gz` file.
read_sigs = function(x) {
ct <- readr::cols(
.default = "d",
Signature = "c"
)
read_tsvgz(x, col_types = ct)
},

#' @description
#' Writes a tidy version of the `snv_20XX.tsv.gz` signature file output from umccrise.
#'
#' @param d Parsed object from `self$read()`.
#' @param prefix Prefix of output file(s).
#' @param out_dir Output directory.
#' @param out_format Format of output file(s).
#' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`).
write = function(d, out_dir, prefix, out_format = "tsv") {
if (!is.null(out_dir)) {
prefix <- file.path(out_dir, prefix)
}
# prefix2 <- glue("{prefix}_sigs_snv")
write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid)
}
)
)

#' UmQcSumFile R6 Class
#'
#' @description
#' Contains methods for reading and displaying contents of the
#' `qc_summary.tsv.gz` file with QC summary metrics output from umccrise.
#'
#' @examples
#' \dontrun{
#' x <- "/path/to/snv_2015.tsv.gz"
#' d <- UmQcSumFile$new(x)
#' d_parsed <- d$read() # or read(d)
#' d$write(d_parsed, out_dir = tempdir(), prefix = "sample705", out_format = "tsv")
#' }
#' @export
UmQcSumFile <- R6::R6Class(
"UmQcSumFile",
inherit = File,
public = list(
#' @description
#' Reads the `qc_summary.tsv.gz` file output from umccrise.
#' @description Read `qc_summary.tsv.gz` file output from umccrise.
#'
#' @return A tibble.
read = function() {
x <- self$path
#' @param x (`character(1)`)\cr
#' Path to `qc_summary.tsv.gz` file.
read_qcsummarytsv = function(x) {
d <- read_tsvgz(x, col_types = readr::cols(.default = "c"))
d |>
dplyr::select("variable", "value") |>
Expand Down Expand Up @@ -202,25 +144,51 @@ UmQcSumFile <- R6::R6Class(
"hrd_chord", "hrd_hrdetect", "contamination_hmf",
"deleted_genes_hmf", "tmb_hmf", "tml_hmf",
wgd_hmf = "WGD",
hypermutated, bpi_enabled
"hypermutated", "bpi_enabled"
)
},
#' @description
#' Reads contents of `cancer_report_tables` directory output by umccrise.
#'
#' @return A list of tibbles.
#' @export
read = function() {
# now return all as list elements
list(
chord = self$grep_file("-chord\\.tsv\\.gz$") |> self$read_chordtsv(),
hrdetect = self$grep_file("-hrdetect\\.tsv\\.gz$") |> self$read_hrdetecttsv(),
sigs2015 = self$grep_file("-snv_2015\\.tsv\\.gz$") |> self$read_sigs(),
sigs2020 = self$grep_file("-snv_2020\\.tsv\\.gz$") |> self$read_sigs(),
sigsdbs = self$grep_file("-dbs\\.tsv\\.gz$") |> self$read_sigs(),
sigsindel = self$grep_file("-indel\\.tsv\\.gz$") |> self$read_sigs(),
qcsum = self$grep_file("-qc_summary\\.tsv\\.gz$") |> self$read_qcsummarytsv()
)
},

#' @description
#' Writes a tidy version of the `qc_summary.tsv.gz` QC summary file output
#' from umccrise.
#' Writes tidied contents of `cancer_report_tables` directory output by umccrise.
#'
#' @param d Parsed object from `self$read()`.
#' @param prefix Prefix of output file(s).
#' @param out_dir Output directory.
#' @param out_format Format of output file(s).
#' @param drid dracarys ID to use for the dataset (e.g. `wfrid.123`, `prid.456`).
write = function(d, out_dir, prefix, out_format = "tsv", drid = NULL) {
write = function(d, out_dir = NULL, prefix, out_format = "tsv", drid = NULL) {
if (!is.null(out_dir)) {
prefix <- file.path(out_dir, prefix)
}
# prefix2 <- glue("{prefix}_qc_summary")
write_dracarys(obj = d, prefix = prefix, out_format = out_format, drid = drid)
d_write <- d |>
tibble::enframe(name = "section") |>
dplyr::rowwise() |>
dplyr::mutate(
section_low = tolower(.data$section),
p = glue("{prefix}_{.data$section_low}"),
out = list(write_dracarys(obj = .data$value, prefix = .data$p, out_format = out_format, drid = drid))
) |>
dplyr::ungroup() |>
dplyr::select("section", "value") |>
tibble::deframe()
invisible(d_write)
}
)
)
Loading

0 comments on commit e701a69

Please sign in to comment.