From 44e01a8308806287e5d2f7e5ba96fc5ed1aad102 Mon Sep 17 00:00:00 2001 From: Sigve Nakken Date: Thu, 1 Aug 2024 12:01:35 +0200 Subject: [PATCH] properly filter MAF output in tumor-only mode --- pcgr/config.py | 1 + pcgr/main.py | 2 + pcgrr/NAMESPACE | 1 + pcgrr/R/input_data.R | 5 ++ pcgrr/R/maf.R | 132 +++++++++++++++++++++++++++++++++++ pcgrr/man/filter_maf_file.Rd | 18 +++++ 6 files changed, 159 insertions(+) create mode 100644 pcgrr/R/maf.R create mode 100644 pcgrr/man/filter_maf_file.Rd diff --git a/pcgr/config.py b/pcgr/config.py index 486de4bb..3b314cc9 100644 --- a/pcgr/config.py +++ b/pcgr/config.py @@ -133,6 +133,7 @@ def create_config(arg_dict, workflow = "PCGR"): conf_options['molecular_data']['fname_cna_tsv'] = "None" conf_options['molecular_data']['fname_expression_tsv'] = "None" conf_options['molecular_data']['fname_expression_outliers_tsv'] = "None" + conf_options['molecular_data']['fname_maf_tsv'] = "None" #conf_options['molecular_data']['fname_expression_csq_tsv'] = "None" conf_options['molecular_data']['fname_expression_similarity_tsv'] = "None" conf_options['molecular_data']['fname_tmb_tsv'] = "None" diff --git a/pcgr/main.py b/pcgr/main.py index 3d2a6888..3134f058 100755 --- a/pcgr/main.py +++ b/pcgr/main.py @@ -297,6 +297,8 @@ def run_pcgr(input_data, output_data,conf_options): conf_options['output_prefix'] = output_prefix conf_options['molecular_data']['fname_mut_vcf'] = output_vcf conf_options['molecular_data']['fname_mut_tsv'] = output_pass_tsv_gz + if conf_options['other']['vcf2maf'] == 1: + conf_options['molecular_data']['fname_maf_tsv'] = output_maf if conf_options['somatic_snv']['tmb']['run'] == 1: conf_options['molecular_data']['fname_tmb_tsv'] = tmb_fname if not input_cna == 'None': diff --git a/pcgrr/NAMESPACE b/pcgrr/NAMESPACE index 961e4d09..5381731a 100644 --- a/pcgrr/NAMESPACE +++ b/pcgrr/NAMESPACE @@ -28,6 +28,7 @@ export(exclude_non_chrom_variants) export(expand_biomarker_items) export(export_quarto_evars) export(filter_eitems_by_site) +export(filter_maf_file) export(filter_read_support) export(generate_annotation_link) export(generate_report) diff --git a/pcgrr/R/input_data.R b/pcgrr/R/input_data.R index 5586a64f..5910db58 100644 --- a/pcgrr/R/input_data.R +++ b/pcgrr/R/input_data.R @@ -221,6 +221,11 @@ load_somatic_snv_indel <- function( dplyr::filter( .data$SOMATIC_CLASSIFICATION == "SOMATIC") + ## filter also MAF file if provided + pcgrr::filter_maf_file( + callset = callset, + settings = settings) + ## Issue warning if clinically actionable variants are filtered ## with current filtering settings n_actionable_filtered <- diff --git a/pcgrr/R/maf.R b/pcgrr/R/maf.R new file mode 100644 index 00000000..efbd0cb0 --- /dev/null +++ b/pcgrr/R/maf.R @@ -0,0 +1,132 @@ +#' Function that takes a MAF file generated with vcf2maf +#' and filters out variants that are presumably germline (tumor-only run) +#' +#' @param callset Callset with pre-processed somatic SNV/InDel variants +#' @param settings PCGR run/configuration settings +#' +#' @export +filter_maf_file <- function(callset, settings) { + + ## check if vcf2maf is TRUE + if (as.logical(settings[['conf']][['other']][['vcf2maf']]) == FALSE) { + return(0) + } + + filtered_vars_maf_like <- data.frame() + + if("variant" %in% names(callset)) { + if(NROW(callset[['variant']]) == 0) { + return(0) + } + + pcgrr::log4r_info(paste0( + "Updating MAF file with filtered somatic SNV/InDels")) + + if(all(c("CHROM", + "POS", + "REF", + "ALT") %in% colnames(callset[['variant']]))) { + filtered_vars_maf_like <- callset[['variant']] |> + dplyr::select(c("CHROM", "POS", "REF", "ALT")) |> + dplyr::mutate(Tumor_Seq_Allele1 = dplyr::case_when( + nchar(.data$REF) > 1 & + nchar(.data$ALT) == 1 & + substr(.data$REF, 1, 1) == .data$ALT ~ substr(.data$REF, 2, nchar(.data$REF)), + nchar(.data$ALT) > 1 & + nchar(.data$REF) == 1 & + substr(.data$ALT, 1, 1) == .data$REF ~ "-", # insertion + TRUE ~ .data$REF + )) |> + dplyr::mutate(Tumor_Seq_Allele2 = dplyr::case_when( + nchar(.data$REF) > 1 & + nchar(.data$ALT) == 1 & + substr(.data$REF, 1, 1) == .data$ALT ~ "-", + nchar(.data$ALT) > 1 & + nchar(.data$REF) == 1 & + substr(.data$ALT, 1, 1) == .data$REF ~ substr(.data$ALT, 2, nchar(.data$ALT)), + TRUE ~ .data$ALT + )) |> + dplyr::mutate(Variant_Type = dplyr::case_when( + nchar(.data$REF) == 1 & + nchar(.data$ALT) == 1 ~ "SNP", + nchar(.data$REF) > 1 & + nchar(.data$ALT) == 1 ~ "DEL", + nchar(.data$ALT) > 1 & + nchar(.data$REF) == 1 ~ "INS", + TRUE ~ "MNP" + )) |> + dplyr::mutate( + Chromosome = .data$CHROM, + Start_Position = dplyr::case_when( + .data$Variant_Type == "DEL" & + substr(.data$REF, 1, 1) == .data$ALT ~ .data$POS + 1, + TRUE ~ .data$POS + ) + ) |> + dplyr::select( + c("Chromosome", + "Start_Position", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", + "Variant_Type") + ) + } + + } + + maf_data_unfiltered <- data.frame() + maf_data_header <- NULL + + ## check if unfiltered MAF file exists and read it - if not, return 0 + if (file.exists(settings[['molecular_data']][['fname_maf_tsv']])) { + if(!(file.size(settings[['molecular_data']][['fname_maf_tsv']]) == 0)) { + maf_data_header <- readLines( + settings[['molecular_data']][['fname_maf_tsv']], n = 1) + + maf_data_unfiltered <- readr::read_tsv( + settings[['molecular_data']][['fname_maf_tsv']], + show_col_types = F, col_names = T, + comment = "#", na = "" + ) + } else { + pcgrr::log4r_warn("MAF file is empty - no filtering will be performed") + return(0) + } + } + + if(NROW(maf_data_unfiltered) > 0 & + NROW(filtered_vars_maf_like) > 0) { + if(all(c("Chromosome", + "Start_Position", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", + "Variant_Type") %in% colnames(maf_data_unfiltered)) & + all(c("Chromosome", + "Start_Position", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", + "Variant_Type") %in% colnames(filtered_vars_maf_like))) { + maf_data_filtered <- maf_data_unfiltered |> + dplyr::semi_join( + filtered_vars_maf_like, + by = c("Chromosome", "Start_Position", + "Tumor_Seq_Allele1", "Tumor_Seq_Allele2", + "Variant_Type") + ) + + if(!is.null(maf_data_header) & + NROW(maf_data_filtered) > 0) { + file.remove(settings[['molecular_data']][['fname_maf_tsv']]) + writeLines(maf_data_header, + settings[['molecular_data']][['fname_maf_tsv']]) + options(scipen = 999) + readr::write_tsv( + maf_data_filtered, + settings[['molecular_data']][['fname_maf_tsv']], + append = TRUE, col_names = T, quote = "none", na = "") + } + } + } + + return(0) +} diff --git a/pcgrr/man/filter_maf_file.Rd b/pcgrr/man/filter_maf_file.Rd new file mode 100644 index 00000000..6a3a3b81 --- /dev/null +++ b/pcgrr/man/filter_maf_file.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/maf.R +\name{filter_maf_file} +\alias{filter_maf_file} +\title{Function that takes a MAF file generated with vcf2maf +and filters out variants that are presumably germline (tumor-only run)} +\usage{ +filter_maf_file(callset, settings) +} +\arguments{ +\item{callset}{Callset with pre-processed somatic SNV/InDel variants} + +\item{settings}{PCGR run/configuration settings} +} +\description{ +Function that takes a MAF file generated with vcf2maf +and filters out variants that are presumably germline (tumor-only run) +}