allow missing samplesize columns in metal config

mglev1n · Mar 21, 2024 · fdca905 · fdca905
1 parent 1d43a2d
commit fdca905
Show file tree

Hide file tree

Showing 35 changed files with 215 additions and 194 deletions.
diff --git a/R/annotate_rsids.R b/R/annotate_rsids.R
@@ -1,24 +1,23 @@
 # WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand
 
 #' Annotate a dataframe containing genomic coordinates with rsids
-#' 
+#'
 #' This function can be used rapidly add rsids to GWAS summary statistics or any other dataframe containing genomic coordinates (eg. chromosome and position). This is a rapid function that does not explicitly account for differences in variants at each position, strand flips, etc.)
-#' 
+#'
 #' @param df Dataframe containing genomic coordinates to annotate with rsid
 #' @param dbSNP Bioconductor object containing SNP locations and alleles to be used for annotation (default: `SNPlocs.Hsapiens.dbSNP144.GRCh37::SNPlocs.Hsapiens.dbSNP144.GRCh37`)
-#' @param chrom_col Chromosome column 
+#' @param chrom_col Chromosome column
 #' @param pos_col Position column
 #'
 #' @return A dataframe containing the original contents, with an additional `rsid` column.
-#' 
+#'
 #' @export
-#' @concept genomics 
+#' @concept genomics
 #' @family {annotation}
 #' @examples
 #' \dontrun{
 #' annotate_rsids(sumstats_df)
 #' }
-
 annotate_rsids <- function(df, dbSNP = SNPlocs.Hsapiens.dbSNP144.GRCh37::SNPlocs.Hsapiens.dbSNP144.GRCh37, chrom_col = Chromosome, pos_col = Position) {
   if (sum(stringr::str_detect(names(df), "rsid")) > 0) {
     cli::cli_abort("A column named 'rsid' is already present")

diff --git a/R/calc_credset.R b/R/calc_credset.R
@@ -1,26 +1,25 @@
 # WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand
 
 #' Perform Bayesian finemapping using the Approximate Bayes Factor approach
-#' 
+#'
 #' Description
-#' 
+#'
 #' @param df Dataframe containing GWAS summary statistics
-#' @param locus_marker_col Column containing a locus-level identifier 
+#' @param locus_marker_col Column containing a locus-level identifier
 #' @param effect_col Column containing effect estimates
 #' @param se_col Column containing standard errors fo the effect estimates
 #' @param samplesize_col Column containing sample sizes
 #' @param cred_interval Credible interval for the fine-mapped credible sets (default = 0.99; 0.95 is another common but artbitrarily determined interval)
 #'
 #' @return A data.frame containing credible sets at each locus. For each variant within the credible set, the prior probability of being the casual variant is provided.
-#' 
+#'
 #' @export
 #' @family {finemapping}
 #' @concept genomics
 #' @examples
 #' \dontrun{
 #' calc_credset(gwas_df)
 #' }
-
 calc_credset <- function(df, locus_marker_col = locus_marker, effect_col = effect, se_col = std_err, samplesize_col = samplesize, cred_interval = 0.99) {
   df %>%
     group_by({{ locus_marker_col }}) %>%

diff --git a/R/coloc_run.R b/R/coloc_run.R
@@ -1,9 +1,9 @@
 # WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand
 
 #' Run Bayesian enumeration colocalization using Coloc
-#' 
+#'
 #' This function is a wrapper around [coloc::coloc.abf()] that takes a dataframe as input, and performs colocalization under the single-causal-variant assumption. Coloc was described in Giambartolomei et al. (PLOS Genetics 2014; <https://doi.org/10.1371/journal.pgen.1004383>).
-#' 
+#'
 #' @param df Dataframe containing summary statistics at a single locus for two traits in a "long" format, with one row per variant per trait.
 #' @param trait_col Column containing trait names
 #' @param variant_col Column containing unique variant identifiers (Eg. rsids, chr:pos)
@@ -21,55 +21,57 @@
 #' @return A list containing coloc results.
 #' - `summary` is a named vector containing the number of snps, and the posterior probabilities of the 5 colocalization hypotheses
 #' - `results` is an annotated version of the input data containing log approximate Bayes Factors and posterior probability of each SNP being causal if H4 is true.
-#' 
+#'
 #' @concept genomics
 #' @family {colocalization}
 #' @export
 #' @examples
 #' \dontrun{
 #' coloc_run(locus_df)
 #' }
-
 coloc_run <- function(df, trait_col = trait, variant_col = rsid, beta_col = beta, se_col = se, samplesize_col = samplesize, maf_col = maf, type_col = type, case_prop_col = case_prop, p1 = 1e-4, p2 = 1e-4, p12 = 1e-5, ...) {
-
-df <- df %>% 
-    select(trait = {{trait_col}}, maf = {{maf_col}}, type = {{type_col}}, variant_id = {{variant_col}}, beta = {{beta_col}}, se = {{se_col}}, samplesize = {{samplesize_col}}, case_prop = {{case_prop_col}}) %>%
+  df <- df %>%
+    select(trait = {{ trait_col }}, maf = {{ maf_col }}, type = {{ type_col }}, variant_id = {{ variant_col }}, beta = {{ beta_col }}, se = {{ se_col }}, samplesize = {{ samplesize_col }}, case_prop = {{ case_prop_col }}) %>%
     add_count(variant_id) %>%
     filter(n == 2)
-  
+
   trait_dfs <- df %>%
     distinct(trait)
-  
-  if(nrow(trait_dfs) != 2) {
+
+  if (nrow(trait_dfs) != 2) {
     cli::cli_abort("The input dataframe must contain only two traits")
   }
-  
+
   trait1 <- df %>%
     filter(trait == trait_dfs$trait[1]) %>%
     distinct(variant_id, .keep_all = TRUE)
-  
+
   trait2 <- df %>%
     filter(trait == trait_dfs$trait[2]) %>%
     distinct(variant_id, .keep_all = TRUE)
-
-  trait1_dataset <- list(beta = trait1$beta, #If log_OR column is full of NAs then use beta column instead
-                          varbeta = trait1$se^2,
-                          # pvalues = trait1$pval,
-                          type = unique(trait1$type), 
-                          snp = trait1$variant_id,
-                          N = trait1$samplesize,
-                          # {if(unique(trait1$type) == "cc") {s = trait1$case_prop[1]}},
-                          MAF = trait1$maf)
-
-  trait2_dataset <- list(beta = trait2$beta, #If log_OR column is full of NAs then use beta column instead
-                          varbeta = trait2$se^2,
-                          # pvalues = trait2$pval,
-                          type = unique(trait2$type), 
-                          snp = trait2$variant_id,
-                          N = trait2$samplesize,
-                         # {if(unique(trait2$type) == "cc") {s = trait1$case_prop[2]}},
-                          MAF = trait2$maf)
-
+
+  trait1_dataset <- list(
+    beta = trait1$beta, # If log_OR column is full of NAs then use beta column instead
+    varbeta = trait1$se^2,
+    # pvalues = trait1$pval,
+    type = unique(trait1$type),
+    snp = trait1$variant_id,
+    N = trait1$samplesize,
+    # {if(unique(trait1$type) == "cc") {s = trait1$case_prop[1]}},
+    MAF = trait1$maf
+  )
+
+  trait2_dataset <- list(
+    beta = trait2$beta, # If log_OR column is full of NAs then use beta column instead
+    varbeta = trait2$se^2,
+    # pvalues = trait2$pval,
+    type = unique(trait2$type),
+    snp = trait2$variant_id,
+    N = trait2$samplesize,
+    # {if(unique(trait2$type) == "cc") {s = trait1$case_prop[2]}},
+    MAF = trait2$maf
+  )
+
   suppressWarnings(coloc_res <- coloc::coloc.abf(dataset1 = trait1_dataset, dataset2 = trait2_dataset, p1 = p1, p2 = p2, p12 = p12, ...))
   return(coloc_res)
 }
diff --git a/R/gg_manhattan_df.R b/R/gg_manhattan_df.R
@@ -26,7 +26,6 @@
 #' \dontrun{
 #' gg_manhattan_df(sumstats_df)
 #' }
-
 gg_manhattan_df <- function(sumstats_df, annotation_df = NULL, chr_col = chromosome, pos_col = position, pval_col = p_value, pval_threshold = 0.001, label_col = gene, build = "hg19", color1 = "#045ea7", color2 = "#82afd3", speed = "slow", ...) {
   if (!is.null((annotation_df))) {
     df <- sumstats_df %>%

diff --git a/R/gg_qq_df.R b/R/gg_qq_df.R
@@ -18,7 +18,6 @@
 #' \dontrun{
 #' gg_qq_df(sumstats_df)
 #' }
-
 gg_qq_df <- function(sumstats_df, pval_col = p_value, ...) {
   df <- sumstats_df %>%
     select(pvalue = {{ pval_col }}) %>%

diff --git a/R/hyprcoloc_df.R b/R/hyprcoloc_df.R
@@ -1,9 +1,9 @@
 # WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand
 
 #' Run multi-trait colocalization using HyPrColoc
-#' 
+#'
 #' This function is a convenience wrapper around [hyprcoloc::hyprcoloc()] that takes a dataframe as input, and performs mutli-trait colocalization. Details of the HyPrColoc method are described in Foley et al. (Nature Communications 2021; <https://doi.org/10.1038/s41467-020-20885-8>).
-#' 
+#'
 #' @param df Dataframe containing summary statistics at a single locus, in a "long" format, with one row per variant per trait.
 #' @param trait_col Column containing trait names
 #' @param snp_col Column containing variant names (eg. rsid, marker_name), which should be consistent across studies
@@ -22,7 +22,6 @@
 #' \dontrun{
 #' hyprcoloc_df(gwas_results_df)
 #' }
-
 hyprcoloc_df <- function(df, trait_col = trait, snp_col = rsid, beta_col = beta, se_col = se, type_col = type, ...) {
   df <- df %>%
     select(trait = {{ trait_col }}, rsid = {{ snp_col }}, beta.exposure = {{ beta_col }}, se.exposure = {{ se_col }}, type = {{ type_col }}) %>%

diff --git a/R/ldak_h2.R b/R/ldak_h2.R
@@ -1,15 +1,15 @@
 # WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand
 
 #' Calculate heritability using LDAK
-#' 
+#'
 #' This function wraps LDAK, a command-line tool for estimating heritability. The tool and associated reference files can be download from the LDAK website (<https://ldak.org/>). The method is described in Zhang et al. (Nature Communications 2021; <https://doi.org/10.1038/s41467-021-24485-y).
-#' 
-#' @param sumstats_file Path to "munged" GWAS summary statistics file in LDSC format. The current implementation of this function requires rsids to link to other LDAK files. 
+#'
+#' @param sumstats_file Path to "munged" GWAS summary statistics file in LDSC format. The current implementation of this function requires rsids to link to other LDAK files.
 #' @param ldak_bin Path to LDAK binary
 #' @param ldak_tagfile Path to LDAK tagfile
 #' @param sample_prev Sample prevalence of cases (for case-control studies), `NULL` (default) for quantitative traits
 #' @param population_prev Population prevalence of cases in the sample (for case-control studies), `NULL` (default) for quantitative traits
-#' @param hm3_file Path to hm3 file containing 
+#' @param hm3_file Path to hm3 file containing
 #' @param ldak_cutoff Minor allele frequency cutoff (default = 0.01)
 #'
 #' @return List of dataframes containing LDAK results
@@ -19,17 +19,16 @@
 #' @export
 #' @examples
 #' \dontrun{
-#' ldak_h2(sumstats_file = "/path/to/munged_sumstats", ldak_bin = "/path/to/ldak", ldak_tagfile = "/path/to/tagfile",  hm3_file = "/path/to/hm3")
+#' ldak_h2(sumstats_file = "/path/to/munged_sumstats", ldak_bin = "/path/to/ldak", ldak_tagfile = "/path/to/tagfile", hm3_file = "/path/to/hm3")
 #' }
-
 ldak_h2 <- function(sumstats_file, ldak_bin, ldak_tagfile, sample_prev = NULL, population_prev = NULL, hm3_file, ldak_cutoff = 0.01) {
   # format paths
   cli::cli_progress_step("Formatting paths")
   ldak_bin <- fs::path_abs(ldak_bin)
   ldak_tagfile <- fs::path_abs(ldak_tagfile)
   ldak_in <- fs::file_temp()
   ldak_dir <- fs::path_temp()
-  
+
   # Category annotations
   ldak_annotations <- glue::glue("1 Coding_UCSC*
 2 Coding_UCSC.extend.500
@@ -97,9 +96,9 @@ ldak_h2 <- function(sumstats_file, ldak_bin, ldak_tagfile, sample_prev = NULL, p
 64 CpG_Content_50kb
 65 LDAK_Weightings
 66 Base_Category") %>%
-  read_delim(col_names = c("category", "annotation"), delim = " ") %>%
-  mutate(binary = str_detect(annotation, "\\*")) %>%
-  mutate(annotation = str_replace(annotation, "\\*", ""))
+    read_delim(col_names = c("category", "annotation"), delim = " ") %>%
+    mutate(binary = str_detect(annotation, "\\*")) %>%
+    mutate(annotation = str_replace(annotation, "\\*", ""))
 
   # read files
   cli::cli_progress_step("Reading files")
@@ -123,8 +122,12 @@ ldak_h2 <- function(sumstats_file, ldak_bin, ldak_tagfile, sample_prev = NULL, p
       "--summary", ldak_in,
       "--tagfile", ldak_tagfile,
       "--check-sums", "NO",
-      if(!is.null(population_prev)){c("--prevalence", population_prev)},
-      if(!is.null(sample_prev)){c("--ascertainment", sample_prev)},
+      if (!is.null(population_prev)) {
+        c("--prevalence", population_prev)
+      },
+      if (!is.null(sample_prev)) {
+        c("--ascertainment", sample_prev)
+      },
       "--cutoff", ldak_cutoff
     ),
     error_on_status = FALSE,

diff --git a/R/magmar.R b/R/magmar.R
@@ -1,9 +1,9 @@
 # WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand
 
 #' Run MAGMA gene-based analysis
-#' 
+#'
 #' This function provides a wrapper to MAGMA, which performs gene-based testing from GWAS summary statistics. Details of MAGMA described in de Leeuw et al. (PLoS Cumputational Biology 2015; <https://doi.org/10.1371/journal.pcbi.1004219>). The MAGMA binary and reference files are available from the Complex Trait Genetics lab (<https://ctg.cncr.nl/software/magma>).
-#' 
+#'
 #' @param sumstats_df Dataframe containing GWAS summary statistics
 #' @param pval_col Column containing p-values
 #' @param snp_col Column containing rsids
@@ -17,7 +17,7 @@
 #' @concept genomics
 #' @family Gene-based testing
 #' @export
-#' 
+#'
 #' @examples
 #' \dontrun{
 #' magmar(sumstats_df, magma_bin = "/path/to/magma", bfile = "/path/to/bfiles", gene_file = "/path/to/genes", out_file = "magma_output")

diff --git a/R/metal_config.R b/R/metal_config.R
@@ -33,8 +33,7 @@
 #' metal_config(config_name = "name-of-analysis", output_dir = "/path/to/output/", study_files = c("/path/to/sumstats_1.txt", "/path/to/sumstats_2.txt))
 #' }
 
-metal_config <- function(config_name, output_dir, study_files, SCHEME = "STDERR", AVERAGEFREQ = "ON", MINMAXFREQ = "OFF", TRACKPOSITIONS = "ON", MARKERLABEL = "MARKER", CHROMOSOMELABEL = "CHROM", POSITIONLABEL = "POS", EFFECT_ALLELE = "EFFECT_ALLELE", OTHER_ALLELE = "OTHER_ALLELE", EFFECTLABEL = "BETA", STDERR = "SE", FREQLABEL = "EAF", NCASE = "N_CASE", NCONTROL = "N_CONTROL", SAMPLESIZE = "N") {
-
+metal_config <- function(config_name, output_dir, study_files, SCHEME = "STDERR", AVERAGEFREQ = "ON", MINMAXFREQ = "OFF", TRACKPOSITIONS = "ON", MARKERLABEL = "MARKER", CHROMOSOMELABEL = "CHROM", POSITIONLABEL = "POS", EFFECT_ALLELE = "EFFECT_ALLELE", OTHER_ALLELE = "OTHER_ALLELE", EFFECTLABEL = "BETA", STDERR = "SE", FREQLABEL = "EAF", NCASE = NULL, NCONTROL = NULL, SAMPLESIZE = NULL) {
   fs::dir_create(output_dir, recurse = TRUE)
 
   config_outfile <- fs::path(output_dir, paste0(config_name, "_metal-config.txt"))
@@ -46,32 +45,49 @@ metal_config <- function(config_name, output_dir, study_files, SCHEME = "STDERR"
     "SCHEME {SCHEME}
     AVERAGEFREQ {AVERAGEFREQ}
     MINMAXFREQ {MINMAXFREQ}
-    TRACKPOSITIONs {TRACKPOSITIONS}
+    TRACKPOSITIONS {TRACKPOSITIONS}
     MARKERLABEL {MARKERLABEL}
     CHROMOSOMELABEL {CHROMOSOMELABEL}
     POSITIONLABEL {POSITIONLABEL}
     ALLELELABELS {EFFECT_ALLELE} {OTHER_ALLELE}
     EFFECTLABEL {EFFECTLABEL}
     STDERR {STDERR}
     FREQLABEL {FREQLABEL}
-    
+
+    "
+  )
+
+  if (!is.null(NCASE)) {
+    config_text <- glue::glue("
+    {config_text}\n
     CUSTOMVARIABLE NCASE
     LABEL NCASE as {NCASE}
-    
+    ")
+  }
+
+  if (!is.null(NCONTROL)) {
+    config_text <- glue::glue("
+    {config_text}\n
     CUSTOMVARIABLE NCONTROL
-    LABEL NCONTROL as {NCONTROL}
-    
+    LABEL NCONTROL as {NCONTROL}")
+  }
+
+  if (!is.null(SAMPLESIZE)) {
+    config_text <- glue::glue("
+    {config_text}\n
     CUSTOMVARIABLE SAMPLESIZE
-    LABEL SAMPLESIZE as {SAMPLESIZE}
-    
+    LABEL SAMPLESIZE as {SAMPLESIZE}")
+  }
+
+  config_text <- glue::glue("
+    {config_text}
+                            
     {study_files}
     
     OUTFILE {meta_outfile}_metal- .txt
     ANALYZE HETEROGENEITY
     
-    QUIT
-    "
-  )
+    QUIT")
 
   readr::write_lines(x = config_text, file = config_outfile)
 

diff --git a/R/metal_run.R b/R/metal_run.R
@@ -1,9 +1,9 @@
 # WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand
 
 #' Use METAL to run a GWAS meta-analysis
-#' 
+#'
 #' This function is a wrapper for METAL, a tool for performing meta-analysis of GWAS summary statistics <https://github.com/statgen/METAL>. Details of the arguments to METAL are described in the METAL documentation: <https://genome.sph.umich.edu/wiki/METAL_Documentation>.
-#' 
+#'
 #' @param config_file (path) Path to a METAL configuration file (this can be generated using [levinmisc::metal_config()])
 #' @param metal_path (path) Path to the METAL binary
 #'
@@ -16,7 +16,6 @@
 #' \dontrun{
 #' metal_run(config_file = "config.txt", metal_path = "/path/to/metal_binary")
 #' }
-
 metal_run <- function(config_file, metal_path) {
   metal_path <- normalizePath(metal_path)
   config_file <- normalizePath(config_file)