Skip to content

Commit

Permalink
allow missing samplesize columns in metal config
Browse files Browse the repository at this point in the history
  • Loading branch information
mglev1n committed Mar 21, 2024
1 parent 1d43a2d commit fdca905
Show file tree
Hide file tree
Showing 35 changed files with 215 additions and 194 deletions.
11 changes: 5 additions & 6 deletions R/annotate_rsids.R
Original file line number Diff line number Diff line change
@@ -1,24 +1,23 @@
# WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand

#' Annotate a dataframe containing genomic coordinates with rsids
#'
#'
#' This function can be used rapidly add rsids to GWAS summary statistics or any other dataframe containing genomic coordinates (eg. chromosome and position). This is a rapid function that does not explicitly account for differences in variants at each position, strand flips, etc.)
#'
#'
#' @param df Dataframe containing genomic coordinates to annotate with rsid
#' @param dbSNP Bioconductor object containing SNP locations and alleles to be used for annotation (default: `SNPlocs.Hsapiens.dbSNP144.GRCh37::SNPlocs.Hsapiens.dbSNP144.GRCh37`)
#' @param chrom_col Chromosome column
#' @param chrom_col Chromosome column
#' @param pos_col Position column
#'
#' @return A dataframe containing the original contents, with an additional `rsid` column.
#'
#'
#' @export
#' @concept genomics
#' @concept genomics
#' @family {annotation}
#' @examples
#' \dontrun{
#' annotate_rsids(sumstats_df)
#' }

annotate_rsids <- function(df, dbSNP = SNPlocs.Hsapiens.dbSNP144.GRCh37::SNPlocs.Hsapiens.dbSNP144.GRCh37, chrom_col = Chromosome, pos_col = Position) {
if (sum(stringr::str_detect(names(df), "rsid")) > 0) {
cli::cli_abort("A column named 'rsid' is already present")
Expand Down
9 changes: 4 additions & 5 deletions R/calc_credset.R
Original file line number Diff line number Diff line change
@@ -1,26 +1,25 @@
# WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand

#' Perform Bayesian finemapping using the Approximate Bayes Factor approach
#'
#'
#' Description
#'
#'
#' @param df Dataframe containing GWAS summary statistics
#' @param locus_marker_col Column containing a locus-level identifier
#' @param locus_marker_col Column containing a locus-level identifier
#' @param effect_col Column containing effect estimates
#' @param se_col Column containing standard errors fo the effect estimates
#' @param samplesize_col Column containing sample sizes
#' @param cred_interval Credible interval for the fine-mapped credible sets (default = 0.99; 0.95 is another common but artbitrarily determined interval)
#'
#' @return A data.frame containing credible sets at each locus. For each variant within the credible set, the prior probability of being the casual variant is provided.
#'
#'
#' @export
#' @family {finemapping}
#' @concept genomics
#' @examples
#' \dontrun{
#' calc_credset(gwas_df)
#' }

calc_credset <- function(df, locus_marker_col = locus_marker, effect_col = effect, se_col = std_err, samplesize_col = samplesize, cred_interval = 0.99) {
df %>%
group_by({{ locus_marker_col }}) %>%
Expand Down
64 changes: 33 additions & 31 deletions R/coloc_run.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand

#' Run Bayesian enumeration colocalization using Coloc
#'
#'
#' This function is a wrapper around [coloc::coloc.abf()] that takes a dataframe as input, and performs colocalization under the single-causal-variant assumption. Coloc was described in Giambartolomei et al. (PLOS Genetics 2014; <https://doi.org/10.1371/journal.pgen.1004383>).
#'
#'
#' @param df Dataframe containing summary statistics at a single locus for two traits in a "long" format, with one row per variant per trait.
#' @param trait_col Column containing trait names
#' @param variant_col Column containing unique variant identifiers (Eg. rsids, chr:pos)
Expand All @@ -21,55 +21,57 @@
#' @return A list containing coloc results.
#' - `summary` is a named vector containing the number of snps, and the posterior probabilities of the 5 colocalization hypotheses
#' - `results` is an annotated version of the input data containing log approximate Bayes Factors and posterior probability of each SNP being causal if H4 is true.
#'
#'
#' @concept genomics
#' @family {colocalization}
#' @export
#' @examples
#' \dontrun{
#' coloc_run(locus_df)
#' }

coloc_run <- function(df, trait_col = trait, variant_col = rsid, beta_col = beta, se_col = se, samplesize_col = samplesize, maf_col = maf, type_col = type, case_prop_col = case_prop, p1 = 1e-4, p2 = 1e-4, p12 = 1e-5, ...) {

df <- df %>%
select(trait = {{trait_col}}, maf = {{maf_col}}, type = {{type_col}}, variant_id = {{variant_col}}, beta = {{beta_col}}, se = {{se_col}}, samplesize = {{samplesize_col}}, case_prop = {{case_prop_col}}) %>%
df <- df %>%
select(trait = {{ trait_col }}, maf = {{ maf_col }}, type = {{ type_col }}, variant_id = {{ variant_col }}, beta = {{ beta_col }}, se = {{ se_col }}, samplesize = {{ samplesize_col }}, case_prop = {{ case_prop_col }}) %>%
add_count(variant_id) %>%
filter(n == 2)

trait_dfs <- df %>%
distinct(trait)
if(nrow(trait_dfs) != 2) {

if (nrow(trait_dfs) != 2) {
cli::cli_abort("The input dataframe must contain only two traits")
}

trait1 <- df %>%
filter(trait == trait_dfs$trait[1]) %>%
distinct(variant_id, .keep_all = TRUE)

trait2 <- df %>%
filter(trait == trait_dfs$trait[2]) %>%
distinct(variant_id, .keep_all = TRUE)

trait1_dataset <- list(beta = trait1$beta, #If log_OR column is full of NAs then use beta column instead
varbeta = trait1$se^2,
# pvalues = trait1$pval,
type = unique(trait1$type),
snp = trait1$variant_id,
N = trait1$samplesize,
# {if(unique(trait1$type) == "cc") {s = trait1$case_prop[1]}},
MAF = trait1$maf)

trait2_dataset <- list(beta = trait2$beta, #If log_OR column is full of NAs then use beta column instead
varbeta = trait2$se^2,
# pvalues = trait2$pval,
type = unique(trait2$type),
snp = trait2$variant_id,
N = trait2$samplesize,
# {if(unique(trait2$type) == "cc") {s = trait1$case_prop[2]}},
MAF = trait2$maf)


trait1_dataset <- list(
beta = trait1$beta, # If log_OR column is full of NAs then use beta column instead
varbeta = trait1$se^2,
# pvalues = trait1$pval,
type = unique(trait1$type),
snp = trait1$variant_id,
N = trait1$samplesize,
# {if(unique(trait1$type) == "cc") {s = trait1$case_prop[1]}},
MAF = trait1$maf
)

trait2_dataset <- list(
beta = trait2$beta, # If log_OR column is full of NAs then use beta column instead
varbeta = trait2$se^2,
# pvalues = trait2$pval,
type = unique(trait2$type),
snp = trait2$variant_id,
N = trait2$samplesize,
# {if(unique(trait2$type) == "cc") {s = trait1$case_prop[2]}},
MAF = trait2$maf
)

suppressWarnings(coloc_res <- coloc::coloc.abf(dataset1 = trait1_dataset, dataset2 = trait2_dataset, p1 = p1, p2 = p2, p12 = p12, ...))
return(coloc_res)
}
1 change: 0 additions & 1 deletion R/gg_manhattan_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
#' \dontrun{
#' gg_manhattan_df(sumstats_df)
#' }

gg_manhattan_df <- function(sumstats_df, annotation_df = NULL, chr_col = chromosome, pos_col = position, pval_col = p_value, pval_threshold = 0.001, label_col = gene, build = "hg19", color1 = "#045ea7", color2 = "#82afd3", speed = "slow", ...) {
if (!is.null((annotation_df))) {
df <- sumstats_df %>%
Expand Down
1 change: 0 additions & 1 deletion R/gg_qq_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
#' \dontrun{
#' gg_qq_df(sumstats_df)
#' }

gg_qq_df <- function(sumstats_df, pval_col = p_value, ...) {
df <- sumstats_df %>%
select(pvalue = {{ pval_col }}) %>%
Expand Down
5 changes: 2 additions & 3 deletions R/hyprcoloc_df.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand

#' Run multi-trait colocalization using HyPrColoc
#'
#'
#' This function is a convenience wrapper around [hyprcoloc::hyprcoloc()] that takes a dataframe as input, and performs mutli-trait colocalization. Details of the HyPrColoc method are described in Foley et al. (Nature Communications 2021; <https://doi.org/10.1038/s41467-020-20885-8>).
#'
#'
#' @param df Dataframe containing summary statistics at a single locus, in a "long" format, with one row per variant per trait.
#' @param trait_col Column containing trait names
#' @param snp_col Column containing variant names (eg. rsid, marker_name), which should be consistent across studies
Expand All @@ -22,7 +22,6 @@
#' \dontrun{
#' hyprcoloc_df(gwas_results_df)
#' }

hyprcoloc_df <- function(df, trait_col = trait, snp_col = rsid, beta_col = beta, se_col = se, type_col = type, ...) {
df <- df %>%
select(trait = {{ trait_col }}, rsid = {{ snp_col }}, beta.exposure = {{ beta_col }}, se.exposure = {{ se_col }}, type = {{ type_col }}) %>%
Expand Down
27 changes: 15 additions & 12 deletions R/ldak_h2.R
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand

#' Calculate heritability using LDAK
#'
#'
#' This function wraps LDAK, a command-line tool for estimating heritability. The tool and associated reference files can be download from the LDAK website (<https://ldak.org/>). The method is described in Zhang et al. (Nature Communications 2021; <https://doi.org/10.1038/s41467-021-24485-y).
#'
#' @param sumstats_file Path to "munged" GWAS summary statistics file in LDSC format. The current implementation of this function requires rsids to link to other LDAK files.
#'
#' @param sumstats_file Path to "munged" GWAS summary statistics file in LDSC format. The current implementation of this function requires rsids to link to other LDAK files.
#' @param ldak_bin Path to LDAK binary
#' @param ldak_tagfile Path to LDAK tagfile
#' @param sample_prev Sample prevalence of cases (for case-control studies), `NULL` (default) for quantitative traits
#' @param population_prev Population prevalence of cases in the sample (for case-control studies), `NULL` (default) for quantitative traits
#' @param hm3_file Path to hm3 file containing
#' @param hm3_file Path to hm3 file containing
#' @param ldak_cutoff Minor allele frequency cutoff (default = 0.01)
#'
#' @return List of dataframes containing LDAK results
Expand All @@ -19,17 +19,16 @@
#' @export
#' @examples
#' \dontrun{
#' ldak_h2(sumstats_file = "/path/to/munged_sumstats", ldak_bin = "/path/to/ldak", ldak_tagfile = "/path/to/tagfile", hm3_file = "/path/to/hm3")
#' ldak_h2(sumstats_file = "/path/to/munged_sumstats", ldak_bin = "/path/to/ldak", ldak_tagfile = "/path/to/tagfile", hm3_file = "/path/to/hm3")
#' }

ldak_h2 <- function(sumstats_file, ldak_bin, ldak_tagfile, sample_prev = NULL, population_prev = NULL, hm3_file, ldak_cutoff = 0.01) {
# format paths
cli::cli_progress_step("Formatting paths")
ldak_bin <- fs::path_abs(ldak_bin)
ldak_tagfile <- fs::path_abs(ldak_tagfile)
ldak_in <- fs::file_temp()
ldak_dir <- fs::path_temp()

# Category annotations
ldak_annotations <- glue::glue("1 Coding_UCSC*
2 Coding_UCSC.extend.500
Expand Down Expand Up @@ -97,9 +96,9 @@ ldak_h2 <- function(sumstats_file, ldak_bin, ldak_tagfile, sample_prev = NULL, p
64 CpG_Content_50kb
65 LDAK_Weightings
66 Base_Category") %>%
read_delim(col_names = c("category", "annotation"), delim = " ") %>%
mutate(binary = str_detect(annotation, "\\*")) %>%
mutate(annotation = str_replace(annotation, "\\*", ""))
read_delim(col_names = c("category", "annotation"), delim = " ") %>%
mutate(binary = str_detect(annotation, "\\*")) %>%
mutate(annotation = str_replace(annotation, "\\*", ""))

# read files
cli::cli_progress_step("Reading files")
Expand All @@ -123,8 +122,12 @@ ldak_h2 <- function(sumstats_file, ldak_bin, ldak_tagfile, sample_prev = NULL, p
"--summary", ldak_in,
"--tagfile", ldak_tagfile,
"--check-sums", "NO",
if(!is.null(population_prev)){c("--prevalence", population_prev)},
if(!is.null(sample_prev)){c("--ascertainment", sample_prev)},
if (!is.null(population_prev)) {
c("--prevalence", population_prev)
},
if (!is.null(sample_prev)) {
c("--ascertainment", sample_prev)
},
"--cutoff", ldak_cutoff
),
error_on_status = FALSE,
Expand Down
6 changes: 3 additions & 3 deletions R/magmar.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand

#' Run MAGMA gene-based analysis
#'
#'
#' This function provides a wrapper to MAGMA, which performs gene-based testing from GWAS summary statistics. Details of MAGMA described in de Leeuw et al. (PLoS Cumputational Biology 2015; <https://doi.org/10.1371/journal.pcbi.1004219>). The MAGMA binary and reference files are available from the Complex Trait Genetics lab (<https://ctg.cncr.nl/software/magma>).
#'
#'
#' @param sumstats_df Dataframe containing GWAS summary statistics
#' @param pval_col Column containing p-values
#' @param snp_col Column containing rsids
Expand All @@ -17,7 +17,7 @@
#' @concept genomics
#' @family Gene-based testing
#' @export
#'
#'
#' @examples
#' \dontrun{
#' magmar(sumstats_df, magma_bin = "/path/to/magma", bfile = "/path/to/bfiles", gene_file = "/path/to/genes", out_file = "magma_output")
Expand Down
40 changes: 28 additions & 12 deletions R/metal_config.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@
#' metal_config(config_name = "name-of-analysis", output_dir = "/path/to/output/", study_files = c("/path/to/sumstats_1.txt", "/path/to/sumstats_2.txt))
#' }

metal_config <- function(config_name, output_dir, study_files, SCHEME = "STDERR", AVERAGEFREQ = "ON", MINMAXFREQ = "OFF", TRACKPOSITIONS = "ON", MARKERLABEL = "MARKER", CHROMOSOMELABEL = "CHROM", POSITIONLABEL = "POS", EFFECT_ALLELE = "EFFECT_ALLELE", OTHER_ALLELE = "OTHER_ALLELE", EFFECTLABEL = "BETA", STDERR = "SE", FREQLABEL = "EAF", NCASE = "N_CASE", NCONTROL = "N_CONTROL", SAMPLESIZE = "N") {

metal_config <- function(config_name, output_dir, study_files, SCHEME = "STDERR", AVERAGEFREQ = "ON", MINMAXFREQ = "OFF", TRACKPOSITIONS = "ON", MARKERLABEL = "MARKER", CHROMOSOMELABEL = "CHROM", POSITIONLABEL = "POS", EFFECT_ALLELE = "EFFECT_ALLELE", OTHER_ALLELE = "OTHER_ALLELE", EFFECTLABEL = "BETA", STDERR = "SE", FREQLABEL = "EAF", NCASE = NULL, NCONTROL = NULL, SAMPLESIZE = NULL) {
fs::dir_create(output_dir, recurse = TRUE)

config_outfile <- fs::path(output_dir, paste0(config_name, "_metal-config.txt"))
Expand All @@ -46,32 +45,49 @@ metal_config <- function(config_name, output_dir, study_files, SCHEME = "STDERR"
"SCHEME {SCHEME}
AVERAGEFREQ {AVERAGEFREQ}
MINMAXFREQ {MINMAXFREQ}
TRACKPOSITIONs {TRACKPOSITIONS}
TRACKPOSITIONS {TRACKPOSITIONS}
MARKERLABEL {MARKERLABEL}
CHROMOSOMELABEL {CHROMOSOMELABEL}
POSITIONLABEL {POSITIONLABEL}
ALLELELABELS {EFFECT_ALLELE} {OTHER_ALLELE}
EFFECTLABEL {EFFECTLABEL}
STDERR {STDERR}
FREQLABEL {FREQLABEL}
"
)

if (!is.null(NCASE)) {
config_text <- glue::glue("
{config_text}\n
CUSTOMVARIABLE NCASE
LABEL NCASE as {NCASE}
")
}

if (!is.null(NCONTROL)) {
config_text <- glue::glue("
{config_text}\n
CUSTOMVARIABLE NCONTROL
LABEL NCONTROL as {NCONTROL}
LABEL NCONTROL as {NCONTROL}")
}

if (!is.null(SAMPLESIZE)) {
config_text <- glue::glue("
{config_text}\n
CUSTOMVARIABLE SAMPLESIZE
LABEL SAMPLESIZE as {SAMPLESIZE}
LABEL SAMPLESIZE as {SAMPLESIZE}")
}

config_text <- glue::glue("
{config_text}
{study_files}
OUTFILE {meta_outfile}_metal- .txt
ANALYZE HETEROGENEITY
QUIT
"
)
QUIT")

readr::write_lines(x = config_text, file = config_outfile)

Expand Down
5 changes: 2 additions & 3 deletions R/metal_run.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# WARNING - Generated by {fusen} from dev/flat_genomics_functions.Rmd: do not edit by hand

#' Use METAL to run a GWAS meta-analysis
#'
#'
#' This function is a wrapper for METAL, a tool for performing meta-analysis of GWAS summary statistics <https://github.com/statgen/METAL>. Details of the arguments to METAL are described in the METAL documentation: <https://genome.sph.umich.edu/wiki/METAL_Documentation>.
#'
#'
#' @param config_file (path) Path to a METAL configuration file (this can be generated using [levinmisc::metal_config()])
#' @param metal_path (path) Path to the METAL binary
#'
Expand All @@ -16,7 +16,6 @@
#' \dontrun{
#' metal_run(config_file = "config.txt", metal_path = "/path/to/metal_binary")
#' }

metal_run <- function(config_file, metal_path) {
metal_path <- normalizePath(metal_path)
config_file <- normalizePath(config_file)
Expand Down
Loading

0 comments on commit fdca905

Please sign in to comment.