From 59cf10c127a375df8170d823475a5edcf32cd30f Mon Sep 17 00:00:00 2001 From: "Brian M. Schilder" <34280215+bschilder@users.noreply.github.com> Date: Wed, 24 Jan 2024 02:01:20 +0000 Subject: [PATCH] update to match new MultiEWCE --- NAMESPACE | 1 + R/0docs.R | 1 + R/add_gene_frequency.R | 7 ++++--- R/add_genes.R | 3 --- R/add_gpt_annotations.R | 42 +++++++++++++++++++++++++++++++++++++++++ R/add_hpo_definition.R | 3 +-- R/add_hpo_id.R | 21 ++++----------------- R/add_hpo_name.R | 4 +--- R/gpt_annot_codify.R | 4 ++++ man/add_.Rd | 36 +++++++++++++++++++++++++++++------ man/gpt_annot_codify.Rd | 4 ++++ man/make_.Rd | 2 +- 12 files changed, 93 insertions(+), 35 deletions(-) create mode 100644 R/add_gpt_annotations.R diff --git a/NAMESPACE b/NAMESPACE index 8791576..37e14a5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(add_disease_genes) export(add_evidence) export(add_gene_frequency) export(add_genes) +export(add_gpt_annotations) export(add_hpo_definition) export(add_hpo_id) export(add_hpo_name) diff --git a/R/0docs.R b/R/0docs.R index 19dda00..e66ca40 100644 --- a/R/0docs.R +++ b/R/0docs.R @@ -70,6 +70,7 @@ NULL #' @family add_ #' @param agg_by Column to aggregate metadata by. #' @param add_definitions Add disease definitions using \link{add_mondo}. +#' @param gpt_filters A named list of filters to apply to the GPT annotations. #' @inheritParams main #' @inheritParams make_ #' @inheritParams get_ diff --git a/R/add_gene_frequency.R b/R/add_gene_frequency.R index 16c1dd0..4ffc0aa 100644 --- a/R/add_gene_frequency.R +++ b/R/add_gene_frequency.R @@ -21,6 +21,7 @@ add_gene_frequency <- function(phenotype_to_genes = load_phenotype_to_genes(), gene_frequency_threshold = NULL, all.x = TRUE, + allow.cartesian = FALSE, verbose = TRUE){ # devoptera::args2vars(add_gene_frequency) @@ -28,8 +29,7 @@ add_gene_frequency <- function(phenotype_to_genes = load_phenotype_to_genes(), frequency <- gene_freq_name <- gene_freq_mean <- gene_freq_min <- gene_freq_max <- . <- NULL; - phenotype_to_genes <- add_hpo_id(phenos = phenotype_to_genes, - phenotype_to_genes= phenotype_to_genes) + phenotype_to_genes <- add_hpo_id(phenos = phenotype_to_genes) new_cols <- c("gene_freq_name","gene_freq_min", "gene_freq_max","gene_freq_mean") if(!all(new_cols %in% names(phenotype_to_genes))){ @@ -41,7 +41,8 @@ add_gene_frequency <- function(phenotype_to_genes = load_phenotype_to_genes(), x = phenotype_to_genes, y = g2p[,c("hpo_id","gene_symbol","frequency"),with=FALSE], by = c("hpo_id","gene_symbol"), - all.x = all.x) + all.x = all.x, + allow.cartesian = allow.cartesian) #### Parse freq data #### phenotype_to_genes[,gene_freq_name:=mapply(frequency,FUN=function(f){ if(grepl("HP:",f)) get_freq_dict()[f] else f })] diff --git a/R/add_genes.R b/R/add_genes.R index f84636b..ba2159d 100644 --- a/R/add_genes.R +++ b/R/add_genes.R @@ -21,8 +21,6 @@ add_genes <- function(phenos = NULL, gene_col = "gene_symbol", all.x = FALSE, allow.cartesian = FALSE){ - # devoptera::args2vars(add_genes, reassign = TRUE) - #### Prepare gene data #### phenotype_to_genes <- data.table::copy(phenotype_to_genes) data.table::setnames(phenotype_to_genes,"disease_id","disease_id", @@ -45,7 +43,6 @@ add_genes <- function(phenos = NULL, } #### Ensure necessary columns are in phenos #### phenos <- add_hpo_id(phenos = phenos, - phenotype_to_genes = phenotype_to_genes, hpo = hpo) phenos <- add_disease(phenos = phenos, allow.cartesian = allow.cartesian) diff --git a/R/add_gpt_annotations.R b/R/add_gpt_annotations.R new file mode 100644 index 0000000..4003631 --- /dev/null +++ b/R/add_gpt_annotations.R @@ -0,0 +1,42 @@ +#' @describeIn add_ add_ +#' Add ancestor +#' +#' Add annotations generated with a Large Language Model. +#' @param annot GPT annotation data. +#' @param annot_cols Columns to add. +#' @export +#' @examples +#' phenos <- example_phenos() +#' phenos2 <- add_gpt_annotations(phenos) +add_gpt_annotations <- function(phenos, + annot = gpt_annot_codify( + reset_tiers_dict=TRUE + )$annot_weighted, + annot_cols = names(annot)[ + !names(annot) %in% c("hpo_id","hpo_name") + ], + gpt_filters=rep(list(NULL), + length(annot_cols))|> + `names<-`(annot_cols), + force_new = FALSE){ + #### Force new columns #### + if(force_new){ + messager("Force new. Removing existing annot columns.") + rm_cols <- annot_cols[annot_cols %in% names(phenos)] + if(length(rm_cols)>0) phenos[,(rm_cols):=NULL] + } + #### Check for existing columns #### + if(all(annot_cols %in% names(phenos))){ + messager("Ancestor columns already present. Skipping.") + }else { + phenos <- data.table::merge.data.table(phenos, + annot[,-c("hpo_name")], + by= "hpo_id", + all.x = TRUE) + } + #### Filter #### + phenos <- KGExplorer::filter_dt(dat=phenos, + filters = gpt_filters) + #### Return ##### + return(phenos) +} diff --git a/R/add_hpo_definition.R b/R/add_hpo_definition.R index 2137d25..20d32ff 100644 --- a/R/add_hpo_definition.R +++ b/R/add_hpo_definition.R @@ -13,8 +13,7 @@ #' @returns A named vector of HPO term descriptions. #' #' @export -#' @importFrom stats setNames -#' @importFrom data.table := +#' @import data.table #' @examples #' phenos <- example_phenos() #' phenos2 <- add_hpo_definition(phenos = phenos) diff --git a/R/add_hpo_id.R b/R/add_hpo_id.R index c8877b5..ccf2739 100644 --- a/R/add_hpo_id.R +++ b/R/add_hpo_id.R @@ -8,25 +8,12 @@ #' phenos <- unique(phenotype_to_genes[,c("hpo_id","hpo_name")]) #' phenos2 <- add_hpo_id(phenos=phenos) add_hpo_id <- function(phenos, - hpo = get_hpo(), - phenotype_to_genes = NULL) { - HPO_term_valid <- hpo_id <- NULL; - + hpo = get_hpo()) { if(!"hpo_id" %in% names(phenos)){ messager("Adding HPO IDs.") - alt_names <- grep("hpo_id","^id$",names(phenos), - value=TRUE, ignore.case = TRUE) - if(length(alt_names)>0){ - data.table::setnames(phenos,alt_names[[1]],"hpo_id") - return(phenos) - } else { - if(is.null(phenotype_to_genes)) { - phenotype_to_genes <- load_phenotype_to_genes() - } - phenos <- fix_hpo_ids(dat=phenos, - phenotype_to_genes=phenotype_to_genes) - } - phenos[,HPO_term_valid:=(hpo_id %in% hpo@terms)] + phenos$hpo_id <- map_phenotypes(hpo = hpo, + terms = phenos$hpo_name, + to = "id") } return(phenos) } diff --git a/R/add_hpo_name.R b/R/add_hpo_name.R index d54e619..67a2e5e 100644 --- a/R/add_hpo_name.R +++ b/R/add_hpo_name.R @@ -7,9 +7,7 @@ #' phenos <- example_phenos() #' phenos2 <- add_hpo_name(phenos=phenos) add_hpo_name <- function(phenos, - hpo = get_hpo(), - phenotype_to_genes = NULL) { - + hpo = get_hpo()) { if(!"hpo_name" %in% names(phenos)){ messager("Adding HPO names.") phenos <- add_hpo_id(phenos) diff --git a/R/gpt_annot_codify.R b/R/gpt_annot_codify.R index 8ddf8f0..ae3c458 100644 --- a/R/gpt_annot_codify.R +++ b/R/gpt_annot_codify.R @@ -5,6 +5,8 @@ #' @param code_dict Numerical encodings of annotation values. #' @param tiers_dict Numerical encodings of annotation column. #' @param keep_congenital_onset Which stages of congenital onset to keep. +#' @param reset_tiers_dict Override \code{tiers_dict} values and set all values +#' to 1. This will ensure that all annotations are unweighted. #' @inheritParams gpt_annot_check #' @returns Named list #' @@ -34,6 +36,7 @@ gpt_annot_codify <- function(annot = gpt_annot_read(), cancer=3, reduced_fertility=4 ), + reset_tiers_dict=FALSE, keep_congenital_onset=head(names(code_dict),4) ){ # res <- gpt_annot_check(path="~/Downloads/gpt_hpo_annotations.csv") @@ -41,6 +44,7 @@ gpt_annot_codify <- function(annot = gpt_annot_read(), severity_score_gpt <- congenital_onset <- hpo_name <- hpo_id <- NULL; d <- data.table::copy(annot) + if(isTRUE(reset_tiers_dict)) tiers_dict <- lapply(tiers_dict,function(x){1}) #### Ensure only 1 row/hpo_name by simply taking the first #### if(isTRUE(remove_duplicates)){ d <- d[,utils::head(.SD,1), by=c("hpo_id","hpo_name")] diff --git a/man/add_.Rd b/man/add_.Rd index 7bd149d..1a3c293 100644 --- a/man/add_.Rd +++ b/man/add_.Rd @@ -1,10 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/0docs.R, R/add_ancestor.R, R/add_death.R, % R/add_disease.R, R/add_disease_genes.R, R/add_evidence.R, -% R/add_gene_frequency.R, R/add_genes.R, R/add_hpo_definition.R, -% R/add_hpo_id.R, R/add_hpo_name.R, R/add_info_content.R, R/add_mondo.R, -% R/add_ndisease.R, R/add_omop.R, R/add_onset.R, R/add_ont_lvl.R, -% R/add_pheno_frequency.R, R/add_prevalence.R, R/add_severity.R, R/add_tier.R +% R/add_gene_frequency.R, R/add_genes.R, R/add_gpt_annotations.R, +% R/add_hpo_definition.R, R/add_hpo_id.R, R/add_hpo_name.R, +% R/add_info_content.R, R/add_mondo.R, R/add_ndisease.R, R/add_omop.R, +% R/add_onset.R, R/add_ont_lvl.R, R/add_pheno_frequency.R, R/add_prevalence.R, +% R/add_severity.R, R/add_tier.R \name{add_} \alias{add_} \alias{add_ancestor} @@ -14,6 +15,7 @@ \alias{add_evidence} \alias{add_gene_frequency} \alias{add_genes} +\alias{add_gpt_annotations} \alias{add_hpo_definition} \alias{add_hpo_id} \alias{add_hpo_name} @@ -69,6 +71,7 @@ add_gene_frequency( phenotype_to_genes = load_phenotype_to_genes(), gene_frequency_threshold = NULL, all.x = TRUE, + allow.cartesian = FALSE, verbose = TRUE ) @@ -82,6 +85,14 @@ add_genes( allow.cartesian = FALSE ) +add_gpt_annotations( + phenos, + annot = gpt_annot_codify(reset_tiers_dict = TRUE)$annot_weighted, + annot_cols = names(annot)[!names(annot) \%in\% c("hpo_id", "hpo_name")], + gpt_filters = `names<-`(rep(list(NULL), length(annot_cols)), annot_cols), + force_new = FALSE +) + add_hpo_definition( phenos, line_length = FALSE, @@ -89,9 +100,9 @@ add_hpo_definition( verbose = TRUE ) -add_hpo_id(phenos, hpo = get_hpo(), phenotype_to_genes = NULL) +add_hpo_id(phenos, hpo = get_hpo()) -add_hpo_name(phenos, hpo = get_hpo(), phenotype_to_genes = NULL) +add_hpo_name(phenos, hpo = get_hpo()) add_info_content(phenos, hpo = get_hpo()) @@ -238,6 +249,12 @@ If \code{y} has no key columns, this defaults to the key of \code{x}.} \item{gene_col}{Name of the gene column.} +\item{annot}{GPT annotation data.} + +\item{annot_cols}{Columns to add.} + +\item{gpt_filters}{A named list of filters to apply to the GPT annotations.} + \item{line_length}{The number of desired words per line \} \item{use_api}{Get definitions from the HPO API, @@ -444,6 +461,11 @@ Add genes Add genes associated with each phenotype (in the context of a particular disease). +\item \code{add_gpt_annotations()}: add_ +Add ancestor + +Add annotations generated with a Large Language Model. + \item \code{add_hpo_definition()}: add_ Get term definition @@ -583,6 +605,8 @@ phenos2 <- add_gene_frequency(phenotype_to_genes = phenotype_to_genes) phenos <- example_phenos() phenos2 <- add_genes(phenos = phenos) phenos <- example_phenos() +phenos2 <- add_gpt_annotations(phenos) +phenos <- example_phenos() phenos2 <- add_hpo_definition(phenos = phenos) phenotype_to_genes <- load_phenotype_to_genes() phenos <- unique(phenotype_to_genes[,c("hpo_id","hpo_name")]) diff --git a/man/gpt_annot_codify.Rd b/man/gpt_annot_codify.Rd index a4005df..c2d72da 100644 --- a/man/gpt_annot_codify.Rd +++ b/man/gpt_annot_codify.Rd @@ -11,6 +11,7 @@ gpt_annot_codify( tiers_dict = list(intellectual_disability = 1, death = 1, impaired_mobility = 2, physical_malformations = 2, blindness = 3, sensory_impairments = 3, immunodeficiency = 3, cancer = 3, reduced_fertility = 4), + reset_tiers_dict = FALSE, keep_congenital_onset = head(names(code_dict), 4) ) } @@ -23,6 +24,9 @@ gpt_annot_codify( \item{tiers_dict}{Numerical encodings of annotation column.} +\item{reset_tiers_dict}{Override \code{tiers_dict} values and set all values +to 1. This will ensure that all annotations are unweighted.} + \item{keep_congenital_onset}{Which stages of congenital onset to keep.} } \value{ diff --git a/man/make_.Rd b/man/make_.Rd index 87e3b39..4c208cd 100644 --- a/man/make_.Rd +++ b/man/make_.Rd @@ -112,7 +112,7 @@ See \link[KGExplorer]{get_ontology_levels} for more details.} \item{interactive}{Make the plot interactive.} -\item{show_plot}{Print the plot after generating it.} +\item{show_plot}{Print the plot after it's been generated.} \item{hoverbox_column}{Name of the new hoverbox column to add.}