update to match new MultiEWCE

neurogenomics · Jan 24, 2024 · 59cf10c · 59cf10c
1 parent d4292e0
commit 59cf10c
Show file tree

Hide file tree

Showing 12 changed files with 93 additions and 35 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,6 +7,7 @@ export(add_disease_genes)
 export(add_evidence)
 export(add_gene_frequency)
 export(add_genes)
+export(add_gpt_annotations)
 export(add_hpo_definition)
 export(add_hpo_id)
 export(add_hpo_name)

diff --git a/R/0docs.R b/R/0docs.R
@@ -70,6 +70,7 @@ NULL
 #' @family add_
 #' @param agg_by Column to aggregate metadata by.
 #' @param add_definitions Add disease definitions using \link{add_mondo}.
+#' @param gpt_filters A named list of filters to apply to the GPT annotations.
 #' @inheritParams main
 #' @inheritParams make_
 #' @inheritParams get_

diff --git a/R/add_gene_frequency.R b/R/add_gene_frequency.R
@@ -21,15 +21,15 @@
 add_gene_frequency <- function(phenotype_to_genes = load_phenotype_to_genes(),
                                gene_frequency_threshold = NULL,
                                all.x = TRUE,
+                               allow.cartesian = FALSE,
                                verbose = TRUE){
 
   # devoptera::args2vars(add_gene_frequency)
   # annot <- HPOExplorer::load_phenotype_to_genes("phenotype.hpoa")
   frequency <- gene_freq_name <- gene_freq_mean <-
     gene_freq_min <- gene_freq_max <- . <- NULL;
 
-  phenotype_to_genes <- add_hpo_id(phenos = phenotype_to_genes,
-                                   phenotype_to_genes= phenotype_to_genes)
+  phenotype_to_genes <- add_hpo_id(phenos = phenotype_to_genes)
   new_cols <- c("gene_freq_name","gene_freq_min",
                 "gene_freq_max","gene_freq_mean")
   if(!all(new_cols %in% names(phenotype_to_genes))){
@@ -41,7 +41,8 @@ add_gene_frequency <- function(phenotype_to_genes = load_phenotype_to_genes(),
       x = phenotype_to_genes,
       y = g2p[,c("hpo_id","gene_symbol","frequency"),with=FALSE],
       by = c("hpo_id","gene_symbol"),
-      all.x = all.x)
+      all.x = all.x,
+      allow.cartesian = allow.cartesian)
     #### Parse freq data ####
     phenotype_to_genes[,gene_freq_name:=mapply(frequency,FUN=function(f){
       if(grepl("HP:",f)) get_freq_dict()[f] else f })]

diff --git a/R/add_genes.R b/R/add_genes.R
@@ -21,8 +21,6 @@ add_genes <- function(phenos = NULL,
                       gene_col = "gene_symbol",
                       all.x = FALSE,
                       allow.cartesian = FALSE){
-  # devoptera::args2vars(add_genes, reassign = TRUE)
-
   #### Prepare gene data ####
   phenotype_to_genes <- data.table::copy(phenotype_to_genes)
   data.table::setnames(phenotype_to_genes,"disease_id","disease_id",
@@ -45,7 +43,6 @@ add_genes <- function(phenos = NULL,
   }
   #### Ensure necessary columns are in phenos ####
   phenos <- add_hpo_id(phenos = phenos,
-                       phenotype_to_genes = phenotype_to_genes,
                        hpo = hpo)
   phenos <- add_disease(phenos = phenos,
                         allow.cartesian = allow.cartesian)

diff --git a/R/add_gpt_annotations.R b/R/add_gpt_annotations.R
@@ -0,0 +1,42 @@
+#' @describeIn add_ add_
+#' Add GPT annotations
+#'
+#' Add annotations generated with a Large Language Model.
+#' @param annot GPT annotation data.
+#' @param annot_cols Columns to add; those already in \code{phenos} are kept.
+#' @export
+#' @examples
+#' phenos <- example_phenos()
+#' phenos2 <- add_gpt_annotations(phenos)
+add_gpt_annotations <- function(phenos,
+                                annot = gpt_annot_codify(
+                                  reset_tiers_dict=TRUE
+                                  )$annot_weighted,
+                                annot_cols = names(annot)[
+                                  !names(annot) %in% c("hpo_id","hpo_name")
+                                  ],
+                                gpt_filters=rep(list(NULL),
+                                                length(annot_cols))|>
+                                  `names<-`(annot_cols),
+                                force_new = FALSE){
+  #### Force new columns ####
+  if(force_new){
+    messager("Force new. Removing existing annot columns.")
+    rm_cols <- annot_cols[annot_cols %in% names(phenos)]
+    if(length(rm_cols)>0) phenos[,(rm_cols):=NULL]
+  }
+  #### Merge only requested columns that are missing (no .x/.y dups) ####
+  new_cols <- setdiff(intersect(annot_cols,names(annot)), names(phenos))
+  if(length(new_cols)==0){
+    messager("GPT annotation columns already present. Skipping.")
+  }else {
+    phenos <- data.table::merge.data.table(
+      phenos, annot[,c("hpo_id",new_cols),with=FALSE],
+      by= "hpo_id", all.x = TRUE)
+  }
+  #### Filter ####
+  phenos <- KGExplorer::filter_dt(dat=phenos,
+                                  filters = gpt_filters)
+  #### Return #####
+  return(phenos)
+}
diff --git a/R/add_hpo_definition.R b/R/add_hpo_definition.R
@@ -13,8 +13,7 @@
 #' @returns A named vector of HPO term descriptions.
 #'
 #' @export
-#' @importFrom stats setNames
-#' @importFrom data.table :=
+#' @import data.table
 #' @examples
 #' phenos <- example_phenos()
 #' phenos2 <- add_hpo_definition(phenos = phenos)

diff --git a/R/add_hpo_id.R b/R/add_hpo_id.R
@@ -8,25 +8,12 @@
 #' phenos <- unique(phenotype_to_genes[,c("hpo_id","hpo_name")])
 #' phenos2 <- add_hpo_id(phenos=phenos)
 add_hpo_id <- function(phenos,
-                       hpo = get_hpo(),
-                       phenotype_to_genes = NULL) {
-  HPO_term_valid <- hpo_id <- NULL;
-
+                       hpo = get_hpo()) {
   if(!"hpo_id" %in% names(phenos)){
     messager("Adding HPO IDs.")
-    alt_names <- grep("hpo_id","^id$",names(phenos),
-                      value=TRUE, ignore.case = TRUE)
-    if(length(alt_names)>0){
-      data.table::setnames(phenos,alt_names[[1]],"hpo_id")
-      return(phenos)
-    } else {
-      if(is.null(phenotype_to_genes)) {
-        phenotype_to_genes <- load_phenotype_to_genes()
-      }
-      phenos <- fix_hpo_ids(dat=phenos,
-                            phenotype_to_genes=phenotype_to_genes)
-    }
-    phenos[,HPO_term_valid:=(hpo_id %in% hpo@terms)]
+    phenos$hpo_id <- map_phenotypes(hpo = hpo,
+                                    terms = phenos$hpo_name,
+                                    to = "id")
   }
   return(phenos)
 }

diff --git a/R/add_hpo_name.R b/R/add_hpo_name.R
@@ -7,9 +7,7 @@
 #' phenos <- example_phenos()
 #' phenos2 <- add_hpo_name(phenos=phenos)
 add_hpo_name <- function(phenos,
-                         hpo = get_hpo(),
-                         phenotype_to_genes = NULL) {
-
+                         hpo = get_hpo()) {
   if(!"hpo_name" %in% names(phenos)){
     messager("Adding HPO names.")
     phenos <- add_hpo_id(phenos)

diff --git a/R/gpt_annot_codify.R b/R/gpt_annot_codify.R
@@ -5,6 +5,8 @@
 #' @param code_dict Numerical encodings of annotation values.
 #' @param tiers_dict Numerical encodings of annotation column.
 #' @param keep_congenital_onset Which stages of congenital onset to keep.
+#' @param reset_tiers_dict Override \code{tiers_dict} values and set all values
+#' to 1. This will ensure that all annotations are unweighted.
 #' @inheritParams gpt_annot_check
 #' @returns Named list
 #'
@@ -34,13 +36,15 @@ gpt_annot_codify <- function(annot = gpt_annot_read(),
                                cancer=3,
                                reduced_fertility=4
                              ),
+                             reset_tiers_dict=FALSE,
                              keep_congenital_onset=head(names(code_dict),4)
                              ){
   # res <- gpt_annot_check(path="~/Downloads/gpt_hpo_annotations.csv")
   # annot <- res$annot
   severity_score_gpt <- congenital_onset <- hpo_name <- hpo_id <- NULL;
 
   d <- data.table::copy(annot)
+  if(isTRUE(reset_tiers_dict)) tiers_dict <- lapply(tiers_dict,function(x){1})
   #### Ensure only 1 row/hpo_name by simply taking the first ####
   if(isTRUE(remove_duplicates)){
     d <- d[,utils::head(.SD,1), by=c("hpo_id","hpo_name")]

diff --git a/man/add_.Rd b/man/add_.Rd
diff --git a/man/gpt_annot_codify.Rd b/man/gpt_annot_codify.Rd
diff --git a/man/make_.Rd b/man/make_.Rd