From 59cf10c127a375df8170d823475a5edcf32cd30f Mon Sep 17 00:00:00 2001
From: "Brian M. Schilder" <34280215+bschilder@users.noreply.github.com>
Date: Wed, 24 Jan 2024 02:01:20 +0000
Subject: [PATCH] update to match new MultiEWCE

---
 NAMESPACE               |  1 +
 R/0docs.R               |  1 +
 R/add_gene_frequency.R  |  7 ++++---
 R/add_genes.R           |  3 ---
 R/add_gpt_annotations.R | 42 +++++++++++++++++++++++++++++++++++++++++
 R/add_hpo_definition.R  |  3 +--
 R/add_hpo_id.R          | 21 ++++-----------------
 R/add_hpo_name.R        |  4 +---
 R/gpt_annot_codify.R    |  4 ++++
 man/add_.Rd             | 36 +++++++++++++++++++++++++++++------
 man/gpt_annot_codify.Rd |  4 ++++
 man/make_.Rd            |  2 +-
 12 files changed, 93 insertions(+), 35 deletions(-)
 create mode 100644 R/add_gpt_annotations.R

diff --git a/NAMESPACE b/NAMESPACE
index 8791576..37e14a5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -7,6 +7,7 @@ export(add_disease_genes)
 export(add_evidence)
 export(add_gene_frequency)
 export(add_genes)
+export(add_gpt_annotations)
 export(add_hpo_definition)
 export(add_hpo_id)
 export(add_hpo_name)
diff --git a/R/0docs.R b/R/0docs.R
index 19dda00..e66ca40 100644
--- a/R/0docs.R
+++ b/R/0docs.R
@@ -70,6 +70,7 @@ NULL
 #' @family add_
 #' @param agg_by Column to aggregate metadata by.
 #' @param add_definitions Add disease definitions using \link{add_mondo}.
+#' @param gpt_filters A named list of filters to apply to the GPT annotations.
 #' @inheritParams main
 #' @inheritParams make_
 #' @inheritParams get_
diff --git a/R/add_gene_frequency.R b/R/add_gene_frequency.R
index 16c1dd0..4ffc0aa 100644
--- a/R/add_gene_frequency.R
+++ b/R/add_gene_frequency.R
@@ -21,6 +21,7 @@
 add_gene_frequency <- function(phenotype_to_genes = load_phenotype_to_genes(),
                                gene_frequency_threshold = NULL,
                                all.x = TRUE,
+                               allow.cartesian = FALSE,
                                verbose = TRUE){
 
   # devoptera::args2vars(add_gene_frequency)
@@ -28,8 +29,7 @@ add_gene_frequency <- function(phenotype_to_genes = load_phenotype_to_genes(),
   frequency <- gene_freq_name <- gene_freq_mean <-
     gene_freq_min <- gene_freq_max <- . <- NULL;
 
-  phenotype_to_genes <- add_hpo_id(phenos = phenotype_to_genes,
-                                   phenotype_to_genes= phenotype_to_genes)
+  phenotype_to_genes <- add_hpo_id(phenos = phenotype_to_genes)
   new_cols <- c("gene_freq_name","gene_freq_min",
                 "gene_freq_max","gene_freq_mean")
   if(!all(new_cols %in% names(phenotype_to_genes))){
@@ -41,7 +41,8 @@ add_gene_frequency <- function(phenotype_to_genes = load_phenotype_to_genes(),
       x = phenotype_to_genes,
       y = g2p[,c("hpo_id","gene_symbol","frequency"),with=FALSE],
       by = c("hpo_id","gene_symbol"),
-      all.x = all.x)
+      all.x = all.x,
+      allow.cartesian = allow.cartesian)
     #### Parse freq data ####
     phenotype_to_genes[,gene_freq_name:=mapply(frequency,FUN=function(f){
       if(grepl("HP:",f)) get_freq_dict()[f] else f })]
diff --git a/R/add_genes.R b/R/add_genes.R
index f84636b..ba2159d 100644
--- a/R/add_genes.R
+++ b/R/add_genes.R
@@ -21,8 +21,6 @@ add_genes <- function(phenos = NULL,
                       gene_col = "gene_symbol",
                       all.x = FALSE,
                       allow.cartesian = FALSE){
-  # devoptera::args2vars(add_genes, reassign = TRUE)
-
   #### Prepare gene data ####
   phenotype_to_genes <- data.table::copy(phenotype_to_genes)
   data.table::setnames(phenotype_to_genes,"disease_id","disease_id",
@@ -45,7 +43,6 @@ add_genes <- function(phenos = NULL,
   }
   #### Ensure necessary columns are in phenos ####
   phenos <- add_hpo_id(phenos = phenos,
-                       phenotype_to_genes = phenotype_to_genes,
                        hpo = hpo)
   phenos <- add_disease(phenos = phenos,
                         allow.cartesian = allow.cartesian)
diff --git a/R/add_gpt_annotations.R b/R/add_gpt_annotations.R
new file mode 100644
index 0000000..4003631
--- /dev/null
+++ b/R/add_gpt_annotations.R
@@ -0,0 +1,42 @@
+#' @describeIn add_ add_
+#' Add GPT annotations
+#'
+#' Add annotations generated with a Large Language Model.
+#' @param annot GPT annotation data.
+#' @param annot_cols Columns to add.
+#' @export
+#' @examples
+#' phenos <- example_phenos()
+#' phenos2 <- add_gpt_annotations(phenos)
+add_gpt_annotations <- function(phenos,
+                                annot = gpt_annot_codify(
+                                  reset_tiers_dict=TRUE
+                                  )$annot_weighted,
+                                annot_cols = names(annot)[
+                                  !names(annot) %in% c("hpo_id","hpo_name")
+                                  ],
+                                gpt_filters=rep(list(NULL),
+                                                length(annot_cols))|>
+                                  `names<-`(annot_cols),
+                                force_new = FALSE){
+  #### Ensure the merge key exists (no-op when hpo_id is present) ####
+  phenos <- add_hpo_id(phenos = phenos)
+  #### Force new columns (subset, so the caller's table is not modified) ####
+  if(isTRUE(force_new)){
+    messager("Force new. Removing existing annot columns.")
+    rm_cols <- setdiff(intersect(annot_cols, names(phenos)), "hpo_id")
+    if(length(rm_cols)>0) phenos <- phenos[,!rm_cols,with=FALSE]
+  }
+  #### Merge only requested columns that are missing (avoids .x/.y) ####
+  new_cols <- setdiff(intersect(annot_cols, names(annot)), names(phenos))
+  if(length(new_cols)==0){
+    messager("GPT annotation columns already present. Skipping.")
+  }else {
+    phenos <- data.table::merge.data.table(
+      phenos, annot[,c("hpo_id",new_cols),with=FALSE],
+      by = "hpo_id", all.x = TRUE)
+  }
+  #### Filter and return ####
+  phenos <- KGExplorer::filter_dt(dat=phenos, filters = gpt_filters)
+  return(phenos)
+}
diff --git a/R/add_hpo_definition.R b/R/add_hpo_definition.R
index 2137d25..20d32ff 100644
--- a/R/add_hpo_definition.R
+++ b/R/add_hpo_definition.R
@@ -13,8 +13,7 @@
 #' @returns A named vector of HPO term descriptions.
 #'
 #' @export
-#' @importFrom stats setNames
-#' @importFrom data.table :=
+#' @import data.table
 #' @examples
 #' phenos <- example_phenos()
 #' phenos2 <- add_hpo_definition(phenos = phenos)
diff --git a/R/add_hpo_id.R b/R/add_hpo_id.R
index c8877b5..ccf2739 100644
--- a/R/add_hpo_id.R
+++ b/R/add_hpo_id.R
@@ -8,25 +8,12 @@
 #' phenos <- unique(phenotype_to_genes[,c("hpo_id","hpo_name")])
 #' phenos2 <- add_hpo_id(phenos=phenos)
 add_hpo_id <- function(phenos,
-                       hpo = get_hpo(),
-                       phenotype_to_genes = NULL) {
-  HPO_term_valid <- hpo_id <- NULL;
-
+                       hpo = get_hpo()) {
   if(!"hpo_id" %in% names(phenos)){
     messager("Adding HPO IDs.")
-    alt_names <- grep("hpo_id","^id$",names(phenos),
-                      value=TRUE, ignore.case = TRUE)
-    if(length(alt_names)>0){
-      data.table::setnames(phenos,alt_names[[1]],"hpo_id")
-      return(phenos)
-    } else {
-      if(is.null(phenotype_to_genes)) {
-        phenotype_to_genes <- load_phenotype_to_genes()
-      }
-      phenos <- fix_hpo_ids(dat=phenos,
-                            phenotype_to_genes=phenotype_to_genes)
-    }
-    phenos[,HPO_term_valid:=(hpo_id %in% hpo@terms)]
+    phenos$hpo_id <- map_phenotypes(hpo = hpo,
+                                    terms = phenos$hpo_name,
+                                    to = "id")
   }
   return(phenos)
 }
diff --git a/R/add_hpo_name.R b/R/add_hpo_name.R
index d54e619..67a2e5e 100644
--- a/R/add_hpo_name.R
+++ b/R/add_hpo_name.R
@@ -7,9 +7,7 @@
 #' phenos <- example_phenos()
 #' phenos2 <- add_hpo_name(phenos=phenos)
 add_hpo_name <- function(phenos,
-                         hpo = get_hpo(),
-                         phenotype_to_genes = NULL) {
-
+                         hpo = get_hpo()) {
   if(!"hpo_name" %in% names(phenos)){
     messager("Adding HPO names.")
     phenos <- add_hpo_id(phenos)
diff --git a/R/gpt_annot_codify.R b/R/gpt_annot_codify.R
index 8ddf8f0..ae3c458 100644
--- a/R/gpt_annot_codify.R
+++ b/R/gpt_annot_codify.R
@@ -5,6 +5,8 @@
 #' @param code_dict Numerical encodings of annotation values.
 #' @param tiers_dict Numerical encodings of annotation column.
 #' @param keep_congenital_onset Which stages of congenital onset to keep.
+#' @param reset_tiers_dict Override \code{tiers_dict} values and set all values
+#' to 1. This will ensure that all annotations are unweighted.
 #' @inheritParams gpt_annot_check
 #' @returns Named list
 #'
@@ -34,6 +36,7 @@ gpt_annot_codify <- function(annot = gpt_annot_read(),
                                cancer=3,
                                reduced_fertility=4
                              ),
+                             reset_tiers_dict=FALSE,
                              keep_congenital_onset=head(names(code_dict),4)
                              ){
   # res <- gpt_annot_check(path="~/Downloads/gpt_hpo_annotations.csv")
@@ -41,6 +44,7 @@ gpt_annot_codify <- function(annot = gpt_annot_read(),
   severity_score_gpt <- congenital_onset <- hpo_name <- hpo_id <- NULL;
 
   d <- data.table::copy(annot)
+  if(isTRUE(reset_tiers_dict)) tiers_dict <- lapply(tiers_dict,function(x){1})
   #### Ensure only 1 row/hpo_name by simply taking the first ####
   if(isTRUE(remove_duplicates)){
     d <- d[,utils::head(.SD,1), by=c("hpo_id","hpo_name")]
diff --git a/man/add_.Rd b/man/add_.Rd
index 7bd149d..1a3c293 100644
--- a/man/add_.Rd
+++ b/man/add_.Rd
@@ -1,10 +1,11 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/0docs.R, R/add_ancestor.R, R/add_death.R,
 %   R/add_disease.R, R/add_disease_genes.R, R/add_evidence.R,
-%   R/add_gene_frequency.R, R/add_genes.R, R/add_hpo_definition.R,
-%   R/add_hpo_id.R, R/add_hpo_name.R, R/add_info_content.R, R/add_mondo.R,
-%   R/add_ndisease.R, R/add_omop.R, R/add_onset.R, R/add_ont_lvl.R,
-%   R/add_pheno_frequency.R, R/add_prevalence.R, R/add_severity.R, R/add_tier.R
+%   R/add_gene_frequency.R, R/add_genes.R, R/add_gpt_annotations.R,
+%   R/add_hpo_definition.R, R/add_hpo_id.R, R/add_hpo_name.R,
+%   R/add_info_content.R, R/add_mondo.R, R/add_ndisease.R, R/add_omop.R,
+%   R/add_onset.R, R/add_ont_lvl.R, R/add_pheno_frequency.R, R/add_prevalence.R,
+%   R/add_severity.R, R/add_tier.R
 \name{add_}
 \alias{add_}
 \alias{add_ancestor}
@@ -14,6 +15,7 @@
 \alias{add_evidence}
 \alias{add_gene_frequency}
 \alias{add_genes}
+\alias{add_gpt_annotations}
 \alias{add_hpo_definition}
 \alias{add_hpo_id}
 \alias{add_hpo_name}
@@ -69,6 +71,7 @@ add_gene_frequency(
   phenotype_to_genes = load_phenotype_to_genes(),
   gene_frequency_threshold = NULL,
   all.x = TRUE,
+  allow.cartesian = FALSE,
   verbose = TRUE
 )
 
@@ -82,6 +85,14 @@ add_genes(
   allow.cartesian = FALSE
 )
 
+add_gpt_annotations(
+  phenos,
+  annot = gpt_annot_codify(reset_tiers_dict = TRUE)$annot_weighted,
+  annot_cols = names(annot)[!names(annot) \%in\% c("hpo_id", "hpo_name")],
+  gpt_filters = `names<-`(rep(list(NULL), length(annot_cols)), annot_cols),
+  force_new = FALSE
+)
+
 add_hpo_definition(
   phenos,
   line_length = FALSE,
@@ -89,9 +100,9 @@ add_hpo_definition(
   verbose = TRUE
 )
 
-add_hpo_id(phenos, hpo = get_hpo(), phenotype_to_genes = NULL)
+add_hpo_id(phenos, hpo = get_hpo())
 
-add_hpo_name(phenos, hpo = get_hpo(), phenotype_to_genes = NULL)
+add_hpo_name(phenos, hpo = get_hpo())
 
 add_info_content(phenos, hpo = get_hpo())
 
@@ -238,6 +249,12 @@ If \code{y} has no key columns, this defaults to the key of \code{x}.}
 
 \item{gene_col}{Name of the gene column.}
 
+\item{annot}{GPT annotation data.}
+
+\item{annot_cols}{Columns to add.}
+
+\item{gpt_filters}{A named list of filters to apply to the GPT annotations.}
+
 \item{line_length}{The number of desired words per line \<int\>}
 
 \item{use_api}{Get definitions from the HPO API,
@@ -444,6 +461,11 @@ Add genes
 Add genes associated with each phenotype
 (in the context of a particular disease).
 
+\item \code{add_gpt_annotations()}: add_
+Add GPT annotations
+
+Add annotations generated with a Large Language Model.
+
 \item \code{add_hpo_definition()}: add_
 Get term definition
 
@@ -583,6 +605,8 @@ phenos2 <- add_gene_frequency(phenotype_to_genes = phenotype_to_genes)
 phenos <- example_phenos()
 phenos2 <- add_genes(phenos = phenos)
 phenos <- example_phenos()
+phenos2 <- add_gpt_annotations(phenos)
+phenos <- example_phenos()
 phenos2 <- add_hpo_definition(phenos = phenos)
 phenotype_to_genes <- load_phenotype_to_genes()
 phenos <- unique(phenotype_to_genes[,c("hpo_id","hpo_name")])
diff --git a/man/gpt_annot_codify.Rd b/man/gpt_annot_codify.Rd
index a4005df..c2d72da 100644
--- a/man/gpt_annot_codify.Rd
+++ b/man/gpt_annot_codify.Rd
@@ -11,6 +11,7 @@ gpt_annot_codify(
   tiers_dict = list(intellectual_disability = 1, death = 1, impaired_mobility = 2,
     physical_malformations = 2, blindness = 3, sensory_impairments = 3, immunodeficiency
     = 3, cancer = 3, reduced_fertility = 4),
+  reset_tiers_dict = FALSE,
   keep_congenital_onset = head(names(code_dict), 4)
 )
 }
@@ -23,6 +24,9 @@ gpt_annot_codify(
 
 \item{tiers_dict}{Numerical encodings of annotation column.}
 
+\item{reset_tiers_dict}{Override \code{tiers_dict} values and set all values
+to 1. This will ensure that all annotations are unweighted.}
+
 \item{keep_congenital_onset}{Which stages of congenital onset to keep.}
 }
 \value{
diff --git a/man/make_.Rd b/man/make_.Rd
index 87e3b39..4c208cd 100644
--- a/man/make_.Rd
+++ b/man/make_.Rd
@@ -112,7 +112,7 @@ See \link[KGExplorer]{get_ontology_levels} for more details.}
 
 \item{interactive}{Make the plot interactive.}
 
-\item{show_plot}{Print the plot after generating it.}
+\item{show_plot}{Print the plot after it's been generated.}
 
 \item{hoverbox_column}{Name of the new hoverbox column to add.}