update gpt funcs

neurogenomics · Mar 31, 2024 · dc1bbf3 · dc1bbf3
1 parent 02be201
commit dc1bbf3
Show file tree

Hide file tree

Showing 7 changed files with 46 additions and 31 deletions.
diff --git a/R/add_gpt_annotations.R b/R/add_gpt_annotations.R
@@ -29,8 +29,11 @@ add_gpt_annotations <- function(phenos,
   if(all(annot_cols %in% names(phenos))){
     messager("Ancestor columns already present. Skipping.")
   }else {
+    ## With the latest GPT annotations (2024-03-15), merging on "hpo_id"
+    ## yields more annotated results (10724) than merging on "hpo_name" (10678),
+    ## so merge on "hpo_id", keeping only the first annotation row per ID.
     phenos <- data.table::merge.data.table(phenos,
-                                           annot[,-c("hpo_name")],
+                                           annot[,-c("hpo_name")][,.SD[1], by="hpo_id"],
                                            by= "hpo_id",
                                            all.x = TRUE)
   }

diff --git a/R/gpt_annot_codify.R b/R/gpt_annot_codify.R
@@ -21,21 +21,20 @@ gpt_annot_codify <- function(annot = gpt_annot_read(),
                              code_dict = c(
                                "never"=0,
                                "rarely"=1,
-                               "varies"=2,
-                               "often"=3,
-                               "always"=4
+                               "often"=2,
+                               "always"=3
                              ),
                              tiers_dict=list(
-                               intellectual_disability=1,
-                               death=1,
-                               impaired_mobility=2,
-                               physical_malformations=2,
-                               blindness=3,
+                               intellectual_disability=5,
+                               death=5,
+                               impaired_mobility=4,
+                               physical_malformations=3,
+                               blindness=4,
                                sensory_impairments=3,
                                immunodeficiency=3,
                                cancer=3,
-                               reduced_fertility=4,
-                               congenital_onset=1
+                               reduced_fertility=1,
+                               congenital_onset=4
                              ),
                              reset_tiers_dict=FALSE,
                              filters=list()
@@ -62,15 +61,14 @@ gpt_annot_codify <- function(annot = gpt_annot_read(),
   max_score <-
     sum(
       max(code_dict, na.rm = TRUE) *
-        (max(unlist(tiers_dict))+1) - unlist(tiers_dict)
+      (max(unlist(tiers_dict))*length(tiers_dict))
     )
   d_coded <- d[,lapply(.SD,FUN=function(x){
     unlist(code_dict[tolower(x)])}),.SDcols = cols, by=c("hpo_id","hpo_name")]
   d_weighted <- data.table::as.data.table(
     lapply(stats::setNames(cols,cols),
            function(co){
-           d_coded[[co]]*
-               ((max(unlist(tiers_dict))+1)-tiers_dict[[co]])
+           d_coded[[co]]*tiers_dict[[co]]
              })
   )[,hpo_name:=d_coded$hpo_name][,severity_score_gpt:=(
     rowSums(.SD,na.rm = TRUE)/max_score*100),

diff --git a/R/gpt_annot_read.R b/R/gpt_annot_read.R
@@ -15,15 +15,12 @@
 #' @export
 #' @examples
 #' gpt_annot <- gpt_annot_read()
-gpt_annot_read <- function(save_path=file.path(
-  KGExplorer::cache_dir(package = "HPOExplorer"),
-  "gpt4_hpo_annotations.csv"
-),
-
-phenotype_to_genes = load_phenotype_to_genes(),
+gpt_annot_read <- function(save_path=file.path(KGExplorer::cache_dir(package="HPOExplorer"),
+                                               "gpt4_hpo_annotations.csv"),
+                           phenotype_to_genes = load_phenotype_to_genes(),
                            force_new=FALSE,
-hpo=get_hpo(),
-include_nogenes=TRUE,
+                           hpo=get_hpo(),
+                           include_nogenes=TRUE,
                            verbose=TRUE){
   pheno_count <- hpo_name <- hpo_id <- phenotype <- NULL;
 
@@ -35,10 +32,19 @@ include_nogenes=TRUE,
     utils::download.file(path, save_path)
     # path <- get_data("gpt4_hpo_annotations.csv")
   }
-  d <- data.table::fread(save_path, header = TRUE)
-  d <- d[!is.na(phenotype)]
-  data.table::setnames(d,"phenotype","hpo_name")
-  d <- add_hpo_id(d, hpo = hpo)
+  {
+    d <- data.table::fread(save_path, header = TRUE)
+    d <- d[!is.na(phenotype)]
+    data.table::setnames(d,"phenotype","hpo_name")
+    d <- add_hpo_id(d, hpo = hpo)
+  }
+  {
+    #### Add subset with fixed hpo_names ####
+    # https://github.com/neurogenomics/RareDiseasePrioritisation/issues/31#issuecomment-1989079044
+    fixmap <- data.table::fread("https://github.com/neurogenomics/RareDiseasePrioritisation/files/14562614/mismatched_hpo_names_fixed.csv")
+    d <- rbind(d[!hpo_name %in% unique(fixmap$hpo_name)],
+               fixmap, fill=TRUE)
+  }
   #### Check phenotype names ####
   d <- merge(d,
              unique(phenotype_to_genes[,c("hpo_id","hpo_name")]),

diff --git a/R/map_phenotypes.R b/R/map_phenotypes.R
@@ -21,11 +21,15 @@ map_phenotypes <- function(terms,
                            to=c("name","id"),
                            keep_order = TRUE,
                            ignore_case = TRUE,
+                           ignore_char = eval(formals(
+                               KGExplorer::map_ontology_terms
+                               )$ignore_char),
                            invert = FALSE){
   KGExplorer::map_ontology_terms(terms = terms,
                                  ont = hpo,
                                  to = to,
                                  keep_order = keep_order,
                                  ignore_case = ignore_case,
+                                 ignore_char = ignore_char,
                                  invert = invert)
 }
diff --git a/R/search_hpo.R b/R/search_hpo.R
@@ -60,7 +60,7 @@ search_hpo <- function(hpo = get_hpo(),
     return(unique(res))
   })
   hit_counts <- lapply(query_hits, length)
-  messager("Number of phenotype gits per query group:")
+  messager("Number of phenotype hits per query group:")
   messager(paste(paste(" -",names(hit_counts)),hit_counts,
                  collapse = "\n",sep=": "),v=verbose)
   return(query_hits)

diff --git a/man/gpt_annot_codify.Rd b/man/gpt_annot_codify.Rd
diff --git a/man/map_phenotypes.Rd b/man/map_phenotypes.Rd