diff --git a/R/add_gpt_annotations.R b/R/add_gpt_annotations.R index 4003631..01bebda 100644 --- a/R/add_gpt_annotations.R +++ b/R/add_gpt_annotations.R @@ -29,8 +29,11 @@ add_gpt_annotations <- function(phenos, if(all(annot_cols %in% names(phenos))){ messager("Ancestor columns already present. Skipping.") }else { + ## According to the latest GPT annotations (3-15-2024), + ## merging on "hpo_id" yields more annotated results (10724) + ## than merging on "hpo_name" (10678). phenos <- data.table::merge.data.table(phenos, - annot[,-c("hpo_name")], + annot[,-c("hpo_name")][,.SD[1], by="hpo_id"], by= "hpo_id", all.x = TRUE) } diff --git a/R/gpt_annot_codify.R b/R/gpt_annot_codify.R index 546a551..3793503 100644 --- a/R/gpt_annot_codify.R +++ b/R/gpt_annot_codify.R @@ -21,21 +21,20 @@ gpt_annot_codify <- function(annot = gpt_annot_read(), code_dict = c( "never"=0, "rarely"=1, - "varies"=2, - "often"=3, - "always"=4 + "often"=2, + "always"=3 ), tiers_dict=list( - intellectual_disability=1, - death=1, - impaired_mobility=2, - physical_malformations=2, - blindness=3, + intellectual_disability=5, + death=5, + impaired_mobility=4, + physical_malformations=3, + blindness=4, sensory_impairments=3, immunodeficiency=3, cancer=3, - reduced_fertility=4, - congenital_onset=1 + reduced_fertility=1, + congenital_onset=4 ), reset_tiers_dict=FALSE, filters=list() @@ -62,15 +61,14 @@ gpt_annot_codify <- function(annot = gpt_annot_read(), max_score <- sum( max(code_dict, na.rm = TRUE) * - (max(unlist(tiers_dict))+1) - unlist(tiers_dict) + (max(unlist(tiers_dict))*length(tiers_dict)) ) d_coded <- d[,lapply(.SD,FUN=function(x){ unlist(code_dict[tolower(x)])}),.SDcols = cols, by=c("hpo_id","hpo_name")] d_weighted <- data.table::as.data.table( lapply(stats::setNames(cols,cols), function(co){ - d_coded[[co]]* - ((max(unlist(tiers_dict))+1)-tiers_dict[[co]]) + d_coded[[co]]*tiers_dict[[co]] }) )[,hpo_name:=d_coded$hpo_name][,severity_score_gpt:=( rowSums(.SD,na.rm = TRUE)/max_score*100), diff --git a/R/gpt_annot_read.R b/R/gpt_annot_read.R index 5ed8c33..b20f56d 100644 --- a/R/gpt_annot_read.R +++ b/R/gpt_annot_read.R @@ -15,15 +15,12 @@ #' @export #' @examples #' gpt_annot <- gpt_annot_read() -gpt_annot_read <- function(save_path=file.path( - KGExplorer::cache_dir(package = "HPOExplorer"), - "gpt4_hpo_annotations.csv" -), - -phenotype_to_genes = load_phenotype_to_genes(), +gpt_annot_read <- function(save_path=file.path(KGExplorer::cache_dir(package="HPOExplorer"), + "gpt4_hpo_annotations.csv"), + phenotype_to_genes = load_phenotype_to_genes(), force_new=FALSE, -hpo=get_hpo(), -include_nogenes=TRUE, + hpo=get_hpo(), + include_nogenes=TRUE, verbose=TRUE){ pheno_count <- hpo_name <- hpo_id <- phenotype <- NULL; @@ -35,10 +32,19 @@ include_nogenes=TRUE, utils::download.file(path, save_path) # path <- get_data("gpt4_hpo_annotations.csv") } - d <- data.table::fread(save_path, header = TRUE) - d <- d[!is.na(phenotype)] - data.table::setnames(d,"phenotype","hpo_name") - d <- add_hpo_id(d, hpo = hpo) + { + d <- data.table::fread(save_path, header = TRUE) + d <- d[!is.na(phenotype)] + data.table::setnames(d,"phenotype","hpo_name") + d <- add_hpo_id(d, hpo = hpo) + } + { + #### Add subset with fixed hpo_names #### + # https://github.com/neurogenomics/RareDiseasePrioritisation/issues/31#issuecomment-1989079044 + fixmap <- data.table::fread("https://github.com/neurogenomics/RareDiseasePrioritisation/files/14562614/mismatched_hpo_names_fixed.csv") + d <- rbind(d[!hpo_name %in% unique(fixmap$hpo_name)], + fixmap, fill=TRUE) + } #### Check phenotype names #### d <- merge(d, unique(phenotype_to_genes[,c("hpo_id","hpo_name")]), diff --git a/R/map_phenotypes.R b/R/map_phenotypes.R index 9c394a7..66cd0b2 100644 --- a/R/map_phenotypes.R +++ b/R/map_phenotypes.R @@ -21,11 +21,15 @@ map_phenotypes <- function(terms, to=c("name","id"), keep_order = TRUE, ignore_case = TRUE, + ignore_char = eval(formals( + KGExplorer::map_ontology_terms + )$ignore_char), invert = FALSE){ KGExplorer::map_ontology_terms(terms = terms, ont = hpo, to = to, keep_order = keep_order, ignore_case = ignore_case, + ignore_char = ignore_char, invert = invert) } diff --git a/R/search_hpo.R b/R/search_hpo.R index 52aaa64..7378997 100644 --- a/R/search_hpo.R +++ b/R/search_hpo.R @@ -60,7 +60,7 @@ search_hpo <- function(hpo = get_hpo(), return(unique(res)) }) hit_counts <- lapply(query_hits, length) - messager("Number of phenotype gits per query group:") + messager("Number of phenotype hits per query group:") messager(paste(paste(" -",names(hit_counts)),hit_counts, collapse = "\n",sep=": "),v=verbose) return(query_hits) diff --git a/man/gpt_annot_codify.Rd b/man/gpt_annot_codify.Rd index e84fbcb..ccfecad 100644 --- a/man/gpt_annot_codify.Rd +++ b/man/gpt_annot_codify.Rd @@ -7,10 +7,10 @@ gpt_annot_codify( annot = gpt_annot_read(), remove_duplicates = TRUE, - code_dict = c(never = 0, rarely = 1, varies = 2, often = 3, always = 4), - tiers_dict = list(intellectual_disability = 1, death = 1, impaired_mobility = 2, - physical_malformations = 2, blindness = 3, sensory_impairments = 3, immunodeficiency - = 3, cancer = 3, reduced_fertility = 4, congenital_onset = 1), + code_dict = c(never = 0, rarely = 1, often = 2, always = 3), + tiers_dict = list(intellectual_disability = 5, death = 5, impaired_mobility = 4, + physical_malformations = 3, blindness = 4, sensory_impairments = 3, immunodeficiency + = 3, cancer = 3, reduced_fertility = 1, congenital_onset = 4), reset_tiers_dict = FALSE, filters = list() ) diff --git a/man/map_phenotypes.Rd b/man/map_phenotypes.Rd index 92ad473..ec90bd7 100644 --- a/man/map_phenotypes.Rd +++ b/man/map_phenotypes.Rd @@ -10,6 +10,7 @@ map_phenotypes( to = c("name", "id"), keep_order = TRUE, ignore_case = TRUE, + ignore_char = eval(formals(KGExplorer::map_ontology_terms)$ignore_char), invert = FALSE ) } @@ -31,6 +32,9 @@ sometimes in a different order.} \item{ignore_case}{Ignore case when mapping terms.} +\item{ignore_char}{A character vector of characters to ignore when +mapping terms.} + \item{invert}{Invert the keys/values of the dictionary, such that the key becomes the values (and vice versa).} }