From 7907a3a27d68f09f6edb1be74f80093e38941614 Mon Sep 17 00:00:00 2001
From: "Brian M. Schilder" <34280215+bschilder@users.noreply.github.com>
Date: Wed, 22 May 2024 13:44:27 +0100
Subject: [PATCH] Switch to using piggyback to distribute GPT annotations

---
 R/gpt_annot_read.R    | 36 ++++++++++++++++++------------------
 man/gpt_annot_read.Rd | 24 ++++++++++++++++++------
 2 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/R/gpt_annot_read.R b/R/gpt_annot_read.R
index 1eb34af..96badd6 100644
--- a/R/gpt_annot_read.R
+++ b/R/gpt_annot_read.R
@@ -4,44 +4,44 @@
 #'  do some initial preprocessing (e.g. adding HPO IDs).
 #' @inheritParams main
 #' @inheritParams make_
-#' @param save_path Path to annotations CSV file.
-#'  If the file does not exist, the data will be downloaded from GitHub.
+#' @inheritParams get_
 #' @param force_new If \code{TRUE}, the data will be downloaded from GitHub
 #' even if it already exists locally.
 #' @param verbose Print messages.
 #' @param include_nogenes Include phenotypes with no associated genes.
+#' @inheritDotParams get_data -file -overwrite
 #' @returns data.table of phenotype annotations
 #'
 #' @export
 #' @examples
 #' gpt_annot <- gpt_annot_read()
-gpt_annot_read <- function(save_path=file.path(KGExplorer::cache_dir(package="HPOExplorer"),
-                                               "gpt4_hpo_annotations.csv"),
-                           phenotype_to_genes = load_phenotype_to_genes(),
+gpt_annot_read <- function(save_dir=KGExplorer::cache_dir(package="HPOExplorer"),
+                           phenotype_to_genes = load_phenotype_to_genes(save_dir = save_dir),
                            force_new=FALSE,
                            hpo=get_hpo(),
                            include_nogenes=TRUE,
-                           verbose=TRUE){
-  pheno_count <- hpo_name <- hpo_id <- phenotype <- NULL;
+                           verbose=TRUE,
+                           ...){
+  pheno_count <- hpo_name <- hpo_id <- NULL;
 
-  if(!file.exists(save_path) || isTRUE(force_new)){
-    path <- paste0(
-      "https://github.com/neurogenomics/gpt_hpo_annotations/raw/master/",
-      "data/gpt4_hpo_annotations.csv"
-    )
-    utils::download.file(path, save_path)
-    # path <- get_data("gpt4_hpo_annotations.csv")
-  }
+  save_path <- get_data(file = "gpt4_hpo_annotations.csv.gz",
+                        save_dir = save_dir,
+                        overwrite = force_new,
+                        ...)
   {
     d <- data.table::fread(save_path, header = TRUE)
-    d <- d[!is.na(phenotype)]
-    data.table::setnames(d,"phenotype","hpo_name")
+    data.table::setnames(d,"phenotype","hpo_name", skip_absent = TRUE)
+    d <- d[!is.na(hpo_name)]
     d <- add_hpo_id(d, hpo = hpo)
   }
   {
     #### Add subset with fixed hpo_names ####
     # https://github.com/neurogenomics/RareDiseasePrioritisation/issues/31#issuecomment-1989079044
-    fixmap <- data.table::fread("https://github.com/neurogenomics/RareDiseasePrioritisation/files/14562614/mismatched_hpo_names_fixed.csv")
+    save_path_fixmap <- get_data(file = "mismatched_hpo_names_fixed.csv.gz",
+                                 save_dir = save_dir,
+                                 overwrite = force_new,
+                                 ...)
+    fixmap <- data.table::fread(save_path_fixmap)
     d <- rbind(d[!hpo_name %in% unique(fixmap$hpo_name)],
                fixmap, fill=TRUE)
   }
diff --git a/man/gpt_annot_read.Rd b/man/gpt_annot_read.Rd
index 21eec84..fb27bf5 100644
--- a/man/gpt_annot_read.Rd
+++ b/man/gpt_annot_read.Rd
@@ -5,18 +5,17 @@
 \title{Read annotations from GPT}
 \usage{
 gpt_annot_read(
-  save_path = file.path(KGExplorer::cache_dir(package = "HPOExplorer"),
-    "gpt4_hpo_annotations.csv"),
-  phenotype_to_genes = load_phenotype_to_genes(),
+  save_dir = KGExplorer::cache_dir(package = "HPOExplorer"),
+  phenotype_to_genes = load_phenotype_to_genes(save_dir = save_dir),
   force_new = FALSE,
   hpo = get_hpo(),
   include_nogenes = TRUE,
-  verbose = TRUE
+  verbose = TRUE,
+  ...
 )
 }
 \arguments{
-\item{save_path}{Path to annotations CSV file.
-If the file does not exist, the data will be downloaded from GitHub.}
+\item{save_dir}{Directory to save a file to.}
 
 \item{phenotype_to_genes}{Output of
 \link{load_phenotype_to_genes} mapping phenotypes
@@ -31,6 +30,19 @@ loaded from \link[KGExplorer]{get_ontology}.}
 \item{include_nogenes}{Include phenotypes with no associated genes.}
 
 \item{verbose}{Print messages.}
+
+\item{...}{
+  Arguments passed on to \code{\link[=get_data]{get_data}}
+  \describe{
+    \item{\code{add_version}}{Add the release version
+to the returned object's \link[base]{attributes}}
+    \item{\code{file}}{name or vector of names of files to be downloaded. If \code{NULL},
+all assets attached to the release will be downloaded.}
+    \item{\code{repo}}{Repository name in format "owner/repo". Defaults to \code{guess_repo()}.}
+    \item{\code{tag}}{tag for the GitHub release to which this data should be attached.}
+    \item{\code{overwrite}}{Should any local files of the same name be overwritten?
+default \code{TRUE}.}
+  }}
 }
 \value{
 data.table of phenotype annotations