From a955184bc5ef6e0f884257bbd87a511afb201f0a Mon Sep 17 00:00:00 2001 From: Jenny Medina Date: Mon, 6 May 2024 22:24:05 -0400 Subject: [PATCH 1/2] Update validate.py --- validate.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/validate.py b/validate.py index 96e8cfe..d5d32d4 100644 --- a/validate.py +++ b/validate.py @@ -8,9 +8,12 @@ import argparse import json +import os -import pandas as pd import numpy as np +import pandas as pd + +from glob import glob EXPECTED_COLS = { 'id': str, @@ -23,7 +26,7 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument("-p", "--predictions_file", type=str, required=True) - parser.add_argument("-g", "--goldstandard_file", + parser.add_argument("-g", "--goldstandard_folder", type=str, required=True) parser.add_argument("-o", "--output", type=str, default="results.json") @@ -82,10 +85,20 @@ def check_prob_values(pred): return "" -def validate(gold_file, pred_file): +def extract_gs_file(folder): + """Extract gold standard file from folder.""" + files = glob(os.path.join(folder, "*")) + if len(files) != 1: + raise ValueError(f"Expected exactly one gold standard file in folder. Got {len(files)}. Exiting.") + + return files[0] + + +def validate(gold_folder, pred_file): """Validate predictions file against goldstandard.""" errors = [] + gold_file = extract_gs_file(gold_folder) gold = pd.read_csv(gold_file, index_col="id") try: pred = pd.read_csv( @@ -117,7 +130,7 @@ def main(): errors = [f.read()] else: errors = validate( - gold_file=args.goldstandard_file, + gold_folder=args.goldstandard_folder, pred_file=args.predictions_file ) From f5d5a3c8c0aa13d3140434251df11c490fa059a0 Mon Sep 17 00:00:00 2001 From: Jenny Medina Date: Mon, 6 May 2024 22:28:58 -0400 Subject: [PATCH 2/2] Update score.py --- score.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/score.py b/score.py index 65a9024..30f91df 100644 --- a/score.py +++ b/score.py @@ -8,16 +8,18 @@ import argparse import json +import os import pandas as pd from sklearn.metrics import roc_auc_score, average_precision_score +from glob import glob def get_args(): """Set up command-line interface and get arguments.""" parser = argparse.ArgumentParser() parser.add_argument("-p", "--predictions_file", type=str, required=True) - parser.add_argument("-g", "--goldstandard_file", type=str, required=True) + parser.add_argument("-g", "--goldstandard_folder", type=str, required=True) parser.add_argument("-o", "--output", type=str, default="results.json") return parser.parse_args() @@ -31,6 +33,15 @@ def score(gold, gold_col, pred, pred_col): return {"auc_roc": roc, "auprc": pr} +def extract_gs_file(folder): + """Extract gold standard file from folder.""" + files = glob(os.path.join(folder, "*")) + if len(files) != 1: + raise ValueError(f"Expected exactly one gold standard file in folder. Got {len(files)}. Exiting.") + + return files[0] + + def main(): """Main function.""" args = get_args() @@ -38,9 +49,11 @@ def main(): with open(args.output, encoding="utf-8") as out: res = json.load(out) + gold_file = extract_gs_file(args.goldstandard_folder) + if res.get("validation_status") == "VALIDATED": pred = pd.read_csv(args.predictions_file) - gold = pd.read_csv(args.goldstandard_file) + gold = pd.read_csv(gold_file) scores = score(gold, "disease", pred, "disease_probability") status = "SCORED" else: