From c9a066d5c52027e7ec5c2c86f42746a0582f4bb6 Mon Sep 17 00:00:00 2001 From: verena <9377970+vpchung@users.noreply.github.com> Date: Tue, 2 Jul 2024 17:00:36 -0700 Subject: [PATCH 1/4] update colnames per organizer request --- score.py | 4 ++-- validate.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/score.py b/score.py index eb851d4..788f628 100644 --- a/score.py +++ b/score.py @@ -14,8 +14,8 @@ import numpy as np from sklearn.metrics import roc_auc_score, average_precision_score -GOLDSTANDARD_COLS = {"id": str, "disease": int} -PREDICTION_COLS = {"id": str, "disease_probability": np.float64} +GOLDSTANDARD_COLS = {"epr_number": str, "disease_probability": str} +PREDICTION_COLS = {"epr_number": str, "disease_probability": np.float64} def get_args(): diff --git a/validate.py b/validate.py index c9373a2..1847480 100644 --- a/validate.py +++ b/validate.py @@ -13,8 +13,8 @@ import numpy as np import pandas as pd -GOLDSTANDARD_COLS = {"id": str, "disease": int} -EXPECTED_COLS = {"id": str, "disease_probability": np.float64} +GOLDSTANDARD_COLS = {"epr_number": str, "disease_probability": str} +EXPECTED_COLS = {"epr_number": str, "disease_probability": np.float64} def get_args(): @@ -28,18 +28,18 @@ def get_args(): def check_dups(pred): """Check for duplicate participant IDs.""" - duplicates = pred.duplicated(subset=["id"]) + duplicates = pred.duplicated(subset=["epr_number"]) if duplicates.any(): return ( f"Found {duplicates.sum()} duplicate ID(s): " - f"{pred[duplicates].id.to_list()}" + f"{pred[duplicates].epr_number.to_list()}" ) return "" def check_missing_ids(gold, pred): """Check for missing participant IDs.""" - pred = pred.set_index("id") + pred = pred.set_index("epr_number") missing_ids = gold.index.difference(pred.index) if missing_ids.any(): return ( @@ -51,7 +51,7 @@ def check_missing_ids(gold, pred): def check_unknown_ids(gold, pred): """Check for unknown participant IDs.""" - pred = pred.set_index("id") + pred = pred.set_index("epr_number") unknown_ids = pred.index.difference(gold.index) if unknown_ids.any(): return ( @@ -92,7 +92,7 @@ def validate(gold_folder, pred_file): """Validate predictions file against goldstandard.""" errors = [] gold_file = extract_gs_file(gold_folder) - gold = pd.read_csv(gold_file, dtype=GOLDSTANDARD_COLS, index_col="id") + gold = pd.read_csv(gold_file, dtype=GOLDSTANDARD_COLS, index_col="epr_number") try: pred = pd.read_csv( pred_file, From 055b89f795a533aab2b4a956db4d6a2133cce686 Mon Sep 17 00:00:00 2001 From: verena <9377970+vpchung@users.noreply.github.com> Date: Tue, 2 Jul 2024 17:01:12 -0700 Subject: [PATCH 2/4] ensure subject order is same between gold and pred --- score.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/score.py b/score.py index 788f628..d2ca164 100644 --- a/score.py +++ b/score.py @@ -27,12 +27,21 @@ def get_args(): return parser.parse_args() -def score(gold, gold_col, pred, pred_col): +def score(gold, pred, id_colname, prob_colname): """ Calculate metrics for: AUC-ROC, AUCPR """ - roc = roc_auc_score(gold[gold_col], pred[pred_col]) - pr = average_precision_score(gold[gold_col], pred[pred_col]) + # Join the two dataframes so that the order of the ids are the same + # between goldstandard and prediction. + merged = gold.merge(pred, how="left", on=id_colname) + roc = roc_auc_score( + merged[prob_colname + "_x"], + merged[prob_colname + "_y"] + ) + pr = average_precision_score( + merged[prob_colname + "_x"], + merged[prob_colname + "_y"] + ) return {"auc_roc": roc, "auprc": pr} From 66a4f355a7bdc2289a036f861c1dc654ad964fad Mon Sep 17 00:00:00 2001 From: verena <9377970+vpchung@users.noreply.github.com> Date: Tue, 2 Jul 2024 17:01:35 -0700 Subject: [PATCH 3/4] preprocess data before scoring --- score.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/score.py b/score.py index d2ca164..33e2102 100644 --- a/score.py +++ b/score.py @@ -57,6 +57,13 @@ def extract_gs_file(folder): return files[0] +def preprocess(df, colname): + """Preprocess dataframe and convert column as needed.""" + df = df[~df[colname].isin([".M"])] + df[colname] = df[colname].astype(int) + return df + + def main(): """Main function.""" args = get_args() @@ -80,7 +87,8 @@ def main(): usecols=GOLDSTANDARD_COLS, dtype=GOLDSTANDARD_COLS ) - scores = score(gold, "disease", pred, "disease_probability") + gold = preprocess(gold, "disease_probability") + scores = score(gold, pred, "epr_number", "disease_probability") status = "SCORED" errors = "" except ValueError: From eda41f7cad0f98adce0320159d62842ae9693cca Mon Sep 17 00:00:00 2001 From: verena <9377970+vpchung@users.noreply.github.com> Date: Tue, 2 Jul 2024 17:01:45 -0700 Subject: [PATCH 4/4] general lint --- score.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/score.py b/score.py index 33e2102..be94296 100644 --- a/score.py +++ b/score.py @@ -5,14 +5,14 @@ - ROC curve - PR curve """ -from glob import glob import argparse import json import os +from glob import glob -import pandas as pd import numpy as np -from sklearn.metrics import roc_auc_score, average_precision_score +import pandas as pd +from sklearn.metrics import average_precision_score, roc_auc_score GOLDSTANDARD_COLS = {"epr_number": str, "disease_probability": str} PREDICTION_COLS = {"epr_number": str, "disease_probability": np.float64} @@ -53,7 +53,6 @@ def extract_gs_file(folder): "Expected exactly one gold standard file in folder. " f"Got {len(files)}. Exiting." ) - return files[0]