diff --git a/score.py b/score.py
index eb851d4..be94296 100644
--- a/score.py
+++ b/score.py
@@ -5,17 +5,17 @@
 - ROC curve
 - PR curve
 """
-from glob import glob
 import argparse
 import json
 import os
+from glob import glob
 
-import pandas as pd
 import numpy as np
-from sklearn.metrics import roc_auc_score, average_precision_score
+import pandas as pd
+from sklearn.metrics import average_precision_score, roc_auc_score
 
-GOLDSTANDARD_COLS = {"id": str, "disease": int}
-PREDICTION_COLS = {"id": str, "disease_probability": np.float64}
+GOLDSTANDARD_COLS = {"epr_number": str, "disease_probability": str}
+PREDICTION_COLS = {"epr_number": str, "disease_probability": np.float64}
 
 
 def get_args():
@@ -27,12 +27,21 @@ def get_args():
     return parser.parse_args()
 
 
-def score(gold, gold_col, pred, pred_col):
+def score(gold, pred, id_colname, prob_colname):
     """
     Calculate metrics for: AUC-ROC, AUCPR
     """
-    roc = roc_auc_score(gold[gold_col], pred[pred_col])
-    pr = average_precision_score(gold[gold_col], pred[pred_col])
+    # Join the two dataframes so that the order of the ids is the same
+    # between goldstandard and prediction.
+    merged = gold.merge(pred, how="left", on=id_colname)
+    roc = roc_auc_score(
+        merged[prob_colname + "_x"],
+        merged[prob_colname + "_y"]
+    )
+    pr = average_precision_score(
+        merged[prob_colname + "_x"],
+        merged[prob_colname + "_y"]
+    )
     return {"auc_roc": roc, "auprc": pr}
 
 
@@ -44,10 +53,17 @@ def extract_gs_file(folder):
             "Expected exactly one gold standard file in folder. "
             f"Got {len(files)}. Exiting."
         )
     return files[0]
+
+
+def preprocess(df, colname):
+    """Preprocess dataframe and convert column as needed."""
+    df = df[~df[colname].isin([".M"])]
+    df[colname] = df[colname].astype(int)
+    return df
 
 
 def main():
     """Main function."""
     args = get_args()
 
@@ -71,7 +87,8 @@ def main():
             usecols=GOLDSTANDARD_COLS,
             dtype=GOLDSTANDARD_COLS
         )
-        scores = score(gold, "disease", pred, "disease_probability")
+        gold = preprocess(gold, "disease_probability")
+        scores = score(gold, pred, "epr_number", "disease_probability")
         status = "SCORED"
         errors = ""
     except ValueError:
diff --git a/validate.py b/validate.py
index c9373a2..1847480 100644
--- a/validate.py
+++ b/validate.py
@@ -13,8 +13,8 @@
 import numpy as np
 import pandas as pd
 
-GOLDSTANDARD_COLS = {"id": str, "disease": int}
-EXPECTED_COLS = {"id": str, "disease_probability": np.float64}
+GOLDSTANDARD_COLS = {"epr_number": str, "disease_probability": str}
+EXPECTED_COLS = {"epr_number": str, "disease_probability": np.float64}
 
 
 def get_args():
@@ -28,18 +28,18 @@ def get_args():
 
 def check_dups(pred):
     """Check for duplicate participant IDs."""
-    duplicates = pred.duplicated(subset=["id"])
+    duplicates = pred.duplicated(subset=["epr_number"])
     if duplicates.any():
         return (
             f"Found {duplicates.sum()} duplicate ID(s): "
-            f"{pred[duplicates].id.to_list()}"
+            f"{pred[duplicates].epr_number.to_list()}"
         )
     return ""
 
 
 def check_missing_ids(gold, pred):
     """Check for missing participant IDs."""
-    pred = pred.set_index("id")
+    pred = pred.set_index("epr_number")
     missing_ids = gold.index.difference(pred.index)
     if missing_ids.any():
         return (
@@ -51,7 +51,7 @@ def check_missing_ids(gold, pred):
 
 def check_unknown_ids(gold, pred):
     """Check for unknown participant IDs."""
-    pred = pred.set_index("id")
+    pred = pred.set_index("epr_number")
     unknown_ids = pred.index.difference(gold.index)
     if unknown_ids.any():
         return (
@@ -92,7 +92,7 @@ def validate(gold_folder, pred_file):
     """Validate predictions file against goldstandard."""
     errors = []
     gold_file = extract_gs_file(gold_folder)
-    gold = pd.read_csv(gold_file, dtype=GOLDSTANDARD_COLS, index_col="id")
+    gold = pd.read_csv(gold_file, dtype=GOLDSTANDARD_COLS, index_col="epr_number")
     try:
         pred = pd.read_csv(
             pred_file,