From 4f521a6a7bfe17b5348fb56b269d30c734a73640 Mon Sep 17 00:00:00 2001 From: Eric Hanson <5846501+ericphanson@users.noreply.github.com> Date: Tue, 30 Nov 2021 18:08:49 -0500 Subject: [PATCH 01/21] wip --- Project.toml | 2 + src/Lighthouse.jl | 4 ++ src/learn.jl | 142 ++++++++++++++++++++++++++-------------------- src/row.jl | 25 ++++++++ 4 files changed, 110 insertions(+), 63 deletions(-) create mode 100644 src/row.jl diff --git a/Project.toml b/Project.toml index e59df5e..cac7a1a 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.13.2" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +Legolas = "741b9549-f6ed-4911-9fbf-4a1c0c97f0cd" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" @@ -19,6 +20,7 @@ CairoMakie = "0.5.2, 0.6" Makie = "0.13.14, 0.14, 0.15" StatsBase = "0.33" TensorBoardLogger = "0.1" +Legolas = "0.3" julia = "1.5" [extras] diff --git a/src/Lighthouse.jl b/src/Lighthouse.jl index 1ffacb1..38f9095 100644 --- a/src/Lighthouse.jl +++ b/src/Lighthouse.jl @@ -6,6 +6,7 @@ using StatsBase: StatsBase using TensorBoardLogger using Makie using Printf +using Legolas include("plotting.jl") @@ -18,7 +19,10 @@ export confusion_matrix, accuracy, binary_statistics, cohens_kappa, calibration_ include("classifier.jl") export AbstractClassifier +include("row.jl") + include("learn.jl") export LearnLogger, learn!, upon, evaluate!, predict! + end # module diff --git a/src/learn.jl b/src/learn.jl index 8beb8d6..fd20746 100644 --- a/src/learn.jl +++ b/src/learn.jl @@ -238,7 +238,7 @@ function _calculate_stratified_ea_kappas(predicted_hard_labels, elected_hard_lab elected = elected_hard_labels[index] k = _calculate_ea_kappas(predicted, elected, class_count) push!(kappas, - group => (per_class=k.per_class, multiclass=k.multiclass, n=sum(index))) + group => (per_class=k.per_class_kappas, multiclass=k.multiclass_kappa, n=sum(index))) end return sort(kappas; by=p -> last(p).multiclass) end @@ -246,9 +246,9 @@ end """ _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, classes) -Return `NamedTuple` with keys `:per_class`, `:multiclass` containing the Cohen's +Return `NamedTuple` with keys `:per_class_kappas`, `:multiclass_kappa` containing the Cohen's Kappa per-class and over all classes, respectively. The value of output key -`:per_class` is an `Array` such that item `i` is the Cohen's kappa calculated +`:per_class_kappas` is an `Array` such that item `i` is the Cohen's kappa calculated for class `i`. Where... @@ -272,15 +272,15 @@ function _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, class_ elected = ((label == class_index) + 1 for label in elected_hard_labels) return first(cohens_kappa(CLASS_VS_ALL_CLASS_COUNT, zip(predicted, elected))) end - return (per_class=per_class, multiclass=multiclass) + return (per_class_kappas=per_class, multiclass_kappa=multiclass) end """ _calculate_ira_kappas(votes, classes) -Return `NamedTuple` with keys `:per_class`, `:multiclass` containing the Cohen's +Return `NamedTuple` with keys `:per_class_IRA_kappas`, `:multiclass_IRA_kappas` containing the Cohen's Kappa for inter-rater agreement (IRA) per-class and over all classes, respectively. -The value of output key `:per_class` is an `Array` such that item `i` is the +The value of output key `:per_class_IRA_kappas` is an `Array` such that item `i` is the IRA kappa calculated for class `i`. Where... @@ -292,12 +292,13 @@ Where... 
- `classes` all possible classes voted on.

-Returns `nothing` if `votes` has only a single voter (i.e., a single column) or if
+Returns `(per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing)` if `votes` has only a single voter (i.e., a single column) or if
no two voters rated the same sample. Note that vote entries of `0` are taken to
mean that the voter did not rate that sample.
"""
function _calculate_ira_kappas(votes, classes)
-    (isnothing(votes) || size(votes, 2) < 2) && return nothing # no votes given or only one expert
+    # no votes given or only one expert:
+    (isnothing(votes) || size(votes, 2) < 2) && return (per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing)

    all_hard_label_pairs = Array{Int}(undef, 0, 2)
    num_voters = size(votes, 2)
@@ -319,7 +320,7 @@ function _calculate_ira_kappas(votes, classes)
                                       hard_label_pairs)
        return first(cohens_kappa(CLASS_VS_ALL_CLASS_COUNT, class_v_other_hard_label_pair))
    end
-    return (per_class=per_class_ira, multiclass=multiclass_ira)
+    return (per_class_IRA_kappas=per_class_ira, multiclass_IRA_kappas=multiclass_ira)
 end

function _spearman_corr(predicted_soft_labels, elected_soft_labels)
@@ -442,7 +443,9 @@ function _get_optimal_threshold_from_ROC(per_class_roc_curves; thresholds,
 end

function _validate_threshold_class(optimal_threshold_class, classes)
-    isnothing(optimal_threshold_class) && return nothing
+    if ismissing(optimal_threshold_class) || isnothing(optimal_threshold_class)
+        return nothing
+    end
    length(classes) == 2 ||
        throw(ArgumentError("Only valid for binary classification problems"))
    optimal_threshold_class in Set([1, 2]) ||
@@ -451,16 +454,16 @@ end

"""
-    evaluation_metrics(predicted_hard_labels::AbstractVector,
-                       predicted_soft_labels::AbstractMatrix,
-                       elected_hard_labels::AbstractVector,
-                       classes,
-                       thresholds=0.0:0.01:1.0;
-                       votes::Union{Nothing,AbstractMatrix}=nothing,
-                       strata::Union{Nothing,AbstractVector{Set{T}} where T}=nothing,
-                       optimal_threshold_class::Union{Nothing,Integer}=nothing)
-
-Returns dictionary containing a battery of classifier performance
+    evaluation_metrics_row(predicted_hard_labels::AbstractVector,
+                           predicted_soft_labels::AbstractMatrix,
+                           elected_hard_labels::AbstractVector,
+                           classes,
+                           thresholds=0.0:0.01:1.0;
+                           votes::Union{Nothing,AbstractMatrix}=nothing,
+                           strata::Union{Nothing,AbstractVector{Set{T}} where T}=nothing,
+                           optimal_threshold_class::Union{Nothing,Integer}=nothing)
+
+Returns `EvaluationRow` containing a battery of classifier performance
metrics that each compare `predicted_soft_labels` and/or
`predicted_hard_labels` against `elected_hard_labels`.

@@ -495,12 +498,12 @@ Where...

See also [`evaluation_metrics_plot`](@ref). 
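A minimal usage sketch of the renamed function (not part of the patch itself; the class names and numbers are made-up toy data, and the call is namespace-qualified since these patches do not appear to export it):

using Lighthouse
predicted_soft = [0.8 0.2; 0.3 0.7; 0.1 0.9]  # rows = samples, columns = classes
predicted_hard = [1, 2, 2]                    # model's hard label per sample
elected_hard   = [1, 2, 1]                    # elected "ground truth" per sample
row = Lighthouse.evaluation_metrics_row(predicted_hard, predicted_soft, elected_hard,
                                        ["class_A", "class_B"], 0.0:0.01:1.0)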
""" -function evaluation_metrics(predicted_hard_labels::AbstractVector, +function evaluation_metrics_row(predicted_hard_labels::AbstractVector, predicted_soft_labels::AbstractMatrix, elected_hard_labels::AbstractVector, classes, thresholds; votes::Union{Nothing,AbstractMatrix}=nothing, strata::Union{Nothing,AbstractVector{Set{T}} where T}=nothing, - optimal_threshold_class::Union{Nothing,Integer}=nothing) + optimal_threshold_class::Union{Missing,Integer}=missing) _validate_threshold_class(optimal_threshold_class, classes) class_count = length(classes) @@ -508,22 +511,16 @@ function evaluation_metrics(predicted_hard_labels::AbstractVector, class_labels = string.(class_vector) per_class_stats = per_class_confusion_statistics(predicted_soft_labels, elected_hard_labels, thresholds) - plot_dict = Dict() - plot_dict["class_labels"] = class_labels - plot_dict["thresholds"] = thresholds # ROC curves - plot_dict["per_class_roc_curves"] = [(map(t -> t.false_positive_rate, stats), + per_class_roc_curves = [(map(t -> t.false_positive_rate, stats), map(t -> t.true_positive_rate, stats)) for stats in per_class_stats] - plot_dict["per_class_roc_aucs"] = [area_under_curve(x, y) - for (x, y) in plot_dict["per_class_roc_curves"]] + per_class_roc_aucs = [area_under_curve(x, y) + for (x, y) in per_class_roc_curves] # Optionally calculate optimal threshold - if !isnothing(optimal_threshold_class) - plot_dict["optimal_threshold_class"] = optimal_threshold_class - threshold = nothing - + if !ismissing(optimal_threshold_class) # If votes exist, calculate the threshold based on comparing against # vote probabilities. Otherwise, use the ROC curve. if !isnothing(votes) @@ -531,21 +528,24 @@ function evaluation_metrics(predicted_hard_labels::AbstractVector, votes; thresholds=thresholds, class_of_interest_index=optimal_threshold_class) - threshold = c.threshold - plot_dict["discrimination_calibration_curve"] = c.plot_curve_data - plot_dict["discrimination_calibration_score"] = c.mse + optimal_threshold = c.threshold + discrimination_calibration_curve = c.plot_curve_data + discrimination_calibration_score = c.mse expert_cal = _calculate_voter_discrimination_calibration(votes; class_of_interest_index=optimal_threshold_class) - plot_dict["per_expert_discrimination_calibration_curves"] = expert_cal.plot_curve_data - plot_dict["per_expert_discrimination_calibration_scores"] = expert_cal.mse + per_expert_discrimination_calibration_curves = expert_cal.plot_curve_data + per_expert_discrimination_calibration_scores = expert_cal.mse else + discrimination_calibration_curve = missing + discrimination_calibration_score = missing + per_expert_discrimination_calibration_curves = missing + per_expert_discrimination_calibration_scores = missing # ...based on ROC curve otherwise - threshold = _get_optimal_threshold_from_ROC(plot_dict["per_class_roc_curves"]; + optimal_threshold = _get_optimal_threshold_from_ROC(per_class_roc_curves; thresholds=thresholds, class_of_interest_index=optimal_threshold_class) end - plot_dict["optimal_threshold"] = threshold # Recalculate `predicted_hard_labels` with this new threshold other_class = optimal_threshold_class == 1 ? 2 : 1 @@ -553,53 +553,69 @@ function evaluation_metrics(predicted_hard_labels::AbstractVector, predicted_hard_labels[i] = row[optimal_threshold_class] .>= threshold ? 
optimal_threshold_class : other_class end + else + discrimination_calibration_curve = missing + discrimination_calibration_score = missing + per_expert_discrimination_calibration_curves = missing + per_expert_discrimination_calibration_scores = missing + optimal_threshold = missing end # PR curves - plot_dict["per_class_pr_curves"] = [(map(t -> t.true_positive_rate, stats), + per_class_pr_curves = [(map(t -> t.true_positive_rate, stats), map(t -> t.precision, stats)) for stats in per_class_stats] - # Cohen's kappa - kappas = _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, class_count) - plot_dict["per_class_kappas"] = kappas.per_class - plot_dict["multiclass_kappa"] = kappas.multiclass - ira = _calculate_ira_kappas(votes, classes) - if !isnothing(ira) - plot_dict["per_class_IRA_kappas"] = ira.per_class - plot_dict["multiclass_IRA_kappas"] = ira.multiclass - end - + # Stratified kappas - if !isnothing(strata) - plot_dict["stratified_kappas"] = _calculate_stratified_ea_kappas(predicted_hard_labels, + if isnothing(strata) + stratified_kappas = missing + else + stratified_kappas = _calculate_stratified_ea_kappas(predicted_hard_labels, elected_hard_labels, class_count, strata) end # Reliability calibration curves - per_class_reliability_calibration_curves = map(1:class_count) do class_index + per_class_reliability_calibration = map(1:class_count) do class_index class_probabilities = view(predicted_soft_labels, :, class_index) return calibration_curve(class_probabilities, elected_hard_labels .== class_index) end - plot_dict["per_class_reliability_calibration_curves"] = map(x -> (mean.(x.bins), + per_class_reliability_calibration_curves = map(x -> (mean.(x.bins), x.fractions), - per_class_reliability_calibration_curves) - plot_dict["per_class_reliability_calibration_scores"] = map(x -> x.mean_squared_error, - per_class_reliability_calibration_curves) + per_class_reliability_calibration) + per_class_reliability_calibration_scores = map(x -> x.mean_squared_error, + per_class_reliability_calibration) - # Confusion matrix - plot_dict["confusion_matrix"] = confusion_matrix(class_count, - zip(predicted_hard_labels, - elected_hard_labels)) # Log Spearman correlation, iff this is a binary classification problem if length(classes) == 2 && !isnothing(votes) - plot_dict["spearman_correlation"] = _calculate_spearman_correlation(predicted_soft_labels, + spearman_correlation = _calculate_spearman_correlation(predicted_soft_labels, votes, classes) + else + spearman_correlation = missing end - return plot_dict + + return EvaluationRow(; class_labels, + confusion_matrix = confusion_matrix(class_count, zip(predicted_hard_labels, elected_hard_labels)), + spearman_correlation, per_class_reliability_calibration_curves, per_class_reliability_calibration_scores, + _calculate_ira_kappas(votes, classes)..., + _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, class_count)..., + stratified_kappas,per_class_pr_curves, + per_class_roc_curves, per_class_roc_aucs, + discrimination_calibration_curve, + discrimination_calibration_score, + per_expert_discrimination_calibration_curves, + per_expert_discrimination_calibration_scores, + optimal_threshold, + thresholds, + ) +end + +function evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) + row = evaluation_metrics_row(args...; optimal_threshold_class=something(optimal_threshold_class, missing), kwargs...) 
+ return Dict(string(k) => v for (k,v) in pairs(NamedTuple(row)) if !ismissing(v)) end """ diff --git a/src/row.jl b/src/row.jl new file mode 100644 index 0000000..7794999 --- /dev/null +++ b/src/row.jl @@ -0,0 +1,25 @@ +vec_to_mat(mat::AbstractMatrix) = mat +function vec_to_mat(vec::AbstractVector) + n = isqrt(length(vec)) + return reshape(vec, n, n) +end +vec_to_mat(::Missing) = missing + +const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", + class_labels::Union{Missing, Vector{String}}, + confusion_matrix::Union{Missing, Matrix{Int64}}=vec_to_mat(confusion_matrix), + discrimination_calibration_curve::Union{Missing, Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}, + discrimination_calibration_score::Union{Missing, Float64}, + multiclass_IRA_kappas::Union{Missing, Float64}, + multiclass_kappa::Union{Missing, Float64}, + optimal_threshold::Union{Missing, Float64}, + per_class_kappas::Union{Missing, Vector{Float64}}, + per_class_pr_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}}, + per_class_reliability_calibration_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}}, + per_class_reliability_calibration_scores::Union{Missing, Vector{Float64}}, + per_class_roc_aucs::Union{Missing, Vector{Float64}}, + per_class_roc_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Float64}}}}, + per_expert_discrimination_calibration_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}}, + per_expert_discrimination_calibration_scores::Union{Missing, Vector{Float64}}, + spearman_correlation::Union{Missing, NamedTuple{(:ρ, :n, :ci_lower, :ci_upper), Tuple{Float64, Int64, Float64, Float64}}}, + thresholds::Union{Missing, Vector{Float64}}) From 8bf4cfcdf69c8687fece662ee5bf85b1fd1a255f Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 18:56:51 +0000 Subject: [PATCH 02/21] Bump patch version, fix tests --- Project.toml | 2 +- src/learn.jl | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Project.toml b/Project.toml index c30157b..3b39591 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Lighthouse" uuid = "ac2c24cd-07f0-4848-96b2-1b82c3ea0e59" authors = ["Beacon Biosignals, Inc."] -version = "0.13.3" +version = "0.13.5" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" diff --git a/src/learn.jl b/src/learn.jl index fd20746..fbd4be6 100644 --- a/src/learn.jl +++ b/src/learn.jl @@ -298,7 +298,7 @@ mean that the voter did not rate that sample. 
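For orientation, a toy sketch of the `votes` matrix shape this helper expects (rows are samples, columns are voters, `0` marks "did not rate"; the numbers and class names are made up, and `_calculate_ira_kappas` is an internal, unexported helper):

# rows are samples, columns are voters; 0 = voter did not rate that sample
votes = [1 1 0;
         2 0 2;
         1 2 1]
Lighthouse._calculate_ira_kappas(votes, ["class_A", "class_B"])
# after this change, returns (per_class_IRA_kappas=..., multiclass_IRA_kappas=...),
# with both fields `missing` when there is a single voter or no samples in common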
""" function _calculate_ira_kappas(votes, classes) # no votes given or only one expert: - (isnothing(votes) || size(votes, 2) < 2) && return (per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) + (isnothing(votes) || size(votes, 2) < 2) && return (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) all_hard_label_pairs = Array{Int}(undef, 0, 2) num_voters = size(votes, 2) @@ -308,7 +308,7 @@ function _calculate_ira_kappas(votes, classes) end end hard_label_pairs = filter(row -> all(row .!= 0), collect(eachrow(all_hard_label_pairs))) - length(hard_label_pairs) > 0 || return nothing # No common observations voted on + length(hard_label_pairs) > 0 || return (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) # No common observations voted on length(hard_label_pairs) < 10 && @warn "...only $(length(hard_label_pairs)) in common, potentially questionable IRA results" @@ -320,7 +320,7 @@ function _calculate_ira_kappas(votes, classes) hard_label_pairs) return first(cohens_kappa(CLASS_VS_ALL_CLASS_COUNT, class_v_other_hard_label_pair)) end - return (per_class_IRA_kappas=per_class_ira, multiclass_IRA_kappas=multiclass_ira) + return (; per_class_IRA_kappas = per_class_ira, multiclass_IRA_kappas = multiclass_ira) end function _spearman_corr(predicted_soft_labels, elected_soft_labels) @@ -550,7 +550,7 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, # Recalculate `predicted_hard_labels` with this new threshold other_class = optimal_threshold_class == 1 ? 2 : 1 for (i, row) in enumerate(eachrow(predicted_soft_labels)) - predicted_hard_labels[i] = row[optimal_threshold_class] .>= threshold ? + predicted_hard_labels[i] = row[optimal_threshold_class] .>= optimal_threshold ? optimal_threshold_class : other_class end else @@ -566,7 +566,6 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, map(t -> t.precision, stats)) for stats in per_class_stats] - # Stratified kappas if isnothing(strata) stratified_kappas = missing From f4fc682f2c35bca8e1030cad44bacf1b797d243a Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 19:52:29 +0000 Subject: [PATCH 03/21] add missing param --- src/learn.jl | 48 ++++++++++++++++++++++++------------------------ src/row.jl | 35 ++++++++++++++++++----------------- test/learn.jl | 8 +++++--- 3 files changed, 47 insertions(+), 44 deletions(-) diff --git a/src/learn.jl b/src/learn.jl index fbd4be6..d8c1bde 100644 --- a/src/learn.jl +++ b/src/learn.jl @@ -499,11 +499,11 @@ Where... See also [`evaluation_metrics_plot`](@ref). 
""" function evaluation_metrics_row(predicted_hard_labels::AbstractVector, - predicted_soft_labels::AbstractMatrix, - elected_hard_labels::AbstractVector, classes, thresholds; - votes::Union{Nothing,AbstractMatrix}=nothing, - strata::Union{Nothing,AbstractVector{Set{T}} where T}=nothing, - optimal_threshold_class::Union{Missing,Integer}=missing) + predicted_soft_labels::AbstractMatrix, + elected_hard_labels::AbstractVector, classes, thresholds; + votes::Union{Nothing,AbstractMatrix} = nothing, + strata::Union{Nothing,AbstractVector{Set{T}} where T} = nothing, + optimal_threshold_class::Union{Missing,Integer} = missing) _validate_threshold_class(optimal_threshold_class, classes) class_count = length(classes) @@ -514,10 +514,9 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, # ROC curves per_class_roc_curves = [(map(t -> t.false_positive_rate, stats), - map(t -> t.true_positive_rate, stats)) - for stats in per_class_stats] - per_class_roc_aucs = [area_under_curve(x, y) - for (x, y) in per_class_roc_curves] + map(t -> t.true_positive_rate, stats)) + for stats in per_class_stats] + per_class_roc_aucs = [area_under_curve(x, y) for (x, y) in per_class_roc_curves] # Optionally calculate optimal threshold if !ismissing(optimal_threshold_class) @@ -533,7 +532,7 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, discrimination_calibration_score = c.mse expert_cal = _calculate_voter_discrimination_calibration(votes; - class_of_interest_index=optimal_threshold_class) + class_of_interest_index = optimal_threshold_class) per_expert_discrimination_calibration_curves = expert_cal.plot_curve_data per_expert_discrimination_calibration_scores = expert_cal.mse else @@ -543,8 +542,8 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, per_expert_discrimination_calibration_scores = missing # ...based on ROC curve otherwise optimal_threshold = _get_optimal_threshold_from_ROC(per_class_roc_curves; - thresholds=thresholds, - class_of_interest_index=optimal_threshold_class) + thresholds = thresholds, + class_of_interest_index = optimal_threshold_class) end # Recalculate `predicted_hard_labels` with this new threshold @@ -563,17 +562,17 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, # PR curves per_class_pr_curves = [(map(t -> t.true_positive_rate, stats), - map(t -> t.precision, stats)) - for stats in per_class_stats] + map(t -> t.precision, stats)) + for stats in per_class_stats] # Stratified kappas if isnothing(strata) stratified_kappas = missing else stratified_kappas = _calculate_stratified_ea_kappas(predicted_hard_labels, - elected_hard_labels, - class_count, - strata) + elected_hard_labels, + class_count, + strata) end # Reliability calibration curves @@ -582,16 +581,16 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, return calibration_curve(class_probabilities, elected_hard_labels .== class_index) end per_class_reliability_calibration_curves = map(x -> (mean.(x.bins), - x.fractions), - per_class_reliability_calibration) + x.fractions), + per_class_reliability_calibration) per_class_reliability_calibration_scores = map(x -> x.mean_squared_error, - per_class_reliability_calibration) + per_class_reliability_calibration) # Log Spearman correlation, iff this is a binary classification problem if length(classes) == 2 && !isnothing(votes) spearman_correlation = _calculate_spearman_correlation(predicted_soft_labels, - votes, classes) + votes, classes) else spearman_correlation = missing end @@ 
-601,15 +600,16 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, spearman_correlation, per_class_reliability_calibration_curves, per_class_reliability_calibration_scores, _calculate_ira_kappas(votes, classes)..., _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, class_count)..., - stratified_kappas,per_class_pr_curves, + stratified_kappas, per_class_pr_curves, per_class_roc_curves, per_class_roc_aucs, discrimination_calibration_curve, discrimination_calibration_score, per_expert_discrimination_calibration_curves, per_expert_discrimination_calibration_scores, optimal_threshold, - thresholds, - ) + optimal_threshold_class, + thresholds + ) end function evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) diff --git a/src/row.jl b/src/row.jl index 7794999..fb247d5 100644 --- a/src/row.jl +++ b/src/row.jl @@ -6,20 +6,21 @@ end vec_to_mat(::Missing) = missing const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", - class_labels::Union{Missing, Vector{String}}, - confusion_matrix::Union{Missing, Matrix{Int64}}=vec_to_mat(confusion_matrix), - discrimination_calibration_curve::Union{Missing, Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}, - discrimination_calibration_score::Union{Missing, Float64}, - multiclass_IRA_kappas::Union{Missing, Float64}, - multiclass_kappa::Union{Missing, Float64}, - optimal_threshold::Union{Missing, Float64}, - per_class_kappas::Union{Missing, Vector{Float64}}, - per_class_pr_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}}, - per_class_reliability_calibration_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}}, - per_class_reliability_calibration_scores::Union{Missing, Vector{Float64}}, - per_class_roc_aucs::Union{Missing, Vector{Float64}}, - per_class_roc_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Float64}}}}, - per_expert_discrimination_calibration_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}}, - per_expert_discrimination_calibration_scores::Union{Missing, Vector{Float64}}, - spearman_correlation::Union{Missing, NamedTuple{(:ρ, :n, :ci_lower, :ci_upper), Tuple{Float64, Int64, Float64, Float64}}}, - thresholds::Union{Missing, Vector{Float64}}) + class_labels::Union{Missing,Vector{String}}, + confusion_matrix::Union{Missing,Matrix{Int64}} = vec_to_mat(confusion_matrix), + discrimination_calibration_curve::Union{Missing,Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}, + discrimination_calibration_score::Union{Missing,Float64}, + multiclass_IRA_kappas::Union{Missing,Float64}, + multiclass_kappa::Union{Missing,Float64}, + optimal_threshold::Union{Missing,Float64}, + optimal_threshold_class::Union{Missing,Int64}, + per_class_kappas::Union{Missing,Vector{Float64}}, + per_class_pr_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}}, + per_class_reliability_calibration_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}}, + per_class_reliability_calibration_scores::Union{Missing,Vector{Float64}}, + per_class_roc_aucs::Union{Missing,Vector{Float64}}, + per_class_roc_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Float64}}}}, + per_expert_discrimination_calibration_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}}, + per_expert_discrimination_calibration_scores::Union{Missing,Vector{Float64}}, + spearman_correlation::Union{Missing,NamedTuple{(:ρ, :n, 
:ci_lower, :ci_upper),Tuple{Float64,Int64,Float64,Float64}}}, + thresholds::Union{Missing,Vector{Float64}}) diff --git a/test/learn.jl b/test/learn.jl index 1d1b6aa..174e5a0 100644 --- a/test/learn.jl +++ b/test/learn.jl @@ -266,10 +266,12 @@ end end end -@testset "`_calculate_ira_kappas`" begin +@testset "Invalid `_calculate_ira_kappas`" begin classes = ["roy", "gee", "biv"] - @test isnothing(Lighthouse._calculate_ira_kappas([1; 1; 1; 1], classes)) # Only one voter... - @test isnothing(Lighthouse._calculate_ira_kappas([1 0; 1 0; 0 1], classes)) # No observations in common... + @test isequal(Lighthouse._calculate_ira_kappas([1; 1; 1; 1], classes), + (; per_class_IRA_kappas = missing, multiclass_IRA_kappas = missing)) # Only one voter... + @test isequal(Lighthouse._calculate_ira_kappas([1 0; 1 0; 0 1], classes), + (; per_class_IRA_kappas = missing, multiclass_IRA_kappas = missing)) # No observations in common... end @testset "Calculate `_spearman_corr`" begin From 4ccafc444372a96e7f71cc57d667cd5f5c5c4406 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 19:53:02 +0000 Subject: [PATCH 04/21] bump minor version --- Project.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 3b39591..d5bfd0b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Lighthouse" uuid = "ac2c24cd-07f0-4848-96b2-1b82c3ea0e59" authors = ["Beacon Biosignals, Inc."] -version = "0.13.5" +version = "0.14.0" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" @@ -17,10 +17,10 @@ TensorBoardLogger = "899adc3e-224a-11e9-021f-63837185c80f" [compat] CairoMakie = "0.7" +Legolas = "0.3" Makie = "0.16.5" StatsBase = "0.33" TensorBoardLogger = "0.1" -Legolas = "0.3" julia = "1.5" [extras] From 982ef8be1d5a41760d02582271034a59bea6056f Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 20:21:02 +0000 Subject: [PATCH 05/21] add codecov --- test/row.jl | 7 +++++++ test/runtests.jl | 1 + 2 files changed, 8 insertions(+) create mode 100644 test/row.jl diff --git a/test/row.jl b/test/row.jl new file mode 100644 index 0000000..69169d9 --- /dev/null +++ b/test/row.jl @@ -0,0 +1,7 @@ +@testset `vec_to_mat` begin + mat = [3 5 6; 6 7 8; 9 10 11] + @test Lighthouse.vec_to_mat(vec(mat)) == mat + @test Lighthouse.vec_to_mat(mat) == mat + @test ismissing(Lighthouse.vec_to_mat(missing)) + @test_throws DimensionMismatch Lighthouse.vec_to_mat(collect(1:6)) # Invalid dimensions +end diff --git a/test/runtests.jl b/test/runtests.jl index f15aef5..65f07a0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -30,3 +30,4 @@ include("metrics.jl") include("learn.jl") include("utilities.jl") include("logger.jl") +include("row.jl") From 21349d609b850dc92e3c473551f2072cdc898631 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 20:41:03 +0000 Subject: [PATCH 06/21] more codecov --- test/learn.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/learn.jl b/test/learn.jl index 174e5a0..7322673 100644 --- a/test/learn.jl +++ b/test/learn.jl @@ -98,6 +98,10 @@ end @test length(logger.logged["wheeeeeee/time_in_seconds_for_all_time"]) == 1 @test length(logger.logged["wheeeeeee/metrics_for_all_time"]) == 1 + # Round-trip `onehot` for codecov + onehot_hard = map(h -> vec(Lighthouse.onehot(model, h)), predicted_hard) + @test map(h -> findfirst(h), onehot_hard) == predicted_hard + # Test startified eval strata = [Set("group $(j % Int(ceil(sqrt(j))))" for j in 1:(i - 1)) for i in 1:size(votes, 1)] From 6194a88d58f8f80efd0bbd6adbacd4f75273181b 
Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 21:25:03 +0000 Subject: [PATCH 07/21] Bump julia version to 1.6, testing to 1.6+1.7 --- .github/workflows/CI.yml | 2 +- Project.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index ea42e1a..6bdd908 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: version: - - '1.5' + - '1.7' - '1.6' os: - ubuntu-latest diff --git a/Project.toml b/Project.toml index d5bfd0b..3dbdc20 100644 --- a/Project.toml +++ b/Project.toml @@ -21,7 +21,7 @@ Legolas = "0.3" Makie = "0.16.5" StatsBase = "0.33" TensorBoardLogger = "0.1" -julia = "1.5" +julia = "1.6" [extras] CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" From 571a7fe9a8874319e4a6ed74bf19439971a85929 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 22:44:55 +0000 Subject: [PATCH 08/21] wip rt tests --- src/learn.jl | 105 +++++++++++++++++++++------------------- src/row.jl | 71 ++++++++++++++++++++------- test/learn.jl | 130 ++++++++++++++++++++++++++++---------------------- test/row.jl | 16 +++++++ 4 files changed, 199 insertions(+), 123 deletions(-) diff --git a/src/learn.jl b/src/learn.jl index d8c1bde..a23fe69 100644 --- a/src/learn.jl +++ b/src/learn.jl @@ -211,8 +211,7 @@ function evaluate!(predicted_hard_labels::AbstractVector, _validate_threshold_class(optimal_threshold_class, classes) log_resource_info!(logger, logger_prefix; suffix=logger_suffix) do - plot_data = evaluation_metrics(predicted_hard_labels, - predicted_soft_labels, + plot_data = evaluation_metrics(predicted_hard_labels, predicted_soft_labels, elected_hard_labels, classes, thresholds; votes=votes, optimal_threshold_class=optimal_threshold_class) @@ -226,8 +225,6 @@ function evaluate!(predicted_hard_labels::AbstractVector, return nothing end - - function _calculate_stratified_ea_kappas(predicted_hard_labels, elected_hard_labels, class_count, strata) groups = reduce(∪, strata) @@ -238,7 +235,8 @@ function _calculate_stratified_ea_kappas(predicted_hard_labels, elected_hard_lab elected = elected_hard_labels[index] k = _calculate_ea_kappas(predicted, elected, class_count) push!(kappas, - group => (per_class=k.per_class_kappas, multiclass=k.multiclass_kappa, n=sum(index))) + group => (per_class=k.per_class_kappas, multiclass=k.multiclass_kappa, + n=sum(index))) end return sort(kappas; by=p -> last(p).multiclass) end @@ -298,7 +296,8 @@ mean that the voter did not rate that sample. 
""" function _calculate_ira_kappas(votes, classes) # no votes given or only one expert: - (isnothing(votes) || size(votes, 2) < 2) && return (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) + (isnothing(votes) || size(votes, 2) < 2) && + return (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) all_hard_label_pairs = Array{Int}(undef, 0, 2) num_voters = size(votes, 2) @@ -308,7 +307,8 @@ function _calculate_ira_kappas(votes, classes) end end hard_label_pairs = filter(row -> all(row .!= 0), collect(eachrow(all_hard_label_pairs))) - length(hard_label_pairs) > 0 || return (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) # No common observations voted on + length(hard_label_pairs) > 0 || + return (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) # No common observations voted on length(hard_label_pairs) < 10 && @warn "...only $(length(hard_label_pairs)) in common, potentially questionable IRA results" @@ -320,7 +320,7 @@ function _calculate_ira_kappas(votes, classes) hard_label_pairs) return first(cohens_kappa(CLASS_VS_ALL_CLASS_COUNT, class_v_other_hard_label_pair)) end - return (; per_class_IRA_kappas = per_class_ira, multiclass_IRA_kappas = multiclass_ira) + return (; per_class_IRA_kappas=per_class_ira, multiclass_IRA_kappas=multiclass_ira) end function _spearman_corr(predicted_soft_labels, elected_soft_labels) @@ -430,7 +430,7 @@ function _get_optimal_threshold_from_ROC(per_class_roc_curves; thresholds, opt_point = nothing threshold_idx = 1 for point in zip(per_class_roc_curves[class_of_interest_index][1], - per_class_roc_curves[class_of_interest_index][2]) + per_class_roc_curves[class_of_interest_index][2]) d = dist((0, 1), point) if d < min min = d @@ -501,9 +501,9 @@ See also [`evaluation_metrics_plot`](@ref). 
function evaluation_metrics_row(predicted_hard_labels::AbstractVector, predicted_soft_labels::AbstractMatrix, elected_hard_labels::AbstractVector, classes, thresholds; - votes::Union{Nothing,AbstractMatrix} = nothing, - strata::Union{Nothing,AbstractVector{Set{T}} where T} = nothing, - optimal_threshold_class::Union{Missing,Integer} = missing) + votes::Union{Nothing,AbstractMatrix}=nothing, + strata::Union{Nothing,AbstractVector{Set{T}} where T}=nothing, + optimal_threshold_class::Union{Missing,Integer}=missing) _validate_threshold_class(optimal_threshold_class, classes) class_count = length(classes) @@ -514,8 +514,8 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, # ROC curves per_class_roc_curves = [(map(t -> t.false_positive_rate, stats), - map(t -> t.true_positive_rate, stats)) - for stats in per_class_stats] + map(t -> t.true_positive_rate, stats)) + for stats in per_class_stats] per_class_roc_aucs = [area_under_curve(x, y) for (x, y) in per_class_roc_curves] # Optionally calculate optimal threshold @@ -532,7 +532,7 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, discrimination_calibration_score = c.mse expert_cal = _calculate_voter_discrimination_calibration(votes; - class_of_interest_index = optimal_threshold_class) + class_of_interest_index=optimal_threshold_class) per_expert_discrimination_calibration_curves = expert_cal.plot_curve_data per_expert_discrimination_calibration_scores = expert_cal.mse else @@ -542,8 +542,8 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, per_expert_discrimination_calibration_scores = missing # ...based on ROC curve otherwise optimal_threshold = _get_optimal_threshold_from_ROC(per_class_roc_curves; - thresholds = thresholds, - class_of_interest_index = optimal_threshold_class) + thresholds=thresholds, + class_of_interest_index=optimal_threshold_class) end # Recalculate `predicted_hard_labels` with this new threshold @@ -562,17 +562,15 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, # PR curves per_class_pr_curves = [(map(t -> t.true_positive_rate, stats), - map(t -> t.precision, stats)) - for stats in per_class_stats] + map(t -> t.precision, stats)) for stats in per_class_stats] # Stratified kappas if isnothing(strata) stratified_kappas = missing else stratified_kappas = _calculate_stratified_ea_kappas(predicted_hard_labels, - elected_hard_labels, - class_count, - strata) + elected_hard_labels, + class_count, strata) end # Reliability calibration curves @@ -580,41 +578,50 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, class_probabilities = view(predicted_soft_labels, :, class_index) return calibration_curve(class_probabilities, elected_hard_labels .== class_index) end - per_class_reliability_calibration_curves = map(x -> (mean.(x.bins), - x.fractions), - per_class_reliability_calibration) + per_class_reliability_calibration_curves = map(x -> (mean.(x.bins), x.fractions), + per_class_reliability_calibration) per_class_reliability_calibration_scores = map(x -> x.mean_squared_error, - per_class_reliability_calibration) - + per_class_reliability_calibration) # Log Spearman correlation, iff this is a binary classification problem if length(classes) == 2 && !isnothing(votes) - spearman_correlation = _calculate_spearman_correlation(predicted_soft_labels, - votes, classes) + spearman_correlation = _calculate_spearman_correlation(predicted_soft_labels, votes, + classes) else spearman_correlation = missing end return EvaluationRow(; 
class_labels, - confusion_matrix = confusion_matrix(class_count, zip(predicted_hard_labels, elected_hard_labels)), - spearman_correlation, per_class_reliability_calibration_curves, per_class_reliability_calibration_scores, - _calculate_ira_kappas(votes, classes)..., - _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, class_count)..., - stratified_kappas, per_class_pr_curves, - per_class_roc_curves, per_class_roc_aucs, - discrimination_calibration_curve, - discrimination_calibration_score, - per_expert_discrimination_calibration_curves, - per_expert_discrimination_calibration_scores, - optimal_threshold, - optimal_threshold_class, - thresholds - ) + confusion_matrix=confusion_matrix(class_count, + zip(predicted_hard_labels, + elected_hard_labels)), + spearman_correlation, per_class_reliability_calibration_curves, + per_class_reliability_calibration_scores, + _calculate_ira_kappas(votes, classes)..., + _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, + class_count)..., stratified_kappas, + per_class_pr_curves, per_class_roc_curves, per_class_roc_aucs, + discrimination_calibration_curve, discrimination_calibration_score, + per_expert_discrimination_calibration_curves, + per_expert_discrimination_calibration_scores, optimal_threshold, + optimal_threshold_class, thresholds) end +""" + evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) + +Return [`evaluation_metrics_row`](@ref) after converting output `EvaluationRow` +into a `Dict`. For argument details, see [`evaluation_metrics_row`](@ref). +""" function evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) - row = evaluation_metrics_row(args...; optimal_threshold_class=something(optimal_threshold_class, missing), kwargs...) - return Dict(string(k) => v for (k,v) in pairs(NamedTuple(row)) if !ismissing(v)) + row = evaluation_metrics_row(args...; + optimal_threshold_class=something(optimal_threshold_class, + missing), kwargs...) 
+ return _evaluation_row_dict(row) +end + +function _evaluation_row_dict(row::EvaluationRow) + return Dict(string(k) => v for (k, v) in pairs(NamedTuple(row)) if !ismissing(v)) end """ @@ -642,7 +649,6 @@ function evaluation_metrics_plot(predicted_hard_labels::AbstractVector, votes::Union{Nothing,AbstractMatrix}=nothing, strata::Union{Nothing,AbstractVector{Set{T}} where T}=nothing, optimal_threshold_class::Union{Nothing,Integer}=nothing) - Base.depwarn(""" ``` evaluation_metrics_plot(predicted_hard_labels::AbstractVector, @@ -661,8 +667,8 @@ function evaluation_metrics_plot(predicted_hard_labels::AbstractVector, ``` """, :evaluation_metrics_plot) plot_dict = evaluation_metrics(predicted_hard_labels, predicted_soft_labels, - elected_hard_labels, classes, thresholds; - votes, strata, optimal_threshold_class) + elected_hard_labels, classes, thresholds; votes, strata, + optimal_threshold_class) return evaluation_metrics_plot(plot_dict), plot_dict end @@ -778,7 +784,8 @@ function learn!(model::AbstractClassifier, logger, get_train_batches, get_test_b predict!(model, predicted, get_test_batches(), logger; logger_prefix="$(test_set_logger_prefix)_prediction") evaluate!(map(label -> onecold(model, label), eachrow(predicted)), predicted, - elected, classes(model), logger; logger_prefix="$(test_set_logger_prefix)_evaluation", + elected, classes(model), logger; + logger_prefix="$(test_set_logger_prefix)_evaluation", logger_suffix="_per_epoch", votes=votes, optimal_threshold_class=optimal_threshold_class) post_epoch_callback(current_epoch) diff --git a/src/row.jl b/src/row.jl index fb247d5..689f2ac 100644 --- a/src/row.jl +++ b/src/row.jl @@ -5,22 +5,57 @@ function vec_to_mat(vec::AbstractVector) end vec_to_mat(::Missing) = missing +""" + evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) + +Convert `Dict` of [`evaluation_metrics`](@ref) results (e.g. from Lighthouse v for (k, v) in pairs(evaluation_row_dict))...) 
+ return EvaluationRow(row) +end + const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", - class_labels::Union{Missing,Vector{String}}, - confusion_matrix::Union{Missing,Matrix{Int64}} = vec_to_mat(confusion_matrix), - discrimination_calibration_curve::Union{Missing,Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}, - discrimination_calibration_score::Union{Missing,Float64}, - multiclass_IRA_kappas::Union{Missing,Float64}, - multiclass_kappa::Union{Missing,Float64}, - optimal_threshold::Union{Missing,Float64}, - optimal_threshold_class::Union{Missing,Int64}, - per_class_kappas::Union{Missing,Vector{Float64}}, - per_class_pr_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}}, - per_class_reliability_calibration_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}}, - per_class_reliability_calibration_scores::Union{Missing,Vector{Float64}}, - per_class_roc_aucs::Union{Missing,Vector{Float64}}, - per_class_roc_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Float64}}}}, - per_expert_discrimination_calibration_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}}, - per_expert_discrimination_calibration_scores::Union{Missing,Vector{Float64}}, - spearman_correlation::Union{Missing,NamedTuple{(:ρ, :n, :ci_lower, :ci_upper),Tuple{Float64,Int64,Float64,Float64}}}, - thresholds::Union{Missing,Vector{Float64}}) + class_labels::Union{Missing,Vector{String}}, + confusion_matrix::Union{Missing,Matrix{Int64}} = vec_to_mat(confusion_matrix), + discrimination_calibration_curve::Union{Missing, + Tuple{Vector{Float64}, + Vector{Union{Missing, + Float64}}}}, + discrimination_calibration_score::Union{Missing,Float64}, + multiclass_IRA_kappas::Union{Missing,Float64}, + multiclass_kappa::Union{Missing,Float64}, + optimal_threshold::Union{Missing,Float64}, + optimal_threshold_class::Union{Missing,Int64}, + per_class_kappas::Union{Missing,Vector{Float64}}, + per_class_pr_curves::Union{Missing, + Vector{Tuple{Vector{Float64}, + Vector{Union{Missing, + Float64}}}}}, + per_class_reliability_calibration_curves::Union{Missing, + Vector{Tuple{Vector{Float64}, + Vector{Union{Missing, + Float64}}}}}, + per_class_reliability_calibration_scores::Union{Missing, + Vector{Float64}}, + per_class_roc_aucs::Union{Missing,Vector{Float64}}, + per_class_roc_curves::Union{Missing, + Vector{Tuple{Vector{Float64}, + Vector{Float64}}}}, + per_expert_discrimination_calibration_curves::Union{Missing, + Vector{Tuple{Vector{Float64}, + Vector{Union{Missing, + Float64}}}}}, + per_expert_discrimination_calibration_scores::Union{Missing, + Vector{Float64}}, + spearman_correlation::Union{Missing, + NamedTuple{(:ρ, :n, + :ci_lower, + :ci_upper), + Tuple{Float64, + Int64, + Float64, + Float64}}}, + thresholds::Union{Missing,Vector{Float64}}) diff --git a/test/learn.jl b/test/learn.jl index 7322673..2d83053 100644 --- a/test/learn.jl +++ b/test/learn.jl @@ -22,6 +22,16 @@ function Lighthouse.loss_and_prediction(c::TestClassifier, dummy_input_batch) return c.dummy_loss, dummy_soft_label_batch end +function test_roundtrip_results(results_dict) + row = evaluation_row(results_dict) + p = mktempdir() * "rt_test.arrow" + #todo write this to tempdir + rt_row = nothing # read this from tempdir + @test rt_row == row + @test _evaluation_row_dict(rt_row) == results_dict + return true +end + @testset "Multi-class learn!(::TestModel, ...)" begin mktempdir() do tmpdir model = TestClassifier(1000000.0, ["class_$i" for i in 1:5]) @@ -44,24 
+54,25 @@ end @info counted n end end - elected = majority.((rng,), eachrow(votes), (1:length(Lighthouse.classes(model)),)) - Lighthouse.learn!(model, logger, () -> train_batches, () -> test_batches, votes, elected; - epoch_limit=limit, post_epoch_callback=callback) + elected = majority.((rng,), eachrow(votes), + (1:length(Lighthouse.classes(model)),)) + Lighthouse.learn!(model, logger, () -> train_batches, () -> test_batches, votes, + elected; epoch_limit=limit, post_epoch_callback=callback) @test counted == sum(1:limit) end @test length(logger.logged["train/loss_per_batch"]) == length(train_batches) * limit for key in ["test_set_prediction/loss_per_batch", - "test_set_prediction/time_in_seconds_per_batch", - "test_set_prediction/gc_time_in_seconds_per_batch", - "test_set_prediction/allocations_per_batch", - "test_set_prediction/memory_in_mb_per_batch"] + "test_set_prediction/time_in_seconds_per_batch", + "test_set_prediction/gc_time_in_seconds_per_batch", + "test_set_prediction/allocations_per_batch", + "test_set_prediction/memory_in_mb_per_batch"] @test length(logger.logged[key]) == length(test_batches) * limit end for key in ["test_set_prediction/mean_loss_per_epoch", - "test_set_evaluation/time_in_seconds_per_epoch", - "test_set_evaluation/gc_time_in_seconds_per_epoch", - "test_set_evaluation/allocations_per_epoch", - "test_set_evaluation/memory_in_mb_per_epoch"] + "test_set_evaluation/time_in_seconds_per_epoch", + "test_set_evaluation/gc_time_in_seconds_per_epoch", + "test_set_evaluation/allocations_per_epoch", + "test_set_evaluation/memory_in_mb_per_epoch"] @test length(logger.logged[key]) == limit end @test length(logger.logged["test_set_evaluation/metrics_per_epoch"]) == limit @@ -105,16 +116,23 @@ end # Test startified eval strata = [Set("group $(j % Int(ceil(sqrt(j))))" for j in 1:(i - 1)) for i in 1:size(votes, 1)] - plot_data = evaluation_metrics(predicted_hard, predicted_soft, - elected_hard, model.classes, 0.0:0.01:1.0; - votes=votes, strata=strata) + plot_data = evaluation_metrics(predicted_hard, predicted_soft, elected_hard, + model.classes, 0.0:0.01:1.0; votes=votes, + strata=strata) @test haskey(plot_data, "stratified_kappas") plot = evaluation_metrics_plot(plot_data) - plot2, plot_data2 = @test_deprecated evaluation_metrics_plot(predicted_hard, predicted_soft, - elected_hard, model.classes, 0.0:0.01:1.0; - votes=votes, strata=strata) + @test test_roundtrip_results(plot_data) + + plot2, plot_data2 = @test_deprecated evaluation_metrics_plot(predicted_hard, + predicted_soft, + elected_hard, + model.classes, + 0.0:0.01:1.0; + votes=votes, + strata=strata) @test isequal(plot_data, plot_data2) # check these are the same + @test test_roundtrip_results(plot_data2) # Test plotting plot_data = last(logger.logged["test_set_evaluation/metrics_per_epoch"]) @@ -128,15 +146,17 @@ end @testplot roc # Kappa no IRA - kappas_no_ira = plot_kappas(vcat(plot_data["multiclass_kappa"], plot_data["per_class_kappas"]), - vcat("Multiclass", plot_data["class_labels"])) + kappas_no_ira = plot_kappas(vcat(plot_data["multiclass_kappa"], + plot_data["per_class_kappas"]), + vcat("Multiclass", plot_data["class_labels"])) @testplot kappas_no_ira # Kappa with IRA - kappas_ira = plot_kappas(vcat(plot_data["multiclass_kappa"], plot_data["per_class_kappas"]), - vcat("Multiclass", plot_data["class_labels"]), - vcat(plot_data["multiclass_IRA_kappas"], - plot_data["per_class_IRA_kappas"])) + kappas_ira = plot_kappas(vcat(plot_data["multiclass_kappa"], + plot_data["per_class_kappas"]), + vcat("Multiclass", 
plot_data["class_labels"]), + vcat(plot_data["multiclass_IRA_kappas"], + plot_data["per_class_IRA_kappas"])) @testplot kappas_ira reliability_calibration = plot_reliability_calibration_curves(plot_data["per_class_reliability_calibration_curves"], @@ -181,9 +201,10 @@ end @info counted n end end - elected = majority.((rng,), eachrow(votes), (1:length(Lighthouse.classes(model)),)) - Lighthouse.learn!(model, logger, () -> train_batches, () -> test_batches, votes, elected; - epoch_limit=limit, post_epoch_callback=callback) + elected = majority.((rng,), eachrow(votes), + (1:length(Lighthouse.classes(model)),)) + Lighthouse.learn!(model, logger, () -> train_batches, () -> test_batches, votes, + elected; epoch_limit=limit, post_epoch_callback=callback) @test counted == sum(1:limit) end # Binary classification logs some additional metrics @@ -191,6 +212,7 @@ end limit plot_data = last(logger.logged["test_set_evaluation/metrics_per_epoch"]) @test haskey(plot_data, "spearman_correlation") + @test test_roundtrip_results(plot_data) # No `optimal_threshold_class` during learning... @test !haskey(plot_data, "optimal_threshold") @@ -198,13 +220,14 @@ end # And now, `optimal_threshold_class` during learning elected = majority.((rng,), eachrow(votes), (1:length(Lighthouse.classes(model)),)) - Lighthouse.learn!(model, logger, () -> train_batches, () -> test_batches, votes, elected; - epoch_limit=limit, optimal_threshold_class=2, + Lighthouse.learn!(model, logger, () -> train_batches, () -> test_batches, votes, + elected; epoch_limit=limit, optimal_threshold_class=2, test_set_logger_prefix="validation_set") plot_data = last(logger.logged["validation_set_evaluation/metrics_per_epoch"]) @test haskey(plot_data, "optimal_threshold") @test haskey(plot_data, "optimal_threshold_class") @test plot_data["optimal_threshold_class"] == 2 + @test test_roundtrip_results(plot_data) # `optimal_threshold_class` param invalid @test_throws ArgumentError Lighthouse.learn!(model, logger, () -> train_batches, @@ -230,12 +253,14 @@ end plot_data = last(logger.logged["wheeeeeee/metrics_for_all_time"]) @test !haskey(plot_data, "per_class_IRA_kappas") @test !haskey(plot_data, "multiclass_IRA_kappas") + @test test_roundtrip_results(plot_data) evaluate!(predicted_hard, predicted_soft, elected_hard, model.classes, logger; logger_prefix="wheeeeeee", logger_suffix="_for_all_time", votes=votes) plot_data = last(logger.logged["wheeeeeee/metrics_for_all_time"]) @test haskey(plot_data, "per_class_IRA_kappas") @test haskey(plot_data, "multiclass_IRA_kappas") + @test test_roundtrip_results(plot_data) # Test `evaluate` for different optimal_threshold classes evaluate!(predicted_hard, predicted_soft, elected_hard, model.classes, logger; @@ -246,6 +271,7 @@ end logger_prefix="wheeeeeee", logger_suffix="_for_all_time", votes=votes, optimal_threshold_class=2) plot_data_2 = last(logger.logged["wheeeeeee/metrics_for_all_time"]) + @test test_roundtrip_results(plot_data_2) # The thresholds should not be identical (since they are *inclusive* when applied: # values greater than _or equal to_ the threshold are given the class value) @@ -273,9 +299,9 @@ end @testset "Invalid `_calculate_ira_kappas`" begin classes = ["roy", "gee", "biv"] @test isequal(Lighthouse._calculate_ira_kappas([1; 1; 1; 1], classes), - (; per_class_IRA_kappas = missing, multiclass_IRA_kappas = missing)) # Only one voter... + (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing)) # Only one voter... 
@test isequal(Lighthouse._calculate_ira_kappas([1 0; 1 0; 0 1], classes), - (; per_class_IRA_kappas = missing, multiclass_IRA_kappas = missing)) # No observations in common... + (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing)) # No observations in common... end @testset "Calculate `_spearman_corr`" begin @@ -309,11 +335,9 @@ end # Test NaN spearman due to unranked input votes = [1; 2; 2] - predicted_soft = [ - 0.3 0.7 - 0.3 0.7 - 0.3 0.7 - ] + predicted_soft = [0.3 0.7 + 0.3 0.7 + 0.3 0.7] sp = Lighthouse._calculate_spearman_correlation(predicted_soft, votes, ["oh" "em"]) @test isnan(sp.ρ) @@ -335,11 +359,9 @@ end @test length(single_voter_calibration.mse) == 1 # Test multi-voter voter discrimination calibration - votes = [ - 0 1 1 1 - 1 2 0 0 - 2 1 2 2 - ] # Note: voters 3 and 4 have voted identically + votes = [0 1 1 1 + 1 2 0 0 + 2 1 2 2] # Note: voters 3 and 4 have voted identically voter_calibration = Lighthouse._calculate_voter_discrimination_calibration(votes; class_of_interest_index=1) @test length(voter_calibration.mse) == size(votes, 2) @@ -366,13 +388,11 @@ end end @testset "2-class per_class_confusion_statistics" begin - predicted_soft_labels = [ - 0.51 0.49 - 0.49 0.51 - 0.1 0.9 - 0.9 0.1 - 0.0 1.0 - ] + predicted_soft_labels = [0.51 0.49 + 0.49 0.51 + 0.1 0.9 + 0.9 0.1 + 0.0 1.0] elected_hard_labels = [1, 2, 2, 2, 1] thresholds = [0.25, 0.5, 0.75] class_1, class_2 = Lighthouse.per_class_confusion_statistics(predicted_soft_labels, @@ -447,15 +467,13 @@ end end @testset "3-class per_class_confusion_statistics" begin - predicted_soft_labels = [ - 1/3 1/3 1/3 - 0.1 0.7 0.2 - 0.25 0.25 0.5 - 0.4 0.5 0.1 - 0.0 0.0 1.0 - 0.2 0.5 0.3 - 0.5 0.4 0.1 - ] + predicted_soft_labels = [1/3 1/3 1/3 + 0.1 0.7 0.2 + 0.25 0.25 0.5 + 0.4 0.5 0.1 + 0.0 0.0 1.0 + 0.2 0.5 0.3 + 0.5 0.4 0.1] elected_hard_labels = [1, 2, 2, 1, 3, 3, 1] # TODO would be more robust to have multiple thresholds, but our naive tests # here will have to be refactored to avoid becoming a nightmare if we do that diff --git a/test/row.jl b/test/row.jl index 69169d9..5dee858 100644 --- a/test/row.jl +++ b/test/row.jl @@ -5,3 +5,19 @@ @test ismissing(Lighthouse.vec_to_mat(missing)) @test_throws DimensionMismatch Lighthouse.vec_to_mat(collect(1:6)) # Invalid dimensions end + +@testset `round trip EvaluationRow tests` begin + # Basic roundtrip + dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3) + row = Lighthouse.evaluation_row(dict) + @test isa(row, Lighthouse.EvaluationRow) + @test isequal(Lighthouse._evaluation_row_dict(row), dict) + + # Should ignore any additional fields that we don't convert + extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, "rabbit" => 2432) + row = Lighthouse.evaluation_row(dict) + @test isa(row, Lighthouse.EvaluationRow) + @test isequal(Lighthouse._evaluation_row_dict(row), dict) + + +end \ No newline at end of file From ac1f66656e3c0b1c608eb7fc653c9d643da219ff Mon Sep 17 00:00:00 2001 From: hannahilea Date: Wed, 16 Mar 2022 22:53:18 +0000 Subject: [PATCH 09/21] wip --- src/learn.jl | 4 ---- src/row.jl | 46 ++++++++++++++++++++++++++++++++-------------- test/learn.jl | 12 +----------- test/row.jl | 19 +++++++++---------- test/runtests.jl | 16 ++++++++++++++++ 5 files changed, 58 insertions(+), 39 deletions(-) diff --git a/src/learn.jl b/src/learn.jl index a23fe69..1262374 100644 --- a/src/learn.jl +++ b/src/learn.jl @@ -620,10 +620,6 @@ function evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) 
return _evaluation_row_dict(row) end -function _evaluation_row_dict(row::EvaluationRow) - return Dict(string(k) => v for (k, v) in pairs(NamedTuple(row)) if !ismissing(v)) -end - """ evaluation_metrics_plot(predicted_hard_labels::AbstractVector, predicted_soft_labels::AbstractMatrix, diff --git a/src/row.jl b/src/row.jl index 689f2ac..dece1c3 100644 --- a/src/row.jl +++ b/src/row.jl @@ -1,22 +1,16 @@ -vec_to_mat(mat::AbstractMatrix) = mat +function vec_to_mat(mat::AbstractMatrix) + @info "here" + return mat +end function vec_to_mat(vec::AbstractVector) + @info "ok..." n = isqrt(length(vec)) return reshape(vec, n, n) end -vec_to_mat(::Missing) = missing - -""" - evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) - -Convert `Dict` of [`evaluation_metrics`](@ref) results (e.g. from Lighthouse v for (k, v) in pairs(evaluation_row_dict))...) - return EvaluationRow(row) -end +vec_to_mat(x::Missing) = (@info "why"; @info typeof(x); return missing) +# Redefinition is workaround for https://github.com/beacon-biosignals/Legolas.jl/issues/9 +const EVALUATION_ROW_SCHEMA = Legolas.Schema("lighthouse.evaluation@1") const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", class_labels::Union{Missing,Vector{String}}, confusion_matrix::Union{Missing,Matrix{Int64}} = vec_to_mat(confusion_matrix), @@ -59,3 +53,27 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", Float64, Float64}}}, thresholds::Union{Missing,Vector{Float64}}) + +""" + EvaluationRow(evaluation_row_dict::Dict{String, Any}) -> EvaluationRow + +Convert `Dict` of [`evaluation_metrics`](@ref) results (e.g. from Lighthouse v for (k, v) in pairs(evaluation_row_dict))...) + return EvaluationRow(row) +end + + +""" + _evaluation_row_dict(row::EvaluationRow) -> Dict{String,Any} + +Convert [`EvaluationRow`](@ref) into `::Dict{String, Any}` results, as are +output by `[`evaluation_metrics`](@ref)` (and predated use of `EvaluationRow` in +Lighthouse v for (k, v) in pairs(NamedTuple(row)) if !ismissing(v)) +end \ No newline at end of file diff --git a/test/learn.jl b/test/learn.jl index 2d83053..7361ec5 100644 --- a/test/learn.jl +++ b/test/learn.jl @@ -22,16 +22,6 @@ function Lighthouse.loss_and_prediction(c::TestClassifier, dummy_input_batch) return c.dummy_loss, dummy_soft_label_batch end -function test_roundtrip_results(results_dict) - row = evaluation_row(results_dict) - p = mktempdir() * "rt_test.arrow" - #todo write this to tempdir - rt_row = nothing # read this from tempdir - @test rt_row == row - @test _evaluation_row_dict(rt_row) == results_dict - return true -end - @testset "Multi-class learn!(::TestModel, ...)" begin mktempdir() do tmpdir model = TestClassifier(1000000.0, ["class_$i" for i in 1:5]) @@ -122,7 +112,7 @@ end @test haskey(plot_data, "stratified_kappas") plot = evaluation_metrics_plot(plot_data) - @test test_roundtrip_results(plot_data) + @test test_roundtrip_evaluation(plot_data) plot2, plot_data2 = @test_deprecated evaluation_metrics_plot(predicted_hard, predicted_soft, diff --git a/test/row.jl b/test/row.jl index 5dee858..1a4f9a3 100644 --- a/test/row.jl +++ b/test/row.jl @@ -1,4 +1,4 @@ -@testset `vec_to_mat` begin +@testset "`vec_to_mat`" begin mat = [3 5 6; 6 7 8; 9 10 11] @test Lighthouse.vec_to_mat(vec(mat)) == mat @test Lighthouse.vec_to_mat(mat) == mat @@ -6,18 +6,17 @@ @test_throws DimensionMismatch Lighthouse.vec_to_mat(collect(1:6)) # Invalid dimensions end -@testset `round trip EvaluationRow tests` begin +@testset "`EvaluationRow`" begin # Basic roundtrip dict = 
Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3) - row = Lighthouse.evaluation_row(dict) - @test isa(row, Lighthouse.EvaluationRow) - @test isequal(Lighthouse._evaluation_row_dict(row), dict) + @test test_roundtrip_evaluation(dict) # Should ignore any additional fields that we don't convert - extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, "rabbit" => 2432) - row = Lighthouse.evaluation_row(dict) - @test isa(row, Lighthouse.EvaluationRow) - @test isequal(Lighthouse._evaluation_row_dict(row), dict) - + extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, + "rabbit" => 2432) + @test test_roundtrip_evaluation(extra_dict) + mat_dict = Dict("confusion_matrix" => [3 5 6; 6 7 8; 9 10 11]) + mat_row = Lighthouse.EvaluationRow(mat_dict) + rt_row = roundtrip_row(mat_row) end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 65f07a0..cf38a7e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,6 +6,7 @@ using Lighthouse: plot_reliability_calibration_curves, plot_pr_curves, evaluation_metrics_plot, evaluation_metrics using Base.Threads using CairoMakie +using Legolas, Tables # Needs to be set for figures # returning true for showable("image/png", obj) @@ -25,6 +26,21 @@ macro testplot(fig_name) end end +function test_roundtrip_evaluation(row_dict::Dict{String,Any}) + row = Lighthouse.EvaluationRow(row_dict) + rt_row = roundtrip_row(row) + @test isequal(rt_row, row) + @test Lighthouse._evaluation_row_dict(rt_row) == row_dict + return true +end + +function roundtrip_row(row::Lighthouse.EvaluationRow) + p = mktempdir() * "rt_test.arrow" + tbl = [row] + Legolas.write(p, tbl, Lighthouse.EVALUATION_ROW_SCHEMA) + return Lighthouse.EvaluationRow(only(Tables.rows(Legolas.read(p)))) +end + include("plotting.jl") include("metrics.jl") include("learn.jl") From 424a548e717df12a316495d1edcf59b56c04bbcd Mon Sep 17 00:00:00 2001 From: hannahilea Date: Thu, 17 Mar 2022 22:11:59 +0000 Subject: [PATCH 10/21] Support matrix serialization/deserialization --- Project.toml | 2 ++ src/Lighthouse.jl | 1 + src/row.jl | 21 ++++++++++----------- test/row.jl | 6 +++--- test/runtests.jl | 2 +- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/Project.toml b/Project.toml index 3dbdc20..c4fcdcb 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["Beacon Biosignals, Inc."] version = "0.14.0" [deps] +Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Legolas = "741b9549-f6ed-4911-9fbf-4a1c0c97f0cd" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -16,6 +17,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TensorBoardLogger = "899adc3e-224a-11e9-021f-63837185c80f" [compat] +Arrow = "2.2" CairoMakie = "0.7" Legolas = "0.3" Makie = "0.16.5" diff --git a/src/Lighthouse.jl b/src/Lighthouse.jl index 38f9095..d96e333 100644 --- a/src/Lighthouse.jl +++ b/src/Lighthouse.jl @@ -7,6 +7,7 @@ using TensorBoardLogger using Makie using Printf using Legolas +using Arrow include("plotting.jl") diff --git a/src/row.jl b/src/row.jl index dece1c3..e645f49 100644 --- a/src/row.jl +++ b/src/row.jl @@ -1,19 +1,19 @@ -function vec_to_mat(mat::AbstractMatrix) - @info "here" - return mat -end +# Arrow can't handle matrices---so when we write/read matrices, we have to pack and unpack them o_O +# https://github.com/apache/arrow-julia/issues/125 +vec_to_mat(mat::AbstractMatrix) = mat + function vec_to_mat(vec::AbstractVector) - @info "ok..." 
n = isqrt(length(vec)) return reshape(vec, n, n) end -vec_to_mat(x::Missing) = (@info "why"; @info typeof(x); return missing) + +vec_to_mat(x::Missing) = return missing # Redefinition is workaround for https://github.com/beacon-biosignals/Legolas.jl/issues/9 const EVALUATION_ROW_SCHEMA = Legolas.Schema("lighthouse.evaluation@1") const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", class_labels::Union{Missing,Vector{String}}, - confusion_matrix::Union{Missing,Matrix{Int64}} = vec_to_mat(confusion_matrix), + confusion_matrix::Union{Missing,Array{Int64}} = vec_to_mat(confusion_matrix), #TODO: file issue to make Matrix{Int64} in future discrimination_calibration_curve::Union{Missing, Tuple{Vector{Float64}, Vector{Union{Missing, @@ -60,13 +60,12 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", Convert `Dict` of [`evaluation_metrics`](@ref) results (e.g. from Lighthouse v for (k, v) in pairs(evaluation_row_dict))...) +function Legolas.Row{S}(evaluation_row_dict::Dict) where {S<:Legolas.Schema{Symbol("lighthouse.evaluation"), + 1}} + row = (; (Symbol(k) => v for (k, v) in pairs(evaluation_row_dict))...) return EvaluationRow(row) end - """ _evaluation_row_dict(row::EvaluationRow) -> Dict{String,Any} diff --git a/test/row.jl b/test/row.jl index 1a4f9a3..5ec4786 100644 --- a/test/row.jl +++ b/test/row.jl @@ -11,12 +11,12 @@ end dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3) @test test_roundtrip_evaluation(dict) - # Should ignore any additional fields that we don't convert + # Don't lose extra columns (basic Legolas functionality) extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, "rabbit" => 2432) @test test_roundtrip_evaluation(extra_dict) + # Handle fun cases mat_dict = Dict("confusion_matrix" => [3 5 6; 6 7 8; 9 10 11]) - mat_row = Lighthouse.EvaluationRow(mat_dict) - rt_row = roundtrip_row(mat_row) + @test test_roundtrip_evaluation(mat_dict) end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index cf38a7e..c651f6a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,7 +26,7 @@ macro testplot(fig_name) end end -function test_roundtrip_evaluation(row_dict::Dict{String,Any}) +function test_roundtrip_evaluation(row_dict::Dict{String,S}) where S row = Lighthouse.EvaluationRow(row_dict) rt_row = roundtrip_row(row) @test isequal(rt_row, row) From 38bac6b1b11bb84c564ab3d0dc82dc78db61ca03 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Thu, 17 Mar 2022 22:11:59 +0000 Subject: [PATCH 11/21] Support matrix serialization/deserialization --- Project.toml | 2 ++ src/Lighthouse.jl | 3 ++- src/row.jl | 21 ++++++++++----------- test/row.jl | 6 +++--- test/runtests.jl | 2 +- 5 files changed, 18 insertions(+), 16 deletions(-) diff --git a/Project.toml b/Project.toml index 3dbdc20..c4fcdcb 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["Beacon Biosignals, Inc."] version = "0.14.0" [deps] +Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Legolas = "741b9549-f6ed-4911-9fbf-4a1c0c97f0cd" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -16,6 +17,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TensorBoardLogger = "899adc3e-224a-11e9-021f-63837185c80f" [compat] +Arrow = "2.2" CairoMakie = "0.7" Legolas = "0.3" Makie = "0.16.5" diff --git a/src/Lighthouse.jl b/src/Lighthouse.jl index 38f9095..4ce5c3e 100644 --- a/src/Lighthouse.jl +++ b/src/Lighthouse.jl @@ -7,6 +7,7 @@ using TensorBoardLogger using Makie using Printf using 
Legolas +using Arrow include("plotting.jl") @@ -20,9 +21,9 @@ include("classifier.jl") export AbstractClassifier include("row.jl") +# TODO: export EvaluationRow ? include("learn.jl") export LearnLogger, learn!, upon, evaluate!, predict! - end # module diff --git a/src/row.jl b/src/row.jl index dece1c3..e645f49 100644 --- a/src/row.jl +++ b/src/row.jl @@ -1,19 +1,19 @@ -function vec_to_mat(mat::AbstractMatrix) - @info "here" - return mat -end +# Arrow can't handle matrices---so when we write/read matrices, we have to pack and unpack them o_O +# https://github.com/apache/arrow-julia/issues/125 +vec_to_mat(mat::AbstractMatrix) = mat + function vec_to_mat(vec::AbstractVector) - @info "ok..." n = isqrt(length(vec)) return reshape(vec, n, n) end -vec_to_mat(x::Missing) = (@info "why"; @info typeof(x); return missing) + +vec_to_mat(x::Missing) = return missing # Redefinition is workaround for https://github.com/beacon-biosignals/Legolas.jl/issues/9 const EVALUATION_ROW_SCHEMA = Legolas.Schema("lighthouse.evaluation@1") const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", class_labels::Union{Missing,Vector{String}}, - confusion_matrix::Union{Missing,Matrix{Int64}} = vec_to_mat(confusion_matrix), + confusion_matrix::Union{Missing,Array{Int64}} = vec_to_mat(confusion_matrix), #TODO: file issue to make Matrix{Int64} in future discrimination_calibration_curve::Union{Missing, Tuple{Vector{Float64}, Vector{Union{Missing, @@ -60,13 +60,12 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", Convert `Dict` of [`evaluation_metrics`](@ref) results (e.g. from Lighthouse v for (k, v) in pairs(evaluation_row_dict))...) +function Legolas.Row{S}(evaluation_row_dict::Dict) where {S<:Legolas.Schema{Symbol("lighthouse.evaluation"), + 1}} + row = (; (Symbol(k) => v for (k, v) in pairs(evaluation_row_dict))...) 
return EvaluationRow(row) end - """ _evaluation_row_dict(row::EvaluationRow) -> Dict{String,Any} diff --git a/test/row.jl b/test/row.jl index 1a4f9a3..5ec4786 100644 --- a/test/row.jl +++ b/test/row.jl @@ -11,12 +11,12 @@ end dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3) @test test_roundtrip_evaluation(dict) - # Should ignore any additional fields that we don't convert + # Don't lose extra columns (basic Legolas functionality) extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, "rabbit" => 2432) @test test_roundtrip_evaluation(extra_dict) + # Handle fun cases mat_dict = Dict("confusion_matrix" => [3 5 6; 6 7 8; 9 10 11]) - mat_row = Lighthouse.EvaluationRow(mat_dict) - rt_row = roundtrip_row(mat_row) + @test test_roundtrip_evaluation(mat_dict) end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index cf38a7e..c651f6a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,7 +26,7 @@ macro testplot(fig_name) end end -function test_roundtrip_evaluation(row_dict::Dict{String,Any}) +function test_roundtrip_evaluation(row_dict::Dict{String,S}) where S row = Lighthouse.EvaluationRow(row_dict) rt_row = roundtrip_row(row) @test isequal(rt_row, row) From 353b22e00326401d112be3db56a195f4c5f3c220 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Thu, 17 Mar 2022 23:49:24 +0000 Subject: [PATCH 12/21] Replace missing with NaN --- src/learn.jl | 4 ++-- src/metrics.jl | 28 +++++++++++++--------------- src/row.jl | 19 +++++++++++-------- test/learn.jl | 17 +++++++++-------- test/metrics.jl | 20 +++++++++++++++++--- test/runtests.jl | 24 +++++++++++++++++++++--- 6 files changed, 73 insertions(+), 39 deletions(-) diff --git a/src/learn.jl b/src/learn.jl index 1262374..6b539ef 100644 --- a/src/learn.jl +++ b/src/learn.jl @@ -238,7 +238,8 @@ function _calculate_stratified_ea_kappas(predicted_hard_labels, elected_hard_lab group => (per_class=k.per_class_kappas, multiclass=k.multiclass_kappa, n=sum(index))) end - return sort(kappas; by=p -> last(p).multiclass) + kappas = sort(kappas; by=p -> last(p).multiclass) + return [k = v for (k, v) in kappas] end """ @@ -369,7 +370,6 @@ Where... function _calculate_spearman_correlation(predicted_soft_labels, votes, classes) length(classes) > 2 && throw(ArgumentError("Only valid for 2-class problems")) if !all(x -> x ≈ 1, sum(predicted_soft_labels; dims=2)) - @info predicted_soft_labels throw(ArgumentError("Input probabiliities fail softmax assumption")) end diff --git a/src/metrics.jl b/src/metrics.jl index d0bc98c..71f0ab8 100644 --- a/src/metrics.jl +++ b/src/metrics.jl @@ -24,14 +24,14 @@ end accuracy(confusion::AbstractMatrix) Returns the percentage of matching classifications out of total classifications, -or `missing` if `all(iszero, confusion)`. +or `NaN` if `all(iszero, confusion)`. Note that `accuracy(confusion)` is equivalent to overall percent agreement between `confusion`'s row classifier and column classifier. """ function accuracy(confusion::AbstractMatrix) total = sum(confusion) - total == 0 && return missing + total == 0 && return NaN return tr(confusion) / total end @@ -78,15 +78,12 @@ function binary_statistics(confusion::AbstractMatrix, class_index::Integer) false_negative_rate = (false_negatives == 0 && actual_positives == 0) ? (zero(false_negatives) / one(actual_positives)) : (false_negatives / actual_positives) - precision = (true_positives == 0 && predicted_positives == 0) ? missing : + precision = (true_positives == 0 && predicted_positives == 0) ? 
NaN : (true_positives / predicted_positives) - return (predicted_positives=predicted_positives, - predicted_negatives=predicted_negatives, actual_positives=actual_positives, - actual_negatives=actual_negatives, true_positives=true_positives, - true_negatives=true_negatives, false_positives=false_positives, - false_negatives=false_negatives, true_positive_rate=true_positive_rate, - true_negative_rate=true_negative_rate, false_positive_rate=false_positive_rate, - false_negative_rate=false_negative_rate, precision=precision) + return (; predicted_positives, predicted_negatives, actual_positives, actual_negatives, + true_positives, true_negatives, false_positives, false_negatives, + true_positive_rate, true_negative_rate, false_positive_rate, + false_negative_rate, precision) end function binary_statistics(confusion::AbstractMatrix) @@ -105,7 +102,8 @@ Return `(κ, p₀)` where `κ` is Cohen's kappa and `p₀` percent agreement giv their equivalents in [`confusion_matrix`](@ref)). """ function cohens_kappa(class_count, hard_label_pairs) - all(issubset(pair, 1:class_count) for pair in hard_label_pairs) || throw(ArgumentError("Unexpected class in `hard_label_pairs`.")) + all(issubset(pair, 1:class_count) for pair in hard_label_pairs) || + throw(ArgumentError("Unexpected class in `hard_label_pairs`.")) p₀ = accuracy(confusion_matrix(class_count, hard_label_pairs)) pₑ = _probability_of_chance_agreement(class_count, hard_label_pairs) return _cohens_kappa(p₀, pₑ), p₀ @@ -137,7 +135,7 @@ where: - `bins` a vector with `bin_count` `Pairs` specifying the calibration curve's probability bins - `fractions`: a vector where `fractions[i]` is the number of values in `probabilities` - that falls within `bin[i]` over the total number of values within `bin[i]`, or `missing` + that falls within `bin[i]` over the total number of values within `bin[i]`, or `NaN` if the total number of values in `bin[i]` is zero. - `totals`: a vector where `totals[i]` the total number of values within `bin[i]`. - `mean_squared_error`: The mean squared error of `fractions` vs. an ideal calibration curve. @@ -150,12 +148,12 @@ function calibration_curve(probabilities, bitmask; bin_count=10) bins = probability_bins(bin_count) per_bin = [fraction_within(probabilities, bitmask, bin...) for bin in bins] fractions, totals = first.(per_bin), last.(per_bin) - nonempty_indices = findall(!ismissing, fractions) + nonempty_indices = findall(!isnan, fractions) if !isempty(nonempty_indices) ideal = range(mean(first(bins)), mean(last(bins)); length=length(bins)) mean_squared_error = mse(fractions[nonempty_indices], ideal[nonempty_indices]) else - mean_squared_error = missing + mean_squared_error = NaN end return (bins=bins, fractions=fractions, totals=totals, mean_squared_error=mean_squared_error) @@ -179,6 +177,6 @@ function fraction_within(values, bitmask, start, stop) total += 1 end end - fraction = iszero(total) ? missing : (count / total) + fraction = iszero(total) ? 
NaN : (count / total) return (fraction=fraction, total=total) end diff --git a/src/row.jl b/src/row.jl index e645f49..377bcab 100644 --- a/src/row.jl +++ b/src/row.jl @@ -16,22 +16,26 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", confusion_matrix::Union{Missing,Array{Int64}} = vec_to_mat(confusion_matrix), #TODO: file issue to make Matrix{Int64} in future discrimination_calibration_curve::Union{Missing, Tuple{Vector{Float64}, - Vector{Union{Missing, - Float64}}}}, + Vector{Float64}}}, discrimination_calibration_score::Union{Missing,Float64}, multiclass_IRA_kappas::Union{Missing,Float64}, multiclass_kappa::Union{Missing,Float64}, optimal_threshold::Union{Missing,Float64}, optimal_threshold_class::Union{Missing,Int64}, per_class_kappas::Union{Missing,Vector{Float64}}, + stratified_kappas::Union{Missing, + Vector{NamedTuple{(:per_class, + :multiclass, + :n), + Tuple{Vector{Float64}, + Float64, + Int64}}}}, per_class_pr_curves::Union{Missing, Vector{Tuple{Vector{Float64}, - Vector{Union{Missing, - Float64}}}}}, + Vector{Float64}}}}, per_class_reliability_calibration_curves::Union{Missing, Vector{Tuple{Vector{Float64}, - Vector{Union{Missing, - Float64}}}}}, + Vector{Float64}}}}, per_class_reliability_calibration_scores::Union{Missing, Vector{Float64}}, per_class_roc_aucs::Union{Missing,Vector{Float64}}, @@ -40,8 +44,7 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", Vector{Float64}}}}, per_expert_discrimination_calibration_curves::Union{Missing, Vector{Tuple{Vector{Float64}, - Vector{Union{Missing, - Float64}}}}}, + Vector{Float64}}}}, per_expert_discrimination_calibration_scores::Union{Missing, Vector{Float64}}, spearman_correlation::Union{Missing, diff --git a/test/learn.jl b/test/learn.jl index 7361ec5..c887b6f 100644 --- a/test/learn.jl +++ b/test/learn.jl @@ -107,8 +107,7 @@ end strata = [Set("group $(j % Int(ceil(sqrt(j))))" for j in 1:(i - 1)) for i in 1:size(votes, 1)] plot_data = evaluation_metrics(predicted_hard, predicted_soft, elected_hard, - model.classes, 0.0:0.01:1.0; votes=votes, - strata=strata) + model.classes, 0.0:0.01:1.0; votes, strata) @test haskey(plot_data, "stratified_kappas") plot = evaluation_metrics_plot(plot_data) @@ -122,12 +121,14 @@ end votes=votes, strata=strata) @test isequal(plot_data, plot_data2) # check these are the same - @test test_roundtrip_results(plot_data2) + @test test_roundtrip_evaluation(plot_data2) # Test plotting plot_data = last(logger.logged["test_set_evaluation/metrics_per_epoch"]) @test isa(plot_data["thresholds"], AbstractVector) + @test isa(last(plot_data["per_class_pr_curves"]), + Tuple{Vector{Float64},Vector{Float64}}) pr = plot_pr_curves(plot_data["per_class_pr_curves"], plot_data["class_labels"]) @testplot pr @@ -202,7 +203,7 @@ end limit plot_data = last(logger.logged["test_set_evaluation/metrics_per_epoch"]) @test haskey(plot_data, "spearman_correlation") - @test test_roundtrip_results(plot_data) + @test test_roundtrip_evaluation(plot_data) # No `optimal_threshold_class` during learning... 
@test !haskey(plot_data, "optimal_threshold") @@ -217,7 +218,7 @@ end @test haskey(plot_data, "optimal_threshold") @test haskey(plot_data, "optimal_threshold_class") @test plot_data["optimal_threshold_class"] == 2 - @test test_roundtrip_results(plot_data) + @test test_roundtrip_evaluation(plot_data) # `optimal_threshold_class` param invalid @test_throws ArgumentError Lighthouse.learn!(model, logger, () -> train_batches, @@ -243,14 +244,14 @@ end plot_data = last(logger.logged["wheeeeeee/metrics_for_all_time"]) @test !haskey(plot_data, "per_class_IRA_kappas") @test !haskey(plot_data, "multiclass_IRA_kappas") - @test test_roundtrip_results(plot_data) + @test test_roundtrip_evaluation(plot_data) evaluate!(predicted_hard, predicted_soft, elected_hard, model.classes, logger; logger_prefix="wheeeeeee", logger_suffix="_for_all_time", votes=votes) plot_data = last(logger.logged["wheeeeeee/metrics_for_all_time"]) @test haskey(plot_data, "per_class_IRA_kappas") @test haskey(plot_data, "multiclass_IRA_kappas") - @test test_roundtrip_results(plot_data) + @test test_roundtrip_evaluation(plot_data) # Test `evaluate` for different optimal_threshold classes evaluate!(predicted_hard, predicted_soft, elected_hard, model.classes, logger; @@ -261,7 +262,7 @@ end logger_prefix="wheeeeeee", logger_suffix="_for_all_time", votes=votes, optimal_threshold_class=2) plot_data_2 = last(logger.logged["wheeeeeee/metrics_for_all_time"]) - @test test_roundtrip_results(plot_data_2) + @test test_roundtrip_evaluation(plot_data_2) # The thresholds should not be identical (since they are *inclusive* when applied: # values greater than _or equal to_ the threshold are given the class value) diff --git a/test/metrics.jl b/test/metrics.jl index 2254afe..eca833d 100644 --- a/test/metrics.jl +++ b/test/metrics.jl @@ -65,8 +65,8 @@ @test isapprox(stats.precision, 0.5; atol=0.02) @test confusion_matrix(10, ()) == zeros(10, 10) - @test all(ismissing, cohens_kappa(10, ())) - @test ismissing(accuracy(zeros(10, 10))) + @test all(isnan, cohens_kappa(10, ())) + @test isnan(accuracy(zeros(10, 10))) stats = binary_statistics(zeros(10, 10), 1) @test stats.predicted_positives == 0 @test stats.predicted_negatives == 0 @@ -80,7 +80,7 @@ @test stats.true_negative_rate == 1 @test stats.false_positive_rate == 0 @test stats.false_negative_rate == 0 - @test ismissing(stats.precision) + @test isnan(stats.precision) for p in 0:0.1:1 @test Lighthouse._cohens_kappa(p, p) == 0 @@ -103,6 +103,7 @@ end @test bin_count == length(bins) @test first(first(bins)) == 0.0 && last(last(bins)) == 1.0 @test all(!ismissing, fractions) + @test all(!isnan, fractions) @test all(!iszero, totals) @test all(isapprox.(fractions, 0.5; atol=0.02)) @test all(isapprox.(totals, length(probs) / bin_count; atol=1000)) @@ -120,6 +121,7 @@ end @test bin_count == length(bins) @test first(first(bins)) == 0.0 && last(last(bins)) == 1.0 @test all(!ismissing, fractions) + @test all(!isnan, fractions) @test all(!iszero, totals) @test all(isapprox.(fractions, ideal; atol=0.01)) @test all(totals .== 1_000_000 / bin_count) @@ -132,9 +134,21 @@ end @test bin_count == length(bins) @test first(first(bins)) == 0.0 && last(last(bins)) == 1.0 @test all(!ismissing, fractions) + @test all(!isnan, fractions) @test all(!iszero, totals) @test all(isapprox.(fractions, reverse(ideal); atol=0.01)) @test all(totals .== 1_000_000 / bin_count) @test isapprox(ceil(mean(fractions) * length(bitmask)), count(bitmask); atol=1) @test isapprox(mean_squared_error, 1 / 3; atol=0.01) + + # Handle garbage input---ensure 
non-existant results are NaN + probs = fill(-1, 40) + bitmask = zeros(Bool, 40) + bins, fractions, totals, mean_squared_error = calibration_curve(probs, bitmask; + bin_count) + @test bin_count == length(bins) + @test first(first(bins)) == 0.0 && last(last(bins)) == 1.0 + @test all(isnan, fractions) + @test all(iszero, totals) + @test isnan(mean_squared_error) end diff --git a/test/runtests.jl b/test/runtests.jl index c651f6a..ac05304 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,11 +26,29 @@ macro testplot(fig_name) end end -function test_roundtrip_evaluation(row_dict::Dict{String,S}) where S +function test_roundtrip_evaluation(row_dict::Dict{String,S}) where {S} row = Lighthouse.EvaluationRow(row_dict) rt_row = roundtrip_row(row) - @test isequal(rt_row, row) - @test Lighthouse._evaluation_row_dict(rt_row) == row_dict + + # Make sure row roundtrips correctly + @test issetequal(keys(row), keys(rt_row)) + for (k, v) in pairs(row) + if ismissing(v) + @test ismissing(rt_row[k]) + else + @test issetequal(v, rt_row[k]) + end + end + + # Make sure originating dictionary roundtrips correctly + rt_dict = Lighthouse._evaluation_row_dict(rt_row) + for (k, v) in pairs(row_dict) + if ismissing(v) + @test ismissing(rt_dict[k]) + else + @test issetequal(v, rt_dict[k]) + end + end return true end From eb874d343fda3bd4dbf6a0c3b17805f322fad612 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Thu, 17 Mar 2022 23:54:19 +0000 Subject: [PATCH 13/21] Fix test dep --- Project.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index c4fcdcb..cdae0fe 100644 --- a/Project.toml +++ b/Project.toml @@ -22,13 +22,15 @@ CairoMakie = "0.7" Legolas = "0.3" Makie = "0.16.5" StatsBase = "0.33" +Tables = "1.7" TensorBoardLogger = "0.1" julia = "1.6" [extras] CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "CairoMakie", "StableRNGs"] +test = ["Test", "CairoMakie", "StableRNGs", "Tables"] From 8bf1f47c50303366343b77a21ee94220e2ad0665 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 14:30:58 +0000 Subject: [PATCH 14/21] test for inclusion of all metrics --- src/row.jl | 5 +++-- test/learn.jl | 14 +++++++------- test/row.jl | 6 +++--- test/runtests.jl | 15 +++++++++++---- 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/row.jl b/src/row.jl index 377bcab..949287e 100644 --- a/src/row.jl +++ b/src/row.jl @@ -13,7 +13,7 @@ vec_to_mat(x::Missing) = return missing const EVALUATION_ROW_SCHEMA = Legolas.Schema("lighthouse.evaluation@1") const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", class_labels::Union{Missing,Vector{String}}, - confusion_matrix::Union{Missing,Array{Int64}} = vec_to_mat(confusion_matrix), #TODO: file issue to make Matrix{Int64} in future + confusion_matrix::Union{Missing,Array{Int64}} = vec_to_mat(confusion_matrix), discrimination_calibration_curve::Union{Missing, Tuple{Vector{Float64}, Vector{Float64}}}, @@ -22,6 +22,7 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", multiclass_kappa::Union{Missing,Float64}, optimal_threshold::Union{Missing,Float64}, optimal_threshold_class::Union{Missing,Int64}, + per_class_IRA_kappas::Union{Missing,Vector{Float64}}, per_class_kappas::Union{Missing,Vector{Float64}}, stratified_kappas::Union{Missing, Vector{NamedTuple{(:per_class, @@ -78,4 +79,4 @@ Lighthouse v for (k, v) in 
pairs(NamedTuple(row)) if !ismissing(v)) -end \ No newline at end of file +end diff --git a/test/learn.jl b/test/learn.jl index c887b6f..fc46e54 100644 --- a/test/learn.jl +++ b/test/learn.jl @@ -111,7 +111,7 @@ end @test haskey(plot_data, "stratified_kappas") plot = evaluation_metrics_plot(plot_data) - @test test_roundtrip_evaluation(plot_data) + test_evaluation_metrics_roundtrip(plot_data) plot2, plot_data2 = @test_deprecated evaluation_metrics_plot(predicted_hard, predicted_soft, @@ -121,7 +121,7 @@ end votes=votes, strata=strata) @test isequal(plot_data, plot_data2) # check these are the same - @test test_roundtrip_evaluation(plot_data2) + test_evaluation_metrics_roundtrip(plot_data2) # Test plotting plot_data = last(logger.logged["test_set_evaluation/metrics_per_epoch"]) @@ -203,7 +203,7 @@ end limit plot_data = last(logger.logged["test_set_evaluation/metrics_per_epoch"]) @test haskey(plot_data, "spearman_correlation") - @test test_roundtrip_evaluation(plot_data) + test_evaluation_metrics_roundtrip(plot_data) # No `optimal_threshold_class` during learning... @test !haskey(plot_data, "optimal_threshold") @@ -218,7 +218,7 @@ end @test haskey(plot_data, "optimal_threshold") @test haskey(plot_data, "optimal_threshold_class") @test plot_data["optimal_threshold_class"] == 2 - @test test_roundtrip_evaluation(plot_data) + test_evaluation_metrics_roundtrip(plot_data) # `optimal_threshold_class` param invalid @test_throws ArgumentError Lighthouse.learn!(model, logger, () -> train_batches, @@ -244,14 +244,14 @@ end plot_data = last(logger.logged["wheeeeeee/metrics_for_all_time"]) @test !haskey(plot_data, "per_class_IRA_kappas") @test !haskey(plot_data, "multiclass_IRA_kappas") - @test test_roundtrip_evaluation(plot_data) + test_evaluation_metrics_roundtrip(plot_data) evaluate!(predicted_hard, predicted_soft, elected_hard, model.classes, logger; logger_prefix="wheeeeeee", logger_suffix="_for_all_time", votes=votes) plot_data = last(logger.logged["wheeeeeee/metrics_for_all_time"]) @test haskey(plot_data, "per_class_IRA_kappas") @test haskey(plot_data, "multiclass_IRA_kappas") - @test test_roundtrip_evaluation(plot_data) + test_evaluation_metrics_roundtrip(plot_data) # Test `evaluate` for different optimal_threshold classes evaluate!(predicted_hard, predicted_soft, elected_hard, model.classes, logger; @@ -262,7 +262,7 @@ end logger_prefix="wheeeeeee", logger_suffix="_for_all_time", votes=votes, optimal_threshold_class=2) plot_data_2 = last(logger.logged["wheeeeeee/metrics_for_all_time"]) - @test test_roundtrip_evaluation(plot_data_2) + test_evaluation_metrics_roundtrip(plot_data_2) # The thresholds should not be identical (since they are *inclusive* when applied: # values greater than _or equal to_ the threshold are given the class value) diff --git a/test/row.jl b/test/row.jl index 5ec4786..02b18ca 100644 --- a/test/row.jl +++ b/test/row.jl @@ -9,14 +9,14 @@ end @testset "`EvaluationRow`" begin # Basic roundtrip dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3) - @test test_roundtrip_evaluation(dict) + test_evaluation_metrics_roundtrip(dict) # Don't lose extra columns (basic Legolas functionality) extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, "rabbit" => 2432) - @test test_roundtrip_evaluation(extra_dict) + test_evaluation_metrics_roundtrip(extra_dict) # Handle fun cases mat_dict = Dict("confusion_matrix" => [3 5 6; 6 7 8; 9 10 11]) - @test test_roundtrip_evaluation(mat_dict) + test_evaluation_metrics_roundtrip(mat_dict) end \ No newline at end 
of file diff --git a/test/runtests.jl b/test/runtests.jl index ac05304..f152990 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,11 +26,18 @@ macro testplot(fig_name) end end -function test_roundtrip_evaluation(row_dict::Dict{String,S}) where {S} +const EVALUATION_ROW_KEYS = string.(keys(Lighthouse.EvaluationRow())) + +function test_evaluation_metrics_roundtrip(row_dict::Dict{String,S}) where {S} + # Make sure we're capturing all metrics keys in our Schema + keys_not_in_schema = setdiff(keys(row_dict), EVALUATION_ROW_KEYS) + @test isempty(keys_not_in_schema) + + # Do the roundtripping (will fail if schema types do not validate after roundtrip) row = Lighthouse.EvaluationRow(row_dict) rt_row = roundtrip_row(row) - # Make sure row roundtrips correctly + # Make sure full row roundtrips correctly @test issetequal(keys(row), keys(rt_row)) for (k, v) in pairs(row) if ismissing(v) @@ -40,7 +47,7 @@ function test_roundtrip_evaluation(row_dict::Dict{String,S}) where {S} end end - # Make sure originating dictionary roundtrips correctly + # Make sure originating metrics dictionary roundtrips correctly rt_dict = Lighthouse._evaluation_row_dict(rt_row) for (k, v) in pairs(row_dict) if ismissing(v) @@ -49,7 +56,7 @@ function test_roundtrip_evaluation(row_dict::Dict{String,S}) where {S} @test issetequal(v, rt_dict[k]) end end - return true + return nothing end function roundtrip_row(row::Lighthouse.EvaluationRow) From b353c5a5a35bd655b09d96618059a81550927412 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 14:35:53 +0000 Subject: [PATCH 15/21] foiled by my own test case --- test/row.jl | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/test/row.jl b/test/row.jl index 02b18ca..aaec57e 100644 --- a/test/row.jl +++ b/test/row.jl @@ -6,17 +6,15 @@ @test_throws DimensionMismatch Lighthouse.vec_to_mat(collect(1:6)) # Invalid dimensions end -@testset "`EvaluationRow`" begin - # Basic roundtrip +@testset "`EvaluationRow` basics" begin + # Most EvaluationRow testing happens via the `test_evaluation_metrics_roundtrip` + # in test/learn.jl + + # Roundtrip from dict dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3) test_evaluation_metrics_roundtrip(dict) - # Don't lose extra columns (basic Legolas functionality) - extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, - "rabbit" => 2432) - test_evaluation_metrics_roundtrip(extra_dict) - - # Handle fun cases + # Handle fun case mat_dict = Dict("confusion_matrix" => [3 5 6; 6 7 8; 9 10 11]) test_evaluation_metrics_roundtrip(mat_dict) end \ No newline at end of file From 9c82ab68a5e53763a936bdaf54b6689b67155be0 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 14:49:40 +0000 Subject: [PATCH 16/21] cleanup --- src/row.jl | 2 +- test/row.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/row.jl b/src/row.jl index 949287e..2f973b7 100644 --- a/src/row.jl +++ b/src/row.jl @@ -75,7 +75,7 @@ end Convert [`EvaluationRow`](@ref) into `::Dict{String, Any}` results, as are output by `[`evaluation_metrics`](@ref)` (and predated use of `EvaluationRow` in -Lighthouse v for (k, v) in pairs(NamedTuple(row)) if !ismissing(v)) diff --git a/test/row.jl b/test/row.jl index aaec57e..9f836db 100644 --- a/test/row.jl +++ b/test/row.jl @@ -17,4 +17,4 @@ end # Handle fun case mat_dict = Dict("confusion_matrix" => [3 5 6; 6 7 8; 9 10 11]) test_evaluation_metrics_roundtrip(mat_dict) -end \ No newline at end of file +end From 
abaa14e2132baf41318e1970dcad73af53e84e38 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 16:41:41 +0000 Subject: [PATCH 17/21] Add new docstrings to docs --- docs/src/index.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/src/index.md b/docs/src/index.md index 36e6b76..2fe0b6f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -39,6 +39,9 @@ binary_statistics cohens_kappa calibration_curve Lighthouse.evaluation_metrics +Lighthouse._evaluation_row_dict +Lighthouse.Row +Lighthouse.evaluation_metrics_row ``` ## Utilities From 4a999ce5fe7ef2934e25ed17ce319f502f75e387 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 17:09:26 +0000 Subject: [PATCH 18/21] fix docs --- docs/src/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/index.md b/docs/src/index.md index 2fe0b6f..00d25be 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -40,7 +40,7 @@ cohens_kappa calibration_curve Lighthouse.evaluation_metrics Lighthouse._evaluation_row_dict -Lighthouse.Row +Lighthouse.Row :: Union{Tuple{Dict}, Tuple{S}} where S<:Legolas.Schema{Symbol("lighthouse.evaluation"), 1} Lighthouse.evaluation_metrics_row ``` From 3af9d23a4dfdd337d86eb0541b4cf9ece384feac Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 17:54:28 +0000 Subject: [PATCH 19/21] fix docstring --- docs/src/index.md | 2 +- src/row.jl | 61 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 57 insertions(+), 6 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index 00d25be..bbe63b6 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -40,7 +40,7 @@ cohens_kappa calibration_curve Lighthouse.evaluation_metrics Lighthouse._evaluation_row_dict -Lighthouse.Row :: Union{Tuple{Dict}, Tuple{S}} where S<:Legolas.Schema{Symbol("lighthouse.evaluation"), 1} +Lighthouse.EvaluationRow Lighthouse.evaluation_metrics_row ``` diff --git a/src/row.jl b/src/row.jl index 2f973b7..f036f41 100644 --- a/src/row.jl +++ b/src/row.jl @@ -11,7 +11,9 @@ vec_to_mat(x::Missing) = return missing # Redefinition is workaround for https://github.com/beacon-biosignals/Legolas.jl/issues/9 const EVALUATION_ROW_SCHEMA = Legolas.Schema("lighthouse.evaluation@1") -const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", + +""" + const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", class_labels::Union{Missing,Vector{String}}, confusion_matrix::Union{Missing,Array{Int64}} = vec_to_mat(confusion_matrix), discrimination_calibration_curve::Union{Missing, @@ -57,13 +59,62 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", Float64, Float64}}}, thresholds::Union{Missing,Vector{Float64}}) - -""" EvaluationRow(evaluation_row_dict::Dict{String, Any}) -> EvaluationRow -Convert `Dict` of [`evaluation_metrics`](@ref) results (e.g. from Lighthouse v for (k, v) in pairs(evaluation_row_dict))...) 
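Taken together, the patches above are meant to support a full metrics round trip: a `Dict` of evaluation metrics becomes an `EvaluationRow`, is serialized to Arrow via Legolas, and is recovered as an equivalent `Dict`. A minimal sketch, assuming the patch series up to this point is applied and using only names that appear in the diffs (`Lighthouse.EvaluationRow`, `Lighthouse.EVALUATION_ROW_SCHEMA`, `Lighthouse._evaluation_row_dict`); the metric values themselves are illustrative:

    using Lighthouse, Legolas, Tables

    # A metrics dictionary of the kind produced by `evaluation_metrics`
    # (hypothetical values; only a few of the schema's fields are populated).
    metrics = Dict("class_labels" => ["class_1", "class_2"],
                   "multiclass_kappa" => 0.8,
                   "confusion_matrix" => [10 2; 3 15])

    # Dict -> row: string keys map onto schema fields; `vec_to_mat` normalizes the matrix field.
    row = Lighthouse.EvaluationRow(metrics)

    # Row -> Arrow -> row: matrices survive because they are packed/unpacked as vectors.
    path = joinpath(mktempdir(), "evaluation.arrow")
    Legolas.write(path, [row], Lighthouse.EVALUATION_ROW_SCHEMA)
    rt_row = Lighthouse.EvaluationRow(only(Tables.rows(Legolas.read(path))))

    # Row -> Dict: `missing` fields are dropped on the way back out.
    metrics_rt = Lighthouse._evaluation_row_dict(rt_row)
    @assert isequal(metrics_rt["multiclass_kappa"], metrics["multiclass_kappa"])

This mirrors the `roundtrip_row`/`test_evaluation_metrics_roundtrip` helpers added to test/runtests.jl, which additionally check that every key in the metrics dictionary is covered by the schema.
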
From 0067fec3a715f01a5b3199d783274b35f9d487ad Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 15:23:37 -0400 Subject: [PATCH 20/21] export EvaluationRow --- docs/src/index.md | 2 +- src/Lighthouse.jl | 2 +- test/runtests.jl | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index bbe63b6..3ccffbf 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -38,9 +38,9 @@ accuracy binary_statistics cohens_kappa calibration_curve +EvaluationRow Lighthouse.evaluation_metrics Lighthouse._evaluation_row_dict -Lighthouse.EvaluationRow Lighthouse.evaluation_metrics_row ``` diff --git a/src/Lighthouse.jl b/src/Lighthouse.jl index 4ce5c3e..9fc3ce1 100644 --- a/src/Lighthouse.jl +++ b/src/Lighthouse.jl @@ -21,7 +21,7 @@ include("classifier.jl") export AbstractClassifier include("row.jl") -# TODO: export EvaluationRow ? +export EvaluationRow include("learn.jl") export LearnLogger, learn!, upon, evaluate!, predict! diff --git a/test/runtests.jl b/test/runtests.jl index f152990..c92ff9f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,7 +26,7 @@ macro testplot(fig_name) end end -const EVALUATION_ROW_KEYS = string.(keys(Lighthouse.EvaluationRow())) +const EVALUATION_ROW_KEYS = string.(keys(EvaluationRow())) function test_evaluation_metrics_roundtrip(row_dict::Dict{String,S}) where {S} # Make sure we're capturing all metrics keys in our Schema @@ -34,7 +34,7 @@ function test_evaluation_metrics_roundtrip(row_dict::Dict{String,S}) where {S} @test isempty(keys_not_in_schema) # Do the roundtripping (will fail if schema types do not validate after roundtrip) - row = Lighthouse.EvaluationRow(row_dict) + row = EvaluationRow(row_dict) rt_row = roundtrip_row(row) # Make sure full row roundtrips correctly @@ -59,11 +59,11 @@ function test_evaluation_metrics_roundtrip(row_dict::Dict{String,S}) where {S} return nothing end -function roundtrip_row(row::Lighthouse.EvaluationRow) +function roundtrip_row(row::EvaluationRow) p = mktempdir() * "rt_test.arrow" tbl = [row] Legolas.write(p, tbl, Lighthouse.EVALUATION_ROW_SCHEMA) - return Lighthouse.EvaluationRow(only(Tables.rows(Legolas.read(p)))) + return EvaluationRow(only(Tables.rows(Legolas.read(p)))) end include("plotting.jl") From 72be90baa658dcefd2c25fb3850a79f2a340465d Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 15:38:18 -0400 Subject: [PATCH 21/21] remove uneeded dep --- Project.toml | 2 -- src/Lighthouse.jl | 1 - 2 files changed, 3 deletions(-) diff --git a/Project.toml b/Project.toml index cdae0fe..93d50af 100644 --- a/Project.toml +++ b/Project.toml @@ -4,7 +4,6 @@ authors = ["Beacon Biosignals, Inc."] version = "0.14.0" [deps] -Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Legolas = "741b9549-f6ed-4911-9fbf-4a1c0c97f0cd" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -17,7 +16,6 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TensorBoardLogger = "899adc3e-224a-11e9-021f-63837185c80f" [compat] -Arrow = "2.2" CairoMakie = "0.7" Legolas = "0.3" Makie = "0.16.5" diff --git a/src/Lighthouse.jl b/src/Lighthouse.jl index 9fc3ce1..6ed6c3a 100644 --- a/src/Lighthouse.jl +++ b/src/Lighthouse.jl @@ -7,7 +7,6 @@ using TensorBoardLogger using Makie using Printf using Legolas -using Arrow include("plotting.jl")
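For reference, the matrix pack/unpack workaround that the round trip relies on (`vec_to_mat`, introduced because Arrow flattens matrix-valued columns) can be exercised on its own. A small sketch using only `Lighthouse.vec_to_mat` as defined in the diffs above; the example matrix is arbitrary:

    using Lighthouse

    mat = [3 5 6; 6 7 8; 9 10 11]

    # On read, a matrix-valued column comes back from Arrow as a flat (column-major) vector...
    packed = vec(mat)

    # ...and `vec_to_mat` recovers the square matrix via `isqrt` + `reshape`.
    @assert Lighthouse.vec_to_mat(packed) == mat

    # Matrices pass through untouched, and `missing` stays `missing`.
    @assert Lighthouse.vec_to_mat(mat) == mat
    @assert ismissing(Lighthouse.vec_to_mat(missing))

    # A vector whose length is not a perfect square cannot be a flattened square
    # matrix, so `vec_to_mat(collect(1:6))` throws a DimensionMismatch.

Since confusion matrices are always square, the `isqrt`-based reshape is sufficient here; a non-square matrix field would need its shape stored alongside the data.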