From 4f521a6a7bfe17b5348fb56b269d30c734a73640 Mon Sep 17 00:00:00 2001 From: Eric Hanson <5846501+ericphanson@users.noreply.github.com> Date: Tue, 30 Nov 2021 18:08:49 -0500 Subject: [PATCH 01/21] wip --- Project.toml | 2 + src/Lighthouse.jl | 4 ++ src/learn.jl | 142 ++++++++++++++++++++++++++-------------------- src/row.jl | 25 ++++++++ 4 files changed, 110 insertions(+), 63 deletions(-) create mode 100644 src/row.jl diff --git a/Project.toml b/Project.toml index e59df5e..cac7a1a 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.13.2" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +Legolas = "741b9549-f6ed-4911-9fbf-4a1c0c97f0cd" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" @@ -19,6 +20,7 @@ CairoMakie = "0.5.2, 0.6" Makie = "0.13.14, 0.14, 0.15" StatsBase = "0.33" TensorBoardLogger = "0.1" +Legolas = "0.3" julia = "1.5" [extras] diff --git a/src/Lighthouse.jl b/src/Lighthouse.jl index 1ffacb1..38f9095 100644 --- a/src/Lighthouse.jl +++ b/src/Lighthouse.jl @@ -6,6 +6,7 @@ using StatsBase: StatsBase using TensorBoardLogger using Makie using Printf +using Legolas include("plotting.jl") @@ -18,7 +19,10 @@ export confusion_matrix, accuracy, binary_statistics, cohens_kappa, calibration_ include("classifier.jl") export AbstractClassifier +include("row.jl") + include("learn.jl") export LearnLogger, learn!, upon, evaluate!, predict! + end # module diff --git a/src/learn.jl b/src/learn.jl index 8beb8d6..fd20746 100644 --- a/src/learn.jl +++ b/src/learn.jl @@ -238,7 +238,7 @@ function _calculate_stratified_ea_kappas(predicted_hard_labels, elected_hard_lab elected = elected_hard_labels[index] k = _calculate_ea_kappas(predicted, elected, class_count) push!(kappas, - group => (per_class=k.per_class, multiclass=k.multiclass, n=sum(index))) + group => (per_class=k.per_class_kappas, multiclass=k.multiclass_kappa, n=sum(index))) end return sort(kappas; by=p -> last(p).multiclass) end @@ -246,9 +246,9 @@ end """ _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, classes) -Return `NamedTuple` with keys `:per_class`, `:multiclass` containing the Cohen's +Return `NamedTuple` with keys `:per_class_kappas`, `:multiclass_kappa` containing the Cohen's Kappa per-class and over all classes, respectively. The value of output key -`:per_class` is an `Array` such that item `i` is the Cohen's kappa calculated +`:per_class_kappas` is an `Array` such that item `i` is the Cohen's kappa calculated for class `i`. Where... @@ -272,15 +272,15 @@ function _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, class_ elected = ((label == class_index) + 1 for label in elected_hard_labels) return first(cohens_kappa(CLASS_VS_ALL_CLASS_COUNT, zip(predicted, elected))) end - return (per_class=per_class, multiclass=multiclass) + return (per_class_kappas=per_class, multiclass_kappa=multiclass) end """ _calculate_ira_kappas(votes, classes) -Return `NamedTuple` with keys `:per_class`, `:multiclass` containing the Cohen's +Return `NamedTuple` with keys `:per_class_IRA_kappas`, `:multiclass_IRA_kappas` containing the Cohen's Kappa for inter-rater agreement (IRA) per-class and over all classes, respectively. -The value of output key `:per_class` is an `Array` such that item `i` is the +The value of output key `:per_class_IRA_kappas` is an `Array` such that item `i` is the IRA kappa calculated for class `i`. Where... @@ -292,12 +292,13 @@ Where... 
- `classes` all possible classes voted on.

-Returns `nothing` if `votes` has only a single voter (i.e., a single column) or if
+Returns `(per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing)` if `votes` has only a single voter (i.e., a single column) or if
no two voters rated the same sample. Note that vote entries of `0` are taken to
mean that the voter did not rate that sample.
"""
function _calculate_ira_kappas(votes, classes)
-    (isnothing(votes) || size(votes, 2) < 2) && return nothing # no votes given or only one expert
+    # no votes given or only one expert:
+    (isnothing(votes) || size(votes, 2) < 2) && return (per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing)

    all_hard_label_pairs = Array{Int}(undef, 0, 2)
    num_voters = size(votes, 2)
@@ -319,7 +320,7 @@ function _calculate_ira_kappas(votes, classes)
                                       hard_label_pairs)
        return first(cohens_kappa(CLASS_VS_ALL_CLASS_COUNT, class_v_other_hard_label_pair))
    end
-    return (per_class=per_class_ira, multiclass=multiclass_ira)
+    return (per_class_IRA_kappas=per_class_ira, multiclass_IRA_kappas=multiclass_ira)
 end

function _spearman_corr(predicted_soft_labels, elected_soft_labels)
@@ -442,7 +443,9 @@ function _get_optimal_threshold_from_ROC(per_class_roc_curves; thresholds,
 end

function _validate_threshold_class(optimal_threshold_class, classes)
-    isnothing(optimal_threshold_class) && return nothing
+    if ismissing(optimal_threshold_class) || isnothing(optimal_threshold_class)
+        return nothing
+    end
    length(classes) == 2 ||
        throw(ArgumentError("Only valid for binary classification problems"))
    optimal_threshold_class in Set([1, 2]) ||
@@ -451,16 +454,16 @@ end

"""
-    evaluation_metrics(predicted_hard_labels::AbstractVector,
-                       predicted_soft_labels::AbstractMatrix,
-                       elected_hard_labels::AbstractVector,
-                       classes,
-                       thresholds=0.0:0.01:1.0;
-                       votes::Union{Nothing,AbstractMatrix}=nothing,
-                       strata::Union{Nothing,AbstractVector{Set{T}} where T}=nothing,
-                       optimal_threshold_class::Union{Nothing,Integer}=nothing)
-
-Returns dictionary containing a battery of classifier performance
+    evaluation_metrics_row(predicted_hard_labels::AbstractVector,
+                           predicted_soft_labels::AbstractMatrix,
+                           elected_hard_labels::AbstractVector,
+                           classes,
+                           thresholds=0.0:0.01:1.0;
+                           votes::Union{Nothing,AbstractMatrix}=nothing,
+                           strata::Union{Nothing,AbstractVector{Set{T}} where T}=nothing,
+                           optimal_threshold_class::Union{Nothing,Integer}=nothing)
+
+Returns `EvaluationRow` containing a battery of classifier performance
metrics that each compare `predicted_soft_labels` and/or
`predicted_hard_labels` against `elected_hard_labels`.

@@ -495,12 +498,12 @@ Where...

See also [`evaluation_metrics_plot`](@ref). 
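A minimal usage sketch of the renamed function (not part of the patch itself; the class names and numbers are made-up toy data, and the call is namespace-qualified since these patches do not appear to export it):

using Lighthouse
predicted_soft = [0.8 0.2; 0.3 0.7; 0.1 0.9]  # rows = samples, columns = classes
predicted_hard = [1, 2, 2]                    # model's hard label per sample
elected_hard   = [1, 2, 1]                    # elected "ground truth" per sample
row = Lighthouse.evaluation_metrics_row(predicted_hard, predicted_soft, elected_hard,
                                        ["class_A", "class_B"], 0.0:0.01:1.0)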
""" -function evaluation_metrics(predicted_hard_labels::AbstractVector, +function evaluation_metrics_row(predicted_hard_labels::AbstractVector, predicted_soft_labels::AbstractMatrix, elected_hard_labels::AbstractVector, classes, thresholds; votes::Union{Nothing,AbstractMatrix}=nothing, strata::Union{Nothing,AbstractVector{Set{T}} where T}=nothing, - optimal_threshold_class::Union{Nothing,Integer}=nothing) + optimal_threshold_class::Union{Missing,Integer}=missing) _validate_threshold_class(optimal_threshold_class, classes) class_count = length(classes) @@ -508,22 +511,16 @@ function evaluation_metrics(predicted_hard_labels::AbstractVector, class_labels = string.(class_vector) per_class_stats = per_class_confusion_statistics(predicted_soft_labels, elected_hard_labels, thresholds) - plot_dict = Dict() - plot_dict["class_labels"] = class_labels - plot_dict["thresholds"] = thresholds # ROC curves - plot_dict["per_class_roc_curves"] = [(map(t -> t.false_positive_rate, stats), + per_class_roc_curves = [(map(t -> t.false_positive_rate, stats), map(t -> t.true_positive_rate, stats)) for stats in per_class_stats] - plot_dict["per_class_roc_aucs"] = [area_under_curve(x, y) - for (x, y) in plot_dict["per_class_roc_curves"]] + per_class_roc_aucs = [area_under_curve(x, y) + for (x, y) in per_class_roc_curves] # Optionally calculate optimal threshold - if !isnothing(optimal_threshold_class) - plot_dict["optimal_threshold_class"] = optimal_threshold_class - threshold = nothing - + if !ismissing(optimal_threshold_class) # If votes exist, calculate the threshold based on comparing against # vote probabilities. Otherwise, use the ROC curve. if !isnothing(votes) @@ -531,21 +528,24 @@ function evaluation_metrics(predicted_hard_labels::AbstractVector, votes; thresholds=thresholds, class_of_interest_index=optimal_threshold_class) - threshold = c.threshold - plot_dict["discrimination_calibration_curve"] = c.plot_curve_data - plot_dict["discrimination_calibration_score"] = c.mse + optimal_threshold = c.threshold + discrimination_calibration_curve = c.plot_curve_data + discrimination_calibration_score = c.mse expert_cal = _calculate_voter_discrimination_calibration(votes; class_of_interest_index=optimal_threshold_class) - plot_dict["per_expert_discrimination_calibration_curves"] = expert_cal.plot_curve_data - plot_dict["per_expert_discrimination_calibration_scores"] = expert_cal.mse + per_expert_discrimination_calibration_curves = expert_cal.plot_curve_data + per_expert_discrimination_calibration_scores = expert_cal.mse else + discrimination_calibration_curve = missing + discrimination_calibration_score = missing + per_expert_discrimination_calibration_curves = missing + per_expert_discrimination_calibration_scores = missing # ...based on ROC curve otherwise - threshold = _get_optimal_threshold_from_ROC(plot_dict["per_class_roc_curves"]; + optimal_threshold = _get_optimal_threshold_from_ROC(per_class_roc_curves; thresholds=thresholds, class_of_interest_index=optimal_threshold_class) end - plot_dict["optimal_threshold"] = threshold # Recalculate `predicted_hard_labels` with this new threshold other_class = optimal_threshold_class == 1 ? 2 : 1 @@ -553,53 +553,69 @@ function evaluation_metrics(predicted_hard_labels::AbstractVector, predicted_hard_labels[i] = row[optimal_threshold_class] .>= threshold ? 
optimal_threshold_class : other_class end + else + discrimination_calibration_curve = missing + discrimination_calibration_score = missing + per_expert_discrimination_calibration_curves = missing + per_expert_discrimination_calibration_scores = missing + optimal_threshold = missing end # PR curves - plot_dict["per_class_pr_curves"] = [(map(t -> t.true_positive_rate, stats), + per_class_pr_curves = [(map(t -> t.true_positive_rate, stats), map(t -> t.precision, stats)) for stats in per_class_stats] - # Cohen's kappa - kappas = _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, class_count) - plot_dict["per_class_kappas"] = kappas.per_class - plot_dict["multiclass_kappa"] = kappas.multiclass - ira = _calculate_ira_kappas(votes, classes) - if !isnothing(ira) - plot_dict["per_class_IRA_kappas"] = ira.per_class - plot_dict["multiclass_IRA_kappas"] = ira.multiclass - end - + # Stratified kappas - if !isnothing(strata) - plot_dict["stratified_kappas"] = _calculate_stratified_ea_kappas(predicted_hard_labels, + if isnothing(strata) + stratified_kappas = missing + else + stratified_kappas = _calculate_stratified_ea_kappas(predicted_hard_labels, elected_hard_labels, class_count, strata) end # Reliability calibration curves - per_class_reliability_calibration_curves = map(1:class_count) do class_index + per_class_reliability_calibration = map(1:class_count) do class_index class_probabilities = view(predicted_soft_labels, :, class_index) return calibration_curve(class_probabilities, elected_hard_labels .== class_index) end - plot_dict["per_class_reliability_calibration_curves"] = map(x -> (mean.(x.bins), + per_class_reliability_calibration_curves = map(x -> (mean.(x.bins), x.fractions), - per_class_reliability_calibration_curves) - plot_dict["per_class_reliability_calibration_scores"] = map(x -> x.mean_squared_error, - per_class_reliability_calibration_curves) + per_class_reliability_calibration) + per_class_reliability_calibration_scores = map(x -> x.mean_squared_error, + per_class_reliability_calibration) - # Confusion matrix - plot_dict["confusion_matrix"] = confusion_matrix(class_count, - zip(predicted_hard_labels, - elected_hard_labels)) # Log Spearman correlation, iff this is a binary classification problem if length(classes) == 2 && !isnothing(votes) - plot_dict["spearman_correlation"] = _calculate_spearman_correlation(predicted_soft_labels, + spearman_correlation = _calculate_spearman_correlation(predicted_soft_labels, votes, classes) + else + spearman_correlation = missing end - return plot_dict + + return EvaluationRow(; class_labels, + confusion_matrix = confusion_matrix(class_count, zip(predicted_hard_labels, elected_hard_labels)), + spearman_correlation, per_class_reliability_calibration_curves, per_class_reliability_calibration_scores, + _calculate_ira_kappas(votes, classes)..., + _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, class_count)..., + stratified_kappas,per_class_pr_curves, + per_class_roc_curves, per_class_roc_aucs, + discrimination_calibration_curve, + discrimination_calibration_score, + per_expert_discrimination_calibration_curves, + per_expert_discrimination_calibration_scores, + optimal_threshold, + thresholds, + ) +end + +function evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) + row = evaluation_metrics_row(args...; optimal_threshold_class=something(optimal_threshold_class, missing), kwargs...) 
+ return Dict(string(k) => v for (k,v) in pairs(NamedTuple(row)) if !ismissing(v)) end """ diff --git a/src/row.jl b/src/row.jl new file mode 100644 index 0000000..7794999 --- /dev/null +++ b/src/row.jl @@ -0,0 +1,25 @@ +vec_to_mat(mat::AbstractMatrix) = mat +function vec_to_mat(vec::AbstractVector) + n = isqrt(length(vec)) + return reshape(vec, n, n) +end +vec_to_mat(::Missing) = missing + +const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", + class_labels::Union{Missing, Vector{String}}, + confusion_matrix::Union{Missing, Matrix{Int64}}=vec_to_mat(confusion_matrix), + discrimination_calibration_curve::Union{Missing, Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}, + discrimination_calibration_score::Union{Missing, Float64}, + multiclass_IRA_kappas::Union{Missing, Float64}, + multiclass_kappa::Union{Missing, Float64}, + optimal_threshold::Union{Missing, Float64}, + per_class_kappas::Union{Missing, Vector{Float64}}, + per_class_pr_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}}, + per_class_reliability_calibration_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}}, + per_class_reliability_calibration_scores::Union{Missing, Vector{Float64}}, + per_class_roc_aucs::Union{Missing, Vector{Float64}}, + per_class_roc_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Float64}}}}, + per_expert_discrimination_calibration_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}}, + per_expert_discrimination_calibration_scores::Union{Missing, Vector{Float64}}, + spearman_correlation::Union{Missing, NamedTuple{(:ρ, :n, :ci_lower, :ci_upper), Tuple{Float64, Int64, Float64, Float64}}}, + thresholds::Union{Missing, Vector{Float64}}) From 8bf4cfcdf69c8687fece662ee5bf85b1fd1a255f Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 18:56:51 +0000 Subject: [PATCH 02/21] Bump patch version, fix tests --- Project.toml | 2 +- src/learn.jl | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Project.toml b/Project.toml index c30157b..3b39591 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Lighthouse" uuid = "ac2c24cd-07f0-4848-96b2-1b82c3ea0e59" authors = ["Beacon Biosignals, Inc."] -version = "0.13.3" +version = "0.13.5" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" diff --git a/src/learn.jl b/src/learn.jl index fd20746..fbd4be6 100644 --- a/src/learn.jl +++ b/src/learn.jl @@ -298,7 +298,7 @@ mean that the voter did not rate that sample. 
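For orientation, a toy sketch of the `votes` matrix shape this helper expects (rows are samples, columns are voters, `0` marks "did not rate"; the numbers and class names are made up, and `_calculate_ira_kappas` is an internal, unexported helper):

# rows are samples, columns are voters; 0 = voter did not rate that sample
votes = [1 1 0;
         2 0 2;
         1 2 1]
Lighthouse._calculate_ira_kappas(votes, ["class_A", "class_B"])
# after this change, returns (per_class_IRA_kappas=..., multiclass_IRA_kappas=...),
# with both fields `missing` when there is a single voter or no samples in common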
""" function _calculate_ira_kappas(votes, classes) # no votes given or only one expert: - (isnothing(votes) || size(votes, 2) < 2) && return (per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) + (isnothing(votes) || size(votes, 2) < 2) && return (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) all_hard_label_pairs = Array{Int}(undef, 0, 2) num_voters = size(votes, 2) @@ -308,7 +308,7 @@ function _calculate_ira_kappas(votes, classes) end end hard_label_pairs = filter(row -> all(row .!= 0), collect(eachrow(all_hard_label_pairs))) - length(hard_label_pairs) > 0 || return nothing # No common observations voted on + length(hard_label_pairs) > 0 || return (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) # No common observations voted on length(hard_label_pairs) < 10 && @warn "...only $(length(hard_label_pairs)) in common, potentially questionable IRA results" @@ -320,7 +320,7 @@ function _calculate_ira_kappas(votes, classes) hard_label_pairs) return first(cohens_kappa(CLASS_VS_ALL_CLASS_COUNT, class_v_other_hard_label_pair)) end - return (per_class_IRA_kappas=per_class_ira, multiclass_IRA_kappas=multiclass_ira) + return (; per_class_IRA_kappas = per_class_ira, multiclass_IRA_kappas = multiclass_ira) end function _spearman_corr(predicted_soft_labels, elected_soft_labels) @@ -550,7 +550,7 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, # Recalculate `predicted_hard_labels` with this new threshold other_class = optimal_threshold_class == 1 ? 2 : 1 for (i, row) in enumerate(eachrow(predicted_soft_labels)) - predicted_hard_labels[i] = row[optimal_threshold_class] .>= threshold ? + predicted_hard_labels[i] = row[optimal_threshold_class] .>= optimal_threshold ? optimal_threshold_class : other_class end else @@ -566,7 +566,6 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, map(t -> t.precision, stats)) for stats in per_class_stats] - # Stratified kappas if isnothing(strata) stratified_kappas = missing From f4fc682f2c35bca8e1030cad44bacf1b797d243a Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 19:52:29 +0000 Subject: [PATCH 03/21] add missing param --- src/learn.jl | 48 ++++++++++++++++++++++++------------------------ src/row.jl | 35 ++++++++++++++++++----------------- test/learn.jl | 8 +++++--- 3 files changed, 47 insertions(+), 44 deletions(-) diff --git a/src/learn.jl b/src/learn.jl index fbd4be6..d8c1bde 100644 --- a/src/learn.jl +++ b/src/learn.jl @@ -499,11 +499,11 @@ Where... See also [`evaluation_metrics_plot`](@ref). 
""" function evaluation_metrics_row(predicted_hard_labels::AbstractVector, - predicted_soft_labels::AbstractMatrix, - elected_hard_labels::AbstractVector, classes, thresholds; - votes::Union{Nothing,AbstractMatrix}=nothing, - strata::Union{Nothing,AbstractVector{Set{T}} where T}=nothing, - optimal_threshold_class::Union{Missing,Integer}=missing) + predicted_soft_labels::AbstractMatrix, + elected_hard_labels::AbstractVector, classes, thresholds; + votes::Union{Nothing,AbstractMatrix} = nothing, + strata::Union{Nothing,AbstractVector{Set{T}} where T} = nothing, + optimal_threshold_class::Union{Missing,Integer} = missing) _validate_threshold_class(optimal_threshold_class, classes) class_count = length(classes) @@ -514,10 +514,9 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, # ROC curves per_class_roc_curves = [(map(t -> t.false_positive_rate, stats), - map(t -> t.true_positive_rate, stats)) - for stats in per_class_stats] - per_class_roc_aucs = [area_under_curve(x, y) - for (x, y) in per_class_roc_curves] + map(t -> t.true_positive_rate, stats)) + for stats in per_class_stats] + per_class_roc_aucs = [area_under_curve(x, y) for (x, y) in per_class_roc_curves] # Optionally calculate optimal threshold if !ismissing(optimal_threshold_class) @@ -533,7 +532,7 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, discrimination_calibration_score = c.mse expert_cal = _calculate_voter_discrimination_calibration(votes; - class_of_interest_index=optimal_threshold_class) + class_of_interest_index = optimal_threshold_class) per_expert_discrimination_calibration_curves = expert_cal.plot_curve_data per_expert_discrimination_calibration_scores = expert_cal.mse else @@ -543,8 +542,8 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, per_expert_discrimination_calibration_scores = missing # ...based on ROC curve otherwise optimal_threshold = _get_optimal_threshold_from_ROC(per_class_roc_curves; - thresholds=thresholds, - class_of_interest_index=optimal_threshold_class) + thresholds = thresholds, + class_of_interest_index = optimal_threshold_class) end # Recalculate `predicted_hard_labels` with this new threshold @@ -563,17 +562,17 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, # PR curves per_class_pr_curves = [(map(t -> t.true_positive_rate, stats), - map(t -> t.precision, stats)) - for stats in per_class_stats] + map(t -> t.precision, stats)) + for stats in per_class_stats] # Stratified kappas if isnothing(strata) stratified_kappas = missing else stratified_kappas = _calculate_stratified_ea_kappas(predicted_hard_labels, - elected_hard_labels, - class_count, - strata) + elected_hard_labels, + class_count, + strata) end # Reliability calibration curves @@ -582,16 +581,16 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, return calibration_curve(class_probabilities, elected_hard_labels .== class_index) end per_class_reliability_calibration_curves = map(x -> (mean.(x.bins), - x.fractions), - per_class_reliability_calibration) + x.fractions), + per_class_reliability_calibration) per_class_reliability_calibration_scores = map(x -> x.mean_squared_error, - per_class_reliability_calibration) + per_class_reliability_calibration) # Log Spearman correlation, iff this is a binary classification problem if length(classes) == 2 && !isnothing(votes) spearman_correlation = _calculate_spearman_correlation(predicted_soft_labels, - votes, classes) + votes, classes) else spearman_correlation = missing end @@ 
-601,15 +600,16 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, spearman_correlation, per_class_reliability_calibration_curves, per_class_reliability_calibration_scores, _calculate_ira_kappas(votes, classes)..., _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, class_count)..., - stratified_kappas,per_class_pr_curves, + stratified_kappas, per_class_pr_curves, per_class_roc_curves, per_class_roc_aucs, discrimination_calibration_curve, discrimination_calibration_score, per_expert_discrimination_calibration_curves, per_expert_discrimination_calibration_scores, optimal_threshold, - thresholds, - ) + optimal_threshold_class, + thresholds + ) end function evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) diff --git a/src/row.jl b/src/row.jl index 7794999..fb247d5 100644 --- a/src/row.jl +++ b/src/row.jl @@ -6,20 +6,21 @@ end vec_to_mat(::Missing) = missing const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", - class_labels::Union{Missing, Vector{String}}, - confusion_matrix::Union{Missing, Matrix{Int64}}=vec_to_mat(confusion_matrix), - discrimination_calibration_curve::Union{Missing, Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}, - discrimination_calibration_score::Union{Missing, Float64}, - multiclass_IRA_kappas::Union{Missing, Float64}, - multiclass_kappa::Union{Missing, Float64}, - optimal_threshold::Union{Missing, Float64}, - per_class_kappas::Union{Missing, Vector{Float64}}, - per_class_pr_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}}, - per_class_reliability_calibration_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}}, - per_class_reliability_calibration_scores::Union{Missing, Vector{Float64}}, - per_class_roc_aucs::Union{Missing, Vector{Float64}}, - per_class_roc_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Float64}}}}, - per_expert_discrimination_calibration_curves::Union{Missing, Vector{Tuple{Vector{Float64}, Vector{Union{Missing, Float64}}}}}, - per_expert_discrimination_calibration_scores::Union{Missing, Vector{Float64}}, - spearman_correlation::Union{Missing, NamedTuple{(:ρ, :n, :ci_lower, :ci_upper), Tuple{Float64, Int64, Float64, Float64}}}, - thresholds::Union{Missing, Vector{Float64}}) + class_labels::Union{Missing,Vector{String}}, + confusion_matrix::Union{Missing,Matrix{Int64}} = vec_to_mat(confusion_matrix), + discrimination_calibration_curve::Union{Missing,Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}, + discrimination_calibration_score::Union{Missing,Float64}, + multiclass_IRA_kappas::Union{Missing,Float64}, + multiclass_kappa::Union{Missing,Float64}, + optimal_threshold::Union{Missing,Float64}, + optimal_threshold_class::Union{Missing,Int64}, + per_class_kappas::Union{Missing,Vector{Float64}}, + per_class_pr_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}}, + per_class_reliability_calibration_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}}, + per_class_reliability_calibration_scores::Union{Missing,Vector{Float64}}, + per_class_roc_aucs::Union{Missing,Vector{Float64}}, + per_class_roc_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Float64}}}}, + per_expert_discrimination_calibration_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}}, + per_expert_discrimination_calibration_scores::Union{Missing,Vector{Float64}}, + spearman_correlation::Union{Missing,NamedTuple{(:ρ, :n, 
:ci_lower, :ci_upper),Tuple{Float64,Int64,Float64,Float64}}}, + thresholds::Union{Missing,Vector{Float64}}) diff --git a/test/learn.jl b/test/learn.jl index 1d1b6aa..174e5a0 100644 --- a/test/learn.jl +++ b/test/learn.jl @@ -266,10 +266,12 @@ end end end -@testset "`_calculate_ira_kappas`" begin +@testset "Invalid `_calculate_ira_kappas`" begin classes = ["roy", "gee", "biv"] - @test isnothing(Lighthouse._calculate_ira_kappas([1; 1; 1; 1], classes)) # Only one voter... - @test isnothing(Lighthouse._calculate_ira_kappas([1 0; 1 0; 0 1], classes)) # No observations in common... + @test isequal(Lighthouse._calculate_ira_kappas([1; 1; 1; 1], classes), + (; per_class_IRA_kappas = missing, multiclass_IRA_kappas = missing)) # Only one voter... + @test isequal(Lighthouse._calculate_ira_kappas([1 0; 1 0; 0 1], classes), + (; per_class_IRA_kappas = missing, multiclass_IRA_kappas = missing)) # No observations in common... end @testset "Calculate `_spearman_corr`" begin From 4ccafc444372a96e7f71cc57d667cd5f5c5c4406 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 19:53:02 +0000 Subject: [PATCH 04/21] bump minor version --- Project.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 3b39591..d5bfd0b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Lighthouse" uuid = "ac2c24cd-07f0-4848-96b2-1b82c3ea0e59" authors = ["Beacon Biosignals, Inc."] -version = "0.13.5" +version = "0.14.0" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" @@ -17,10 +17,10 @@ TensorBoardLogger = "899adc3e-224a-11e9-021f-63837185c80f" [compat] CairoMakie = "0.7" +Legolas = "0.3" Makie = "0.16.5" StatsBase = "0.33" TensorBoardLogger = "0.1" -Legolas = "0.3" julia = "1.5" [extras] From 982ef8be1d5a41760d02582271034a59bea6056f Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 20:21:02 +0000 Subject: [PATCH 05/21] add codecov --- test/row.jl | 7 +++++++ test/runtests.jl | 1 + 2 files changed, 8 insertions(+) create mode 100644 test/row.jl diff --git a/test/row.jl b/test/row.jl new file mode 100644 index 0000000..69169d9 --- /dev/null +++ b/test/row.jl @@ -0,0 +1,7 @@ +@testset `vec_to_mat` begin + mat = [3 5 6; 6 7 8; 9 10 11] + @test Lighthouse.vec_to_mat(vec(mat)) == mat + @test Lighthouse.vec_to_mat(mat) == mat + @test ismissing(Lighthouse.vec_to_mat(missing)) + @test_throws DimensionMismatch Lighthouse.vec_to_mat(collect(1:6)) # Invalid dimensions +end diff --git a/test/runtests.jl b/test/runtests.jl index f15aef5..65f07a0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -30,3 +30,4 @@ include("metrics.jl") include("learn.jl") include("utilities.jl") include("logger.jl") +include("row.jl") From 21349d609b850dc92e3c473551f2072cdc898631 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 20:41:03 +0000 Subject: [PATCH 06/21] more codecov --- test/learn.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/learn.jl b/test/learn.jl index 174e5a0..7322673 100644 --- a/test/learn.jl +++ b/test/learn.jl @@ -98,6 +98,10 @@ end @test length(logger.logged["wheeeeeee/time_in_seconds_for_all_time"]) == 1 @test length(logger.logged["wheeeeeee/metrics_for_all_time"]) == 1 + # Round-trip `onehot` for codecov + onehot_hard = map(h -> vec(Lighthouse.onehot(model, h)), predicted_hard) + @test map(h -> findfirst(h), onehot_hard) == predicted_hard + # Test startified eval strata = [Set("group $(j % Int(ceil(sqrt(j))))" for j in 1:(i - 1)) for i in 1:size(votes, 1)] From 6194a88d58f8f80efd0bbd6adbacd4f75273181b 
Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 21:25:03 +0000 Subject: [PATCH 07/21] Bump julia version to 1.6, testing to 1.6+1.7 --- .github/workflows/CI.yml | 2 +- Project.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index ea42e1a..6bdd908 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: version: - - '1.5' + - '1.7' - '1.6' os: - ubuntu-latest diff --git a/Project.toml b/Project.toml index d5bfd0b..3dbdc20 100644 --- a/Project.toml +++ b/Project.toml @@ -21,7 +21,7 @@ Legolas = "0.3" Makie = "0.16.5" StatsBase = "0.33" TensorBoardLogger = "0.1" -julia = "1.5" +julia = "1.6" [extras] CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" From 571a7fe9a8874319e4a6ed74bf19439971a85929 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Tue, 15 Mar 2022 22:44:55 +0000 Subject: [PATCH 08/21] wip rt tests --- src/learn.jl | 105 +++++++++++++++++++++------------------- src/row.jl | 71 ++++++++++++++++++++------- test/learn.jl | 130 ++++++++++++++++++++++++++++---------------------- test/row.jl | 16 +++++++ 4 files changed, 199 insertions(+), 123 deletions(-) diff --git a/src/learn.jl b/src/learn.jl index d8c1bde..a23fe69 100644 --- a/src/learn.jl +++ b/src/learn.jl @@ -211,8 +211,7 @@ function evaluate!(predicted_hard_labels::AbstractVector, _validate_threshold_class(optimal_threshold_class, classes) log_resource_info!(logger, logger_prefix; suffix=logger_suffix) do - plot_data = evaluation_metrics(predicted_hard_labels, - predicted_soft_labels, + plot_data = evaluation_metrics(predicted_hard_labels, predicted_soft_labels, elected_hard_labels, classes, thresholds; votes=votes, optimal_threshold_class=optimal_threshold_class) @@ -226,8 +225,6 @@ function evaluate!(predicted_hard_labels::AbstractVector, return nothing end - - function _calculate_stratified_ea_kappas(predicted_hard_labels, elected_hard_labels, class_count, strata) groups = reduce(∪, strata) @@ -238,7 +235,8 @@ function _calculate_stratified_ea_kappas(predicted_hard_labels, elected_hard_lab elected = elected_hard_labels[index] k = _calculate_ea_kappas(predicted, elected, class_count) push!(kappas, - group => (per_class=k.per_class_kappas, multiclass=k.multiclass_kappa, n=sum(index))) + group => (per_class=k.per_class_kappas, multiclass=k.multiclass_kappa, + n=sum(index))) end return sort(kappas; by=p -> last(p).multiclass) end @@ -298,7 +296,8 @@ mean that the voter did not rate that sample. 
""" function _calculate_ira_kappas(votes, classes) # no votes given or only one expert: - (isnothing(votes) || size(votes, 2) < 2) && return (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) + (isnothing(votes) || size(votes, 2) < 2) && + return (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) all_hard_label_pairs = Array{Int}(undef, 0, 2) num_voters = size(votes, 2) @@ -308,7 +307,8 @@ function _calculate_ira_kappas(votes, classes) end end hard_label_pairs = filter(row -> all(row .!= 0), collect(eachrow(all_hard_label_pairs))) - length(hard_label_pairs) > 0 || return (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) # No common observations voted on + length(hard_label_pairs) > 0 || + return (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing) # No common observations voted on length(hard_label_pairs) < 10 && @warn "...only $(length(hard_label_pairs)) in common, potentially questionable IRA results" @@ -320,7 +320,7 @@ function _calculate_ira_kappas(votes, classes) hard_label_pairs) return first(cohens_kappa(CLASS_VS_ALL_CLASS_COUNT, class_v_other_hard_label_pair)) end - return (; per_class_IRA_kappas = per_class_ira, multiclass_IRA_kappas = multiclass_ira) + return (; per_class_IRA_kappas=per_class_ira, multiclass_IRA_kappas=multiclass_ira) end function _spearman_corr(predicted_soft_labels, elected_soft_labels) @@ -430,7 +430,7 @@ function _get_optimal_threshold_from_ROC(per_class_roc_curves; thresholds, opt_point = nothing threshold_idx = 1 for point in zip(per_class_roc_curves[class_of_interest_index][1], - per_class_roc_curves[class_of_interest_index][2]) + per_class_roc_curves[class_of_interest_index][2]) d = dist((0, 1), point) if d < min min = d @@ -501,9 +501,9 @@ See also [`evaluation_metrics_plot`](@ref). 
function evaluation_metrics_row(predicted_hard_labels::AbstractVector, predicted_soft_labels::AbstractMatrix, elected_hard_labels::AbstractVector, classes, thresholds; - votes::Union{Nothing,AbstractMatrix} = nothing, - strata::Union{Nothing,AbstractVector{Set{T}} where T} = nothing, - optimal_threshold_class::Union{Missing,Integer} = missing) + votes::Union{Nothing,AbstractMatrix}=nothing, + strata::Union{Nothing,AbstractVector{Set{T}} where T}=nothing, + optimal_threshold_class::Union{Missing,Integer}=missing) _validate_threshold_class(optimal_threshold_class, classes) class_count = length(classes) @@ -514,8 +514,8 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, # ROC curves per_class_roc_curves = [(map(t -> t.false_positive_rate, stats), - map(t -> t.true_positive_rate, stats)) - for stats in per_class_stats] + map(t -> t.true_positive_rate, stats)) + for stats in per_class_stats] per_class_roc_aucs = [area_under_curve(x, y) for (x, y) in per_class_roc_curves] # Optionally calculate optimal threshold @@ -532,7 +532,7 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, discrimination_calibration_score = c.mse expert_cal = _calculate_voter_discrimination_calibration(votes; - class_of_interest_index = optimal_threshold_class) + class_of_interest_index=optimal_threshold_class) per_expert_discrimination_calibration_curves = expert_cal.plot_curve_data per_expert_discrimination_calibration_scores = expert_cal.mse else @@ -542,8 +542,8 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, per_expert_discrimination_calibration_scores = missing # ...based on ROC curve otherwise optimal_threshold = _get_optimal_threshold_from_ROC(per_class_roc_curves; - thresholds = thresholds, - class_of_interest_index = optimal_threshold_class) + thresholds=thresholds, + class_of_interest_index=optimal_threshold_class) end # Recalculate `predicted_hard_labels` with this new threshold @@ -562,17 +562,15 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, # PR curves per_class_pr_curves = [(map(t -> t.true_positive_rate, stats), - map(t -> t.precision, stats)) - for stats in per_class_stats] + map(t -> t.precision, stats)) for stats in per_class_stats] # Stratified kappas if isnothing(strata) stratified_kappas = missing else stratified_kappas = _calculate_stratified_ea_kappas(predicted_hard_labels, - elected_hard_labels, - class_count, - strata) + elected_hard_labels, + class_count, strata) end # Reliability calibration curves @@ -580,41 +578,50 @@ function evaluation_metrics_row(predicted_hard_labels::AbstractVector, class_probabilities = view(predicted_soft_labels, :, class_index) return calibration_curve(class_probabilities, elected_hard_labels .== class_index) end - per_class_reliability_calibration_curves = map(x -> (mean.(x.bins), - x.fractions), - per_class_reliability_calibration) + per_class_reliability_calibration_curves = map(x -> (mean.(x.bins), x.fractions), + per_class_reliability_calibration) per_class_reliability_calibration_scores = map(x -> x.mean_squared_error, - per_class_reliability_calibration) - + per_class_reliability_calibration) # Log Spearman correlation, iff this is a binary classification problem if length(classes) == 2 && !isnothing(votes) - spearman_correlation = _calculate_spearman_correlation(predicted_soft_labels, - votes, classes) + spearman_correlation = _calculate_spearman_correlation(predicted_soft_labels, votes, + classes) else spearman_correlation = missing end return EvaluationRow(; 
class_labels, - confusion_matrix = confusion_matrix(class_count, zip(predicted_hard_labels, elected_hard_labels)), - spearman_correlation, per_class_reliability_calibration_curves, per_class_reliability_calibration_scores, - _calculate_ira_kappas(votes, classes)..., - _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, class_count)..., - stratified_kappas, per_class_pr_curves, - per_class_roc_curves, per_class_roc_aucs, - discrimination_calibration_curve, - discrimination_calibration_score, - per_expert_discrimination_calibration_curves, - per_expert_discrimination_calibration_scores, - optimal_threshold, - optimal_threshold_class, - thresholds - ) + confusion_matrix=confusion_matrix(class_count, + zip(predicted_hard_labels, + elected_hard_labels)), + spearman_correlation, per_class_reliability_calibration_curves, + per_class_reliability_calibration_scores, + _calculate_ira_kappas(votes, classes)..., + _calculate_ea_kappas(predicted_hard_labels, elected_hard_labels, + class_count)..., stratified_kappas, + per_class_pr_curves, per_class_roc_curves, per_class_roc_aucs, + discrimination_calibration_curve, discrimination_calibration_score, + per_expert_discrimination_calibration_curves, + per_expert_discrimination_calibration_scores, optimal_threshold, + optimal_threshold_class, thresholds) end +""" + evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) + +Return [`evaluation_metrics_row`](@ref) after converting output `EvaluationRow` +into a `Dict`. For argument details, see [`evaluation_metrics_row`](@ref). +""" function evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) - row = evaluation_metrics_row(args...; optimal_threshold_class=something(optimal_threshold_class, missing), kwargs...) - return Dict(string(k) => v for (k,v) in pairs(NamedTuple(row)) if !ismissing(v)) + row = evaluation_metrics_row(args...; + optimal_threshold_class=something(optimal_threshold_class, + missing), kwargs...) 
+ return _evaluation_row_dict(row) +end + +function _evaluation_row_dict(row::EvaluationRow) + return Dict(string(k) => v for (k, v) in pairs(NamedTuple(row)) if !ismissing(v)) end """ @@ -642,7 +649,6 @@ function evaluation_metrics_plot(predicted_hard_labels::AbstractVector, votes::Union{Nothing,AbstractMatrix}=nothing, strata::Union{Nothing,AbstractVector{Set{T}} where T}=nothing, optimal_threshold_class::Union{Nothing,Integer}=nothing) - Base.depwarn(""" ``` evaluation_metrics_plot(predicted_hard_labels::AbstractVector, @@ -661,8 +667,8 @@ function evaluation_metrics_plot(predicted_hard_labels::AbstractVector, ``` """, :evaluation_metrics_plot) plot_dict = evaluation_metrics(predicted_hard_labels, predicted_soft_labels, - elected_hard_labels, classes, thresholds; - votes, strata, optimal_threshold_class) + elected_hard_labels, classes, thresholds; votes, strata, + optimal_threshold_class) return evaluation_metrics_plot(plot_dict), plot_dict end @@ -778,7 +784,8 @@ function learn!(model::AbstractClassifier, logger, get_train_batches, get_test_b predict!(model, predicted, get_test_batches(), logger; logger_prefix="$(test_set_logger_prefix)_prediction") evaluate!(map(label -> onecold(model, label), eachrow(predicted)), predicted, - elected, classes(model), logger; logger_prefix="$(test_set_logger_prefix)_evaluation", + elected, classes(model), logger; + logger_prefix="$(test_set_logger_prefix)_evaluation", logger_suffix="_per_epoch", votes=votes, optimal_threshold_class=optimal_threshold_class) post_epoch_callback(current_epoch) diff --git a/src/row.jl b/src/row.jl index fb247d5..689f2ac 100644 --- a/src/row.jl +++ b/src/row.jl @@ -5,22 +5,57 @@ function vec_to_mat(vec::AbstractVector) end vec_to_mat(::Missing) = missing +""" + evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) + +Convert `Dict` of [`evaluation_metrics`](@ref) results (e.g. from Lighthouse v for (k, v) in pairs(evaluation_row_dict))...) 
+ return EvaluationRow(row) +end + const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", - class_labels::Union{Missing,Vector{String}}, - confusion_matrix::Union{Missing,Matrix{Int64}} = vec_to_mat(confusion_matrix), - discrimination_calibration_curve::Union{Missing,Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}, - discrimination_calibration_score::Union{Missing,Float64}, - multiclass_IRA_kappas::Union{Missing,Float64}, - multiclass_kappa::Union{Missing,Float64}, - optimal_threshold::Union{Missing,Float64}, - optimal_threshold_class::Union{Missing,Int64}, - per_class_kappas::Union{Missing,Vector{Float64}}, - per_class_pr_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}}, - per_class_reliability_calibration_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}}, - per_class_reliability_calibration_scores::Union{Missing,Vector{Float64}}, - per_class_roc_aucs::Union{Missing,Vector{Float64}}, - per_class_roc_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Float64}}}}, - per_expert_discrimination_calibration_curves::Union{Missing,Vector{Tuple{Vector{Float64},Vector{Union{Missing,Float64}}}}}, - per_expert_discrimination_calibration_scores::Union{Missing,Vector{Float64}}, - spearman_correlation::Union{Missing,NamedTuple{(:ρ, :n, :ci_lower, :ci_upper),Tuple{Float64,Int64,Float64,Float64}}}, - thresholds::Union{Missing,Vector{Float64}}) + class_labels::Union{Missing,Vector{String}}, + confusion_matrix::Union{Missing,Matrix{Int64}} = vec_to_mat(confusion_matrix), + discrimination_calibration_curve::Union{Missing, + Tuple{Vector{Float64}, + Vector{Union{Missing, + Float64}}}}, + discrimination_calibration_score::Union{Missing,Float64}, + multiclass_IRA_kappas::Union{Missing,Float64}, + multiclass_kappa::Union{Missing,Float64}, + optimal_threshold::Union{Missing,Float64}, + optimal_threshold_class::Union{Missing,Int64}, + per_class_kappas::Union{Missing,Vector{Float64}}, + per_class_pr_curves::Union{Missing, + Vector{Tuple{Vector{Float64}, + Vector{Union{Missing, + Float64}}}}}, + per_class_reliability_calibration_curves::Union{Missing, + Vector{Tuple{Vector{Float64}, + Vector{Union{Missing, + Float64}}}}}, + per_class_reliability_calibration_scores::Union{Missing, + Vector{Float64}}, + per_class_roc_aucs::Union{Missing,Vector{Float64}}, + per_class_roc_curves::Union{Missing, + Vector{Tuple{Vector{Float64}, + Vector{Float64}}}}, + per_expert_discrimination_calibration_curves::Union{Missing, + Vector{Tuple{Vector{Float64}, + Vector{Union{Missing, + Float64}}}}}, + per_expert_discrimination_calibration_scores::Union{Missing, + Vector{Float64}}, + spearman_correlation::Union{Missing, + NamedTuple{(:ρ, :n, + :ci_lower, + :ci_upper), + Tuple{Float64, + Int64, + Float64, + Float64}}}, + thresholds::Union{Missing,Vector{Float64}}) diff --git a/test/learn.jl b/test/learn.jl index 7322673..2d83053 100644 --- a/test/learn.jl +++ b/test/learn.jl @@ -22,6 +22,16 @@ function Lighthouse.loss_and_prediction(c::TestClassifier, dummy_input_batch) return c.dummy_loss, dummy_soft_label_batch end +function test_roundtrip_results(results_dict) + row = evaluation_row(results_dict) + p = mktempdir() * "rt_test.arrow" + #todo write this to tempdir + rt_row = nothing # read this from tempdir + @test rt_row == row + @test _evaluation_row_dict(rt_row) == results_dict + return true +end + @testset "Multi-class learn!(::TestModel, ...)" begin mktempdir() do tmpdir model = TestClassifier(1000000.0, ["class_$i" for i in 1:5]) @@ -44,24 
+54,25 @@ end @info counted n end end - elected = majority.((rng,), eachrow(votes), (1:length(Lighthouse.classes(model)),)) - Lighthouse.learn!(model, logger, () -> train_batches, () -> test_batches, votes, elected; - epoch_limit=limit, post_epoch_callback=callback) + elected = majority.((rng,), eachrow(votes), + (1:length(Lighthouse.classes(model)),)) + Lighthouse.learn!(model, logger, () -> train_batches, () -> test_batches, votes, + elected; epoch_limit=limit, post_epoch_callback=callback) @test counted == sum(1:limit) end @test length(logger.logged["train/loss_per_batch"]) == length(train_batches) * limit for key in ["test_set_prediction/loss_per_batch", - "test_set_prediction/time_in_seconds_per_batch", - "test_set_prediction/gc_time_in_seconds_per_batch", - "test_set_prediction/allocations_per_batch", - "test_set_prediction/memory_in_mb_per_batch"] + "test_set_prediction/time_in_seconds_per_batch", + "test_set_prediction/gc_time_in_seconds_per_batch", + "test_set_prediction/allocations_per_batch", + "test_set_prediction/memory_in_mb_per_batch"] @test length(logger.logged[key]) == length(test_batches) * limit end for key in ["test_set_prediction/mean_loss_per_epoch", - "test_set_evaluation/time_in_seconds_per_epoch", - "test_set_evaluation/gc_time_in_seconds_per_epoch", - "test_set_evaluation/allocations_per_epoch", - "test_set_evaluation/memory_in_mb_per_epoch"] + "test_set_evaluation/time_in_seconds_per_epoch", + "test_set_evaluation/gc_time_in_seconds_per_epoch", + "test_set_evaluation/allocations_per_epoch", + "test_set_evaluation/memory_in_mb_per_epoch"] @test length(logger.logged[key]) == limit end @test length(logger.logged["test_set_evaluation/metrics_per_epoch"]) == limit @@ -105,16 +116,23 @@ end # Test startified eval strata = [Set("group $(j % Int(ceil(sqrt(j))))" for j in 1:(i - 1)) for i in 1:size(votes, 1)] - plot_data = evaluation_metrics(predicted_hard, predicted_soft, - elected_hard, model.classes, 0.0:0.01:1.0; - votes=votes, strata=strata) + plot_data = evaluation_metrics(predicted_hard, predicted_soft, elected_hard, + model.classes, 0.0:0.01:1.0; votes=votes, + strata=strata) @test haskey(plot_data, "stratified_kappas") plot = evaluation_metrics_plot(plot_data) - plot2, plot_data2 = @test_deprecated evaluation_metrics_plot(predicted_hard, predicted_soft, - elected_hard, model.classes, 0.0:0.01:1.0; - votes=votes, strata=strata) + @test test_roundtrip_results(plot_data) + + plot2, plot_data2 = @test_deprecated evaluation_metrics_plot(predicted_hard, + predicted_soft, + elected_hard, + model.classes, + 0.0:0.01:1.0; + votes=votes, + strata=strata) @test isequal(plot_data, plot_data2) # check these are the same + @test test_roundtrip_results(plot_data2) # Test plotting plot_data = last(logger.logged["test_set_evaluation/metrics_per_epoch"]) @@ -128,15 +146,17 @@ end @testplot roc # Kappa no IRA - kappas_no_ira = plot_kappas(vcat(plot_data["multiclass_kappa"], plot_data["per_class_kappas"]), - vcat("Multiclass", plot_data["class_labels"])) + kappas_no_ira = plot_kappas(vcat(plot_data["multiclass_kappa"], + plot_data["per_class_kappas"]), + vcat("Multiclass", plot_data["class_labels"])) @testplot kappas_no_ira # Kappa with IRA - kappas_ira = plot_kappas(vcat(plot_data["multiclass_kappa"], plot_data["per_class_kappas"]), - vcat("Multiclass", plot_data["class_labels"]), - vcat(plot_data["multiclass_IRA_kappas"], - plot_data["per_class_IRA_kappas"])) + kappas_ira = plot_kappas(vcat(plot_data["multiclass_kappa"], + plot_data["per_class_kappas"]), + vcat("Multiclass", 
plot_data["class_labels"]), + vcat(plot_data["multiclass_IRA_kappas"], + plot_data["per_class_IRA_kappas"])) @testplot kappas_ira reliability_calibration = plot_reliability_calibration_curves(plot_data["per_class_reliability_calibration_curves"], @@ -181,9 +201,10 @@ end @info counted n end end - elected = majority.((rng,), eachrow(votes), (1:length(Lighthouse.classes(model)),)) - Lighthouse.learn!(model, logger, () -> train_batches, () -> test_batches, votes, elected; - epoch_limit=limit, post_epoch_callback=callback) + elected = majority.((rng,), eachrow(votes), + (1:length(Lighthouse.classes(model)),)) + Lighthouse.learn!(model, logger, () -> train_batches, () -> test_batches, votes, + elected; epoch_limit=limit, post_epoch_callback=callback) @test counted == sum(1:limit) end # Binary classification logs some additional metrics @@ -191,6 +212,7 @@ end limit plot_data = last(logger.logged["test_set_evaluation/metrics_per_epoch"]) @test haskey(plot_data, "spearman_correlation") + @test test_roundtrip_results(plot_data) # No `optimal_threshold_class` during learning... @test !haskey(plot_data, "optimal_threshold") @@ -198,13 +220,14 @@ end # And now, `optimal_threshold_class` during learning elected = majority.((rng,), eachrow(votes), (1:length(Lighthouse.classes(model)),)) - Lighthouse.learn!(model, logger, () -> train_batches, () -> test_batches, votes, elected; - epoch_limit=limit, optimal_threshold_class=2, + Lighthouse.learn!(model, logger, () -> train_batches, () -> test_batches, votes, + elected; epoch_limit=limit, optimal_threshold_class=2, test_set_logger_prefix="validation_set") plot_data = last(logger.logged["validation_set_evaluation/metrics_per_epoch"]) @test haskey(plot_data, "optimal_threshold") @test haskey(plot_data, "optimal_threshold_class") @test plot_data["optimal_threshold_class"] == 2 + @test test_roundtrip_results(plot_data) # `optimal_threshold_class` param invalid @test_throws ArgumentError Lighthouse.learn!(model, logger, () -> train_batches, @@ -230,12 +253,14 @@ end plot_data = last(logger.logged["wheeeeeee/metrics_for_all_time"]) @test !haskey(plot_data, "per_class_IRA_kappas") @test !haskey(plot_data, "multiclass_IRA_kappas") + @test test_roundtrip_results(plot_data) evaluate!(predicted_hard, predicted_soft, elected_hard, model.classes, logger; logger_prefix="wheeeeeee", logger_suffix="_for_all_time", votes=votes) plot_data = last(logger.logged["wheeeeeee/metrics_for_all_time"]) @test haskey(plot_data, "per_class_IRA_kappas") @test haskey(plot_data, "multiclass_IRA_kappas") + @test test_roundtrip_results(plot_data) # Test `evaluate` for different optimal_threshold classes evaluate!(predicted_hard, predicted_soft, elected_hard, model.classes, logger; @@ -246,6 +271,7 @@ end logger_prefix="wheeeeeee", logger_suffix="_for_all_time", votes=votes, optimal_threshold_class=2) plot_data_2 = last(logger.logged["wheeeeeee/metrics_for_all_time"]) + @test test_roundtrip_results(plot_data_2) # The thresholds should not be identical (since they are *inclusive* when applied: # values greater than _or equal to_ the threshold are given the class value) @@ -273,9 +299,9 @@ end @testset "Invalid `_calculate_ira_kappas`" begin classes = ["roy", "gee", "biv"] @test isequal(Lighthouse._calculate_ira_kappas([1; 1; 1; 1], classes), - (; per_class_IRA_kappas = missing, multiclass_IRA_kappas = missing)) # Only one voter... + (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing)) # Only one voter... 
@test isequal(Lighthouse._calculate_ira_kappas([1 0; 1 0; 0 1], classes), - (; per_class_IRA_kappas = missing, multiclass_IRA_kappas = missing)) # No observations in common... + (; per_class_IRA_kappas=missing, multiclass_IRA_kappas=missing)) # No observations in common... end @testset "Calculate `_spearman_corr`" begin @@ -309,11 +335,9 @@ end # Test NaN spearman due to unranked input votes = [1; 2; 2] - predicted_soft = [ - 0.3 0.7 - 0.3 0.7 - 0.3 0.7 - ] + predicted_soft = [0.3 0.7 + 0.3 0.7 + 0.3 0.7] sp = Lighthouse._calculate_spearman_correlation(predicted_soft, votes, ["oh" "em"]) @test isnan(sp.ρ) @@ -335,11 +359,9 @@ end @test length(single_voter_calibration.mse) == 1 # Test multi-voter voter discrimination calibration - votes = [ - 0 1 1 1 - 1 2 0 0 - 2 1 2 2 - ] # Note: voters 3 and 4 have voted identically + votes = [0 1 1 1 + 1 2 0 0 + 2 1 2 2] # Note: voters 3 and 4 have voted identically voter_calibration = Lighthouse._calculate_voter_discrimination_calibration(votes; class_of_interest_index=1) @test length(voter_calibration.mse) == size(votes, 2) @@ -366,13 +388,11 @@ end end @testset "2-class per_class_confusion_statistics" begin - predicted_soft_labels = [ - 0.51 0.49 - 0.49 0.51 - 0.1 0.9 - 0.9 0.1 - 0.0 1.0 - ] + predicted_soft_labels = [0.51 0.49 + 0.49 0.51 + 0.1 0.9 + 0.9 0.1 + 0.0 1.0] elected_hard_labels = [1, 2, 2, 2, 1] thresholds = [0.25, 0.5, 0.75] class_1, class_2 = Lighthouse.per_class_confusion_statistics(predicted_soft_labels, @@ -447,15 +467,13 @@ end end @testset "3-class per_class_confusion_statistics" begin - predicted_soft_labels = [ - 1/3 1/3 1/3 - 0.1 0.7 0.2 - 0.25 0.25 0.5 - 0.4 0.5 0.1 - 0.0 0.0 1.0 - 0.2 0.5 0.3 - 0.5 0.4 0.1 - ] + predicted_soft_labels = [1/3 1/3 1/3 + 0.1 0.7 0.2 + 0.25 0.25 0.5 + 0.4 0.5 0.1 + 0.0 0.0 1.0 + 0.2 0.5 0.3 + 0.5 0.4 0.1] elected_hard_labels = [1, 2, 2, 1, 3, 3, 1] # TODO would be more robust to have multiple thresholds, but our naive tests # here will have to be refactored to avoid becoming a nightmare if we do that diff --git a/test/row.jl b/test/row.jl index 69169d9..5dee858 100644 --- a/test/row.jl +++ b/test/row.jl @@ -5,3 +5,19 @@ @test ismissing(Lighthouse.vec_to_mat(missing)) @test_throws DimensionMismatch Lighthouse.vec_to_mat(collect(1:6)) # Invalid dimensions end + +@testset `round trip EvaluationRow tests` begin + # Basic roundtrip + dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3) + row = Lighthouse.evaluation_row(dict) + @test isa(row, Lighthouse.EvaluationRow) + @test isequal(Lighthouse._evaluation_row_dict(row), dict) + + # Should ignore any additional fields that we don't convert + extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, "rabbit" => 2432) + row = Lighthouse.evaluation_row(dict) + @test isa(row, Lighthouse.EvaluationRow) + @test isequal(Lighthouse._evaluation_row_dict(row), dict) + + +end \ No newline at end of file From ac1f66656e3c0b1c608eb7fc653c9d643da219ff Mon Sep 17 00:00:00 2001 From: hannahilea Date: Wed, 16 Mar 2022 22:53:18 +0000 Subject: [PATCH 09/21] wip --- src/learn.jl | 4 ---- src/row.jl | 46 ++++++++++++++++++++++++++++++++-------------- test/learn.jl | 12 +----------- test/row.jl | 19 +++++++++---------- test/runtests.jl | 16 ++++++++++++++++ 5 files changed, 58 insertions(+), 39 deletions(-) diff --git a/src/learn.jl b/src/learn.jl index a23fe69..1262374 100644 --- a/src/learn.jl +++ b/src/learn.jl @@ -620,10 +620,6 @@ function evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) 
return _evaluation_row_dict(row) end -function _evaluation_row_dict(row::EvaluationRow) - return Dict(string(k) => v for (k, v) in pairs(NamedTuple(row)) if !ismissing(v)) -end - """ evaluation_metrics_plot(predicted_hard_labels::AbstractVector, predicted_soft_labels::AbstractMatrix, diff --git a/src/row.jl b/src/row.jl index 689f2ac..dece1c3 100644 --- a/src/row.jl +++ b/src/row.jl @@ -1,22 +1,16 @@ -vec_to_mat(mat::AbstractMatrix) = mat +function vec_to_mat(mat::AbstractMatrix) + @info "here" + return mat +end function vec_to_mat(vec::AbstractVector) + @info "ok..." n = isqrt(length(vec)) return reshape(vec, n, n) end -vec_to_mat(::Missing) = missing - -""" - evaluation_metrics(args...; optimal_threshold_class=nothing, kwargs...) - -Convert `Dict` of [`evaluation_metrics`](@ref) results (e.g. from Lighthouse v for (k, v) in pairs(evaluation_row_dict))...) - return EvaluationRow(row) -end +vec_to_mat(x::Missing) = (@info "why"; @info typeof(x); return missing) +# Redefinition is workaround for https://github.com/beacon-biosignals/Legolas.jl/issues/9 +const EVALUATION_ROW_SCHEMA = Legolas.Schema("lighthouse.evaluation@1") const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", class_labels::Union{Missing,Vector{String}}, confusion_matrix::Union{Missing,Matrix{Int64}} = vec_to_mat(confusion_matrix), @@ -59,3 +53,27 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", Float64, Float64}}}, thresholds::Union{Missing,Vector{Float64}}) + +""" + EvaluationRow(evaluation_row_dict::Dict{String, Any}) -> EvaluationRow + +Convert `Dict` of [`evaluation_metrics`](@ref) results (e.g. from Lighthouse v for (k, v) in pairs(evaluation_row_dict))...) + return EvaluationRow(row) +end + + +""" + _evaluation_row_dict(row::EvaluationRow) -> Dict{String,Any} + +Convert [`EvaluationRow`](@ref) into `::Dict{String, Any}` results, as are +output by `[`evaluation_metrics`](@ref)` (and predated use of `EvaluationRow` in +Lighthouse v for (k, v) in pairs(NamedTuple(row)) if !ismissing(v)) +end \ No newline at end of file diff --git a/test/learn.jl b/test/learn.jl index 2d83053..7361ec5 100644 --- a/test/learn.jl +++ b/test/learn.jl @@ -22,16 +22,6 @@ function Lighthouse.loss_and_prediction(c::TestClassifier, dummy_input_batch) return c.dummy_loss, dummy_soft_label_batch end -function test_roundtrip_results(results_dict) - row = evaluation_row(results_dict) - p = mktempdir() * "rt_test.arrow" - #todo write this to tempdir - rt_row = nothing # read this from tempdir - @test rt_row == row - @test _evaluation_row_dict(rt_row) == results_dict - return true -end - @testset "Multi-class learn!(::TestModel, ...)" begin mktempdir() do tmpdir model = TestClassifier(1000000.0, ["class_$i" for i in 1:5]) @@ -122,7 +112,7 @@ end @test haskey(plot_data, "stratified_kappas") plot = evaluation_metrics_plot(plot_data) - @test test_roundtrip_results(plot_data) + @test test_roundtrip_evaluation(plot_data) plot2, plot_data2 = @test_deprecated evaluation_metrics_plot(predicted_hard, predicted_soft, diff --git a/test/row.jl b/test/row.jl index 5dee858..1a4f9a3 100644 --- a/test/row.jl +++ b/test/row.jl @@ -1,4 +1,4 @@ -@testset `vec_to_mat` begin +@testset "`vec_to_mat`" begin mat = [3 5 6; 6 7 8; 9 10 11] @test Lighthouse.vec_to_mat(vec(mat)) == mat @test Lighthouse.vec_to_mat(mat) == mat @@ -6,18 +6,17 @@ @test_throws DimensionMismatch Lighthouse.vec_to_mat(collect(1:6)) # Invalid dimensions end -@testset `round trip EvaluationRow tests` begin +@testset "`EvaluationRow`" begin # Basic roundtrip dict = 
Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3) - row = Lighthouse.evaluation_row(dict) - @test isa(row, Lighthouse.EvaluationRow) - @test isequal(Lighthouse._evaluation_row_dict(row), dict) + @test test_roundtrip_evaluation(dict) # Should ignore any additional fields that we don't convert - extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, "rabbit" => 2432) - row = Lighthouse.evaluation_row(dict) - @test isa(row, Lighthouse.EvaluationRow) - @test isequal(Lighthouse._evaluation_row_dict(row), dict) - + extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, + "rabbit" => 2432) + @test test_roundtrip_evaluation(extra_dict) + mat_dict = Dict("confusion_matrix" => [3 5 6; 6 7 8; 9 10 11]) + mat_row = Lighthouse.EvaluationRow(mat_dict) + rt_row = roundtrip_row(mat_row) end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 65f07a0..cf38a7e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,6 +6,7 @@ using Lighthouse: plot_reliability_calibration_curves, plot_pr_curves, evaluation_metrics_plot, evaluation_metrics using Base.Threads using CairoMakie +using Legolas, Tables # Needs to be set for figures # returning true for showable("image/png", obj) @@ -25,6 +26,21 @@ macro testplot(fig_name) end end +function test_roundtrip_evaluation(row_dict::Dict{String,Any}) + row = Lighthouse.EvaluationRow(row_dict) + rt_row = roundtrip_row(row) + @test isequal(rt_row, row) + @test Lighthouse._evaluation_row_dict(rt_row) == row_dict + return true +end + +function roundtrip_row(row::Lighthouse.EvaluationRow) + p = mktempdir() * "rt_test.arrow" + tbl = [row] + Legolas.write(p, tbl, Lighthouse.EVALUATION_ROW_SCHEMA) + return Lighthouse.EvaluationRow(only(Tables.rows(Legolas.read(p)))) +end + include("plotting.jl") include("metrics.jl") include("learn.jl") From 424a548e717df12a316495d1edcf59b56c04bbcd Mon Sep 17 00:00:00 2001 From: hannahilea Date: Thu, 17 Mar 2022 22:11:59 +0000 Subject: [PATCH 10/21] Support matrix serialization/deserialization --- Project.toml | 2 ++ src/Lighthouse.jl | 1 + src/row.jl | 21 ++++++++++----------- test/row.jl | 6 +++--- test/runtests.jl | 2 +- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/Project.toml b/Project.toml index 3dbdc20..c4fcdcb 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["Beacon Biosignals, Inc."] version = "0.14.0" [deps] +Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Legolas = "741b9549-f6ed-4911-9fbf-4a1c0c97f0cd" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -16,6 +17,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TensorBoardLogger = "899adc3e-224a-11e9-021f-63837185c80f" [compat] +Arrow = "2.2" CairoMakie = "0.7" Legolas = "0.3" Makie = "0.16.5" diff --git a/src/Lighthouse.jl b/src/Lighthouse.jl index 38f9095..d96e333 100644 --- a/src/Lighthouse.jl +++ b/src/Lighthouse.jl @@ -7,6 +7,7 @@ using TensorBoardLogger using Makie using Printf using Legolas +using Arrow include("plotting.jl") diff --git a/src/row.jl b/src/row.jl index dece1c3..e645f49 100644 --- a/src/row.jl +++ b/src/row.jl @@ -1,19 +1,19 @@ -function vec_to_mat(mat::AbstractMatrix) - @info "here" - return mat -end +# Arrow can't handle matrices---so when we write/read matrices, we have to pack and unpack them o_O +# https://github.com/apache/arrow-julia/issues/125 +vec_to_mat(mat::AbstractMatrix) = mat + function vec_to_mat(vec::AbstractVector) - @info "ok..." 
n = isqrt(length(vec)) return reshape(vec, n, n) end -vec_to_mat(x::Missing) = (@info "why"; @info typeof(x); return missing) + +vec_to_mat(x::Missing) = return missing # Redefinition is workaround for https://github.com/beacon-biosignals/Legolas.jl/issues/9 const EVALUATION_ROW_SCHEMA = Legolas.Schema("lighthouse.evaluation@1") const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", class_labels::Union{Missing,Vector{String}}, - confusion_matrix::Union{Missing,Matrix{Int64}} = vec_to_mat(confusion_matrix), + confusion_matrix::Union{Missing,Array{Int64}} = vec_to_mat(confusion_matrix), #TODO: file issue to make Matrix{Int64} in future discrimination_calibration_curve::Union{Missing, Tuple{Vector{Float64}, Vector{Union{Missing, @@ -60,13 +60,12 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", Convert `Dict` of [`evaluation_metrics`](@ref) results (e.g. from Lighthouse v for (k, v) in pairs(evaluation_row_dict))...) +function Legolas.Row{S}(evaluation_row_dict::Dict) where {S<:Legolas.Schema{Symbol("lighthouse.evaluation"), + 1}} + row = (; (Symbol(k) => v for (k, v) in pairs(evaluation_row_dict))...) return EvaluationRow(row) end - """ _evaluation_row_dict(row::EvaluationRow) -> Dict{String,Any} diff --git a/test/row.jl b/test/row.jl index 1a4f9a3..5ec4786 100644 --- a/test/row.jl +++ b/test/row.jl @@ -11,12 +11,12 @@ end dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3) @test test_roundtrip_evaluation(dict) - # Should ignore any additional fields that we don't convert + # Don't lose extra columns (basic Legolas functionality) extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, "rabbit" => 2432) @test test_roundtrip_evaluation(extra_dict) + # Handle fun cases mat_dict = Dict("confusion_matrix" => [3 5 6; 6 7 8; 9 10 11]) - mat_row = Lighthouse.EvaluationRow(mat_dict) - rt_row = roundtrip_row(mat_row) + @test test_roundtrip_evaluation(mat_dict) end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index cf38a7e..c651f6a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,7 +26,7 @@ macro testplot(fig_name) end end -function test_roundtrip_evaluation(row_dict::Dict{String,Any}) +function test_roundtrip_evaluation(row_dict::Dict{String,S}) where S row = Lighthouse.EvaluationRow(row_dict) rt_row = roundtrip_row(row) @test isequal(rt_row, row) From 38bac6b1b11bb84c564ab3d0dc82dc78db61ca03 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Thu, 17 Mar 2022 22:11:59 +0000 Subject: [PATCH 11/21] Support matrix serialization/deserialization --- Project.toml | 2 ++ src/Lighthouse.jl | 3 ++- src/row.jl | 21 ++++++++++----------- test/row.jl | 6 +++--- test/runtests.jl | 2 +- 5 files changed, 18 insertions(+), 16 deletions(-) diff --git a/Project.toml b/Project.toml index 3dbdc20..c4fcdcb 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["Beacon Biosignals, Inc."] version = "0.14.0" [deps] +Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Legolas = "741b9549-f6ed-4911-9fbf-4a1c0c97f0cd" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -16,6 +17,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TensorBoardLogger = "899adc3e-224a-11e9-021f-63837185c80f" [compat] +Arrow = "2.2" CairoMakie = "0.7" Legolas = "0.3" Makie = "0.16.5" diff --git a/src/Lighthouse.jl b/src/Lighthouse.jl index 38f9095..4ce5c3e 100644 --- a/src/Lighthouse.jl +++ b/src/Lighthouse.jl @@ -7,6 +7,7 @@ using TensorBoardLogger using Makie using Printf using 
Legolas +using Arrow include("plotting.jl") @@ -20,9 +21,9 @@ include("classifier.jl") export AbstractClassifier include("row.jl") +# TODO: export EvaluationRow ? include("learn.jl") export LearnLogger, learn!, upon, evaluate!, predict! - end # module diff --git a/src/row.jl b/src/row.jl index dece1c3..e645f49 100644 --- a/src/row.jl +++ b/src/row.jl @@ -1,19 +1,19 @@ -function vec_to_mat(mat::AbstractMatrix) - @info "here" - return mat -end +# Arrow can't handle matrices---so when we write/read matrices, we have to pack and unpack them o_O +# https://github.com/apache/arrow-julia/issues/125 +vec_to_mat(mat::AbstractMatrix) = mat + function vec_to_mat(vec::AbstractVector) - @info "ok..." n = isqrt(length(vec)) return reshape(vec, n, n) end -vec_to_mat(x::Missing) = (@info "why"; @info typeof(x); return missing) + +vec_to_mat(x::Missing) = return missing # Redefinition is workaround for https://github.com/beacon-biosignals/Legolas.jl/issues/9 const EVALUATION_ROW_SCHEMA = Legolas.Schema("lighthouse.evaluation@1") const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", class_labels::Union{Missing,Vector{String}}, - confusion_matrix::Union{Missing,Matrix{Int64}} = vec_to_mat(confusion_matrix), + confusion_matrix::Union{Missing,Array{Int64}} = vec_to_mat(confusion_matrix), #TODO: file issue to make Matrix{Int64} in future discrimination_calibration_curve::Union{Missing, Tuple{Vector{Float64}, Vector{Union{Missing, @@ -60,13 +60,12 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", Convert `Dict` of [`evaluation_metrics`](@ref) results (e.g. from Lighthouse v for (k, v) in pairs(evaluation_row_dict))...) +function Legolas.Row{S}(evaluation_row_dict::Dict) where {S<:Legolas.Schema{Symbol("lighthouse.evaluation"), + 1}} + row = (; (Symbol(k) => v for (k, v) in pairs(evaluation_row_dict))...) 
return EvaluationRow(row) end - """ _evaluation_row_dict(row::EvaluationRow) -> Dict{String,Any} diff --git a/test/row.jl b/test/row.jl index 1a4f9a3..5ec4786 100644 --- a/test/row.jl +++ b/test/row.jl @@ -11,12 +11,12 @@ end dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3) @test test_roundtrip_evaluation(dict) - # Should ignore any additional fields that we don't convert + # Don't lose extra columns (basic Legolas functionality) extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, "rabbit" => 2432) @test test_roundtrip_evaluation(extra_dict) + # Handle fun cases mat_dict = Dict("confusion_matrix" => [3 5 6; 6 7 8; 9 10 11]) - mat_row = Lighthouse.EvaluationRow(mat_dict) - rt_row = roundtrip_row(mat_row) + @test test_roundtrip_evaluation(mat_dict) end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index cf38a7e..c651f6a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,7 +26,7 @@ macro testplot(fig_name) end end -function test_roundtrip_evaluation(row_dict::Dict{String,Any}) +function test_roundtrip_evaluation(row_dict::Dict{String,S}) where S row = Lighthouse.EvaluationRow(row_dict) rt_row = roundtrip_row(row) @test isequal(rt_row, row) From 353b22e00326401d112be3db56a195f4c5f3c220 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Thu, 17 Mar 2022 23:49:24 +0000 Subject: [PATCH 12/21] Replace missing with NaN --- src/learn.jl | 4 ++-- src/metrics.jl | 28 +++++++++++++--------------- src/row.jl | 19 +++++++++++-------- test/learn.jl | 17 +++++++++-------- test/metrics.jl | 20 +++++++++++++++++--- test/runtests.jl | 24 +++++++++++++++++++++--- 6 files changed, 73 insertions(+), 39 deletions(-) diff --git a/src/learn.jl b/src/learn.jl index 1262374..6b539ef 100644 --- a/src/learn.jl +++ b/src/learn.jl @@ -238,7 +238,8 @@ function _calculate_stratified_ea_kappas(predicted_hard_labels, elected_hard_lab group => (per_class=k.per_class_kappas, multiclass=k.multiclass_kappa, n=sum(index))) end - return sort(kappas; by=p -> last(p).multiclass) + kappas = sort(kappas; by=p -> last(p).multiclass) + return [k = v for (k, v) in kappas] end """ @@ -369,7 +370,6 @@ Where... function _calculate_spearman_correlation(predicted_soft_labels, votes, classes) length(classes) > 2 && throw(ArgumentError("Only valid for 2-class problems")) if !all(x -> x ≈ 1, sum(predicted_soft_labels; dims=2)) - @info predicted_soft_labels throw(ArgumentError("Input probabiliities fail softmax assumption")) end diff --git a/src/metrics.jl b/src/metrics.jl index d0bc98c..71f0ab8 100644 --- a/src/metrics.jl +++ b/src/metrics.jl @@ -24,14 +24,14 @@ end accuracy(confusion::AbstractMatrix) Returns the percentage of matching classifications out of total classifications, -or `missing` if `all(iszero, confusion)`. +or `NaN` if `all(iszero, confusion)`. Note that `accuracy(confusion)` is equivalent to overall percent agreement between `confusion`'s row classifier and column classifier. """ function accuracy(confusion::AbstractMatrix) total = sum(confusion) - total == 0 && return missing + total == 0 && return NaN return tr(confusion) / total end @@ -78,15 +78,12 @@ function binary_statistics(confusion::AbstractMatrix, class_index::Integer) false_negative_rate = (false_negatives == 0 && actual_positives == 0) ? (zero(false_negatives) / one(actual_positives)) : (false_negatives / actual_positives) - precision = (true_positives == 0 && predicted_positives == 0) ? missing : + precision = (true_positives == 0 && predicted_positives == 0) ? 
NaN : (true_positives / predicted_positives) - return (predicted_positives=predicted_positives, - predicted_negatives=predicted_negatives, actual_positives=actual_positives, - actual_negatives=actual_negatives, true_positives=true_positives, - true_negatives=true_negatives, false_positives=false_positives, - false_negatives=false_negatives, true_positive_rate=true_positive_rate, - true_negative_rate=true_negative_rate, false_positive_rate=false_positive_rate, - false_negative_rate=false_negative_rate, precision=precision) + return (; predicted_positives, predicted_negatives, actual_positives, actual_negatives, + true_positives, true_negatives, false_positives, false_negatives, + true_positive_rate, true_negative_rate, false_positive_rate, + false_negative_rate, precision) end function binary_statistics(confusion::AbstractMatrix) @@ -105,7 +102,8 @@ Return `(κ, p₀)` where `κ` is Cohen's kappa and `p₀` percent agreement giv their equivalents in [`confusion_matrix`](@ref)). """ function cohens_kappa(class_count, hard_label_pairs) - all(issubset(pair, 1:class_count) for pair in hard_label_pairs) || throw(ArgumentError("Unexpected class in `hard_label_pairs`.")) + all(issubset(pair, 1:class_count) for pair in hard_label_pairs) || + throw(ArgumentError("Unexpected class in `hard_label_pairs`.")) p₀ = accuracy(confusion_matrix(class_count, hard_label_pairs)) pₑ = _probability_of_chance_agreement(class_count, hard_label_pairs) return _cohens_kappa(p₀, pₑ), p₀ @@ -137,7 +135,7 @@ where: - `bins` a vector with `bin_count` `Pairs` specifying the calibration curve's probability bins - `fractions`: a vector where `fractions[i]` is the number of values in `probabilities` - that falls within `bin[i]` over the total number of values within `bin[i]`, or `missing` + that falls within `bin[i]` over the total number of values within `bin[i]`, or `NaN` if the total number of values in `bin[i]` is zero. - `totals`: a vector where `totals[i]` the total number of values within `bin[i]`. - `mean_squared_error`: The mean squared error of `fractions` vs. an ideal calibration curve. @@ -150,12 +148,12 @@ function calibration_curve(probabilities, bitmask; bin_count=10) bins = probability_bins(bin_count) per_bin = [fraction_within(probabilities, bitmask, bin...) for bin in bins] fractions, totals = first.(per_bin), last.(per_bin) - nonempty_indices = findall(!ismissing, fractions) + nonempty_indices = findall(!isnan, fractions) if !isempty(nonempty_indices) ideal = range(mean(first(bins)), mean(last(bins)); length=length(bins)) mean_squared_error = mse(fractions[nonempty_indices], ideal[nonempty_indices]) else - mean_squared_error = missing + mean_squared_error = NaN end return (bins=bins, fractions=fractions, totals=totals, mean_squared_error=mean_squared_error) @@ -179,6 +177,6 @@ function fraction_within(values, bitmask, start, stop) total += 1 end end - fraction = iszero(total) ? missing : (count / total) + fraction = iszero(total) ? 
NaN : (count / total) return (fraction=fraction, total=total) end diff --git a/src/row.jl b/src/row.jl index e645f49..377bcab 100644 --- a/src/row.jl +++ b/src/row.jl @@ -16,22 +16,26 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", confusion_matrix::Union{Missing,Array{Int64}} = vec_to_mat(confusion_matrix), #TODO: file issue to make Matrix{Int64} in future discrimination_calibration_curve::Union{Missing, Tuple{Vector{Float64}, - Vector{Union{Missing, - Float64}}}}, + Vector{Float64}}}, discrimination_calibration_score::Union{Missing,Float64}, multiclass_IRA_kappas::Union{Missing,Float64}, multiclass_kappa::Union{Missing,Float64}, optimal_threshold::Union{Missing,Float64}, optimal_threshold_class::Union{Missing,Int64}, per_class_kappas::Union{Missing,Vector{Float64}}, + stratified_kappas::Union{Missing, + Vector{NamedTuple{(:per_class, + :multiclass, + :n), + Tuple{Vector{Float64}, + Float64, + Int64}}}}, per_class_pr_curves::Union{Missing, Vector{Tuple{Vector{Float64}, - Vector{Union{Missing, - Float64}}}}}, + Vector{Float64}}}}, per_class_reliability_calibration_curves::Union{Missing, Vector{Tuple{Vector{Float64}, - Vector{Union{Missing, - Float64}}}}}, + Vector{Float64}}}}, per_class_reliability_calibration_scores::Union{Missing, Vector{Float64}}, per_class_roc_aucs::Union{Missing,Vector{Float64}}, @@ -40,8 +44,7 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", Vector{Float64}}}}, per_expert_discrimination_calibration_curves::Union{Missing, Vector{Tuple{Vector{Float64}, - Vector{Union{Missing, - Float64}}}}}, + Vector{Float64}}}}, per_expert_discrimination_calibration_scores::Union{Missing, Vector{Float64}}, spearman_correlation::Union{Missing, diff --git a/test/learn.jl b/test/learn.jl index 7361ec5..c887b6f 100644 --- a/test/learn.jl +++ b/test/learn.jl @@ -107,8 +107,7 @@ end strata = [Set("group $(j % Int(ceil(sqrt(j))))" for j in 1:(i - 1)) for i in 1:size(votes, 1)] plot_data = evaluation_metrics(predicted_hard, predicted_soft, elected_hard, - model.classes, 0.0:0.01:1.0; votes=votes, - strata=strata) + model.classes, 0.0:0.01:1.0; votes, strata) @test haskey(plot_data, "stratified_kappas") plot = evaluation_metrics_plot(plot_data) @@ -122,12 +121,14 @@ end votes=votes, strata=strata) @test isequal(plot_data, plot_data2) # check these are the same - @test test_roundtrip_results(plot_data2) + @test test_roundtrip_evaluation(plot_data2) # Test plotting plot_data = last(logger.logged["test_set_evaluation/metrics_per_epoch"]) @test isa(plot_data["thresholds"], AbstractVector) + @test isa(last(plot_data["per_class_pr_curves"]), + Tuple{Vector{Float64},Vector{Float64}}) pr = plot_pr_curves(plot_data["per_class_pr_curves"], plot_data["class_labels"]) @testplot pr @@ -202,7 +203,7 @@ end limit plot_data = last(logger.logged["test_set_evaluation/metrics_per_epoch"]) @test haskey(plot_data, "spearman_correlation") - @test test_roundtrip_results(plot_data) + @test test_roundtrip_evaluation(plot_data) # No `optimal_threshold_class` during learning... 
@test !haskey(plot_data, "optimal_threshold") @@ -217,7 +218,7 @@ end @test haskey(plot_data, "optimal_threshold") @test haskey(plot_data, "optimal_threshold_class") @test plot_data["optimal_threshold_class"] == 2 - @test test_roundtrip_results(plot_data) + @test test_roundtrip_evaluation(plot_data) # `optimal_threshold_class` param invalid @test_throws ArgumentError Lighthouse.learn!(model, logger, () -> train_batches, @@ -243,14 +244,14 @@ end plot_data = last(logger.logged["wheeeeeee/metrics_for_all_time"]) @test !haskey(plot_data, "per_class_IRA_kappas") @test !haskey(plot_data, "multiclass_IRA_kappas") - @test test_roundtrip_results(plot_data) + @test test_roundtrip_evaluation(plot_data) evaluate!(predicted_hard, predicted_soft, elected_hard, model.classes, logger; logger_prefix="wheeeeeee", logger_suffix="_for_all_time", votes=votes) plot_data = last(logger.logged["wheeeeeee/metrics_for_all_time"]) @test haskey(plot_data, "per_class_IRA_kappas") @test haskey(plot_data, "multiclass_IRA_kappas") - @test test_roundtrip_results(plot_data) + @test test_roundtrip_evaluation(plot_data) # Test `evaluate` for different optimal_threshold classes evaluate!(predicted_hard, predicted_soft, elected_hard, model.classes, logger; @@ -261,7 +262,7 @@ end logger_prefix="wheeeeeee", logger_suffix="_for_all_time", votes=votes, optimal_threshold_class=2) plot_data_2 = last(logger.logged["wheeeeeee/metrics_for_all_time"]) - @test test_roundtrip_results(plot_data_2) + @test test_roundtrip_evaluation(plot_data_2) # The thresholds should not be identical (since they are *inclusive* when applied: # values greater than _or equal to_ the threshold are given the class value) diff --git a/test/metrics.jl b/test/metrics.jl index 2254afe..eca833d 100644 --- a/test/metrics.jl +++ b/test/metrics.jl @@ -65,8 +65,8 @@ @test isapprox(stats.precision, 0.5; atol=0.02) @test confusion_matrix(10, ()) == zeros(10, 10) - @test all(ismissing, cohens_kappa(10, ())) - @test ismissing(accuracy(zeros(10, 10))) + @test all(isnan, cohens_kappa(10, ())) + @test isnan(accuracy(zeros(10, 10))) stats = binary_statistics(zeros(10, 10), 1) @test stats.predicted_positives == 0 @test stats.predicted_negatives == 0 @@ -80,7 +80,7 @@ @test stats.true_negative_rate == 1 @test stats.false_positive_rate == 0 @test stats.false_negative_rate == 0 - @test ismissing(stats.precision) + @test isnan(stats.precision) for p in 0:0.1:1 @test Lighthouse._cohens_kappa(p, p) == 0 @@ -103,6 +103,7 @@ end @test bin_count == length(bins) @test first(first(bins)) == 0.0 && last(last(bins)) == 1.0 @test all(!ismissing, fractions) + @test all(!isnan, fractions) @test all(!iszero, totals) @test all(isapprox.(fractions, 0.5; atol=0.02)) @test all(isapprox.(totals, length(probs) / bin_count; atol=1000)) @@ -120,6 +121,7 @@ end @test bin_count == length(bins) @test first(first(bins)) == 0.0 && last(last(bins)) == 1.0 @test all(!ismissing, fractions) + @test all(!isnan, fractions) @test all(!iszero, totals) @test all(isapprox.(fractions, ideal; atol=0.01)) @test all(totals .== 1_000_000 / bin_count) @@ -132,9 +134,21 @@ end @test bin_count == length(bins) @test first(first(bins)) == 0.0 && last(last(bins)) == 1.0 @test all(!ismissing, fractions) + @test all(!isnan, fractions) @test all(!iszero, totals) @test all(isapprox.(fractions, reverse(ideal); atol=0.01)) @test all(totals .== 1_000_000 / bin_count) @test isapprox(ceil(mean(fractions) * length(bitmask)), count(bitmask); atol=1) @test isapprox(mean_squared_error, 1 / 3; atol=0.01) + + # Handle garbage input---ensure 
non-existant results are NaN + probs = fill(-1, 40) + bitmask = zeros(Bool, 40) + bins, fractions, totals, mean_squared_error = calibration_curve(probs, bitmask; + bin_count) + @test bin_count == length(bins) + @test first(first(bins)) == 0.0 && last(last(bins)) == 1.0 + @test all(isnan, fractions) + @test all(iszero, totals) + @test isnan(mean_squared_error) end diff --git a/test/runtests.jl b/test/runtests.jl index c651f6a..ac05304 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,11 +26,29 @@ macro testplot(fig_name) end end -function test_roundtrip_evaluation(row_dict::Dict{String,S}) where S +function test_roundtrip_evaluation(row_dict::Dict{String,S}) where {S} row = Lighthouse.EvaluationRow(row_dict) rt_row = roundtrip_row(row) - @test isequal(rt_row, row) - @test Lighthouse._evaluation_row_dict(rt_row) == row_dict + + # Make sure row roundtrips correctly + @test issetequal(keys(row), keys(rt_row)) + for (k, v) in pairs(row) + if ismissing(v) + @test ismissing(rt_row[k]) + else + @test issetequal(v, rt_row[k]) + end + end + + # Make sure originating dictionary roundtrips correctly + rt_dict = Lighthouse._evaluation_row_dict(rt_row) + for (k, v) in pairs(row_dict) + if ismissing(v) + @test ismissing(rt_dict[k]) + else + @test issetequal(v, rt_dict[k]) + end + end return true end From eb874d343fda3bd4dbf6a0c3b17805f322fad612 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Thu, 17 Mar 2022 23:54:19 +0000 Subject: [PATCH 13/21] Fix test dep --- Project.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index c4fcdcb..cdae0fe 100644 --- a/Project.toml +++ b/Project.toml @@ -22,13 +22,15 @@ CairoMakie = "0.7" Legolas = "0.3" Makie = "0.16.5" StatsBase = "0.33" +Tables = "1.7" TensorBoardLogger = "0.1" julia = "1.6" [extras] CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "CairoMakie", "StableRNGs"] +test = ["Test", "CairoMakie", "StableRNGs", "Tables"] From 8bf1f47c50303366343b77a21ee94220e2ad0665 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 14:30:58 +0000 Subject: [PATCH 14/21] test for inclusion of all metrics --- src/row.jl | 5 +++-- test/learn.jl | 14 +++++++------- test/row.jl | 6 +++--- test/runtests.jl | 15 +++++++++++---- 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/row.jl b/src/row.jl index 377bcab..949287e 100644 --- a/src/row.jl +++ b/src/row.jl @@ -13,7 +13,7 @@ vec_to_mat(x::Missing) = return missing const EVALUATION_ROW_SCHEMA = Legolas.Schema("lighthouse.evaluation@1") const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", class_labels::Union{Missing,Vector{String}}, - confusion_matrix::Union{Missing,Array{Int64}} = vec_to_mat(confusion_matrix), #TODO: file issue to make Matrix{Int64} in future + confusion_matrix::Union{Missing,Array{Int64}} = vec_to_mat(confusion_matrix), discrimination_calibration_curve::Union{Missing, Tuple{Vector{Float64}, Vector{Float64}}}, @@ -22,6 +22,7 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", multiclass_kappa::Union{Missing,Float64}, optimal_threshold::Union{Missing,Float64}, optimal_threshold_class::Union{Missing,Int64}, + per_class_IRA_kappas::Union{Missing,Vector{Float64}}, per_class_kappas::Union{Missing,Vector{Float64}}, stratified_kappas::Union{Missing, Vector{NamedTuple{(:per_class, @@ -78,4 +79,4 @@ Lighthouse v for (k, v) in 
pairs(NamedTuple(row)) if !ismissing(v)) -end \ No newline at end of file +end diff --git a/test/learn.jl b/test/learn.jl index c887b6f..fc46e54 100644 --- a/test/learn.jl +++ b/test/learn.jl @@ -111,7 +111,7 @@ end @test haskey(plot_data, "stratified_kappas") plot = evaluation_metrics_plot(plot_data) - @test test_roundtrip_evaluation(plot_data) + test_evaluation_metrics_roundtrip(plot_data) plot2, plot_data2 = @test_deprecated evaluation_metrics_plot(predicted_hard, predicted_soft, @@ -121,7 +121,7 @@ end votes=votes, strata=strata) @test isequal(plot_data, plot_data2) # check these are the same - @test test_roundtrip_evaluation(plot_data2) + test_evaluation_metrics_roundtrip(plot_data2) # Test plotting plot_data = last(logger.logged["test_set_evaluation/metrics_per_epoch"]) @@ -203,7 +203,7 @@ end limit plot_data = last(logger.logged["test_set_evaluation/metrics_per_epoch"]) @test haskey(plot_data, "spearman_correlation") - @test test_roundtrip_evaluation(plot_data) + test_evaluation_metrics_roundtrip(plot_data) # No `optimal_threshold_class` during learning... @test !haskey(plot_data, "optimal_threshold") @@ -218,7 +218,7 @@ end @test haskey(plot_data, "optimal_threshold") @test haskey(plot_data, "optimal_threshold_class") @test plot_data["optimal_threshold_class"] == 2 - @test test_roundtrip_evaluation(plot_data) + test_evaluation_metrics_roundtrip(plot_data) # `optimal_threshold_class` param invalid @test_throws ArgumentError Lighthouse.learn!(model, logger, () -> train_batches, @@ -244,14 +244,14 @@ end plot_data = last(logger.logged["wheeeeeee/metrics_for_all_time"]) @test !haskey(plot_data, "per_class_IRA_kappas") @test !haskey(plot_data, "multiclass_IRA_kappas") - @test test_roundtrip_evaluation(plot_data) + test_evaluation_metrics_roundtrip(plot_data) evaluate!(predicted_hard, predicted_soft, elected_hard, model.classes, logger; logger_prefix="wheeeeeee", logger_suffix="_for_all_time", votes=votes) plot_data = last(logger.logged["wheeeeeee/metrics_for_all_time"]) @test haskey(plot_data, "per_class_IRA_kappas") @test haskey(plot_data, "multiclass_IRA_kappas") - @test test_roundtrip_evaluation(plot_data) + test_evaluation_metrics_roundtrip(plot_data) # Test `evaluate` for different optimal_threshold classes evaluate!(predicted_hard, predicted_soft, elected_hard, model.classes, logger; @@ -262,7 +262,7 @@ end logger_prefix="wheeeeeee", logger_suffix="_for_all_time", votes=votes, optimal_threshold_class=2) plot_data_2 = last(logger.logged["wheeeeeee/metrics_for_all_time"]) - @test test_roundtrip_evaluation(plot_data_2) + test_evaluation_metrics_roundtrip(plot_data_2) # The thresholds should not be identical (since they are *inclusive* when applied: # values greater than _or equal to_ the threshold are given the class value) diff --git a/test/row.jl b/test/row.jl index 5ec4786..02b18ca 100644 --- a/test/row.jl +++ b/test/row.jl @@ -9,14 +9,14 @@ end @testset "`EvaluationRow`" begin # Basic roundtrip dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3) - @test test_roundtrip_evaluation(dict) + test_evaluation_metrics_roundtrip(dict) # Don't lose extra columns (basic Legolas functionality) extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, "rabbit" => 2432) - @test test_roundtrip_evaluation(extra_dict) + test_evaluation_metrics_roundtrip(extra_dict) # Handle fun cases mat_dict = Dict("confusion_matrix" => [3 5 6; 6 7 8; 9 10 11]) - @test test_roundtrip_evaluation(mat_dict) + test_evaluation_metrics_roundtrip(mat_dict) end \ No newline at end 
of file diff --git a/test/runtests.jl b/test/runtests.jl index ac05304..f152990 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,11 +26,18 @@ macro testplot(fig_name) end end -function test_roundtrip_evaluation(row_dict::Dict{String,S}) where {S} +const EVALUATION_ROW_KEYS = string.(keys(Lighthouse.EvaluationRow())) + +function test_evaluation_metrics_roundtrip(row_dict::Dict{String,S}) where {S} + # Make sure we're capturing all metrics keys in our Schema + keys_not_in_schema = setdiff(keys(row_dict), EVALUATION_ROW_KEYS) + @test isempty(keys_not_in_schema) + + # Do the roundtripping (will fail if schema types do not validate after roundtrip) row = Lighthouse.EvaluationRow(row_dict) rt_row = roundtrip_row(row) - # Make sure row roundtrips correctly + # Make sure full row roundtrips correctly @test issetequal(keys(row), keys(rt_row)) for (k, v) in pairs(row) if ismissing(v) @@ -40,7 +47,7 @@ function test_roundtrip_evaluation(row_dict::Dict{String,S}) where {S} end end - # Make sure originating dictionary roundtrips correctly + # Make sure originating metrics dictionary roundtrips correctly rt_dict = Lighthouse._evaluation_row_dict(rt_row) for (k, v) in pairs(row_dict) if ismissing(v) @@ -49,7 +56,7 @@ function test_roundtrip_evaluation(row_dict::Dict{String,S}) where {S} @test issetequal(v, rt_dict[k]) end end - return true + return nothing end function roundtrip_row(row::Lighthouse.EvaluationRow) From b353c5a5a35bd655b09d96618059a81550927412 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 14:35:53 +0000 Subject: [PATCH 15/21] foiled by my own test case --- test/row.jl | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/test/row.jl b/test/row.jl index 02b18ca..aaec57e 100644 --- a/test/row.jl +++ b/test/row.jl @@ -6,17 +6,15 @@ @test_throws DimensionMismatch Lighthouse.vec_to_mat(collect(1:6)) # Invalid dimensions end -@testset "`EvaluationRow`" begin - # Basic roundtrip +@testset "`EvaluationRow` basics" begin + # Most EvaluationRow testing happens via the `test_evaluation_metrics_roundtrip` + # in test/learn.jl + + # Roundtrip from dict dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3) test_evaluation_metrics_roundtrip(dict) - # Don't lose extra columns (basic Legolas functionality) - extra_dict = Dict("class_labels" => ["foo", "bar"], "multiclass_kappa" => 3, - "rabbit" => 2432) - test_evaluation_metrics_roundtrip(extra_dict) - - # Handle fun cases + # Handle fun case mat_dict = Dict("confusion_matrix" => [3 5 6; 6 7 8; 9 10 11]) test_evaluation_metrics_roundtrip(mat_dict) end \ No newline at end of file From 9c82ab68a5e53763a936bdaf54b6689b67155be0 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 14:49:40 +0000 Subject: [PATCH 16/21] cleanup --- src/row.jl | 2 +- test/row.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/row.jl b/src/row.jl index 949287e..2f973b7 100644 --- a/src/row.jl +++ b/src/row.jl @@ -75,7 +75,7 @@ end Convert [`EvaluationRow`](@ref) into `::Dict{String, Any}` results, as are output by `[`evaluation_metrics`](@ref)` (and predated use of `EvaluationRow` in -Lighthouse v for (k, v) in pairs(NamedTuple(row)) if !ismissing(v)) diff --git a/test/row.jl b/test/row.jl index aaec57e..9f836db 100644 --- a/test/row.jl +++ b/test/row.jl @@ -17,4 +17,4 @@ end # Handle fun case mat_dict = Dict("confusion_matrix" => [3 5 6; 6 7 8; 9 10 11]) test_evaluation_metrics_roundtrip(mat_dict) -end \ No newline at end of file +end From 
abaa14e2132baf41318e1970dcad73af53e84e38 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 16:41:41 +0000 Subject: [PATCH 17/21] Add new docstrings to docs --- docs/src/index.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/src/index.md b/docs/src/index.md index 36e6b76..2fe0b6f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -39,6 +39,9 @@ binary_statistics cohens_kappa calibration_curve Lighthouse.evaluation_metrics +Lighthouse._evaluation_row_dict +Lighthouse.Row +Lighthouse.evaluation_metrics_row ``` ## Utilities From 4a999ce5fe7ef2934e25ed17ce319f502f75e387 Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 17:09:26 +0000 Subject: [PATCH 18/21] fix docs --- docs/src/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/index.md b/docs/src/index.md index 2fe0b6f..00d25be 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -40,7 +40,7 @@ cohens_kappa calibration_curve Lighthouse.evaluation_metrics Lighthouse._evaluation_row_dict -Lighthouse.Row +Lighthouse.Row :: Union{Tuple{Dict}, Tuple{S}} where S<:Legolas.Schema{Symbol("lighthouse.evaluation"), 1} Lighthouse.evaluation_metrics_row ``` From 3af9d23a4dfdd337d86eb0541b4cf9ece384feac Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 17:54:28 +0000 Subject: [PATCH 19/21] fix docstring --- docs/src/index.md | 2 +- src/row.jl | 61 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 57 insertions(+), 6 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index 00d25be..bbe63b6 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -40,7 +40,7 @@ cohens_kappa calibration_curve Lighthouse.evaluation_metrics Lighthouse._evaluation_row_dict -Lighthouse.Row :: Union{Tuple{Dict}, Tuple{S}} where S<:Legolas.Schema{Symbol("lighthouse.evaluation"), 1} +Lighthouse.EvaluationRow Lighthouse.evaluation_metrics_row ``` diff --git a/src/row.jl b/src/row.jl index 2f973b7..f036f41 100644 --- a/src/row.jl +++ b/src/row.jl @@ -11,7 +11,9 @@ vec_to_mat(x::Missing) = return missing # Redefinition is workaround for https://github.com/beacon-biosignals/Legolas.jl/issues/9 const EVALUATION_ROW_SCHEMA = Legolas.Schema("lighthouse.evaluation@1") -const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", + +""" + const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", class_labels::Union{Missing,Vector{String}}, confusion_matrix::Union{Missing,Array{Int64}} = vec_to_mat(confusion_matrix), discrimination_calibration_curve::Union{Missing, @@ -57,13 +59,62 @@ const EvaluationRow = Legolas.@row("lighthouse.evaluation@1", Float64, Float64}}}, thresholds::Union{Missing,Vector{Float64}}) - -""" EvaluationRow(evaluation_row_dict::Dict{String, Any}) -> EvaluationRow -Convert `Dict` of [`evaluation_metrics`](@ref) results (e.g. from Lighthouse v for (k, v) in pairs(evaluation_row_dict))...) 
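Taken together, the patches above are meant to support a full metrics round trip: a `Dict` of evaluation metrics becomes an `EvaluationRow`, is serialized to Arrow via Legolas, and is recovered as an equivalent `Dict`. A minimal sketch, assuming the patch series up to this point is applied and using only names that appear in the diffs (`Lighthouse.EvaluationRow`, `Lighthouse.EVALUATION_ROW_SCHEMA`, `Lighthouse._evaluation_row_dict`); the metric values themselves are illustrative:

    using Lighthouse, Legolas, Tables

    # A metrics dictionary of the kind produced by `evaluation_metrics`
    # (hypothetical values; only a few of the schema's fields are populated).
    metrics = Dict("class_labels" => ["class_1", "class_2"],
                   "multiclass_kappa" => 0.8,
                   "confusion_matrix" => [10 2; 3 15])

    # Dict -> row: string keys map onto schema fields; `vec_to_mat` normalizes the matrix field.
    row = Lighthouse.EvaluationRow(metrics)

    # Row -> Arrow -> row: matrices survive because they are packed/unpacked as vectors.
    path = joinpath(mktempdir(), "evaluation.arrow")
    Legolas.write(path, [row], Lighthouse.EVALUATION_ROW_SCHEMA)
    rt_row = Lighthouse.EvaluationRow(only(Tables.rows(Legolas.read(path))))

    # Row -> Dict: `missing` fields are dropped on the way back out.
    metrics_rt = Lighthouse._evaluation_row_dict(rt_row)
    @assert isequal(metrics_rt["multiclass_kappa"], metrics["multiclass_kappa"])

This mirrors the `roundtrip_row`/`test_evaluation_metrics_roundtrip` helpers added to test/runtests.jl, which additionally check that every key in the metrics dictionary is covered by the schema.
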
From 0067fec3a715f01a5b3199d783274b35f9d487ad Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 15:23:37 -0400 Subject: [PATCH 20/21] export EvaluationRow --- docs/src/index.md | 2 +- src/Lighthouse.jl | 2 +- test/runtests.jl | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index bbe63b6..3ccffbf 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -38,9 +38,9 @@ accuracy binary_statistics cohens_kappa calibration_curve +EvaluationRow Lighthouse.evaluation_metrics Lighthouse._evaluation_row_dict -Lighthouse.EvaluationRow Lighthouse.evaluation_metrics_row ``` diff --git a/src/Lighthouse.jl b/src/Lighthouse.jl index 4ce5c3e..9fc3ce1 100644 --- a/src/Lighthouse.jl +++ b/src/Lighthouse.jl @@ -21,7 +21,7 @@ include("classifier.jl") export AbstractClassifier include("row.jl") -# TODO: export EvaluationRow ? +export EvaluationRow include("learn.jl") export LearnLogger, learn!, upon, evaluate!, predict! diff --git a/test/runtests.jl b/test/runtests.jl index f152990..c92ff9f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,7 +26,7 @@ macro testplot(fig_name) end end -const EVALUATION_ROW_KEYS = string.(keys(Lighthouse.EvaluationRow())) +const EVALUATION_ROW_KEYS = string.(keys(EvaluationRow())) function test_evaluation_metrics_roundtrip(row_dict::Dict{String,S}) where {S} # Make sure we're capturing all metrics keys in our Schema @@ -34,7 +34,7 @@ function test_evaluation_metrics_roundtrip(row_dict::Dict{String,S}) where {S} @test isempty(keys_not_in_schema) # Do the roundtripping (will fail if schema types do not validate after roundtrip) - row = Lighthouse.EvaluationRow(row_dict) + row = EvaluationRow(row_dict) rt_row = roundtrip_row(row) # Make sure full row roundtrips correctly @@ -59,11 +59,11 @@ function test_evaluation_metrics_roundtrip(row_dict::Dict{String,S}) where {S} return nothing end -function roundtrip_row(row::Lighthouse.EvaluationRow) +function roundtrip_row(row::EvaluationRow) p = mktempdir() * "rt_test.arrow" tbl = [row] Legolas.write(p, tbl, Lighthouse.EVALUATION_ROW_SCHEMA) - return Lighthouse.EvaluationRow(only(Tables.rows(Legolas.read(p)))) + return EvaluationRow(only(Tables.rows(Legolas.read(p)))) end include("plotting.jl") From 72be90baa658dcefd2c25fb3850a79f2a340465d Mon Sep 17 00:00:00 2001 From: hannahilea Date: Fri, 18 Mar 2022 15:38:18 -0400 Subject: [PATCH 21/21] remove uneeded dep --- Project.toml | 2 -- src/Lighthouse.jl | 1 - 2 files changed, 3 deletions(-) diff --git a/Project.toml b/Project.toml index cdae0fe..93d50af 100644 --- a/Project.toml +++ b/Project.toml @@ -4,7 +4,6 @@ authors = ["Beacon Biosignals, Inc."] version = "0.14.0" [deps] -Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Legolas = "741b9549-f6ed-4911-9fbf-4a1c0c97f0cd" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -17,7 +16,6 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TensorBoardLogger = "899adc3e-224a-11e9-021f-63837185c80f" [compat] -Arrow = "2.2" CairoMakie = "0.7" Legolas = "0.3" Makie = "0.16.5" diff --git a/src/Lighthouse.jl b/src/Lighthouse.jl index 9fc3ce1..6ed6c3a 100644 --- a/src/Lighthouse.jl +++ b/src/Lighthouse.jl @@ -7,7 +7,6 @@ using TensorBoardLogger using Makie using Printf using Legolas -using Arrow include("plotting.jl")
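For reference, the matrix pack/unpack workaround that the round trip relies on (`vec_to_mat`, introduced because Arrow flattens matrix-valued columns) can be exercised on its own. A small sketch using only `Lighthouse.vec_to_mat` as defined in the diffs above; the example matrix is arbitrary:

    using Lighthouse

    mat = [3 5 6; 6 7 8; 9 10 11]

    # On read, a matrix-valued column comes back from Arrow as a flat (column-major) vector...
    packed = vec(mat)

    # ...and `vec_to_mat` recovers the square matrix via `isqrt` + `reshape`.
    @assert Lighthouse.vec_to_mat(packed) == mat

    # Matrices pass through untouched, and `missing` stays `missing`.
    @assert Lighthouse.vec_to_mat(mat) == mat
    @assert ismissing(Lighthouse.vec_to_mat(missing))

    # A vector whose length is not a perfect square cannot be a flattened square
    # matrix, so `vec_to_mat(collect(1:6))` throws a DimensionMismatch.

Since confusion matrices are always square, the `isqrt`-based reshape is sufficient here; a non-square matrix field would need its shape stored alongside the data.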