feature: added support for missing
xiaodaigh committed Oct 18, 2024
1 parent 4d87875 commit ed3b7ca
Showing 9 changed files with 105 additions and 35 deletions.
8 changes: 0 additions & 8 deletions README.jmd
@@ -1,11 +1,3 @@
# No more development; not even bug fixes in the foreseeable future

Due to life changes, I have no time to handle this Open Source project, so it will be archived until I can come back to it.

I will refocus my energy on only a couple of open source packages, one of them being {disk.frame}.


# JLBoost.jl

This is a 100%-Julia implementation of Gradient Boosting Regression Trees (GBRT) based heavily on the algorithms published in the XGBoost, LightGBM and CatBoost papers. GBRT is also referred to as Gradient Boosting Decision Tree (GBDT).
15 changes: 13 additions & 2 deletions src/JLBoostTrees/filter_tbl_by_splits.jl
@@ -25,10 +25,21 @@ function keeprow_vec(tbl, node::Union{Nothing, AbstractJLBoostTree})::BitArray
keeprow = keeprow_vec(tbl, node.parent)
end

tmp = getproperty(tblc, node.parent.splitfeature)
if is_left_child(node)
keeprow .&= getproperty(tblc, node.parent.splitfeature) .<= node.parent.split
# TODO fix this
# @warn "Currently assuming missing go left; this NEEDS to be FIXED"
keeprow .&= ismissing.(tmp) .|| tmp .<= node.parent.split
# keeprow .&= tmp .<= node.parent.split
else
keeprow .&= getproperty(tblc, node.parent.splitfeature) .> node.parent.split
# @warn "Currently assuming missing go left; this NEEDS to be FIXED"
# keeprow .&= .!ismissing.(tmp) .|| tmp .> node.parent.split
# the above causes errors so apply De Morgan's law
# not(missing) | x > y = not not (not(missing) | x > y)
# = not (missing | x <= y) # by De Morgan's law
# keeping the above as reminder of how dumb I am. Just negate man
keeprow .&= .!(ismissing.(tmp) .|| tmp .<= node.parent.split)
# keeprow .&= tmp .> node.parent.split
end

keeprow
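A minimal sketch of the three-valued-logic pitfall this negation avoids (illustrative data; `split` stands in for `node.parent.split`):

x = [1.0, missing, 3.0]
split = 2.0

# left branch: a `true` from ismissing short-circuits `.||`, so missings go left
ismissing.(x) .|| x .<= split          # true, true, false

# naive right branch: `false .|| missing` is `missing`, so the result is a
# Union{Missing, Bool} vector and errors once combined into a BitArray
bad = .!ismissing.(x) .|| x .> split   # true, missing, true
keeprow = trues(3)
# keeprow .&= bad                      # ERROR: cannot convert missing to Bool

# negating the left-branch predicate stays in two-valued Bool logic
keeprow .&= .!(ismissing.(x) .|| x .<= split)  # false, false, true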
1 change: 1 addition & 0 deletions src/JLBoostTrees/tree-and-tree-models.jl
@@ -26,6 +26,7 @@ mutable struct JLBoostTree{T} <: AbstractJLBoostTree{T}
weight
parent::Union{JLBoostTree, Nothing}
children::AbstractVector{AbstractJLBoostTree} # deliberately kept as a vector of AbstractJLBoostTree, because we can genuinely mix and match node types in here
# TODO store the split as a FeatureSplitPredicate so it can be generalised to include missing
splitfeature
split
gain
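As an aside on that TODO: a hypothetical sketch of what such a predicate type could look like. `FeatureSplitPredicate` does not exist in the codebase yet; its fields and the missing-routing flag are assumptions.

# Hypothetical, not part of this commit: one way to generalise the stored
# (splitfeature, split) pair so the missing direction is explicit per node.
struct FeatureSplitPredicate{T}
    feature::Symbol
    split::T
    missing_go_left::Bool
end

# A row goes left iff the predicate holds; missings follow the stored flag.
(p::FeatureSplitPredicate)(x) = ismissing(x) ? p.missing_go_left : x <= p.split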
27 changes: 24 additions & 3 deletions src/find_best_split.jl
@@ -18,13 +18,12 @@ Does not assume that Feature, target, and warmstart are sorted and will sort the
function find_best_split(loss, df, feature::Symbol, target::Symbol, warmstart::AbstractVector, lambda, gamma; verbose = false, kwargs...)
@assert Tables.istable(df)


dfc = Tables.columns(df)

x = getproperty(dfc, feature)

if verbose
@info "find_best_split(): Calculating a split on `$feature` with extrema $(extrema(x))"
@info "find_best_split(): Calculating a split on `$feature` with extrema $(extrema(x |> skipmissing))"
end
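(With missings present, `extrema` propagates `missing` — e.g. `extrema([1.0, missing])` returns `(missing, missing)` — so the log line pipes through `skipmissing` first: `extrema(skipmissing([1.0, missing, 3.0]))` gives `(1.0, 3.0)`.)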


@@ -77,6 +76,26 @@ function _find_best_split(loss, feature, target, warmstart, lambda::Number, gamm
@assert length(target) == length(feature)
@assert length(warmstart) == length(feature)


# if the feature vector can contain missings, find where they start;
# feature is assumed sorted here, so any missings form a contiguous tail
non_missing_ends = length(feature)
if Missing <: eltype(feature)
pos = searchsortedfirst(feature, missing)

if pos > non_missing_ends # this means it couldn't find any missing
# do nothing
elseif pos == 1
# all features are missing
return (split_at = missing, cutpt = missing, gain = missing, lweight = missing, rweight = missing, should_split_further = false)
else
non_missing_ends = pos - 1

feature = @view feature[1:non_missing_ends]
target = @view target[1:non_missing_ends]
warmstart = @view warmstart[1:non_missing_ends]
end
end

# TODO maybe use some kind of argmax here
# TODO can reduce allocations here by skipping the broadcasting .
cg = cumsum(g.(loss, target, warmstart))
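The missing-handling above leans on two standard-library facts: `sort` places `missing` last under `isless`, and `searchsortedfirst` can therefore locate the missing tail with a binary search. A small sketch with illustrative data:

v = sort([3.0, missing, 1.0, missing, 2.0])  # [1.0, 2.0, 3.0, missing, missing]
pos = searchsortedfirst(v, missing)          # 4: index of the first missing
nonmissing = @view v[1:pos-1]                # [1.0, 2.0, 3.0]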
@@ -131,8 +150,10 @@ function _find_best_split(loss, feature, target, warmstart, lambda::Number, gamm
end
end

# TODO if should_split_further, also handle routing the missing rows in later splits
# @info "Got here: $(count(ismissing, feature))"
# TODO return a type
(split_at = split_at, cutpt = cutpt, gain = best_gain, lweight = lweight, rweight = rweight, should_split_further = should_split_further)
(split_at = split_at, cutpt = cutpt, gain = best_gain, lweight = lweight, rweight = rweight, should_split_further = should_split_further, missing_go_left = true)
end
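A hedged sketch of consuming the widened result; the literal values below are made up, `go_left` is a hypothetical helper, and `missing_go_left` is the field this commit adds (currently hard-coded to `true`):

best = (split_at = 2.0, cutpt = 3, gain = 1.5, lweight = 0.1, rweight = -0.2,
        should_split_further = true, missing_go_left = true)

# route a value the way the fitted split would
go_left(x) = ismissing(x) ? best.missing_go_left : x <= best.split_at
go_left(missing), go_left(1.0), go_left(5.0)   # (true, true, false)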

# TODO more research into GPU friendliness
9 changes: 4 additions & 5 deletions src/fit_tree.jl
@@ -86,7 +86,7 @@ function _fit_tree!(loss, tbl, target, features, warm_start,
# this shouldn't be reset and should be placed outside of the while loop below


while !no_more_gains_to_found && !stopping_criterion(jlt)
while (!no_more_gains_to_found) && !stopping_criterion(jlt)
if verbose
@info "BEST SPLIT PHASE: Tree Depth=$(treedepth(jlt))"
end
@@ -153,7 +153,7 @@ function _fit_tree!(loss, tbl, target, features, warm_start,
end

if verbose
@info("BEST SPLIT PHASE: found a best split at $(split_with_best_gain.feature) <= $(split_with_best_gain.split_at); gain:$(split_with_best_gain.gain) further:$(split_with_best_gain.should_split_further) for $(leaf_node)")
@info("BEST SPLIT PHASE: found a best split at `$(split_with_best_gain.feature)`` <= $(split_with_best_gain.split_at); gain:$(split_with_best_gain.gain) further:$(split_with_best_gain.should_split_further) for $(leaf_node)")
end

best_split_dict[leaf_node] = split_with_best_gain
@@ -169,9 +169,8 @@ function _fit_tree!(loss, tbl, target, features, warm_start,

# tree_growth phase
# select the node to grow based on growth function
# the tree_growth function will return the list of
# nodes_to_split = tree_growth(jlt)
nodes_to_split::Vector{<:AbstractJLBoostTree} = tree_growth(jlt)
# the tree_growth function needs to return an iterable of JLBoost trees
nodes_to_split = tree_growth(jlt)
if verbose
@info "TREE GROWTH PHASE: Found $(length(nodes_to_split)) node-candidates to split"
end
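Since `tree_growth` now only needs to return an iterable of nodes, a custom growth policy can be passed in. A minimal sketch of one such policy — splitting every current leaf — written against the `children` field used elsewhere in this diff; this exact function is not in the codebase:

# Hypothetical growth policy: return every leaf of the current tree.
function grow_all_leaves(jlt)
    if length(jlt.children) == 0
        return [jlt]
    end
    return reduce(vcat, grow_all_leaves.(jlt.children))
end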
7 changes: 1 addition & 6 deletions src/jlboost-fit.jl
@@ -34,8 +34,6 @@ see https://xgboost.readthedocs.io/en/latest/parameter.html
* monotone_constraints: Not yet implemented
* interaction_constraints: Not yet implemented
"""


function jlboost(df, target::Union{Symbol, String}; kwargs...)
target = Symbol(target)
warm_start = fill(0.0, nrow(df))
@@ -107,6 +105,7 @@ function jlboost(df, target, features, warm_start::AbstractVector,
target = Symbol(target)
features = Symbol.(features)

# TODO get only the needed columns from the table
# dfc = Tables.columns(df)
dfc = df

@@ -130,14 +129,10 @@ function jlboost(df, target, features, warm_start::AbstractVector,
warm_start = predict(res_jlt[1:nround-1], dfs)
end

# println(nround)
# println(dfc)

new_jlt = _fit_tree!(loss, dfc, target, features_sample, warm_start, JLBoostTree(0.0),
tree_growth,
stopping_criterion; verbose=verbose, kwargs...);

# println("mehmehmehmeh")
# append the tree fitted in this round
push!(res_jlt, eta*deepcopy(new_jlt))
end
15 changes: 9 additions & 6 deletions src/predict.jl
@@ -15,16 +15,16 @@ Apply the fitted model on data.
* df - a Tables.jl compatible Table
"""

(jlt::JLBoostTreeModel)(args...) = predict(jlt, args...)
(jlt::AbstractJLBoostTree)(args...) = predict(jlt, args...)

(jltm::JLBoostTreeModel)(args...) = predict(jltm, args...)

(jlt::AbstractArray{JLBoostTreeModel})(args...) = predict(jlt, args...)

predict(jlt::JLBoostTreeModel, df) = predict(trees(jlt), df)

predict(jlt::JLBoostTreeModel, df::AbstractDataFrame) = predict(trees(jlt), df)

# defaults to Float64 (double) as output
predict(jlt::AbstractJLBoostTree, df) = predict(jlt::AbstractJLBoostTree, df, Float64)
predict(jlt::AbstractJLBoostTree, df) = predict(jlt, df, Float64)
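These callable-object methods are what let a fitted model be applied directly, as the tutorial below does. Schematically (the fit call is illustrative):

# model = jlboost(data, :target)   # fit
# model(data)                      # equivalent to predict(model, data)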

function predict(jlt::AbstractJLBoostTree, df, out_eltype::Type{T})::Vector{T} where {T <: Number}
# TODO a more efficient algorithm. Currently there are too many assignbools being
@@ -43,10 +43,13 @@ end

function predict!(jlt::JLBoostTree, df, res, assignbool)
if length(jlt.children) == 2
new_assignbool = assignbool .& (getproperty(Tables.columns(df), jlt.splitfeature) .<= jlt.split)
tmp = getproperty(Tables.columns(df), jlt.splitfeature)

# missings go left; parenthesise the OR, since `a .& b .|| c`
# parses as `(a .& b) .|| c` and would ignore `assignbool`
new_assignbool = assignbool .& (ismissing.(tmp) .|| (tmp .<= jlt.split))
predict!(jlt.children[1], df, res, new_assignbool)

new_assignbool .= assignbool .& (getproperty(Tables.columns(df), jlt.splitfeature) .> jlt.split)
new_assignbool .= assignbool .& .!(ismissing.(tmp) .|| (tmp .<= jlt.split))
predict!(jlt.children[2], df, res, new_assignbool)
elseif length(jlt.children) == 0
res[assignbool] .+= jlt.weight
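The parenthesisation above matters because dotted `&` binds tighter than dotted `||`; a quick sketch with throwaway vectors:

a = [false];  b = [false];  c = [true]

a .& b .|| c     # parses as (a .& b) .|| c → [true]
a .& (b .|| c)   # the intended grouping    → [false]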
57 changes: 53 additions & 4 deletions tutorial/give-me-some-credit/1-code.jl
@@ -1,21 +1,70 @@
using Revise
using JLBoost, CSV, JDF, TidierData
using JLBoost: is_left_child
using AbstractTrees: isroot
using DataFrames
using TidierData: @clean_names, @group_by, @summarise
using Chain: @chain

data = @chain JDF.load("cs-training.jdf") begin
    DataFrame
    # @select -(monthly_income, number_of_dependents)
    @mutate revolving_utilization_of_unsecured_lines = round(revolving_utilization_of_unsecured_lines, digits=2)
end

jlboost(data, :serious_dlqin2yrs, [:monthly_income]; verbose=true, max_depth=2)

jlboost(data, :serious_dlqin2yrs; verbose=true, max_depth=2)

test = @chain JDF.load("cs-test.jdf") begin
    DataFrame
    @select -(monthly_income, number_of_dependents)
    @mutate revolving_utilization_of_unsecured_lines = round(revolving_utilization_of_unsecured_lines, digits=2)
end

names(data)

idx = rand(1:5, nrow(data))


model = jlboost(data, :serious_dlqin2yrs; nrounds=1, max_depth=4)
gini(-model(data), data[!, :serious_dlqin2yrs])

data[!, :group] = idx

@chain data begin
    groupby(:group)
    combine(_) do subdf
        gini(-model(subdf), subdf[!, :serious_dlqin2yrs])
    end
end

# sweep nrounds and compare training gini across rounds
for nrounds in 2:8
    model = jlboost(data, :serious_dlqin2yrs; nrounds=nrounds, max_depth=2)
    @show nrounds gini(-model(data), data[!, :serious_dlqin2yrs])
end

function fit_score_card(data, target, features)
    warm_start = fill(0.0, nrow(data))
    gini = 0
1 change: 0 additions & 1 deletion tutorial/give-me-some-credit/Project.toml
@@ -3,5 +3,4 @@ AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
JDF = "babc3d20-cd49-4f60-a736-a8f9c08892d3"
JLBoost = "13d6d4a1-5e7f-472c-9ebc-8123a4fbb95f"
Memoize = "c03570c3-d221-55d1-a50c-7939bbd78826"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
