feature: added support for missing
xiaodaigh committed Oct 18, 2024
1 parent 4d87875 commit ed3b7ca
Showing 9 changed files with 105 additions and 35 deletions.
8 changes: 0 additions & 8 deletions README.jmd
@@ -1,11 +1,3 @@
# No more development; not even bug fixes in the foreseeable future

Due to life changes, I have no time to handle this Open Source project, so it will be archived until I can come back to it.

I will refocus my energy on only a couple of open source packages, one of them being {disk.frame}.


# JLBoost.jl

This is a 100%-Julia implementation of Gradient Boosting Regression Trees (GBRT) based heavily on the algorithms published in the XGBoost, LightGBM and CatBoost papers. GBRT is also referred to as Gradient Boosting Decision Tree (GBDT).
15 changes: 13 additions & 2 deletions src/JLBoostTrees/filter_tbl_by_splits.jl
@@ -25,10 +25,21 @@ function keeprow_vec(tbl, node::Union{Nothing, AbstractJLBoostTree})::BitArray
keeprow = keeprow_vec(tbl, node.parent)
end

tmp = getproperty(tblc, node.parent.splitfeature)
if is_left_child(node)
keeprow .&= getproperty(tblc, node.parent.splitfeature) .<= node.parent.split
# TODO fix this
# @warn "Currently assuming missing go left; this NEEDS to be FIXED"
keeprow .&= ismissing.(tmp) .|| tmp .<= node.parent.split
# keeprow .&= tmp .<= node.parent.split
else
keeprow .&= getproperty(tblc, node.parent.splitfeature) .> node.parent.split
# @warn "Currently assuming missing go left; this NEEDS to be FIXED"
# keeprow .&= .!ismissing.(tmp) .|| tmp .> node.parent.split
# the above causes errors so apply De Morgan's law
# not(missing) | x > y = not not (not(missing) | x > y)
# = not (missing | x <= y) # by De Morgan's law
# keeping the above as reminder of how dumb I am. Just negate man
keeprow .&= .!(ismissing.(tmp) .|| tmp .<= node.parent.split)
# keeprow .&= tmp .> node.parent.split
end

keeprow
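A minimal sketch of the three-valued-logic pitfall this negation avoids (illustrative data; `split` stands in for `node.parent.split`):

x = [1.0, missing, 3.0]
split = 2.0

# left branch: a `true` from ismissing short-circuits `.||`, so missings go left
ismissing.(x) .|| x .<= split          # true, true, false

# naive right branch: `false .|| missing` is `missing`, so the result is a
# Union{Missing, Bool} vector and errors once combined into a BitArray
bad = .!ismissing.(x) .|| x .> split   # true, missing, true
keeprow = trues(3)
# keeprow .&= bad                      # ERROR: cannot convert missing to Bool

# negating the left-branch predicate stays in two-valued Bool logic
keeprow .&= .!(ismissing.(x) .|| x .<= split)  # false, false, true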
1 change: 1 addition & 0 deletions src/JLBoostTrees/tree-and-tree-models.jl
@@ -26,6 +26,7 @@ mutable struct JLBoostTree{T} <: AbstractJLBoostTree{T}
weight
parent::Union{JLBoostTree, Nothing}
children::AbstractVector{AbstractJLBoostTree} # deliberately kept as a vector of AbstractJLBoostTree, because we can genuinely mix and match node types in here
# TODO store the split as a FeatureSplitPredicate so it can be generalised to include missing
splitfeature
split
gain
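As an aside on that TODO: a hypothetical sketch of what such a predicate type could look like. `FeatureSplitPredicate` does not exist in the codebase yet; its fields and the missing-routing flag are assumptions.

# Hypothetical, not part of this commit: one way to generalise the stored
# (splitfeature, split) pair so the missing direction is explicit per node.
struct FeatureSplitPredicate{T}
    feature::Symbol
    split::T
    missing_go_left::Bool
end

# A row goes left iff the predicate holds; missings follow the stored flag.
(p::FeatureSplitPredicate)(x) = ismissing(x) ? p.missing_go_left : x <= p.split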
27 changes: 24 additions & 3 deletions src/find_best_split.jl
@@ -18,13 +18,12 @@ Does not assume that Feature, target, and warmstart are sorted and will sort the
function find_best_split(loss, df, feature::Symbol, target::Symbol, warmstart::AbstractVector, lambda, gamma; verbose = false, kwargs...)
@assert Tables.istable(df)


dfc = Tables.columns(df)

x = getproperty(dfc, feature)

if verbose
@info "find_best_split(): Calculating a split on `$feature` with extrema $(extrema(x))"
@info "find_best_split(): Calculating a split on `$feature` with extrema $(extrema(x |> skipmissing))"
end
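(With missings present, `extrema` propagates `missing` — e.g. `extrema([1.0, missing])` returns `(missing, missing)` — so the log line pipes through `skipmissing` first: `extrema(skipmissing([1.0, missing, 3.0]))` gives `(1.0, 3.0)`.)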


@@ -77,6 +76,26 @@ function _find_best_split(loss, feature, target, warmstart, lambda::Number, gamm
@assert length(target) == length(feature)
@assert length(warmstart) == length(feature)


# if the feature vector can contain missings, find where they start;
# feature is assumed sorted here, so any missings form a contiguous tail
non_missing_ends = length(feature)
if Missing <: eltype(feature)
pos = searchsortedfirst(feature, missing)

if pos > non_missing_ends # this means it couldn't find any missing
# do nothing
elseif pos == 1
# all features are missing
return (split_at = missing, cutpt = missing, gain = missing, lweight = missing, rweight = missing, should_split_further = false)
else
non_missing_ends = pos - 1

feature = @view feature[1:non_missing_ends]
target = @view target[1:non_missing_ends]
warmstart = @view warmstart[1:non_missing_ends]
end
end

# TODO maybe use some kind of argmax here
# TODO can reduce allocations here by skipping the broadcasting .
cg = cumsum(g.(loss, target, warmstart))
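The missing-handling above leans on two standard-library facts: `sort` places `missing` last under `isless`, and `searchsortedfirst` can therefore locate the missing tail with a binary search. A small sketch with illustrative data:

v = sort([3.0, missing, 1.0, missing, 2.0])  # [1.0, 2.0, 3.0, missing, missing]
pos = searchsortedfirst(v, missing)          # 4: index of the first missing
nonmissing = @view v[1:pos-1]                # [1.0, 2.0, 3.0]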
@@ -131,8 +150,10 @@ function _find_best_split(loss, feature, target, warmstart, lambda::Number, gamm
end
end

# TODO if should_split_further, also handle routing the missing rows in later splits
# @info "Got here: $(count(ismissing, feature))"
# TODO return a type
(split_at = split_at, cutpt = cutpt, gain = best_gain, lweight = lweight, rweight = rweight, should_split_further = should_split_further)
(split_at = split_at, cutpt = cutpt, gain = best_gain, lweight = lweight, rweight = rweight, should_split_further = should_split_further, missing_go_left = true)
end
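A hedged sketch of consuming the widened result; the literal values below are made up, `go_left` is a hypothetical helper, and `missing_go_left` is the field this commit adds (currently hard-coded to `true`):

best = (split_at = 2.0, cutpt = 3, gain = 1.5, lweight = 0.1, rweight = -0.2,
        should_split_further = true, missing_go_left = true)

# route a value the way the fitted split would
go_left(x) = ismissing(x) ? best.missing_go_left : x <= best.split_at
go_left(missing), go_left(1.0), go_left(5.0)   # (true, true, false)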

# TODO more research into GPU friendliness
9 changes: 4 additions & 5 deletions src/fit_tree.jl
@@ -86,7 +86,7 @@ function _fit_tree!(loss, tbl, target, features, warm_start,
# this shouldn't be reset and should be placed outside of the while loop below


while !no_more_gains_to_found && !stopping_criterion(jlt)
while (!no_more_gains_to_found) && !stopping_criterion(jlt)
if verbose
@info "BEST SPLIT PHASE: Tree Depth=$(treedepth(jlt))"
end
@@ -153,7 +153,7 @@ function _fit_tree!(loss, tbl, target, features, warm_start,
end

if verbose
@info("BEST SPLIT PHASE: found a best split at $(split_with_best_gain.feature) <= $(split_with_best_gain.split_at); gain:$(split_with_best_gain.gain) further:$(split_with_best_gain.should_split_further) for $(leaf_node)")
@info("BEST SPLIT PHASE: found a best split at `$(split_with_best_gain.feature)`` <= $(split_with_best_gain.split_at); gain:$(split_with_best_gain.gain) further:$(split_with_best_gain.should_split_further) for $(leaf_node)")
end

best_split_dict[leaf_node] = split_with_best_gain
@@ -169,9 +169,8 @@ function _fit_tree!(loss, tbl, target, features, warm_start,

# tree_growth phase
# select the node to grow based on growth function
# the tree_growth function will return the list of
# nodes_to_split = tree_growth(jlt)
nodes_to_split::Vector{<:AbstractJLBoostTree} = tree_growth(jlt)
# the tree_growth function needs to return an iterable of JLBoost trees
nodes_to_split = tree_growth(jlt)
if verbose
@info "TREE GROWTH PHASE: Found $(length(nodes_to_split)) node-candidates to split"
end
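Since `tree_growth` now only needs to return an iterable of nodes, a custom growth policy can be passed in. A minimal sketch of one such policy — splitting every current leaf — written against the `children` field used elsewhere in this diff; this exact function is not in the codebase:

# Hypothetical growth policy: return every leaf of the current tree.
function grow_all_leaves(jlt)
    if length(jlt.children) == 0
        return [jlt]
    end
    return reduce(vcat, grow_all_leaves.(jlt.children))
end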
7 changes: 1 addition & 6 deletions src/jlboost-fit.jl
@@ -34,8 +34,6 @@ see https://xgboost.readthedocs.io/en/latest/parameter.html
* monotone_constraints: Not yet implemented
* interaction_constraints: Not yet implemented
"""


function jlboost(df, target::Union{Symbol, String}; kwargs...)
target = Symbol(target)
warm_start = fill(0.0, nrow(df))
@@ -107,6 +105,7 @@ function jlboost(df, target, features, warm_start::AbstractVector,
target = Symbol(target)
features = Symbol.(features)

# TODO get only the needed columns from the table
# dfc = Tables.columns(df)
dfc = df

@@ -130,14 +129,10 @@ function jlboost(df, target, features, warm_start::AbstractVector,
warm_start = predict(res_jlt[1:nround-1], dfs)
end

# println(nround)
# println(dfc)

new_jlt = _fit_tree!(loss, dfc, target, features_sample, warm_start, JLBoostTree(0.0),
tree_growth,
stopping_criterion; verbose=verbose, kwargs...);

# println("mehmehmehmeh")
# append the tree fitted in this round
push!(res_jlt, eta*deepcopy(new_jlt))
end
15 changes: 9 additions & 6 deletions src/predict.jl
@@ -15,16 +15,16 @@ Apply the fitted model on data.
* df - a Tables.jl compatible Table
"""

(jlt::JLBoostTreeModel)(args...) = predict(jlt, args...)
(jlt::AbstractJLBoostTree)(args...) = predict(jlt, args...)

(jltm::JLBoostTreeModel)(args...) = predict(jltm, args...)

(jlt::AbstractArray{JLBoostTreeModel})(args...) = predict(jlt, args...)

predict(jlt::JLBoostTreeModel, df) = predict(trees(jlt), df)

predict(jlt::JLBoostTreeModel, df::AbstractDataFrame) = predict(trees(jlt), df)

# defaults to Float64 (double) as output
predict(jlt::AbstractJLBoostTree, df) = predict(jlt::AbstractJLBoostTree, df, Float64)
predict(jlt::AbstractJLBoostTree, df) = predict(jlt, df, Float64)
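These callable-object methods are what let a fitted model be applied directly, as the tutorial below does. Schematically (the fit call is illustrative):

# model = jlboost(data, :target)   # fit
# model(data)                      # equivalent to predict(model, data)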

function predict(jlt::AbstractJLBoostTree, df, out_eltype::Type{T})::Vector{T} where {T <: Number}
# TODO a more efficient algorithm. Currently there are too many assignbools being
@@ -43,10 +43,13 @@ end

function predict!(jlt::JLBoostTree, df, res, assignbool)
if length(jlt.children) == 2
new_assignbool = assignbool .& (getproperty(Tables.columns(df), jlt.splitfeature) .<= jlt.split)
tmp = getproperty(Tables.columns(df), jlt.splitfeature)

# missings go left; parenthesise the OR, since `a .& b .|| c`
# parses as `(a .& b) .|| c` and would ignore `assignbool`
new_assignbool = assignbool .& (ismissing.(tmp) .|| (tmp .<= jlt.split))
predict!(jlt.children[1], df, res, new_assignbool)

new_assignbool .= assignbool .& (getproperty(Tables.columns(df), jlt.splitfeature) .> jlt.split)
new_assignbool .= assignbool .& .!(ismissing.(tmp) .|| (tmp .<= jlt.split))
predict!(jlt.children[2], df, res, new_assignbool)
elseif length(jlt.children) == 0
res[assignbool] .+= jlt.weight
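The parenthesisation above matters because dotted `&` binds tighter than dotted `||`; a quick sketch with throwaway vectors:

a = [false];  b = [false];  c = [true]

a .& b .|| c     # parses as (a .& b) .|| c → [true]
a .& (b .|| c)   # the intended grouping    → [false]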
57 changes: 53 additions & 4 deletions tutorial/give-me-some-credit/1-code.jl
@@ -1,21 +1,70 @@
using Revise
using JLBoost, CSV, JDF, TidierData
using JLBoost: is_left_child
using AbstractTrees: isroot
using DataFrames
using TidierData: @clean_names, @group_by, @summarise
using Chain: @chain

data = @chain JDF.load("cs-training.jdf") begin
    DataFrame
    # @select -(monthly_income, number_of_dependents)
    @mutate revolving_utilization_of_unsecured_lines = round(revolving_utilization_of_unsecured_lines, digits=2)
end

jlboost(data, :serious_dlqin2yrs, [:monthly_income]; verbose=true, max_depth=2)

jlboost(data, :serious_dlqin2yrs; verbose=true, max_depth=2)

test = @chain JDF.load("cs-test.jdf") begin
    DataFrame
    @select -(monthly_income, number_of_dependents)
    @mutate revolving_utilization_of_unsecured_lines = round(revolving_utilization_of_unsecured_lines, digits=2)
end

names(data)

idx = rand(1:5, nrow(data))


model = jlboost(data, :serious_dlqin2yrs; nrounds=1, max_depth=4)
gini(-model(data), data[!, :serious_dlqin2yrs])

data[!, :group] = idx

@chain data begin
    groupby(:group)
    combine(_) do subdf
        gini(-model(subdf), subdf[!, :serious_dlqin2yrs])
    end
end

# sweep nrounds and compare training gini across rounds
for nrounds in 2:8
    model = jlboost(data, :serious_dlqin2yrs; nrounds=nrounds, max_depth=2)
    @show nrounds gini(-model(data), data[!, :serious_dlqin2yrs])
end

function fit_score_card(data, target, features)
    warm_start = fill(0.0, nrow(data))
    gini = 0
1 change: 0 additions & 1 deletion tutorial/give-me-some-credit/Project.toml
@@ -3,5 +3,4 @@ AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
JDF = "babc3d20-cd49-4f60-a736-a8f9c08892d3"
JLBoost = "13d6d4a1-5e7f-472c-9ebc-8123a4fbb95f"
Memoize = "c03570c3-d221-55d1-a50c-7939bbd78826"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
