diff --git a/README.jmd b/README.jmd
index 3a7fca7..f55fc52 100644
--- a/README.jmd
+++ b/README.jmd
@@ -1,11 +1,3 @@
-# No more development; not even bug fixes in the foreseeable future
-
-Due to life changes. I have 0 time now to handle this Open Source project. So this will be archived
-until I can come back to it.
-
-I will refocus my energy on only a couple of open source packages one of them being {disk.frame}.
-
-
 # JLBoost.jl
 
 This is a 100%-Julia implementation of Gradient Boosting Regresssion Trees (GBRT) based heavily on the algorithms published in the XGBoost, LightGBM and Catboost papers. GBRT is also referred to as Gradient Boosting Decision Tree (GBDT).
diff --git a/src/JLBoostTrees/filter_tbl_by_splits.jl b/src/JLBoostTrees/filter_tbl_by_splits.jl
index f68f13f..54ab5d0 100644
--- a/src/JLBoostTrees/filter_tbl_by_splits.jl
+++ b/src/JLBoostTrees/filter_tbl_by_splits.jl
@@ -25,10 +25,21 @@ function keeprow_vec(tbl, node::Union{Nothing, AbstractJLBoostTree})::BitArray
         keeprow = keeprow_vec(tbl, node.parent)
     end
 
+    tmp = getproperty(tblc, node.parent.splitfeature)
     if is_left_child(node)
-        keeprow .&= getproperty(tblc, node.parent.splitfeature) .<= node.parent.split
+        # TODO fix this
+        # @warn "Currently assuming missing go left; this NEEDS to be FIXED"
+        keeprow .&= ismissing.(tmp) .|| tmp .<= node.parent.split
+        # keeprow .&= tmp .<= node.parent.split
     else
-        keeprow .&= getproperty(tblc, node.parent.splitfeature) .> node.parent.split
+        # @warn "Currently assuming missing go left; this NEEDS to be FIXED"
+        # keeprow .&= .!ismissing.(tmp) .|| tmp .> node.parent.split
+        # the above causes errors so apply De Morgan's law
+        # not(missing) | x > y = not not (not(missing) | x > y)
+        #                      = not (missing | x <= y) # by De Morgan's law
+        # keeping the above as reminder of how dumb I am. Just negate man
+        keeprow .&= .!(ismissing.(tmp) .|| tmp .<= node.parent.split)
+        # keeprow .&= tmp .> node.parent.split
     end
 
     keeprow
diff --git a/src/JLBoostTrees/tree-and-tree-models.jl b/src/JLBoostTrees/tree-and-tree-models.jl
index e6ca4bf..3fe7125 100644
--- a/src/JLBoostTrees/tree-and-tree-models.jl
+++ b/src/JLBoostTrees/tree-and-tree-models.jl
@@ -26,6 +26,7 @@ mutable struct JLBoostTree{T} <: AbstractJLBoostTree{T}
     weight
     parent::Union{JLBoostTree, Nothing}
     children::AbstractVector{AbstractJLBoostTree} # this is deliberate kept as an vector of AbstractJLBoostTree; because we can genuinely mix and match types in htere
+    # TODO store the node value as FeatureSplitPredicate so you can generalise it to include missing
     splitfeature
     split
     gain
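Note on the filter_tbl_by_splits.jl hunk above: the new masks send rows whose split feature is missing down the left branch, and the right-branch mask is simply the negation of the whole left-branch condition. A minimal standalone sketch of that masking logic on a toy vector (illustrative names only, not the package API; broadcasted `.||` needs Julia 1.7 or later):

    x = [1.0, missing, 3.0, 4.0]
    split = 3.0

    left  = ismissing.(x) .|| (x .<= split)       # ismissing is true for missing rows, so they go left
    right = .!(ismissing.(x) .|| (x .<= split))   # negate the whole left condition, as the comment above does

    @assert all(left .⊻ right)                    # every row ends up on exactly one side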
diff --git a/src/find_best_split.jl b/src/find_best_split.jl
index 628c62d..c99d91b 100644
--- a/src/find_best_split.jl
+++ b/src/find_best_split.jl
@@ -18,13 +18,12 @@ Does not assume that Feature, target, and warmstart are sorted and will sort the
 function find_best_split(loss, df, feature::Symbol, target::Symbol, warmstart::AbstractVector, lambda, gamma; verbose = false, kwargs...)
     @assert Tables.istable(df)
-
     dfc = Tables.columns(df)
     x = getproperty(dfc, feature)
     if verbose
-        @info "find_best_split(): Calculating a split on `$feature` with extrema $(extrema(x))"
+        @info "find_best_split(): Calculating a split on `$feature` with extrema $(extrema(x |> skipmissing))"
     end
@@ -77,6 +76,26 @@ function _find_best_split(loss, feature, target, warmstart, lambda::Number, gamm
     @assert length(target) == length(feature)
     @assert length(warmstart) == length(feature)
 
+
+    # if the feature vector has missings
+    non_missing_ends = length(feature)
+    if Missing <: eltype(feature)
+        pos = searchsortedfirst(feature, missing)
+
+        if pos > non_missing_ends # this means it couldn't find any missing
+            # do nothing
+        elseif pos == 1
+            # all features are missing
+            return (split_at = missing, cutpt = missing, gain = missing, lweight = missing, rweight = missing, should_split_further = false)
+        else
+            non_missing_ends = pos - 1
+
+            feature = @view feature[1:non_missing_ends]
+            target = @view target[1:non_missing_ends]
+            warmstart = @view warmstart[1:non_missing_ends]
+        end
+    end
+
     # TODO maybe use some kind of argmax here
     # TODO can reduce allocations here by skipping the broadcasting .
     cg = cumsum(g.(loss, target, warmstart))
@@ -131,8 +150,10 @@ function _find_best_split(loss, feature, target, warmstart, lambda::Number, gamm
         end
     end
 
+    # TODO if should split further add missing
+    # @info "Got here: $(count(ismissing, feature))"
     # TODO return a type
-    (split_at = split_at, cutpt = cutpt, gain = best_gain, lweight = lweight, rweight = rweight, should_split_further = should_split_further)
+    (split_at = split_at, cutpt = cutpt, gain = best_gain, lweight = lweight, rweight = rweight, should_split_further = should_split_further, missing_go_left = true)
 end
 
 # TODO more reseach into GPU friendliness
diff --git a/src/fit_tree.jl b/src/fit_tree.jl
index 3922e6a..0ed2afa 100644
--- a/src/fit_tree.jl
+++ b/src/fit_tree.jl
@@ -86,7 +86,7 @@ function _fit_tree!(loss, tbl, target, features, warm_start,
 
     # this shouldn't be reset and should be placed outside of the while loop below
-    while !no_more_gains_to_found && !stopping_criterion(jlt)
+    while (!no_more_gains_to_found) && !stopping_criterion(jlt)
         if verbose
             @info "BEST SPLIT PHASE: Tree Depth=$(treedepth(jlt))"
         end
@@ -153,7 +153,7 @@ function _fit_tree!(loss, tbl, target, features, warm_start,
         end
 
         if verbose
-            @info("BEST SPLIT PHASE: found a best split at $(split_with_best_gain.feature) <= $(split_with_best_gain.split_at); gain:$(split_with_best_gain.gain) further:$(split_with_best_gain.should_split_further) for $(leaf_node)")
+            @info("BEST SPLIT PHASE: found a best split at `$(split_with_best_gain.feature)` <= $(split_with_best_gain.split_at); gain:$(split_with_best_gain.gain) further:$(split_with_best_gain.should_split_further) for $(leaf_node)")
         end
 
         best_split_dict[leaf_node] = split_with_best_gain
@@ -169,9 +169,8 @@ function _fit_tree!(loss, tbl, target, features, warm_start,
 
         # tree_growth phase
         # select the node to grow based on growth function
-        # the tree_growth function will return the list of
-        # nodes_to_split = tree_growth(jlt)
-        nodes_to_split::Vector{<:AbstractJLBoostTree} = tree_growth(jlt)
+        # the tree_growth function needs to return an Iterable of JLBoost trees
+        nodes_to_split = tree_growth(jlt)
         if verbose
             @info "TREE GROWTH PHASE: Found $(length(nodes_to_split)) node-candidates to split"
         end
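Note on the _find_best_split hunk above: it assumes the feature vector has already been sorted so that any missings sit at the end, then uses searchsortedfirst to locate where they begin so the split search can run on a missing-free view. A small self-contained illustration of that assumption (toy data only, not the package API):

    feature = sort([2.0, missing, 1.0, missing, 3.0])   # sort puts missings last: [1.0, 2.0, 3.0, missing, missing]
    pos = searchsortedfirst(feature, missing)           # index of the first missing, here 4
    non_missing_ends = pos - 1
    kept = @view feature[1:non_missing_ends]            # the split search only ever sees non-missing values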
diff --git a/src/jlboost-fit.jl b/src/jlboost-fit.jl
index ec098dc..e6ce331 100644
--- a/src/jlboost-fit.jl
+++ b/src/jlboost-fit.jl
@@ -34,8 +34,6 @@ see https://xgboost.readthedocs.io/en/latest/parameter.html
 * monotone_contraints: Not yet implemented
 * interaction_constraints: Not yet implemented
 """
-
-
 function jlboost(df, target::Union{Symbol, String}; kwargs...)
     target = Symbol(target)
     warm_start = fill(0.0, nrow(df))
@@ -107,6 +105,7 @@ function jlboost(df, target, features, warm_start::AbstractVector,
     target = Symbol(target)
     features = Symbol.(features)
 
+    # TODO get only the needed columns from the table
     # dfc = Tables.columns(df)
     dfc = df
 
@@ -130,14 +129,10 @@ function jlboost(df, target, features, warm_start::AbstractVector,
         warm_start = predict(res_jlt[1:nround-1], dfs)
     end
 
-    # println(nround)
-    # println(dfc)
-
     new_jlt = _fit_tree!(loss, dfc, target, features_sample, warm_start, JLBoostTree(0.0), tree_growth, stopping_criterion; verbose=verbose, kwargs...);
-    # println("mehmehmehmeh")
 
     # added a new round of tree
     push!(res_jlt, eta*deepcopy(new_jlt))
 end
diff --git a/src/predict.jl b/src/predict.jl
index ba3147f..49ec40d 100644
--- a/src/predict.jl
+++ b/src/predict.jl
@@ -15,16 +15,16 @@ Apply the fitted model on data.
 
 * df - a Tables.jl compatible Table
 """
-(jlt::JLBoostTreeModel)(args...) = predict(jlt, args...)
+(jlt::AbstractJLBoostTree)(args...) = predict(jlt, args...)
+
+(jltm::JLBoostTreeModel)(args...) = predict(jltm, args...)
 
 (jlt::AbstractArray{JLBoostTreeModel})(args...) = predict(jlt, args...)
 
 predict(jlt::JLBoostTreeModel, df) = predict(trees(jlt), df)
 
-predict(jlt::JLBoostTreeModel, df::AbstractDataFrame) = predict(trees(jlt), df)
-
 # defaults to Float64 (double) as output
-predict(jlt::AbstractJLBoostTree, df) = predict(jlt::AbstractJLBoostTree, df, Float64)
+predict(jlt::AbstractJLBoostTree, df) = predict(jlt, df, Float64)
 
 function predict(jlt::AbstractJLBoostTree, df, out_eltype::Type{T})::Vector{T} where {T <: Number}
     # TODO a more efficient algorithm. Currently there are too many assignbools being
@@ -43,10 +43,13 @@ end
 
 function predict!(jlt::JLBoostTree, df, res, assignbool)
     if length(jlt.children) == 2
-        new_assignbool = assignbool .& (getproperty(Tables.columns(df), jlt.splitfeature) .<= jlt.split)
+        tmp = getproperty(Tables.columns(df), jlt.splitfeature)
+
+        # TODO
+        new_assignbool = assignbool .& (ismissing.(tmp) .|| (tmp .<= jlt.split))
         predict!(jlt.children[1], df, res, new_assignbool)
 
-        new_assignbool .= assignbool .& (getproperty(Tables.columns(df), jlt.splitfeature) .> jlt.split)
+        new_assignbool .= assignbool .& .!(ismissing.(tmp) .|| (tmp .<= jlt.split))
         predict!(jlt.children[2], df, res, new_assignbool)
     elseif length(jlt.children) == 0
         res[assignbool] .+= jlt.weight
diff --git a/tutorial/give-me-some-credit/1-code.jl b/tutorial/give-me-some-credit/1-code.jl
index b92f063..ae560ba 100644
--- a/tutorial/give-me-some-credit/1-code.jl
+++ b/tutorial/give-me-some-credit/1-code.jl
@@ -1,14 +1,25 @@
 using Revise
-using JLBoost: is_left_child
-using AbstractTrees: isroot
 using JLBoost, CSV, JDF, TidierData
-using DataFrames
-using TidierData
+using JLBoost: is_left_child
+using AbstractTrees: isroot
+using DataFrames, TidierData
 using TidierData: @clean_names, @group_by, @summarise
 using Chain: @chain
 
 data = @chain JDF.load("cs-training.jdf") begin
+    DataFrame
+    # @select -(monthly_income, number_of_dependents)
+    @mutate revolving_utilization_of_unsecured_lines = round(revolving_utilization_of_unsecured_lines, digits=2)
+end
+
+jlboost(data, :serious_dlqin2yrs, [:monthly_income]; verbose=true, max_depth=2)
+
+jlboost(data, :serious_dlqin2yrs, [:monthly_income]; verbose=true, max_depth=2)
+
+jlboost(data, :serious_dlqin2yrs; verbose=true, max_depth=2)
+
+test = @chain JDF.load("cs-test.jdf") begin
     DataFrame
     @select -(monthly_income, number_of_dependents)
     @mutate revolving_utilization_of_unsecured_lines = round(revolving_utilization_of_unsecured_lines, digits=2)
@@ -16,6 +27,44 @@ end
 
 names(data)
 
+idx = rand(1:5, nrow(data))
+
+
+model = jlboost(data, :serious_dlqin2yrs; nrounds=1, max_depth=4)
+gini(-model(data), data[!, :serious_dlqin2yrs])
+
+data[!, :group] = idx
+
+@chain data begin
+    groupby(:group)
+    combine(_) do subdf
+        gini(-model(subdf), subdf[!, :serious_dlqin2yrs])
+    end
+end
+
+model = jlboost(data, :serious_dlqin2yrs; nrounds=2, max_depth=2)
+gini(-model(data), data[!, :serious_dlqin2yrs])
+
+model = jlboost(data, :serious_dlqin2yrs; nrounds=3, max_depth=2)
+gini(-model(data), data[!, :serious_dlqin2yrs])
+
+model = jlboost(data, :serious_dlqin2yrs; nrounds=4, max_depth=2)
+gini(-model(data), data[!, :serious_dlqin2yrs])
+
+model = jlboost(data, :serious_dlqin2yrs; nrounds=5, max_depth=2)
+gini(-model(data), data[!, :serious_dlqin2yrs])
+
+model = jlboost(data, :serious_dlqin2yrs; nrounds=6, max_depth=2)
+gini(-model(data), data[!, :serious_dlqin2yrs])
+
+model = jlboost(data, :serious_dlqin2yrs; nrounds=7, max_depth=2)
+gini(-model(data), data[!, :serious_dlqin2yrs])
+
+model = jlboost(data, :serious_dlqin2yrs; nrounds=8, max_depth=2)
+gini(-model(data), data[!, :serious_dlqin2yrs])
+
+
+
 function fit_score_card(data, target, features)
     warm_start = fill(0.0, nrow(data))
     gini = 0
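Note on the tutorial additions above: the block of nrounds = 1 to 8 fits is a manual sweep. The same experiment could be collected in a single pass; this is only a sketch that reuses `data`, `jlboost` and `gini` exactly as in the script (the script uses max_depth=4 for the first fit, while this keeps max_depth=2 throughout):

    scores = map(1:8) do n
        model = jlboost(data, :serious_dlqin2yrs; nrounds=n, max_depth=2)
        n => gini(-model(data), data[!, :serious_dlqin2yrs])
    end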
"babc3d20-cd49-4f60-a736-a8f9c08892d3" JLBoost = "13d6d4a1-5e7f-472c-9ebc-8123a4fbb95f" -Memoize = "c03570c3-d221-55d1-a50c-7939bbd78826" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"