Skip to content

Commit

Permalink
xgboost updates (#47)
Browse files Browse the repository at this point in the history
* Make xgboost converter more robust. Add xgboost converter test

* Enable importing multiple compiled C++ backend bridges in the same session

* Fix xgboost multiclass conversion. Add multiclass and regression test

* Simplify treeToDict for xgboost converter
  • Loading branch information
thesps authored Jul 5, 2023
1 parent f52409a commit 086c82e
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 102 deletions.
2 changes: 1 addition & 1 deletion conifer/backends/cpp/template/bridge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

namespace py = pybind11;
PYBIND11_MODULE(conifer_bridge, m){
py::class_<conifer::BDT<T,U,false>>(m, "BDT")
py::class_<conifer::BDT<T,U,false>>(m, "BDT", py::module_local())
.def(py::init<const std::string &>())
.def("decision_function", &conifer::BDT<T,U>::_decision_function_double);
}
2 changes: 1 addition & 1 deletion conifer/backends/cpp/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def write(self):
newline = f"typedef {cfg.threshold_precision} T;\n"
newline += f"typedef {cfg.score_precision} U;\n"
elif 'PYBIND11_MODULE' in line:
newline = f'PYBIND11_MODULE(conifer_bridge_{self._stamp}, m){{\n'
newline = line.replace('conifer_bridge', f'conifer_bridge_{self._stamp}')
elif '// conifer insert include' in line:
newline = '#include "ap_fixed.h"' if cfg.any_ap_types() else ''
fout.write(newline)
Expand Down
28 changes: 10 additions & 18 deletions conifer/converters/__init__.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,16 @@
from conifer.converters import sklearn
from conifer.converters import tmva
from conifer.converters import xgboost
from conifer.converters import onnx
from conifer.model import make_model

try:
from conifer.converters import tf_df
except ImportError:
tf_df = None
print("Warning: The python package tensorflow_decision_forests is not available. Conversions "
"from TensorFlow Decision Forests models is not possible.")

import logging
logger = logging.getLogger(__name__)

_converter_map = {'sklearn' : sklearn,
'tmva' : tmva,
'xgboost' : xgboost,
'onnx' : onnx,
'tf_df': tf_df}
# Registry of converter name -> converter module.
# Each converter is imported by name inside its own try block, so a converter
# whose import raises ImportError is logged and left out of the registry
# while the remaining converters are still registered.
_converter_map = {}
import importlib
for module in ['sklearn', 'tmva', 'xgboost', 'onnx', 'tf_df']:
    try:
        the_module = importlib.import_module(f'conifer.converters.{module}')
        _converter_map[module] = the_module
    except ImportError:
        # Logger.warn is a deprecated alias (removed in Python 3.13)
        logger.warning(f'Could not import conifer {module} converter')

from conifer.model import make_model

def get_converter(converter):
'''Get converter object from string'''
Expand Down
114 changes: 32 additions & 82 deletions conifer/converters/xgboost.py
Original file line number Diff line number Diff line change
@@ -1,110 +1,60 @@
import numpy as np
import json
import xgboost as xgb
import pandas
from typing import Union

def convert(bdt):
meta = json.loads(bdt.save_config())
max_depth = int(meta['learner']['gradient_booster']['updater']['grow_colmaker']['train_param']['max_depth'])
n_classes = int(meta['learner']['learner_model_param']['num_class'])
def convert(bdt : Union[xgb.core.Booster, xgb.XGBClassifier, xgb.XGBRegressor]):
assert isinstance(bdt, (xgb.core.Booster, xgb.XGBClassifier, xgb.XGBRegressor))
if isinstance(bdt, xgb.core.Booster):
bst = bdt
elif isinstance(bdt, (xgb.XGBClassifier, xgb.XGBRegressor)):
bst = bdt.get_booster()
meta = json.loads(bst.save_config())
updater = meta.get('learner').get('gradient_booster').get('gbtree_train_param').get('updater').split(',')[0]
max_depth = int(meta.get('learner').get('gradient_booster').get('updater').get(updater).get('train_param').get('max_depth'))
n_classes = int(meta.get('learner').get('learner_model_param').get('num_class'))
fn_classes = 1 if n_classes == 0 else n_classes # the number of learners
n_classes = 2 if n_classes == 0 else n_classes # the actual number of classes
n_trees = int(int(meta.get('learner').get('gradient_booster').get('gbtree_model_param').get('num_trees')) / fn_classes)
n_features = int(meta['learner']['learner_model_param']['num_feature'])
ensembleDict = {'max_depth' : max_depth,
'n_trees' : int(len(bdt.get_dump()) / fn_classes),
'n_trees' : n_trees,
'n_classes' : n_classes,
'n_features' : n_features,
'trees' : [],
'init_predict' : [0] * n_classes,
'norm' : 1}

feature_names = {}
if bdt.feature_names is None:
if bst.feature_names is None:
for i in range(n_features):
feature_names[f'f{i}'] = i
else:
for i, feature_name in enumerate(bdt.feature_names):
for i, feature_name in enumerate(bst.feature_names):
feature_names[feature_name] = i

trees = bdt.get_dump()
trees = bst.trees_to_dataframe()
for i in range(ensembleDict['n_trees']):
treesl = []
for j in range(fn_classes):
tree = trees[fn_classes * i + j]
tree = trees[trees.Tree == fn_classes * i + j]
tree = treeToDict(tree, feature_names)
treesl.append(tree)
ensembleDict['trees'].append(treesl)
return ensembleDict

def treeToDict(tree, feature_names):
# First of all make the tree sklearn-like
# split by newline, ignore the last line
nodes = tree.split('\n')[:-1]
# remove tab characters
nodes = list(map(lambda x: x.replace('\t',''), nodes))
real_nNodes = len(nodes)
# Number of nodes that are in the tree
# Pruning removes nodes but does not reset index
old_node_indices = []
for i in range(real_nNodes):
iNode = int(nodes[i].split(':')[0])
old_node_indices.append(iNode)
# Node indices that are left in the tree after pruning
nNodes = max(old_node_indices)+1
# Maximum Node index
nPrunedNodes = nNodes - len(old_node_indices)
if nPrunedNodes > 0:
node_to_node_dict = dict(list(enumerate(sorted(old_node_indices))))
node_to_node_dict = {value:key for key, value in node_to_node_dict.items()}
# Create a dictionary remapping old Node indicies to new node indicies and invert
features = [0] * nNodes
thresholds = [0] * nNodes
children_left = [0] * nNodes
children_right = [0] * nNodes
values = [0] * nNodes
for node in nodes:
if node == '':
pass
elif 'leaf' in node: # is a leaf
# Looks like: 'i:leaf=value[i]'
data = node.split('leaf')
iNode = int(data[0].replace(':',''))
if nPrunedNodes > 0:
iNode = node_to_node_dict[iNode]
# Remap node index
feature = -2
threshold = 0
child_left = -1
child_right = -1
value = float(data[1].replace('=',''))
else:
# Looks like:
# 'i:[f{feature[i]}<{threshold[i]} yes={children_left[i]},no={children_right[i]}...'
iNode = int(node.split(':')[0]) # index comes before ':'
if nPrunedNodes > 0:
iNode = node_to_node_dict[iNode]
# Remap node index
# split around 'feature<threshold'
data = node.split('<')
feature = feature_names[data[0].split('[')[-1]]
threshold = float(data[1].split(']')[0])
child_left = int(node.split('yes=')[1].split(',')[0])
child_right = int(node.split('no=')[1].split(',')[0])
if nPrunedNodes > 0:
child_left = node_to_node_dict[child_left]
child_right = node_to_node_dict[child_right]
# Remap node index for children to preserve tree structure
value = 0
features[iNode] = feature
thresholds[iNode] = threshold
children_left[iNode] = child_left
children_right[iNode] = child_right
values[iNode] = value
if nPrunedNodes > 0:
del features[-nPrunedNodes:]
del thresholds[-nPrunedNodes:]
del children_left[-nPrunedNodes:]
del children_right[-nPrunedNodes:]
del values[-nPrunedNodes:]
# Remove the last N unused nodes in the tree
treeDict = {'feature' : features, 'threshold' : thresholds, 'children_left' : children_left,
'children_right' : children_right, 'value' : values}
def treeToDict(tree : pandas.DataFrame, feature_names):
    '''Convert the rows of one tree into a dictionary of per-node lists.

    Parameters
    ----------
    tree : pandas.DataFrame
        One row per node of a single tree. The columns ``Feature``, ``Split``,
        ``Yes``, ``No`` and ``Gain`` are read; ``Node`` is used when present.
    feature_names : dict
        Maps each name found in the ``Feature`` column to its feature index.

    Returns
    -------
    dict
        Keys 'feature', 'threshold', 'children_left', 'children_right' and
        'value', each a list with one entry per row of ``tree``. Rows whose
        ``Feature`` is 'Leaf' get feature -2; missing ``Split`` becomes 0;
        non-string ``Yes``/``No`` entries become child index -1.
        NOTE(review): 'value' is the ``Gain`` column copied as-is; for leaf
        rows this is presumably the leaf value - confirm against xgboost.

    Raises
    ------
    AssertionError
        If ``tree`` is not a pandas DataFrame.
    '''
    assert isinstance(tree, pandas.DataFrame), "This method expects the tree as a pandas DataFrame"

    # Child references carry node IDs, but the output lists are indexed by row
    # position. The two only coincide when the node IDs are 0..n-1 with no
    # gaps, so translate IDs to positions whenever the Node column is there.
    if 'Node' in tree.columns:
        position_of = {int(node_id): row for row, node_id in enumerate(tree.Node.tolist())}
    else:
        position_of = {}

    def child_position(reference):
        # References look like '<tree>-<node>'; anything that is not a string
        # (e.g. NaN on leaf rows) means "no child".
        if not isinstance(reference, str):
            return -1
        node_id = int(reference.split('-')[1])
        # Fall back to the raw ID when it cannot be translated
        return position_of.get(node_id, node_id)

    thresholds = tree.Split.fillna(0).tolist()
    features = [-2 if name == 'Leaf' else feature_names[name] for name in tree.Feature.tolist()]
    children_left = [child_position(ref) for ref in tree.Yes.tolist()]
    children_right = [child_position(ref) for ref in tree.No.tolist()]
    values = tree.Gain.tolist()
    treeDict = {'feature' : features,
                'threshold' : thresholds,
                'children_left' : children_left,
                'children_right' : children_right,
                'value' : values
                }
    return treeDict
89 changes: 89 additions & 0 deletions tests/test_xgb_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from sklearn.datasets import make_hastie_10_2, load_diabetes, load_iris
import xgboost as xgb
import conifer
import datetime
from scipy.special import expit, logit, softmax
import numpy as np
import pytest
import logging
import sys

#logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Binary classification on sklearn's 'hastie' dataset
def hastie():
    '''Return ``(X, y, DMatrix)`` built from ``make_hastie_10_2(random_state=0)``.

    Labels equal to -1 are rewritten to 0 before the DMatrix is created.
    '''
    features, targets = make_hastie_10_2(random_state=0)
    negative = targets == -1
    targets[negative] = 0
    return features, targets, xgb.DMatrix(features, label=targets)

# Multiclass classification on sklearn's iris dataset
def iris():
    '''Return ``(X, y, DMatrix)`` built from ``load_iris(return_X_y=True)``.'''
    features, targets = load_iris(return_X_y=True)
    dmatrix = xgb.DMatrix(features, label=targets)
    return features, targets, dmatrix

# Regression on sklearn's diabetes dataset
def diabetes():
    '''Return ``(X, y, DMatrix)`` built from ``load_diabetes(return_X_y=True)``.'''
    features, targets = load_diabetes(return_X_y=True)
    dmatrix = xgb.DMatrix(features, label=targets)
    return features, targets, dmatrix

def model_0(d, kwarg_params=None):
    '''Train a booster with ``xgb.train`` for 20 rounds on ``d``.

    ``d`` is passed straight to ``xgb.train`` as the training data.
    ``kwarg_params`` is an optional dict merged over the default training
    parameters; ``None`` (the default) means no overrides.
    '''
    # Train a BDT
    param = {'max_depth': 3, 'eta': 1, 'objective': 'binary:logistic'}
    # None sentinel instead of a mutable {} default shared between calls
    param.update(kwarg_params or {})
    num_round = 20  # num_round is equivalent to number of trees
    bst = xgb.train(param, d, num_round)
    return bst

def model_1(d, kwarg_params=None):
    '''Train a booster with ``xgb.train`` for 20 rounds using the 'grow_histmaker' updater.

    ``d`` is passed straight to ``xgb.train`` as the training data.
    ``kwarg_params`` is an optional dict merged over the default training
    parameters; ``None`` (the default) means no overrides.
    '''
    # Train a BDT
    param = {'max_depth': 3, 'eta': 1, 'objective': 'binary:logistic', 'updater':'grow_histmaker'}
    # None sentinel instead of a mutable {} default shared between calls
    param.update(kwarg_params or {})
    num_round = 20  # num_round is equivalent to number of trees
    bst = xgb.train(param, d, num_round)
    return bst

def model_2(X, y, kwarg_params=None):
    '''Fit and return an ``xgb.XGBClassifier`` (10 estimators, max depth 3) on ``X``, ``y``.

    ``kwarg_params`` is an optional dict of extra constructor keyword
    arguments; ``None`` (the default) means none.
    '''
    # None sentinel instead of a mutable {} default shared between calls
    extra = kwarg_params or {}
    bdt = xgb.XGBClassifier(n_estimators=10, max_depth=3, **extra)
    bdt.fit(X, y)
    return bdt

def model_3(X, y, kwarg_params=None):
    '''Fit and return an ``xgb.XGBRegressor`` (10 estimators, max depth 3) on ``X``, ``y``.

    ``kwarg_params`` is an optional dict of extra constructor keyword
    arguments; ``None`` (the default) means none.
    '''
    # None sentinel instead of a mutable {} default shared between calls
    extra = kwarg_params or {}
    bdt = xgb.XGBRegressor(n_estimators=10, max_depth=3, **extra)
    bdt.fit(X, y)
    return bdt

# Each parameter tuple holds, in order:
# (test index, model function, model kwargs, data function, data fmt ['np', 'xgb'],
#  predictor method name, prediction transform, transform kwargs)
@pytest.mark.parametrize('params', [(0, model_0, {}, hastie, 'xgb', 'predict', expit, {}),
                                    (1, model_1, {}, hastie, 'xgb', 'predict', expit, {}),
                                    (2, model_2, {}, hastie, 'np', 'predict_proba', expit, {}),
                                    (3, model_2, {}, iris, 'np', 'predict_proba', softmax, {'axis':1}),
                                    (4, model_0, {'objective': 'multi:softprob', 'num_class': 3}, iris, 'xgb', 'predict', softmax, {'axis':1}),
                                    (5, model_3, {}, diabetes, 'np', 'predict', lambda x: x, {}), # not yet possible
                                    ])
def test_xgb(params):
    '''Convert a trained xgboost model with conifer and compare predictions.

    The model is trained on the selected dataset, converted with the cpp
    backend at 'double' precision, compiled, and its transformed
    decision_function output is compared to the xgboost prediction.
    '''
    (test_idx, get_model_function, get_model_kwargs, get_data_function,
     data_fmt, predictor, transform, transform_kwargs) = params

    X, y, d = get_data_function()
    # 'xgb' models train on the DMatrix, everything else on the arrays
    training_args = (d,) if data_fmt == 'xgb' else (X, y)
    model = get_model_function(*training_args, get_model_kwargs)

    cfg = conifer.backends.cpp.auto_config()
    cfg['Precision'] = 'double'
    # Set the output directory to something unique
    cfg['OutputDir'] = f'prj_xgb_converter_{test_idx}_{int(datetime.datetime.now().timestamp())}'

    cnf_model = conifer.converters.convert_from_xgboost(model, cfg)
    cnf_model.compile()
    raw_scores = cnf_model.decision_function(X)
    y_cnf = np.squeeze(transform(raw_scores, **transform_kwargs))

    reference_input = X if data_fmt == 'np' else d
    reference_predict = getattr(model, predictor)
    y_xgb = reference_predict(reference_input)
    # A two-column prediction is reduced to its last column
    two_columns = len(y_xgb.shape) == 2 and y_xgb.shape[1] == 2
    if two_columns:
        y_xgb = y_xgb[:,-1]
    np.testing.assert_array_almost_equal(y_cnf, y_xgb)

0 comments on commit 086c82e

Please sign in to comment.