Commit
* Make the xgboost converter more robust; add an xgboost converter test
* Enable importing multiple compiled C++ backend bridges in the same session
* Fix xgboost multiclass conversion; add multiclass and regression tests
* Simplify treeToDict for the xgboost converter
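For orientation, here is a minimal sketch of how the converter is meant to be used after this commit, assembled from the calls exercised by the new test further below (conifer.backends.cpp.auto_config, convert_from_xgboost, compile, decision_function); treat it as illustrative rather than canonical API documentation.

import xgboost as xgb
import conifer
from sklearn.datasets import load_iris

# Either a low-level Booster or a scikit-learn wrapper now converts;
# XGBClassifier/XGBRegressor are unwrapped via get_booster() internally.
X, y = load_iris(return_X_y=True)
model = xgb.XGBClassifier(n_estimators=10, max_depth=3)
model.fit(X, y)

cfg = conifer.backends.cpp.auto_config()
cfg['OutputDir'] = 'prj_xgb_example'  # unique per project, so several compiled
                                      # C++ backend bridges can coexist in one session
cnf_model = conifer.converters.convert_from_xgboost(model, cfg)
cnf_model.compile()
scores = cnf_model.decision_function(X)  # raw ensemble scores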
Showing 5 changed files with 133 additions and 102 deletions.
Changed file: the xgboost converter. In the diff below, lines prefixed '-' were removed and lines prefixed '+' were added.

@@ -1,110 +1,60 @@
 import numpy as np
 import json
 import xgboost as xgb
+import pandas
+from typing import Union

-def convert(bdt):
-    meta = json.loads(bdt.save_config())
-    max_depth = int(meta['learner']['gradient_booster']['updater']['grow_colmaker']['train_param']['max_depth'])
-    n_classes = int(meta['learner']['learner_model_param']['num_class'])
+def convert(bdt : Union[xgb.core.Booster, xgb.XGBClassifier, xgb.XGBRegressor]):
+    assert isinstance(bdt, (xgb.core.Booster, xgb.XGBClassifier, xgb.XGBRegressor))
+    if isinstance(bdt, xgb.core.Booster):
+        bst = bdt
+    elif isinstance(bdt, (xgb.XGBClassifier, xgb.XGBRegressor)):
+        bst = bdt.get_booster()
+    meta = json.loads(bst.save_config())
+    updater = meta.get('learner').get('gradient_booster').get('gbtree_train_param').get('updater').split(',')[0]
+    max_depth = int(meta.get('learner').get('gradient_booster').get('updater').get(updater).get('train_param').get('max_depth'))
+    n_classes = int(meta.get('learner').get('learner_model_param').get('num_class'))
     fn_classes = 1 if n_classes == 0 else n_classes # the number of learners
     n_classes = 2 if n_classes == 0 else n_classes # the actual number of classes
+    n_trees = int(int(meta.get('learner').get('gradient_booster').get('gbtree_model_param').get('num_trees')) / fn_classes)
     n_features = int(meta['learner']['learner_model_param']['num_feature'])
     ensembleDict = {'max_depth' : max_depth,
-                    'n_trees' : int(len(bdt.get_dump()) / fn_classes),
+                    'n_trees' : n_trees,
                     'n_classes' : n_classes,
                     'n_features' : n_features,
                     'trees' : [],
                     'init_predict' : [0] * n_classes,
                     'norm' : 1}

     feature_names = {}
-    if bdt.feature_names is None:
+    if bst.feature_names is None:
         for i in range(n_features):
             feature_names[f'f{i}'] = i
     else:
-        for i, feature_name in enumerate(bdt.feature_names):
+        for i, feature_name in enumerate(bst.feature_names):
             feature_names[feature_name] = i

-    trees = bdt.get_dump()
+    trees = bst.trees_to_dataframe()
     for i in range(ensembleDict['n_trees']):
         treesl = []
         for j in range(fn_classes):
-            tree = trees[fn_classes * i + j]
+            tree = trees[trees.Tree == fn_classes * i + j]
             tree = treeToDict(tree, feature_names)
             treesl.append(tree)
         ensembleDict['trees'].append(treesl)
     return ensembleDict

-def treeToDict(tree, feature_names):
-    # First of all make the tree sklearn-like
-    # split by newline, ignore the last line
-    nodes = tree.split('\n')[:-1]
-    # remove tab characters
-    nodes = list(map(lambda x: x.replace('\t',''), nodes))
-    real_nNodes = len(nodes)
-    # Number of nodes that are in the tree
-    # Pruning removes nodes but does not reset index
-    old_node_indices = []
-    for i in range(real_nNodes):
-        iNode = int(nodes[i].split(':')[0])
-        old_node_indices.append(iNode)
-    # Node indices that are left in the tree after pruning
-    nNodes = max(old_node_indices)+1
-    # Maximum Node index
-    nPrunedNodes = nNodes - len(old_node_indices)
-    if nPrunedNodes > 0:
-        node_to_node_dict = dict(list(enumerate(sorted(old_node_indices))))
-        node_to_node_dict = {value:key for key, value in node_to_node_dict.items()}
-        # Create a dictionary remapping old Node indicies to new node indicies and invert
-    features = [0] * nNodes
-    thresholds = [0] * nNodes
-    children_left = [0] * nNodes
-    children_right = [0] * nNodes
-    values = [0] * nNodes
-    for node in nodes:
-        if node == '':
-            pass
-        elif 'leaf' in node: # is a leaf
-            # Looks like: 'i:leaf=value[i]'
-            data = node.split('leaf')
-            iNode = int(data[0].replace(':',''))
-            if nPrunedNodes > 0:
-                iNode = node_to_node_dict[iNode]
-                # Remap node index
-            feature = -2
-            threshold = 0
-            child_left = -1
-            child_right = -1
-            value = float(data[1].replace('=',''))
-        else:
-            # Looks like:
-            # 'i:[f{feature[i]}<{threshold[i]} yes={children_left[i]},no={children_right[i]}...'
-            iNode = int(node.split(':')[0]) # index comes before ':'
-            if nPrunedNodes > 0:
-                iNode = node_to_node_dict[iNode]
-                # Remap node index
-            # split around 'feature<threshold'
-            data = node.split('<')
-            feature = feature_names[data[0].split('[')[-1]]
-            threshold = float(data[1].split(']')[0])
-            child_left = int(node.split('yes=')[1].split(',')[0])
-            child_right = int(node.split('no=')[1].split(',')[0])
-            if nPrunedNodes > 0:
-                child_left = node_to_node_dict[child_left]
-                child_right = node_to_node_dict[child_right]
-                # Remap node index for children to preserve tree structure
-            value = 0
-        features[iNode] = feature
-        thresholds[iNode] = threshold
-        children_left[iNode] = child_left
-        children_right[iNode] = child_right
-        values[iNode] = value
-    if nPrunedNodes > 0:
-        del features[-nPrunedNodes:]
-        del thresholds[-nPrunedNodes:]
-        del children_left[-nPrunedNodes:]
-        del children_right[-nPrunedNodes:]
-        del values[-nPrunedNodes:]
-        # Remove the last N unused nodes in the tree
-    treeDict = {'feature' : features, 'threshold' : thresholds, 'children_left' : children_left,
-                'children_right' : children_right, 'value' : values}
+def treeToDict(tree : pandas.DataFrame, feature_names):
+    assert isinstance(tree, pandas.DataFrame), "This method expects the tree as a pandas DataFrame"
+    thresholds = tree.Split.fillna(0).tolist()
+    features = tree.Feature.map(lambda x : -2 if x == 'Leaf' else feature_names[x]).tolist()
+    children_left = tree.Yes.map(lambda x : int(x.split('-')[1]) if isinstance(x, str) else -1).tolist()
+    children_right = tree.No.map(lambda x : int(x.split('-')[1]) if isinstance(x, str) else -1).tolist()
+    values = tree.Gain.tolist()
+    treeDict = {'feature' : features,
+                'threshold' : thresholds,
+                'children_left' : children_left,
+                'children_right' : children_right,
+                'value' : values
+               }
+    return treeDict
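The rewritten treeToDict leans on xgboost's Booster.trees_to_dataframe(), which flattens every tree into one row per node. A quick sketch of the DataFrame conventions the new code depends on (standard xgboost behaviour, shown here for reference; exact column sets can vary slightly between xgboost versions):

import xgboost as xgb
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
bst = xgb.train({'objective': 'multi:softprob', 'num_class': 3},
                xgb.DMatrix(X, label=y), num_boost_round=2)
df = bst.trees_to_dataframe()
print(df[['Tree', 'Node', 'Feature', 'Split', 'Yes', 'No', 'Gain']].head())
# Split nodes carry a feature name in 'Feature', the threshold in 'Split', and
# child IDs like '0-1' (tree-node) in 'Yes'/'No'; leaf rows have Feature == 'Leaf',
# NaN in Split/Yes/No, and the leaf value in 'Gain'. treeToDict maps these onto
# the sklearn-style feature/threshold/children_left/children_right/value arrays.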
Changed file: a new test for the xgboost converter (new file, all lines added).

@@ -0,0 +1,89 @@
from sklearn.datasets import make_hastie_10_2, load_diabetes, load_iris
import xgboost as xgb
import conifer
import datetime
from scipy.special import expit, logit, softmax
import numpy as np
import pytest
import logging
import sys

#logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Make a random dataset from sklearn 'hastie'
# binary classification
def hastie():
    X, y = make_hastie_10_2(random_state=0)
    y[y == -1] = 0
    return X, y, xgb.DMatrix(X, label=y)

# multiclass classification
def iris():
    X, y = load_iris(return_X_y=True)
    return X, y, xgb.DMatrix(X, label=y)

# regression
def diabetes():
    X, y = load_diabetes(return_X_y=True)
    return X, y, xgb.DMatrix(X, label=y)

def model_0(d, kwarg_params={}):
    # Train a BDT
    param = {'max_depth': 3, 'eta': 1, 'objective': 'binary:logistic'}
    param.update(kwarg_params)
    num_round = 20 # num_round is equivalent to number of trees
    bst = xgb.train(param, d, num_round)
    return bst

def model_1(d, kwarg_params={}):
    # Train a BDT
    param = {'max_depth': 3, 'eta': 1, 'objective': 'binary:logistic', 'updater':'grow_histmaker'}
    param.update(kwarg_params)
    num_round = 20 # num_round is equivalent to number of trees
    bst = xgb.train(param, d, num_round)
    return bst

def model_2(X, y, kwarg_params={}):
    bdt = xgb.XGBClassifier(n_estimators=10, max_depth=3, **kwarg_params)
    bdt.fit(X, y)
    return bdt

def model_3(X, y, kwarg_params={}):
    bdt = xgb.XGBRegressor(n_estimators=10, max_depth=3, **kwarg_params)
    bdt.fit(X, y)
    return bdt

# parameter format: (test index, model function, model kwargs, data function,
#                    data fmt ['np', 'xgb'], predictor function, prediction transform, transform kwargs)
@pytest.mark.parametrize('params', [(0, model_0, {}, hastie, 'xgb', 'predict', expit, {}),
                                    (1, model_1, {}, hastie, 'xgb', 'predict', expit, {}),
                                    (2, model_2, {}, hastie, 'np', 'predict_proba', expit, {}),
                                    (3, model_2, {}, iris, 'np', 'predict_proba', softmax, {'axis':1}),
                                    (4, model_0, {'objective': 'multi:softprob', 'num_class': 3}, iris, 'xgb', 'predict', softmax, {'axis':1}),
                                    (5, model_3, {}, diabetes, 'np', 'predict', lambda x: x, {}), # not yet possible
                                   ])
def test_xgb(params):
    test_idx = params[0]
    get_model_function = params[1]
    get_model_kwargs = params[2]
    get_data_function = params[3]
    data_fmt = params[4]
    predictor = params[5]
    transform = params[6]
    transform_kwargs = params[7]
    X, y, d = get_data_function()
    if data_fmt == 'xgb':
        model = get_model_function(d, get_model_kwargs)
    else:
        model = get_model_function(X, y, get_model_kwargs)
    cfg = conifer.backends.cpp.auto_config()
    cfg['Precision'] = 'double'
    # Set the output directory to something unique
    cfg['OutputDir'] = f'prj_xgb_converter_{test_idx}_{int(datetime.datetime.now().timestamp())}'
    cnf_model = conifer.converters.convert_from_xgboost(model, cfg)
    cnf_model.compile()
    y_cnf = np.squeeze(transform(cnf_model.decision_function(X), **transform_kwargs))
    X_xgb = X if data_fmt == 'np' else d
    y_xgb = getattr(model, predictor)(X_xgb)
    if len(y_xgb.shape) == 2 and y_xgb.shape[1] == 2:
        y_xgb = y_xgb[:,-1]
    np.testing.assert_array_almost_equal(y_cnf, y_xgb)
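A note on the comparison at the end of the test: decision_function on the converted conifer model returns the raw ensemble sums, so each parametrized case applies the matching link function (expit for binary logistic, softmax across classes for multi:softprob, identity for regression) before asserting near-equality against xgboost's own predict/predict_proba output; the two-column binary predict_proba is first reduced to its positive-class column.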