Commit
* Make the xgboost converter more robust; add an xgboost converter test
* Enable importing multiple compiled C++ backend bridges in the same session
* Fix xgboost multiclass conversion; add multiclass and regression tests
* Simplify treeToDict for the xgboost converter
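For orientation, here is a minimal sketch of how the converter is meant to be used after this commit, assembled from the calls exercised by the new test further below (conifer.backends.cpp.auto_config, convert_from_xgboost, compile, decision_function); treat it as illustrative rather than canonical API documentation.

import xgboost as xgb
import conifer
from sklearn.datasets import load_iris

# Either a low-level Booster or a scikit-learn wrapper now converts;
# XGBClassifier/XGBRegressor are unwrapped via get_booster() internally.
X, y = load_iris(return_X_y=True)
model = xgb.XGBClassifier(n_estimators=10, max_depth=3)
model.fit(X, y)

cfg = conifer.backends.cpp.auto_config()
cfg['OutputDir'] = 'prj_xgb_example'  # unique per project, so several compiled
                                      # C++ backend bridges can coexist in one session
cnf_model = conifer.converters.convert_from_xgboost(model, cfg)
cnf_model.compile()
scores = cnf_model.decision_function(X)  # raw ensemble scores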
Showing 5 changed files with 133 additions and 102 deletions.
Changed file: the xgboost converter. In the diff below, lines prefixed '-' were removed and lines prefixed '+' were added.

@@ -1,110 +1,60 @@
 import numpy as np
 import json
 import xgboost as xgb
+import pandas
+from typing import Union

-def convert(bdt):
-    meta = json.loads(bdt.save_config())
-    max_depth = int(meta['learner']['gradient_booster']['updater']['grow_colmaker']['train_param']['max_depth'])
-    n_classes = int(meta['learner']['learner_model_param']['num_class'])
+def convert(bdt : Union[xgb.core.Booster, xgb.XGBClassifier, xgb.XGBRegressor]):
+    assert isinstance(bdt, (xgb.core.Booster, xgb.XGBClassifier, xgb.XGBRegressor))
+    if isinstance(bdt, xgb.core.Booster):
+        bst = bdt
+    elif isinstance(bdt, (xgb.XGBClassifier, xgb.XGBRegressor)):
+        bst = bdt.get_booster()
+    meta = json.loads(bst.save_config())
+    updater = meta.get('learner').get('gradient_booster').get('gbtree_train_param').get('updater').split(',')[0]
+    max_depth = int(meta.get('learner').get('gradient_booster').get('updater').get(updater).get('train_param').get('max_depth'))
+    n_classes = int(meta.get('learner').get('learner_model_param').get('num_class'))
     fn_classes = 1 if n_classes == 0 else n_classes # the number of learners
     n_classes = 2 if n_classes == 0 else n_classes # the actual number of classes
+    n_trees = int(int(meta.get('learner').get('gradient_booster').get('gbtree_model_param').get('num_trees')) / fn_classes)
     n_features = int(meta['learner']['learner_model_param']['num_feature'])
     ensembleDict = {'max_depth' : max_depth,
-                    'n_trees' : int(len(bdt.get_dump()) / fn_classes),
+                    'n_trees' : n_trees,
                     'n_classes' : n_classes,
                     'n_features' : n_features,
                     'trees' : [],
                     'init_predict' : [0] * n_classes,
                     'norm' : 1}

     feature_names = {}
-    if bdt.feature_names is None:
+    if bst.feature_names is None:
         for i in range(n_features):
             feature_names[f'f{i}'] = i
     else:
-        for i, feature_name in enumerate(bdt.feature_names):
+        for i, feature_name in enumerate(bst.feature_names):
             feature_names[feature_name] = i

-    trees = bdt.get_dump()
+    trees = bst.trees_to_dataframe()
     for i in range(ensembleDict['n_trees']):
         treesl = []
         for j in range(fn_classes):
-            tree = trees[fn_classes * i + j]
+            tree = trees[trees.Tree == fn_classes * i + j]
             tree = treeToDict(tree, feature_names)
             treesl.append(tree)
         ensembleDict['trees'].append(treesl)
     return ensembleDict

-def treeToDict(tree, feature_names):
-    # First of all make the tree sklearn-like
-    # split by newline, ignore the last line
-    nodes = tree.split('\n')[:-1]
-    # remove tab characters
-    nodes = list(map(lambda x: x.replace('\t',''), nodes))
-    real_nNodes = len(nodes)
-    # Number of nodes that are in the tree
-    # Pruning removes nodes but does not reset index
-    old_node_indices = []
-    for i in range(real_nNodes):
-        iNode = int(nodes[i].split(':')[0])
-        old_node_indices.append(iNode)
-    # Node indices that are left in the tree after pruning
-    nNodes = max(old_node_indices)+1
-    # Maximum Node index
-    nPrunedNodes = nNodes - len(old_node_indices)
-    if nPrunedNodes > 0:
-        node_to_node_dict = dict(list(enumerate(sorted(old_node_indices))))
-        node_to_node_dict = {value:key for key, value in node_to_node_dict.items()}
-        # Create a dictionary remapping old Node indicies to new node indicies and invert
-    features = [0] * nNodes
-    thresholds = [0] * nNodes
-    children_left = [0] * nNodes
-    children_right = [0] * nNodes
-    values = [0] * nNodes
-    for node in nodes:
-        if node == '':
-            pass
-        elif 'leaf' in node: # is a leaf
-            # Looks like: 'i:leaf=value[i]'
-            data = node.split('leaf')
-            iNode = int(data[0].replace(':',''))
-            if nPrunedNodes > 0:
-                iNode = node_to_node_dict[iNode]
-                # Remap node index
-            feature = -2
-            threshold = 0
-            child_left = -1
-            child_right = -1
-            value = float(data[1].replace('=',''))
-        else:
-            # Looks like:
-            # 'i:[f{feature[i]}<{threshold[i]} yes={children_left[i]},no={children_right[i]}...'
-            iNode = int(node.split(':')[0]) # index comes before ':'
-            if nPrunedNodes > 0:
-                iNode = node_to_node_dict[iNode]
-                # Remap node index
-            # split around 'feature<threshold'
-            data = node.split('<')
-            feature = feature_names[data[0].split('[')[-1]]
-            threshold = float(data[1].split(']')[0])
-            child_left = int(node.split('yes=')[1].split(',')[0])
-            child_right = int(node.split('no=')[1].split(',')[0])
-            if nPrunedNodes > 0:
-                child_left = node_to_node_dict[child_left]
-                child_right = node_to_node_dict[child_right]
-                # Remap node index for children to preserve tree structure
-            value = 0
-        features[iNode] = feature
-        thresholds[iNode] = threshold
-        children_left[iNode] = child_left
-        children_right[iNode] = child_right
-        values[iNode] = value
-    if nPrunedNodes > 0:
-        del features[-nPrunedNodes:]
-        del thresholds[-nPrunedNodes:]
-        del children_left[-nPrunedNodes:]
-        del children_right[-nPrunedNodes:]
-        del values[-nPrunedNodes:]
-        # Remove the last N unused nodes in the tree
-    treeDict = {'feature' : features, 'threshold' : thresholds, 'children_left' : children_left,
-                'children_right' : children_right, 'value' : values}
+def treeToDict(tree : pandas.DataFrame, feature_names):
+    assert isinstance(tree, pandas.DataFrame), "This method expects the tree as a pandas DataFrame"
+    thresholds = tree.Split.fillna(0).tolist()
+    features = tree.Feature.map(lambda x : -2 if x == 'Leaf' else feature_names[x]).tolist()
+    children_left = tree.Yes.map(lambda x : int(x.split('-')[1]) if isinstance(x, str) else -1).tolist()
+    children_right = tree.No.map(lambda x : int(x.split('-')[1]) if isinstance(x, str) else -1).tolist()
+    values = tree.Gain.tolist()
+    treeDict = {'feature' : features,
+                'threshold' : thresholds,
+                'children_left' : children_left,
+                'children_right' : children_right,
+                'value' : values
+               }
+    return treeDict
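The rewritten treeToDict leans on xgboost's Booster.trees_to_dataframe(), which flattens every tree into one row per node. A quick sketch of the DataFrame conventions the new code depends on (standard xgboost behaviour, shown here for reference; exact column sets can vary slightly between xgboost versions):

import xgboost as xgb
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
bst = xgb.train({'objective': 'multi:softprob', 'num_class': 3},
                xgb.DMatrix(X, label=y), num_boost_round=2)
df = bst.trees_to_dataframe()
print(df[['Tree', 'Node', 'Feature', 'Split', 'Yes', 'No', 'Gain']].head())
# Split nodes carry a feature name in 'Feature', the threshold in 'Split', and
# child IDs like '0-1' (tree-node) in 'Yes'/'No'; leaf rows have Feature == 'Leaf',
# NaN in Split/Yes/No, and the leaf value in 'Gain'. treeToDict maps these onto
# the sklearn-style feature/threshold/children_left/children_right/value arrays.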
Changed file: a new test for the xgboost converter (new file, all lines added).

@@ -0,0 +1,89 @@
from sklearn.datasets import make_hastie_10_2, load_diabetes, load_iris
import xgboost as xgb
import conifer
import datetime
from scipy.special import expit, logit, softmax
import numpy as np
import pytest
import logging
import sys

#logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Make a random dataset from sklearn 'hastie'
# binary classification
def hastie():
    X, y = make_hastie_10_2(random_state=0)
    y[y == -1] = 0
    return X, y, xgb.DMatrix(X, label=y)

# multiclass classification
def iris():
    X, y = load_iris(return_X_y=True)
    return X, y, xgb.DMatrix(X, label=y)

# regression
def diabetes():
    X, y = load_diabetes(return_X_y=True)
    return X, y, xgb.DMatrix(X, label=y)

def model_0(d, kwarg_params={}):
    # Train a BDT
    param = {'max_depth': 3, 'eta': 1, 'objective': 'binary:logistic'}
    param.update(kwarg_params)
    num_round = 20 # num_round is equivalent to number of trees
    bst = xgb.train(param, d, num_round)
    return bst

def model_1(d, kwarg_params={}):
    # Train a BDT
    param = {'max_depth': 3, 'eta': 1, 'objective': 'binary:logistic', 'updater':'grow_histmaker'}
    param.update(kwarg_params)
    num_round = 20 # num_round is equivalent to number of trees
    bst = xgb.train(param, d, num_round)
    return bst

def model_2(X, y, kwarg_params={}):
    bdt = xgb.XGBClassifier(n_estimators=10, max_depth=3, **kwarg_params)
    bdt.fit(X, y)
    return bdt

def model_3(X, y, kwarg_params={}):
    bdt = xgb.XGBRegressor(n_estimators=10, max_depth=3, **kwarg_params)
    bdt.fit(X, y)
    return bdt

# parameter format: (test index, model function, model kwargs, data function,
#                    data fmt ['np', 'xgb'], predictor function, prediction transform, transform kwargs)
@pytest.mark.parametrize('params', [(0, model_0, {}, hastie, 'xgb', 'predict', expit, {}),
                                    (1, model_1, {}, hastie, 'xgb', 'predict', expit, {}),
                                    (2, model_2, {}, hastie, 'np', 'predict_proba', expit, {}),
                                    (3, model_2, {}, iris, 'np', 'predict_proba', softmax, {'axis':1}),
                                    (4, model_0, {'objective': 'multi:softprob', 'num_class': 3}, iris, 'xgb', 'predict', softmax, {'axis':1}),
                                    (5, model_3, {}, diabetes, 'np', 'predict', lambda x: x, {}), # not yet possible
                                   ])
def test_xgb(params):
    test_idx = params[0]
    get_model_function = params[1]
    get_model_kwargs = params[2]
    get_data_function = params[3]
    data_fmt = params[4]
    predictor = params[5]
    transform = params[6]
    transform_kwargs = params[7]
    X, y, d = get_data_function()
    if data_fmt == 'xgb':
        model = get_model_function(d, get_model_kwargs)
    else:
        model = get_model_function(X, y, get_model_kwargs)
    cfg = conifer.backends.cpp.auto_config()
    cfg['Precision'] = 'double'
    # Set the output directory to something unique
    cfg['OutputDir'] = f'prj_xgb_converter_{test_idx}_{int(datetime.datetime.now().timestamp())}'
    cnf_model = conifer.converters.convert_from_xgboost(model, cfg)
    cnf_model.compile()
    y_cnf = np.squeeze(transform(cnf_model.decision_function(X), **transform_kwargs))
    X_xgb = X if data_fmt == 'np' else d
    y_xgb = getattr(model, predictor)(X_xgb)
    if len(y_xgb.shape) == 2 and y_xgb.shape[1] == 2:
        y_xgb = y_xgb[:,-1]
    np.testing.assert_array_almost_equal(y_cnf, y_xgb)
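A note on the comparison at the end of the test: decision_function on the converted conifer model returns the raw ensemble sums, so each parametrized case applies the matching link function (expit for binary logistic, softmax across classes for multi:softprob, identity for regression) before asserting near-equality against xgboost's own predict/predict_proba output; the two-column binary predict_proba is first reduced to its positive-class column.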