EliLillyCo
diff --git a/‎contrib/bin/xgbd/xgbd_evaluate.py
Lines changed: 91 additions & 0 deletions b/‎contrib/bin/xgbd/xgbd_evaluate.py
Lines changed: 91 additions & 0 deletions
diff --git a/‎contrib/bin/xgbd/xgbd_make.py
Lines changed: 178 additions & 0 deletions b/‎contrib/bin/xgbd/xgbd_make.py
Lines changed: 178 additions & 0 deletions
diff --git a/‎contrib/bin/xgbd/xgboost_model.proto
Lines changed: 27 additions & 0 deletions b/‎contrib/bin/xgbd/xgboost_model.proto
Lines changed: 27 additions & 0 deletions
diff --git a/‎contrib/bin/xgbd/xgboost_model_pb2.py
Lines changed: 27 additions & 0 deletions b/‎contrib/bin/xgbd/xgboost_model_pb2.py
Lines changed: 27 additions & 0 deletions
diff --git a/‎docs/Molecule_Lib/io.md
Lines changed: 16 additions & 0 deletions b/‎docs/Molecule_Lib/io.md
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,91 @@
+# Evaluate an xgboost descriptor model built with xgbd_make
+
+import os
+import re
+
+import pandas as pd
+
+from absl import app
+from absl import flags
+from absl import logging
+from google.protobuf import text_format
+
+from xgboost import XGBClassifier
+from xgboost import XGBRegressor
+
+import xgboost_model_pb2
+
+# Command line flags are registered with absl; main() reads them via FLAGS.
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("mdir", "", "Model directory")
+
+def get_model(mdir: str)->tuple:
+  """Look for 'what_kind_of_model` in `mdir` and make sure it is OK
+    Return a model instantiated from mdir/xgboost.json and the
+    name of the response
+  """
+  fname = os.path.join(mdir, "model_metadata.txt")
+  if not os.path.exists(fname):
+    logging.error("%s not found", fname)
+    return None, None
+
+  with open(fname, "r") as reader:
+    text = reader.read()
+
+  proto = text_format.Parse(text, xgboost_model_pb2.XGBoostModel())
+  if not proto:
+    logging.error("Cannot interpret as proto %s", text)
+    return None, None
+
+  if not proto.response:
+    logging.error("No response in %s", fname)
+    return None, None
+
+  model_file = os.path.join(mdir, "xgboost.json")
+  if not os.path.exists(model_file):
+    logging.error("%s not found", model_file)
+    return None
+
+  model = XGBRegressor()
+  model.load_model(model_file)
+
+  return model, proto.response
+
+def xgboost_evaluate(mdir: str, fname: str)->bool:
+  """Read `fname` as descriptors for a model in `mdir`
+  """
+  if not os.path.isdir(mdir):
+    logging.error("Model directory %s not found", mdir)
+    return False
+
+  model, response = get_model(mdir)
+  if not model:
+    logging.error("Invalid mode in %s", mdir)
+    return False
+
+  data = pd.read_csv(fname, sep=' ', header=0)
+
+  logging.info("Evaluating %d rows", len(data))
+  results = model.predict(data.iloc[:,1:])
+  print(f"Id {response}")
+  for i in range(len(results)):
+    print(f"{data.iloc[i,0]} {results[i]:.4f}")
+
+  return True
+
+def main(argv):
+  """Temporary tool to fix broken multi-fragment unique smiles problem
+  """
+  if len(argv) == 1:
+    logging.error("Must specify descriptor file as argument")
+    return 1
+
+  if not FLAGS.mdir:
+    logging.error("must specify model directory via the --mdir option")
+    return 1
+
+
+  return xgboost_evaluate(FLAGS.mdir, argv[1])
+
+if __name__ == '__main__':
+  app.run(main)
@@ -0,0 +1,178 @@
+# Build and commit an xgboost model
+# Deliberately simplistic in approach
+
+import os
+
+import pandas as pd
+import sklearn
+from matplotlib import pyplot
+
+from xgboost import plot_importance
+from xgboost import XGBClassifier
+from xgboost import XGBRegressor
+
+from absl import app
+from absl import flags
+from absl import logging
+from google.protobuf import text_format
+
+import xgboost_model_pb2
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("activity", "", "Name of training set activity file")
+flags.DEFINE_string("desc", "", "Name of training set descriptor file")
+flags.DEFINE_boolean("classification", False, "True if this is a classification task")
+flags.DEFINE_string("mdir", "", "Directory into which the model is placed")
+flags.DEFINE_integer("max_num_features", 0, "Maximum number of features to plot in variable importance")
+flags.DEFINE_string("feature_importance", "", "File containing feature importance values")
+flags.DEFINE_integer("xgverbosity", 0, "xgboost verbosity")
+flags.DEFINE_string("proto", "", "A file containing an XGBoostParameters proto")
+flags.DEFINE_float("eta", 0.3, "xgboost learning rate parameter eta")
+flags.DEFINE_integer("max_depth", 6, "xgboost max depth")
+flags.DEFINE_integer("n_estimators", 100, "xboost number of estimators")
+
+class Options:
+  def __init__(self):
+    self.classification = False
+    self.mdir: str = ""
+    self.max_num_features: int = 10
+    self.verbosity = 0
+    self.proto = xgboost_model_pb2.XGBoostParameters()
+
+  def read_proto(self, fname)->bool:
+    """Read self.proto from `fname`
+    """
+    with open(fname, "r") as reader:
+      text = reader.read()
+
+    self.proto = text_format.Parse(text, xgboost_model_pb2.XGBoostParameters())
+    if not self.proto:
+      logging.error("Cannot intpret %s", text)
+      return False
+
+    return True
+def classification(x, y, options: Options)->bool:
+  """build a classification model
+  """
+  booster = XGBClassifier(verbosity=options.verbosity)
+  booster.fit(x, y)
+
+def regression(x, y, options: Options):
+  """build a regression model.
+  """
+  booster = XGBRegressor(verbosity=options.verbosity,
+                eta=options.proto.eta,
+                max_depth=options.proto.max_depth,
+                n_estimators = options.proto.n_estimators)
+  booster.fit(x, y)
+
+  booster.save_model(os.path.join(options.mdir, "xgboost.json"))
+  if options.max_num_features:
+    plot_importance(booster, max_num_features=options.max_num_features)
+    pyplot.show()
+  if options.feature_importance:
+    feature_importance = booster.get_booster().get_score(importance_type='weight')
+    feature_importance = sorted(feature_importance.items(), key=lambda x:x[1])
+    if options.feature_importance:
+      with open(os.path.join(options.mdir, options.feature_importance), "w") as writer:
+        # Write a markdown table, easy to undo if needed.
+        print("| Feature | Weight |", file=writer)
+        print("| ------- | ------ |", file=writer)
+        for f, i in feature_importance:
+          print(f"| {f} | {i} |", file=writer)
+
+  # config = booster.save_config()
+
+  return True
+
+def build_xgboost_model(descriptor_fname: str,
+                        activity_fname: str,
+                        options: Options)->bool:
+  """Build an xgboost model on the data in `descriptor_fname` and
+     `activity_fname`.
+    This function does data preprocessing.
+  """
+
+  descriptors = pd.read_csv(descriptor_fname, sep=' ', header=0, low_memory=False)
+  logging.info("Read %d rows and %d columns from %s", len(descriptors),
+                descriptors.shape[1], descriptor_fname)
+  activity = pd.read_csv(activity_fname, sep=' ', header=0)
+  logging.info("Read %d rows from %s", activity.shape[0], activity_fname)
+
+
+  descriptors.rename(columns={descriptors.columns[0]: "Id"}, inplace=True)
+  activity.rename(columns={activity.columns[0]: "Id"}, inplace=True)
+  combined = pd.concat([activity.set_index("Id"),
+                        descriptors.set_index("Id")], axis=1, join='inner').reset_index() 
+  if len(combined) != len(descriptors):
+    logging.error("Combined set has %d rows", len(combined))
+    return 1
+
+  if not os.path.isdir(options.mdir):
+    os.mkdir(options.mdir)
+
+  y = combined.iloc[:,1].to_numpy()
+
+  x = combined.iloc[:,2:]
+  x.apply(pd.to_numeric).to_numpy()
+
+  rc = False
+  if options.classification:
+    rc = classification(x, y, options)
+  else:
+    rc = regression(x, y, options)
+
+  if not rc:
+    return False
+
+  response = activity.columns[1]
+
+  proto = xgboost_model_pb2.XGBoostModel();
+  proto.model_type = "XGBD"
+  proto.classification = False
+  proto.response = response
+  proto.parameters.CopyFrom(options.proto)
+  with open(os.path.join(options.mdir, "model_metadata.txt"), "w") as f:
+    f.write(text_format.MessageToString(proto))
+
+  return True
+
+def main(argv):
+  """Build xgboost models from activity file and descriptor file.
+  """
+  if not FLAGS.activity:
+    logging.error("Must specifythe name of the activity file with the --activity option")
+    return False
+  if not FLAGS.desc:
+    logging.error("Must specifythe name of the descriptor file with the --desc option")
+    return False
+  if not FLAGS.mdir:
+    logging.error("Must specifyi the model directory via the --mdir option")
+    return False
+
+  options = Options()
+  options.classification = FLAGS.classification
+  options.mdir = FLAGS.mdir
+  options.max_num_features = FLAGS.max_num_features
+  options.feature_importance = FLAGS.feature_importance
+  options.verbosity = FLAGS.xgverbosity
+
+  # Build the proto first, and then anything that might overwrite it.
+  if FLAGS.proto:
+    if not options.read_proto(FLAGS.proto):
+      logging.error("Cannot read textproto parameters %s", FLAGS.proto)
+      return False
+  else:
+    options.proto.eta = FLAGS.eta
+    options.proto.max_depth = FLAGS.max_depth
+    options.proto.n_estimators = FLAGS.n_estimators
+
+  if not build_xgboost_model(FLAGS.desc, FLAGS.activity, options):
+    logging.error("Model %s not build", options.mdir)
+    return False
+
+  return True
+
+if __name__ == '__main__':
+  app.run(main)
@@ -0,0 +1,27 @@
+syntax = "proto3";
+
+package xgboost_model;
+
+// Hyperparameters used when building an xgboost model.
+message XGBoostParameters {
+  // xgboost learning rate.
+  optional float eta = 1;
+
+  // xgboost max depth.
+  optional uint32 max_depth = 2;
+
+  // Number of estimators.
+  optional uint32 n_estimators = 3;
+
+  // NOTE(review): not read by xgbd_make.py as shown here - confirm intended use.
+  optional uint32 min_samples_split = 4;
+}
+
+// Currently has no fields.
+message LightGbmParameters {
+}
+
+// Metadata stored as text proto in model_metadata.txt in the model directory.
+message XGBoostModel {
+  // Kind of model. xgbd_make writes "XGBD".
+  optional string model_type = 1;
+
+  // Name of the response: the header of the second column of the activity file.
+  optional string response = 2;
+
+  // The hyperparameters the model was built with.
+  optional XGBoostParameters parameters = 3;
+
+  // Whether this is a classification model.
+  optional bool classification = 4;
+
+}
@@ -349,6 +349,13 @@ still need `-i allsdfid`, and you may also need `-i SDFNONAME`. Otherwise
 the name field will be processed normally, leading to `smiles name { "FOO": "bar" }`
 which might be exactly what you want.
 
+### -i NAME2JSON
+If needed, the molecule name can be encoded in json form as well as the
+specified sdf tags. The resulting form will be
+```
+"name": "molecule name" "tag1": "something" ...
+```
+
 ### -i SDFNONAME
 Normally the first record in an MDL connection table is the name
 of the molecule. If this is specified, discard that information instead.
@@ -519,6 +526,7 @@ yields
  -o nochiral    exclude chirality info from smiles and mdl outputs
  -o nochiralflag don't write the chiral flag info to mdl files
  -o NOCT        exclude any CIS/TRANS information from smiles output
+ -o smisep=\<c\>  separator between smiles and name: 'smisep=tab' for example
  -o \<type\>      specify one or more output types (smi,usmi,nausmi,rsmi,sdf,tdt,mol2,marvin)
 ```
 
@@ -619,5 +627,13 @@ Do not write the chiral flag info to mdl files.
 ### -o NOCT
 exclude any CIS/TRANS information from smiles output.
 
+### -o smisep
+By default, smiles are written as smiles followed by a space and then
+the id. The space can be changed to any character with this directive.
+```
+-o smisep=vbar
+```
+will result in a vertical bar being the output separator.
+
 ### -o \<type\>
 specify one or more output types (smi,usmi,nausmi,rsmi,sdf,tdt,mol2,marvin).