Skip to content

Commit df40450

Browse files
authored
v7.2.1 release PR (#32)
* solve conflict with upstream master * solve conflict with upstream master * v7.2.1 release * minor fix
1 parent 129c2c0 commit df40450

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+4891
-2763
lines changed

contrib/bin/xgbd/xgbd_evaluate.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# Evaluate an xgboost descriptor model built with xgbd_make
2+
3+
import os
4+
import re
5+
6+
import pandas as pd
7+
8+
from absl import app
9+
from absl import flags
10+
from absl import logging
11+
from google.protobuf import text_format
12+
13+
from xgboost import XGBClassifier
14+
from xgboost import XGBRegressor
15+
16+
import xgboost_model_pb2
17+
18+
# Command line options for this tool, accessed through FLAGS.
FLAGS = flags.FLAGS

# --mdir: directory holding model_metadata.txt and xgboost.json.
flags.DEFINE_string(name="mdir", default="", help="Model directory")
21+
22+
def get_model(mdir: str)->tuple:
    """Load the model stored in `mdir` and the name of its response.

    Reads `mdir`/model_metadata.txt, a text format XGBoostModel proto, and
    then `mdir`/xgboost.json, the saved xgboost model.

    Args:
      mdir: model directory.
    Returns:
      (model, response name) on success. (None, None) on every failure, so
      callers can always unpack the result into two names.
    """
    fname = os.path.join(mdir, "model_metadata.txt")
    if not os.path.exists(fname):
        logging.error("%s not found", fname)
        return None, None

    with open(fname, "r") as reader:
        text = reader.read()

    # text_format.Parse reports malformed input by raising ParseError; it
    # does not return a falsy value.
    try:
        proto = text_format.Parse(text, xgboost_model_pb2.XGBoostModel())
    except text_format.ParseError as err:
        logging.error("Cannot interpret as proto %s: %s", text, err)
        return None, None

    if not proto.response:
        logging.error("No response in %s", fname)
        return None, None

    model_file = os.path.join(mdir, "xgboost.json")
    if not os.path.exists(model_file):
        logging.error("%s not found", model_file)
        return None, None

    # Use the estimator type recorded in the metadata. When the
    # classification field is unset or false this is a regressor.
    model = XGBClassifier() if proto.classification else XGBRegressor()
    model.load_model(model_file)

    return model, proto.response
53+
54+
def xgboost_evaluate(mdir: str, fname: str)->bool:
    """Score the descriptors in `fname` with the model in `mdir`.

    `fname` is read as a space separated file with a header record. The
    first column is treated as the identifier and all remaining columns
    are passed to the model. Results are written to stdout as
    "Id <response>" followed by one "<id> <prediction>" line per row.

    Args:
      mdir: model directory.
      fname: descriptor file.
    Returns:
      True if predictions were written, False if the model could not be
      loaded.
    """
    if not os.path.isdir(mdir):
        logging.error("Model directory %s not found", mdir)
        return False

    model, response = get_model(mdir)
    if model is None:
        logging.error("Invalid model in %s", mdir)
        return False

    data = pd.read_csv(fname, sep=' ', header=0)

    logging.info("Evaluating %d rows", len(data))
    results = model.predict(data.iloc[:, 1:])

    print(f"Id {response}")
    # Identifiers and predictions are in the same row order.
    for name, score in zip(data.iloc[:, 0], results):
        print(f"{name} {score:.4f}")

    return True
75+
76+
def main(argv):
    """Evaluate the model in --mdir on the descriptor file given as argument.

    Args:
      argv: program name followed by the positional arguments; argv[1] is
        the descriptor file.
    Returns:
      Process exit status: 0 on success, 1 on failure.
    """
    if len(argv) < 2:
        logging.error("Must specify descriptor file as argument")
        return 1

    if not FLAGS.mdir:
        logging.error("must specify model directory via the --mdir option")
        return 1

    # The value returned here becomes the process exit status, so the bool
    # from xgboost_evaluate must be converted: returning True directly
    # would exit with status 1 on success.
    return 0 if xgboost_evaluate(FLAGS.mdir, argv[1]) else 1
89+
90+
if __name__ == '__main__':
91+
app.run(main)

contrib/bin/xgbd/xgbd_make.py

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
# Build and commit an xgboost model
2+
# Deliberately simplistic in approach
3+
4+
import os
5+
6+
import pandas as pd
7+
import sklearn
8+
from matplotlib import pyplot
9+
10+
from xgboost import plot_importance
11+
from xgboost import XGBClassifier
12+
from xgboost import XGBRegressor
13+
14+
from absl import app
15+
from absl import flags
16+
from absl import logging
17+
from google.protobuf import text_format
18+
19+
import xgboost_model_pb2
20+
21+
# Command line options for this tool, accessed through FLAGS.
FLAGS = flags.FLAGS

# Input files and the model destination.
flags.DEFINE_string("activity", "", "Name of training set activity file")
flags.DEFINE_string("desc", "", "Name of training set descriptor file")
flags.DEFINE_boolean("classification", False, "True if this is a classification task")
flags.DEFINE_string("mdir", "", "Directory into which the model is placed")

# Reporting.
flags.DEFINE_integer("max_num_features", 0, "Maximum number of features to plot in variable importance")
flags.DEFINE_string("feature_importance", "", "File containing feature importance values")
flags.DEFINE_integer("xgverbosity", 0, "xgboost verbosity")

# Hyperparameters: either a textproto file, or the individual values below,
# which are only consulted when --proto is not given.
flags.DEFINE_string("proto", "", "A file containing an XGBoostParameters proto")
flags.DEFINE_float("eta", 0.3, "xgboost learning rate parameter eta")
flags.DEFINE_integer("max_depth", 6, "xgboost max depth")
flags.DEFINE_integer("n_estimators", 100, "xgboost number of estimators")
34+
35+
class Options:
    """Settings that control how a model is built and where it is written."""

    def __init__(self):
        # True to build a classification model, otherwise regression.
        self.classification = False
        # Directory into which the model is written.
        self.mdir: str = ""
        # Number of features shown in the importance plot; 0 means no plot.
        self.max_num_features: int = 10
        # Name of the feature importance file written inside mdir; empty
        # means no file is written. Initialised here so model building
        # never depends on a caller having assigned it.
        self.feature_importance: str = ""
        # xgboost verbosity.
        self.verbosity = 0
        # Hyperparameters, an XGBoostParameters proto.
        self.proto = xgboost_model_pb2.XGBoostParameters()

    def read_proto(self, fname)->bool:
        """Read self.proto from the textproto file `fname`.

        Args:
          fname: file containing a text format XGBoostParameters proto.
        Returns:
          True on success. False if the file cannot be read or parsed, in
          which case self.proto is left unchanged.
        """
        try:
            with open(fname, "r") as reader:
                text = reader.read()
        except OSError as err:
            logging.error("Cannot open %s: %s", fname, err)
            return False

        # text_format.Parse reports malformed input by raising ParseError,
        # so testing the returned message for truth cannot detect failure.
        try:
            parsed = text_format.Parse(text, xgboost_model_pb2.XGBoostParameters())
        except text_format.ParseError as err:
            logging.error("Cannot interpret %s: %s", text, err)
            return False

        self.proto = parsed
        return True
55+
def classification(x, y, options: Options)->bool:
    """Build a classification model and save it in options.mdir.

    Args:
      x: descriptor values, one row per training item.
      y: class label for each row of `x`.
      options: supplies the xgboost verbosity and the model directory.
    Returns:
      True once the fitted model has been written to
      options.mdir/xgboost.json.
    """
    booster = XGBClassifier(verbosity=options.verbosity)
    booster.fit(x, y)

    # Persist the fitted model and report success. Falling off the end
    # would return None, which the caller interprets as failure.
    booster.save_model(os.path.join(options.mdir, "xgboost.json"))

    return True
60+
61+
def regression(x, y, options: Options):
    """Build a regression model and save it in options.mdir.

    Optionally displays a feature importance plot and writes feature
    weights as a markdown table.

    Args:
      x: descriptor values, one row per training item.
      y: response value for each row of `x`.
      options: verbosity, hyperparameters (options.proto), model directory
        and reporting choices.
    Returns:
      True once the model has been written.
    """
    # Forward only the hyperparameters actually present in the proto. An
    # unset field reads as 0, which would otherwise be passed to xgboost
    # as a real setting (for example n_estimators=0).
    params = {name: getattr(options.proto, name)
              for name in ("eta", "max_depth", "n_estimators")
              if options.proto.HasField(name)}
    booster = XGBRegressor(verbosity=options.verbosity, **params)
    booster.fit(x, y)

    booster.save_model(os.path.join(options.mdir, "xgboost.json"))

    if options.max_num_features:
        plot_importance(booster, max_num_features=options.max_num_features)
        pyplot.show()

    # Tolerate an Options instance on which the attribute was never set.
    importance_fname = getattr(options, "feature_importance", "")
    if importance_fname:
        weights = booster.get_booster().get_score(importance_type='weight')
        by_weight = sorted(weights.items(), key=lambda item: item[1])
        with open(os.path.join(options.mdir, importance_fname), "w") as writer:
            # Write a markdown table, easy to undo if needed.
            print("| Feature | Weight |", file=writer)
            print("| ------- | ------ |", file=writer)
            for feature, weight in by_weight:
                print(f"| {feature} | {weight} |", file=writer)

    return True
88+
89+
def build_xgboost_model(descriptor_fname: str,
                        activity_fname: str,
                        options: Options)->bool:
    """Build an xgboost model on the data in `descriptor_fname` and
    `activity_fname`.

    Both files are space separated with a header record and an identifier
    in the first column. The two are joined on that identifier, the model
    is built, and model_metadata.txt is written to options.mdir.

    Args:
      descriptor_fname: training set descriptor file.
      activity_fname: training set activity file; its second column is the
        response.
      options: model directory, model type and hyperparameters.
    Returns:
      True if the model and its metadata were written, False otherwise.
    """
    descriptors = pd.read_csv(descriptor_fname, sep=' ', header=0, low_memory=False)
    logging.info("Read %d rows and %d columns from %s", len(descriptors),
                 descriptors.shape[1], descriptor_fname)
    activity = pd.read_csv(activity_fname, sep=' ', header=0)
    logging.info("Read %d rows from %s", activity.shape[0], activity_fname)

    # Give both identifier columns the same name so they can be joined.
    descriptors.rename(columns={descriptors.columns[0]: "Id"}, inplace=True)
    activity.rename(columns={activity.columns[0]: "Id"}, inplace=True)
    combined = pd.concat([activity.set_index("Id"),
                          descriptors.set_index("Id")], axis=1, join='inner').reset_index()
    if len(combined) != len(descriptors):
        logging.error("Combined set has %d rows, expected %d",
                      len(combined), len(descriptors))
        # Must be False: a truthy value here is read by the caller as success.
        return False

    os.makedirs(options.mdir, exist_ok=True)

    # After reset_index column 0 is the Id, column 1 the response and the
    # remainder are the descriptors.
    y = combined.iloc[:, 1].to_numpy()

    # apply() returns a new frame, so the result must be kept for the
    # numeric conversion to have any effect. The DataFrame is passed on
    # with its column names intact.
    x = combined.iloc[:, 2:].apply(pd.to_numeric)

    if options.classification:
        rc = classification(x, y, options)
    else:
        rc = regression(x, y, options)

    if not rc:
        return False

    response = activity.columns[1]

    proto = xgboost_model_pb2.XGBoostModel()
    proto.model_type = "XGBD"
    # Record the kind of model that was actually built.
    proto.classification = options.classification
    proto.response = response
    proto.parameters.CopyFrom(options.proto)
    with open(os.path.join(options.mdir, "model_metadata.txt"), "w") as f:
        f.write(text_format.MessageToString(proto))

    return True
140+
141+
def main(argv):
    """Build xgboost models from activity file and descriptor file.

    Args:
      argv: command line arguments remaining after flag parsing; unused.
    Returns:
      Process exit status: 0 on success, 1 on failure. The value returned
      here becomes the exit status, so returning True on success would
      exit with status 1 and returning False on error would exit with 0.
    """
    if not FLAGS.activity:
        logging.error("Must specify the name of the activity file with the --activity option")
        return 1
    if not FLAGS.desc:
        logging.error("Must specify the name of the descriptor file with the --desc option")
        return 1
    if not FLAGS.mdir:
        logging.error("Must specify the model directory via the --mdir option")
        return 1

    options = Options()
    options.classification = FLAGS.classification
    options.mdir = FLAGS.mdir
    options.max_num_features = FLAGS.max_num_features
    options.feature_importance = FLAGS.feature_importance
    options.verbosity = FLAGS.xgverbosity

    # Hyperparameters come from the textproto file when one is given,
    # otherwise from the individual command line flags.
    if FLAGS.proto:
        if not options.read_proto(FLAGS.proto):
            logging.error("Cannot read textproto parameters %s", FLAGS.proto)
            return 1
    else:
        options.proto.eta = FLAGS.eta
        options.proto.max_depth = FLAGS.max_depth
        options.proto.n_estimators = FLAGS.n_estimators

    if not build_xgboost_model(FLAGS.desc, FLAGS.activity, options):
        logging.error("Model %s not built", options.mdir)
        return 1

    return 0
176+
177+
if __name__ == '__main__':
178+
app.run(main)

contrib/bin/xgbd/xgboost_model.proto

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
syntax = "proto3";
2+
3+
package xgboost_model;
4+
5+
// Hyperparameters used when training an xgboost model.
message XGBoostParameters {
  // Learning rate.
  optional float eta = 1;

  // Maximum tree depth.
  optional uint32 max_depth = 2;

  // Number of estimators.
  optional uint32 n_estimators = 3;

  // NOTE(review): no reader or writer of this field is visible alongside
  // this schema - confirm how it is meant to be used.
  optional uint32 min_samples_split = 4;
}
14+
15+
// Placeholder: no fields are defined yet.
message LightGbmParameters {
}
17+
18+
// Metadata stored next to a saved model.
message XGBoostModel {
  // Identifies the kind of model.
  optional string model_type = 1;

  // Name of the response the model predicts.
  optional string response = 2;

  // Hyperparameters the model was built with.
  optional XGBoostParameters parameters = 3;

  // True for a classification model.
  optional bool classification = 4;
}

contrib/bin/xgbd/xgboost_model_pb2.py

Lines changed: 27 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/Molecule_Lib/io.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,13 @@ still need `-i allsdfid`, and you may also need `-i SDFNONAME`. Otherwise
349349
the name field will be processed normally, leading to `smiles name { "FOO": "bar" }`
350350
which might be exactly what you want.
351351

352+
### -i NAME2JSON
353+
If needed, the molecule name can be encoded in json form as well as the
354+
specified sdf tags. The resulting form will be
355+
```
356+
"name": "molecule name" "tag1": "something" ...
357+
```
358+
352359
### -i SDFNONAME
353360
Normally the first record in an MDL connection table is the name
354361
of the molecule. If this is specified, discard that information instead.
@@ -519,6 +526,7 @@ yields
519526
-o nochiral exclude chirality info from smiles and mdl outputs
520527
-o nochiralflag don't write the chiral flag info to mdl files
521528
-o NOCT exclude any CIS/TRANS information from smiles output
529+
-o smisep=\<c\> separator between smiles and name: 'smisep=tab' for example
522530
-o \<type\> specify one or more output types (smi,usmi,nausmi,rsmi,sdf,tdt,mol2,marvin)
523531
```
524532

@@ -619,5 +627,13 @@ Do not write the chiral flag info to mdl files.
619627
### -o NOCT
620628
exclude any CIS/TRANS information from smiles output.
621629

630+
### -o smisep
631+
By default, smiles are written as smiles followed by a space and then
632+
the id. The space can be changed to any character with this directive.
633+
```
634+
-o smisep=vbar
635+
```
636+
will result in a vertical bar being the output separator.
637+
622638
### -o \<type\>
623639
specify one or more output types (smi,usmi,nausmi,rsmi,sdf,tdt,mol2,marvin).

0 commit comments

Comments
 (0)