-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathparsing.py
306 lines (249 loc) · 14.6 KB
/
parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
from argparse import ArgumentParser, Namespace
import json
import os
from tempfile import TemporaryDirectory
import torch
from utils import makedirs
from features import get_available_features_generators
def add_predict_args(parser: ArgumentParser):
    """
    Adds predict arguments to an ArgumentParser.

    :param parser: An ArgumentParser.
    """
    # BUG FIX: modify_predict_args reads args.gpuUSE, but this parser never
    # defined it, so prediction always crashed with AttributeError. Register
    # the flag here, mirroring the train parser. Note that a plain `type=bool`
    # would turn any non-empty string (including 'False') into True, so the
    # value is parsed from the text instead.
    parser.add_argument('--gpuUSE', type=lambda s: s.lower() in ('true', '1', 'yes'),
                        default=False, help='gpu use or not')
    parser.add_argument('--gpu', type=int,
                        choices=list(range(torch.cuda.device_count())),
                        help='Which GPU to use')
    parser.add_argument('--test_path', type=str,
                        help='Path to CSV file containing testing data for which predictions will be made')
    parser.add_argument('--use_compound_names', action='store_true', default=False,
                        help='Use when test data file contains compound names in addition to SMILES strings')
    parser.add_argument('--preds_path', type=str,
                        help='Path to CSV file where predictions will be saved')
    parser.add_argument('--checkpoint_dir', type=str,
                        help='Directory from which to load model checkpoints'
                             '(walks directory and ensembles all models that are found)')
    parser.add_argument('--checkpoint_path', type=str, default=None,
                        help='Path to model checkpoint (.pt file)')
    parser.add_argument('--batch_size', type=int, default=50,
                        help='Batch size')
    parser.add_argument('--features_generator', type=str, nargs='*',
                        choices=get_available_features_generators(),
                        help='Method of generating additional features')
    parser.add_argument('--features_path', type=str, nargs='*',
                        help='Path to features to use in FNN (instead of features_generator)')
    parser.add_argument('--no_features_scaling', action='store_true', default=False,
                        help='Turn off scaling of features')
    parser.add_argument('--max_data_size', type=int,
                        help='Maximum number of data points to load')
    parser.add_argument('--diff_depth_weights', action='store_true', default=False,
                        help='Whether to use a different weight matrix at each step of message passing')
def add_train_args(parser: ArgumentParser):
    """
    Adds training arguments to an ArgumentParser.

    :param parser: An ArgumentParser.
    """
    # General arguments
    # BUG FIX: the original used `type=bool`, which converts ANY non-empty
    # string to True — so `--gpuUSE False` silently enabled the GPU. Parse the
    # text instead; the same command lines keep working with correct semantics.
    parser.add_argument('--gpuUSE', type=lambda s: s.lower() in ('true', '1', 'yes'),
                        default=False, help='gpu use or not')
    parser.add_argument('--gpu', type=int,
                        choices=list(range(torch.cuda.device_count())),
                        help='Which GPU to use')
    parser.add_argument('--tmp_data_dir', type=str, default='./data_RE2/tmp/',
                        help='Path to temp data CSV file')
    parser.add_argument('--data_path', type=str, default='data_RE2/logP_labels2.csv',
                        help='Path to data CSV file')
    parser.add_argument('--use_compound_names', action='store_true', default=False,
                        help='Use when test data file contains compound names in addition to SMILES strings')
    parser.add_argument('--max_data_size', type=int,
                        help='Maximum number of data points to load')
    parser.add_argument('--test', action='store_true', default=False,
                        help='Whether to skip training and only test the model')
    parser.add_argument('--features_only', action='store_true', default=False,
                        help='Use only the additional features in an FFN, no graph network')
    parser.add_argument('--features_generator', type=str, nargs='*',
                        choices=get_available_features_generators(),
                        help='Method of generating additional features')
    parser.add_argument('--features_path', type=str, nargs='*',
                        help='Path to features to use in FNN (instead of features_generator)')
    parser.add_argument('--save_dir', type=str, default='save_test',
                        help='Directory where model checkpoints will be saved')
    parser.add_argument('--save_smiles_splits', action='store_true', default=False,
                        help='Save smiles for each train/val/test splits for prediction convenience later')
    parser.add_argument('--checkpoint_dir', type=str, default=None,
                        help='Directory from which to load model checkpoints'
                             '(walks directory and ensembles all models that are found)')
    parser.add_argument('--checkpoint_path', type=str, default=None,
                        help='Path to model checkpoint (.pt file)')
    parser.add_argument('--dataset_type', type=str, default='regression',
                        choices=['classification', 'regression'],
                        help='Type of dataset, e.g. classification or regression.'
                             'This determines the loss function used during training.')
    # Data splitting arguments
    parser.add_argument('--separate_val_path', type=str,
                        help='Path to separate val set, optional')
    parser.add_argument('--separate_val_features_path', type=str, nargs='*',
                        help='Path to file with features for separate val set')
    parser.add_argument('--separate_test_path', type=str,
                        help='Path to separate test set, optional')
    parser.add_argument('--separate_test_features_path', type=str, nargs='*',
                        help='Path to file with features for separate test set')
    parser.add_argument('--split_type', type=str, default='random',
                        choices=['random', 'scaffold_balanced', 'predetermined'],
                        help='Method of splitting the data into train/val/test')
    parser.add_argument('--split_sizes', type=float, nargs=3, default=[0.8, 0.1, 0.1],
                        help='Split proportions for train/validation/test sets')
    parser.add_argument('--num_folds', type=int, default=2,
                        help='Number of folds when performing cross validation')
    parser.add_argument('--folds_file', type=str, default=None,
                        help='Optional file of fold labels')
    parser.add_argument('--val_fold_index', type=int, default=None,
                        help='Which fold to use as val for leave-one-out cross val')
    parser.add_argument('--test_fold_index', type=int, default=None,
                        help='Which fold to use as test for leave-one-out cross val')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed to use when splitting data into train/val/test sets.')
    # Metric / logging arguments
    # CONSISTENCY FIX: modify_train_args accepts 'mse' as a valid regression
    # metric, but the choices list rejected it on the command line; add it.
    parser.add_argument('--metric', type=str, default=None,
                        choices=['auc', 'prc-auc', 'rmse', 'mse', 'mae', 'r2', 'accuracy'],
                        help='Metric to use during evaluation.'
                             'Note: Does NOT affect loss function used during training'
                             '(loss is determined by the `dataset_type` argument).'
                             'Note: Defaults to "auc" for classification and "rmse" for regression.')
    parser.add_argument('--quiet', action='store_true', default=False,
                        help='Skip non-essential print statements')
    parser.add_argument('--log_frequency', type=int, default=10,
                        help='The number of batches between each logging of the training loss')
    parser.add_argument('--show_individual_scores', action='store_true', default=True,
                        help='Show all scores for individual targets, not just average, at the end')
    parser.add_argument('--no_cache', action='store_true', default=False,
                        help='Turn off caching mol2graph computation')
    parser.add_argument('--config_path', type=str,
                        help='Path to a .json file containing arguments. Any arguments present in the config'
                             'file will override arguments specified via the command line or by the defaults.')
    # Training hyperparameters
    parser.add_argument('--epochs', type=int, default=1,
                        help='Number of epochs to run')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='Batch size')
    parser.add_argument('--warmup_epochs', type=float, default=2.0,
                        help='Number of epochs during which learning rate increases linearly from'
                             'init_lr to max_lr. Afterwards, learning rate decreases exponentially'
                             'from max_lr to final_lr.')
    parser.add_argument('--init_lr', type=float, default=1e-4,
                        help='Initial learning rate')
    parser.add_argument('--max_lr', type=float, default=1e-3,
                        help='Maximum learning rate')
    parser.add_argument('--final_lr', type=float, default=1e-4,
                        help='Final learning rate')
    parser.add_argument('--no_features_scaling', action='store_true', default=False,
                        help='Turn off scaling of features')
    parser.add_argument('--layer_norm', action='store_true', default=False,
                        help='Add layer norm after each message passing step')
    parser.add_argument('--normalize_messages', action='store_true', default=False,
                        help='Normalize bond messages at each message passing step')
    parser.add_argument('--ensemble_size', type=int, default=1,
                        help='Number of models in ensemble')
    # Model architecture arguments
    parser.add_argument('--hidden_size', type=int, default=300,
                        help='Dimensionality of hidden layers in MPN')
    parser.add_argument('--bias', action='store_true', default=False,
                        help='Whether to add bias to linear layers')
    parser.add_argument('--depth', type=int, default=3,
                        help='Number of message passing steps')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='Dropout probability')
    parser.add_argument('--activation', type=str, default='ReLU',
                        choices=['ReLU', 'LeakyReLU', 'PReLU', 'tanh', 'SELU', 'ELU'],
                        help='Activation function')
    parser.add_argument('--undirected', action='store_true', default=False,
                        help='Undirected edges (always sum the two relevant bond vectors)')
    parser.add_argument('--ffn_hidden_size', type=int, default=None,
                        help='Hidden dim for higher-capacity FFN (defaults to hidden_size)')
    parser.add_argument('--ffn_num_layers', type=int, default=2,
                        help='Number of layers in FFN after MPN encoding')
    parser.add_argument('--atom_messages', action='store_true', default=False,
                        help='Use messages on atoms instead of messages on bonds')
    parser.add_argument('--diff_depth_weights', action='store_true', default=False,
                        help='Whether to use a different weight matrix at each step of message passing')
def update_checkpoint_args(args: Namespace):
    """
    Walks the checkpoint directory to find all checkpoints, updating args.checkpoint_paths and args.ensemble_size.

    :param args: Arguments.
    """
    # Already resolved (e.g. by a previous call) — nothing to do.
    if getattr(args, 'checkpoint_paths', None) is not None:
        return

    # The two sources of checkpoints are mutually exclusive.
    if args.checkpoint_dir is not None and args.checkpoint_path is not None:
        raise ValueError('Only one of checkpoint_dir and checkpoint_path can be specified.')

    # No directory given: use the single explicit path (or nothing at all).
    if args.checkpoint_dir is None:
        args.checkpoint_paths = None if args.checkpoint_path is None else [args.checkpoint_path]
        return

    # Recursively collect every .pt file under the checkpoint directory;
    # each one becomes a member of the ensemble.
    found = [
        os.path.join(root, fname)
        for root, _, files in os.walk(args.checkpoint_dir)
        for fname in files
        if fname.endswith('.pt')
    ]
    args.checkpoint_paths = found
    args.ensemble_size = len(found)

    if not found:
        raise ValueError(f'Failed to find any model checkpoints in directory "{args.checkpoint_dir}"')
def modify_predict_args(args: Namespace):
    """
    Modifies and validates predicting args in place.

    :param args: Arguments.
    """
    # Fail fast on the arguments prediction cannot run without.
    assert args.test_path
    assert args.preds_path
    assert args.checkpoint_dir is not None or args.checkpoint_path is not None or args.checkpoint_paths is not None

    # Resolve checkpoint_dir / checkpoint_path into args.checkpoint_paths.
    update_checkpoint_args(args)

    # BUG FIX: the predict parser historically did not define --gpuUSE, so
    # reading args.gpuUSE directly raised AttributeError on every predict run.
    # Default to False (CPU) when the attribute is absent.
    args.cuda = getattr(args, 'gpuUSE', False) and torch.cuda.is_available()

    # Create the directory that will contain the predictions file.
    makedirs(args.preds_path, isfile=True)
def parse_predict_args() -> Namespace:
    """
    Parses arguments for prediction (includes modifying/validating arguments).

    :return: A Namespace containing the parsed, modified, and validated args.
    """
    predict_parser = ArgumentParser()
    add_predict_args(predict_parser)
    parsed = predict_parser.parse_args()
    modify_predict_args(parsed)
    return parsed
def modify_train_args(args: Namespace):
    """
    Modifies and validates training arguments in place.

    :param args: Arguments.
    """
    # Held at module scope so the TemporaryDirectory (and args.save_dir with
    # it) is not cleaned up when this function returns.
    global temp_dir

    # Values from the JSON config file override CLI arguments and defaults.
    if args.config_path is not None:
        with open(args.config_path) as f:
            config = json.load(f)
            for key, value in config.items():
                setattr(args, key, value)

    assert args.data_path is not None
    assert args.dataset_type is not None

    # Fall back to a temporary directory when no save_dir was provided.
    if args.save_dir is not None:
        makedirs(args.save_dir)
    else:
        temp_dir = TemporaryDirectory()
        args.save_dir = temp_dir.name

    args.cuda = args.gpuUSE and torch.cuda.is_available()

    # Replace the negated CLI flag with the positive attribute used downstream.
    args.features_scaling = not args.no_features_scaling
    del args.no_features_scaling

    # Default metric depends on the dataset type.
    if args.metric is None:
        if args.dataset_type == 'classification':
            args.metric = 'auc'
        else:
            args.metric = 'rmse'

    # Reject metric/dataset-type combinations that make no sense (note: 'mse'
    # is accepted here even though the CLI choices for --metric may not list
    # it — a config file can still set it).
    if not ((args.dataset_type == 'classification' and args.metric in ['auc', 'prc-auc', 'accuracy']) or
            (args.dataset_type == 'regression' and args.metric in ['rmse', 'mse','mae', 'r2'])):
        raise ValueError(f'Metric "{args.metric}" invalid for dataset type "{args.dataset_type}".')

    # Resolve checkpoint_dir / checkpoint_path into args.checkpoint_paths.
    update_checkpoint_args(args)

    # features_only requires some feature source to exist.
    if args.features_only:
        assert args.features_generator or args.features_path

    args.use_input_features = args.features_generator or args.features_path

    # rdkit_2d_normalized features are already normalized; scaling them again
    # would be wrong.
    if args.features_generator is not None and 'rdkit_2d_normalized' in args.features_generator:
        assert not args.features_scaling

    args.num_lrs = 1

    # FFN hidden size defaults to the MPN hidden size.
    if args.ffn_hidden_size is None:
        args.ffn_hidden_size = args.hidden_size

    # Predetermined splits require both a folds file and a test fold index —
    # all three must be set together or not at all.
    assert (args.split_type == 'predetermined') == (args.folds_file is not None) == (args.test_fold_index is not None)

    # Test-only mode: skip training entirely.
    if args.test:
        args.epochs = 0
def parse_train_args() -> Namespace:
    """
    Parses arguments for training (includes modifying/validating arguments).

    :return: A Namespace containing the parsed, modified, and validated args.
    """
    train_parser = ArgumentParser()
    add_train_args(train_parser)
    parsed = train_parser.parse_args()
    modify_train_args(parsed)
    return parsed