-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.conf
314 lines (231 loc) · 10.7 KB
/
config.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
# #### BioDiscML config file #### #
# See https://github.com/mickaelleclercq/BioDiscML/tree/master/release/Test_datasets
# for examples.
# IMPORTANT: for classification, do not use numeric values as class labels.
# Otherwise, the problem will be interpreted as a regression problem.
#####################
### BASIC OPTIONS ###
#####################
## Working directory. If local execution, don't set it.
# Default: wd=*empty* (local directory)
#wd=working_directory
## Project name, used as prefix for outfiles.
# Default: project=myProject
project=myProject
## Type of classification: Classification
# Set to true if we perform a classification (nominal class).
# Default: doClassification=false
doClassification=false
# If true, set the column class name to classify.
# Default: classificationClassName=class
classificationClassName=class
## Type of classification: Regression
# Set to true if we perform a regression (numeric class).
# Default: doRegression=false
doRegression=false
# If true, set the name of the (numeric) class column to predict.
# Default: regressionClassName=class
regressionClassName=class
## Training input files
# Set infiles here if you have several datasets with a common ID column that will
# be used for merging (see mergingID). Only IDs existing in all files will be kept
# for training; those missing from any of the files will be ignored.
# All commas (,) used as decimal separators will be changed to dots (.).
# You must also remove special symbols from your data (e.g.: %/\*"':éèà).
# Usage: trainFile=filename_in_working_directory,description
# The description will be used as a prefix for features of the file to avoid
# duplicated names. It can be left empty if there is no risk of duplicated names.
# (ex: trainFile=myproteinsfile, protein
# trainFile=mygenesfile, genes
# trainFile=mymetadatafile).
# Default: trainFile=*empty*
#trainFile=trainFile1.csv, description
#trainFile=trainFile2.csv
## Predict new data input files
# If you have your own blind test dataset or new data, you can run biodiscml using
# the -predict option (java -jar biodiscml.jar -config config.conf -predict).
# This function needs two defined input files:
# - A newData file (same format and structure as the training input files.
# This file must contain at least all elements of the retained signature of the
# selected best model features. Features present in the newData file, but absent from
# the signature of the model, will simply be ignored during the prediction)
# - A model file (produced during a previous execution of biodiscml where a best
# model has been identified)
# Usage: newDataFile=filename_in_working_directory,description
# The description will be used as a prefix for features of the file to avoid
# duplicated names. It can be left empty if there is no risk of duplicated names.
# (ex: newDataFile=myproteinsfile, protein
# newDataFile=mygenesfile, genes
# newDataFile=mymetadatafile).
# Default: newDataFile=*empty*
# Default: modelFile=*empty*
#newDataFile=newDataFile1.csv, description
#newDataFile=newDataFile2.csv
#modelFile=model.model
## Merging
# Merging identifier, used if you have many files to merge. It is expected to be
# found in the first column of every file.
# Only rows containing identifiers that exist in all files will be considered in
# the analysis.
# Default: mergingID=*empty*
#mergingID=identifier
## Sampling
# Perform sampling to create a random validation set not used during training and
# used for further evaluation.
# Default: sampling=true
sampling=true
# The samplingFold option separates the set into x parts, keeping 1 for validation
# and the others for training.
# e.g. samplingFold=3 means that the validation set will be composed of 1/3 of the
# input data.
# Ignored if sampling=false
# Default: samplingFold=3
samplingFold=3
# Instead of random sampling, you can provide a validation file on which the models
# will be tested.
# Note that the validation file must contain the same structure and features as the
# train file.
# You can also provide several validation files, they will be merged.
# If set, samplingFold options will be ignored.
# Ignored if sampling=false
# Default: validationFile=*empty*
#validationFile=validationFile1.csv
## Feature exclusion
# Features to exclude from the dataset (separated by commas(,)).
# Do not exclude the identifier (usually the first column).
# Default: excluded=*empty*
#excluded=columnA,columnB
## Best model auto-selection
# A specified number of models will be generated here, along with various performance
# metrics and correlated features for each one. Choose how many best models to
# generate and the metric on which the models will be sorted.
# Instead of a specific number of models, a threshold can also be set.
# Models will be selected based on both numberOfBestModels and
# numberOfBestModelsSortingMetricThreshold conditions
# Metrics can be any column of the results file:
# We prefer those for classification: TEST_MCC, TEST_BER, TRAIN_TEST_BS_MCC,
# TRAIN_TEST_BS_BER, AVG_BER, AVG_MCC
# We prefer those for regression: TEST_CC, TEST_RMSE, TRAIN_TEST_BS_CC,
# TRAIN_TEST_BS_RMSE, AVG_RMSE, AVG_CC
# Examples: AVG_MCC at 0.6, AVG_RMSE at 0.3, AVG_CC at 0.8
# See commands in readme.txt to extract specific models
# Default: computeBestModel=true
# numberOfBestModels=1
# numberOfBestModelsSortingMetric=AVG_MCC
# numberOfBestModelsSortingMetricThreshold=0.1
computeBestModel=true
numberOfBestModels=1
numberOfBestModelsSortingMetric=AVG_MCC
numberOfBestModelsSortingMetricThreshold=0.1
## Combine models
# If true, only one model will be computed using a combination of all models
# selected with best models options.
# Combination rules:
# AVG (Average of probabilities)
# PROD (Product of probabilities)
# MAJ (Majority voting)
# MED (Median)
# Default: combineModels=false
# combinationRule=AVG
combineModels=false
combinationRule=AVG
########################
### ADVANCED OPTIONS ###
########################
## Debug to show more outputs
# Default: debug=false
debug=false
## Maximum number of cpus to use (enter a value or "max").
# BioDiscML runs in low priority by regularly checking available cpus. So
# you can execute other software on your server and it will adapt itself.
# Just be careful about available memory: limit the number of cpus used to avoid
# out-of-memory exceptions.
# Default: cpus=max
cpus=max
## The separator (delimiter) of infiles will be detected automatically.
# It is however possible to set it, but it must exist for all files.
# Default: separator=*empty*
#separator=\t
## Leave-One-Out cross validation
# If you have a very large set of samples (more than 2000), it may be better
# to skip Leave-One-Out cross validation by setting loocv to false
# Default: loocv=true
loocv=true
## Criterion (metrics) optimizers
# To test whether a model generated with one feature subset is better than a model
# generated with another subset, we use various criteria as comparison metrics.
# You can limit the list of criteria if wanted.
# Available criteria for classification: AUC, MCC, FDR, BER, ACC, TPR, TNR,
# kappa, AUPRC, Fscore, Precision, Recall, TP+FN
# Default: coptimizers=AUC, MCC, FDR, BER, ACC
coptimizers=AUC, MCC, FDR, BER, ACC
## Search modes
# Various search modes are implemented, including topX features according to
# information gain ranking, and stepwise search (Forward(F), Forward-Backward(FB),
# Backward(B) and Backward-Forward(BF))
searchmodes=F,FB,B,BF,top1,top5,top10,top15,top20,top30,top40,top50,top75,top100
# Regression criteria
# Available criteria for regression: CC, MAE, RMSE, RAE, RRSE
# Default: roptimizers=CC, RMSE
roptimizers=CC, RMSE
## Maximum number of features kept after feature selection ranking.
# The higher this number, the longer the training will take.
# Default: maxNumberOfSelectedFeatures = 1000
maxNumberOfSelectedFeatures = 1000
## Maximum number of features models can have.
# Default: maxNumberOfFeaturesInModel = 250
maxNumberOfFeaturesInModel = 250
## Bootstrap and repeated holdout folds
# Default: bootstrapFolds=100
bootstrapFolds=100
## Correlated features
# Thresholds for Spearman and Pearson correlations
# Correlated feature search can be disabled by setting retrieveCorrelatedGenes to
# false
# Default: retrieveCorrelatedGenes=true
# Default: spearmanCorrelation_lower = -0.99
# spearmanCorrelation_upper = 0.99
# pearsonCorrelation_lower = -0.99
# pearsonCorrelation_upper = 0.99
retrieveCorrelatedGenes=true
spearmanCorrelation_lower = -0.99
spearmanCorrelation_upper = 0.99
pearsonCorrelation_lower = -0.99
pearsonCorrelation_upper = 0.99
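# Retrieve features based on equivalent infogain or relieff ranking score
# NOTE: the option name below is spelled "retreive..." in this file; keep that
# spelling unless the parser is confirmed to accept another one.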
# Default: maxRankingScoreDifference = 0.005
# retreiveCorrelatedGenesByRankingScore=false
maxRankingScoreDifference = 0.005
retreiveCorrelatedGenesByRankingScore=false
## RUN mode
# BioDiscML will test all available classifier algorithms. If you wish to choose
# specific classifiers, you'll need to use the fast way mode and provide a
# list of classifiers configurations (classifier name and hyperparameters).
# Please use Weka GUI to help you choose the configurations
# For each configuration, you'll need to provide what optimizer to use.
#
# Fast mode classification:
# Usage: ccmd=classifier with options,optimizer,search mode
# (optimizer and search mode are optional, see the examples below).
# Available optimizers: AUC,ACC,SEN,SPE,MCC,TP+FN,kappa, ALLOPT (all optimizers)
# Available search modes: F,FB,B,BF,top1,top5,top10,top15,top20,top30,top40,top50,top75,top100, ALLSEARCH (all search modes)
# If no optimizer or search modes are provided, they will all be tested (equivalent to providing ALLOPT and ALLSEARCH)
# Default: classificationFastWay=false
# ccmd=*empty*
classificationFastWay=false
ccmd=bayes.NaiveBayes -K, SEN
ccmd=bayes.NaiveBayes -K, AUC
ccmd=misc.VFI -B 0.4, SEN
ccmd=misc.VFI -B 0.4, AUC
ccmd=misc.VFI -B 0.4, AUC, F
ccmd=misc.VFI -B 0.6
ccmd=misc.VFI -B 0.6, ALLOPT, FB
ccmd=trees.J48
# Fast mode regression:
# Usage: rcmd=classifier with options,optimizer.
# Available optimizers: CC, MAE, RMSE, RAE, RRSE.
# Default: regressionFastWay=false
# rcmd=*empty*
regressionFastWay=false
rcmd=functions.GaussianProcesses -L 1.0 -N 1 -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0", CC
rcmd=functions.GaussianProcesses -L 1.0 -N 1 -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0", RMSE
# #### End of configuration file #### #