function [fk,fsc,fscmx,maxVars,alpha,lam,scores,oid,ctr,mdl,ep,empMaxVars,thresh,numFalsePos,allScores] = stabSel(X,y,varargin)
%
% Identify a stable set of features in your data using the framework of
% stability selection.
%
% Call: [fk,fsc,fscmx,maxVars,alpha,lam,scores,oid,ctr,mdl,ep,empMaxVars,...
%        thresh,numFalsePos,allScores] = stabSel(X, y);
%
%% ------------------------------------------------------------------------
% What do I need to provide stabSel: -------------------------------------
% -------------------------------------------------------------------------
% X: an n x p matrix of p predictors
% y: an n x m matrix of m response variables (can be continuous for
% regression or categorical for classification)
%
% Optional arguments can be supplied as well, like so:
% stabSel(X,y,'maxVars',25,'stnd',false)
%
%% ------------------------------------------------------------------------
% What does stabSel do? ---------------------------------------------------
% -------------------------------------------------------------------------
% Use one of 20+ feature selection methods (see 'selAlgo'; default is an
% elastic net) within the framework of stability selection to identify a
% "stable" set of features (see output 'fk') in input matrix X. The stable
% set of features contains those features that have a higher probability of
% being selected (see fscmx for empirical probabilities for each feature)
% by your feature selection method across many perturbed versions of your
% dataset (see 'samType' for resampling options). The feature selection
% method selects the top N features in the dataset that are predictive of
% your response variable, so the stable set of features can be thought of
% as the columns of X that more consistently predict y. The variable y can
% contain multiple tasks (i.e., columns). By default, stabSel will average
% stability values computed independently for each task. But you can also
% stack your tasks. Or you can use some algorithms made specifically for
% multi-task learning (e.g., PLS).
%
% The two most critical parameters in stability selection are the q
% variables that a feature selection method selects on average (see
% 'maxVars'), and the probability threshold pi that determines whether a
% feature enters the stable set (see 'thresh'). Given these two parameters
% (and assuming a default-set proportion of the data for resampling), we
% can compute the upper bound on the number of false positives
% ('numFalsePos') in the stable set. This is
% effectively the per-family error rate, which is more stringent than FWER.
% We can also get an FDR-like p-value by dividing the upper bound on the
% number of false positives by the size of the stable set. If we know two
% of these three main variables (the number of false positives or FDR,
% maxVars, or thresh), we can solve for the one that remains. Typically,
% we preset the number of false positives and either the number of
% variables to select in each perturbed dataset or the threshold for what
% counts as a stable
% feature. It is easy to set 'thresh' prior to running stabSel, but it can
% be tricky to set 'maxVars'. That's because the number of features
% selected by some algorithms/learners is determined in an opaque way by a
% parameter that we have little insight into. The range we use for this
% parameter will determine maxVars. However, stabSel still allows you to
% set a specific 'maxVars' ahead of time in such cases. That's because for
% some methods where this occurs, we can simply look at a model's weights
% and use those as a filter, while in other cases, we can force the
% selection method to select maxVars *or fewer* features. This means that
% although we can control an error rate, in some cases, when we specify
% maxVars rather than thresh, we can end up with a lower error rate
% (i.e., because the actual average number of features selected by the
% selection method will be less than maxVars).
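%
% As a worked sketch of the error bound (assuming the default resampling
% proportion of 0.5; see Meinshausen & Buhlmann, 2010, cited below): with
% p features, an average of q features selected per resample, and
% threshold pi, the expected number of false positives is bounded by
% q^2 / ((2*pi - 1) * p). For example, with p = 1000, q = 28, and
% pi = 0.9, the bound is 28^2 / (0.8 * 1000) ~= 0.98, i.e., fewer than 1
% expected false positive in the stable set.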
%
% If you do not set 'maxVars' or 'thresh', stabSel will either use a
% heuristic to select the number of variables the selection method should
% be choosing on average (e.g., when using a filter where we *have* to know
% how many features to select), or will use a series of regularization
% parameters indiscriminately (i.e., without forcing selection of maxVars
% or fewer features) and then choose an appropriate 'thresh'. In both
% cases, by default, stabSel will try to ensure < 1 false positive in the
% stable set.
%
% StabSel also supports outlier detection and removal (one of 4 methods)
% either prior to, or during, the resampling procedure. By removing
% outliers within the resampling procedure, a more generalizable method of
% outlier removal is implemented. Note, this option has not been as
% thoroughly debugged so you may encounter errors.
%
% There are many options for you to tinker with in stabSel, but you can
% leave these out of your call and stabSel will use very reasonable
% defaults. The exhaustive list of options is detailed in the following
% section, which also tells you what stabSel internally sets each option to
% if you don't refer to it/change it in your call.
%
% A synthesized summary of main default options if you don't want to read
% the documentation: by default, stabSel will use an elastic net to select
% your features across 300 random subsamples of your data. A series of
% regularization parameters will be used in lieu of a fixed number of
% selected variables. The series will be defined by the highest parameter
% value that is estimated to select a single feature across all resampled
% datasets.
%
% The average number of variables selected across parameters will then be
% used to determine a probability threshold for the stable set of features
% that ensures fewer than 1 false positive. Outlier removal will not be
% performed.
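%
% For instance, a minimal call that accepts all of these defaults (names
% are illustrative):
%
%   [fk,~,fscmx] = stabSel(X, y); % stable set + selection probabilities
%   Xstable = X(:,fk);            % keep only the stable features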
%
%% ------------------------------------------------------------------------
% ----------------------------- Outputs -----------------------------------
% -------------------------------------------------------------------------
% fk : indices of features that form the stable set (i.e., features kept)
%
% fsc : empirical probabilities for regularization parameters that were
% used (you probably don't care about this)
%
% fscmx : max empirical probability across regularization parameters for
% each feature. Note, in stability selection we take the max proportion of
% times that a feature was selected ACROSS all regularization parameters.
%
% maxVars : # of variables/features specified to be selected in each
% resampled dataset.
%
% alpha : alpha values used. Only applies to elastic net, lasso, ridge.
% These will map onto a dimension of fsc (different for different selection
% methods)
%
% lam : lambda values used. Only applies to elastic net, lasso, ridge, nca,
% GPR, RF. These will map onto a dimension of fsc (different for different
% selection methods)
%
% 'scores' : returns the weighting of features by the selected algorithm that is
% then passed on to stability selection. If you have multiple tasks
% (columns in y) and you do not stack your tasks, this will contain the
% average feature weightings across all tasks.
%
% 'oid' : rows of X that are determined to be outliers.
%
% 'ctr' : indices of samples that were used to make each resampled dataset.
%
% 'mdl' : this is a model that was trained to identify outliers (can be
% used to predict outliers in new data). If outliers were detected inside
% the subsampling scheme, there is a model for each subsample. You can get
% a consensus of the predictions of these models on new data.
%
% 'ep' : effective FDR-like p-value based on the average # of features
% selected across resampled datasets and the threshold you selected.
%
% 'empMaxVars' : empirical maxVars. If your selection algorithm is a
% filter, then this will equal maxVars. However, algorithms that have
% regularization parameters that shrink features to zero will select a
% number of features that is not exactly equal to maxVars, because it is
% impossible to tell which parameter values will yield the exact number of features
% specified. In some cases the number of features taken from each resampled
% dataset will be fixed to *not more than* maxVars. Or, maxVars can be
% ignored and a threshold selected based on empirical maxVars to maintain a
% predetermined FDR-like p-threshold.
%
% 'thresh' : returns the threshold used to form the stable set.
%
% 'numFalsePos' : returns the upper bound on the number of false positives
% in the stable set. Corresponds to the per-family error rate, which is
% more stringent than FWER.
%
% 'allScores' : if keepScores is set to true, allScores will contain scores
% (feature rankings submitted to stability selection) for every task
% (i.e., column in y).
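%
% For example, to capture the error-control outputs alongside the stable
% set (illustrative):
%
%   [fk,~,fscmx,maxVars,~,~,~,~,~,~,ep,~,thresh,numFalsePos] = ...
%       stabSel(X, y);
%   stableProbs = fscmx(fk); % selection probabilities of the stable set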
%
%% ------------------------------------------------------------------------
% General optional arguments (apply to all selection methods) -------------
% -------------------------------------------------------------------------
% 'stack' : if set to true and you have multiple columns/tasks in y, then
% we will stack tasks together and your chosen feature ranking algorithm
% will consider all tasks together.
%
% 'samType' : determines the resampling scheme. Set to 'subsample' to take
% subsamples of your data as per the original stability selection paper
% (see section above). Set to 'bootstrap' to take bootstraps of your data.
% It's possible bootstrapping injects more noise into the data, which is
% beneficial to stability selection. But note, subsampling w/50% of your
% data is close to the equivalent of bootstrapping. Default: 'subsample'.
%
% 'compPars' : determines if subsampling or bootstrapping follows the
% complementary pairs principle where each subsequent resample has no
% overlap with the previous one. Note, this works best when 'prop' is ~0.5.
% Can be true or false. Default: false.
%
% 'rep' : number of bootstraps/subsamples to draw. Default: 300.
%
% 'prop' : proportion of data to resample (i.e., 0.7 means 70% of the data
% will be used in each subsample or that a bootstrap will be of a size that
% is 70% of the data; it makes sense to increase this number above default
% if the dataset is very small but this can break PFER assumptions. If
% 'prop' is 1 we use all of the data. Default: 0.5.
%
% 'propN' : if you would like to specify the number of samples to use in
% each resample (rather than the proportion of the data), set 'prop' to be
% greater than or equal to 1 (i.e., the number of samples you would like
% each subsample to contain) and propN to true. Default: false.
%
% 'adjProp' : if you are doing outlier detection, set adjProp to true to
% compute the resampling proportion against the sample size *before*
% outlier removal, ensuring each resample keeps the number of samples it
% would have had if no outliers were removed. Set to false to calculate
% the proportion *after* outlier removal. Default: true.
%
% 'numFalsePos' : threshold for the upper bound on the number of false
% positives you would like to ensure in the stable set. Corresponds to the
% per family error rate. This will have no effect if both 'maxVars' and
% 'thresh' are passed in, otherwise it will be used to find the appropriate
% 'maxVars' or 'thresh'. It can be a whole value, or a decimal. Default: 1.
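%
% For example, to tolerate up to 2 expected false positives in the stable
% set (illustrative):
%
%   fk = stabSel(X, y, 'numFalsePos', 2);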
%
% 'fdr' : FDR-like p-value threshold for stable set. You should provide
% either this OR 'numFalsePos'. This will have no effect if both 'maxVars'
% and 'thresh' are passed in. Default: []. This option is currently
% unavailable.
%
% 'maxVars' : average # of variables the selection algorithm should choose
% on each subsample. Consider passing this into stabSel as blank if your
% selection algorithm has a regularization parameter that controls the
% number of features selected (lasso, ridge, elastic net, nca). If
% 'maxVars' is not empty, but 'thresh' is, then stabSel will determine a
% 'thresh' that ensures FWER or number of false positives provided.
% Default: [].
%
% 'thresh' : threshold for proportion of subsamples a feature must appear
% in to enter the stable set. Consider passing this into stabSel as blank
% if your selection algorithm has a regularization parameter that controls
% the number of features selected. This way, a threshold will be computed
% based on the PFER you have specified. If 'thresh' is not empty, but
% 'maxVars' is, then stabSel will determine a 'maxVars' that ensures FWER
% or number of false positives provided. If threshold is > 1 we assume you
% are telling stabSel how many features to select and fwer/false positives
% will no longer apply. Default: 0.9.
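%
% For example, fixing the threshold and letting stabSel solve for a
% compatible maxVars (illustrative):
%
%   fk = stabSel(X, y, 'thresh', 0.75);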
%
% 'keepScores' : if set to true, stabSel will retain feature rankings
% submitted to stability selection for all tasks in y (i.e., columns).
% Default: true.
%
% 'problem' : specify whether your problem is a regression problem
% ('regression') or classification ('classification'). Some algorithms can
% be used for both. Default: 'regression'.
%
% 'selAlgo' : sets the algorithm for selecting features. This may be 'EN'
% for an elastic net (see lasso.m for more detail), 'lasso' for lasso
% regression (see lasso.m for more detail), 'ridge' for ridge regression
% (see lasso.m for more detail), 'corr' for correlation (see corr.m for
% more detail), 'robustLR' for robust linear regression (not as sensitive
% to outliers but takes MUCH longer to run; see fitlm.m for more detail),
% 'NCA' for neighborhood components analysis (see fsrnca.m for more
% detail), 'ftest' for an ftest (see fsrftest.m for more detail), 'mrmr'
% for minimum redundancy maximum relevance algorithm that relies on mutual
% information (see fsmrmr.m for more detail), 'relieff' for the
% ReliefF/RReliefF algorithm that relies on nearest neighbor search (see
% relieff.m for more detail), 'GPR' for a Gaussian process model (see
% fitrgp.m for more detail), and 'ens' for a random forest (see
% fitrensemble.m). Note that an additional argument can be used to adjust
% 'EN' or 'lasso' to be adaptive (in both cases, coefficients from ridge
% regression are used to weight your data before applying lasso or an
% elastic net. For more information, see: Zou, H., & Zhang, H. H. (2009).
% On the adaptive elastic-net with a diverging number of parameters. Annals
% of statistics, 37(4), 1733.). Note, some of these options require more
% recent versions of matlab, but stabSel will check this for you. Default:
% 'EN'. For multi-task regression, 'pls' can be used, as well as
% 'groupLasso'. Group lasso is a very basic implementation and may not
% work well for all data. It cannot be used with the 'adaptive' option.
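%
% For example, swapping the default elastic net for mRMR (illustrative;
% mrmr runs as a filter, so a maxVars is needed, and it requires
% MATLAB 2022+):
%
%   fk = stabSel(X, y, 'selAlgo', 'mrmr', 'maxVars', 20);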
%
% The following selAlgo options can only be used for regression:
% 'lasso','en','corr','robustLR','ftest','relieff','gpr','pls'.
%
% The following selAlgo options can only be used for classification:
% 'ttest','wilcoxon','entropy','roc' or 'bhattacharyya'. All of these
% methods are used as filters.
%
% 'stnd' : standardize X (z-score cols) if set to true, otherwise set to
% false. When selAlgo is 'EN', 'lasso', 'ridge', or 'GPR' stnd is passed
% into the respective matlab commands. RF does not require standardization,
% and standardizing can hurt its performance. For all other options, standardization will be
% performed inside the subsampling scheme and *after* outlier detection.
% Stnd also applies to some outlier detection methods (ocsvm).
%
% 'parallel' : set to true to attempt parallel computing where possible.
% Applies to scripts stabSel draws on as well. Default: false.
%
% 'verbose' : set to true to get feedback in the command window about what
% stabSel is doing. Default: false.
%
%% ------------------------------------------------------------------------
% ------- Optional arguments that apply to > 1 selection algorithm --------
% -------------- (elastic net, lasso, ridge, nca, gpr, ens) ----------------
% -------------------------------------------------------------------------
% 'lam' : user-specified lambda (regularization parameter) sequence to be
% used for elastic net, lasso, ridge, GPR, or NCA. When this is empty
% (i.e., by default), lambdas are computed automatically between lmx and
% lmn (see options lmn, lmx, ln below for details). In the case of EN,
% lasso, and ridge, this is done separately for each alpha value, so each
% alpha value will have its own unique lambda sequence. This is necessary
% because lambda values for ridge and lasso will typically have very
% different scales based on whatever lambda is necessary to create the
% sparsest model possible. If adaptive is set to true, the values you give
% 'lam' will apply to the elastic net or lasso being used to select
% features and NOT to the ridge regression used to initially weight the
% data (see lamAEN below for where to supply those if you would like).
% Note, default lambda values for NCA will be set between 0.0001 and 100
% before weighting by standard deviation of the response variable y. For
% GPR, lambda values refer to the regularization standard deviation values.
% By default, these will be set between the standard deviation of your
% response variable y multiplied by 0.01 and then divided/multiplied by 3.
% Weighting will be omitted if you supply your own values for NCA or GPR.
% Default: [].
%
% 'lamInside' : determines if automatically defined lambda values will be
% computed BEFORE subsampling, ACROSS subsamples or DURING subsampling
% (applies to EN/lasso/ridge and GPR). Set to 'during' to compute lambda
% values unique to each subsample. This only works for lasso, en, and ridge
% regression. It is especially recommended if stabSel returns warnings
% about being unable to select good lambda values to fit your subsamples
% (i.e., when lamInside is set to either of the other two options). Having
% different lambda values for each subsample introduces some variance but
% ensures random partitioning noise does not impact stability (i.e., when
% defining lambda values using all the data prior to subsampling).
% Tailoring lambda to subsamples is also more efficient and avoids
% excessively large or small lambdas that always result in full or sparse
% models. If this approach does not agree with you and you are getting many
% warnings about bad fitting lambda values, a happy middle ground is to
% define lambdas across subsamples. This is where we preallocate the
% subsamples, define lambdas tailored to each and use the min and max
% across those lambdas to define a consistent lambda range that will be fit
% to all subsamples. If using EN, all of these options will be executed
% independently for each alpha value. Default: during.
%
% 'lmx' : max lambda value to use in lambda sequence for elastic net,
% lasso, ridge, NCA, and GPR. For elastic net, lasso and ridge, leaving this
% blank will use the 'max l trick' to compute the max lambda value with
% which the model will select at least a single feature (by taking the max
% value of the dot product of your X and y variables, then dividing it by
% the number of samples in your data multiplied by the weight of the L1 to
% L2 norm being used). For NCA the default is 100 adjusted by the std(y)
% and for GPR the default is 1e-2*std(y)*3. For NCA only, if you pass in a
% value, it will still be weighted by std(y). If you would like to specify
% exact min/max lambdas for NCA, use 'lam'. Default: [].
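%
% As a sketch of the 'max l trick' described above (for elastic net,
% lasso, and ridge; alpha is the L1-to-L2 weight):
%
%   lmx = max(abs(X' * y)) / (size(X, 1) * alpha);
%
% i.e., roughly the smallest penalty at which every coefficient shrinks
% to zero, so the lambda sequence starts where at least one feature can
% be selected.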
%
% 'lmn' : min lambda value to use in lambda sequence for elastic net,
% lasso, ridge, NCA, and GPR. For elastic net, lasso and ridge, leaving this
% blank will multiply lmx by 'lamRatio' to get lmn. For NCA, the default is
% .001 adjusted by the std(y) and for GPR the default is 1e-2*std(y)/3. For
% NCA only, if you pass in a value, it will still be weighted by std(y). If
% you would like to specify exact min/max lambdas for NCA, use 'lam'.
% Default: [].
%
% 'ln' : number of lambda values to include in an automatically generated
% lambda sequence. This applies to elastic net, lasso, ridge, NCA, GPR, and
% ens. Default: 1000.
%
% 'lst' : determines how values will be spaced between lmn and lmx to
% automatically produce a lambda sequence. Set to 'linear' or 'log'.
% Consider using log for en, lasso, ridge (but argument also applies to GPR
% and NCA). This recommendation is based on the fact that in most cases,
% lmx will be quite high, causing most models to be sparse. Alternatively,
% increase 'ln' dramatically (to maybe 10,000; also increase 'lamRatio')
% but this will not be very efficient. Default: 'linear'.
%
% 'logDirPref' : if 'lst' is set to 'log', this will determine which end of
% the lambda sequence you would like to sample more. Set this to 'smaller'
% to sample more lambda values closer to lmn. Set this to 'larger' to
% sample more lambda values closer to lmx (i.e., sparser models). Default:
% 'smaller'.
%
% 'lamOutlier' : only applies when lamInside is set to 'across'. Sometimes
% a lambda sequence for one subsample will be drastically different than
% the lambda sequence for another subsample. If you want to define a single
% lambda sequence across all subsamples, it is helpful to remove outlier
% lmn and lmx values as we take the lowest lmn and highest lmx to define
% this single sequence. If set to true, we will remove lmx values above the
% 95th percentile (as defined across subsamples). Default: true.
%
% 'filter' : It is possible to use some selection algorithms as a filter
% (robust linear regression, correlation, NCA). Set to true to use as
% filter. If set to true, maxVars number of variables will be selected by
% sorting the scores/weights of the selection algorithm. If false, see
% option below. Default: false.
%
% 'filterThresh' : threshold used to retain features if filter is set to
% false. Corresponds to a p-value to use for selecting features for
% correlation and robust linear regression. Corresponds to weight threshold
% for NCA (but should be 0 in most cases). Default: [].
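%
% For example, using correlation as a filter that keeps the 50
% best-correlated features in each resample (illustrative):
%
%   fk = stabSel(X, y, 'selAlgo', 'corr', 'filter', true, 'maxVars', 50);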
%
%% ------------------------------------------------------------------------
% ------------ Optional arguments for elastic net, lasso, ridge -----------
% -------------------------------------------------------------------------
% stabSel was developed with lasso regression in mind so you may notice
% more options for it at the moment (see original stability selection paper
% referenced above).
%
% 'adaptive' : sets EN or lasso to be adaptive. Lasso lacks oracle
% properties, may have 'inaccurate' weights, or inconsistently select
% variables (e.g., noise variables, which is especially true when p >> n;
% see Zou & Zhang, 2009). Adaptive lasso has oracle properties but lacks
% the ability to select *all* of the correlated variables that are
% predictive. The L2 regularization in EN fixes this, but EN lacks oracle
% properties as well so ideally we would use adaptive EN. In practice,
% adaptive EN does not always work flawlessly. This is in part because
% adaptive EN introduces a new parameter to tune: gamma, which controls the
% degree to which the ridge regression is used to weight the data. It is
% unclear how to best tune this parameter within stabSel so this is
% currently being worked out (best approach will probably be nesting so
% that we subsample the subsamples and get out-of-bag error). However, an
% a priori set gamma value may work quite well for your data. Note, adaptive
% EN can significantly increase computational time depending on your data.
% Finally, note that a single regularization parameter must be selected for the
% ridge regression used to weight data. If adaptive EN is performed inside
% the resampling scheme (see next option), we choose the parameter that
% minimizes out of sample error. Default: false.
%
% 'adaptOutside' : only applies if adaptive is true. Determines if you are
% weighting data (as in adaptive EN, lasso, ridge) BEFORE (set to true) or
% DURING (set to false) subsampling. Weighting data DURING subsampling
% probably makes the most sense, otherwise there is some data leakage.
% Default: false.
%
% 'ridgeRegSelect' : only applies if 'adaptive' is true and 'adaptOutside'
% is true. Ridge regression used for weighting data is run on a sequence of
% automatically generated lambda values and we take the result that shrinks
% the least number of features to zero (usually the model that contains all
% features). In most cases multiple lambda values meet this criterion. In
% such cases, you can decide to take the largest lambda (set this to
% 'largest'), the smallest lambda (set this to 'smallest') or the average
% lambda value (set this to 'middle'). Default: 'largest' (i.e., greatest
% penalty that shrinks the least features to zero).
%
% 'gam' : only applies if adaptive is true. Gamma value for controlling
% extent to which data is weighted by ridge regression. Default: 1.
%
% 'alpha' : alpha values for determining weight of L1 to L2 optimization in
% elastic net. This can be an n x 1 vector. 1 is equal to lasso; values
% close to (but not exactly) zero approach ridge. Default: [0.1:0.1:1].
%
% 'lamAEN' : Lambda values to be used in ridge regression if adaptive is
% true. If false, 'max l trick' will be used. Default: [].
%
% 'lamRatio' : to get a sequence of lambda values, we use the 'max l trick'
% to get the max lambda, then use this ratio to get the smallest lambda.
% Default: 1e-4.
%
% 'fixMax' : can be set to true to force elastic net, lasso, ridge, to
% return a maximum of 'maxVars' features irrespective of regularization
% parameter lambda. This is set to true if using any of these selection
% algorithms and passing in maxVars to ensure we do not select more than
% the number of variables you want. Note also that if fixMax is set to true
% and maxVars is empty, then maxVars will be automatically computed based
% on FWER/false positives (if you use any of the above selection
% algorithms). Default: false.
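%
% For example, an adaptive elastic net with a preset gamma (illustrative):
%
%   fk = stabSel(X, y, 'adaptive', true, 'gam', 1, 'alpha', 0.5);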
%
%% ------------------------------------------------------------------------
% ------------------ Optional arguments for relieff -----------------------
% -------------------------------------------------------------------------
% 'rK' : determines k in knn (# of neighbors). Default: [1:14
% 15:10:size(X,2)/2].
%
% 'rE' : sigma, or distance scaling factor. Consider changing this to just
% 50 if stabSel is taking too long on your data. Default: [10:15:200].
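%
% For example, restricting relieff to a single k and sigma to cut run
% time (illustrative):
%
%   fk = stabSel(X, y, 'selAlgo', 'relieff', 'rK', 10, 'rE', 50, ...
%       'maxVars', 30);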
%
%% ------------------------------------------------------------------------
% ---------------------- Optional arguments for GPR -----------------------
% -------------------------------------------------------------------------
% None but as a note, GPR will use: ardsquaredexponential kernel function,
% lbfgs optimizer, fully independent conditional approximation for
% prediction, and subset of regressors approximation for the fit method.
%
%% ------------------------------------------------------------------------
% ---------------------- Optional arguments for Ens -----------------------
% -------------------------------------------------------------------------
% Range of max # splits to be tested can be input using optional argument
% 'mns' and min leaf size using optional argument 'mls'. You should pass in
% a sequence of values that is n x 1 in size. 'mlsmn' can be used to set
% the min mls and 'mlsmx' the max mls (i.e., to have stabSel generate a
% series between these two values; the exact number of values can be
% controlled by setting 'mln', the default for which is 3). 'mnsmn' and
% 'mnsmx' are the equivalent min and max values for the max # of splits
% (the default 'mnsn' is set to 3 to generate 3 values between 'mnsmn' and
% 'mnsmx'). Default values for lam (the number of learning cycles) are
% linearly spaced values between 1 and 400. Default values for min leaf
% size are round(linspace(1,max(2,floor(size(X,2)/2)),3)). Default values
% for max # splits are round(linspace(1,size(X,2)-1,3)). Default: [].
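%
% For example, a small grid over min leaf size (illustrative):
%
%   fk = stabSel(X, y, 'selAlgo', 'ens', 'mlsmn', 1, 'mlsmx', 20, 'mln', 3);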
%
%% ------------------------------------------------------------------------
% ---------- Optional arguments for outlier detection/removal -------------
% -------------------------------------------------------------------------
% 'outlier' : determines where outlier detection/removal will be performed
% on matrix X. Set to 'outside' to perform outlier detection before
% subsampling and 'inside' to perform outlier detection inside the
% subsampling loop. Set to 'none' to avoid detecting/removing outliers.
% Default: 'none'.
%
% 'outlierPrepro' : currently can only be set to 'none' or 'pca'. In the
% latter case, PCA is performed first, before outlier detection. This is
% useful because some methods of outlier detection do not work with lots of
% features (e.g., robustcov). Default: 'none'.
%
% 'outlierPreproP' : if doing PCA, a permutation analysis will be performed
% to determine the number of components to keep. This sets the p-value for
% the components that will be retained. Default: 0.05.
%
% 'outlierPreproN' : # of permutations over which a p-value will be
% computed if doing PCA. Default: 5000.
%
% 'outlierPreproNonconsec' : if doing PCA, in some cases, significant
% components may not be consecutive (i.e., a component that explains very
% little variance may be significant). To keep only the first consecutive
% set of components that explain significant variance (organized by
% descending % of variance explained) set this to true. Default: true.
%
% 'outlierDir' : specify whether you want to find outliers in the rows or
% columns using 'rows' or 'columns'. If you set this to columns we assume
% you want to find outlier features. Default: 'rows'.
%
% 'outlierMethod' : specify method for outlier detection. Set to 'median',
% 'mean', 'quartiles', 'grubbs', or 'gesd' to use isoutlier.m, which will
% perform outlier detection using some scaled MAD from the median, some std
% devs from the mean, some IQ ranges above/below the upper/lower quartiles,
% the Grubbs test, or a generalized extreme studentized deviate test for
% outliers. Set this to 'fmcd', 'ogk', or 'olivehawkins' to use different
% methods for estimating robust covariance to detect outliers
% (robustcov.m). Use 'iforest' to generate an isolation forest to identify
% outliers. Use 'ocsvm' to use a one-class SVM to identify outliers.
% Default: 'median'.
%
% 'outlierReps' : some outlier detection methods are stochastic and
% repeating them can be helpful (ocsvm and iforest). This determines the
% number of times that they will be repeated. Default: 1 (i.e., no
% repeats).
%
% 'outlierRepThresh' : a threshold that determines how often outliers need
% to be identified across repeats to be kept as outliers. Default: 1
% (i.e., an outlier must be flagged in every repeat).
%
% 'propOutliers' : some outlier detection methods force you to select a
% proportion of the data you expect to be an outlier. This is the case for
% iforest, ocsvm, fmcd, and olivehawkins. Default: 0.1.
%
% 'outlierThresh' : some outlier detection methods require a threshold
% factor that varies in meaning depending on the method used but determines
% the # of outliers identified. For 'median' it refers to the number of
% scaled MAD. For 'mean' it refers to the number of std devs from the mean.
% For 'grubbs' and 'gesd' it ranges from 0 - 1 and determines the detection
% threshold. For 'quartiles' it refers to the number of interquartile
% ranges. Default: 3 (i.e., good for 'median', the default for
% outlierMethod).
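%
% For example, removing outlier rows with an isolation forest before
% subsampling (illustrative):
%
%   [fk,~,~,~,~,~,~,oid] = stabSel(X, y, 'outlier', 'outside', ...
%       'outlierMethod', 'iforest', 'propOutliers', 0.1);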
%
%% ------------------------------------------------------------------------
% --------------------- Optional arguments for PLS ------------------------
% -------------------------------------------------------------------------
%
% 'nComp' : number of components to test. Default: 1:30
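%
% For example, multi-task selection with PLS (illustrative; here y has
% multiple columns):
%
%   fk = stabSel(X, y, 'selAlgo', 'pls', 'nComp', 1:10);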
%
%% ------------------------------------------------------------------------
% ------ Optional arguments for corr and robust linear regression ---------
% -------------------------------------------------------------------------
% None beyond 'filter' and 'filterThresh' described above.
%
% SOME WARNINGS!
% * If using complementary pairs and/or bootstrapping, we do not make any
% adjustments to how the expected number of false positives is
% calculated.
%
%% ------------------------------------------------------------------------
% Citations --------------------------------------------------------------
% -------------------------------------------------------------------------
% For original method and details of stability selection, see: Meinshausen,
% N., & Bühlmann, P. (2010). Stability selection. Journal of the Royal
% Statistical Society: Series B (Statistical Methodology), 72(4), 417-473.
%
% For complementary pairs, see: Shah, R. D., & Samworth, R. J. (2013).
% Variable selection with error control: another look at stability
% selection. Journal of the Royal Statistical Society: Series B
% (Statistical Methodology), 75(1), 55-80.
%% ------------------------------------------------------------------------
% Internal notes ----------------------------------------------------------
% -------------------------------------------------------------------------
%
% Alex Teghipco // alex.teghipco@sc.edu // July 13, 2022
% load in defaults
options = struct('maxVars',[],'propN',false,'adjProp',true,'alpha',[0.1:0.1:1],...
'lam',[],'lamInside','during','lamAEN',[],'lamRatio',1e-4,'adaptOutside',...
false,'lmn',[],'lmx',[],'ln',1000,'prop',0.5,'rep',300,'stnd',true,'numFalsePos',...
[],'fdr',[],'thresh',0.9,'parallel',false,'adaptive',false,...
'outlier','none','outlierPrepro','none','outlierPreproNonconsec',true,...
'propOutliers',0.1,'outlierPreproP',0.05,'outlierPreproN',5000,...
'outlierReps',1,'outlierRepThresh',1,'outlierMethod','median',...
'outlierDir','rows','outlierThresh',3,'selAlgo','en','lst','linear',...
'gam',1,'lrp',0.05,'rK',[1:14 15:10:size(X,2)/2],'rE',[10:15:200],...
'lr',[],'mls',[],'mns',[],'verbose',false,'logDirPref','smaller',...
'filter',false,'filterThresh',0.05,'samType','subsample','compPars',false,...
'ridgeRegSelect','largest','lamOutlier',true,'fixMax',false,'nComp',[1:30],...
'problem','regression','stack',false,'lrmn',0.001,'lrmx',1,'lrn',5,'mlsmn',[],...
'mlsmx',[],'mln',3,'mnsmn',[],'mnsmx',[],'mnsn',3,'keepScores',true,'ensType','bagged','verboseFin',true);
optionNames = fieldnames(options);
rng('default')
rng shuffle
% check if rf passed in but not lmn or lmx. Then, update defaults
vleft = varargin(1:end);
idx = find(strcmpi(vleft,'selAlgo'));
if ~isempty(idx) && strcmpi(vleft{idx+1},'ens')
idx1 = find(strcmpi(vleft,'lmn'));
idx2 = find(strcmpi(vleft,'lmx'));
if isempty(idx1)
options.lmn = 1;
end
if isempty(idx2)
options.lmx = 400;
end
end
idx2 = find(strcmpi(vleft,'lam'));
if ~isempty(idx2) && ~isempty(idx) && strcmpi(vleft{idx+1},'ens') && size(y,2) > 1
options.lam = repmat(options.lam,size(y,2),1);
end
% now parse the user-specified arguments and overwrite the defaults
for pair = reshape(vleft,2,[]) %pair is {propName;propValue}
inpName = pair{1}; % make case insensitive by using lower() here but this can be buggy
if any(strcmpi(inpName,optionNames)) % check if arg pair matches default
def = options.(inpName); % default argument
options.(inpName) = pair{2};
else
error('%s is not a valid argument',inpName)
end
end
% Throw arbitrary errors
if ~isempty(options.fdr)
error('You cannot set the FDR option at the moment, please use numFalsePos instead...')
end
% check compatibility
v = version('-release');
if str2double(v(1:end-1)) < 2022 && strcmpi(options.selAlgo,'mrmr')
error(['Uh oh, it looks like your version of matlab (' v ') is from before 2022...please install a more recent version of matlab (2022+) to use mrmr for feature selection'])
end
if (str2double(v(1:end-1)) < 2016 || (str2double(v(1:end-1)) == 2016 && ~strcmpi(v(end),'b'))) && strcmpi(options.selAlgo,'nca')
error(['Uh oh, it looks like your version of matlab (' v ') is from before 2016b...please install a more recent version of matlab (2016b+) to use NCA for feature selection'])
end
if (str2double(v(1:end-1)) < 2010 || (str2double(v(1:end-1)) == 2010 && ~strcmpi(v(end),'b'))) && strcmpi(options.selAlgo,'relieff')
error(['Uh oh, it looks like your version of matlab (' v ') is from before 2010b...please install a more recent version of matlab (2010b+) to use relieff for feature selection'])
end
if (str2double(v(1:end-1)) < 2011 || (str2double(v(1:end-1)) == 2011 && ~strcmpi(v(end),'b'))) && (strcmpi(options.selAlgo,'en') || strcmpi(options.selAlgo,'lasso') || strcmpi(options.selAlgo,'ridge'))
error(['Uh oh, it looks like your version of matlab (' v ') is from before 2011b...please install a more recent version of matlab (2011b+) to use elastic net, lasso, or ridge for feature selection'])
end
if (str2double(v(1:end-1)) < 2015 || (str2double(v(1:end-1)) == 2015 && ~strcmpi(v(end),'b'))) && strcmpi(options.selAlgo,'gpr')
error(['Uh oh, it looks like your version of matlab (' v ') is from before 2015b...please install a more recent version of matlab (2015b+) to use GPR for feature selection'])
end
if (str2double(v(1:end-1)) < 2016 || (str2double(v(1:end-1)) == 2016 && ~strcmpi(v(end),'b'))) && strcmpi(options.selAlgo,'ens')
error(['Uh oh, it looks like your version of matlab (' v ') is from before 2016b...please install a more recent version of matlab (2016b+) to use ensembles for feature selection'])
end
if str2double(v(1:end-1)) < 2008 % cvpartition is faster than manual subsampling but requires 2008
manualSubs = true;
else
manualSubs = false;
end
% warnings about compute
if size(y,2) > 1 && ~strcmpi(options.selAlgo,'pls') && ~options.stack
warning('Looks like your input y is a matrix. By default, we will independently run your selection algorithm on each task and average the rankings of features across tasks. This can be very computationally expensive and does not work for all problems. I would recommend using PLS instead if this is an issue.')
if options.keepScores
warning('You have chosen to keep scores for all tasks, this can use up a lot of memory')
end
end
% warning about ensembles
if strcmpi(options.selAlgo,'ens') && ~strcmpi(options.ensType,'bagged') && strcmpi(options.problem,'classification')
idx = find(strcmpi(options.ensType,{'LogitBoost','GentleBoost','AdaBoostM1','AdaBoostM2','RUSBoost'}));
if isempty(idx)
options.ensType = 'LogitBoost';
warning('You have selected a boosted ensemble for classification but your boosting method must be: LogitBoost GentleBoost AdaBoostM1 AdaBoostM2 or RUSBoost...Setting to LogitBoost')
elseif ~isempty(intersect(idx,[4 5]))
for i = 1:size(y,2)
un(i,1) = length(unique(y(:,i)));
end
idx = find(un < 3);
if ~isempty(idx)
options.ensType = 'LogitBoost';
warning('You must have 3+ classes to use AdaBoostM2 or RUSBoost...setting to LogitBoost')
end
end
end
% check if classification is used w/incompatible method and vice versa
if strcmpi(options.problem,'classification') && any(strcmpi(options.selAlgo,{'lasso','en','corr','robustLR','ftest','relieff','gpr','pls'}))
error('You cannot use lasso, en, corr, robustLR, ftest, relieff, gpr, or pls algorithm options with classification problems')
end
if strcmpi(options.problem,'regression') && any(strcmpi(options.selAlgo,{'ttest','wilcoxon','entropy','roc','bhattacharyya'}))
error('You cannot use ttest, wilcoxon, entropy, roc, or bhattacharyya algorithm options with regression problems')
end
% stack independent variables
oys = size(y);
idx = find(oys==1);
% reshape y in case you did not read the readme
if idx == 1
y = y';
end
if options.stack
y = reshape(y,[size(y,1)*size(y,2),1]);
X = repmat(X,oys(2),1);
end
% fix pls options
if strcmpi(options.selAlgo,'pls')
%idx = find(options.nComp >= size(y,2));
idx = find(options.nComp >= round(size(X,1)*options.prop));
if ~isempty(idx)
options.nComp(idx) = [];
warning('Removing components larger than the number of samples in your data')
end
if isempty(options.nComp)
options.nComp = 1;
end
end
% fix lasso or ridge as inputs
if strcmpi(options.selAlgo,'lasso')
options.alpha = 1;
options.selAlgo = 'en';
end
if strcmpi(options.selAlgo,'ridge')
options.alpha = 0.00001;
options.selAlgo = 'en';
end
% fix usage of filter
poss = {'mrmr','relieff','ftest','ttest','wilcoxon','entropy','roc','bhattacharyya'};
idx = find(strcmpi(options.selAlgo,poss));
if ~options.filter && ~isempty(idx)
warning(['You can only use ' poss{idx} ' as a filter. Changing options to filter'])
options.filter = true;
options.fixMax = false;
end
% fix missing maxVars when using a filter -- deleted this for correlation.
% Need to make sure others still work with this.
% if isempty(options.maxVars) && options.filter
% options.maxVars = size(X,2)/10;
% warning(['You did not specify the number of variables to use in the filter. Using a tenth of your features...'])
% end
% fix numFalsePos
if isempty(options.numFalsePos) && isempty(options.fdr)
options.numFalsePos = 1;
end
% fix size of lam
if size(options.lam,1) == 1 && size(options.lam,2) > 1
options.lam = options.lam';
end
% proportion may reflect N of subsample instead of an N to compute--if so
% convert to proportion.
if options.propN && options.prop >= 1
tmpl = options.prop;
options.prop = options.prop/size(X,1);
if options.verbose
disp('Changing subsampling N to proportion...')
disp(['To achieve N of ' num2str(tmpl) ' in subsample, proportion must be: ' num2str(options.prop)]);
end
end
% fix fixMax if necessary
if strcmpi(options.selAlgo,'en') && ~isempty(options.maxVars) && ~options.fixMax
options.fixMax = true;
warning('When passing in a maxVars with lasso, ridge, elastic net, or nca we have to set fixMax to true to ensure a max of maxVars is returned across regularization parameters. Setting it to true now.')
end
% check for weird cases
toch = {'nca','corr','robustLR'};
idx = find(strcmpi(options.selAlgo,toch));
if ~isempty(idx) && ~isempty(options.maxVars) && ~options.filter
options.filter = true;
warning(['When passing in a maxVars with ' toch{idx} ', we assume you want to use them as a filter']);
end
% warning about threshold if it is known--cannot be below 0.5 if you want
% to do fdr-ish (p or num false positives)
if ~isempty(options.thresh) && options.thresh < 0.5 && (~isempty(options.numFalsePos) || ~isempty(options.fdr))
warning('You have asked to estimate some false positives in the stable set, but your threshold is below 0.5. We can only estimate false positives when thresh is > 0.5. Fixing thresh to 0.501.')
options.thresh = 0.501;
end
% Check for missing fdr and/or numFalsePos
if (isempty(options.numFalsePos) && isempty(options.fdr)) && (isempty(options.thresh) || isempty(options.maxVars))
options.numFalsePos = 1;
warning('You did not set thresholds for number of false positives or FDR. Typically this is not a problem, but you also did not set BOTH thresh and maxVars. Setting number of false positives to 1 so that it is possible to get these variables.')
end
% if you have provided all the variables *AND* the numFalsePos, then we
% recompute your numFalsePos
if ~isempty(options.thresh) && ~isempty(options.maxVars)
warning('You have input both the stable set threshold and the number of variables to sample. Your number of false positives is probably inaccurate!!! Interpret with caution.')
end
% get number of features (avg) our regularization parameters should produce
% if user-specified threshold does not exist
if isempty(options.thresh) && isempty(options.maxVars) && (options.fixMax && strcmpi(options.selAlgo,'en')) || (~options.filter && (strcmpi(options.selAlgo,'corr') || strcmpi(options.selAlgo,'robustLR') || strcmpi(options.selAlgo,'nca')))
if ~isempty(options.numFalsePos)
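% invert the PFER bound E[V] <= q^2/((2*pi-1)*p), assuming pi = 0.9 so
% that (2*pi-1) = 0.8; this gives q = sqrt(0.8*E[V]*p)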
options.maxVars = round(sqrt(0.8*options.numFalsePos*size(X,2)));
disp(['Estimated maxVars will be: ' num2str(options.maxVars) ' based on a threshold of less than ' num2str(options.numFalsePos) ' false positives'])
else
warning('Estimating maxVars that will produce <1 false positive. If there is an FDR-like threshold set, it will be applied when estimating probability threshold for defining stable set..')
options.maxVars = round(sqrt(0.8*size(X,2)));
end
end
% check that there is not BOTH fdr and numFalsePos
if ~isempty(options.numFalsePos) && ~isempty(options.fdr)
options.numFalsePos = [];
warning('You cannot specify both an FDR-like threshold and a threshold for number of false positives. Keeping FDR.')
end
% get number of features (avg) our regularization parameters should produce
% if user-specified threshold exists
if isempty(options.maxVars) && ~isempty(options.thresh)
if ~isempty(options.fdr)
tmpii = linspace(0.0001,size(X,2)/3,100000);
disp(['Finding number of false positives for which FDR-like p-value should be : ' num2str(options.fdr)])
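% each candidate false positive count below is converted into the maxVars
% (n1) that would produce it under the PFER bound; the FDR-like p-value is
% then (false positive count)/n1, and we keep the largest count under 'fdr'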
for i = 1:length(tmpii)
n1 = sqrt(size(X,2)*(tmpii(i)/(1/((2*options.thresh)-1))));
tmpi(i,1) = ((1/((2*options.thresh)-1))*((n1.^2)/size(X,2)))/n1;
end
idx = find(tmpi < options.fdr,1,'last');
if ~isempty(idx)
options.numFalsePos = tmpii(idx);
else
options.numFalsePos = NaN;
warning('It was not possible to find a number of false positives that would ensure selected fdr-like threshold. numFalsePos will be set to 1 to estimate maxVars.')
end
end
if ~isnan(options.numFalsePos)
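% solve the same bound for the maxVars that yields the requested number of
% false positives at this threshold: n1 = sqrt(p*E[V]*(2*thresh-1))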
n1 = sqrt(size(X,2)*(options.numFalsePos/(1/((2*options.thresh)-1))));
if n1 > 1
options.maxVars = round(n1);
else
options.maxVars = round(n1+1)-1;
end
else
options.maxVars = round(sqrt(0.8*size(X,2)));
end
if options.maxVars == 0
error('The specified threshold does not allow you to have a number of maxVars (average # of features selected) that can achieve less than the number of false positives you have indicated. Increase the number of false positives (or fdr-like thresh) you are comfortable with, or decrease the threshold.')
else
disp(['Average # of features selected by method (maxVars) should be : ' num2str(options.maxVars) '. This will ensure ' num2str(options.numFalsePos) ' or fewer false positives'])
end
end
% warn about correction
if options.verbose && options.prop ~= 0.5
warning('Having a sampling proportion other than 0.5 may break estimation of the number of false positives (incl. fdr option). This may be okay to do in some cases (see original stability selection paper).')
end
% check for redundant lambda args...
if (~isempty(options.lam) && (strcmpi(options.lamInside,'across') || strcmpi(options.lamInside,'during'))) && (strcmpi(options.selAlgo,'en') || strcmpi(options.selAlgo,'gpr'))
if options.verbose
warning('You cannot pass in lambda values AND try to define lambda values during or across the resamples. Assuming turning on lamInside was a mistake...setting this option to false.')
end
options.lamInside = false;
end
if (strcmpi(options.lamInside,'across') || strcmpi(options.lamInside,'during')) && (~strcmpi(options.selAlgo,'en') && ~strcmpi(options.selAlgo,'gpr'))
if options.verbose
warning(['You cannot define lambda values during or across the resamples with your chosen selection algorithm: ' options.selAlgo '. Assuming turning on lamInside was a mistake...setting this option to false.'])
end
options.lamInside = false;
end
% warn about features as outliers
if ~strcmpi(options.outlier,'none') && strcmpi(options.outlierDir,'columns')
warning('Treating features as outliers has not been thoroughly debugged...proceed with caution')
end
% check to see if you want to do outlier removal now...
oid = [];
mdl = [];
if strcmpi(options.outlier,'outside')
[X,oid,mdl,~] = rmOutliers(X,'prepro',options.outlierPrepro,'nonconsec',options.outlierPreproNonconsec,'outlierMethod',options.outlierMethod,'outlierDir',options.outlierDir,'outlierThresh',options.outlierThresh,'pcaP',options.outlierPreproP,'pcaPermN',options.outlierPreproN,'propOutliers',options.propOutliers,'rep',options.outlierReps,'repThresh',options.outlierRepThresh);
% check if you want to adjust subsampling prop. to account for #
% outliers removed...prop will be adjusted to keep training set same
% size as if the prop selected was applied to all data...
y(oid,:) = [];
if options.verbose
if strcmpi(options.outlierDir,'rows')
disp(['Removed ' num2str(length(oid)) ' row (samples) outliers from data (' num2str((length(oid)/(size(X,1)+length(oid)))*100) '%) before subsampling'])
elseif strcmpi(options.outlierDir,'columns')
disp(['Removed ' num2str(length(oid)) ' col (features) outliers from data (' num2str((length(oid)/(size(X,2)+length(oid)))*100) '%) before subsampling'])
end
end
if options.adjProp
if strcmpi(options.outlierDir,'rows') % no adjustment if outliers are features
in = fix((size(X,1) + length(oid))*options.prop);
options.prop = in/size(X,1);
if options.verbose
disp(['Adjusting subsampling proportion to ' num2str(options.prop) ' in order to retain ' num2str(in) ' samples in each subsample'])
end
if options.prop > 1
options.prop = 1;
if options.verbose
warning('Adjusting subsampling proportion after outlier removal results in subsampling sets that comprise the entire dataset')
end
end
end
end
end
% define lambdas for EN (use max l trick unless max/min specified by user)
if (isempty(options.lam) && strcmpi(options.lamInside,'before')) && strcmpi(options.selAlgo,'en')
if ~options.adaptive % if not adaptive
if options.verbose
disp('Computing lambda values for EN (lambda values are computed from the whole dataset and the same series will be passed in to each subsample)')
end
for kk = 1:length(options.alpha)
for mm = 1:size(y,2) % Loop over output variables
[~,~,options.lam(:,kk,mm)] = defLam(X,y(:,mm),options.alpha(kk),options.stnd,options.lmx,options.lmn,options.lamRatio,options.lst,options.ln,options.logDirPref);
end
end
else % if adaptive we need to run first pass EN on whole data, get weights and get lambdas from those weights
if options.verbose
disp(['Computing lambda values for adaptive EN (lambda values are computed from the whole dataset weighted by an adaptive EN and the same series will be passed in to each subsample)'])
end
[Xtmp,~,~,~] = alasso(X,y,[],[],[],options.stnd,options.lamAEN,options.gam,options.ridgeRegSelect,options.parallel);
for kk = 1:length(options.alpha)
for mm = 1:size(y,2) % Loop over output variables
[~,~,options.lam(:,kk,mm)] = defLam(Xtmp,y(:,mm),options.alpha(kk),options.stnd,options.lmx,options.lmn,options.lamRatio,options.lst,options.ln,options.logDirPref);
end
end
if options.adaptOutside
X = Xtmp;
options.adaptive = false;
end
end
if options.verbose
disp(['The min lambda for EN/lasso is set to : ' num2str(options.lmn)])
end
end
% define lambdas for NCA
if isempty(options.lam) && strcmpi(options.selAlgo,'nca')
if isempty(options.lmn)
options.lmn = 0.001;
if options.verbose
disp(['The min lambda for NCA is set to a default of: ' num2str(options.lmn)])
end
end
if isempty(options.lmx)
options.lmx = 100;
if options.verbose
disp(['The max lambda for NCA is set to a default of: ' num2str(options.lmx)])
end
end
if strcmpi(options.lst,'linear')
options.lam = linspace(options.lmn,options.lmx,options.ln)*std(y(:))/size(y,1);
elseif strcmpi(options.lst,'log')
options.lam = exp(linspace(log(options.lmn),log(options.lmx),options.ln))*std(y(:))/size(y,1);
end
end
% define lambdas for GPR
if (isempty(options.lam) && strcmpi(options.lamInside,'before')) && strcmpi(options.selAlgo,'gpr')
if isempty(options.lmn)
options.lmn = 1e-2*std(y(:))/3;
if options.verbose
disp(['The min lambda for GPR is set to a default of: ' num2str(options.lmn)])
end
end
if isempty(options.lmx)
options.lmx = 1e-2*std(y(:))*3;
if options.verbose
disp(['The max lambda for GPR is set to a default of: ' num2str(options.lmx)])
end
end
if strcmpi(options.lst,'linear')
options.lam = linspace(options.lmn,options.lmx,options.ln);
if options.verbose
disp(['Lambdas are on linear scale...(we make ' num2str(options.ln) ' of them)'])
end
else
options.lam = exp(linspace(log(options.lmn),log(options.lmx),options.ln));
if options.verbose
disp(['Lambdas are on log scale...(we make ' num2str(options.ln) ' of them)'])
end
end
end
% define lambdas for RF -- note we currently only use lam (number of
% learning cycles) because doing this much tuning is expensive
if isempty(options.lam) && strcmpi(options.selAlgo,'ens')
if strcmpi(options.lst,'linear')
options.lam = linspace(options.lmn,options.lmx,options.ln);
else
options.lam = exp(linspace(log(options.lmn),log(options.lmx),options.ln));
end
end
if isempty(options.lr) && strcmpi(options.selAlgo,'ens')
if strcmpi(options.lst,'linear')
options.lr = linspace(options.lrmn,options.lrmx,options.lrn); % 1 500
else