-
Notifications
You must be signed in to change notification settings - Fork 0
/
UMLAUT.pro
1106 lines (954 loc) · 45.6 KB
/
UMLAUT.pro
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
FUNCTION UMLAUT, AI, BI, AO, BO, SO, NVV, CLN=CLN, MIN_CLN=MIN_CLN, TYPE_CLN=TYPE_CLN, AVERAGE=AVERAGE, test=test,fit_thresh=fit_thresh,scope=scope,CLAS_VECT=CLAS_VECT,CLAS_PROB=CLAS_PROB,CLAS_UNC=CLAS_UNC,GETPDF=GETPDF, X_PDF=X_PDF, PDF=PDF,def_x_PDF=def_x_PDF,PSM=PSM,SO_TYPE=SO_TYPE,OPTIMIZE_DIM=OPTIMIZE_DIM,BAL=BAL,OUT_SCALINGS=OUT_SCALINGS,IN_SCALINGS=IN_SCALINGS
; XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
; UMLAUT 1.0
; XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
; Unsupervised Machine Learning Algorithm based on Unbiased Topology
; By Ivano Baronchelli, May/2019 - Jan/2021
; See also: Baronchelli et al. (2021)
; UMLAUT is a variant of the KNN (K-closest neighbor) algorithm.
; - Given a set of reference data points (training set), for which
; the value of N+1 parameters is known,
; - given one analysis data point with N parameters known and the
; (N+1)-th parameter unknown,
; -> UMLAUT computes the value of the (N+1)-th parameter for the
; analysis data point. To this purpose, UMLAUT finds the closest
; data-points, from a reference sample, in a N-dimensional space
; "associated" (see NOTE 1 below) with the parameter space.
; After finding the closest data points, the unknown parameter
; is obtained as the combination of the values assumed by the
; closest reference data points, along the (N+1)-th dimension.
; >> NOTES:
; 1) the "associated" N-dimensional space is NOT the parameter
; space itself. During the training phase,
; - every dimension is "ordinalized": the actual value assumed
; by each of the M data points of the reference sample along
; the N dimensions is replaced by the position (1,2,...,M)
; of the data point itself in an ordered scale
; - The N ordinalized dimensions are scaled following a
; weighting process that tries to minimize the dispersion
; associated with the estimated (N+1)-th parameter.
; 2) The simplest configuration, with only one unknown
; parameter (the [N+1]-th), is described above. In reality,
; UMLAUT can be used to determine many (no limitations)
; unknown parameters. Obviously, the same parameters must be
; known for the data points of the reference sample.
; 3) UMLAUT is originally designed for REGRESSION purposes, but
; it can also be used for CLASSIFICATION. However, when used for
; classification purposes, the current version of UMLAUT
; a) does not support the weighting of the input
; parameters/dimensions (OPTIMIZE_DIM='no')
; b) provides the classification of one single parameter
; (1-dimensional CLAS_VECT input vector)
; 4) UMLAUT can be trained and tested using the same sample (the
; keyword "test" must be set in this case). As demonstrated in
; Baronchelli et al. (2021), This configuration does not
; introduce overfitting problems, as the training is performed
; using a "leave one out" strategy, where the data point left
; out is properly the data point tested.
; XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
; INPUT PARAMETERS:
; XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
; AI= input array AI[N,M] characterized by:
; --> N dimensions (independent input parameters)
; --> M elements (elements or data points used to train the
; algorithm).
; The NxM values in AI are those assumed by the N input
; parameters for each of the M data point of the training set.
; For every data point in this array, the value of the
; independent variable B (the N+1 unknown parameter) is known.
; BI= input vector BI[M] containing the values assumed by the
; M elements of the training sample along the (N+1)-th independent
; dimension (B).
; If there are more than just one independent variable that the
; user wants to estimate, then BI[M,O] is a MxO array. Here, "O"
; is the number of parameters that the user wants to estimate for
; the analysis data point (they are known for the data points
; in the training sample). When UMLAUT is used for classification
; purposes, then BI contains the labels associated with the
; training data points
; AO= Input array AO[N,L] similar to AI, characterized by:
; --> N dimensions (number of independent input parameters)
; --> L elements (number of analysis data points for which the
; user wants to compute the independent variable B).
; The parameter B will be estimated for the L datapoints;
; NVV= Input N dimensional vector specifying, for each of the N
; dimensions considered, the Not Valid Values that should not
; be considered (example: -99., 0, etc...). When one of the
; dimensions, for a certain datum, assumes the value specified
; in NVV, that dimension is not considered.
; CLN= Number of closest elements to be considered for evaluating
; the independent variable. If not set, a warning message is
; issued and a default value corresponding to (M/50)+1 (i.e. ~2%
; of the available training data points) is assumed.
; MIN_CLN= when TYPE_CLN is set to "min_max", MIN_CLN sets the minimum
; value of CLN to be considered (whereas CLN is considered
; the maximum, in this case). This parameter is not taken
; into account when the scope parameter is set to
; "classification".
; TYPE_CLN= if this keyword is not set, or if it is set to the default
; option 'fixed', then CLN represents the amount of closest
; datapoints that will be considered by UMLAUT to determine
; BO. With TYPE_CLN='fixed' (or not set), the MIN_CLN option
; is not taken into account. If TYPE_CLN is set to
; "min_max", then the algorithm automatically finds, for
; each element in AO, the best value of CLN that should be
; used, ranging from MIN_CLN and CLN (that are set by the
; user). TYPE_CLN is set to 'fixed' when
; scope='classification'.
; AVERAGE= Type of average to be considered. Options are: "median",
; "mean", "mode", "weighted", "fit".
; - Default option when scope="regression" is "mean".
; - Default option when scope="classification" is "mode".
; Besides "median" and "mean", whose meaning is clear, the
; "weighted" option distributes the weights normally, with
; CLN assumed to be sigma of that distribution. In order of
; N-dimensional distance, the i-th element is weighted as:
; W=exp{- (i^2) /(2*CLN^2)}
; Using the option "fit", B (the [N+1]-th parameter) is
; obtained from a N-dimensional linear fit of the closest
; reference data points. This option is particularly useful
; when the values assumed by some of the N parameters of the
; analysis data point(s) are located at the border of (or
; outside) the range of values expressed by the training
; sample. After selecting the closest reference elements,
; the data points having the (N+1)th parameter that differs
; from the average more than "fit_thresh" times the
; dispersion, are excluded from the fit. The fit_thresh
; parameter is optional.
; The "mode" option is valid only for "classification"
; purposes (scope="classification"), and it represents
; the default option in this configuration. However, in this
; case, also the "weighted" option is available. When UMLAUT
; is used for classification purposes, then the options
; "median", "mean" and "fit" have no sense, as the output is
; not a real number but it is a label. In this case, the
; output label can be obtained as the "mode" (default) of the
; closest data points of the training set or as the
; "weighted" mode. The weighting factor is computed as
; described above and it takes into account the distances
; from the analysis data point.
; BAL= when this keyword is set (/BAL), (only for scope=classification
; configuration), the estimated output label of the analysis data
; point is obtained by weighting the probabilities associated to
; each possible label (CLAS_PROB output) taking into account the
; fraction of training data points that are labelled with the
; same label, with respect to the total.
; test= setting this keyword, the closest datapoint of the training
; set is not taken into account when computing the output
; parameter (BO). This configuration should be used when
; UMLAUT is trained and tested on overlapping or even identical
; datasets. Notice that training UMLAUT in this way does not
; introduce overfitting problems, as the "leave one out"
; testing strategy is adopted (See Baronchelli et al. 2021).
; scope= set this keyword to "regression" or to "classification".
; Default is regression.
; - If set to regression, BI must contain real numbers
; (example BI[0]=3.45, BI[1]=2.21, BI[3]=1.5, BI[4]=5.7
; etc...). In this case, UMLAUT provides the best estimation
; of the output parameter (BO array) as a real number.
; - If set to "classification", BI must contain a discrete
; classification for each of the single data point of the
; training set (example BI[0]='OII', BI[1]='Ha',
; BI[3]='Hb',BI[4]='Ha', etc...). Moreover,
; > in the input vector "CLAS_VECT", the user must specify
; the different possibilities (labels);
; > in the output array "CLAS_PROB", UMLAUT provides the
; probabilities associated to each of the possible input
; labels.
; > in the output array "CLAS_UNC", UMLAUT provides the
; poissonian uncertainties associated to each of the
; probabilities indicated in "CLAS_PROB".
; Example:
; > CLAS_VECT=["Ha", "Hb", "OIII", "OII"] (input vector);
; > CLAS_PROB=[0.6,0.1,0.2,0.1] (output vector for one
; single analysis data point).
; > CLAS_UNC=[0.12,0.02,0.03,0.01] (output vector for one
; single analysis data point).
; Under the "classification" configuration, TYPE_CLN is set to
; "fixed" and the AVERAGE keyword is not considered (the
; output is not an average).
;GETPDF=setting this keyword, (/GETPDF), UMLAUT provides an output
; PDF for the output parameter BO (See also the "X_PDF", "PDF",
; and "PSM" parameters. The PDF is not provided under the
; scope="classification" configuration.
;def_x_PDF=setting this keyword (/def_x_PDF) "X_PDF" (see below) is
; not considered as an input. Instead, x_PDF will be
; overwritten by a default scale automatically selected by
; UMLAUT. If scope="classification", this parameter is not
; taken into account.
;PSM=smoothing factor applied to the output PDF. A good compromise
; for this parameter is PSM ~1/1000 - 1/200 of the total number
; of data points in the training set. If scope="classification",
; this parameter is not taken into account.
;OPTIMIZE_DIM=set to "yes" (default) to let the algorithm free to
; weight the input dimensions after their
; ordinalization. If scope="classification", dimensions
; are not optimized.
;SO_TYPE=Type of uncertainties that will be included in the output
;        vector SO. These uncertainties are associated with the
; estimated values of the output parameter B (BO). OPTIONS:
; sigma --> sigma(default),
; perc --> percentiles 16%-84%,
; aver --> average between sigma and symmetrized percentiles
;IN_SCALINGS=optional NxO elements input vector containing the weights
; associated to each of the N input parameters
; (dimensions). If more than one single output parameter B
; has to be estimated (O>1), than the user can provide a
; different set of weights, one for each of the output
;             parameters to estimate. Notice that this vector can
; be obtained from previous runs of UMLAUT. In fact, the
; output vector OUT_SCALINGS provides the list of weights
; used for each of the analysis data points. Hence,
; each of the N elements of IN_SCALINGS can be obtained
; as the average weights reported in OUT_SCALINGS (along
; each of the N dimensions), computed in previous
; runs. In an iterative strategy, at each run of UMLAUT,
;             the IN_SCALINGS values of the previous iterations can be
; multiplied for the weights obtained from the newer
; iterations, in order to obtain more and more precise
; weights at the end.
;fit_thresh=optional parameter to be used when "AVERAGE" is set to
; "fit". After selecting the closest reference elements,
; the data points having the (N+1)th parameter that differs
; from the average more than "fit_thresh" times the
; dispersion, are excluded from the fit. The fit_thresh
;           parameter is optional.
; XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
; OUTPUT PARAMETERS:
; XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
; BO=output vector BO[L] with L elements. It contains the output
; estimations of the (N+1)th parameter (B) for all the analysis
; data points. For each of the analysis data point, the values of
; the N parameters are specified in the input array AO (see
; above). If there are more than just one independent variable
; that the user wants to estimate, then BO is a LxO array. Here,
; "O" is the number of parameters that the user wants to estimate
; for the analysis data point (they must be known for the data
; points in the training sample).
; SO=output vector SO[L] containing the values of dispersion
; associated with the estimations of the independent parameter B,
; (specified in BO). The type of uncertainty that the user wants
; to use (sigma/percentiles) can be specified in the "SO_TYPE"
; array.
; CLAS_VECT=see (input parameter "scope" )
; CLAS_PROB=see (input parameter "scope" )
; CLAS_UNC=see (input parameter "scope" )
; PDF= Output Probability Distribution Functions. One PDF for every
; datum is given in output. The PDF is computed in the binning
; decided by the user if X_PDF is set to a particular vector.
; OUT_SCALINGS=LxNxO vector specifying, for each of the L analysis
; elements and for each of the N input dimensions
; (parameters), the weight used to compute the output
;              value of the unknown parameter B. If more than one
; output parameter has to be estimated (O>1), then the
; weights are estimated for each of the output
; parameters computed.
; -------------------------------------------------------------------
; XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
; INPUT/OUTPUT PARAMETERS:
; XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
; X_PDF= x values for the output Probability Distribution Functions.
; > IF the "def_x_PDF" keyword is set, X_PDF is considered an
; output and it corresponds to the vector BI.
; > if the "def_x_PDF" keyword is NOT set, X_PDF is an input
; vector that should be defined by the user.
; When the user wants to estimate just one single output
; parameter (i.e., BI and BO are LxO arrays with O=1), then
; X_PDF is a one dimensional vector made of H elements (the
; number of elements H depends on how the user set "X_PDF"
; itself and "def_x_PDF". Instead, when O>1, X_PDF is a HxO
; array of elements.
; --------------------------------------------------------------------
; ADDITIONAL NOTES:
; - When UMLAUT evaluates one of the input dimensions, if there are
; too few elements in the training set with a valid value along the
; same dimension (less than 3*CLN), then the dimension is not
; considered at all.
; - In the classification configuration, the output array BO is always
; considered as an array of strings.
; --------------------------------------------------------------------
; Dimensions and corresponding cycling variables in this program:
; O --> OO - output dimensions to estimate
; L --> LL - analysis data points
; N --> EE and NN - known dimensions
; --------------------------------------------------------------------
;MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
; INTERNAL PARAMETERS
;MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
N_EXP_step=+1. ; Initial "expansion" or "scaling" factor
Prize1=0.2 ; Positive prize to dimensions that decrease
; the sigma on the output parameter when expanded
Prize2=-0.1 ; negative prize to dimensions that increase
; the sigma on the output parameter when expanded
Prize3=0.4 ; Special prize for the dimension that provides
; the lowest (best) value of sigma when expanded
MAX_MM=25 ; Maximum number of iterations per analysis element
Reduction_factor=2.
;MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
;MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
; INTERNAL PARAMETERS - TEST VERSION
;MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
N_EXP_step=+1. ; "expansion" or "scaling" factor
Prize1=0.2 ; Positive prize to dimensions that decrease
; the sigma on the output parameter when expanded
Prize2=-0.1 ; negative prize to dimensions that increase
; the sigma on the output parameter when expanded
Prize3=0.4 ; Special prize for the dimension that provides
; the lowest (best) value of sigma when expanded
MAX_MM=25 ; Maximum number of iterations per analysis element
Reduction_factor=2;1/(1-1./(2*MAX_MM))
;MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
N_DIM=n_elements(AI[0,*]) ; Number of input dimensions (N)
M_ELE=n_elements(AI[*,0]) ; Number of training data points (M)
O_DIM=n_elements(BI[0,*]) ; Number of output dimensions (O)
L_ELE=n_elements(AO[*,0]) ; Number of analysis data points (L)
; OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
; DEFAULT VALUES, WARNINGS AND ERRORS
; OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
if not(keyword_set(scope)) then scope='regression'
scope=strlowcase(scope)
if not(keyword_set(SO_TYPE)) then SO_TYPE="sigma"
SO_TYPE=strlowcase(SO_TYPE)
if not(keyword_set(TYPE_CLN)) then TYPE_CLN='fixed'
TYPE_CLN=strlowcase(TYPE_CLN)
if not(keyword_set(OPTIMIZE_DIM)) then OPTIMIZE_DIM='yes'
OPTIMIZE_DIM=strlowcase(OPTIMIZE_DIM)
if not(keyword_set(CLN)) then begin
print, "************ UMLAUT WARNING *************"
print, "The relevant parameter 'CLN' has not been set"
print, "This numer represents the number of closest"
print, "data points, in the multi-dimensional"
print, "parameter space, to be considered."
CLN=long((M_ELE/50)+1) ; <-- ATTENTION !!!
print, "CLN=(M/50)+1= "+strcompress(string(CLN))+" assumed"
print, "*****************************************"
endif
if not(keyword_set(MIN_CLN)) then MIN_CLN=CLN
if scope ne 'regression' and scope ne 'classification' then begin
print, "************ UMLAUT WARNING *************"
print, "Parameter 'scope' set to unknown value "
print, "scope=",scope
print, "scope=regression assumed'"
print, "*****************************************"
scope='regression'
endif
if TYPE_CLN ne 'fixed' and TYPE_CLN ne 'min_max' then begin
print, "************ UMLAUT WARNING *************"
print, "Parameter 'TYPE_CLN' set to unknown value "
print, "TYPE_CLN=",TYPE_CLN
print, "TYPE_CLN='fixed' assumed'"
print, "*****************************************"
scope='regression'
endif
if MIN_CLN gt CLN then begin
print, "************ UMLAUT WARNING *************"
if TYPE_CLN eq 'fixed' then begin
print, "Parameter MIN_CLN must be smaller than CLN!"
print, "In any case, TYPE_CLN is set to 'fixed' and"
print, "MIN_CLN will be ignored"
endif
if TYPE_CLN eq 'min_max' then begin
print, "Parameter MIN_CLN must be smaller than CLN!"
print, "MIN_CLN=CLN assumed"
print, "TYPE_CLN='fixed' assumed"
MIN_CLN = CLN
TYPE_CLN="fixed"
endif
print, "*****************************************"
endif
if n_elements(AI[0,*]) ne n_elements(AO[0,*]) then begin
print, "************* UMLAUT ERROR **************"
print, "The number of parameters (N) in AI and AO "
print, "must be the same."
print, "N(AI)="+strcompress(string(n_elements(AI[0,*])))
print, "N(AO)="+strcompress(string(n_elements(AO[0,*])))
print, "*****************************************"
stop
endif
if n_elements(AI[*,0]) ne n_elements(BI[*,0]) then begin
print, "************* UMLAUT ERROR **************"
print, "The number of training data points (M)"
print, "in AI and BI must be the same."
print, "M(AI)="+strcompress(string(n_elements(AI[*,0])))
print, "M(BI)="+strcompress(string(n_elements(BI[*,0])))
print, "*****************************************"
stop
endif
if SO_TYPE ne 'sigma' and SO_TYPE ne 'perc' and SO_TYPE ne 'aver' then begin
print, "************ UMLAUT WARNING *************"
print, "Parameter 'SO_TYPE' set to unknown value "
print, "SO_TYPE=",SO_TYPE
print, "SO_TYPE='sigma' assumed"
print, "*****************************************"
SO_TYPE="sigma"
endif
if OPTIMIZE_DIM ne 'yes' and OPTIMIZE_DIM ne 'no' then begin
print, "************ UMLAUT WARNING *************"
print, "Parameter 'OPTIMIZE_DIM' set to unknown value "
print, "OPTIMIZE_DIM=",OPTIMIZE_DIM
print, "OPTIMIZE_DIM='yes' assumed"
print, "*****************************************"
OPTIMIZE_DIM='yes'
endif
if scope eq 'classification' then TYPE_CLN='fixed'
if not(keyword_set(AVERAGE)) and scope eq 'regression' then AVERAGE='mean' ; Default type of average is the mean when UMLAUT is used for regression
if not(keyword_set(AVERAGE)) and scope eq 'classification' then AVERAGE='mode' ; Default type of average is the mode when UMLAUT is used for classification
AVERAGE=strlowcase(AVERAGE)
if AVERAGE ne "median" and AVERAGE ne "mean" and AVERAGE ne "mode" and AVERAGE ne "weighted" and AVERAGE ne "fit" then begin
print, "************ UMLAUT WARNING *************"
print, "Parameter 'AVERAGE' set to unknown value "
print, "AVERAGE=",AVERAGE
if scope eq 'regression' then begin
AVERAGE='mean'
print, "AVERAGE='mean' assumed"
endif
if scope eq 'classification' then begin
AVERAGE='mode'
print, "AVERAGE='mode' assumed"
endif
print, "*****************************************"
endif
; OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
; DEFAULT VALUES, WARNINGS AND ERRORS - END
; OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
SO=dblarr(L_ELE,O_DIM)
SO[*,*]=-99.
if scope eq 'regression' then begin
BO=dblarr(L_ELE,O_DIM)
BO[*,*]=-99.
endif
if scope eq 'classification' then begin
BO=strarr(L_ELE,O_DIM)
BO[*,*]="-99."
CLAS_PROB=dblarr(L_ELE,n_elements(CLAS_VECT[*,0]),O_DIM)
CLAS_UNC=dblarr(L_ELE,n_elements(CLAS_VECT[*,0]),O_DIM)
endif
if not(keyword_set(test)) then CLOSEST=0
if keyword_set(test) then begin
CLOSEST=1
CLN=CLN+1
endif
if TYPE_CLN eq 'fixed' then MIN_CLN=CLN
MIN_CLN=CLOSEST+MIN_CLN
; Overall sigma of the BI distribution
if scope eq 'regression' then begin
SIGMA_COMPARE=dblarr(O_DIM)
OO=0L
while OO lt O_DIM do begin
SIGMA_COMPARE_perc=percentiles(BI[*,OO], value=[0.16,0.84])
SIGMA_COMPARE_el=0.5*abs(SIGMA_COMPARE_perc[1]-SIGMA_COMPARE_perc[0])
SIGMA_COMPARE[OO]=(SIGMA_COMPARE_el+SIGMA(BI[*,OO]))/2.
OO=OO+1
endwhile
endif
PDF=0.
if keyword_set(GETPDF) and scope eq 'regression' then begin
if keyword_set(def_x_PDF) then begin
X_PDF=dblarr(n_elements(BI[*,0]),O_DIM)
PDF=dblarr(L_ELE,n_elements(BI[*,0]),O_DIM)
endif
if not keyword_set(def_x_PDF) then begin
X_PDF=X_PDF
PDF=dblarr(L_ELE,n_elements(X_PDF[*,0]),O_DIM)
endif
;-----------------------------------------
X_PDF0=dblarr(n_elements(BI[*,0]),O_DIM)
OO=0L
while OO lt O_DIM do begin
X_PDF0[*,OO]=BI[*,OO]
X_PDF0[*,OO]=X_PDF0[sort(X_PDF0[*,OO]),OO]
UNIDX=uniq(X_PDF0[*,OO])
X_PDF0[0:n_elements(UNIDX)-1,OO]=X_PDF0[UNIDX,OO]
if n_elements(X_PDF0[*,OO]) gt n_elements(UNIDX) then begin
X_PDF0[n_elements(UNIDX):n_elements(X_PDF0[*,OO])-1,OO]=min(X_PDF0[*,OO])-99.
endif
if keyword_set(def_x_PDF) then begin
X_PDF[*,OO]=X_PDF0[*,OO]
endif
OO=OO+1
endwhile
;-----------------------------------------
endif
OUT_SCALINGS=dblarr(L_ELE,n_elements(AI[0,*]),O_DIM)
OUT_SCALINGS[*]=-99.
; OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
; ORDINALIZATION
; OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
; Rescale the input N dimensions to a constant density scale.
; This transforms the physical scale along each dimension into
; an ORDINAL SCALE
AI_NEW=AI
AO_NEW=AO ; Values are set in the next passage
EE=0L
WHILE EE LT N_DIM DO BEGIN ; Cycle on the N dimensions
;--------------------------------------
;1) SET Nan and inf to NVV (Not valid values)
ADD_NVV_AI_idx=where(AI[*,EE] ne AI[*,EE])
ADD_NVV_AO_idx=where(AO[*,EE] ne AO[*,EE])
if ADD_NVV_AI_idx[0] ne -1 then AI[ADD_NVV_AI_idx,EE]=NVV[EE]
if ADD_NVV_AO_idx[0] ne -1 then AO[ADD_NVV_AO_idx,EE]=NVV[EE]
;---------------------------------------
; 2 Set the values of the rescaled AI array
IDX_V=where(AI[*,EE] ne NVV[EE]); Valid values
IDX1=sort(AI[IDX_V,EE]) ; Sorted indexes (only for valid values)
AI_NEW[IDX_V[IDX1],EE]=findgen(n_elements(IDX1))/double(n_elements(IDX1))
;---------------------------------------
; 3) Set the values for the rescaled AO array
LL=0L
WHILE LL LT L_ELE DO BEGIN ; Cycle on the analysis elements (data points)
AO_NEW[LL,EE]=NVV[EE]
IF AO[LL,EE] ne NVV[EE] THEN BEGIN ; only for valid AO values
Dist=AI[*,EE]-AO[LL,EE]
IDXV1=where(Dist gt 0 and AI[*,EE] ne NVV[EE])
IDXV2=where(Dist lt 0 and AI[*,EE] ne NVV[EE])
V1=min(abs(Dist))
V2=V1
IDXV1B=[-1]
IDXV2B=[-1]
if IDXV1[0] ne -1 then V1=min(Dist[IDXV1],IDXV1B); min index written in IDXV1B
if IDXV2[0] ne -1 then V2=max(Dist[IDXV2],IDXV2B); min index written in IDXV2B
; IDXV1B=where(Dist eq V1)
; IDXV2B=where(Dist eq V2)
IDXV1B=IDXV1B[0]
IDXV2B=IDXV2B[0]
if IDXV1[0] ne -1 and IDXV2[0] ne -1 then AO_NEW[LL,EE]=interpol([AI_NEW[IDXV1[IDXV1B[0]],EE],AI_NEW[IDXV2[IDXV2B[0]],EE]],[AI[IDXV1[IDXV1B[0]],EE],AI[IDXV2[IDXV2B[0]],EE]],AO[LL,EE])
; KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK
if IDXV1[0] eq -1 and IDXV2[0] ne -1 then AO_NEW[LL,EE]=AI_NEW[IDXV2[IDXV2B[0]],EE]
if IDXV1[0] ne -1 and IDXV2[0] eq -1 then AO_NEW[LL,EE]=AI_NEW[IDXV1[IDXV1B[0]],EE]
if IDXV1[0] eq -1 and IDXV2[0] eq -1 then AO_NEW[LL,EE]=NVV[EE];min(AI_NEW[EE,*])
; KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK
ENDIF
LL=LL+1
ENDWHILE
EE=EE+1
ENDWHILE
; OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
; ORDINALIZATION - END -
; OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
; OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
; Evaluate valid data points to compare with
; OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
LL=0L
WHILE LL LT L_ELE DO BEGIN ; Cycle on the evaluation elements
; Valid dimensions N for the data point LL under evaluation.
VDK=where(AO_NEW[LL,*] ne NVV) ; indexes of Valid Dimensions for this LL
; (Dimensions for which this analysis
; datum (LL) has a valid value)
if VDK[0] ne -1 then begin ; if there is one valid dimension at least...
; Only valid values in arrays AI and BI will be considered.
; Only input dimensions N for which the LL element has a valid
; value will be taken into account. Sources in the training set
; with no valid values in one of these VDK dimensions will not be
; considered, UNLESS there are too few data points to compare with.
VECT_VALID_M=lonarr(M_ELE) ; 1=valid data point. 0=not valid
VECT_VALID_M[*]=1
; FIND VALID COMPARISON DATA POINTS IN THE TRAINING SET
NN=0L
WHILE NN LT n_elements(VDK) DO BEGIN ; Cycle on the valid dimensions
NVM=where(AI_NEW[*,VDK[NN]] eq NVV[NN]) ; not valid training data points (total=M)
;1) Don't consider this dimension if there are too few datapoints available:
; in the training set:
if n_elements(AI_NEW[*,VDK[NN]])-n_elements(NVM) lt 3.*CLN then begin
VDK=VDK[where(VDK ne NN)]; remove index from VDK (don't consider this dimension)
endif else begin
;2) Otherwise, remove all the data points of the training set that
; don't have the same dimensions available:
if NVM[0] ge 0 then VECT_VALID_M[NVM]=0
endelse
NN=NN+1
ENDWHILE
M_ELE_OK_idx=where(VECT_VALID_M eq 1) ; indexes of usable training datapoints
M_ELE_OK=n_elements(M_ELE_OK_idx) ; Number of usable training datapoints
;MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
; DIMENSIONS SCALING
;MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
N_EXP=dblarr(n_elements(VDK),O_DIM)
N_EXP[*,*]=1.
N_EXP_s=N_EXP_step ;=+1. ; Initial "expansion" or "scaling" factor
;Prize1=0.2
;Prize2=-0.1
;Prize3=0.4
vect_exp=lonarr(n_elements(VDK),O_DIM)
vect_exp[*,*]=0
EXIT=strarr(O_DIM)
EXIT[*]='no'
AI2_TOT=dblarr(M_ELE_OK,O_DIM)
DISTANCES=dblarr(M_ELE_OK,O_DIM); multi-dimensional distances from data point K
AI2_TOT_1EXP=dblarr(M_ELE_OK,n_elements(VDK),O_DIM)
DISTANCES_1EXP=dblarr(M_ELE_OK,n_elements(VDK),O_DIM) ; multi-dimensional distances from data point K (one dimension expanded)
IDX_SD_X=lonarr(M_ELE_OK,O_DIM) ; Indexes of sorted distances along each dimension
;;;;; IDX_SD_1EXP=lonarr(M_ELE_OK,n_elements(VDK),O_DIM) ;
IDX_SD_1EXP=lonarr(M_ELE_OK) ;
NEW_CLN=lonarr(n_elements(VDK),O_DIM)
PC_SIG_B_SAVE=dblarr(n_elements(VDK),O_DIM)
NEW_CLN[*,*]=CLN
AI_NEW2=AI_NEW
AO_NEW2=AO_NEW
; MAX_MM=MAX_MM_SET;25
; Iterative optimization of the scaling factors N_EXP: at most MAX_MM
; passes, or until every output dimension has converged (EXIT all 'yes').
MM=0L ; iteration counter
WEXIT=where(EXIT eq 'no') ; outputs still being optimized
WHILE MM lt MAX_MM and WEXIT[0] ne -1 DO BEGIN
; Reset the per-iteration accumulators for the still-active outputs.
AI2_TOT[*,WEXIT]=0.
DISTANCES[*,WEXIT]=0.
AI2_TOT_1EXP[*,*,WEXIT]=0.
PC_SIG_B_SAVE[*,*]=0.
DISTANCES_1EXP[*,*,WEXIT]=0.
IDX_SD_X[*,WEXIT]=-99
;;;;; IDX_SD_1EXP[*,*,WEXIT]=-99
IDX_SD_1EXP[*]=-99
OO=0L
while OO lt O_DIM do begin ; Cycle on the output DIMENSIONS
if EXIT[OO] eq 'no' then begin
; Optionally (re)start from user-supplied scalings.
if keyword_set(IN_SCALINGS) then begin
N_EXP[*,OO]=IN_SCALINGS[VDK,OO]
endif
NN=0L
WHILE NN LT n_elements(VDK) DO BEGIN ; Cycle on the available DIMENSIONS
; Method: uses only calibration datapoints with a valid value for
; all the same N-dimensions (listed in the VDK vector) as those
; available for the evaluation element
Projected_D0=AI_NEW[M_ELE_OK_idx,VDK[NN]]-AO_NEW[LL,VDK[NN]] ; projected distances
Projected_D0=Projected_D0*N_EXP[NN,OO] ; NOTE: PROJECTED DISTANCES ARE EXPANDED, not the original coordinates!
AI2_TOT[*,OO]=AI2_TOT[*,OO]+(Projected_D0)^2 ; Dimensions not expanded (MM=0) or expanded as best solution indicates
; NOTE(review): Projected_D0 already carries the factor N_EXP[NN,OO], so the
; trial term below applies (N_EXP+step) ON TOP of the current scaling rather
; than replacing it -- confirm this compounding is intended.
AI2_TOT_1EXP[*,NN,OO]=((Projected_D0*1.*(N_EXP[NN,OO]+N_EXP_s))^2)-(Projected_D0^2) ; one additional dim. expansion (completed below)
NN=NN+1
ENDWHILE ; WHILE on the N (input) DIMENSIONS
DISTANCES[*,OO]=sqrt(AI2_TOT[*,OO]) ; multi-dimensional distances from data point LL
IDX_SD_OO=SORT(DISTANCES[*,OO]) ; Indexes of sorted distances
; Sigma for the elements in the selection (no dimension expansion)
;PSIG_A=percentiles(BI[M_ELE_OK_idx[IDX_SD_OO[CLOSEST:CLN-1]],OO], value=[0.16,0.84])
;PERCEN_SIGMA_A=(PSIG_A[1]-PSIG_A[0])/2.
if scope eq 'regression' then begin
PERCEN_SIGMA_A=SIGMA(BI[M_ELE_OK_idx[IDX_SD_OO[CLOSEST:CLN-1]],OO])
PERCEN_SIGMA_LOW=PERCEN_SIGMA_A ; reference
endif
if scope eq 'classification' then OPTIMIZE_DIM='no' ; no scaling optimization for classification
vect_exp[*,OO]=0
EXIT[OO]='yes' ; assume converged; any change below resets it to 'no'
if OPTIMIZE_DIM eq 'yes' then begin
NN=0L
WHILE NN LT n_elements(VDK) DO BEGIN ; Cycle on the available input DIMENSIONS
; Complete the one-dimension-expanded squared distances and measure
; the neighbour scatter each trial expansion would produce.
AI2_TOT_1EXP[*,NN,OO]=AI2_TOT[*,OO]+AI2_TOT_1EXP[*,NN,OO]
DISTANCES_1EXP[*,NN,OO]=sqrt(AI2_TOT_1EXP[*,NN,OO])
IDX_SD_1EXP=SORT(DISTANCES_1EXP[*,NN,OO]) ; Indexes of sorted distances
; 1) Compute sigma to compare
;PSIG_B=percentiles(BI[M_ELE_OK_idx[IDX_SD_1EXP[CLOSEST:CLN-1]],OO], value=[0.16,0.84])
;PERCEN_SIGMA_B=(PSIG_B[1]-PSIG_B[0])/2.
PERCEN_SIGMA_B=SIGMA(BI[M_ELE_OK_idx[IDX_SD_1EXP[CLOSEST:CLN-1]],OO])
PC_SIG_B_SAVE[NN,OO]=PERCEN_SIGMA_B
; 2) compare sigma and set prizes
; Expansion helped: reward this dimension (Prize1) and keep iterating.
if PERCEN_SIGMA_B lt PERCEN_SIGMA_A then begin ;and PERCEN_SIGMA_B lt PERCEN_SIGMA_LOW then begin
N_EXP[NN,OO]=N_EXP[NN,OO]+ Prize1*N_EXP_s
vect_exp[NN,OO]=1
EXIT[OO]='no'
endif
; Expansion did not help: apply Prize2 (presumably negative -- confirm),
; clamping the factor at zero.
if PERCEN_SIGMA_B ge PERCEN_SIGMA_A then begin
if N_EXP[NN,OO] gt 0 then begin
N_EXP[NN,OO]=N_EXP[NN,OO]+Prize2*N_EXP_s
EXIT[OO]='no'
endif
if N_EXP[NN,OO] le 0 then begin
N_EXP[NN,OO]=0.
endif
endif
NN=NN+1
ENDWHILE ; WHILE on N (input) DIMENSIONS
; No dimension improved: shrink the step, and declare convergence once
; the step becomes negligible.
if total(vect_exp[*,OO]) le 0 then begin
;EXIT[OO]='yes'
;N_EXP_s=N_EXP_s/2. ; expansion factor
;N_EXP_s=N_EXP_s*(1-1./(2*MAX_MM)) ; expansion factor
N_EXP_s=N_EXP_s/Reduction_factor ; expansion factor
if N_EXP_s lt 0.01 then EXIT[OO]='yes'
endif
if total(vect_exp[*,OO]) gt 0 then begin
; Special prize to the dimension expressing the lowest sigma
PRIZE_ID=where(PC_SIG_B_SAVE[*,OO] eq min(PC_SIG_B_SAVE[*,OO]))
N_EXP[PRIZE_ID[0],OO]=N_EXP[PRIZE_ID[0],OO]+Prize3*N_EXP_s
endif
endif ; if OPTIMIZE_DIM eq 'yes'
; If every scaling factor for this output has collapsed to zero, the
; optimization degenerated: stop iterating and fall back to the unscaled
; (all ones) metric.
; FIX: index the WHERE result explicitly with [0] instead of comparing the
; whole index array against [-1]; the old form only behaved correctly
; because IDL truncates element-wise operations to the shorter operand.
if (where(N_EXP[*,OO] gt 0))[0] eq -1 then begin
EXIT[OO]='yes'
N_EXP[*,OO]=1
endif
endif ;if EXIT[OO] eq 'no'
; TEST TEST TEST TEST TEST TEST TEST TEST TEST
; Export the scalings for this evaluation element, normalized to the
; largest factor of this output dimension.
OUT_SCALINGS[LL,VDK,OO]=N_EXP[*,OO]/max(N_EXP[*,OO])
; TEST TEST TEST TEST TEST TEST TEST TEST TEST
OO=OO+1
ENDWHILE ; WHILE on O (output) DIMENSIONS
; print, MM,N_EXP[*,*]
WEXIT=where(EXIT eq 'no') ; outputs still to be optimized in the next pass
MM=MM+1
ENDWHILE
DISTANCES=sqrt(AI2_TOT) ; final multi-dimensional distances from data point LL (scaled metric)
;MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
; DIMENSIONS SCALING - END
;MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
;CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
; FIND BEST VALUE OF CLN
;CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
; Choose, per output dimension, how many nearest neighbours (NEW_CLN2[OO])
; to use for the estimate.
NEW_CLN2=lonarr(O_DIM)
OO=0L
WHILE OO lt O_DIM DO BEGIN ; loop on the output dimensions (closed further below)
IDX_SD_OO=SORT(DISTANCES[*,OO]) ; Indexes of sorted distances
; OPTION A: use CLN set by the user
IF TYPE_CLN eq 'fixed' then begin
NEW_CLN2[*]=CLN
ENDIF
; OPTION B: compute best CLN between MIN_CLN and CLN (set by the user):
; scan TT and keep the neighbour count giving the smallest 16-84
; percentile half-width of the output values.
IF TYPE_CLN eq 'min_max' then begin
SIGMA_REF=SIGMA_COMPARE[OO] ; starting sigma to beat
TT=MIN_CLN;+1;CLOSEST+1
NEW_CLN2[OO]=MIN_CLN
WHILE TT le CLN DO BEGIN
PSIG=percentiles(BI[M_ELE_OK_idx[IDX_SD_OO[CLOSEST:TT-1]],OO], value=[0.16,0.84])
PERCEN_SIGMA=(PSIG[1]-PSIG[0])/2.
if PERCEN_SIGMA le SIGMA_REF and PERCEN_SIGMA gt 0 and TT ge CLOSEST+1 then begin
SIGMA_REF=PERCEN_SIGMA
NEW_CLN2[OO]=TT
endif
TT=TT+1
ENDWHILE
ENDIF
;CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
; FIND BEST VALUE OF CLN -END
;CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
; xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
; CLASSIFICATION xxxxxxxxxxxxxxxxxxxxxxxxxxx
; xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
; For each possible class, estimate the probability that evaluation element
; LL belongs to it from the classes of its nearest training neighbours.
if scope eq 'classification' then begin
NPOSS=n_elements(CLAS_VECT[*,OO]) ; number of possible classes for output OO
PROB_REF=0. ; running best probability
CLAS_REF=0 ; class attaining PROB_REF
UNC_REF=0. ; its uncertainty
JJ=0L
while JJ lt NPOSS do begin ; cycle on the possible classes
; Neighbours (ranks CLOSEST..NEW_CLN2[OO]-1) belonging to class JJ:
POX1=where(BI[M_ELE_OK_idx[IDX_SD_OO[CLOSEST:NEW_CLN2[OO]-1]],OO] eq CLAS_VECT[JJ,OO])
NEL_POX1=n_elements(POX1) ; If POX1[0]=-1 counts one element!
; Balancing factor: global frequency of this class in the training set.
; NOTE(review): when the class is absent WHERE returns -1 and n_elements
; still counts 1, so bfact never reaches 0 -- confirm this is intended.
bfact= float(n_elements(where(BI[M_ELE_OK_idx[IDX_SD_OO],OO] eq CLAS_VECT[JJ,OO])))/float(n_elements(IDX_SD_OO))
; NWNWNWNWNWNWNWNWNWNWNWNWNWNWNWNW
; NOT WEIGHTED CLASSIFICATION
; NWNWNWNWNWNWNWNWNWNWNWNWNWNWNWNW
; 'mode' average: class probability = fraction of the NEW_CLN2[OO]-CLOSEST
; nearest neighbours belonging to class CLAS_VECT[JJ,OO], with a
; Poisson-like uncertainty sqrt(counts)/total.
if AVERAGE eq 'mode' then begin
CLAS_PROB[LL,JJ,OO]=float(NEL_POX1)/float(NEW_CLN2[OO]-CLOSEST)
CLAS_UNC[LL,JJ,OO]=sqrt(float(NEL_POX1))/float(NEW_CLN2[OO]-CLOSEST)
if keyword_set(BAL) then begin
; Rescale by the inverse class frequency (balancing).
CLAS_PROB[LL,JJ,OO]=(1/Bfact)*CLAS_PROB[LL,JJ,OO]
; FIX: the right-hand side read the WHOLE CLAS_UNC array (the [LL,JJ,OO]
; subscripts were missing); in IDL assigning that array to a scalar-indexed
; element writes a slab starting at that position, corrupting neighbouring
; entries. Now matches the 'weighted' branch below.
CLAS_UNC[LL,JJ,OO]=sqrt(1/Bfact)*CLAS_UNC[LL,JJ,OO]
endif
IF POX1[0] eq -1 then CLAS_PROB[LL,JJ,OO]=0. ; no neighbour of this class
IF POX1[0] eq -1 then CLAS_UNC[LL,JJ,OO]=-99. ; sentinel: undefined uncertainty
endif
; NWNWNWNWNWNWNWNWNWNWNWNWNWNWNWNW
; WEIGHTED CLASSIFICATION
; NWNWNWNWNWNWNWNWNWNWNWNWNWNWNWNW
; Gaussian weight in distance RANK: each training point of this class
; contributes exp(-0.5*(rank/(NEW_CLN2[OO]-CLOSEST))^2); the probability is
; the class's share of the total weight over ALL training points.
if AVERAGE eq 'weighted' then begin
; weight computed for ALL the data points in the training sample
POX2=1+where(BI[M_ELE_OK_idx[IDX_SD_OO[CLOSEST:n_elements(IDX_SD_OO)-1]],OO] eq CLAS_VECT[JJ,OO]) ; must start from 1, not 0!
;W1=exp(-0.5*(((findgen(M_ELE_OK-CLOSEST))/float(NEW_CLN2[OO]-CLOSEST))^2))
W1C=exp(-0.5*(((float(POX2))/float(NEW_CLN2[OO]-CLOSEST))^2)) ; weights of this class's members
W1C_all=exp(-0.5*(((1.+findgen(M_ELE_OK-CLOSEST))/float(NEW_CLN2[OO]-CLOSEST))^2)) ; weights of all members
CLAS_PROB[LL,JJ,OO]=total(W1C)/total(W1C_all)
CLAS_UNC[LL,JJ,OO]=sqrt(float(NEL_POX1))/float(NEW_CLN2[OO]-CLOSEST) ; USE POX1 NOT POX2 !
;stop
if keyword_set(BAL) then begin
;bfact=float(n_elements(POX2))/float(n_elements(IDX_SD_OO)-(1+CLOSEST)) ; Balancing factor
CLAS_PROB[LL,JJ,OO]=(1/Bfact)*CLAS_PROB[LL,JJ,OO]
CLAS_UNC[LL,JJ,OO]=sqrt(1/Bfact)*CLAS_UNC[LL,JJ,OO]
endif
IF POX2[0]-1 eq -1 then CLAS_PROB[LL,JJ,OO]=0. ; no training point of this class at all
IF POX2[0]-1 eq -1 then CLAS_UNC[LL,JJ,OO]=-99. ; sentinel: undefined uncertainty
endif
; SET REFERENCE PROBABILITY LINE
; Track the most probable class seen so far for this output dimension.
; FIX: the comparison indexed CLAS_PROB[LL,JJ] (the OO plane was missing),
; so for OO > 0 the winner was chosen on output 0's probabilities.
if CLAS_PROB[LL,JJ,OO] gt PROB_REF then begin
PROB_REF=CLAS_PROB[LL,JJ,OO]
CLAS_REF=CLAS_VECT[JJ,OO]
UNC_REF=CLAS_UNC[LL,JJ,OO]
endif
JJ=JJ+1
endwhile ; while on the possible classes
; NORMALIZATION AFTER BALANCING: renormalize the (possibly balanced)
; probabilities so they sum to 1 over the classes.
; FIX: the -99 uncertainty sentinels must be located BEFORE the division;
; the original searched for -99 AFTER dividing by NORM_P, when the
; sentinels had already become -99/NORM_P, so they were never restored
; (except in the accidental NORM_P=1 case).
val99=where(CLAS_UNC[LL,*,OO] eq -99.) ; sentinel positions, pre-normalization
NORM_P=total(CLAS_PROB[LL,*,OO])
CLAS_PROB[LL,*,OO]=CLAS_PROB[LL,*,OO]/NORM_P
CLAS_UNC[LL,*,OO]=CLAS_UNC[LL,*,OO]/NORM_P
PROB_REF=PROB_REF/NORM_P
UNC_REF=UNC_REF/NORM_P
if val99[0] ge 0 then CLAS_UNC[LL,val99,OO]=-99. ; restore -99 sentinels exactly
BO[LL,OO]=CLAS_REF ; most probable class
SO[LL,OO]=UNC_REF ;sqrt(float(NEL_POX))/n_elements(NEW_CLN2)
endif ; if scope eq 'classification'
; xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
; REGRESSION xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
; xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
; Estimate output OO for evaluation element LL from its nearest training
; neighbours, with the averaging scheme selected by AVERAGE and an
; uncertainty estimate selected by SO_TYPE.
if scope eq 'regression' then begin
BO_0=BI[M_ELE_OK_idx[IDX_SD_OO[CLOSEST:NEW_CLN2[OO]-1]],OO] ; neighbour output values
BO_IDX_OK=where(BO_0 eq BO_0) ; finite values only (NaN ne NaN is true)
; Uncertainty option 1: standard deviation over the full CLN neighbourhood
IF SO_TYPE eq 'sigma' or SO_TYPE eq 'aver' then begin
SO_1A=SIGMA(BI[M_ELE_OK_idx[IDX_SD_OO[CLOSEST:CLN-1]],OO]) ; more conservative sigma computed on the maximum possible value of CLN
SO_1=SO_1A
ENDIF
; Uncertainty option 2: larger of the 16th-50th / 50th-84th percentile gaps
IF SO_TYPE eq 'perc' or SO_TYPE eq 'aver' then begin
SO_1perc=percentiles(BI[M_ELE_OK_idx[IDX_SD_OO[CLOSEST:CLN-1]],OO],value=[0.16,0.5,0.84])
SO_1B=max([abs(SO_1perc[1]-SO_1perc[0]),abs(SO_1perc[2]-SO_1perc[1])])
;SO_1B=(SO_1perc[2]-SO_1perc[0])/2.
SO_1=SO_1B
ENDIF
; Uncertainty option 3 ('aver'): mean of the two estimates above
IF SO_TYPE eq 'aver' then begin
SO_1C=(SO_1A+SO_1B)/2.
SO_1=SO_1C
ENDIF
; Value along the unknown dimension expected for this element
; VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
; median (of the finite neighbour values when any exist)
; VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
if AVERAGE eq 'median' then begin
if BO_IDX_OK[0] ne -1 then BO_1=median(BO_0[BO_IDX_OK])
if BO_IDX_OK[0] eq -1 then BO_1=median(BO_0)
endif
; VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
; mean
; VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
if AVERAGE eq 'mean' then begin
if BO_IDX_OK[0] ne -1 then BO_2=mean(BO_0[BO_IDX_OK])
if BO_IDX_OK[0] eq -1 then BO_2=mean(BO_0)
endif
; VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
; weighted average: Gaussian weight in distance rank, over ALL usable
; training points (also used as fallback for the 'fit' option)
; VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
; weight used for the weigthed option
if AVERAGE eq 'weighted' or AVERAGE eq 'fit' then begin
;W1=exp(-0.5*(((findgen(M_ELE_OK-CLOSEST))/float(NEW_CLN2[OO]-CLOSEST))^2))
W1=exp(-0.5*(((1+findgen(M_ELE_OK-CLOSEST))/float(NEW_CLN2[OO]-CLOSEST))^2))
BO_3=total(BI[M_ELE_OK_idx[IDX_SD_OO[CLOSEST:M_ELE_OK-1]],OO]*W1)/total(W1)
endif
; VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
; N-dimensional linear fit of the closest datapoints
; (MULTIPLE LINEAR REGRESSION METHOD)
; VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
if AVERAGE eq 'fit' then begin
IDX_DIM_OK=where(N_EXP[*,OO] gt 0) ; fit only dimensions with non-zero scaling
VDK_O=VDK[IDX_DIM_OK]
; MULTIPLE LINEAR REGRESSION METHOD (n-dimensional linear fit)
BO_4=-99.
CLN_FIT=NEW_CLN2[OO]
; Guarantee more datapoints than fitted coefficients:
; if NEW_CLN2[OO]-CLOSEST le n_elements(VDK_O) then CLN_FIT=min([n_elements(VDK_O)+1,CLN])
if NEW_CLN2[OO]-CLOSEST le n_elements(VDK_O) then CLN_FIT=n_elements(VDK_O)+1
; Variables used for the fit option; the fit is performed in the SCALED
; space (coordinates multiplied by N_EXP), for both the training points
; and, below, the evaluation point.
X_REGR=dblarr(n_elements(VDK_O),CLN_FIT-CLOSEST)
Y_REGR=dblarr(CLN_FIT-CLOSEST)
PP=0L
WHILE PP lt n_elements(VDK_O) do begin
;X_REGR[PP,*]=AI_NEW2[M_ELE_OK_idx[IDX_SD_OO[CLOSEST:CLN_FIT-1]],VDK_O[PP]]
X_REGR[PP,*]=AI_NEW2[M_ELE_OK_idx[IDX_SD_OO[CLOSEST:CLN_FIT-1]],VDK_O[PP]]*N_EXP[IDX_DIM_OK[PP],OO]
PP=PP+1
ENDWHILE
Y_REGR[*]=BI[M_ELE_OK_idx[IDX_SD_OO[CLOSEST:CLN_FIT-1]],OO]
BETA_COEFF=REGRESS(X_REGR,Y_REGR,const=const_REGR,STATUS=STATUS)
;IF STATUS EQ 0 then BO_4=total(AO_NEW2[LL,VDK_O]*BETA_COEFF)+const_REGR
IF STATUS EQ 0 then BO_4=total(AO_NEW2[LL,VDK_O]*N_EXP[IDX_DIM_OK,OO]*BETA_COEFF)+const_REGR
; Fall back to the plain mean when REGRESS failed or returned NaN.
IF STATUS EQ 1 or STATUS EQ 2 or (STATUS EQ 0 and BO_4 ne BO_4) then begin
BO_4=mean(BO_0[BO_IDX_OK])
print, "n-dimensional linear Regression didn't work, returning status: ",strcompress(string(STATUS))
print, "the mean is used instead"
ENDIF
if BO_4 ne BO_4 then stop ; debugging halt: result is still NaN
; The next option uses the average when the difference
; between fit and average is greater than
;fit_thresh*sigma (sigma from the average)
if keyword_set(fit_thresh) then begin
; if abs(BO_4-mean(BO_0)) gt fit_thresh*SO_1 then BO_4=mean(BO_0)
if abs(BO_4-mean(BO_0)) gt fit_thresh*SO_1 then BO_4=BO_3 ; fall back to the weighted average
endif
; SO_1=max([SO_1,abs(BO_4-mean(BO_0))])
; VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
endif
; Store the estimate produced by the selected averaging scheme.
if AVERAGE eq 'median' then BO[LL,OO]=BO_1
if AVERAGE eq 'mean' then BO[LL,OO]=BO_2
if AVERAGE eq 'weighted' then BO[LL,OO]=BO_3
if AVERAGE eq 'fit' then BO[LL,OO]=BO_4
SO[LL,OO]=SO_1 ; associated uncertainty
endif