-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbagging.go
144 lines (129 loc) · 3.48 KB
/
bagging.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
package ensemble
import (
"fmt"
"math"
"math/rand"
"rf/algo"
"rf/algo/decision"
"rf/mathelper"
"sort"
"gonum.org/v1/gonum/mat"
"gonum.org/v1/gonum/stat"
)
/*
RandomForest is a bagging ensemble built from decision trees.
*/
type RandomForest struct {
// estimators holds the trained models, in the order fit appended them.
estimators []algo.Model
// Score is read by IsFitted; no assignment to it is visible in this part of the file.
Score float64
// feMapping records, for each estimator, the column indexes of the feature
// subset it was trained on. PredictRow uses it to project a full row onto
// the columns that estimator expects.
feMapping map[algo.Model][]int
}
// Fit trains a RandomForest on m and returns it as an algo.Model.
//
// yCol is the index of the target column; -1 selects the last column of m.
// params is read for three keys: "n_estimator" (number of trees to build),
// "maxDepth" and "minSize" (both forwarded to every tree). A missing key
// reads as 0.
func Fit(m *mat.Dense, yCol int, params map[string]int) algo.Model {
	nEstimators := params["n_estimator"]
	maxDepth, minSize := params["maxDepth"], params["minSize"]
	return fit(m, yCol, nEstimators, maxDepth, minSize)
}
// fit grows nEstimators decision trees. Each tree is trained on rows drawn at
// random (with replacement) from m, restricted to a random subset of the
// feature columns plus the target column.
//
// A yCol of -1 designates the last column of m as the target. maxDepth and
// minSize are handed to decision.Fit unchanged. The feature subset drawn for
// each tree is stored in feMapping so predictions can project rows the same way.
//
// NOTE(review): the column ratio is 1 - sqrtRatio(nFeatures), i.e. the
// complement of 1/sqrt(nFeatures). Confirm this is intended rather than
// sqrtRatio(nFeatures) itself; for one or two features it selects no
// feature column at all.
func fit(m *mat.Dense, yCol int, nEstimators, maxDepth, minSize int) *RandomForest {
	if yCol == -1 {
		_, nCols := m.Dims()
		yCol = nCols - 1
	}

	features := extractFeatures(m, yCol)
	const rowRatio = 1.0
	colRatio := 1 - sqrtRatio(len(features))

	forest := &RandomForest{feMapping: make(map[algo.Model][]int)}
	for built := 0; built < nEstimators; built++ {
		picked := randomSubColumns(features, colRatio)
		// The target column is appended last; the tree is then fitted with
		// yCol = -1 (presumably "last column", as in this function).
		training := subsample(m, rowRatio, append(picked, yCol))
		tree := decision.Fit(training, -1, map[string]int{"maxDepth": maxDepth, "minSize": minSize})
		forest.estimators = append(forest.estimators, tree)
		forest.feMapping[tree] = picked
	}
	return forest
}
// Predict returns one prediction per row of m, in row order. Each entry is
// the result of PredictRow on the corresponding row.
func (rf *RandomForest) Predict(m *mat.Dense) (predictions []float64) {
	rows, _ := m.Dims()
	predictions = make([]float64, rows)
	for r := range predictions {
		predictions[r] = rf.PredictRow(m.RowView(r))
	}
	return predictions
}
// PredictRow returns the most frequent prediction across all estimators for
// a single row.
//
// Every estimator was trained on its own subset of columns, so the row is
// first projected onto that subset (looked up in feMapping) before the
// estimator is asked for its prediction.
func (rf *RandomForest) PredictRow(row mat.Vector) float64 {
	votes := make(mathelper.Row, len(rf.estimators))
	for e, estimator := range rf.estimators {
		cols := rf.feMapping[estimator]
		projected := make([]float64, 0, len(cols))
		for _, c := range cols {
			projected = append(projected, row.AtVec(c))
		}
		votes[e] = estimator.PredictRow(mathelper.Row(projected))
	}
	winner, _ := stat.Mode(votes, nil)
	return winner
}
// IsFitted reports whether the forest has been trained: it must hold at least
// one estimator and its Score must not be negative.
//
// The previous check used len(rf.estimators) >= 0, which is true for every
// slice, so a forest with no estimators was reported as fitted.
func (rf *RandomForest) IsFitted() bool {
	return len(rf.estimators) > 0 && rf.Score >= 0
}
// String renders the forest as text: for every estimator, its position, the
// feature columns it was trained on, and the estimator's own printed form,
// each on its own line.
func (rf RandomForest) String() string {
	out := ""
	for idx := 0; idx < len(rf.estimators); idx++ {
		est := rf.estimators[idx]
		out += fmt.Sprintln("Estimator #", idx) +
			fmt.Sprintln("Feature mapping : ", rf.feMapping[est]) +
			fmt.Sprintln(est)
	}
	return out
}
// extractFeatures lists, in ascending order, every column index of m except
// yCol. The result is an empty (non-nil) slice when no column qualifies.
func extractFeatures(m mat.Matrix, yCol int) []int {
	_, nCols := m.Dims()
	features := []int{}
	for col := 0; col < nCols; col++ {
		if col == yCol {
			continue
		}
		features = append(features, col)
	}
	return features
}
// subsample builds a new matrix from rows of m drawn at random with
// replacement. The number of rows drawn is the row count of m scaled by ratio
// (truncated), and only the given columns are copied, in the given order.
func subsample(m *mat.Dense, ratio float64, columns []int) (samples *mat.Dense) {
	total, _ := m.Dims()
	want := int(float64(total) * ratio)
	samples = mat.NewDense(want, len(columns), nil)
	for dst := 0; dst < want; dst++ {
		// One random draw per output row; the same source row may repeat.
		src := m.RawRowView(rand.Intn(total))
		for j := range columns {
			samples.Set(dst, j, src[columns[j]])
		}
	}
	return samples
}
// randomSubColumns picks a random subset of columns without repetition and
// returns the chosen column values sorted in ascending order.
//
// The subset size is ratio * len(columns), truncated and clamped to the range
// [0, len(columns)]. The input slice is not modified.
//
// Fixes over the previous version:
//   - every column, including the last one, can now be selected;
//   - a single-column input no longer panics in rand.Intn(0);
//   - asking for all columns no longer loops forever;
//   - the result holds the column values themselves, not their positions
//     inside the columns slice.
func randomSubColumns(columns []int, ratio float64) []int {
	n := int(ratio * float64(len(columns)))
	if n < 0 {
		n = 0
	}
	if n > len(columns) {
		n = len(columns)
	}

	cols := make([]int, n)
	// The first n entries of a random permutation are n distinct positions.
	for i, pos := range rand.Perm(len(columns))[:n] {
		cols[i] = columns[pos]
	}
	sort.Ints(cols)
	return cols
}
// sqrtRatio returns 1/sqrt(n) rounded to one decimal place.
func sqrtRatio(n int) float64 {
	root := math.Sqrt(float64(n))
	inverse := 1 / root
	// Scale up, round to the nearest integer, then scale back down to keep
	// a single decimal digit.
	tenths := math.Round(inverse * 10)
	return tenths / 10
}