import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from scipy import stats


class RecursiveOptimalBin(BaseEstimator, TransformerMixin):
    '''
    Take in a 1d array and a set of binary labels (x, y)
    and return the aggregated y (mean or normalised pdf)
    in each of a set of 'optimal' bins, found by greedily
    merging adjacent bins while the log-likelihood improves
    '''

    def __init__(self, a=10, max_bins=100, method='pdf'):
        '''
        Init with 3 args:
        a : smoothing factor (pseudo-count added to each bin)
        max_bins : the largest number of bins that will be tested
        method : 'pdf' returns a normalised pdf
            - sum of y in bin / total sum of y
        'avg' returns
            - mean of y in the bin
        '''
        # 'is' tests identity, not string equality, so use membership
        assert method in ('pdf', 'avg')
self.method = method
self.max_bins = max_bins
self.a = a

    def _lnL(self, bins, x, y):
        '''
        Log likelihood from Hogg (2008), arXiv:0807.4820
        '''
        # N: summed y in each bin; e: the bin edges
        N, e, _ = stats.binned_statistic(x, statistic='sum',
                                         values=y, bins=bins)
        # bin widths
        d = np.diff(bins)
        if np.any((N + self.a) < 1):
            return np.nan
        s = np.sum(N + self.a)
        # per-bin term: N_i * ln((N_i + a - 1) / (d_i * (s - 1)))
        L = np.sum(N * np.log((N + self.a - 1) / (d * (s - 1))))
        return L
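    # quick sanity check of the formula: two unit-width bins with
    # N = [5, 5] and the default a = 10 give s = 30 and
    # L = 2 * 5 * np.log(14 / 29), i.e. roughly -7.3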

    def optimal_binning(self, x, y):
        '''
        Find the optimal binning for a data set.
        Begin with max_bins even-width bins, and keep
        merging adjacent bins so long as doing so
        increases the log-likelihood.
        '''
        # make max_bins even-width bins
        bins = list(np.histogram(x, bins=self.max_bins)[1])
# get the log L for that, set up loop variables
L = self._lnL(bins, x, y)
L_new = L
bins_new = bins
        while (L_new >= L) and (len(bins_new) > 2):
            # so long as the log-L is not decreasing, and we still
            # have more than 2 bin edges (i.e. more than one bin)
L = L_new
bins = bins_new
            Ls = []
            # try dropping each interior edge in turn, which merges
            # the two bins either side of it
            for i in range(1, len(bins)-1):
                test_bins = bins[:i] + bins[i+1:]
                Ls.append(self._lnL(test_bins, x, y))
            # keep the merge with the best log-likelihood
            L_new = np.nanmax(Ls)
            L_ind = 1 + np.nanargmax(Ls)
            bins_new = bins[:L_ind] + bins[L_ind+1:]
return bins
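    # a concrete picture of one merge step: with hypothetical edges
    # [0, 1, 2, 3, 4], the candidates tested are [0, 2, 3, 4],
    # [0, 1, 3, 4] and [0, 1, 2, 4], and the one with the highest
    # _lnL is kept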

    def fit(self, x, y):
        '''
        Find the optimal bin edges, then aggregate the
        labels in each bin and populate self.mu
        '''
self.bins = self.optimal_binning(x, y)
        # mean of y in each bin (empty bins get 0)
        avg = lambda a: np.sum(a) / len(a) if len(a) else 0
        # fraction of the total y mass in each bin
        pdf = lambda a: np.sum(a) / np.sum(y)
        if self.method == 'avg':
            agg = avg
        else:
            agg = pdf
self.mu = stats.binned_statistic(x, statistic=agg,
values=y, bins=self.bins)[0]
return self

    def transform(self, x):
        '''
        Get the bin that each x fell into and return
        the mu corresponding to that bin
        '''
locs = np.digitize(x, self.bins)
# np hist and digitize handle bins differently
# so this fudge is required (numpy/issues/9208)
        locs -= 1
        # also clamp values outside the fitted range into the
        # first/last bin
        locs = np.clip(locs, 0, len(self.mu) - 1)
return self.mu[locs]

    def fit_transform(self, x, y):
self.fit(x, y)
return self.transform(x)
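

# A minimal usage sketch, assuming synthetic data: x is standard-normal
# and y is a binary label whose rate rises with x. The parameter values
# below are illustrative choices, not defaults from the class.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    x = rng.normal(size=1000)
    # binary labels whose mean varies smoothly with x
    y = (rng.random(1000) < 1 / (1 + np.exp(-x))).astype(float)
    rob = RecursiveOptimalBin(a=10, max_bins=50, method='avg')
    rates = rob.fit_transform(x, y)
    print('number of bin edges:', len(rob.bins))
    print('binned mean y for the first 5 points:', rates[:5])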