
Commit 9d6b1c5

Merge pull request #220 from CamDavidsonPilon/0.10.0
0.10.0
2 parents: 3bace68 + 9ac00c8, commit 9d6b1c5

14 files changed: +291 / -150 lines

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,11 @@
 # Changelog
 
+### 0.10.0
+- `BetaGeoBetaBinomFitter.fit` has replaced `n_custs` with the more appropriately named `weights` (to align with other statistical libraries). By default and if unspecified, `weights` is equal to an array of 1s.
+- The `conditional_` methods on `BetaGeoBetaBinomFitter` have been updated to handle exogenously provided recency, frequency and periods.
+- Performance improvements in `BetaGeoBetaBinomFitter`: `fit` takes about 50% less time than previously.
+- `BetaGeoFitter`, `ParetoNBDFitter`, and `ModifiedBetaGeoFitter` all have a new `weights` argument in their `fit` methods. This can be used to reduce the size of the data (collapsing subjects with the same recency, frequency, T).
+
 ### 0.9.1
 - Added a data generation method, `generate_new_data` to `BetaGeoBetaBinomFitter`. @zscore
 - Fixed a bug in `summary_data_from_transaction_data` that was casting values to `int` prematurely. This was solved by including a new param `freq_multiplier` to be used to scale the resulting durations. See #100 for the original issue. @aprotopopov
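
For illustration, here is a minimal sketch of the renamed `fit` API described above. The argument names (`n_periods`, `weights`) come from this release; the data values are made up:

```python
# Minimal sketch of the 0.10.0 rename: `n` -> `n_periods`, `n_custs` -> `weights`.
# The numbers below are illustrative, not real data.
import pandas as pd
from lifetimes import BetaGeoBetaBinomFitter

df = pd.DataFrame({
    'frequency': [0, 1, 1],        # repeat transactions per pattern
    'recency': [0, 1, 2],          # period of most recent transaction
    'n_periods': [6, 6, 6],        # transaction opportunities observed
    'weights': [3464, 1091, 277],  # customers sharing each pattern
})

bbf = BetaGeoBetaBinomFitter()
# `weights` may be omitted, in which case it defaults to an array of 1s.
bbf.fit(df['frequency'], df['recency'], df['n_periods'], df['weights'])
print(bbf.params_)  # OrderedDict of alpha, beta, gamma, delta
```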

docs/conf.py

Lines changed: 2 additions & 2 deletions
@@ -77,9 +77,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.8.0.0'
+version = '0.10.0.0'
 # The full version, including alpha/beta/rc tags.
-release = '0.8.0.0'
+release = '0.10.0.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

lifetimes/datasets/donations.csv

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-frequency,recency,n,n_custs
+frequency,recency,periods,weights
 0,0,6,3464
 1,1,6,1091
 1,2,6,277
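
Given the renamed columns, a fitting sketch against this dataset looks like the following; it assumes the bundled `load_donations` helper in `lifetimes.datasets` returns this CSV as a DataFrame:

```python
# Sketch: fit the BG/BB model on the renamed donations columns.
# Assumes lifetimes.datasets.load_donations() returns this CSV as a DataFrame.
from lifetimes import BetaGeoBetaBinomFitter
from lifetimes.datasets import load_donations

donations = load_donations()  # columns: frequency, recency, periods, weights

bbf = BetaGeoBetaBinomFitter()
bbf.fit(donations['frequency'], donations['recency'],
        donations['periods'], donations['weights'])
```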

lifetimes/fitters/beta_geo_beta_binom_fitter.py

Lines changed: 59 additions & 59 deletions
@@ -7,7 +7,7 @@
 import pandas as pd
 from numpy import log, exp, logaddexp, asarray, c_ as vconcat
 from pandas import DataFrame
-from scipy.special import gammaln, betaln, binom
+from scipy.special import gammaln, betaln, binom, beta as betaf
 
 from ..utils import _fit, _check_inputs
 from . import BaseFitter
@@ -56,39 +56,37 @@ def _loglikelihood(params, x, tx, T):
         """Loglikelihood for optimizer."""
         alpha, beta, gamma, delta = params
 
-        beta_ab = betaln(alpha, beta)
-        beta_gd = betaln(gamma, delta)
-
-        indiv_loglike = (betaln(alpha + x, beta + T - x) - beta_ab +
-                         betaln(gamma, delta + T) - beta_gd)
-
+        betaln_ab = betaln(alpha, beta)
+        betaln_gd = betaln(gamma, delta)
         recency_T = T - tx - 1
 
+        A = (betaln(alpha + x, beta + T - x) - betaln_ab +
+             betaln(gamma, delta + T) - betaln_gd)
+
         J = np.arange(recency_T.max() + 1)
 
-        @np.vectorize
-        def _sum(x, tx, recency_T):
+        def _sum_(x, tx, recency_T):
             if recency_T <= -1:
-                return -np.inf
+                return 10e-10
+            elif recency_T == 0:
+                return betaf(alpha + x, beta + tx - x) * betaf(gamma + 1, delta + tx)
+            else:
+                j = J[:recency_T + 1]
+                return (betaf(alpha + x, beta + tx - x + j) * betaf(gamma + 1, delta + tx + j)).sum()
 
-            j = J[:int(recency_T) + 1]
-            return log(
-                np.sum(exp(betaln(alpha + x, beta + tx - x + j) - beta_ab +
-                           betaln(gamma + 1, delta + tx + j) - beta_gd)))
+        sum_ = np.vectorize(_sum_, [np.float])
 
-        s = _sum(x, tx, recency_T)
-        indiv_loglike = logaddexp(indiv_loglike, s)
-
-        return indiv_loglike
+        B = log(sum_(x, tx, recency_T)) - betaln_gd - betaln_ab
+        return logaddexp(A, B)
 
     @staticmethod
-    def _negative_log_likelihood(params, frequency, recency, n, n_custs,
+    def _negative_log_likelihood(params, frequency, recency, n_periods, weights,
                                  penalizer_coef=0):
         penalizer_term = penalizer_coef * sum(np.asarray(params) ** 2)
         return -np.mean(BetaGeoBetaBinomFitter._loglikelihood(
-            params, frequency, recency, n) * n_custs) + penalizer_term
+            params, frequency, recency, n_periods) * weights) + penalizer_term
 
-    def fit(self, frequency, recency, n, n_custs, verbose=False,
+    def fit(self, frequency, recency, n_periods, weights=None, verbose=False,
             tol=1e-4, iterative_fitting=1, index=None,
             fit_method='Nelder-Mead', maxiter=2000, initial_params=None,
             **kwargs):
@@ -101,17 +99,18 @@ def fit(self, frequency, recency, n, n_custs, verbose=False,
             Total periods with observed transactions
         recency: array_like
             Period of most recent transaction
-        n: array_like
-            Number of transaction opportunities.
-        n_custs: array_like
-            Number of customers with given frequency/recency/T. Fader
-            and Hardie condense the individual RFM matrix into all
+        n_periods: array_like
+            Number of transaction opportunities. Previously called `n`.
+        weights: None or array_like
+            Number of customers with given frequency/recency/T,
+            defaults to 1 if not specified. Fader and
+            Hardie condense the individual RFM matrix into all
             observed combinations of frequency/recency/T. This
             parameter represents the count of customers with a given
             purchase pattern. Instead of calculating individual
             loglikelihood, the loglikelihood is calculated for each
             pattern and multiplied by the number of customers with
-            that pattern.
+            that pattern. Previously called `n_custs`.
         verbose: boolean, optional
             Set to true to print out convergence diagnostics.
         tol: float, optional
@@ -137,15 +136,20 @@ def fit(self, frequency, recency, n, n_custs, verbose=False,
             fitted and with parameters estimated
 
         """
-        frequency = asarray(frequency)
-        recency = asarray(recency)
-        n = asarray(n)
-        n_custs = asarray(n_custs)
-        _check_inputs(frequency, recency, n)
+        frequency = asarray(frequency).astype(int)
+        recency = asarray(recency).astype(int)
+        n_periods = asarray(n_periods).astype(int)
+
+        if weights is None:
+            weights = np.ones_like(recency, dtype=np.int64)
+        else:
+            weights = asarray(weights)
+
+        _check_inputs(frequency, recency, n_periods)
 
         params, self._negative_log_likelihood_ = _fit(
             self._negative_log_likelihood,
-            [frequency, recency, n, n_custs, self.penalizer_coef],
+            [frequency, recency, n_periods, weights, self.penalizer_coef],
             iterative_fitting,
             initial_params,
             4,
@@ -156,44 +160,43 @@ def fit(self, frequency, recency, n, n_custs, verbose=False,
             **kwargs)
         self.params_ = OrderedDict(zip(['alpha', 'beta', 'gamma', 'delta'],
                                        params))
-        self.data = DataFrame(vconcat[frequency, recency, n, n_custs],
-                              columns=['frequency', 'recency', 'n', 'n_custs'])
+        self.data = DataFrame(vconcat[frequency, recency, n_periods, weights],
+                              columns=['frequency', 'recency', 'n_periods', 'weights'])
         if index is not None:
             self.data.index = index
-        # Making a large array replicating n by n_custs having n.
-        n_exploded = []
-        for n_, n_cust in zip(n, n_custs):
-            n_exploded += [n_] * n_cust
+
         self.generate_new_data = lambda size=1: beta_geometric_beta_binom_model(
-            np.array(n_exploded), *self._unload_params('alpha', 'beta', 'gamma', 'delta'), size=size)
+            # Making a large array replicating n by n_custs having n.
+            np.array(sum([n_] * n_cust for (n_, n_cust) in zip(n_periods, weights))),
+            *self._unload_params('alpha', 'beta', 'gamma', 'delta'), size=size)
         return self
 
-    def conditional_expected_number_of_purchases_up_to_time(self, t):
+    def conditional_expected_number_of_purchases_up_to_time(self, m_periods_in_future, frequency, recency, n_periods):
         """
         Conditional expected purchases in future time period.
 
-        The expected number of future transactions across the next t
+        The expected number of future transactions across the next m_periods_in_future
         transaction opportunities by a customer with purchase history
         (x, tx, n).
 
-        .. math:: E(X(n, n+n*)|alpha, beta, gamma, delta, frequency, recency, n)
+        .. math:: E(X(n_periods, n_periods+m_periods_in_future)|alpha, beta, gamma, delta, frequency, recency, n_periods)
 
         See (13) in Fader & Hardie 2010.
 
         Parameters
         ----------
         t: array_like
-            time periods (n+t)
+            time n_periods (n+t)
 
         Returns
         -------
         array_like
             predicted transactions
 
         """
-        x = self.data['frequency']
-        tx = self.data['recency']
-        n = self.data['n']
+        x = frequency
+        tx = recency
+        n = n_periods
 
         params = self._unload_params('alpha', 'beta', 'gamma', 'delta')
         alpha, beta, gamma, delta = params
@@ -203,18 +206,18 @@ def conditional_expected_number_of_purchases_up_to_time(self, t):
         p3 = delta / (gamma - 1) * exp(gammaln(gamma + delta) -
                                        gammaln(1 + delta))
         p4 = exp(gammaln(1 + delta + n) - gammaln(gamma + delta + n))
-        p5 = exp(gammaln(1 + delta + n + t) - gammaln(gamma + delta + n + t))
+        p5 = exp(gammaln(1 + delta + n + m_periods_in_future) - gammaln(gamma + delta + n + m_periods_in_future))
 
         return p1 * p2 * p3 * (p4 - p5)
 
-    def conditional_probability_alive(self, m):
+    def conditional_probability_alive(self, m_periods_in_future, frequency, recency, n_periods):
         """
         Conditional probability alive.
 
         Conditional probability customer is alive at transaction opportunity
-        n + m.
+        n_periods + m_periods_in_future.
 
-        .. math:: P(alive at n + m|alpha, beta, gamma, delta, frequency, recency, n)
+        .. math:: P(alive at n_periods + m_periods_in_future|alpha, beta, gamma, delta, frequency, recency, n_periods)
 
         See (A10) in Fader and Hardie 2010.
 
@@ -232,19 +235,16 @@ def conditional_probability_alive(self, m):
         params = self._unload_params('alpha', 'beta', 'gamma', 'delta')
         alpha, beta, gamma, delta = params
 
-        x = self.data['frequency']
-        tx = self.data['recency']
-        n = self.data['n']
 
-        p1 = betaln(alpha + x, beta + n - x) - betaln(alpha, beta)
-        p2 = betaln(gamma, delta + n + m) - betaln(gamma, delta)
-        p3 = self._loglikelihood(params, x, tx, n)
+        p1 = betaln(alpha + frequency, beta + n_periods - frequency) - betaln(alpha, beta)
+        p2 = betaln(gamma, delta + n_periods + m_periods_in_future) - betaln(gamma, delta)
+        p3 = self._loglikelihood(params, frequency, recency, n_periods)
 
         return exp(p1 + p2) / exp(p3)
 
     def expected_number_of_transactions_in_first_n_periods(self, n):
         """
-        Return expected number of transactions in first n periods.
+        Return expected number of transactions in first n n_periods.
 
         Expected number of transactions occurring across first n transaction
         opportunities.
@@ -268,7 +268,7 @@ def expected_number_of_transactions_in_first_n_periods(self, n):
         params = self._unload_params('alpha', 'beta', 'gamma', 'delta')
         alpha, beta, gamma, delta = params
 
-        x_counts = self.data.groupby('frequency')['n_custs'].sum()
+        x_counts = self.data.groupby('frequency')['weights'].sum()
         x = asarray(x_counts.index)
 
         p1 = binom(n, x) * exp(betaln(alpha + x, beta + n - x) -
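
Because the `conditional_` methods now take `frequency`, `recency`, and `n_periods` as arguments instead of reading them from `self.data`, a fitted model can score customers that never entered the fit. A usage sketch, with a fitted `bbf` as in the earlier sketches and illustrative customer values:

```python
# Sketch: score exogenous customers with the updated conditional_ methods.
# `bbf` is a fitted BetaGeoBetaBinomFitter; the values below are illustrative.
import numpy as np

frequency = np.array([2])   # repeat transactions observed
recency = np.array([4])     # period of most recent transaction
n_periods = np.array([6])   # transaction opportunities observed so far

# Expected purchases over the next 3 transaction opportunities (eq. 13).
expected = bbf.conditional_expected_number_of_purchases_up_to_time(
    3, frequency, recency, n_periods)

# Probability the customer is still alive 3 opportunities from now (eq. A10).
p_alive = bbf.conditional_probability_alive(3, frequency, recency, n_periods)
```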

lifetimes/fitters/beta_geo_fitter.py

Lines changed: 22 additions & 6 deletions
@@ -55,7 +55,7 @@ def __init__(self, penalizer_coef=0.0):
         """Initialization, set penalizer_coef."""
         self.penalizer_coef = penalizer_coef
 
-    def fit(self, frequency, recency, T, iterative_fitting=1,
+    def fit(self, frequency, recency, T, weights=None, iterative_fitting=1,
             initial_params=None, verbose=False, tol=1e-4, index=None,
             fit_method='Nelder-Mead', maxiter=2000, **kwargs):
         """
@@ -71,6 +71,16 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
             (denoted t_x in literature).
         T: array_like
             customers' age (time units since first purchase)
+        weights: None or array_like
+            Number of customers with given frequency/recency/T,
+            defaults to 1 if not specified. Fader and
+            Hardie condense the individual RFM matrix into all
+            observed combinations of frequency/recency/T. This
+            parameter represents the count of customers with a given
+            purchase pattern. Instead of calculating individual
+            loglikelihood, the loglikelihood is calculated for each
+            pattern and multiplied by the number of customers with
+            that pattern.
         iterative_fitting: int, optional
             perform iterative_fitting fits over random/warm-started initial params
         initial_params: array_like, optional
@@ -97,18 +107,24 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
             with additional properties like params_ and methods like predict
 
         """
-        frequency = asarray(frequency)
+        frequency = asarray(frequency).astype(int)
         recency = asarray(recency)
         T = asarray(T)
         _check_inputs(frequency, recency, T)
 
+        if weights is None:
+            weights = np.ones_like(recency, dtype=np.int64)
+        else:
+            weights = asarray(weights)
+
+
         self._scale = _scale_time(T)
         scaled_recency = recency * self._scale
         scaled_T = T * self._scale
 
         params, self._negative_log_likelihood_ = _fit(
             self._negative_log_likelihood,
-            [frequency, scaled_recency, scaled_T, self.penalizer_coef],
+            [frequency, scaled_recency, scaled_T, weights, self.penalizer_coef],
             iterative_fitting,
             initial_params,
             4,
@@ -132,7 +148,7 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
         return self
 
     @staticmethod
-    def _negative_log_likelihood(params, freq, rec, T, penalizer_coef):
+    def _negative_log_likelihood(params, freq, rec, T, weights, penalizer_coef):
         if npany(asarray(params) <= 0):
             return np.inf
 
@@ -148,8 +164,8 @@ def _negative_log_likelihood(params, freq, rec, T, penalizer_coef):
             (r + freq) * log(rec + alpha)
         A_4[isnan(A_4) | isinf(A_4)] = 0
         penalizer_term = penalizer_coef * sum(np.asarray(params) ** 2)
-        return -(A_1 + A_2 + misc.logsumexp(
-            vconcat[A_3, A_4], axis=1, b=d)).mean() + penalizer_term
+        return - (weights * (A_1 + A_2 + misc.logsumexp(vconcat[A_3, A_4], axis=1, b=d))).mean() \
+            + penalizer_term
 
     def conditional_expected_number_of_purchases_up_to_time(self, t, frequency,
                                                             recency, T):
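
The new `weights` argument enables the data reduction mentioned in the changelog: rows sharing a (frequency, recency, T) pattern collapse into a single weighted row. A sketch, assuming `summary` is the output of `lifetimes.utils.summary_data_from_transaction_data`:

```python
# Sketch: collapse identical (frequency, recency, T) rows into counts and
# fit on the compressed table. `summary` is assumed to come from
# summary_data_from_transaction_data.
from lifetimes import BetaGeoFitter

grouped = (summary.groupby(['frequency', 'recency', 'T'])
                  .size()
                  .reset_index(name='weights'))

bgf = BetaGeoFitter()
bgf.fit(grouped['frequency'], grouped['recency'], grouped['T'],
        weights=grouped['weights'])
```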

lifetimes/fitters/modified_beta_geo_fitter.py

Lines changed: 14 additions & 3 deletions
@@ -37,7 +37,7 @@ def __init__(self, penalizer_coef=0.0):
         """Initialization, set penalizer_coef."""
         super(self.__class__, self).__init__(penalizer_coef)
 
-    def fit(self, frequency, recency, T, iterative_fitting=1,
+    def fit(self, frequency, recency, T, weights=None, iterative_fitting=1,
             initial_params=None, verbose=False, tol=1e-4, index=None,
             fit_method='Nelder-Mead', maxiter=2000, **kwargs):
         """
@@ -53,6 +53,16 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
             (denoted t_x in literature).
         T: array_like
             customers' age (time units since first purchase)
+        weights: None or array_like
+            Number of customers with given frequency/recency/T,
+            defaults to 1 if not specified. Fader and
+            Hardie condense the individual RFM matrix into all
+            observed combinations of frequency/recency/T. This
+            parameter represents the count of customers with a given
+            purchase pattern. Instead of calculating individual
+            loglikelihood, the loglikelihood is calculated for each
+            pattern and multiplied by the number of customers with
+            that pattern.
         iterative_fitting: int, optional
             perform iterative_fitting fits over random/warm-started initial params
         initial_params: array_like, optional
@@ -83,6 +93,7 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
         super(self.__class__, self).fit(frequency,
                                         recency,
                                         T,
+                                        weights,
                                         iterative_fitting,
                                         initial_params,
                                         verbose,
@@ -99,7 +110,7 @@ def fit(self, frequency, recency, T, iterative_fitting=1,
         return self
 
     @staticmethod
-    def _negative_log_likelihood(params, freq, rec, T, penalizer_coef):
+    def _negative_log_likelihood(params, freq, rec, T, weights, penalizer_coef):
         if npany(asarray(params) <= 0):
             return np.inf
 
@@ -113,7 +124,7 @@ def _negative_log_likelihood(params, freq, rec, T, penalizer_coef):
             log(alpha + rec))
 
         penalizer_term = penalizer_coef * sum(np.asarray(params) ** 2)
-        return -(A_1 + A_2 + A_3 + logaddexp(A_4, 0)).mean() + penalizer_term
+        return -(weights * (A_1 + A_2 + A_3 + logaddexp(A_4, 0))).mean() + penalizer_term
 
     def expected_number_of_purchases_up_to_time(self, t):
         """
