From 65144a36d59be09b0a632eef45515a5875f928da Mon Sep 17 00:00:00 2001
From: Cam Davidson-Pilon
Date: Mon, 7 Jan 2019 10:24:58 -0500
Subject: [PATCH 1/4] some performance increases, cut new version too

---
 .travis.yml                            |  8 +++-
 CHANGELOG.md                           |  7 ++++
 docs/conf.py                           |  4 +-
 lifetimes/fitters/pareto_nbd_fitter.py |  7 ++--
 lifetimes/generate_data.py             | 54 +++++++++++++-------------
 lifetimes/version.py                   |  2 +-
 requirements.txt                       |  2 +-
 setup.py                               |  4 +-
 8 files changed, 49 insertions(+), 39 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 72a4560d..93b49785 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,10 +2,14 @@ language: python
 dist: trusty
 python:
   - "2.7"
-  - "3.4"
  - "3.5"
   - "3.6"
-before_install:
+# Enable newer 3.7 without globally enabling sudo and dist: xenial for other build jobs
+matrix:
+  include:
+    - python: 3.7
+      dist: xenial
+      sudo: truebefore_install:
   - sudo apt-get update
 install:
   - "pip install -r dev_requirements.txt"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index fd58ad0b..ba1090f2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog
 
+### 0.10.1
+ - performance improvements to `generate_data.py` for large datasets #195
+ - performance improvements to `summary_data_from_transaction_data`, thanks @MichaelSchreier
+ - Previously, `GammaGammaFitter` would have an infinite mean when its `q` parameter was less than 1, which was possible for some datasets. 0.10.1 adds a new argument to `GammaGammaFitter` to constrain `q` to be greater than 1; pass `q_constraint=True` in the call to `GammaGammaFitter.fit`. See issue #146. Thanks @vruvora
+ - Stop support of scipy < 1.0.
+ - Stop support of Python 3 versions below 3.5.
+
 ### 0.10.0
  - `BetaGeoBetaBinomFitter.fit` has replaced `n_custs` with the more appropriately named `weights` (to align with other statistical libraries). By default and if unspecified, `weights` is equal to an array of 1s.
  - The `conditional_` methods on `BetaGeoBetaBinomFitter` have been updated to handle exogenously provided recency, frequency and periods.
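The `q_constraint` entry above is the release's one user-facing API change. A minimal usage sketch of the call the changelog describes — the five-customer summary frame is synthetic and invented for illustration; real fits run on much larger data:

```python
import pandas as pd

from lifetimes import GammaGammaFitter

# Synthetic summary of returning customers; values are illustrative only.
summary = pd.DataFrame({
    "frequency": [2, 5, 3, 8, 4],                      # repeat purchase counts
    "monetary_value": [25.0, 40.5, 33.2, 52.1, 29.9],  # average order values
})

ggf = GammaGammaFitter(penalizer_coef=0.0)
# q_constraint=True (new in 0.10.1) constrains the fit to q > 1, so the
# fitted spend distribution is guaranteed a finite mean (see issue #146).
ggf.fit(summary["frequency"], summary["monetary_value"], q_constraint=True)
print(ggf.params_)
```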
diff --git a/docs/conf.py b/docs/conf.py
index 56a394fd..64962b51 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -77,9 +77,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.10.0.0'
+version = '0.10.1'
 # The full version, including alpha/beta/rc tags.
-release = '0.10.0.0'
+release = version
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/lifetimes/fitters/pareto_nbd_fitter.py b/lifetimes/fitters/pareto_nbd_fitter.py
index fb2a92ee..8dc1cec2 100644
--- a/lifetimes/fitters/pareto_nbd_fitter.py
+++ b/lifetimes/fitters/pareto_nbd_fitter.py
@@ -8,6 +8,7 @@
 from pandas import DataFrame
 from scipy.special import gammaln, hyp2f1, betaln
 from scipy import misc
+from scipy.special import logsumexp
 from . import BaseFitter
 from ..utils import _fit, _check_inputs, _scale_time
@@ -162,7 +163,7 @@ def _log_A_0(params, freq, recency, age):
         except TypeError:
             sign = 1
 
-        return (misc.logsumexp([log(p_1) + rsf * log(q_2), log(p_2) +
+        return (logsumexp([log(p_1) + rsf * log(q_2), log(p_2) +
                 rsf * log(q_1)], axis=0, b=[sign, -sign]) -
                 rsf * log(q_1 * q_2))
@@ -376,7 +377,7 @@ def _log_B_three(i):
         zeroth_term = (n == 0) * (1 - exp(log_p_zero))
         first_term = n * log(t) - gammaln(n + 1) + log_B_one - log_l
         second_term = log_B_two - log_l
-        third_term = misc.logsumexp(
+        third_term = logsumexp(
             [i * log(t) - gammaln(i + 1) + _log_B_three(i) - log_l
              for i in range(n + 1)], axis=0
         )
@@ -389,7 +390,7 @@ def _log_B_three(i):
         # In some scenarios (e.g. large n) tiny numerical errors in the calculation of second_term and third_term
         # cause sumexp to be ever so slightly negative and logsumexp throws an error. Hence we ignore the sign here.
-        return zeroth_term + exp(misc.logsumexp(
+        return zeroth_term + exp(logsumexp(
             [first_term, second_term, third_term],
             b=[sign, sign, -sign],
             axis=0,
             return_sign=True
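The `misc.logsumexp` → `logsumexp` swap above follows SciPy itself, which moved `logsumexp` into `scipy.special` and deprecated the `scipy.misc` location — hence the `scipy>=1.0.0` pin later in this patch. For code that still has to import across both old and new SciPy, a common compatibility idiom (not part of this patch) looks like:

```python
import numpy as np

# Prefer the scipy.special location required by the new scipy>=1.0.0 pin;
# fall back to scipy.misc only on older SciPy versions.
try:
    from scipy.special import logsumexp
except ImportError:
    from scipy.misc import logsumexp

# logsumexp evaluates log(sum(exp(a))) without leaving log space, so terms
# whose exp() underflows float64 on their own are still summed correctly.
print(logsumexp(np.array([-1000.0, -1001.0])))  # ~ -999.69; naive exp() gives -inf
```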
diff --git a/lifetimes/generate_data.py b/lifetimes/generate_data.py
index 1f357871..e2cf4642 100644
--- a/lifetimes/generate_data.py
+++ b/lifetimes/generate_data.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from scipy import stats
+from numpy import random
 import pandas as pd
 
@@ -36,8 +36,8 @@ def beta_geometric_nbd_model(T, r, alpha, a, b, size=1):
     else:
         T = np.asarray(T)
 
-    probability_of_post_purchase_death = stats.beta.rvs(a, b, size=size)
-    lambda_ = stats.gamma.rvs(r, scale=1. / alpha, size=size)
+    probability_of_post_purchase_death = random.beta(a, b, size=size)
+    lambda_ = random.gamma(r, scale=1. / alpha, size=size)
 
     columns = ['frequency', 'recency', 'T', 'lambda', 'p', 'alive', 'customer_id']
     df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns)
@@ -48,12 +48,12 @@
 
         # hacky until I can find something better
         times = []
-        next_purchase_in = stats.expon.rvs(scale=1. / l)
+        next_purchase_in = random.exponential(scale=1. / l)
         alive = True
         while (np.sum(times) + next_purchase_in < T[i]) and alive:
             times.append(next_purchase_in)
-            next_purchase_in = stats.expon.rvs(scale=1. / l)
-            alive = np.random.random() > p
+            next_purchase_in = random.exponential(scale=1. / l)
+            alive = random.random() > p
 
         times = np.array(times).cumsum()
         df.iloc[i] = np.unique(np.array(times).astype(int)).shape[0], np.max(times if times.shape[0] > 0 else 0), T[i], l, p, alive, i
@@ -102,8 +102,8 @@ def beta_geometric_nbd_model_transactional_data(T, r, alpha, a, b, observation_p
         start_date = [observation_period_end - pd.Timedelta(T[i] - 1, unit=freq) for i in range(size)]
         T = np.asarray(T)
 
-    probability_of_post_purchase_death = stats.beta.rvs(a, b, size=size)
-    lambda_ = stats.gamma.rvs(r, scale=1. / alpha, size=size)
+    probability_of_post_purchase_death = random.beta(a, b, size=size)
+    lambda_ = random.gamma(r, scale=1. / alpha, size=size)
 
     columns = ['customer_id', 'date']
     df = pd.DataFrame(columns=columns)
@@ -115,13 +115,13 @@
         age = T[i]
 
         purchases = [[i, s - pd.Timedelta(1, unit=freq)]]
-        next_purchase_in = stats.expon.rvs(scale=1. / l)
+        next_purchase_in = random.exponential(scale=1. / l)
         alive = True
 
         while next_purchase_in < age and alive:
             purchases.append([i, s + pd.Timedelta(next_purchase_in, unit=freq)])
-            next_purchase_in += stats.expon.rvs(scale=1. / l)
-            alive = np.random.random() > p
+            next_purchase_in += random.exponential(scale=1. / l)
+            alive = random.random() > p
 
         df = df.append(pd.DataFrame(purchases, columns=columns))
@@ -160,8 +160,8 @@ def pareto_nbd_model(T, r, alpha, s, beta, size=1):
     else:
         T = np.asarray(T)
 
-    lambda_ = stats.gamma.rvs(r, scale=1. / alpha, size=size)
-    mus = stats.gamma.rvs(s, scale=1. / beta, size=size)
+    lambda_ = random.gamma(r, scale=1. / alpha, size=size)
+    mus = random.gamma(s, scale=1. / beta, size=size)
 
     columns = ['frequency', 'recency', 'T', 'lambda', 'mu', 'alive', 'customer_id']
     df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns)
@@ -169,14 +169,14 @@
     for i in range(size):
         l = lambda_[i]
         mu = mus[i]
-        time_of_death = stats.expon.rvs(scale=1. / mu)
+        time_of_death = random.exponential(scale=1. / mu)
 
         # hacky until I can find something better
         times = []
-        next_purchase_in = stats.expon.rvs(scale=1. / l)
+        next_purchase_in = random.exponential(scale=1. / l)
         while np.sum(times) + next_purchase_in < min(time_of_death, T[i]):
             times.append(next_purchase_in)
-            next_purchase_in = stats.expon.rvs(scale=1. / l)
+            next_purchase_in = random.exponential(scale=1. / l)
 
         times = np.array(times).cumsum()
         df.iloc[i] = np.unique(np.array(times).astype(int)).shape[0], np.max(times if times.shape[0] > 0 else 0), T[i], l, mu, time_of_death > T[i], i
@@ -219,8 +219,8 @@ def modified_beta_geometric_nbd_model(T, r, alpha, a, b, size=1):
     else:
         T = np.asarray(T)
 
-    probability_of_post_purchase_death = stats.beta.rvs(a, b, size=size)
-    lambda_ = stats.gamma.rvs(r, scale=1. / alpha, size=size)
+    probability_of_post_purchase_death = random.beta(a, b, size=size)
+    lambda_ = random.gamma(r, scale=1. / alpha, size=size)
 
     columns = ['frequency', 'recency', 'T', 'lambda', 'p', 'alive', 'customer_id']
     df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns)
@@ -231,12 +231,12 @@
 
         # hacky until I can find something better
         times = []
-        next_purchase_in = stats.expon.rvs(scale=1. / l)
-        alive = np.random.random() > p  # essentially the difference between this model and BG/NBD
+        next_purchase_in = random.exponential(scale=1. / l)
+        alive = random.random() > p  # essentially the difference between this model and BG/NBD
         while (np.sum(times) + next_purchase_in < T[i]) and alive:
             times.append(next_purchase_in)
-            next_purchase_in = stats.expon.rvs(scale=1. / l)
-            alive = np.random.random() > p
+            next_purchase_in = random.exponential(scale=1. / l)
+            alive = random.random() > p
 
         times = np.array(times).cumsum()
         df.iloc[i] = np.unique(np.array(times).astype(int)).shape[0], np.max(times if times.shape[0] > 0 else 0), T[i], l, p, alive, i
@@ -282,8 +282,8 @@ def beta_geometric_beta_binom_model(N, alpha, beta, gamma, delta, size=1):
     else:
         N = np.asarray(N)
 
-    probability_of_post_purchase_death = np.random.beta(a=alpha, b=beta, size=size)
-    thetas = np.random.beta(a=gamma, b=delta, size=size)
+    probability_of_post_purchase_death = random.beta(a=alpha, b=beta, size=size)
+    thetas = random.beta(a=gamma, b=delta, size=size)
 
     columns = ['frequency', 'recency', 'n_periods', 'p', 'theta', 'alive', 'customer_id']
     df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns)
@@ -296,12 +296,12 @@
         alive = True
         times = []
         while current_t < N[i] and alive:
-            alive = np.random.binomial(1, theta) == 0
-            if alive and np.random.binomial(1, p) == 1:
+            alive = random.binomial(1, theta) == 0
+            if alive and random.binomial(1, p) == 1:
                 times.append(current_t)
             current_t += 1
         # adding in final death opportunity to agree with [1]
         if alive:
-            alive = np.random.binomial(1, theta) == 0
+            alive = random.binomial(1, theta) == 0
         df.iloc[i] = len(times), times[-1] + 1 if len(times) != 0 else 0, N[i], p, theta, alive, i
 
     return df
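A note on why the swap above is a performance win: each `scipy.stats.<dist>.rvs` call pays SciPy's per-call distribution overhead (argument validation, shape broadcasting), which dominates when the simulators draw one variate at a time inside per-customer loops, while the `numpy.random` equivalents (`beta`, `gamma(shape, scale=...)`, `exponential(scale=...)`, `binomial`) use the same parameterizations and go straight to the sampler. A rough micro-benchmark sketch — absolute timings are machine-dependent; only the relative gap matters:

```python
import timeit

n = 10000

# One scipy.stats draw per iteration, mirroring the old per-customer loop.
scipy_time = timeit.timeit(
    "stats.expon.rvs(scale=2.0)", setup="from scipy import stats", number=n
)
# The same draws through numpy.random, mirroring the new code.
numpy_time = timeit.timeit(
    "random.exponential(scale=2.0)", setup="from numpy import random", number=n
)

print("scipy.stats per-draw loop :", scipy_time)
print("numpy.random per-draw loop:", numpy_time)  # typically ~10x faster
```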
diff --git a/lifetimes/version.py b/lifetimes/version.py
index 787f75fb..ba3160e9 100644
--- a/lifetimes/version.py
+++ b/lifetimes/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '0.10.0.0'
+__version__ = '0.10.1'
diff --git a/requirements.txt b/requirements.txt
index 5d726e30..e81d94ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 numpy
-scipy
+scipy>=1.0.0
 pandas>=0.19
 dill
diff --git a/setup.py b/setup.py
index 19698540..47bef8dc 100644
--- a/setup.py
+++ b/setup.py
@@ -31,14 +31,12 @@
         "License :: OSI Approved :: MIT License",
         "Programming Language :: Python",
         "Programming Language :: Python :: 2.7",
-        "Programming Language :: Python :: 3.3",
-        "Programming Language :: Python :: 3.4",
         "Programming Language :: Python :: 3.5",
         "Topic :: Scientific/Engineering",
     ],
     install_requires=[
         "numpy",
-        "scipy",
+        "scipy>=1.0.0",
         "pandas>=0.19",
         "dill"
     ],

From 24830c68296b6a8e3dfcc302a0f5454df13f2bcb Mon Sep 17 00:00:00 2001
From: Cam Davidson-Pilon
Date: Mon, 7 Jan 2019 10:29:36 -0500
Subject: [PATCH 2/4] pin requirements

---
 requirements.txt | 4 ++--
 setup.py         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index e81d94ef..584948d4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-numpy
+numpy>1.10.0
 scipy>=1.0.0
 pandas>=0.19
-dill
+dill>=0.2.6
diff --git a/setup.py b/setup.py
index 47bef8dc..feb59da7 100644
--- a/setup.py
+++ b/setup.py
@@ -35,10 +35,10 @@
         "Topic :: Scientific/Engineering",
     ],
     install_requires=[
-        "numpy",
+        "numpy>=1.10.0",
         "scipy>=1.0.0",
         "pandas>=0.19",
-        "dill"
+        "dill>=0.2.6"
     ],
     package_data={
         "lifetimes": [

From 9a0c416e183093008c12deece023ae825c8ab318 Mon Sep 17 00:00:00 2001
From: Cam Davidson-Pilon
Date: Mon, 7 Jan 2019 10:38:45 -0500
Subject: [PATCH 3/4] fix travis yml

---
 .travis.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 93b49785..b6ec08fe 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,7 +9,8 @@ matrix:
   include:
     - python: 3.7
       dist: xenial
-      sudo: truebefore_install:
+      sudo: true
+before_install:
   - sudo apt-get update
 install:
   - "pip install -r dev_requirements.txt"
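A quick way to confirm an environment against the pins introduced above, using setuptools' `pkg_resources`; this check script is illustrative, not part of the patch:

```python
import pkg_resources

# Raises DistributionNotFound or VersionConflict when a pin is not satisfied.
for pin in ["numpy>1.10.0", "scipy>=1.0.0", "pandas>=0.19", "dill>=0.2.6"]:
    pkg_resources.require(pin)
    print(pin, "OK")
```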
From f74e1955fb5fde360c5b22940f6fd98db36acc0b Mon Sep 17 00:00:00 2001
From: Cam Davidson-Pilon
Date: Mon, 7 Jan 2019 10:47:30 -0500
Subject: [PATCH 4/4] more deprecations and pinning pytest-cov

---
 dev_requirements.txt                   | 2 +-
 lifetimes/fitters/beta_geo_fitter.py   | 5 ++---
 lifetimes/fitters/pareto_nbd_fitter.py | 1 -
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/dev_requirements.txt b/dev_requirements.txt
index 30e3c7f5..ba4bd349 100644
--- a/dev_requirements.txt
+++ b/dev_requirements.txt
@@ -3,7 +3,7 @@ flake8
 autopep8
 pytest
 matplotlib
-pytest-cov
+pytest-cov==2.5.1
 pytest-mpl
 coveralls
 pydocstyle
diff --git a/lifetimes/fitters/beta_geo_fitter.py b/lifetimes/fitters/beta_geo_fitter.py
index ac911d73..dd3ea6c5 100644
--- a/lifetimes/fitters/beta_geo_fitter.py
+++ b/lifetimes/fitters/beta_geo_fitter.py
@@ -8,8 +8,7 @@
     where, exp
 from numpy import ones_like
 from pandas import DataFrame
-from scipy.special import gammaln, hyp2f1, beta, gamma
-from scipy import misc
+from scipy.special import gammaln, hyp2f1, beta, gamma, logsumexp
 from . import BaseFitter
 from ..utils import _fit, _scale_time, _check_inputs
@@ -164,7 +163,7 @@ def _negative_log_likelihood(params, freq, rec, T, weights, penalizer_coef):
             (r + freq) * log(rec + alpha)
         A_4[isnan(A_4) | isinf(A_4)] = 0
         penalizer_term = penalizer_coef * sum(np.asarray(params) ** 2)
-        return - (weights * (A_1 + A_2 + misc.logsumexp(vconcat[A_3, A_4], axis=1, b=d))).mean() \
+        return - (weights * (A_1 + A_2 + logsumexp(vconcat[A_3, A_4], axis=1, b=d))).mean() \
             + penalizer_term
 
     def conditional_expected_number_of_purchases_up_to_time(self, t, frequency,
diff --git a/lifetimes/fitters/pareto_nbd_fitter.py b/lifetimes/fitters/pareto_nbd_fitter.py
index 8dc1cec2..46f511d4 100644
--- a/lifetimes/fitters/pareto_nbd_fitter.py
+++ b/lifetimes/fitters/pareto_nbd_fitter.py
@@ -7,7 +7,6 @@
 from numpy import log, exp, logaddexp, asarray, any as npany, c_ as vconcat
 from pandas import DataFrame
 from scipy.special import gammaln, hyp2f1, betaln
-from scipy import misc
 from scipy.special import logsumexp
 from . import BaseFitter
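Both fitters touched in this patch rely on `logsumexp`'s signed-weight form: with `b`, it evaluates log|sum_i b_i * exp(a_i)| without leaving log space, turning near-cancelling subtractions into stable operations, and `return_sign=True` reports the sign of the sum separately — which is why the Pareto/NBD comment above can afford to "ignore the sign". A small sketch with illustrative values:

```python
import numpy as np
from scipy.special import logsumexp

# Two log-scale terms whose plain exp() underflows float64.
log_x, log_y = -1000.0, -1000.1

# log(exp(log_x) - exp(log_y)), computed stably; sign reports whether the
# underlying sum came out positive or negative.
value, sign = logsumexp([log_x, log_y], b=[1.0, -1.0], return_sign=True)
print(value, sign)  # ~ -1002.35, 1.0

# Sanity check at a scale where direct arithmetic still works:
assert np.isclose(logsumexp([np.log(5.0), np.log(2.0)], b=[1, -1]), np.log(3.0))
```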