Skip to content
This repository has been archived by the owner on Jun 28, 2024. It is now read-only.

Commit

Permalink
Merge pull request #241 from CamDavidsonPilon/v0.10.1
Browse files Browse the repository at this point in the history
v0.10.1
  • Loading branch information
CamDavidsonPilon authored Jan 7, 2019
2 parents 00ca929 + f74e195 commit ed37455
Show file tree
Hide file tree
Showing 10 changed files with 56 additions and 47 deletions.
7 changes: 6 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@ language: python
dist: trusty
python:
- "2.7"
- "3.4"
- "3.5"
- "3.6"
# Enable newer 3.7 without globally enabling sudo and dist: xenial for other build jobs
matrix:
include:
- python: 3.7
dist: xenial
sudo: true
before_install:
- sudo apt-get update
install:
Expand Down
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

### 0.10.1
- performance improvements to `generate_data.py` for large datasets #195
- performance improvements to `summary_data_from_transaction_data`, thanks @MichaelSchreier
- Previously, `GammaGammaFitter` would have an infinite mean when its `q` parameter was less than 1. This was possible for some datasets. In 0.10.1, a new argument is added to `GammaGammaFitter` to constrain that `q` is greater than 1. This can be done with `q_constraint=True` in the call to `GammaGammaFitter.fit`. See issue #146. Thanks @vruvora
- Stop support of scipy < 1.0.
- Stop support of Python < 3.5.

### 0.10.0
- `BetaGeoBetaBinomFitter.fit` has replaced `n_custs` with the more appropriately named `weights` (to align with other statistical libraries). By default and if unspecified, `weights` is equal to an array of 1s.
- The `conditional_` methods on `BetaGeoBetaBinomFitter` have been updated to handle exogenously provided recency, frequency and periods.
Expand Down
2 changes: 1 addition & 1 deletion dev_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ flake8
autopep8
pytest
matplotlib
pytest-cov
pytest-cov==2.5.1
pytest-mpl
coveralls
pydocstyle
Expand Down
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,9 @@
# built documents.
#
# The short X.Y version.
version = '0.10.0.0'
version = '0.10.1'
# The full version, including alpha/beta/rc tags.
release = '0.10.0.0'
release = version

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
5 changes: 2 additions & 3 deletions lifetimes/fitters/beta_geo_fitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
where, exp
from numpy import ones_like
from pandas import DataFrame
from scipy.special import gammaln, hyp2f1, beta, gamma
from scipy import misc
from scipy.special import gammaln, hyp2f1, beta, gamma, logsumexp

from . import BaseFitter
from ..utils import _fit, _scale_time, _check_inputs
Expand Down Expand Up @@ -164,7 +163,7 @@ def _negative_log_likelihood(params, freq, rec, T, weights, penalizer_coef):
(r + freq) * log(rec + alpha)
A_4[isnan(A_4) | isinf(A_4)] = 0
penalizer_term = penalizer_coef * sum(np.asarray(params) ** 2)
return - (weights * (A_1 + A_2 + misc.logsumexp(vconcat[A_3, A_4], axis=1, b=d))).mean() \
return - (weights * (A_1 + A_2 + logsumexp(vconcat[A_3, A_4], axis=1, b=d))).mean() \
+ penalizer_term

def conditional_expected_number_of_purchases_up_to_time(self, t, frequency,
Expand Down
8 changes: 4 additions & 4 deletions lifetimes/fitters/pareto_nbd_fitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from numpy import log, exp, logaddexp, asarray, any as npany, c_ as vconcat
from pandas import DataFrame
from scipy.special import gammaln, hyp2f1, betaln
from scipy import misc
from scipy.special import logsumexp

from . import BaseFitter
from ..utils import _fit, _check_inputs, _scale_time
Expand Down Expand Up @@ -162,7 +162,7 @@ def _log_A_0(params, freq, recency, age):
except TypeError:
sign = 1

return (misc.logsumexp([log(p_1) + rsf * log(q_2), log(p_2) +
return (logsumexp([log(p_1) + rsf * log(q_2), log(p_2) +
rsf * log(q_1)], axis=0, b=[sign, -sign]) -
rsf * log(q_1 * q_2))

Expand Down Expand Up @@ -376,7 +376,7 @@ def _log_B_three(i):
zeroth_term = (n == 0) * (1 - exp(log_p_zero))
first_term = n * log(t) - gammaln(n + 1) + log_B_one - log_l
second_term = log_B_two - log_l
third_term = misc.logsumexp(
third_term = logsumexp(
[i * log(t) - gammaln(i + 1) + _log_B_three(i) - log_l for i in range(n + 1)],
axis=0
)
Expand All @@ -389,7 +389,7 @@ def _log_B_three(i):

# In some scenarios (e.g. large n) tiny numerical errors in the calculation of second_term and third_term
# cause sumexp to be ever so slightly negative and logsumexp throws an error. Hence we ignore the sign here.
return zeroth_term + exp(misc.logsumexp(
return zeroth_term + exp(logsumexp(
[first_term, second_term, third_term], b=[sign, sign, -sign],
axis=0,
return_sign=True
Expand Down
54 changes: 27 additions & 27 deletions lifetimes/generate_data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

import numpy as np
from scipy import stats
from numpy import random
import pandas as pd


Expand Down Expand Up @@ -36,8 +36,8 @@ def beta_geometric_nbd_model(T, r, alpha, a, b, size=1):
else:
T = np.asarray(T)

probability_of_post_purchase_death = stats.beta.rvs(a, b, size=size)
lambda_ = stats.gamma.rvs(r, scale=1. / alpha, size=size)
probability_of_post_purchase_death = random.beta(a, b, size=size)
lambda_ = random.gamma(r, scale=1. / alpha, size=size)

columns = ['frequency', 'recency', 'T', 'lambda', 'p', 'alive', 'customer_id']
df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns)
Expand All @@ -48,12 +48,12 @@ def beta_geometric_nbd_model(T, r, alpha, a, b, size=1):

# hacky until I can find something better
times = []
next_purchase_in = stats.expon.rvs(scale=1. / l)
next_purchase_in = random.exponential(scale=1. / l)
alive = True
while (np.sum(times) + next_purchase_in < T[i]) and alive:
times.append(next_purchase_in)
next_purchase_in = stats.expon.rvs(scale=1. / l)
alive = np.random.random() > p
next_purchase_in = random.exponential(scale=1. / l)
alive = random.random() > p

times = np.array(times).cumsum()
df.iloc[i] = np.unique(np.array(times).astype(int)).shape[0], np.max(times if times.shape[0] > 0 else 0), T[i], l, p, alive, i
Expand Down Expand Up @@ -102,8 +102,8 @@ def beta_geometric_nbd_model_transactional_data(T, r, alpha, a, b, observation_p
start_date = [observation_period_end - pd.Timedelta(T[i] - 1, unit=freq) for i in range(size)]
T = np.asarray(T)

probability_of_post_purchase_death = stats.beta.rvs(a, b, size=size)
lambda_ = stats.gamma.rvs(r, scale=1. / alpha, size=size)
probability_of_post_purchase_death = random.beta(a, b, size=size)
lambda_ = random.gamma(r, scale=1. / alpha, size=size)

columns = ['customer_id', 'date']
df = pd.DataFrame(columns=columns)
Expand All @@ -115,13 +115,13 @@ def beta_geometric_nbd_model_transactional_data(T, r, alpha, a, b, observation_p
age = T[i]

purchases = [[i, s - pd.Timedelta(1, unit=freq)]]
next_purchase_in = stats.expon.rvs(scale=1. / l)
next_purchase_in = random.exponential(scale=1. / l)
alive = True

while next_purchase_in < age and alive:
purchases.append([i, s + pd.Timedelta(next_purchase_in, unit=freq)])
next_purchase_in += stats.expon.rvs(scale=1. / l)
alive = np.random.random() > p
next_purchase_in += random.exponential(scale=1. / l)
alive = random.random() > p

df = df.append(pd.DataFrame(purchases, columns=columns))

Expand Down Expand Up @@ -160,23 +160,23 @@ def pareto_nbd_model(T, r, alpha, s, beta, size=1):
else:
T = np.asarray(T)

lambda_ = stats.gamma.rvs(r, scale=1. / alpha, size=size)
mus = stats.gamma.rvs(s, scale=1. / beta, size=size)
lambda_ = random.gamma(r, scale=1. / alpha, size=size)
mus = random.gamma(s, scale=1. / beta, size=size)

columns = ['frequency', 'recency', 'T', 'lambda', 'mu', 'alive', 'customer_id']
df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns)

for i in range(size):
l = lambda_[i]
mu = mus[i]
time_of_death = stats.expon.rvs(scale=1. / mu)
time_of_death = random.exponential(scale=1. / mu)

# hacky until I can find something better
times = []
next_purchase_in = stats.expon.rvs(scale=1. / l)
next_purchase_in = random.exponential(scale=1. / l)
while np.sum(times) + next_purchase_in < min(time_of_death, T[i]):
times.append(next_purchase_in)
next_purchase_in = stats.expon.rvs(scale=1. / l)
next_purchase_in = random.exponential(scale=1. / l)

times = np.array(times).cumsum()
df.iloc[i] = np.unique(np.array(times).astype(int)).shape[0], np.max(times if times.shape[0] > 0 else 0), T[i], l, mu, time_of_death > T[i], i
Expand Down Expand Up @@ -219,8 +219,8 @@ def modified_beta_geometric_nbd_model(T, r, alpha, a, b, size=1):
else:
T = np.asarray(T)

probability_of_post_purchase_death = stats.beta.rvs(a, b, size=size)
lambda_ = stats.gamma.rvs(r, scale=1. / alpha, size=size)
probability_of_post_purchase_death = random.beta(a, b, size=size)
lambda_ = random.gamma(r, scale=1. / alpha, size=size)

columns = ['frequency', 'recency', 'T', 'lambda', 'p', 'alive', 'customer_id']
df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns)
Expand All @@ -231,12 +231,12 @@ def modified_beta_geometric_nbd_model(T, r, alpha, a, b, size=1):

# hacky until I can find something better
times = []
next_purchase_in = stats.expon.rvs(scale=1. / l)
alive = np.random.random() > p # essentially the difference between this model and BG/NBD
next_purchase_in = random.exponential(scale=1. / l)
alive = random.random() > p # essentially the difference between this model and BG/NBD
while (np.sum(times) + next_purchase_in < T[i]) and alive:
times.append(next_purchase_in)
next_purchase_in = stats.expon.rvs(scale=1. / l)
alive = np.random.random() > p
next_purchase_in = random.exponential(scale=1. / l)
alive = random.random() > p

times = np.array(times).cumsum()
df.iloc[i] = np.unique(np.array(times).astype(int)).shape[0], np.max(times if times.shape[0] > 0 else 0), T[i], l, p, alive, i
Expand Down Expand Up @@ -282,8 +282,8 @@ def beta_geometric_beta_binom_model(N, alpha, beta, gamma, delta, size=1):
else:
N = np.asarray(N)

probability_of_post_purchase_death = np.random.beta(a=alpha, b=beta, size=size)
thetas = np.random.beta(a=gamma, b=delta, size=size)
probability_of_post_purchase_death = random.beta(a=alpha, b=beta, size=size)
thetas = random.beta(a=gamma, b=delta, size=size)

columns = ['frequency', 'recency', 'n_periods', 'p', 'theta', 'alive', 'customer_id']
df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns)
Expand All @@ -296,12 +296,12 @@ def beta_geometric_beta_binom_model(N, alpha, beta, gamma, delta, size=1):
alive = True
times = []
while current_t < N[i] and alive:
alive = np.random.binomial(1, theta) == 0
if alive and np.random.binomial(1, p) == 1:
alive = random.binomial(1, theta) == 0
if alive and random.binomial(1, p) == 1:
times.append(current_t)
current_t += 1
# adding in final death opportunity to agree with [1]
if alive:
alive = np.random.binomial(1, theta) == 0
alive = random.binomial(1, theta) == 0
df.iloc[i] = len(times), times[-1] + 1 if len(times) != 0 else 0, N[i], p, theta, alive, i
return df
2 changes: 1 addition & 1 deletion lifetimes/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from __future__ import unicode_literals

__version__ = '0.10.0.0'
__version__ = '0.10.1'
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
numpy
scipy
numpy>1.10.0
scipy>=1.0.0
pandas>=0.19
dill
dill>=0.2.6
8 changes: 3 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,14 @@
"License :: OSI Approved :: MIT License",
"Programming Language :: Python",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3.3",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Topic :: Scientific/Engineering",
],
install_requires=[
"numpy",
"scipy",
"numpy>=1.10.0",
"scipy>=1.0.0",
"pandas>=0.19",
"dill"
"dill>=0.2.6"
],
package_data={
"lifetimes": [
Expand Down

0 comments on commit ed37455

Please sign in to comment.