From 65144a36d59be09b0a632eef45515a5875f928da Mon Sep 17 00:00:00 2001
From: Cam Davidson-Pilon
Date: Mon, 7 Jan 2019 10:24:58 -0500
Subject: [PATCH 1/4] some performance increases, cut new version too

---
 .travis.yml                            |  8 +++-
 CHANGELOG.md                           |  7 ++++
 docs/conf.py                           |  4 +-
 lifetimes/fitters/pareto_nbd_fitter.py |  7 ++--
 lifetimes/generate_data.py             | 54 +++++++++++++-------------
 lifetimes/version.py                   |  2 +-
 requirements.txt                       |  2 +-
 setup.py                               |  4 +-
 8 files changed, 49 insertions(+), 39 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 72a4560d..93b49785 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,10 +2,14 @@ language: python
 dist: trusty
 python:
   - "2.7"
-  - "3.4"
  - "3.5"
   - "3.6"
-before_install:
+# Enable newer 3.7 without globally enabling sudo and dist: xenial for other build jobs
+matrix:
+  include:
+    - python: 3.7
+      dist: xenial
+      sudo: truebefore_install:
   - sudo apt-get update
 install:
   - "pip install -r dev_requirements.txt"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index fd58ad0b..ba1090f2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog
 
+### 0.10.1
+ - performance improvements to `generate_data.py` for large datasets #195
+ - performance improvements to `summary_data_from_transaction_data`, thanks @MichaelSchreier
+ - Previously, `GammaGammaFitter` would have an infinite mean when its `q` parameter was less than 1, which was possible for some datasets. 0.10.1 adds a new argument to `GammaGammaFitter` to constrain `q` to be greater than 1; pass `q_constraint=True` in the call to `GammaGammaFitter.fit`. See issue #146. Thanks @vruvora
+ - Stop support of scipy < 1.0.
+ - Stop support of Python 3 versions below 3.5.
+
 ### 0.10.0
  - `BetaGeoBetaBinomFitter.fit` has replaced `n_custs` with the more appropriately named `weights` (to align with other statistical libraries). By default and if unspecified, `weights` is equal to an array of 1s.
  - The `conditional_` methods on `BetaGeoBetaBinomFitter` have been updated to handle exogenously provided recency, frequency and periods.
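The `q_constraint` entry above is the release's one user-facing API change. A minimal usage sketch of the call the changelog describes — the five-customer summary frame is synthetic and invented for illustration; real fits run on much larger data:

```python
import pandas as pd

from lifetimes import GammaGammaFitter

# Synthetic summary of returning customers; values are illustrative only.
summary = pd.DataFrame({
    "frequency": [2, 5, 3, 8, 4],                      # repeat purchase counts
    "monetary_value": [25.0, 40.5, 33.2, 52.1, 29.9],  # average order values
})

ggf = GammaGammaFitter(penalizer_coef=0.0)
# q_constraint=True (new in 0.10.1) constrains the fit to q > 1, so the
# fitted spend distribution is guaranteed a finite mean (see issue #146).
ggf.fit(summary["frequency"], summary["monetary_value"], q_constraint=True)
print(ggf.params_)
```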
diff --git a/docs/conf.py b/docs/conf.py
index 56a394fd..64962b51 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -77,9 +77,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.10.0.0'
+version = '0.10.1'
 # The full version, including alpha/beta/rc tags.
-release = '0.10.0.0'
+release = version
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/lifetimes/fitters/pareto_nbd_fitter.py b/lifetimes/fitters/pareto_nbd_fitter.py
index fb2a92ee..8dc1cec2 100644
--- a/lifetimes/fitters/pareto_nbd_fitter.py
+++ b/lifetimes/fitters/pareto_nbd_fitter.py
@@ -8,6 +8,7 @@
 from pandas import DataFrame
 from scipy.special import gammaln, hyp2f1, betaln
 from scipy import misc
+from scipy.special import logsumexp
 from . import BaseFitter
 from ..utils import _fit, _check_inputs, _scale_time
@@ -162,7 +163,7 @@ def _log_A_0(params, freq, recency, age):
         except TypeError:
             sign = 1
 
-        return (misc.logsumexp([log(p_1) + rsf * log(q_2), log(p_2) +
+        return (logsumexp([log(p_1) + rsf * log(q_2), log(p_2) +
                 rsf * log(q_1)], axis=0, b=[sign, -sign]) -
                 rsf * log(q_1 * q_2))
@@ -376,7 +377,7 @@ def _log_B_three(i):
         zeroth_term = (n == 0) * (1 - exp(log_p_zero))
         first_term = n * log(t) - gammaln(n + 1) + log_B_one - log_l
         second_term = log_B_two - log_l
-        third_term = misc.logsumexp(
+        third_term = logsumexp(
             [i * log(t) - gammaln(i + 1) + _log_B_three(i) - log_l
              for i in range(n + 1)], axis=0
         )
@@ -389,7 +390,7 @@ def _log_B_three(i):
         # In some scenarios (e.g. large n) tiny numerical errors in the calculation of second_term and third_term
         # cause sumexp to be ever so slightly negative and logsumexp throws an error. Hence we ignore the sign here.
-        return zeroth_term + exp(misc.logsumexp(
+        return zeroth_term + exp(logsumexp(
             [first_term, second_term, third_term],
             b=[sign, sign, -sign],
             axis=0,
             return_sign=True
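The `misc.logsumexp` → `logsumexp` swap above follows SciPy itself, which moved `logsumexp` into `scipy.special` and deprecated the `scipy.misc` location — hence the `scipy>=1.0.0` pin later in this patch. For code that still has to import across both old and new SciPy, a common compatibility idiom (not part of this patch) looks like:

```python
import numpy as np

# Prefer the scipy.special location required by the new scipy>=1.0.0 pin;
# fall back to scipy.misc only on older SciPy versions.
try:
    from scipy.special import logsumexp
except ImportError:
    from scipy.misc import logsumexp

# logsumexp evaluates log(sum(exp(a))) without leaving log space, so terms
# whose exp() underflows float64 on their own are still summed correctly.
print(logsumexp(np.array([-1000.0, -1001.0])))  # ~ -999.69; naive exp() gives -inf
```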
diff --git a/lifetimes/generate_data.py b/lifetimes/generate_data.py
index 1f357871..e2cf4642 100644
--- a/lifetimes/generate_data.py
+++ b/lifetimes/generate_data.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from scipy import stats
+from numpy import random
 import pandas as pd
 
@@ -36,8 +36,8 @@ def beta_geometric_nbd_model(T, r, alpha, a, b, size=1):
     else:
         T = np.asarray(T)
 
-    probability_of_post_purchase_death = stats.beta.rvs(a, b, size=size)
-    lambda_ = stats.gamma.rvs(r, scale=1. / alpha, size=size)
+    probability_of_post_purchase_death = random.beta(a, b, size=size)
+    lambda_ = random.gamma(r, scale=1. / alpha, size=size)
 
     columns = ['frequency', 'recency', 'T', 'lambda', 'p', 'alive', 'customer_id']
     df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns)
@@ -48,12 +48,12 @@
 
         # hacky until I can find something better
         times = []
-        next_purchase_in = stats.expon.rvs(scale=1. / l)
+        next_purchase_in = random.exponential(scale=1. / l)
         alive = True
         while (np.sum(times) + next_purchase_in < T[i]) and alive:
             times.append(next_purchase_in)
-            next_purchase_in = stats.expon.rvs(scale=1. / l)
-            alive = np.random.random() > p
+            next_purchase_in = random.exponential(scale=1. / l)
+            alive = random.random() > p
 
         times = np.array(times).cumsum()
         df.iloc[i] = np.unique(np.array(times).astype(int)).shape[0], np.max(times if times.shape[0] > 0 else 0), T[i], l, p, alive, i
@@ -102,8 +102,8 @@ def beta_geometric_nbd_model_transactional_data(T, r, alpha, a, b, observation_p
         start_date = [observation_period_end - pd.Timedelta(T[i] - 1, unit=freq) for i in range(size)]
         T = np.asarray(T)
 
-    probability_of_post_purchase_death = stats.beta.rvs(a, b, size=size)
-    lambda_ = stats.gamma.rvs(r, scale=1. / alpha, size=size)
+    probability_of_post_purchase_death = random.beta(a, b, size=size)
+    lambda_ = random.gamma(r, scale=1. / alpha, size=size)
 
     columns = ['customer_id', 'date']
     df = pd.DataFrame(columns=columns)
@@ -115,13 +115,13 @@
         age = T[i]
 
         purchases = [[i, s - pd.Timedelta(1, unit=freq)]]
-        next_purchase_in = stats.expon.rvs(scale=1. / l)
+        next_purchase_in = random.exponential(scale=1. / l)
         alive = True
 
         while next_purchase_in < age and alive:
             purchases.append([i, s + pd.Timedelta(next_purchase_in, unit=freq)])
-            next_purchase_in += stats.expon.rvs(scale=1. / l)
-            alive = np.random.random() > p
+            next_purchase_in += random.exponential(scale=1. / l)
+            alive = random.random() > p
 
         df = df.append(pd.DataFrame(purchases, columns=columns))
@@ -160,8 +160,8 @@ def pareto_nbd_model(T, r, alpha, s, beta, size=1):
     else:
         T = np.asarray(T)
 
-    lambda_ = stats.gamma.rvs(r, scale=1. / alpha, size=size)
-    mus = stats.gamma.rvs(s, scale=1. / beta, size=size)
+    lambda_ = random.gamma(r, scale=1. / alpha, size=size)
+    mus = random.gamma(s, scale=1. / beta, size=size)
 
     columns = ['frequency', 'recency', 'T', 'lambda', 'mu', 'alive', 'customer_id']
     df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns)
@@ -169,14 +169,14 @@
     for i in range(size):
         l = lambda_[i]
         mu = mus[i]
-        time_of_death = stats.expon.rvs(scale=1. / mu)
+        time_of_death = random.exponential(scale=1. / mu)
 
         # hacky until I can find something better
         times = []
-        next_purchase_in = stats.expon.rvs(scale=1. / l)
+        next_purchase_in = random.exponential(scale=1. / l)
         while np.sum(times) + next_purchase_in < min(time_of_death, T[i]):
             times.append(next_purchase_in)
-            next_purchase_in = stats.expon.rvs(scale=1. / l)
+            next_purchase_in = random.exponential(scale=1. / l)
 
         times = np.array(times).cumsum()
         df.iloc[i] = np.unique(np.array(times).astype(int)).shape[0], np.max(times if times.shape[0] > 0 else 0), T[i], l, mu, time_of_death > T[i], i
@@ -219,8 +219,8 @@ def modified_beta_geometric_nbd_model(T, r, alpha, a, b, size=1):
     else:
         T = np.asarray(T)
 
-    probability_of_post_purchase_death = stats.beta.rvs(a, b, size=size)
-    lambda_ = stats.gamma.rvs(r, scale=1. / alpha, size=size)
+    probability_of_post_purchase_death = random.beta(a, b, size=size)
+    lambda_ = random.gamma(r, scale=1. / alpha, size=size)
 
     columns = ['frequency', 'recency', 'T', 'lambda', 'p', 'alive', 'customer_id']
     df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns)
@@ -231,12 +231,12 @@
 
         # hacky until I can find something better
         times = []
-        next_purchase_in = stats.expon.rvs(scale=1. / l)
-        alive = np.random.random() > p  # essentially the difference between this model and BG/NBD
+        next_purchase_in = random.exponential(scale=1. / l)
+        alive = random.random() > p  # essentially the difference between this model and BG/NBD
         while (np.sum(times) + next_purchase_in < T[i]) and alive:
             times.append(next_purchase_in)
-            next_purchase_in = stats.expon.rvs(scale=1. / l)
-            alive = np.random.random() > p
+            next_purchase_in = random.exponential(scale=1. / l)
+            alive = random.random() > p
 
         times = np.array(times).cumsum()
         df.iloc[i] = np.unique(np.array(times).astype(int)).shape[0], np.max(times if times.shape[0] > 0 else 0), T[i], l, p, alive, i
@@ -282,8 +282,8 @@ def beta_geometric_beta_binom_model(N, alpha, beta, gamma, delta, size=1):
     else:
         N = np.asarray(N)
 
-    probability_of_post_purchase_death = np.random.beta(a=alpha, b=beta, size=size)
-    thetas = np.random.beta(a=gamma, b=delta, size=size)
+    probability_of_post_purchase_death = random.beta(a=alpha, b=beta, size=size)
+    thetas = random.beta(a=gamma, b=delta, size=size)
 
     columns = ['frequency', 'recency', 'n_periods', 'p', 'theta', 'alive', 'customer_id']
     df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns)
@@ -296,12 +296,12 @@
         alive = True
         times = []
         while current_t < N[i] and alive:
-            alive = np.random.binomial(1, theta) == 0
-            if alive and np.random.binomial(1, p) == 1:
+            alive = random.binomial(1, theta) == 0
+            if alive and random.binomial(1, p) == 1:
                 times.append(current_t)
             current_t += 1
         # adding in final death opportunity to agree with [1]
         if alive:
-            alive = np.random.binomial(1, theta) == 0
+            alive = random.binomial(1, theta) == 0
         df.iloc[i] = len(times), times[-1] + 1 if len(times) != 0 else 0, N[i], p, theta, alive, i
 
     return df
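A note on why the swap above is a performance win: each `scipy.stats.<dist>.rvs` call pays SciPy's per-call distribution overhead (argument validation, shape broadcasting), which dominates when the simulators draw one variate at a time inside per-customer loops, while the `numpy.random` equivalents (`beta`, `gamma(shape, scale=...)`, `exponential(scale=...)`, `binomial`) use the same parameterizations and go straight to the sampler. A rough micro-benchmark sketch — absolute timings are machine-dependent; only the relative gap matters:

```python
import timeit

n = 10000

# One scipy.stats draw per iteration, mirroring the old per-customer loop.
scipy_time = timeit.timeit(
    "stats.expon.rvs(scale=2.0)", setup="from scipy import stats", number=n
)
# The same draws through numpy.random, mirroring the new code.
numpy_time = timeit.timeit(
    "random.exponential(scale=2.0)", setup="from numpy import random", number=n
)

print("scipy.stats per-draw loop :", scipy_time)
print("numpy.random per-draw loop:", numpy_time)  # typically ~10x faster
```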
diff --git a/lifetimes/version.py b/lifetimes/version.py
index 787f75fb..ba3160e9 100644
--- a/lifetimes/version.py
+++ b/lifetimes/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '0.10.0.0'
+__version__ = '0.10.1'
diff --git a/requirements.txt b/requirements.txt
index 5d726e30..e81d94ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 numpy
-scipy
+scipy>=1.0.0
 pandas>=0.19
 dill
diff --git a/setup.py b/setup.py
index 19698540..47bef8dc 100644
--- a/setup.py
+++ b/setup.py
@@ -31,14 +31,12 @@
         "License :: OSI Approved :: MIT License",
         "Programming Language :: Python",
         "Programming Language :: Python :: 2.7",
-        "Programming Language :: Python :: 3.3",
-        "Programming Language :: Python :: 3.4",
         "Programming Language :: Python :: 3.5",
         "Topic :: Scientific/Engineering",
     ],
     install_requires=[
         "numpy",
-        "scipy",
+        "scipy>=1.0.0",
         "pandas>=0.19",
         "dill"
     ],

From 24830c68296b6a8e3dfcc302a0f5454df13f2bcb Mon Sep 17 00:00:00 2001
From: Cam Davidson-Pilon
Date: Mon, 7 Jan 2019 10:29:36 -0500
Subject: [PATCH 2/4] pin requirements

---
 requirements.txt | 4 ++--
 setup.py         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index e81d94ef..584948d4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-numpy
+numpy>1.10.0
 scipy>=1.0.0
 pandas>=0.19
-dill
+dill>=0.2.6
diff --git a/setup.py b/setup.py
index 47bef8dc..feb59da7 100644
--- a/setup.py
+++ b/setup.py
@@ -35,10 +35,10 @@
         "Topic :: Scientific/Engineering",
     ],
     install_requires=[
-        "numpy",
+        "numpy>=1.10.0",
         "scipy>=1.0.0",
         "pandas>=0.19",
-        "dill"
+        "dill>=0.2.6"
     ],
     package_data={
         "lifetimes": [

From 9a0c416e183093008c12deece023ae825c8ab318 Mon Sep 17 00:00:00 2001
From: Cam Davidson-Pilon
Date: Mon, 7 Jan 2019 10:38:45 -0500
Subject: [PATCH 3/4] fix travis yml

---
 .travis.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 93b49785..b6ec08fe 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,7 +9,8 @@ matrix:
   include:
     - python: 3.7
       dist: xenial
-      sudo: truebefore_install:
+      sudo: true
+before_install:
   - sudo apt-get update
 install:
   - "pip install -r dev_requirements.txt"
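A quick way to confirm an environment against the pins introduced above, using setuptools' `pkg_resources`; this check script is illustrative, not part of the patch:

```python
import pkg_resources

# Raises DistributionNotFound or VersionConflict when a pin is not satisfied.
for pin in ["numpy>1.10.0", "scipy>=1.0.0", "pandas>=0.19", "dill>=0.2.6"]:
    pkg_resources.require(pin)
    print(pin, "OK")
```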
From f74e1955fb5fde360c5b22940f6fd98db36acc0b Mon Sep 17 00:00:00 2001
From: Cam Davidson-Pilon
Date: Mon, 7 Jan 2019 10:47:30 -0500
Subject: [PATCH 4/4] more deprecations and pinning pytest-cov

---
 dev_requirements.txt                   | 2 +-
 lifetimes/fitters/beta_geo_fitter.py   | 5 ++---
 lifetimes/fitters/pareto_nbd_fitter.py | 1 -
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/dev_requirements.txt b/dev_requirements.txt
index 30e3c7f5..ba4bd349 100644
--- a/dev_requirements.txt
+++ b/dev_requirements.txt
@@ -3,7 +3,7 @@ flake8
 autopep8
 pytest
 matplotlib
-pytest-cov
+pytest-cov==2.5.1
 pytest-mpl
 coveralls
 pydocstyle
diff --git a/lifetimes/fitters/beta_geo_fitter.py b/lifetimes/fitters/beta_geo_fitter.py
index ac911d73..dd3ea6c5 100644
--- a/lifetimes/fitters/beta_geo_fitter.py
+++ b/lifetimes/fitters/beta_geo_fitter.py
@@ -8,8 +8,7 @@
     where, exp
 from numpy import ones_like
 from pandas import DataFrame
-from scipy.special import gammaln, hyp2f1, beta, gamma
-from scipy import misc
+from scipy.special import gammaln, hyp2f1, beta, gamma, logsumexp
 from . import BaseFitter
 from ..utils import _fit, _scale_time, _check_inputs
@@ -164,7 +163,7 @@ def _negative_log_likelihood(params, freq, rec, T, weights, penalizer_coef):
             (r + freq) * log(rec + alpha)
         A_4[isnan(A_4) | isinf(A_4)] = 0
         penalizer_term = penalizer_coef * sum(np.asarray(params) ** 2)
-        return - (weights * (A_1 + A_2 + misc.logsumexp(vconcat[A_3, A_4], axis=1, b=d))).mean() \
+        return - (weights * (A_1 + A_2 + logsumexp(vconcat[A_3, A_4], axis=1, b=d))).mean() \
             + penalizer_term
 
     def conditional_expected_number_of_purchases_up_to_time(self, t, frequency,
diff --git a/lifetimes/fitters/pareto_nbd_fitter.py b/lifetimes/fitters/pareto_nbd_fitter.py
index 8dc1cec2..46f511d4 100644
--- a/lifetimes/fitters/pareto_nbd_fitter.py
+++ b/lifetimes/fitters/pareto_nbd_fitter.py
@@ -7,7 +7,6 @@
 from numpy import log, exp, logaddexp, asarray, any as npany, c_ as vconcat
 from pandas import DataFrame
 from scipy.special import gammaln, hyp2f1, betaln
-from scipy import misc
 from scipy.special import logsumexp
 from . import BaseFitter
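Both fitters touched in this patch rely on `logsumexp`'s signed-weight form: with `b`, it evaluates log|sum_i b_i * exp(a_i)| without leaving log space, turning near-cancelling subtractions into stable operations, and `return_sign=True` reports the sign of the sum separately — which is why the Pareto/NBD comment above can afford to "ignore the sign". A small sketch with illustrative values:

```python
import numpy as np
from scipy.special import logsumexp

# Two log-scale terms whose plain exp() underflows float64.
log_x, log_y = -1000.0, -1000.1

# log(exp(log_x) - exp(log_y)), computed stably; sign reports whether the
# underlying sum came out positive or negative.
value, sign = logsumexp([log_x, log_y], b=[1.0, -1.0], return_sign=True)
print(value, sign)  # ~ -1002.35, 1.0

# Sanity check at a scale where direct arithmetic still works:
assert np.isclose(logsumexp([np.log(5.0), np.log(2.0)], b=[1, -1]), np.log(3.0))
```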