From 224d643d483dd84d0c4719009928c999753f0272 Mon Sep 17 00:00:00 2001 From: jzluo <20971593+jzluo@users.noreply.github.com> Date: Sat, 6 Aug 2022 22:29:26 -0400 Subject: [PATCH 1/6] refactor _profile_likelihood_ci --- firthlogist/firthlogist.py | 125 +++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 62 deletions(-) diff --git a/firthlogist/firthlogist.py b/firthlogist/firthlogist.py index 371f2bb..97d32f5 100644 --- a/firthlogist/firthlogist.py +++ b/firthlogist/firthlogist.py @@ -152,22 +152,16 @@ def fit(self, X, y): if not self.skip_ci: if not self.wald: - self.ci_ = np.column_stack( - [ - _profile_likelihood_ci( - X=X, - y=y, - side=side, - fitted_coef=self.coef_, - full_loglik=self.loglik_, - max_iter=self.pl_max_iter, - max_stepsize=self.pl_max_stepsize, - max_halfstep=self.pl_max_halfstep, - tol=self.tol, - alpha=0.05, - ) - for side in [-1, 1] - ] + self.ci_ = _profile_likelihood_ci( + X=X, + y=y, + fitted_coef=self.coef_, + full_loglik=self.loglik_, + max_iter=self.pl_max_iter, + max_stepsize=self.pl_max_stepsize, + max_halfstep=self.pl_max_halfstep, + tol=self.tol, + alpha=0.05, ) else: self.ci_ = _wald_ci(self.coef_, self.bse_, self.alpha) @@ -377,7 +371,6 @@ def _predict(X, coef): def _profile_likelihood_ci( X, y, - side, fitted_coef, full_loglik, max_iter, @@ -387,54 +380,62 @@ def _profile_likelihood_ci( alpha, ): LL0 = full_loglik - chi2.ppf(1 - alpha, 1) / 2 - ci = [] - for coef_idx in range(fitted_coef.shape[0]): - coef = deepcopy(fitted_coef) - for iter in range(1, max_iter + 1): - # preds = expit(X @ coef) - preds = _predict(X, coef) - loglike = -_loglikelihood(X, y, preds) - XW = _get_XW(X, preds) - hat = _hat_diag(XW) - XW = _get_aug_XW(X, preds, hat) # augmented data using hat diag - fisher_info_mtx = XW.T @ XW - U_star = np.matmul(X.T, y - preds + np.multiply(hat, 0.5 - preds)) - # https://github.com/georgheinze/logistf/blob/master/src/logistf.c#L780-L781 - inv_fisher = np.linalg.pinv(fisher_info_mtx) - tmp1x1 = U_star @ np.negative(inv_fisher) @ U_star - underRoot = ( - -2 * ((LL0 - loglike) + 0.5 * tmp1x1) / (inv_fisher[coef_idx, coef_idx]) - ) - lambda_ = 0 if underRoot < 0 else side * sqrt(underRoot) - U_star[coef_idx] += lambda_ - - step_size = np.linalg.lstsq(fisher_info_mtx, U_star, rcond=None)[0] - mx = np.max(np.abs(step_size)) / max_stepsize - if mx > 1: - step_size = step_size / mx # restrict to max_stepsize - coef += step_size - loglike_old = deepcopy(loglike) - - for halfs in range(1, max_halfstep + 1): - # preds = expit(X @ coef) + lower_bound = [] + upper_bound = [] + for side in [-1, 1]: + for coef_idx in range(fitted_coef.shape[0]): + coef = deepcopy(fitted_coef) + for iter in range(1, max_iter + 1): preds = _predict(X, coef) loglike = -_loglikelihood(X, y, preds) - if (abs(loglike - LL0) < abs(loglike_old - LL0)) and loglike > LL0: + XW = _get_XW(X, preds) + hat = _hat_diag(XW) + XW = _get_aug_XW(X, preds, hat) # augmented data using hat diag + fisher_info_mtx = XW.T @ XW + U_star = np.matmul(X.T, y - preds + np.multiply(hat, 0.5 - preds)) + # https://github.com/georgheinze/logistf/blob/master/src/logistf.c#L780-L781 + inv_fisher = np.linalg.pinv(fisher_info_mtx) + tmp1x1 = U_star @ np.negative(inv_fisher) @ U_star + underRoot = ( + -2 + * ((LL0 - loglike) + 0.5 * tmp1x1) + / (inv_fisher[coef_idx, coef_idx]) + ) + lambda_ = 0 if underRoot < 0 else side * sqrt(underRoot) + U_star[coef_idx] += lambda_ + + step_size = np.linalg.lstsq(fisher_info_mtx, U_star, rcond=None)[0] + mx = np.max(np.abs(step_size)) / max_stepsize + if mx > 
1: + step_size = step_size / mx # restrict to max_stepsize + coef += step_size + loglike_old = deepcopy(loglike) + + for halfs in range(1, max_halfstep + 1): + preds = _predict(X, coef) + loglike = -_loglikelihood(X, y, preds) + if (abs(loglike - LL0) < abs(loglike_old - LL0)) and loglike > LL0: + break + step_size *= 0.5 + coef -= step_size + if abs(loglike - LL0) <= tol: + if side == -1: + lower_bound.append(coef[coef_idx]) + else: + upper_bound.append(coef[coef_idx]) break - step_size *= 0.5 - coef -= step_size - if abs(loglike - LL0) <= tol: - ci.append(coef[coef_idx]) - break - if abs(loglike - LL0) > tol: - ci.append(np.nan) - warning_msg = ( - f"Non-converged PL confidence limits - max number of " - f"iterations exceeded for variable x{coef_idx}. Try " - f"increasing pl_max_iter." - ) - warnings.warn(warning_msg, ConvergenceWarning, stacklevel=2) - return ci + if abs(loglike - LL0) > tol: + if side == -1: + lower_bound.append(np.nan) + else: + upper_bound.append(np.nan) + warning_msg = ( + f"Non-converged PL confidence limits - max number of " + f"iterations exceeded for variable x{coef_idx}. Try " + f"increasing pl_max_iter." + ) + warnings.warn(warning_msg, ConvergenceWarning, stacklevel=2) + return np.column_stack([lower_bound, upper_bound]) def _wald_ci(coef, bse, alpha): From 7a00c97fb41185038f76886802792182b613592d Mon Sep 17 00:00:00 2001 From: jzluo <20971593+jzluo@users.noreply.github.com> Date: Sun, 7 Aug 2022 02:05:19 -0400 Subject: [PATCH 2/6] get PL confidence intervals for subset of variables --- firthlogist/firthlogist.py | 27 ++++++++++++++++++++++++--- firthlogist/tests/test_firthlogist.py | 15 +++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/firthlogist/firthlogist.py b/firthlogist/firthlogist.py index 97d32f5..b7ffbac 100644 --- a/firthlogist/firthlogist.py +++ b/firthlogist/firthlogist.py @@ -54,6 +54,9 @@ class FirthLogisticRegression(BaseEstimator, ClassifierMixin): Significance level (confidence interval = 1-alpha). 0.05 as default for 95% CI. wald If True, uses Wald method to calculate p-values and confidence intervals. + test_vars + Index or list of indices of the variables for which to calculate confidence + intervals and p-values. If None, calculate for all variables. 
Attributes ---------- @@ -97,6 +100,7 @@ def __init__( skip_ci=False, alpha=0.05, wald=False, + test_vars=None, ): self.max_iter = max_iter self.max_stepsize = max_stepsize @@ -110,6 +114,7 @@ def __init__( self.skip_ci = skip_ci self.alpha = alpha self.wald = wald + self.test_vars = test_vars def _more_tags(self): return {"binary_only": True} @@ -161,7 +166,8 @@ def fit(self, X, y): max_stepsize=self.pl_max_stepsize, max_halfstep=self.pl_max_halfstep, tol=self.tol, - alpha=0.05, + alpha=self.alpha, + test_vars=self.test_vars, ) else: self.ci_ = _wald_ci(self.coef_, self.bse_, self.alpha) @@ -378,12 +384,20 @@ def _profile_likelihood_ci( max_halfstep, tol, alpha, + test_vars, ): LL0 = full_loglik - chi2.ppf(1 - alpha, 1) / 2 lower_bound = [] upper_bound = [] + if test_vars is None: + test_var_indices = range(fitted_coef.shape[0]) + elif isinstance(test_vars, int): # single index + test_var_indices = [test_vars] + else: # list, tuple, or set of indices + test_var_indices = sorted(test_vars) for side in [-1, 1]: - for coef_idx in range(fitted_coef.shape[0]): + # for coef_idx in range(fitted_coef.shape[0]): + for coef_idx in test_var_indices: coef = deepcopy(fitted_coef) for iter in range(1, max_iter + 1): preds = _predict(X, coef) @@ -435,7 +449,14 @@ def _profile_likelihood_ci( f"increasing pl_max_iter." ) warnings.warn(warning_msg, ConvergenceWarning, stacklevel=2) - return np.column_stack([lower_bound, upper_bound]) + bounds = np.column_stack([lower_bound, upper_bound]) + if len(lower_bound) < fitted_coef.shape[0]: + ci = np.full([fitted_coef.shape[0], 2], np.nan) + for idx, test_var_idx in enumerate(test_var_indices): + ci[test_var_idx] = bounds[idx] + return ci + else: + return bounds def _wald_ci(coef, bse, alpha): diff --git a/firthlogist/tests/test_firthlogist.py b/firthlogist/tests/test_firthlogist.py index 0e40383..d9d2bab 100644 --- a/firthlogist/tests/test_firthlogist.py +++ b/firthlogist/tests/test_firthlogist.py @@ -92,3 +92,18 @@ def test_compare_to_logistf(data): assert_allclose(firth.coef_, data["logistf_coef"], rtol=1e-05) assert_allclose(firth.intercept_, data["logistf_intercept"], rtol=1e-05) assert_allclose(firth.ci_, data["logistf_ci"], rtol=1e-05) + + +@pytest.mark.parametrize("data", ["endometrial"], indirect=True) +def test_ci_singlevaridx(data): + firth = FirthLogisticRegression(test_vars=2) + firth.fit(data["X"], data["y"]) + ci = np.array( + [ + [np.nan, np.nan], + [np.nan, np.nan], + [-4.36518284, -1.23272106], + [np.nan, np.nan], + ] + ) + assert_allclose(firth.ci_, ci) From c0091f8657064f71bb9ffea6355220711e89677c Mon Sep 17 00:00:00 2001 From: jzluo <20971593+jzluo@users.noreply.github.com> Date: Sun, 7 Aug 2022 02:38:35 -0400 Subject: [PATCH 3/6] penalized LRT for subset of variables --- firthlogist/firthlogist.py | 67 +++++++++++++++++++-------- firthlogist/tests/test_firthlogist.py | 19 ++++++++ 2 files changed, 66 insertions(+), 20 deletions(-) diff --git a/firthlogist/firthlogist.py b/firthlogist/firthlogist.py index b7ffbac..36de837 100644 --- a/firthlogist/firthlogist.py +++ b/firthlogist/firthlogist.py @@ -56,7 +56,8 @@ class FirthLogisticRegression(BaseEstimator, ClassifierMixin): If True, uses Wald method to calculate p-values and confidence intervals. test_vars Index or list of indices of the variables for which to calculate confidence - intervals and p-values. If None, calculate for all variables. + intervals and p-values. If None, calculate for all variables. This option has + no effect if wald=True. 
Attributes ---------- @@ -177,20 +178,16 @@ def fit(self, X, y): # penalized likelihood ratio tests if not self.skip_pvals: if not self.wald: - pvals = [] - # mask is 1-indexed because of `if mask` check in _get_XW() - for mask in range(1, self.coef_.shape[0] + 1): - _, null_loglik, _ = _firth_newton_raphson( - X, - y, - self.max_iter, - self.max_stepsize, - self.max_halfstep, - self.tol, - mask, - ) - pvals.append(_lrt(self.loglik_, null_loglik)) - self.pvals_ = np.array(pvals) + self.pvals_ = _penalized_lrt( + self.loglik_, + X, + y, + self.max_iter, + self.max_stepsize, + self.max_halfstep, + self.tol, + self.test_vars, + ) else: self.pvals_ = _wald_test(self.coef_, self.bse_) @@ -331,8 +328,8 @@ def _get_XW(X, preds, mask=None): # is this equivalent?? # https://github.com/georgheinze/logistf/blob/master/src/logistf.c#L150-L159 - if mask: - XW[:, mask - 1] = 0 + if mask is not None: + XW[:, mask] = 0 return XW @@ -361,6 +358,36 @@ def _bse(X, coefs): return np.sqrt(np.diag(np.linalg.pinv(fisher_info_mtx))) +def _penalized_lrt( + full_loglik, X, y, max_iter, max_stepsize, max_halfstep, tol, test_vars +): + if test_vars is None: + test_var_indices = range(X.shape[1]) + elif isinstance(test_vars, int): # single index + test_var_indices = [test_vars] + else: # list, tuple, or set of indices + test_var_indices = sorted(test_vars) + + pvals = [] + for mask in test_var_indices: + _, null_loglik, _ = _firth_newton_raphson( + X, + y, + max_iter, + max_stepsize, + max_halfstep, + tol, + mask, + ) + pvals.append(_lrt(full_loglik, null_loglik)) + if len(pvals) < X.shape[1]: + pval_array = np.full(X.shape[1], np.nan) + for idx, test_var_idx in enumerate(test_var_indices): + pval_array[test_var_idx] = pvals[idx] + return pval_array + return np.array(pvals) + + def _lrt(full_loglik, null_loglik): # in logistf: 1-pchisq(2*(fit.full$loglik-fit.i$loglik),1) lr_stat = 2 * (full_loglik - null_loglik) @@ -454,9 +481,9 @@ def _profile_likelihood_ci( ci = np.full([fitted_coef.shape[0], 2], np.nan) for idx, test_var_idx in enumerate(test_var_indices): ci[test_var_idx] = bounds[idx] - return ci - else: - return bounds + return ci + + return bounds def _wald_ci(coef, bse, alpha): diff --git a/firthlogist/tests/test_firthlogist.py b/firthlogist/tests/test_firthlogist.py index d9d2bab..ce8356d 100644 --- a/firthlogist/tests/test_firthlogist.py +++ b/firthlogist/tests/test_firthlogist.py @@ -106,4 +106,23 @@ def test_ci_singlevaridx(data): [np.nan, np.nan], ] ) + pvals = np.array([np.nan, np.nan, 2.50418343e-05, np.nan]) assert_allclose(firth.ci_, ci) + assert_allclose(firth.pvals_, pvals) + + +@pytest.mark.parametrize("data", ["endometrial"], indirect=True) +def test_ci_multivaridx(data): + firth = FirthLogisticRegression(test_vars=[1, 2]) + firth.fit(data["X"], data["y"]) + ci = np.array( + [ + [np.nan, np.nan], + [-0.12445872, 0.04045547], + [-4.36518284, -1.23272106], + [np.nan, np.nan], + ] + ) + pvals = np.array([np.nan, 3.76021507e-01, 2.50418343e-05, np.nan]) + assert_allclose(firth.ci_, ci) + assert_allclose(firth.pvals_, pvals) From 806c8864f7d32c841d2107fb3f02a81cf77d6df6 Mon Sep 17 00:00:00 2001 From: jzluo <20971593+jzluo@users.noreply.github.com> Date: Sun, 7 Aug 2022 02:47:29 -0400 Subject: [PATCH 4/6] add test_vars to readme --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index b37369c..cb44aba 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,11 @@ coefficient.  If True, uses Wald method to calculate p-values and confidence intervals. 
+`test_vars`: **Union[int, List[int]], default=None** + + Index or list of indices of the variables for which to calculate confidence intervals and p-values. If None, calculate for all variables. This option has no effect if `wald=True`. + + ### Attributes `bse_` From c87868bb40ae312589d8ca9ec295da64f10ad636 Mon Sep 17 00:00:00 2001 From: jzluo <20971593+jzluo@users.noreply.github.com> Date: Sun, 7 Aug 2022 02:58:33 -0400 Subject: [PATCH 5/6] bump version to 0.5.0 --- CHANGELOG.md | 5 ++++- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c2ca818..de5dd3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -## [0.4.0] - 2022-08-01 +## [0.5.0] - 2022-08-07 +### Added +- `test_vars` option to specify the variable(s) for which to calculate PL confidence intervals and p-values. +## [0.4.0] - 2022-08-01 ### Added - Option to use Wald method for computing p-values and confidence intervals instead of LRT and profile likelihood. Set `wald=True` to use ([#11](https://github.com/jzluo/firthlogist/pull/11)). - Tests for `load_sex2()` and `load_endometrial()` ([#9](https://github.com/jzluo/firthlogist/pull/9)). diff --git a/pyproject.toml b/pyproject.toml index df5141d..7cd6c72 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "firthlogist" -version = "0.4.0" +version = "0.5.0" description = "Python implementation of Logistic Regression with Firth's bias reduction" authors = ["Jon Luo "] repository = "https://github.com/jzluo/firthlogist" From 08ccb052659a1850acaa13cc232c8cb6792e6ffd Mon Sep 17 00:00:00 2001 From: jzluo <20971593+jzluo@users.noreply.github.com> Date: Sun, 7 Aug 2022 03:12:08 -0400 Subject: [PATCH 6/6] fixed summary bug --- CHANGELOG.md | 2 ++ firthlogist/firthlogist.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index de5dd3d..c55db46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.5.0] - 2022-08-07 ### Added - `test_vars` option to specify the variable(s) for which to calculate PL confidence intervals and p-values. +### Fixed +- Fixed bug where `.summary(xname)` would append `Intercept` to `xname` such that repeated calls would break. ## [0.4.0] - 2022-08-01 ### Added diff --git a/firthlogist/firthlogist.py b/firthlogist/firthlogist.py index 36de837..0e6a85d 100644 --- a/firthlogist/firthlogist.py +++ b/firthlogist/firthlogist.py @@ -244,6 +244,8 @@ def summary(self, xname=None, tablefmt="simple"): table += f"Log-Likelihood: {round(self.loglik_, 4)}\n" table += f"Newton-Raphson iterations: {self.n_iter_}\n" print(table) + if self.fit_intercept: + xname.pop() return def decision_function(self, X):
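
For context on PATCH 1/6 (a sketch, not part of the patch series): `_profile_likelihood_ci` searches, for each coefficient and each side, for the value at which the penalized log-likelihood falls to the cutoff `LL0 = full_loglik - chi2.ppf(1 - alpha, 1) / 2`. The snippet below just evaluates that drop for the default `alpha`:

```python
# Target level used in _profile_likelihood_ci: each confidence limit is the
# coefficient value at which the penalized log-likelihood has dropped by
# chi2.ppf(1 - alpha, 1) / 2 from its maximum.
from scipy.stats import chi2

alpha = 0.05
drop = chi2.ppf(1 - alpha, 1) / 2
print(drop)  # ~1.9207, so LL0 = full_loglik - 1.9207 for a 95% CI
```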
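And a minimal usage sketch of the `test_vars` option added in this series (not part of the patch; the data are synthetic, and the top-level import path is assumed from the package layout above):

```python
# Restrict profile-likelihood CIs and penalized LRT p-values to a subset of
# variables via test_vars; coefficients are still estimated for all variables,
# and untested entries of ci_ / pvals_ are filled with NaN.
import numpy as np
from firthlogist import FirthLogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))
y = rng.binomial(1, 1 / (1 + np.exp(-(X[:, 2] - X[:, 0]))))

firth = FirthLogisticRegression(test_vars=[0, 2])
firth.fit(X, y)

print(firth.coef_)   # estimates for every variable
print(firth.ci_)     # NaN rows for variables not listed in test_vars
print(firth.pvals_)  # NaN for variables not listed in test_vars
```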