From 32878029dc2e989497a29249c0ed8100ce063cb4 Mon Sep 17 00:00:00 2001 From: John Cherian Date: Thu, 7 Nov 2024 14:46:34 -0500 Subject: [PATCH 1/5] hotfixes for extrapolation + improved house model --- src/elexmodel/client.py | 19 ++++++++++- src/elexmodel/handlers/data/VersionedData.py | 4 +-- .../models/BootstrapElectionModel.py | 33 ++++++++++++++++++- 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py index 30097e31..bdaa412c 100644 --- a/src/elexmodel/client.py +++ b/src/elexmodel/client.py @@ -3,6 +3,8 @@ import numpy as np import pandas as pd +from io import StringIO + from elexmodel.handlers import s3 from elexmodel.handlers.config import ConfigHandler from elexmodel.handlers.data.CombinedData import CombinedDataHandler @@ -340,6 +342,20 @@ def get_estimates( else: versioned_data_handler = None + if self.office != "P": + s3_client = s3.S3CsvUtil(TARGET_BUCKET) + baseline_path = f"{S3_FILE_PATH}/{self.election_id}/data/P/data_county.csv" + results_path = f"{S3_FILE_PATH}/{self.election_id}/results/P/county/current.csv" + predictions_path = f"{S3_FILE_PATH}/{self.election_id}/predictions/P/county/unit_data/current.csv" + pres_baseline = pd.read_csv(StringIO(s3_client.get(baseline_path)), dtype={"geographic_unit_fips": str}) + pres_baseline['baseline_normalized_margin'] = (pres_baseline.baseline_dem - pres_baseline.baseline_gop) / (pres_baseline.baseline_dem + pres_baseline.baseline_gop) + pres_results = pd.read_csv(StringIO(s3_client.get(results_path)), dtype={"geographic_unit_fips": str}) + pres_predictions = pd.read_csv(StringIO(s3_client.get(predictions_path)), dtype={"geographic_unit_fips": str}) + pres_predictions = pres_predictions.merge(pres_results[['geographic_unit_fips', 'results_weights']], on="geographic_unit_fips", how="left") + pres_predictions = pres_predictions.merge(pres_baseline[['geographic_unit_fips', 'baseline_normalized_margin']], on="geographic_unit_fips", how="left") + else: 
+ pres_predictions = None + LOG.info( "Model parameters: \n prediction intervals: %s, percent reporting threshold: %s, \ pi_method: %s, aggregates: %s, model settings: %s", @@ -359,7 +375,8 @@ def get_estimates( self.model = GaussianElectionModel(model_settings=model_settings) elif pi_method == "bootstrap": self.model = BootstrapElectionModel( - model_settings=model_settings, versioned_data_handler=versioned_data_handler + model_settings=model_settings, versioned_data_handler=versioned_data_handler, + pres_predictions=pres_predictions ) minimum_reporting_units_max = 0 diff --git a/src/elexmodel/handlers/data/VersionedData.py b/src/elexmodel/handlers/data/VersionedData.py index 231bfc29..450c1290 100644 --- a/src/elexmodel/handlers/data/VersionedData.py +++ b/src/elexmodel/handlers/data/VersionedData.py @@ -117,7 +117,7 @@ def compute_estimated_margin(df): # because the AP adjusted its model after the fact. We correct for this here. # we recompute the percent_expected_vote using the last reported value as the max perc_expected_vote_corr = np.divide( - results_turnout, results_turnout[-1], out=np.zeros_like(results_turnout), where=results_turnout[-1] != 0 + results_turnout, results_turnout[-1], out=np.zeros_like(results_turnout), where=results_turnout[-1] != 0, casting='unsafe' ) # check if perc_expected_vote_corr is monotone increasing (if not, give up and don't try to estimate a margin) @@ -190,7 +190,7 @@ def compute_estimated_margin(df): est_margins = observed_norm_margin * observed_vote + observed_batch_margin * (percs - observed_vote) est_margins = np.divide( - est_margins, percs, where=percs != 0, out=np.zeros_like(est_margins) + est_margins, percs, where=percs != 0, out=np.zeros_like(est_margins), casting='unsafe' ) # Handle div-by-zero # Return a DataFrame with the multi-index (geographic_unit_fips, perc) diff --git a/src/elexmodel/models/BootstrapElectionModel.py b/src/elexmodel/models/BootstrapElectionModel.py index 00ac36a1..5f845d4f 100644 --- 
a/src/elexmodel/models/BootstrapElectionModel.py +++ b/src/elexmodel/models/BootstrapElectionModel.py @@ -52,7 +52,7 @@ class BootstrapElectionModel(BaseElectionModel): and the epsilons are contest (state/district) level random effects. """ - def __init__(self, model_settings={}, versioned_data_handler=None): + def __init__(self, model_settings={}, versioned_data_handler=None, pres_predictions=None): super().__init__(model_settings) self.B = model_settings.get("B", 500) # number of bootstrap samples self.strata = model_settings.get("strata", ["county_classification"]) # columns to stratify the data by @@ -61,6 +61,10 @@ def __init__(self, model_settings={}, versioned_data_handler=None): "agg_model_hard_threshold", True ) # use sigmoid or hard thresold when calculating agg model self.district_election = model_settings.get("district_election", False) + self.office = model_settings.get("office", None) # office of the election + self.election_id = model_settings.get("election_id", None) # election ID + self.geographic_unit_type = model_settings.get("geographic_unit_type", None) # geographic unit type + self.lambda_ = model_settings.get("lambda_", None) # regularization parameter for OLS # save versioned data for later use @@ -70,6 +74,9 @@ def __init__(self, model_settings={}, versioned_data_handler=None): self.extrapolate_std_method = model_settings.get("extrapolate_std_method", "std") self.max_dist_to_observed = model_settings.get("max_dist_to_observed", 5) + # save presidenial predictions for later use + self.pres_predictions = pres_predictions + # upper and lower bounds for the quantile regression which define the strata distributions # these make sure that we can control the worst cases for the distributions in case we # haven't seen enough data ayet @@ -96,6 +103,8 @@ def __init__(self, model_settings={}, versioned_data_handler=None): self.features, self.fixed_effects, states_for_separate_model=self.states_for_separate_model ) + self.correct_from_presidential = 
model_settings.get("correct_from_presidential", self.office in ("H", "S")) + self.seed = model_settings.get("seed", 0) self.rng = np.random.default_rng(seed=self.seed) # used for sampling self.ran_bootstrap = False @@ -1283,6 +1292,28 @@ def compute_bootstrap_errors( extrap_filter ] + if self.correct_from_presidential: + # self.pres_predictions['geographic_unit_fips'] = self.pres_predictions.geographic_unit_fips.apply(lambda x: x.zfill(5)) + nonreporting_units['geographic_unit_fips_p'] = nonreporting_units.geographic_unit_fips.apply(lambda x: x.split("_")[1]) + nonreporting_units = nonreporting_units.merge(self.pres_predictions, left_on="geographic_unit_fips_p", right_on="geographic_unit_fips", how="left", suffixes=("", "_pres")) + + # adjust results_normalized_margin_pres to account for split counties + + nonreporting_units["margin_adj"] = ( + nonreporting_units.baseline_normalized_margin - nonreporting_units.baseline_normalized_margin_pres + ) + + nonreporting_units["results_normalized_margin_pres"] = nonreporting_units.results_margin_pres / nonreporting_units.results_weights_pres + nonreporting_units.margin_adj + nonreporting_units["pred_normalized_margin_pres"] = nonreporting_units.pred_margin / nonreporting_units.pred_turnout + nonreporting_units.margin_adj + + nonreporting_units["pred_normalized_margin"] = np.mean(y_test_pred_B.clip(min=y_partial_reporting_lower, max=y_partial_reporting_upper), axis=1) + + nonreporting_units["margin_gap"] = nonreporting_units.results_normalized_margin - nonreporting_units.results_normalized_margin_pres + + nonreporting_units["pred_normalized_margin_new"] = nonreporting_units.pred_normalized_margin_pres + nonreporting_units.margin_gap + adjustment = nonreporting_units["pred_normalized_margin_new"].values - nonreporting_units["pred_normalized_margin"].values + y_test_pred_B[~np.isnan(adjustment)] += adjustment[~np.isnan(adjustment)].reshape(-1,1) + y_test_pred_B = y_test_pred_B.clip(min=y_partial_reporting_lower, 
max=y_partial_reporting_upper) # \tilde{y_i}^{b} * \tilde{z_i}^{b} From f4931fb5939771cca1401b6bc14eacf37108c2cc Mon Sep 17 00:00:00 2001 From: John Cherian Date: Thu, 7 Nov 2024 14:49:25 -0500 Subject: [PATCH 2/5] fixes for testing --- src/elexmodel/client.py | 28 ++++++---- src/elexmodel/handlers/data/VersionedData.py | 8 ++- .../models/BootstrapElectionModel.py | 52 +++++++++++++------ 3 files changed, 61 insertions(+), 27 deletions(-) diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py index bdaa412c..68ab90f6 100644 --- a/src/elexmodel/client.py +++ b/src/elexmodel/client.py @@ -1,10 +1,9 @@ from collections import defaultdict +from io import StringIO import numpy as np import pandas as pd -from io import StringIO - from elexmodel.handlers import s3 from elexmodel.handlers.config import ConfigHandler from elexmodel.handlers.data.CombinedData import CombinedDataHandler @@ -342,17 +341,27 @@ def get_estimates( else: versioned_data_handler = None - if self.office != "P": + if model_parameters.get("pres_corr", False): s3_client = s3.S3CsvUtil(TARGET_BUCKET) baseline_path = f"{S3_FILE_PATH}/{self.election_id}/data/P/data_county.csv" results_path = f"{S3_FILE_PATH}/{self.election_id}/results/P/county/current.csv" predictions_path = f"{S3_FILE_PATH}/{self.election_id}/predictions/P/county/unit_data/current.csv" pres_baseline = pd.read_csv(StringIO(s3_client.get(baseline_path)), dtype={"geographic_unit_fips": str}) - pres_baseline['baseline_normalized_margin'] = (pres_baseline.baseline_dem - pres_baseline.baseline_gop) / (pres_baseline.baseline_dem + pres_baseline.baseline_gop) + pres_baseline["baseline_normalized_margin"] = (pres_baseline.baseline_dem - pres_baseline.baseline_gop) / ( + pres_baseline.baseline_dem + pres_baseline.baseline_gop + ) pres_results = pd.read_csv(StringIO(s3_client.get(results_path)), dtype={"geographic_unit_fips": str}) - pres_predictions = pd.read_csv(StringIO(s3_client.get(predictions_path)), dtype={"geographic_unit_fips": 
str}) - pres_predictions = pres_predictions.merge(pres_results[['geographic_unit_fips', 'results_weights']], on="geographic_unit_fips", how="left") - pres_predictions = pres_predictions.merge(pres_baseline[['geographic_unit_fips', 'baseline_normalized_margin']], on="geographic_unit_fips", how="left") + pres_predictions = pd.read_csv( + StringIO(s3_client.get(predictions_path)), dtype={"geographic_unit_fips": str} + ) + pres_predictions = pres_predictions.merge( + pres_results[["geographic_unit_fips", "results_weights"]], on="geographic_unit_fips", how="left" + ) + pres_predictions = pres_predictions.merge( + pres_baseline[["geographic_unit_fips", "baseline_normalized_margin"]], + on="geographic_unit_fips", + how="left", + ) else: pres_predictions = None @@ -375,8 +384,9 @@ def get_estimates( self.model = GaussianElectionModel(model_settings=model_settings) elif pi_method == "bootstrap": self.model = BootstrapElectionModel( - model_settings=model_settings, versioned_data_handler=versioned_data_handler, - pres_predictions=pres_predictions + model_settings=model_settings, + versioned_data_handler=versioned_data_handler, + pres_predictions=pres_predictions, ) minimum_reporting_units_max = 0 diff --git a/src/elexmodel/handlers/data/VersionedData.py b/src/elexmodel/handlers/data/VersionedData.py index 450c1290..b76a55da 100644 --- a/src/elexmodel/handlers/data/VersionedData.py +++ b/src/elexmodel/handlers/data/VersionedData.py @@ -117,7 +117,11 @@ def compute_estimated_margin(df): # because the AP adjusted its model after the fact. We correct for this here. 
# we recompute the percent_expected_vote using the last reported value as the max perc_expected_vote_corr = np.divide( - results_turnout, results_turnout[-1], out=np.zeros_like(results_turnout), where=results_turnout[-1] != 0, casting='unsafe' + results_turnout, + results_turnout[-1], + out=np.zeros_like(results_turnout), + where=results_turnout[-1] != 0, + casting="unsafe", ) # check if perc_expected_vote_corr is monotone increasing (if not, give up and don't try to estimate a margin) @@ -190,7 +194,7 @@ def compute_estimated_margin(df): est_margins = observed_norm_margin * observed_vote + observed_batch_margin * (percs - observed_vote) est_margins = np.divide( - est_margins, percs, where=percs != 0, out=np.zeros_like(est_margins), casting='unsafe' + est_margins, percs, where=percs != 0, out=np.zeros_like(est_margins), casting="unsafe" ) # Handle div-by-zero # Return a DataFrame with the multi-index (geographic_unit_fips, perc) diff --git a/src/elexmodel/models/BootstrapElectionModel.py b/src/elexmodel/models/BootstrapElectionModel.py index 5f845d4f..0ff01b48 100644 --- a/src/elexmodel/models/BootstrapElectionModel.py +++ b/src/elexmodel/models/BootstrapElectionModel.py @@ -61,9 +61,9 @@ def __init__(self, model_settings={}, versioned_data_handler=None, pres_predicti "agg_model_hard_threshold", True ) # use sigmoid or hard thresold when calculating agg model self.district_election = model_settings.get("district_election", False) - self.office = model_settings.get("office", None) # office of the election - self.election_id = model_settings.get("election_id", None) # election ID - self.geographic_unit_type = model_settings.get("geographic_unit_type", None) # geographic unit type + self.office = model_settings.get("office", None) # office of the election + self.election_id = model_settings.get("election_id", None) # election ID + self.geographic_unit_type = model_settings.get("geographic_unit_type", None) # geographic unit type self.lambda_ = 
model_settings.get("lambda_", None) # regularization parameter for OLS @@ -76,6 +76,7 @@ def __init__(self, model_settings={}, versioned_data_handler=None, pres_predicti # save presidenial predictions for later use self.pres_predictions = pres_predictions + self.correct_from_presidential = model_settings.get("correct_from_presidential", self.office in ("H", "S")) # upper and lower bounds for the quantile regression which define the strata distributions # these make sure that we can control the worst cases for the distributions in case we @@ -103,8 +104,6 @@ def __init__(self, model_settings={}, versioned_data_handler=None, pres_predicti self.features, self.fixed_effects, states_for_separate_model=self.states_for_separate_model ) - self.correct_from_presidential = model_settings.get("correct_from_presidential", self.office in ("H", "S")) - self.seed = model_settings.get("seed", 0) self.rng = np.random.default_rng(seed=self.seed) # used for sampling self.ran_bootstrap = False @@ -1293,9 +1292,16 @@ def compute_bootstrap_errors( ] if self.correct_from_presidential: - # self.pres_predictions['geographic_unit_fips'] = self.pres_predictions.geographic_unit_fips.apply(lambda x: x.zfill(5)) - nonreporting_units['geographic_unit_fips_p'] = nonreporting_units.geographic_unit_fips.apply(lambda x: x.split("_")[1]) - nonreporting_units = nonreporting_units.merge(self.pres_predictions, left_on="geographic_unit_fips_p", right_on="geographic_unit_fips", how="left", suffixes=("", "_pres")) + nonreporting_units["geographic_unit_fips_p"] = nonreporting_units.geographic_unit_fips.apply( + lambda x: x.split("_")[1] + ) + nonreporting_units = nonreporting_units.merge( + self.pres_predictions, + left_on="geographic_unit_fips_p", + right_on="geographic_unit_fips", + how="left", + suffixes=("", "_pres"), + ) # adjust results_normalized_margin_pres to account for split counties @@ -1303,17 +1309,31 @@ def compute_bootstrap_errors( nonreporting_units.baseline_normalized_margin - 
nonreporting_units.baseline_normalized_margin_pres ) - nonreporting_units["results_normalized_margin_pres"] = nonreporting_units.results_margin_pres / nonreporting_units.results_weights_pres + nonreporting_units.margin_adj - nonreporting_units["pred_normalized_margin_pres"] = nonreporting_units.pred_margin / nonreporting_units.pred_turnout + nonreporting_units.margin_adj + nonreporting_units["results_normalized_margin_pres"] = ( + nonreporting_units.results_margin_pres / nonreporting_units.results_weights_pres + + nonreporting_units.margin_adj + ) + nonreporting_units["pred_normalized_margin_pres"] = ( + nonreporting_units.pred_margin / nonreporting_units.pred_turnout + nonreporting_units.margin_adj + ) + + nonreporting_units["pred_normalized_margin"] = np.mean( + y_test_pred_B.clip(min=y_partial_reporting_lower, max=y_partial_reporting_upper), axis=1 + ) - nonreporting_units["pred_normalized_margin"] = np.mean(y_test_pred_B.clip(min=y_partial_reporting_lower, max=y_partial_reporting_upper), axis=1) + nonreporting_units["margin_gap"] = ( + nonreporting_units.results_normalized_margin - nonreporting_units.results_normalized_margin_pres + ) - nonreporting_units["margin_gap"] = nonreporting_units.results_normalized_margin - nonreporting_units.results_normalized_margin_pres + nonreporting_units["pred_normalized_margin_new"] = ( + nonreporting_units.pred_normalized_margin_pres + nonreporting_units.margin_gap + ) + adjustment = ( + nonreporting_units["pred_normalized_margin_new"].values + - nonreporting_units["pred_normalized_margin"].values + ) + y_test_pred_B[~np.isnan(adjustment)] += adjustment[~np.isnan(adjustment)].reshape(-1, 1) - nonreporting_units["pred_normalized_margin_new"] = nonreporting_units.pred_normalized_margin_pres + nonreporting_units.margin_gap - adjustment = nonreporting_units["pred_normalized_margin_new"].values - nonreporting_units["pred_normalized_margin"].values - y_test_pred_B[~np.isnan(adjustment)] += 
adjustment[~np.isnan(adjustment)].reshape(-1,1) - y_test_pred_B = y_test_pred_B.clip(min=y_partial_reporting_lower, max=y_partial_reporting_upper) # \tilde{y_i}^{b} * \tilde{z_i}^{b} From 3b19d37c945a4c0498dc80c09b564c28e5dfd871 Mon Sep 17 00:00:00 2001 From: John Cherian Date: Thu, 7 Nov 2024 14:54:13 -0500 Subject: [PATCH 3/5] changed some defaults --- src/elexmodel/client.py | 2 +- src/elexmodel/models/BootstrapElectionModel.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py index bba89462..7895abe6 100644 --- a/src/elexmodel/client.py +++ b/src/elexmodel/client.py @@ -341,7 +341,7 @@ def get_estimates( else: versioned_data_handler = None - if model_parameters.get("pres_corr", False): + if model_parameters.get("correct_from_presidential", False): s3_client = s3.S3CsvUtil(TARGET_BUCKET) baseline_path = f"{S3_FILE_PATH}/{self.election_id}/data/P/data_county.csv" results_path = f"{S3_FILE_PATH}/{self.election_id}/results/P/county/current.csv" diff --git a/src/elexmodel/models/BootstrapElectionModel.py b/src/elexmodel/models/BootstrapElectionModel.py index 0ff01b48..2d611b2b 100644 --- a/src/elexmodel/models/BootstrapElectionModel.py +++ b/src/elexmodel/models/BootstrapElectionModel.py @@ -76,7 +76,7 @@ def __init__(self, model_settings={}, versioned_data_handler=None, pres_predicti # save presidenial predictions for later use self.pres_predictions = pres_predictions - self.correct_from_presidential = model_settings.get("correct_from_presidential", self.office in ("H", "S")) + self.correct_from_presidential = model_settings.get("correct_from_presidential", False) # upper and lower bounds for the quantile regression which define the strata distributions # these make sure that we can control the worst cases for the distributions in case we From d2f9408e001c00288bda94f307edf993d4cbe5b2 Mon Sep 17 00:00:00 2001 From: lennybronner Date: Thu, 7 Nov 2024 15:05:28 -0500 Subject: [PATCH 4/5] removed some 
unnecessary stuff --- src/elexmodel/models/BootstrapElectionModel.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/elexmodel/models/BootstrapElectionModel.py b/src/elexmodel/models/BootstrapElectionModel.py index 2d611b2b..59f3e1f5 100644 --- a/src/elexmodel/models/BootstrapElectionModel.py +++ b/src/elexmodel/models/BootstrapElectionModel.py @@ -61,9 +61,6 @@ def __init__(self, model_settings={}, versioned_data_handler=None, pres_predicti "agg_model_hard_threshold", True ) # use sigmoid or hard thresold when calculating agg model self.district_election = model_settings.get("district_election", False) - self.office = model_settings.get("office", None) # office of the election - self.election_id = model_settings.get("election_id", None) # election ID - self.geographic_unit_type = model_settings.get("geographic_unit_type", None) # geographic unit type self.lambda_ = model_settings.get("lambda_", None) # regularization parameter for OLS From 3de3df80205a3d36cc4f16638d792ba50ead5a6f Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Thu, 7 Nov 2024 15:17:54 -0500 Subject: [PATCH 5/5] Preparing for version 2.2.5 --- CHANGELOG.md | 6 ++++++ setup.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47c551ca..94c14747 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +# 2.2.5 (11/7/2024) +- fix: hot fixes for the extrapolation step + using the presidential margins to infer a ticket splitting estimate in each house / senate race [#140](https://github.com/washingtonpost/elex-live-model/pull/140) + +# 2.2.4 (11/5/2024) +- fix: partial reporting bug [#138](https://github.com/washingtonpost/elex-live-model/pull/138) + # 2.2.3 (11/5/2024) - chore: adding additional log [#135](https://github.com/washingtonpost/elex-live-model/pull/135) diff --git a/setup.py b/setup.py index 5380de4d..7c4fc8b7 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ LONG_DESCRIPTION = f.read() # The full
version, including alpha/beta/rc tags -RELEASE = "2.2.3" +RELEASE = "2.2.5" # The short X.Y version VERSION = ".".join(RELEASE.split(".")[:2])