diff --git a/VB/VB_imputation.ipynb b/VB/VB_imputation.ipynb deleted file mode 100644 index 4696bb7..0000000 --- a/VB/VB_imputation.ipynb +++ /dev/null @@ -1,770 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This script imputes Veteran's Benefits (VB) recipients dollar benefit amount to match the aggregates with United States Department of Veterans Affairs (USVA) statistics for VB. In this current version, we used 2014 CPS data and USVA FY2014 and FY2015 annual reports on VB. Please refer to the documentation in the same folder for more details on methodology and assumptions. The output this script is an individual level dataset that contains CPS personal level ID (PERIDNUM), individual participation indicator (vb_participation, 0 - not a recipient, 1 - current recipient on file, 2 - imputed recipient), and benefit amount.\n", - "\n", - "Input: 2014 CPS (cpsmar2014t.csv), number of recipients and their benefits amount by state in 2014 (VB_administrative.csv)\n", - "\n", - "Output: VB_Imputation.csv\n", - "\n", - "Additional Source links: https://www.va.gov/vetdata/ " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from pandas import DataFrame\n", - "import numpy as np\n", - "import random\n", - "import statsmodels.discrete.discrete_model as sm\n", - "import matplotlib.pyplot as plt " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Variables we use in Veteran's Benefits" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/andersonfrailey/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (5,23,24,29,83,265,282) have mixed types. Specify dtype option on import or set low_memory=False.\n", - " interactivity=interactivity, compiler=compiler, result=result)\n" - ] - } - ], - "source": [ - "CPS_dataset = pd.read_csv('../../Dropbox/asec2014_pubuse.csv')\n", - "columns_to_keep = ['hvet_yn','hvetval','fvetval','finc_vet','sur_yn','srvs_val','sur_sc1','sur_val1','tsurval1','sur_sc2',\n", - " 'sur_val2','tsurval2','vet_val','vet_yn','vet_typ1','vet_typ2','vet_typ3','vet_typ4','vet_typ5','hsur_yn',\n", - " 'hsurval','fsurval','finc_sur','gestfips','marsupwt','peafever','champ','a_age','wsal_val','semp_val','frse_val',\n", - " 'pedisdrs', 'pedisear', 'pediseye', 'pedisout', 'pedisphy', 'pedisrem','a_sex','peridnum','fh_seq','wc_yn', 'ss_yn', \n", - " 'dis_yn', 'hed_yn', 'hcsp_yn', 'hfdval','paw_yn', 'uc_yn', 'mcaid','mcare']\n", - "CPS_dataset = CPS_dataset[columns_to_keep]\n", - "CPS_dataset.to_csv('VB.csv', columns=columns_to_keep, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "CPS_dataset = pd.read_csv('VB.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# VB value\n", - "vbvalue = pd.to_numeric(np.where(CPS_dataset.vet_val!= 'None or not in universe', CPS_dataset.vet_val, 0))\n", - "# VB indicator\n", - "indicator = pd.to_numeric(np.where(CPS_dataset.vet_yn== 'Yes', 1, 0))\n", - "# On duty before\n", - "active = pd.to_numeric(np.where(CPS_dataset.peafever== 'Yes', 1, 0))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "VB = DataFrame(vbvalue.transpose())\n", - "VB.columns = ['vbvalue']" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# 5 types of VB\n", - "VB['kind'] = pd.to_numeric(np.where(CPS_dataset.vet_typ1== 'Yes', 1, 0))\n", - "VB.kind = pd.to_numeric(np.where(CPS_dataset.vet_typ2== 'Yes', 2, VB.kind))\n", - "VB.kind = pd.to_numeric(np.where(CPS_dataset.vet_typ3== 'Yes', 3, VB.kind))\n", - "VB.kind = pd.to_numeric(np.where(CPS_dataset.vet_typ4== 'Yes', 4, VB.kind))\n", - "VB.kind = pd.to_numeric(np.where(CPS_dataset.vet_typ5== 'Yes', 5, VB.kind))" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "VB['indicator'] = indicator\n", - "VB['active'] = active\n", - "VB['marsupwt'] = CPS_dataset.marsupwt\n", - "VB['fh_seq'] = CPS_dataset.fh_seq\n", - "VB['gestfips'] = CPS_dataset.gestfips\n", - "VB['peridnum'] = CPS_dataset.peridnum" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Prepare income information\n", - "wage = pd.to_numeric(np.where(CPS_dataset.wsal_val!= 'None or not in universe', CPS_dataset.wsal_val, 0))\n", - "self_employed1 = pd.to_numeric(np.where(CPS_dataset.semp_val!= 'None or not in universe', CPS_dataset.semp_val, 0))\n", - "self_employed2 = pd.to_numeric(np.where(CPS_dataset.frse_val!= 'None or not in universe', CPS_dataset.frse_val, 0))\n", - "income = wage + self_employed1 + self_employed2\n", - "VB['income'] = income" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Prepare age information\n", - "CPS_dataset.a_age = np.where(CPS_dataset.a_age == \"80-84 years of age\",\n", - " random.randrange(80, 84),\n", - " CPS_dataset.a_age)\n", - "CPS_dataset.a_age = np.where(CPS_dataset.a_age == \"85+ years of age\",\n", - " random.randrange(85, 95),\n", - " CPS_dataset.a_age)\n", - "VB['a_age'] = pd.to_numeric(CPS_dataset.a_age)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Prepare gender inforamtion, 0 for male, 1 for female\n", - "VB['sex'] = pd.to_numeric(np.where(CPS_dataset.a_sex == 'Male', 0, 1))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Prepare disabilty information, 6 types\n", - "d1 = np.where(CPS_dataset.pedisdrs == 'Yes',1,0)\n", - "d2 = np.where(CPS_dataset.pedisear == 'Yes',1,0)\n", - "d3 = np.where(CPS_dataset.pediseye == 'Yes',1,0)\n", - "d4 = np.where(CPS_dataset.pedisout == 'Yes',1,0)\n", - "d5 = np.where(CPS_dataset.pedisphy == 'Yes',1,0)\n", - "d6 = np.where(CPS_dataset.pedisrem == 'Yes',1,0)\n", - "VB['d1'] = d1\n", - "VB['d2'] = d2\n", - "VB['d3'] = d3\n", - "VB['d4'] = d4\n", - "VB['d5'] = d5\n", - "VB['d6'] = d6" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Limitation of potentail recipients\n", - "familyactive = VB.groupby('fh_seq')['active'].sum()\n", - "factive = DataFrame(familyactive)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Family active index\n", - "VB['familyactive'] = np.zeros(len(VB))\n", - "for number in VB.fh_seq:\n", - " VB.familyactive= np.where(VB.fh_seq==number,factive.active[factive.index==number],VB.familyactive)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add dummy for program participation " - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "VB['paw_yn'] = np.where(CPS_dataset.paw_yn=='Yes', 1, 0)\n", - "VB['wc_yn'] = np.where(CPS_dataset.wc_yn=='Yes', 1, 0)\n", - "VB['uc_yn'] = np.where(CPS_dataset.uc_yn=='Yes', 1, 0)\n", - "VB['ss_yn'] = np.where(CPS_dataset.ss_yn=='Yes', 1, 0)\n", - "VB['sur_yn'] = np.where(CPS_dataset.sur_yn=='Yes', 1, 0)\n", - "VB['dis_yn'] = np.where(CPS_dataset.dis_yn=='Yes', 1, 0)\n", - "VB['hed_yn'] = np.where(CPS_dataset.hed_yn=='Yes', 1, 0)\n", - "VB['hcsp_yn'] = np.where(CPS_dataset.hcsp_yn=='Yes', 1, 0)\n", - "VB['hfdval'] = np.where(CPS_dataset.hfdval!='Not in universe', 1, 0)\n", - "VB['mcare'] = np.where(CPS_dataset.mcare=='Yes', 1, 0)\n", - "VB['mcaid'] = np.where(CPS_dataset.mcaid=='Yes', 1, 0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Regression model" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Optimization terminated successfully.\n", - " Current function value: 0.037499\n", - " Iterations 10\n" - ] - } - ], - "source": [ - "dta = VB\n", - "dta['intercept'] = np.ones(len(dta))\n", - "model = sm.Logit(endog=dta.indicator, exog=dta[['a_age','sex','income','d1','d2','d3','d4','d5','d6',\n", - " 'active','paw_yn','wc_yn','ss_yn', 'uc_yn', 'sur_yn',\n", - " 'hed_yn','hcsp_yn', 'hfdval','mcaid','mcare','intercept']]).fit()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "3517346.2599999965" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dta.marsupwt[(dta.indicator==1)].sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Logit Regression Results \n", - "==============================================================================\n", - "Dep. Variable: indicator No. Observations: 139415\n", - "Model: Logit Df Residuals: 139394\n", - "Method: MLE Df Model: 20\n", - "Date: Wed, 05 Jul 2017 Pseudo R-squ.: 0.3442\n", - "Time: 10:26:49 Log-Likelihood: -5227.9\n", - "converged: True LL-Null: -7971.7\n", - " LLR p-value: 0.000\n", - "==============================================================================\n", - " coef std err z P>|z| [0.025 0.975]\n", - "------------------------------------------------------------------------------\n", - "a_age 0.0083 0.002 3.501 0.000 0.004 0.013\n", - "sex -0.1346 0.086 -1.568 0.117 -0.303 0.034\n", - "income -6.558e-07 6.7e-07 -0.978 0.328 -1.97e-06 6.58e-07\n", - "d1 -0.0194 0.170 -0.114 0.909 -0.353 0.314\n", - "d2 0.3551 0.097 3.678 0.000 0.166 0.544\n", - "d3 -0.0998 0.170 -0.587 0.557 -0.433 0.233\n", - "d4 -0.0673 0.146 -0.462 0.644 -0.353 0.218\n", - "d5 0.6402 0.095 6.723 0.000 0.454 0.827\n", - "d6 0.3930 0.124 3.177 0.001 0.151 0.635\n", - "active 3.8318 0.084 45.564 0.000 3.667 3.997\n", - "paw_yn 0.2974 0.532 0.559 0.576 -0.746 1.341\n", - "wc_yn 0.9216 0.269 3.429 0.001 0.395 1.448\n", - "ss_yn 0.6252 0.115 5.434 0.000 0.400 0.851\n", - "uc_yn 0.0605 0.181 0.335 0.738 -0.294 0.415\n", - "sur_yn 0.7038 0.201 3.505 0.000 0.310 1.097\n", - "hed_yn 0.6845 0.105 6.494 0.000 0.478 0.891\n", - "hcsp_yn -0.1381 0.173 -0.797 0.425 -0.478 0.201\n", - "hfdval -0.2441 0.124 -1.969 0.049 -0.487 -0.001\n", - "mcaid -0.4411 0.135 -3.275 0.001 -0.705 -0.177\n", - "mcare -0.4661 0.120 -3.870 0.000 -0.702 -0.230\n", - "intercept -6.3620 0.119 -53.614 0.000 -6.595 -6.129\n", - "==============================================================================\n" - ] - } - ], - "source": [ - "print model.summary()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "probs = model.predict()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import administrative data" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "admin = pd.read_csv('VB_administrative.csv',\n", - " dtype={ 'Total VB population': np.float,'Average benefits': np.float, 'Total benefits': np.float, 'Medical care': np.float})\n", - "admin.index = admin.Fips" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# CPS total benefits and Administrative total benefits\n", - "state_benefit = {}\n", - "state_recipients = {}\n", - "for state in admin.Fips:\n", - " this_state = (VB.gestfips==state)\n", - " CPS_totalb = (VB.vbvalue * VB.marsupwt)[this_state].sum()\n", - " admin_totalb = admin['Total benefits'][state] \n", - " CPS_totaln = VB.marsupwt[this_state&VB.indicator==1].sum()\n", - " admin_totaln = admin[\"Total veteran receving benefits\"][state]\n", - "\n", - " temp = [admin.State[state], CPS_totalb, admin_totalb, CPS_totaln, admin_totaln]\n", - " state_benefit[state] = temp\n", - " \n", - "pre_augment_benefit = DataFrame(state_benefit).transpose()\n", - "pre_augment_benefit.columns = ['State', 'CPS total benefits','Admin total benefits',\n", - " 'CPS total individual recipients','Admin total individual recipients']" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "#pre_augment_benefit.to_csv('C:\\Users\\wangy\\OneDrive\\Documents\\BasicIncomeProject\\VB-pre-blow-up.csv')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Imputation" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# caculate difference of SNAP stats and CPS aggregates on recipients number\n", - "# by state\n", - "diff = {'Fips':[],'Difference in Population':[],'Mean Benefit':[],'CPS Population':[],'VA Population':[]}\n", - "diff['Fips'] = admin.Fips\n", - "current = (VB.indicator==1)\n", - "for FIPS in admin.Fips:\n", - " this_state = (VB.gestfips==FIPS)\n", - " current_tots = VB.marsupwt[current&this_state].sum()\n", - " valid_num = VB.marsupwt[current&this_state].sum() + 0.0000001\n", - " current_mean = ((VB.vbvalue * VB.marsupwt)[current&this_state].sum())/valid_num\n", - " diff['CPS Population'].append(current_tots)\n", - " diff['VA Population'].append(float(admin[\"Total veteran receving benefits\"][admin.Fips == FIPS]))\n", - " diff['Difference in Population'].append(float(admin[\"Total veteran receving benefits\"][admin.Fips == FIPS])- current_tots)\n", - " diff['Mean Benefit'].append(current_mean)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "d = DataFrame(diff)\n", - "d = d[['Fips', 'Mean Benefit', 'Difference in Population', 'CPS Population', 'VA Population']]\n", - "#d.to_csv('recipients.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "('we need to impute', 50892.110000000001, 'for state', 1)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/andersonfrailey/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:26: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - "/Users/andersonfrailey/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:27: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "('Method1: regression gives', 49596.91)\n", - "('we need to impute', 5614.7399999999998, 'for state', 2)\n", - "('Method1: regression gives', 5809.25)\n", - "('we need to impute', 49924.690000000002, 'for state', 4)\n", - "('Method1: regression gives', 49614.979999999996)\n", - "('we need to impute', 13846.449999999997, 'for state', 5)\n", - "('Method1: regression gives', 14502.18)\n", - "('we need to impute', 127801.64999999997, 'for state', 6)\n", - "('Method1: regression gives', 127886.86)\n", - "('we need to impute', 59113.790000000008, 'for state', 8)\n", - "('Method1: regression gives', 60369.78)\n", - "('we need to impute', 10320.050000000003, 'for state', 9)\n", - "('Method1: regression gives', 9792.18)\n", - "('we need to impute', 2905.8199999999997, 'for state', 10)\n", - "('Method1: regression gives', 3056.7299999999996)\n", - "('we need to impute', 2460.0299999999997, 'for state', 11)\n", - "('Method1: regression gives', 2634.59)\n", - "('we need to impute', 101702.34999999992, 'for state', 12)\n", - "('Method1: regression gives', 103789.57999999999)\n", - "('we need to impute', 29402.470000000001, 'for state', 13)\n", - "('Method1: regression gives', 30182.390000000003)\n", - "('we need to impute', 9486.9699999999975, 'for state', 15)\n", - "('Method1: regression gives', 9585.17)\n", - "('we need to impute', 13564.070000000002, 'for state', 16)\n", - "('Method1: regression gives', 13372.16)\n", - "('we need to impute', 26999.789999999979, 'for state', 17)\n", - "('Method1: regression gives', 26995.769999999997)\n", - "('we need to impute', 33077.5, 'for state', 18)\n", - "('Method1: regression gives', 33307.77)\n", - "('we need to impute', 7120.7300000000032, 'for state', 19)\n", - "('Method1: regression gives', 7294.28)\n", - "('we need to impute', 21508.27, 'for state', 20)\n", - "('Method1: regression gives', 21285.899999999998)\n", - "('we need to impute', 23579.459999999992, 'for state', 21)\n", - "('Method1: regression gives', 22644.83)\n", - "('we need to impute', 41380.729999999996, 'for state', 22)\n", - "('Method1: regression gives', 39528.8)\n", - "('we need to impute', 4948.4399999999987, 'for state', 23)\n", - "('Method1: regression gives', 4810.22)\n", - "('we need to impute', 37709.610000000008, 'for state', 24)\n", - "('Method1: regression gives', 38517.45)\n", - "('we need to impute', 38146.360000000001, 'for state', 25)\n", - "('Method1: regression gives', 40724.86)\n", - "('we need to impute', 7169.3100000000122, 'for state', 26)\n", - "('Method1: regression gives', 8000.469999999999)\n", - "('we need to impute', 32742.259999999995, 'for state', 27)\n", - "('Method1: regression gives', 33090.67)\n", - "('we need to impute', 22767.23, 'for state', 28)\n", - "('Method1: regression gives', 22824.29)\n", - "('we need to impute', 45153.720000000001, 'for state', 29)\n", - "('Method1: regression gives', 44394.02)\n", - "('we need to impute', 6049.9000000000051, 'for state', 30)\n", - "('Method1: regression gives', 5934.4400000000005)\n", - "('we need to impute', 7425.1000000000058, 'for state', 31)\n", - "('Method1: regression gives', 7299.82)\n", - "('we need to impute', 12824.639999999999, 'for state', 32)\n", - "('Method1: regression gives', 12192.15)\n", - "('we need to impute', 3068.4200000000019, 'for state', 33)\n", - "('Method1: regression gives', 3216.0500000000006)\n", - "('we need to impute', 16386.739999999998, 'for state', 34)\n", - "('Method1: regression gives', 14133.74)\n", - "('we need to impute', 6395.4400000000023, 'for state', 35)\n", - "('Method1: regression gives', 6656.58)\n", - "('we need to impute', -34366.369999999995, 'for state', 36)\n", - "('we need to impute', 81345.410000000018, 'for state', 37)\n", - "('Method1: regression gives', 78839.48999999999)\n", - "('we need to impute', 4947.9900000000016, 'for state', 38)\n", - "('Method1: regression gives', 5220.02)\n", - "('we need to impute', 66836.059999999969, 'for state', 39)\n", - "('Method1: regression gives', 66512.79000000001)\n", - "('we need to impute', 48604.509999999995, 'for state', 40)\n", - "('Method1: regression gives', 48926.75000000001)\n", - "('we need to impute', -3106.9700000000012, 'for state', 41)\n", - "('we need to impute', 82072.429999999993, 'for state', 42)\n", - "('Method1: regression gives', 82542.39)\n", - "('we need to impute', -2158.5999999999985, 'for state', 44)\n", - "('we need to impute', 61496.289999999994, 'for state', 45)\n", - "('Method1: regression gives', 61500.06999999999)\n", - "('we need to impute', 6519.2500000000018, 'for state', 46)\n", - "('Method1: regression gives', 6564.29)\n", - "('we need to impute', 37959.360000000015, 'for state', 47)\n", - "('Method1: regression gives', 37587.56999999999)\n", - "('we need to impute', 86930.530000000028, 'for state', 48)\n", - "('Method1: regression gives', 88105.59)\n", - "('we need to impute', 13267.260000000002, 'for state', 49)\n", - "('Method1: regression gives', 12859.44)\n", - "('we need to impute', 1716.6400000000003, 'for state', 50)\n", - "('Method1: regression gives', 1849.3600000000001)\n", - "('we need to impute', 76446.059999999983, 'for state', 51)\n", - "('Method1: regression gives', 77029.02000000002)\n", - "('we need to impute', 724.23999999996158, 'for state', 53)\n", - "('Method1: regression gives', 4683.63)\n", - "('we need to impute', 20038.079999999998, 'for state', 54)\n", - "('Method1: regression gives', 20825.55)\n", - "('we need to impute', 29038.940000000002, 'for state', 55)\n", - "('Method1: regression gives', 28632.210000000003)\n", - "('we need to impute', 718.78999999999905, 'for state', 56)\n", - "('Method1: regression gives', 746.13)\n" - ] - } - ], - "source": [ - "VB['impute'] = np.zeros(len(VB))\n", - "VB['vb_impute'] = np.zeros(len(VB))\n", - "\n", - "non_current = (VB.indicator==0)\n", - "current = (VB.indicator==1)\n", - "random.seed()\n", - "\n", - "for FIPS in admin.Fips:\n", - " \n", - " print ('we need to impute', d['Difference in Population'][FIPS], 'for state', FIPS)\n", - " \n", - " if d['Difference in Population'][FIPS] < 0:\n", - " continue\n", - " else:\n", - " this_state = (VB.gestfips==FIPS)\n", - " not_imputed = (VB.impute==0)\n", - " pool_index = VB[this_state¬_imputed&non_current].index\n", - " pool = DataFrame({'weight': VB.marsupwt[pool_index], 'prob': probs[pool_index]},\n", - " index=pool_index)\n", - " pool = pool.sort_values(by='prob', ascending=False)\n", - " pool['cumsum_weight'] = pool['weight'].cumsum()\n", - " pool['distance'] = abs(pool.cumsum_weight-d['Difference in Population'][FIPS])\n", - " min_index = pool.sort_values(by='distance')[:1].index\n", - " min_weight = int(pool.loc[min_index].cumsum_weight)\n", - " pool['impute'] = np.where(pool.cumsum_weight<=min_weight+10 , 1, 0)\n", - " VB.impute[pool.index[pool['impute']==1]] = 1\n", - " VB.vb_impute[pool.index[pool['impute']==1]] = admin['Average benefits'][FIPS]\n", - "\n", - " print ('Method1: regression gives', \n", - " VB.marsupwt[(VB.impute==1)&this_state].sum()) " - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "#Adjustment ratio\n", - "results = {}\n", - "\n", - "imputed = (VB.impute == 1)\n", - "has_val = (VB.vbvalue != 0)\n", - "no_val = (VB.vbvalue == 0)\n", - "\n", - "for FIPS in admin.Fips:\n", - " this_state = (VB.gestfips==FIPS)\n", - " \n", - " current_total = (VB.vbvalue * VB.marsupwt)[this_state].sum() \n", - " imputed_total = (VB.vb_impute * VB.marsupwt)[this_state&imputed].sum()\n", - " on_file = current_total + imputed_total\n", - "\n", - " admin_total = admin['Total benefits'][FIPS]\n", - " \n", - " adjust_ratio = admin_total / on_file\n", - " this_state_num = [admin['State'][FIPS], on_file, admin_total, adjust_ratio]\n", - " results[FIPS] = this_state_num\n", - " \n", - "\n", - " VB.vb_impute = np.where(has_val&this_state, VB.vbvalue * adjust_ratio, VB.vb_impute)\n", - " VB.vb_impute = np.where(no_val&this_state, VB.vb_impute * adjust_ratio, VB.vb_impute)\n", - "\n", - "VB[\"vb_participation\"] = np.zeros(len(VB))\n", - "VB[\"vb_participation\"] = np.where(VB.impute==1, 2, 0)#Augmented\n", - "VB[\"vb_participation\"] = np.where(has_val, 1, VB.vb_participation)#CPS \n", - "\n", - "\n", - "r = DataFrame(results).transpose()\n", - "r.columns=['State', 'Imputed', 'Admin', 'adjust ratio']\n", - "#r.to_csv('amount.csv', index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Assign Medical care benefit" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "#Medical care\n", - "medical = {}\n", - "medicalcare = {}\n", - "for FIPS in admin.Fips:\n", - " this_state = (VB.gestfips==FIPS)\n", - " medical[FIPS] = admin['Medical care'][FIPS] / (VB.marsupwt[VB.vb_participation==1][this_state].sum())\n", - " VB.vb_impute = np.where((VB.vb_participation==1) & (this_state), VB.vb_impute + medical[FIPS], VB.vb_impute)\n", - " medicalcare[FIPS] = [admin['State'][FIPS], medical[FIPS]]" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "VB.to_csv('VB_Imputation.csv', \n", - " columns=['peridnum','vb_participation', 'vb_impute'],\n", - " index = False)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "r = DataFrame(medicalcare).transpose()\n", - "r.columns=['State','Individual average medical care']\n", - "r.to_csv('medical.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python [default]", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.13" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/VB/VB_imputation.py b/VB/VB_imputation.py new file mode 100644 index 0000000..32289df --- /dev/null +++ b/VB/VB_imputation.py @@ -0,0 +1,208 @@ + +# coding: utf-8 + +# This script imputes both the participation and dollar benefit amount for the VA benefit programs to match the aggregates obtained from the United States Department of Veterans Affairs (VA) statistics (https://www.va.gov/vetdata/; expenditure). +# +# This current version of imputation is based on the CPS March Supplement and the Veteran's expenditure report available from the website linked above for the corresponding year. Please refer to the documentation in the same folder for more details on methodology and assumptions. The output this script is an individual level dataset that contains CPS personal level ID (PERIDNUM), individual participation indicator (vb_participation, 0 - not a recipient, 1 - current recipient on file, 2 - imputed recipient), and benefit amount. +# +# One assumption made for participants in the imputation is that 48% of veterans use the benefit programs, according to 2016 statistics. https://www.va.gov/vetdata/docs/QuickFacts/VA_Utilization_Profile.PDF +# +# Input command line: +# python VB_imputation.py CPS_FILENAME YEAR OUTPUT_FILENAME +# +# (Inpute data file should be placed in the working directory) + + +import pandas as pd +from pandas import DataFrame +import numpy as np +import random +import statsmodels.discrete.discrete_model as sm +import sys + + +# assumptions and constants to use +DELTA = 10e-8 # constant added to avoid zero denomination +Portion = 0.48 # proportion of veteran's who uses at least one program + +# read the arguments from command lines +if len(sys.argv) != 4: + print 'Please follow the format: python VB_imputation.py CPS_FILENAME YEAR OUTPUT_FILENAME' +else: + CPS_FILENAME = sys.argv[1] + YEAR = sys.argv[2] + OUTPUT_FILENAME = sys.argv[3] + + +# input datafile +CPS_dataset = pd.read_csv(CPS_FILENAME) + +# create the dummies first - orginal variable have multiple values +VB = DataFrame(CPS_dataset[['vet_yn', 'a_sex', # veteran status & gender + 'pedisdrs','pedisear', 'pediseye', 'pedisout', 'pedisphy', 'pedisrem', # disability + 'paw_yn', 'wc_yn', 'uc_yn', 'ss_yn', 'sur_yn', # other welfare/transfer + 'hed_yn','hcsp_yn', 'hfoodsp','mcaid','mcare']]) +VB.replace([-1, 2], 0, inplace=True) + + +# add numerical values +num_vars = ['marsupwt', 'fh_seq', 'gestfips', 'peridnum','vet_val', 'a_age', + 'wsal_val', 'semp_val', 'frse_val'] +VB[num_vars] =CPS_dataset[num_vars] +VB.rename(index=str, columns = {'vet_val':'vbvalue'}, inplace=True) + +# income is approximated as wage, self-employed and farm income +VB['income'] = VB.wsal_val + VB.semp_val + VB.frse_val + +# Potential recipients also include the family members of veterans +vet_family = DataFrame(VB.groupby('fh_seq', as_index=False)['vet_yn'].sum()) +vet_family.rename(index=str, columns = {'vet_yn': 'vet_family'}, inplace=True) +VB = VB.merge(vet_family, how='left', on='fh_seq') + + +# Logit Regression for Propensity Score +VB['intercept'] = np.ones(len(VB)) +model = sm.Logit(endog=VB.vet_yn, + exog=VB[['a_age','a_sex','income', 'vet_family', + 'pedisdrs','pedisear', 'pediseye', 'pedisout', 'pedisphy', 'pedisrem', + 'paw_yn','wc_yn','ss_yn', 'uc_yn', 'sur_yn', + 'hed_yn','hcsp_yn', 'hfoodsp','mcaid','mcare','intercept']]).fit() +# print model.summary() +probs = model.predict() + +# Import administrative data +admin = pd.read_csv('VB_administrative.csv', + dtype={'Total VB population': np.float, + 'Average benefits': np.float, + 'Total benefits': np.float, + 'Medical care': np.float}) +admin.index = admin.Fips + + +# CPS total benefits and Administrative total benefits +state_benefit = {} +state_recipients = {} +for state in admin.Fips: + this_state = (VB.gestfips==state) + CPS_totalb = (VB.vbvalue * VB.marsupwt)[this_state].sum() + admin_totalb = admin['Total benefits'][state] + CPS_totaln = VB.marsupwt[this_state&VB.vet_yn==1].sum() + admin_totaln = admin["Total veteran"][state] + + temp = [admin.State[state], CPS_totalb, admin_totalb, CPS_totaln, admin_totaln] + state_benefit[state] = temp + +pre_augment_benefit = DataFrame(state_benefit).transpose() +pre_augment_benefit.columns = ['State', 'CPS total benefits','Admin total benefits', + 'CPS total individual recipients','Admin total individual recipients'] +# export upon request +# pre_augment_benefit.to_csv('VB-pre-blow-up.csv') + + +# Imputation +# caculate difference of Veteran's Benefit targets and CPS aggregates +diff = {'Fips':[],'Difference in Population':[], + 'Mean Benefit':[],'CPS Population':[],'VA Population':[]} +diff['Fips'] = admin.Fips +current = (VB.vet_yn==1) + +for FIPS in admin.Fips: + + this_state = (VB.gestfips==FIPS) + + current_tots = VB.marsupwt[current&this_state].sum() + diff['CPS Population'].append(current_tots) + + valid_num = VB.marsupwt[current&this_state].sum() + DELTA + current_mean = ((VB.vbvalue * VB.marsupwt)[current&this_state].sum())/valid_num + diff['Mean Benefit'].append(current_mean) + + + Total_participate = float(admin["Total veteran"][admin.Fips == FIPS]) * Portion + diff['VA Population'].append(Total_participate) + diff['Difference in Population'].append(Total_participate - current_tots) + + +d = DataFrame(diff) +d = d[['Fips', 'Mean Benefit', 'Difference in Population', 'CPS Population', 'VA Population']] + +# d.to_csv('recipients.csv', index=False) + +# Impute participants +VB['impute'] = np.zeros(len(VB)) +VB['vb_impute'] = np.zeros(len(VB)) + +non_current = (VB.vet_yn==0) +current = (VB.vet_yn==1) +random.seed() + +for FIPS in admin.Fips: + + if d['Difference in Population'][FIPS] < 0: + continue + else: + this_state = (VB.gestfips==FIPS) + not_imputed = (VB.impute==0) & (VB.vet_family>0) + pool_index = VB[this_state¬_imputed&non_current].index + + pool = DataFrame({'weight': VB.marsupwt[pool_index], 'prob': probs[pool_index]}, + index=pool_index) + pool = pool.sort_values(by='prob', ascending=False) + pool['cumsum_weight'] = pool['weight'].cumsum() + pool['distance'] = abs(pool.cumsum_weight-d['Difference in Population'][FIPS]) + min_index = pool.sort_values(by='distance')[:1].index + min_weight = int(pool.loc[min_index].cumsum_weight) + pool['impute'] = np.where(pool.cumsum_weight<=min_weight+10 , 1, 0) + VB.impute[pool.index[pool['impute']==1]] = 1 + VB.vb_impute[pool.index[pool['impute']==1]] = admin['Average benefits'][FIPS] + + +#Adjustment ratio +results = {} + +imputed = (VB.impute == 1) +has_val = (VB.vbvalue != 0) +no_val = (VB.vbvalue == 0) + +for FIPS in admin.Fips: + this_state = (VB.gestfips==FIPS) + + current_total = (VB.vbvalue * VB.marsupwt)[this_state].sum() + imputed_total = (VB.vb_impute * VB.marsupwt)[this_state&imputed].sum() + on_file = current_total + imputed_total + + admin_total = admin['Total benefits'][FIPS] + + adjust_ratio = admin_total / on_file + this_state_num = [admin['State'][FIPS], on_file, admin_total, adjust_ratio] + results[FIPS] = this_state_num + + + VB.vb_impute = np.where(has_val&this_state, VB.vbvalue * adjust_ratio, VB.vb_impute) + VB.vb_impute = np.where(no_val&this_state, VB.vb_impute * adjust_ratio, VB.vb_impute) + +VB["vb_participation"] = np.zeros(len(VB)) +VB["vb_participation"] = np.where(VB.impute==1, 2, 0)#Augmented +VB["vb_participation"] = np.where(has_val, 1, VB.vb_participation)#CPS + + +r = DataFrame(results).transpose() +r.columns=['State', 'Imputed', 'Admin', 'adjust ratio'] +#r.to_csv('amount.csv', index=False) + + +# Impute Medical care +medical = {} +medicalcare = {} +for FIPS in admin.Fips: + this_state = (VB.gestfips==FIPS) + medical[FIPS] = admin['Medical care'][FIPS] / (VB.marsupwt[VB.vb_participation==1][this_state].sum()) + VB.vb_impute = np.where((VB.vb_participation==1) & (this_state), VB.vb_impute + medical[FIPS], VB.vb_impute) + medicalcare[FIPS] = [admin['State'][FIPS], medical[FIPS]] + + +VB.to_csv(OUTPUT_FILENAME, + columns=['peridnum','vb_participation', 'vb_impute', 'probs'], + index = False) + +