-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3_create_final_nh_sample.py
182 lines (144 loc) · 10.8 KB
/
3_create_final_nh_sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
## PROJECT: NURSING HOME REPORTING FOR FALL AND PRESSURE ULCER
## AUTHOR: ZOEY CHEN
## THIS SCRIPT MERGE MDS REPORTING DATA, MDS CHARACTERISTICS OUTPUT DATA FROM R SCRIPT AND OTHER DATA SOURCES
## TO CREATE THE FINAL DATASET FOR ANALYZING NURSING HOME CHARACTERISTICS ASSOCIATION WITH MDS REPORTING RATES;
## THE UNIT OF DATA IS NURISNG HOME - DATA ARE AVERAGED ACROSS YEARS FOR EACH NURSING HOME
import pandas as pd
from functools import reduce
import numpy as np
pd.set_option('display.max_columns', 500)
## this is the cleaned casper and ltcfocus data from last step
casper_variables = pd.read_csv('/gpfs/data/cms-share/duas/55378/Zoey/gardner/data/qm_all/initial_analysis/casper_and_ltc_clean2.csv')
## this is the reporting data on nursing home level
mds_reporting = pd.read_csv('/gpfs/data/cms-share/duas/55378/Zoey/gardner/data/qm_all/initial_analysis/all_mds_items_data_20220930.csv')
## this is the data with facility zip code info
facility = pd.read_csv('/gpfs/data/cms-share/data/mds/facility/csv/facility.csv')
## this is zipcode to rural indicator crosswalk
zipcode_rural = pd.read_csv('/gpfs/data/cms-share/duas/55378/Zoey/gardner/data/qm_all/initial_analysis/forhp-eligible-zips.csv')
## this is the count of Medicare residents for each nursing home year
nh_variables = pd.read_csv('/gpfs/data/cms-share/duas/55378/Zoey/gardner/data/mds/nh_population/nh_variables_capser.csv',
low_memory=False)
## this is the count of Medicare residents by resident type for each nursing home year
nh_shortlong_count = pd.read_csv('/gpfs/data/cms-share/duas/55378/Zoey/gardner/data/mds/nh_population/nh_medicare_res_count_shortlongstay.csv',
low_memory=False)
## this is the count of Medicare residents who had adrd for year nursing home year
nh_adrd = pd.read_csv('/gpfs/data/cms-share/duas/55378/Zoey/gardner/data/mds/nh_population/nh_adrd.csv')
## this is the average nursing home ratings and qm measure scores from 2011 - 2017
nh_ratings = pd.read_csv('/gpfs/data/cms-share/duas/55378/Zoey/gardner/data/qm_all/initial_analysis/nh_nhc_measures.csv')
## change the data type of the column for merging with other data later
casper_variables['PRVDR_NUM'] = casper_variables['PRVDR_NUM'].astype('str')
## merge facility zipcode with crosswalk to identify rural vs urban facilities
## the crosswalk defines rural area zip code; nursing homes not linked to a rural zip code would be considered non-rural
zipcode_rural['rural'] = 1
zipcode_rural = zipcode_rural.astype({'ZIP': 'str'})
facility = facility.astype({'FAC_ZIP': 'str'})
facility = facility.merge(zipcode_rural[['ZIP', 'rural']], left_on='FAC_ZIP', right_on='ZIP', how='left')
## replace missing rural with 0
facility['rural'] = facility['rural'].replace(np.nan, 0)
## merge nh variables with nh adrd to calcualte the percentage of adrd residents in nursing homes
nh_variables = nh_variables.merge(nh_adrd, on=["FAC_PRVDR_INTRNL_ID", "STATE_CD", "MEDPAR_YR_NUM"], how='left')
## fill missing value with 0
nh_variables['medicare_adrd_count'] = \
nh_variables['medicare_adrd_count'].fillna(0)
nh_variables['adrd_percent'] = nh_variables['medicare_adrd_count']/nh_variables['medicare_count']
## calculate average dual_percent, adrd_percent, and white_percent for nursing home across years
nh_dual = nh_variables.groupby('MCARE_ID')[['dual_percent', 'adrd_percent', 'white_percent']].mean().reset_index()
nh_dual = nh_dual[nh_dual['MCARE_ID']!='************']
## change data type for later merging
nh_dual['MCARE_ID'] = nh_dual['MCARE_ID'].astype('str')
## count gives us the number of claims; sum fives us the number of claims reported on MDS
nh_reporting = mds_reporting.groupby(["MCARE_ID", "mds_item", "short_stay"]).mds_reporting.agg(['count', 'sum']).reset_index()
nh_reporting.columns = ["MCARE_ID", "mds_item", "short_stay", "nclaims", "nreported"]
## include only pressure ulcer and fall (uti was not used)
nh_reporting = nh_reporting[nh_reporting['mds_item'].isin(['fall', 'pu234', 'uti_readmission'])]
## reshape the data from long to wide
nh_reporting_wide = nh_reporting.pivot(index=["MCARE_ID", "short_stay"], columns="mds_item", values=['nreported', 'nclaims'])
nh_reporting_wide.columns = ['{0}_{1}'.format(x, y) for x,y in nh_reporting_wide.columns]
nh_reporting_wide = nh_reporting_wide.reset_index()
## add claims and reporting information across fall, pu adn uti
nh_reporting_wide['nclaims_all'] = nh_reporting_wide[[col for col in nh_reporting_wide.columns if col.startswith('nclaims')]].sum(axis=1)
nh_reporting_wide['nreported_all'] = nh_reporting_wide[[col for col in nh_reporting_wide.columns if col.startswith('nreported')]].sum(axis=1)
## change data type for merging
nh_reporting_wide['MCARE_ID'] = nh_reporting_wide['MCARE_ID'].astype('str')
## pivot data from long to wide for nursing home resident count data
nh_shortlong_count = nh_shortlong_count[nh_shortlong_count['MCARE_ID']!='************']
nh_percent_shortstay = nh_shortlong_count. \
pivot(index=['MCARE_ID', 'MEDPAR_YR_NUM'], columns='short_stay', values='medicare_count'). \
reset_index()
# print(nh_percent_shortstay.head())
nh_percent_shortstay.columns=['MCARE_ID', 'MEDPAR_YR_NUM', 'medicare_count_longstay', 'medicare_count_shortstay']
## calculate the percent short stay for each nh-year
nh_percent_shortstay['pct_shortstay'] = nh_percent_shortstay['medicare_count_shortstay']/(nh_percent_shortstay['medicare_count_shortstay'] + nh_percent_shortstay['medicare_count_longstay'])
## calculate the average percent short stay for each nh
nh_percent_shortstay = nh_percent_shortstay.groupby(['MCARE_ID'])['pct_shortstay'].mean().reset_index()
## calculate the total short stay and long stay residents for each nh from 2011 - 2017
nh_shortlong_count = nh_shortlong_count.groupby(['MCARE_ID', 'short_stay'])['medicare_count'].sum().reset_index()
nh_shortlong_count['MCARE_ID'] = nh_shortlong_count['MCARE_ID'].astype('str')
## merge the count of short vs long-stay residents in the nursing home with MDS reporting data
nh_reporting_wide = nh_reporting_wide.merge(nh_shortlong_count, on=['MCARE_ID', 'short_stay'], how='right')
nh_reporting_wide = nh_reporting_wide.merge(nh_percent_shortstay, on='MCARE_ID', how='left')
# ## calculate reporting rate for each nursing home across years
for mds_item in ['fall', 'pu234', 'uti_readmission', 'all']:
nh_reporting_wide['reporting_{}'.format(mds_item)] = nh_reporting_wide['nreported_{}'.format(mds_item)] / nh_reporting_wide['nclaims_{}'.format(mds_item)]
nh_reporting_wide['claims_based_rate_{}'.format(mds_item)] = nh_reporting_wide['nclaims_{}'.format(mds_item)] / nh_reporting_wide['medicare_count']
## CLEAN CASPER<CFOCUS DATA
print(casper_variables['PRVDR_NUM'].nunique()) #17003
## keep only data from 2011 - 2017
casper_variables = casper_variables[casper_variables['year']>2010]
casper_variables = casper_variables.drop(columns="name")
casper_variables['missing_casper'] = casper_variables['from_casper'].isna()
casper_variables['missing_ltc'] = casper_variables['from_ltc'].isna()
missing_corl = \
casper_variables.groupby('PRVDR_NUM')[['missing_casper', 'missing_ltc']].sum().reset_index()
print(missing_corl.shape[0]) #16843
## remove nursing homes if more than 3 years of LTCFocus or Casper data is missing
missing_corl = missing_corl[(missing_corl['missing_casper'] < 4) & (missing_corl['missing_ltc'] < 4)]
# print(missing_corl.shape[0]) #16617
print(missing_corl['PRVDR_NUM'].nunique()) #16617
casper_variables_nomissing = casper_variables.merge(missing_corl, on='PRVDR_NUM')
## drop duplciated columns
casper_variables_nomissing = casper_variables_nomissing.drop(columns=[col for col in casper_variables_nomissing.columns if (col.endswith("_x") or col.endswith("_y"))])
## keep only nursing homes that have been in the data from 2011 - 2017
casper_variables_nomissing_years = casper_variables_nomissing.groupby("PRVDR_NUM")['year'].count().rename('nyears').reset_index()
casper_variables_nomissing = casper_variables_nomissing.merge(
casper_variables_nomissing_years[casper_variables_nomissing_years['nyears']==7], on='PRVDR_NUM'
)
print(casper_variables_nomissing['PRVDR_NUM'].nunique()) #14665
## store numeric variable names to a list
num_columns = [col for col in casper_variables_nomissing.columns if not \
((col in ['PRVDR_NUM', 'year', 'nyears', 'from_casper', 'from_ltc', 'ownership', 'diff_ownership', 'ownership_common']) |
col.endswith('_std') |
col.endswith('_avg'))
]
## keep only unique nursing homes (data unit changed from nh-year to nh)
casper_variables_nh_unique = casper_variables_nomissing.drop_duplicates(subset='PRVDR_NUM')
## keep only columns for the average in casper data, except for ownership variables (ownership variables not used)
avg_columns = ['{}_avg'.format(col) for col in num_columns if not col.startswith(('GNRL_CNTL_TYPE_CD', 'STATE'))]
casper_variables_nh_unique = casper_variables_nh_unique[['PRVDR_NUM', 'ownership_common', 'diff_ownership', 'STATE'] + avg_columns]
## merge casper and nh reporting data
nh_characters = casper_variables_nh_unique.merge(nh_reporting_wide, left_on='PRVDR_NUM', right_on='MCARE_ID', how='left')
## merge for nursing home dual and adrd percent
nh_characters = nh_characters.merge(nh_dual, left_on='PRVDR_NUM', right_on='MCARE_ID', how='left', suffixes=['', '_drop'])
nh_characters = nh_characters.drop(columns=[col for col in nh_characters.columns if col.endswith('_drop')])
## merge for rural indicator
nh_characters = nh_characters.merge(facility[['FAC_ZIP', 'MCARE_ID', 'rural']],
left_on='PRVDR_NUM',
right_on='MCARE_ID', how='left', suffixes=['', '_drop'])
nh_characters = nh_characters.drop(columns=[col for col in nh_characters.columns if col.endswith('_drop')])
## merge with nursing home ratings data
nh_ratings = nh_ratings.astype({'provnum': 'str'})
nh_characters = nh_characters.merge(nh_ratings, left_on='PRVDR_NUM', right_on='provnum', how='left')
## read in complete list of nursing homes from MDS in 2011 - 2017
all_nh_fromMDS = pd.read_csv('/gpfs/data/cms-share/duas/55378/Zoey/gardner/data/mds/nh_population/complete_nursing_home_fromMDS.csv')
all_nh_fromMDS_unique = all_nh_fromMDS.drop_duplicates(subset='MCARE_ID')
all_nh_fromMDS_unique['MCARE_ID'] = all_nh_fromMDS_unique['MCARE_ID'].astype('str')
## keep nursing home if it is both in the casper sample and in the MDS sample
nh_characters_MDS = nh_characters[nh_characters['PRVDR_NUM'].isin(all_nh_fromMDS_unique['MCARE_ID'].tolist())]
print(nh_characters_MDS['PRVDR_NUM'].nunique()) #14658 28189
## drop duplicated nursing home
nh_characters_MDS = nh_characters_MDS.drop_duplicates(subset=['PRVDR_NUM', 'short_stay'])
print(nh_characters_MDS['PRVDR_NUM'].nunique())
# nh_characters_MDS.to_csv(
# '/gpfs/data/cms-share/duas/55378/Zoey/gardner/data/qm_all/initial_analysis/nh_characteristics_20221020.csv',
# index=False
# )