-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexhibit2.py
47 lines (36 loc) · 1.86 KB
/
exhibit2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
## CALCULATE DESCRIPTIVE DATA FOR FINAL HOSPITAL CLAIMS SAMPLE TO CREATE EXHIBIT2
import pandas as pd
import dask.dataframe as dd
import dask
import numpy as np
import datetime
from dask.distributed import Client
# client = Client("10.50.86.251:58343")
pd.set_option('display.max_columns', 500)

## directory the final analytical sample CSVs are read from
inputPath = '/gpfs/data/cms-share/duas/55378/Zoey/gardner/data/merge_output/infection/medpar_mds/FINAL/new/'
## base directory the exhibit tables are written under
writePath = '/gpfs/data/cms-share/duas/55378/Zoey/gardner/data/exhibits/infection/FINAL/'

claims_type = ["primary", "secondary"]
## NOTE: `outcome` is not used below; the file names hard-code UTI
outcome = ["UTI", "PNEU"]

## row order of the sample-size-by-race table:
## all short-stay (True) rows first, then all long-stay (False) rows,
## with races in the same fixed order inside each group
race_order = ['white', 'black', 'hispanic', 'asian', 'american_indian', 'other']
stay_race_order = [(short_stay, race)
                   for short_stay in (True, False)
                   for race in race_order]

for ctype in claims_type:
    print(ctype)

    ## read in final analytical sample data
    ## (substitute PNEU for UTI in the file names to run the same code for pneumonia)
    df = pd.read_csv(inputPath + '{}UTI.csv'.format(ctype), low_memory=False)

    ## define categorical columns
    ccol = ['race_name']
    ## define numeric columns ('count_cc' is created below)
    ncol = ['age', 'female', 'dual', 'disability', 'combinedscore', 'count_cc']
    ## define chronic conditions columns: every column whose name ends with '_final'
    cc_col = [name for name in df.columns if name.endswith('_final')]

    ## calculate the total number of chronic conditions for each patient;
    ## a column-wise sum along axis=1 gives the same per-row total as a row-wise
    ## apply without running a Python function once per row
    ## (assumes the '_final' columns are numeric flags -- TODO confirm)
    df['count_cc'] = df[cc_col].sum(axis=1)

    by_stay = df.groupby('short_stay')

    # calculate sample size by short- vs. long-stay and race
    print(by_stay['BENE_ID'].count())
    size_by_race = df.groupby(['short_stay', 'race_name'])['BENE_ID'].count()
    ## reindex fixes the row order; (stay, race) pairs absent from the data become NaN rows
    ## NOTE(review): writePath already ends in 'FINAL/', so this writes to '.../FINAL/FINAL/';
    ## confirm that directory is intended (to_csv does not create missing directories)
    size_by_race.reindex(stay_race_order).to_csv(
        writePath + 'FINAL/{}UTIbyRACE.csv'.format(ctype)
    )

    ## calculate the grand mean of numeric variables by short- vs. long-stay
    for col in ncol:
        print(by_stay[col].mean())