-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathparse.py
126 lines (96 loc) · 3.67 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python
import os
import xmltodict
from glob import glob
import json
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm # progress bar
here = os.path.dirname(os.path.abspath(__file__))
folder = os.path.basename(here)
year = datetime.datetime.today().year
output_data = os.path.join(here, 'data-latest.tsv')
output_year = os.path.join(here, 'data-%s.tsv' % year)
latest = '%s/latest' % here
# Don't continue if we don't have latest folder
if not os.path.exists(latest):
print('%s does not have parsed data.' % folder)
sys.exit(0)
# Don't continue if we don't have results.json
results_json = os.path.join(latest, 'records.json')
if not os.path.exists(results_json):
print('%s does not have results.json' % folder)
sys.exit(1)
with open(results_json, 'r') as filey:
results = json.loads(filey.read())
columns = ['charge_code',
'price',
'description',
'hospital_id',
'filename',
'charge_type']
df = pd.DataFrame(columns=columns)
# Helper Functions - different formats of XML
def process_dataroot(content, df, hospital_id, filename):
# Hospital name is the key that doesn't start with @
for hospital_name in content['dataroot'].keys():
if not hospital_name.startswith('@'):
break
temp = pd.DataFrame(content['dataroot'][hospital_name])\
.rename(columns={'Charge_x0020_Code': 'charge_code',
'Description': 'description'})
# Need to melt the dataframe so that
# unique price type columns (e.g. Inpatient vs. Outpatient)
# are captured in a charge_type field
temp = temp.melt(id_vars=['charge_code', 'description'],
value_name='price', var_name='charge_type')\
.dropna(subset=['price'])
# Map charge_type values to standard labels
temp['charge_type'] = \
temp['charge_type'].map({'Inpatient_x0020_Price': 'inpatient',
'Outpatient_x0020_Price': 'outpatient'})
# add in filename and hospital_id columns
temp['filename'] = filename
temp['hospital_id'] = hospital_id
return df.append(temp, ignore_index=True, sort=True)
def process_workbook(df, hospital_id, filename, filepath):
# First row is header
temp = pd.read_csv(filepath, thousands=',')
# Rename to standard column names
temp.rename(columns={'Charge Description': 'description',
'Charge Amount': 'price'}, inplace=True)
temp['charge_code'] = np.nan
temp['hospital_id'] = hospital_id
temp['filename'] = filename
temp['charge_type'] = 'standard'
return df.append(temp, ignore_index=True, sort=True)
seen = []
for result in tqdm(results):
filename = os.path.join(latest, result['filename'])
if not os.path.exists(filename):
print('%s is not found in latest folder.' % filename)
continue
if os.stat(filename).st_size == 0:
print('%s is empty, skipping.' % filename)
continue
if result['filename'] in seen:
continue
seen.append(result['filename'])
print('Parsing %s' % filename)
# Parse the different XML files
if filename.endswith('xml'):
with open(filename, 'r') as filey:
content = xmltodict.parse(filey.read())
if "dataroot" in content:
df = process_dataroot(content, df, result['uri'],
result['filename'])
elif filename.endswith('csv'):
df = process_workbook(df, result['uri'],
result['filename'], filename)
else:
break
# Save data as we go
print(df.head(3))
df.to_csv(output_data, columns=columns, sep='\t', index=False)
df.to_csv(output_year, columns=columns, sep='\t', index=False)