Skip to content

Commit fe109fc

Browse files
authored
Merge pull request #1039 from cal-itp/tircp_clean
refactor tircp_data_cleaner script
2 parents 44eec17 + 0a4e8ed commit fe109fc

File tree

2 files changed

+181
-210
lines changed

2 files changed

+181
-210
lines changed

bus_procurement_cost/dgs_data_cleaner.py

Lines changed: 53 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -7,35 +7,33 @@
77
def calculate_total_cost(row):
    """
    Calculate the total cost for a single row.

    Uses total_with_options_per_unit as the unit price when it is greater
    than zero; otherwise falls back to contract_unit_price. The chosen unit
    price is multiplied by quantity.

    Intended to be applied row-wise, e.g.
    df.assign(total_cost=df.apply(calculate_total_cost, axis=1)).

    `row` is anything indexable by the keys "total_with_options_per_unit",
    "contract_unit_price" and "quantity".
    """
    # A non-positive "with options" price means there is no usable value
    # in that column, so use the base contract price instead.
    if row["total_with_options_per_unit"] > 0:
        unit_price = row["total_with_options_per_unit"]
    else:
        unit_price = row["contract_unit_price"]

    return unit_price * row["quantity"]
16-
17-
def new_bus_size_finder(item_description):
17+
18+
19+
def new_bus_size_finder(description: str) -> str:
1820
"""
1921
Similar to prop_type_find: matches keywords against the item description column and returns a standardized bus size type.
22+
The description is lowercased, hyphens are replaced with spaces, and surrounding whitespace is stripped before matching.
2023
To be used with .assign()
2124
"""
2225

2326
articulated_list = [
24-
"60-foot",
2527
"60 foot",
2628
"articulated",
2729
]
2830

2931
standard_bus_list = [
30-
"30-foot",
32+
"30 foot",
3133
"35 foot",
32-
"35 Foot",
3334
"40 foot",
34-
"40-foot",
35-
"40 Foot",
3635
"40ft",
3736
"45 foot",
38-
"45-foot",
3937
"standard",
4038
]
4139

@@ -46,84 +44,83 @@ def new_bus_size_finder(item_description):
4644
other_bus_size_list = ["feeder bus"]
4745

4846
otr_bus_list = [
49-
"coach-style",
50-
"over-the-road",
47+
"coach style",
48+
"over the road",
5149
]
5250

51+
item_description = description.lower().replace("-", " ").strip()
52+
5353
if any(word in item_description for word in articulated_list):
5454
return "articulated"
5555

56-
if any(word in item_description for word in standard_bus_list):
56+
elif any(word in item_description for word in standard_bus_list):
5757
return "standard/conventional (30ft-45ft)"
5858

59-
if any(word in item_description for word in cutaway_list):
59+
elif any(word in item_description for word in cutaway_list):
6060
return "cutaway"
6161

62-
if any(word in item_description for word in otr_bus_list):
62+
elif any(word in item_description for word in otr_bus_list):
6363
return "over-the-road"
6464

65-
if any(word in item_description for word in other_bus_size_list):
65+
elif any(word in item_description for word in other_bus_size_list):
6666
return "other"
6767

68-
return "not specified"
68+
else:
69+
return "not specified"
6970

7071

7172
# new prop_finder function
72-
def new_prop_finder(item_description):
73+
def new_prop_finder(description: str) -> str:
7374
"""
7475
function that matches keywords from each propulsion type list against the item description col, returns a standardized prop type
76+
The description is lowercased, hyphens (ASCII and Unicode) are replaced with spaces, and surrounding whitespace is stripped before matching.
7577
to be used with .assign()
7678
"""
7779

7880
BEB_list = [
7981
"battery electric",
80-
"battery-electric",
8182
"BEBs paratransit buses",
82-
"Battery Electric Bus",
8383
"battery electric bus",
84+
'battery electric buses',
85+
'battery electric buses',
8486
]
8587

8688
cng_list = [
87-
"CNG buses",
88-
"CNG fueled",
89-
"estimated-CNG buses",
90-
"low emission CNG",
89+
"cng buses",
90+
"cng fueled",
91+
"estimated cng buses",
92+
"low emission cng",
9193
]
9294

9395
electric_list = [
9496
"electric buses",
9597
"electric commuter",
96-
"Electric",
9798
"electric",
9899
]
99100

100101
FCEB_list = [
101102
"fuel cell",
102-
"fuel-cell",
103103
"fuel cell electric",
104104
"hydrogen fuel cell",
105-
"Fuel Cell Electric Bus",
106105
"fuel cell electric bus",
107-
"Hydrogen Electic Bus",
108106
"hydrogen electric bus",
109107
]
110108

111109
# low emission (hybrid)
112110
hybrid_list = [
113111
"diesel electric hybrids",
114-
"diesel-electric",
115112
"diesel-electric hybrids",
116-
"hybrid",
117113
"hybrid electric",
118114
"hybrid electric buses",
119115
"hybrid electrics",
116+
"hybrid",
120117
]
121118

122119
# low emission (propane)
123120
propane_list = [
124-
"propane",
125121
"propane buses",
126122
"propaned powered vehicles",
123+
"propane",
127124
]
128125

129126
mix_beb_list = [
@@ -142,44 +139,48 @@ def new_prop_finder(item_description):
142139
]
143140

144141
zero_e_list = [
145-
"zero‐emission",
146-
"zero-emission buses",
147-
"zero emission",
148142
"zero emission buses",
149143
"zero emission electric",
144+
"zero emission vehicles",
145+
"zero emission",
150146
]
151147

152-
if any(word in item_description for word in BEB_list):
153-
return "BEB"
148+
item_description = description.lower().replace("-" "", " ").replace("‐", " ").strip()
154149

155-
if any(word in item_description for word in cng_list):
156-
return "CNG"
150+
if any(word in item_description for word in BEB_list) and not any(
151+
word in item_description for word in ["diesel", "hybrid", "fuel cell"]
152+
):
153+
return "BEB"
157154

158-
if any(word in item_description for word in FCEB_list):
155+
elif any(word in item_description for word in FCEB_list):
159156
return "FCEB"
160157

161-
if any(word in item_description for word in hybrid_list):
158+
elif any(word in item_description for word in hybrid_list):
162159
return "low emission (hybrid)"
163160

164-
if any(word in item_description for word in propane_list):
165-
return "low emission (propane)"
166-
167-
if any(word in item_description for word in mix_beb_list):
161+
elif any(word in item_description for word in mix_beb_list):
168162
return "mix (BEB and FCEB)"
169163

170-
if any(word in item_description for word in mix_lowe_list):
164+
elif any(word in item_description for word in mix_lowe_list):
171165
return "mix (low emission)"
172166

173-
if any(word in item_description for word in mix_zero_low_list):
167+
elif any(word in item_description for word in mix_zero_low_list):
174168
return "mix (zero and low emission)"
175169

176-
if any(word in item_description for word in zero_e_list):
170+
elif any(word in item_description for word in zero_e_list):
177171
return "zero-emission bus (not specified)"
178172

179-
if any(word in item_description for word in electric_list):
173+
elif any(word in item_description for word in propane_list):
174+
return "low emission (propane)"
175+
176+
elif any(word in item_description for word in electric_list):
180177
return "electric (not specified)"
178+
179+
elif any(word in item_description for word in cng_list):
180+
return "CNG"
181181

182-
return "not specified"
182+
else:
183+
return "not specified"
183184

184185

185186
# included assign columns
@@ -226,6 +227,7 @@ def clean_dgs_columns() -> pd.DataFrame:
226227
"total_with_options_per_unit",
227228
"grand_total",
228229
]
230+
229231
# read in data
230232
dgs_17c = pd.read_excel(f"{gcs_path}{file_17c}", sheet_name=sheet_17c)
231233
dgs_17b = pd.read_excel(f"{gcs_path}{file_17b}", sheet_name=sheet_17b)
@@ -242,19 +244,18 @@ def clean_dgs_columns() -> pd.DataFrame:
242244

243245
# takes list of columns and updates to int64
244246
dgs_17bc[to_int64] = dgs_17bc[to_int64].astype("int64")
245-
247+
246248
# change purchase_order_number col to str
247-
dgs_17bc['purchase_order_number'] = dgs_17bc['purchase_order_number'].astype('str')
249+
dgs_17bc["purchase_order_number"] = dgs_17bc["purchase_order_number"].astype("str")
248250

249251
# adds 3 new columns from functions
250252
dgs_17bc2 = dgs_17bc.assign(
251253
total_cost=dgs_17bc.apply(calculate_total_cost, axis=1),
252254
new_prop_type=dgs_17bc["item_description"].apply(new_prop_finder),
253255
new_bus_size=dgs_17bc["item_description"].apply(new_bus_size_finder),
254256
)
255-
256-
return dgs_17bc2
257257

258+
return dgs_17bc2
258259

259260
def agg_by_agency(df: pd.DataFrame) -> pd.DataFrame:
260261
"""
@@ -338,8 +339,8 @@ def agg_by_agency_w_options(df: pd.DataFrame) -> pd.DataFrame:
338339

339340
return merge
340341

341-
342342
if __name__ == "__main__":
343+
343344
# initial df
344345
df1 = clean_dgs_columns()
345346

0 commit comments

Comments
 (0)