Skip to content

Commit 64afd3b

Browse files
authored
Merge pull request #941 from cal-itp/find-stop-arrival-errors
Find stop arrival errors
2 parents 30eddcd + 368bb03 commit 64afd3b

File tree

6 files changed

+206
-145
lines changed

6 files changed

+206
-145
lines changed

rt_segment_speeds/25_interpolation_issues.ipynb

Lines changed: 116 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
"outputs": [],
2323
"source": [
2424
"import dask.dataframe as dd\n",
25+
"import geopandas as gpd\n",
2526
"import numpy as np\n",
2627
"import pandas as pd\n",
2728
"\n",
@@ -41,7 +42,7 @@
4142
"id": "c5f369bb-68bf-46a2-86ad-6279872859b1",
4243
"metadata": {},
4344
"source": [
44-
"## Between stops, how to find stops behaving not as expected\n",
45+
"## Between stops, arrival times behaving not as expected\n",
4546
"There are erroneous calculations here.\n",
4647
"\n",
4748
"Prior arrival time can't take place **after** arrival time. \n",
@@ -69,188 +70,214 @@
6970
{
7071
"cell_type": "code",
7172
"execution_count": null,
72-
"id": "ccf433cf-69e7-476c-a64a-8c999a53858b",
73+
"id": "d26bb970-8d32-4036-b5f1-8852e5ed4eda",
7374
"metadata": {},
7475
"outputs": [],
7576
"source": [
76-
"stop_arrivals = pd.read_parquet(\n",
77-
" f\"{SEGMENT_GCS}{STOP_ARRIVALS}.parquet\",\n",
78-
" columns = [\"trip_instance_key\", \"stop_sequence\", \"arrival_time\"]\n",
79-
")"
77+
"df.columns"
8078
]
8179
},
8280
{
8381
"cell_type": "code",
8482
"execution_count": null,
85-
"id": "c3077ade-87c1-4b9d-8cf7-bbb743a03823",
83+
"id": "8556a0d8-3f05-4726-9fb2-5dd8864fe751",
8684
"metadata": {},
8785
"outputs": [],
8886
"source": [
89-
"df.error_arrival_order.value_counts()"
87+
"df.head()"
9088
]
9189
},
9290
{
9391
"cell_type": "code",
9492
"execution_count": null,
95-
"id": "3d0373a4-80fc-49e1-bac3-2edd8c5ae4d0",
93+
"id": "4e1001f7-32db-427e-859a-9987e499c327",
9694
"metadata": {},
9795
"outputs": [],
9896
"source": [
99-
"df.error_same_endpoints.value_counts()"
97+
"pd.crosstab(df.nearest_vp_idx_monotonic, \n",
98+
" df.stop_meters_monotonic)"
10099
]
101100
},
102101
{
103102
"cell_type": "code",
104103
"execution_count": null,
105-
"id": "11d2a032-83db-43b7-a7fb-9254a10ae524",
104+
"id": "53219886-827b-44e0-a764-45970bf194d0",
106105
"metadata": {},
107106
"outputs": [],
108107
"source": [
109-
"df[(df.error_same_endpoints==1) & \n",
110-
" (df.error_arrival_order==1)].shape"
108+
"pd.crosstab(df.nearest_vp_idx_monotonic, \n",
109+
" df.stop_meters_monotonic, normalize=True)"
111110
]
112111
},
113112
{
114113
"cell_type": "code",
115114
"execution_count": null,
116-
"id": "2ea55941-8d10-49f2-8765-6b2faba4080d",
115+
"id": "7578edfa-e7a1-4607-8fa5-1d086efdef1c",
117116
"metadata": {},
118117
"outputs": [],
119118
"source": [
120-
"trip_stats = (df.groupby(\"trip_instance_key\", \n",
121-
" observed=True, group_keys=False)\n",
122-
" .agg({\n",
123-
" \"error_same_endpoints\": \"mean\",\n",
124-
" \"error_arrival_order\": \"mean\"\n",
125-
" }).reset_index()\n",
126-
" )"
119+
"# Case 1: this is the largest group of errors, and \n",
120+
"# should be easier to fix"
127121
]
128122
},
129123
{
130124
"cell_type": "code",
131125
"execution_count": null,
132-
"id": "83fde1e4-29b1-43ab-b30c-f98ec63a87c8",
126+
"id": "b923bfdf-df03-49cf-94bf-f5ee2270714d",
133127
"metadata": {},
134128
"outputs": [],
135129
"source": [
136-
"# Very few trips are completely error-free\n",
137-
"trip_stats[(trip_stats.error_same_endpoints==0) & \n",
138-
" (trip_stats.error_arrival_order==0)].shape"
130+
"df[(df.nearest_vp_idx_monotonic==False) &\n",
131+
" (df.stop_meters_monotonic==True)]"
139132
]
140133
},
141134
{
142135
"cell_type": "code",
143136
"execution_count": null,
144-
"id": "1bd9d1b1-48f0-46e1-9875-181ea20df66a",
137+
"id": "689e546b-7df6-49e3-bbcc-7594a41fd32e",
145138
"metadata": {},
146139
"outputs": [],
147140
"source": [
148-
"#trip_stats.sample(10).trip_instance_key.unique()\n",
149-
"subset_trip_keys = [\n",
150-
" '9fad69264acd8387150f45b27d4b2d09',\n",
151-
" '44a55d2fa2588a479065ef7702475ef1',\n",
152-
" '36070a2428e62b96368d072eb2a8fc1b',\n",
153-
" '7f665900c6b0879f4b9bda43b93fefe3',\n",
154-
" '8e8ba9993d52388539d06a46710c1dbc',\n",
155-
" 'b301c2170c1ca49bbc1a9b600cccf643',\n",
156-
" '9373f5b0de977a718dea50fd90443619',\n",
157-
" '8415b3949147c9dc3d5ceb37863440b1',\n",
158-
" '984f598419c1d0830ef4618d495c1bd7',\n",
159-
" '815e4dd921cdcb61ad2dbb1ca5f08a39'\n",
160-
"]"
141+
"df[df.stop_meters_monotonic==False][[\n",
142+
" \"stop_sequence\", \n",
143+
" \"nearest_vp_idx\",\n",
144+
" \"rolling_nearest_vp_idx\", \"nearest_vp_idx_monotonic\", \n",
145+
" \"stop_meters\", \"rolling_stop_meters\", \n",
146+
" \"stop_meters_monotonic\"\n",
147+
"]]"
161148
]
162149
},
163150
{
164151
"cell_type": "code",
165152
"execution_count": null,
166-
"id": "212eaa5d-735c-4332-b25d-e1883ee48f15",
153+
"id": "ccf433cf-69e7-476c-a64a-8c999a53858b",
167154
"metadata": {},
168155
"outputs": [],
169156
"source": [
170-
"def check_if_surrounding_points_are_ok(df: pd.DataFrame):\n",
171-
" grouped_df = df.groupby(\"trip_instance_key\", \n",
172-
" observed=True, group_keys=False\n",
173-
" )\n",
174-
" df = df.assign(\n",
175-
" prior_error = (grouped_df\n",
176-
" .error_arrival_order\n",
177-
" .shift(1)\n",
178-
" ),\n",
179-
" subseq_error = (grouped_df\n",
180-
" .error_arrival_order\n",
181-
" .shift(-1)\n",
182-
" )\n",
183-
" )\n",
184-
" \n",
185-
" df = df.assign(\n",
186-
" can_be_fixed = df.apply(\n",
187-
" lambda x:\n",
188-
" 1 if (x.error_arrival_order==1) and\n",
189-
" (x.prior_error==0) and (x.subseq_error==0)\n",
190-
" else 0, axis=1\n",
191-
" )\n",
192-
" )\n",
157+
"# How to use stop arrivals to constrain the wrong arrival times that occur\n",
158+
"# in the middle of the trip?\n",
159+
"stop_arrivals = pd.read_parquet(\n",
160+
" f\"{SEGMENT_GCS}{STOP_ARRIVALS}.parquet\",\n",
161+
" columns = [\"trip_instance_key\", \"stop_sequence\", \"arrival_time\"]\n",
162+
")"
163+
]
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": null,
168+
"id": "7f80f709-1a58-4608-af7e-4295ad647bdb",
169+
"metadata": {},
170+
"outputs": [],
171+
"source": [
172+
"trips_monotonicity = (stop_arrivals.groupby(\"trip_instance_key\")\n",
173+
" .arrival_time\n",
174+
" .is_monotonic_increasing\n",
175+
" ).to_frame().reset_index()\n",
193176
"\n",
194-
" return df\n",
195-
" "
177+
"trips_monotonicity"
196178
]
197179
},
198180
{
199181
"cell_type": "code",
200182
"execution_count": null,
201-
"id": "5a219a60-4ea2-45b0-9fa9-3a11f326b8a0",
183+
"id": "3d6adbc5-3959-448f-ae35-c1fca40848c7",
202184
"metadata": {},
203185
"outputs": [],
204186
"source": [
205-
"df2 = pd.merge(\n",
206-
" df,\n",
207-
" stop_arrivals,\n",
208-
" on = [\"trip_instance_key\", \"stop_sequence\"],\n",
209-
" how = \"inner\"\n",
210-
")"
187+
"trips_monotonicity.arrival_time.value_counts()"
211188
]
212189
},
213190
{
214191
"cell_type": "code",
215192
"execution_count": null,
216-
"id": "c5b1f434-cb45-4425-aa8f-7a85c87d3e8d",
193+
"id": "3d8da9cc-957f-4d4f-8e15-b84f1b9f70b1",
217194
"metadata": {},
218195
"outputs": [],
219196
"source": [
220-
"df3 = check_if_surrounding_points_are_ok(df2)"
197+
"fail_trips = trips_monotonicity[\n",
198+
" trips_monotonicity.arrival_time==False\n",
199+
"].sample(25).trip_instance_key.unique()"
221200
]
222201
},
223202
{
224203
"cell_type": "code",
225204
"execution_count": null,
226-
"id": "4c47138d-9129-43ff-b73a-4c494f5be58a",
205+
"id": "8f96f7eb-1083-4eb6-a76d-70debce26884",
227206
"metadata": {},
228207
"outputs": [],
229208
"source": [
230-
"df3[df3.error_arrival_order==1].shape"
209+
"stop_arrivals[stop_arrivals.trip_instance_key==fail_trips[7]]"
231210
]
232211
},
233212
{
234213
"cell_type": "code",
235214
"execution_count": null,
236-
"id": "d913f05d-2df8-4a92-bc6c-3dd3d2e78a37",
215+
"id": "b06b3f04-73eb-4eb3-b8fe-9ae49c8a1c3a",
237216
"metadata": {},
238217
"outputs": [],
239218
"source": [
240-
"df3[(df3.error_arrival_order==1) & \n",
241-
" (df3.prior_error==0) & \n",
242-
" (df3.subseq_error==0)\n",
243-
" ].shape"
219+
"import altair as alt\n",
220+
"\n",
221+
"def plot_stop_arrivals(df, one_trip):\n",
222+
" chart = (alt.Chart(df[df.trip_instance_key==one_trip])\n",
223+
" .mark_line()\n",
224+
" .encode(\n",
225+
" x=\"stop_sequence\",\n",
226+
" y=\"arrival_time:T\",\n",
227+
" tooltip=[\"stop_sequence\", \"arrival_time\"]\n",
228+
" ).properties(title=one_trip)\n",
229+
" .interactive()\n",
230+
" )\n",
231+
" \n",
232+
" display(chart)"
233+
]
234+
},
235+
{
236+
"cell_type": "code",
237+
"execution_count": null,
238+
"id": "58ecf1aa-8833-44f4-a286-e5c87e67b440",
239+
"metadata": {},
240+
"outputs": [],
241+
"source": [
242+
"for t in fail_trips:\n",
243+
" plot_stop_arrivals(stop_arrivals, t)"
244+
]
245+
},
246+
{
247+
"cell_type": "code",
248+
"execution_count": null,
249+
"id": "1bd9d1b1-48f0-46e1-9875-181ea20df66a",
250+
"metadata": {},
251+
"outputs": [],
252+
"source": [
253+
"#trip_stats.sample(10).trip_instance_key.unique()\n",
254+
"subset_trip_keys = [\n",
255+
" '9fad69264acd8387150f45b27d4b2d09',\n",
256+
" '44a55d2fa2588a479065ef7702475ef1',\n",
257+
" '36070a2428e62b96368d072eb2a8fc1b',\n",
258+
" '7f665900c6b0879f4b9bda43b93fefe3',\n",
259+
" '8e8ba9993d52388539d06a46710c1dbc',\n",
260+
" 'b301c2170c1ca49bbc1a9b600cccf643',\n",
261+
" '9373f5b0de977a718dea50fd90443619',\n",
262+
" '8415b3949147c9dc3d5ceb37863440b1',\n",
263+
" '984f598419c1d0830ef4618d495c1bd7',\n",
264+
" '815e4dd921cdcb61ad2dbb1ca5f08a39'\n",
265+
"]"
244266
]
245267
},
246268
{
247269
"cell_type": "code",
248270
"execution_count": null,
249-
"id": "e979265a-780d-496e-b3b0-195cc5058d2b",
271+
"id": "5a219a60-4ea2-45b0-9fa9-3a11f326b8a0",
250272
"metadata": {},
251273
"outputs": [],
252274
"source": [
253-
"df3[df3.can_be_fixed==1].trip_instance_key.unique()[:5]"
275+
"df2 = pd.merge(\n",
276+
" df,\n",
277+
" stop_arrivals,\n",
278+
" on = [\"trip_instance_key\", \"stop_sequence\"],\n",
279+
" how = \"inner\"\n",
280+
")"
254281
]
255282
},
256283
{
@@ -260,7 +287,7 @@
260287
"metadata": {},
261288
"outputs": [],
262289
"source": [
263-
"df3[df3.trip_instance_key==\"00019686e6c7bf335148c8d290feb285\"]"
290+
"df2[df2.trip_instance_key==\"00019686e6c7bf335148c8d290feb285\"]"
264291
]
265292
},
266293
{
@@ -270,7 +297,7 @@
270297
"metadata": {},
271298
"outputs": [],
272299
"source": [
273-
"df3[df3.trip_instance_key==\"0001ad7e1ef246cf6d68599de0fdcaad\"\n",
300+
"df2[df2.trip_instance_key==\"0001ad7e1ef246cf6d68599de0fdcaad\"\n",
274301
" ].tail(10)"
275302
]
276303
},
@@ -309,16 +336,16 @@
309336
" .mark_line()\n",
310337
" .encode(\n",
311338
" x=\"stop_sequence\",\n",
312-
" y=\"error_arrival_order\"\n",
339+
" y=\"nearest_vp_idx_monotonic\"\n",
313340
" ).properties(title=f\"{t}\")\n",
314341
" )\n",
315342
" display(chart)\n",
316343
" \n",
317-
" chart2 = (alt.Chart(subset_df[subset_df.error_arrival_order == 0])\n",
344+
" chart2 = (alt.Chart(subset_df[subset_df.nearest_vp_idx_monotonic == True])\n",
318345
" .mark_line()\n",
319346
" .encode(\n",
320347
" x=\"stop_sequence\",\n",
321-
" y=\"error_same_endpoints\"\n",
348+
" y=\"stop_meters_monotonic\"\n",
322349
" )\n",
323350
" )\n",
324351
" display(chart2)"
@@ -696,7 +723,7 @@
696723
"outputs": [],
697724
"source": [
698725
"df = pd.read_parquet(\n",
699-
" f\"{SEGMENT_GCS}stop_arrivals_speed_{analysis_date}_2.parquet\")"
726+
" f\"{SEGMENT_GCS}speed_stop_segments_{analysis_date}.parquet\")"
700727
]
701728
},
702729
{
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
2023-10-31 12:12:52.626 | INFO | __main__:<module>:99 - Analysis date: 2023-09-13
2-
2023-10-31 12:14:03.894 | INFO | __main__:<module>:134 - set up df with nearest / subseq vp info: 0:01:11.267039
3-
2023-10-31 12:14:57.365 | INFO | __main__:<module>:139 - interpolate stop arrival: 0:00:53.471494
4-
2023-10-31 12:15:05.266 | INFO | __main__:<module>:145 - execution time: 0:02:12.638916
1+
2023-10-31 18:10:00.239 | INFO | __main__:<module>:99 - Analysis date: 2023-09-13
2+
2023-10-31 18:11:18.958 | INFO | __main__:<module>:134 - set up df with nearest / subseq vp info: 0:01:18.690602
3+
2023-10-31 18:12:06.833 | INFO | __main__:<module>:139 - interpolate stop arrival: 0:00:47.874819
4+
2023-10-31 18:12:14.756 | INFO | __main__:<module>:145 - execution time: 0:02:14.488207

rt_segment_speeds/logs/nearest_vp.log

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
2023-10-31 at 09:34:59 | INFO | Analysis date: 2023-09-13
2-
2023-10-31 09:39:45.702 | INFO | __main__:<module>:261 - map partitions to transform vp: 0:04:46.103748
3-
2023-10-31 at 09:39:45 | INFO | map partitions to transform vp: 0:04:46.103748
4-
2023-10-31 09:39:46.981 | INFO | __main__:<module>:293 - map partitions to find nearest vp to stop: 0:00:01.279908
5-
2023-10-31 at 09:39:46 | INFO | map partitions to find nearest vp to stop: 0:00:01.2799082023-10-31 09:46:23.878 | INFO | __main__:<module>:316 - Analysis date: 2023-09-13
6-
2023-10-31 09:51:11.125 | INFO | __main__:find_nearest_vp_to_stop:261 - map partitions to transform vp: 0:04:47.246718
7-
2023-10-31 09:51:11.894 | INFO | __main__:find_nearest_vp_to_stop:293 - map partitions to find nearest vp to stop: 0:00:00.768417
8-
2023-10-31 09:57:34.934 | INFO | __main__:<module>:323 - execution time: 0:11:11.055258
1+
2023-10-31 17:45:52.135 | INFO | __main__:<module>:332 - Analysis date: 2023-09-13
2+
2023-10-31 17:51:23.974 | INFO | __main__:find_nearest_vp_to_stop:277 - map partitions to transform vp: 0:05:31.838490
3+
2023-10-31 17:51:25.093 | INFO | __main__:find_nearest_vp_to_stop:309 - map partitions to find nearest vp to stop: 0:00:01.118975
4+
2023-10-31 17:57:10.858 | INFO | __main__:<module>:337 - Analysis date: 2023-09-13
5+
2023-10-31 18:03:30.506 | INFO | __main__:find_nearest_vp_to_stop:282 - map partitions to transform vp: 0:06:19.646465
6+
2023-10-31 18:03:31.676 | INFO | __main__:find_nearest_vp_to_stop:314 - map partitions to find nearest vp to stop: 0:00:01.170538
7+
2023-10-31 18:08:58.296 | INFO | __main__:<module>:344 - execution time: 0:11:47.436826

rt_segment_speeds/logs/speeds_by_segment_trip.log

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,5 @@
88
2023-10-17 18:34:34.838 | INFO | __main__:<module>:378 - execution time: 0:10:16.928330
99
2023-10-31 12:29:06.200 | INFO | __main__:<module>:23 - Analysis date: 2023-09-13
1010
2023-10-31 12:29:29.129 | INFO | __main__:<module>:69 - execution time: 0:00:22.926565
11+
2023-10-31 18:12:34.943 | INFO | __main__:<module>:23 - Analysis date: 2023-09-13
12+
2023-10-31 18:12:57.436 | INFO | __main__:<module>:69 - execution time: 0:00:22.465316

0 commit comments

Comments
 (0)