|
22 | 22 | "outputs": [],
|
23 | 23 | "source": [
|
24 | 24 | "import dask.dataframe as dd\n",
|
| 25 | + "import geopandas as gpd\n", |
25 | 26 | "import numpy as np\n",
|
26 | 27 | "import pandas as pd\n",
|
27 | 28 | "\n",
|
|
41 | 42 | "id": "c5f369bb-68bf-46a2-86ad-6279872859b1",
|
42 | 43 | "metadata": {},
|
43 | 44 | "source": [
|
44 |
| - "## Between stops, how to find stops behaving not as expected\n", |
| 45 | + "## Between stops, arrival times not behaving as expected\n", |
45 | 46 | "There are erroneous calculations here.\n",
|
46 | 47 | "\n",
|
47 | 48 | "The prior stop's arrival time can't take place **after** the current stop's arrival time. \n",
|
|
69 | 70 | {
|
70 | 71 | "cell_type": "code",
|
71 | 72 | "execution_count": null,
|
72 |
| - "id": "ccf433cf-69e7-476c-a64a-8c999a53858b", |
| 73 | + "id": "d26bb970-8d32-4036-b5f1-8852e5ed4eda", |
73 | 74 | "metadata": {},
|
74 | 75 | "outputs": [],
|
75 | 76 | "source": [
|
76 |
| - "stop_arrivals = pd.read_parquet(\n", |
77 |
| - " f\"{SEGMENT_GCS}{STOP_ARRIVALS}.parquet\",\n", |
78 |
| - " columns = [\"trip_instance_key\", \"stop_sequence\", \"arrival_time\"]\n", |
79 |
| - ")" |
| 77 | + "df.columns" |
80 | 78 | ]
|
81 | 79 | },
|
82 | 80 | {
|
83 | 81 | "cell_type": "code",
|
84 | 82 | "execution_count": null,
|
85 |
| - "id": "c3077ade-87c1-4b9d-8cf7-bbb743a03823", |
| 83 | + "id": "8556a0d8-3f05-4726-9fb2-5dd8864fe751", |
86 | 84 | "metadata": {},
|
87 | 85 | "outputs": [],
|
88 | 86 | "source": [
|
89 |
| - "df.error_arrival_order.value_counts()" |
| 87 | + "df.head()" |
90 | 88 | ]
|
91 | 89 | },
|
92 | 90 | {
|
93 | 91 | "cell_type": "code",
|
94 | 92 | "execution_count": null,
|
95 |
| - "id": "3d0373a4-80fc-49e1-bac3-2edd8c5ae4d0", |
| 93 | + "id": "4e1001f7-32db-427e-859a-9987e499c327", |
96 | 94 | "metadata": {},
|
97 | 95 | "outputs": [],
|
98 | 96 | "source": [
|
99 |
| - "df.error_same_endpoints.value_counts()" |
| 97 | + "pd.crosstab(df.nearest_vp_idx_monotonic, \n", |
| 98 | + " df.stop_meters_monotonic)" |
100 | 99 | ]
|
101 | 100 | },
|
102 | 101 | {
|
103 | 102 | "cell_type": "code",
|
104 | 103 | "execution_count": null,
|
105 |
| - "id": "11d2a032-83db-43b7-a7fb-9254a10ae524", |
| 104 | + "id": "53219886-827b-44e0-a764-45970bf194d0", |
106 | 105 | "metadata": {},
|
107 | 106 | "outputs": [],
|
108 | 107 | "source": [
|
109 |
| - "df[(df.error_same_endpoints==1) & \n", |
110 |
| - " (df.error_arrival_order==1)].shape" |
| 108 | + "pd.crosstab(df.nearest_vp_idx_monotonic, \n", |
| 109 | + " df.stop_meters_monotonic, normalize=True)" |
111 | 110 | ]
|
112 | 111 | },
|
113 | 112 | {
|
114 | 113 | "cell_type": "code",
|
115 | 114 | "execution_count": null,
|
116 |
| - "id": "2ea55941-8d10-49f2-8765-6b2faba4080d", |
| 115 | + "id": "7578edfa-e7a1-4607-8fa5-1d086efdef1c", |
117 | 116 | "metadata": {},
|
118 | 117 | "outputs": [],
|
119 | 118 | "source": [
|
120 |
| - "trip_stats = (df.groupby(\"trip_instance_key\", \n", |
121 |
| - " observed=True, group_keys=False)\n", |
122 |
| - " .agg({\n", |
123 |
| - " \"error_same_endpoints\": \"mean\",\n", |
124 |
| - " \"error_arrival_order\": \"mean\"\n", |
125 |
| - " }).reset_index()\n", |
126 |
| - " )" |
| 119 | + "# Case 1: this is the largest group of errors, and \n", |
| 120 | + "# should be easier to fix" |
127 | 121 | ]
|
128 | 122 | },
|
129 | 123 | {
|
130 | 124 | "cell_type": "code",
|
131 | 125 | "execution_count": null,
|
132 |
| - "id": "83fde1e4-29b1-43ab-b30c-f98ec63a87c8", |
| 126 | + "id": "b923bfdf-df03-49cf-94bf-f5ee2270714d", |
133 | 127 | "metadata": {},
|
134 | 128 | "outputs": [],
|
135 | 129 | "source": [
|
136 |
| - "# Very few trips are completely error-free\n", |
137 |
| - "trip_stats[(trip_stats.error_same_endpoints==0) & \n", |
138 |
| - " (trip_stats.error_arrival_order==0)].shape" |
| 130 | + "df[(df.nearest_vp_idx_monotonic==False) &\n", |
| 131 | + " (df.stop_meters_monotonic==True)]" |
139 | 132 | ]
|
140 | 133 | },
|
141 | 134 | {
|
142 | 135 | "cell_type": "code",
|
143 | 136 | "execution_count": null,
|
144 |
| - "id": "1bd9d1b1-48f0-46e1-9875-181ea20df66a", |
| 137 | + "id": "689e546b-7df6-49e3-bbcc-7594a41fd32e", |
145 | 138 | "metadata": {},
|
146 | 139 | "outputs": [],
|
147 | 140 | "source": [
|
148 |
| - "#trip_stats.sample(10).trip_instance_key.unique()\n", |
149 |
| - "subset_trip_keys = [\n", |
150 |
| - " '9fad69264acd8387150f45b27d4b2d09',\n", |
151 |
| - " '44a55d2fa2588a479065ef7702475ef1',\n", |
152 |
| - " '36070a2428e62b96368d072eb2a8fc1b',\n", |
153 |
| - " '7f665900c6b0879f4b9bda43b93fefe3',\n", |
154 |
| - " '8e8ba9993d52388539d06a46710c1dbc',\n", |
155 |
| - " 'b301c2170c1ca49bbc1a9b600cccf643',\n", |
156 |
| - " '9373f5b0de977a718dea50fd90443619',\n", |
157 |
| - " '8415b3949147c9dc3d5ceb37863440b1',\n", |
158 |
| - " '984f598419c1d0830ef4618d495c1bd7',\n", |
159 |
| - " '815e4dd921cdcb61ad2dbb1ca5f08a39'\n", |
160 |
| - "]" |
| 141 | + "df[df.stop_meters_monotonic==False][[\n", |
| 142 | + " \"stop_sequence\", \n", |
| 143 | + " \"nearest_vp_idx\",\n", |
| 144 | + " \"rolling_nearest_vp_idx\", \"nearest_vp_idx_monotonic\", \n", |
| 145 | + " \"stop_meters\", \"rolling_stop_meters\", \n", |
| 146 | + " \"stop_meters_monotonic\"\n", |
| 147 | + "]]" |
161 | 148 | ]
|
162 | 149 | },
|
163 | 150 | {
|
164 | 151 | "cell_type": "code",
|
165 | 152 | "execution_count": null,
|
166 |
| - "id": "212eaa5d-735c-4332-b25d-e1883ee48f15", |
| 153 | + "id": "ccf433cf-69e7-476c-a64a-8c999a53858b", |
167 | 154 | "metadata": {},
|
168 | 155 | "outputs": [],
|
169 | 156 | "source": [
|
170 |
| - "def check_if_surrounding_points_are_ok(df: pd.DataFrame):\n", |
171 |
| - " grouped_df = df.groupby(\"trip_instance_key\", \n", |
172 |
| - " observed=True, group_keys=False\n", |
173 |
| - " )\n", |
174 |
| - " df = df.assign(\n", |
175 |
| - " prior_error = (grouped_df\n", |
176 |
| - " .error_arrival_order\n", |
177 |
| - " .shift(1)\n", |
178 |
| - " ),\n", |
179 |
| - " subseq_error = (grouped_df\n", |
180 |
| - " .error_arrival_order\n", |
181 |
| - " .shift(-1)\n", |
182 |
| - " )\n", |
183 |
| - " )\n", |
184 |
| - " \n", |
185 |
| - " df = df.assign(\n", |
186 |
| - " can_be_fixed = df.apply(\n", |
187 |
| - " lambda x:\n", |
188 |
| - " 1 if (x.error_arrival_order==1) and\n", |
189 |
| - " (x.prior_error==0) and (x.subseq_error==0)\n", |
190 |
| - " else 0, axis=1\n", |
191 |
| - " )\n", |
192 |
| - " )\n", |
| 157 | + "# How to use stop arrivals to constrain the wrong arrival times that occur\n", |
| 158 | + "# in the middle of the trip?\n", |
| 159 | + "stop_arrivals = pd.read_parquet(\n", |
| 160 | + " f\"{SEGMENT_GCS}{STOP_ARRIVALS}.parquet\",\n", |
| 161 | + " columns = [\"trip_instance_key\", \"stop_sequence\", \"arrival_time\"]\n", |
| 162 | + ")" |
| 163 | + ] |
| 164 | + }, |
| 165 | + { |
| 166 | + "cell_type": "code", |
| 167 | + "execution_count": null, |
| 168 | + "id": "7f80f709-1a58-4608-af7e-4295ad647bdb", |
| 169 | + "metadata": {}, |
| 170 | + "outputs": [], |
| 171 | + "source": [ |
| 172 | + "trips_monotonicity = (stop_arrivals.groupby(\"trip_instance_key\")\n", |
| 173 | + " .arrival_time\n", |
| 174 | + " .is_monotonic_increasing\n", |
| 175 | + " ).to_frame().reset_index()\n", |
193 | 176 | "\n",
|
194 |
| - " return df\n", |
195 |
| - " " |
| 177 | + "trips_monotonicity" |
196 | 178 | ]
|
197 | 179 | },
|
198 | 180 | {
|
199 | 181 | "cell_type": "code",
|
200 | 182 | "execution_count": null,
|
201 |
| - "id": "5a219a60-4ea2-45b0-9fa9-3a11f326b8a0", |
| 183 | + "id": "3d6adbc5-3959-448f-ae35-c1fca40848c7", |
202 | 184 | "metadata": {},
|
203 | 185 | "outputs": [],
|
204 | 186 | "source": [
|
205 |
| - "df2 = pd.merge(\n", |
206 |
| - " df,\n", |
207 |
| - " stop_arrivals,\n", |
208 |
| - " on = [\"trip_instance_key\", \"stop_sequence\"],\n", |
209 |
| - " how = \"inner\"\n", |
210 |
| - ")" |
| 187 | + "trips_monotonicity.arrival_time.value_counts()" |
211 | 188 | ]
|
212 | 189 | },
|
213 | 190 | {
|
214 | 191 | "cell_type": "code",
|
215 | 192 | "execution_count": null,
|
216 |
| - "id": "c5b1f434-cb45-4425-aa8f-7a85c87d3e8d", |
| 193 | + "id": "3d8da9cc-957f-4d4f-8e15-b84f1b9f70b1", |
217 | 194 | "metadata": {},
|
218 | 195 | "outputs": [],
|
219 | 196 | "source": [
|
220 |
| - "df3 = check_if_surrounding_points_are_ok(df2)" |
| 197 | + "fail_trips = trips_monotonicity[\n", |
| 198 | + " trips_monotonicity.arrival_time==False\n", |
| 199 | + "].sample(25).trip_instance_key.unique()" |
221 | 200 | ]
|
222 | 201 | },
|
223 | 202 | {
|
224 | 203 | "cell_type": "code",
|
225 | 204 | "execution_count": null,
|
226 |
| - "id": "4c47138d-9129-43ff-b73a-4c494f5be58a", |
| 205 | + "id": "8f96f7eb-1083-4eb6-a76d-70debce26884", |
227 | 206 | "metadata": {},
|
228 | 207 | "outputs": [],
|
229 | 208 | "source": [
|
230 |
| - "df3[df3.error_arrival_order==1].shape" |
| 209 | + "stop_arrivals[stop_arrivals.trip_instance_key==fail_trips[7]]" |
231 | 210 | ]
|
232 | 211 | },
|
233 | 212 | {
|
234 | 213 | "cell_type": "code",
|
235 | 214 | "execution_count": null,
|
236 |
| - "id": "d913f05d-2df8-4a92-bc6c-3dd3d2e78a37", |
| 215 | + "id": "b06b3f04-73eb-4eb3-b8fe-9ae49c8a1c3a", |
237 | 216 | "metadata": {},
|
238 | 217 | "outputs": [],
|
239 | 218 | "source": [
|
240 |
| - "df3[(df3.error_arrival_order==1) & \n", |
241 |
| - " (df3.prior_error==0) & \n", |
242 |
| - " (df3.subseq_error==0)\n", |
243 |
| - " ].shape" |
| 219 | + "import altair as alt\n", |
| 220 | + "\n", |
| 221 | + "def plot_stop_arrivals(df, one_trip):\n", |
| 222 | + " chart = (alt.Chart(df[df.trip_instance_key==one_trip])\n", |
| 223 | + " .mark_line()\n", |
| 224 | + " .encode(\n", |
| 225 | + " x=\"stop_sequence\",\n", |
| 226 | + " y=\"arrival_time:T\",\n", |
| 227 | + " tooltip=[\"stop_sequence\", \"arrival_time\"]\n", |
| 228 | + " ).properties(title=one_trip)\n", |
| 229 | + " .interactive()\n", |
| 230 | + " )\n", |
| 231 | + " \n", |
| 232 | + " display(chart)" |
| 233 | + ] |
| 234 | + }, |
| 235 | + { |
| 236 | + "cell_type": "code", |
| 237 | + "execution_count": null, |
| 238 | + "id": "58ecf1aa-8833-44f4-a286-e5c87e67b440", |
| 239 | + "metadata": {}, |
| 240 | + "outputs": [], |
| 241 | + "source": [ |
| 242 | + "for t in fail_trips:\n", |
| 243 | + " plot_stop_arrivals(stop_arrivals, t)" |
| 244 | + ] |
| 245 | + }, |
| 246 | + { |
| 247 | + "cell_type": "code", |
| 248 | + "execution_count": null, |
| 249 | + "id": "1bd9d1b1-48f0-46e1-9875-181ea20df66a", |
| 250 | + "metadata": {}, |
| 251 | + "outputs": [], |
| 252 | + "source": [ |
| 253 | + "#trip_stats.sample(10).trip_instance_key.unique()\n", |
| 254 | + "subset_trip_keys = [\n", |
| 255 | + " '9fad69264acd8387150f45b27d4b2d09',\n", |
| 256 | + " '44a55d2fa2588a479065ef7702475ef1',\n", |
| 257 | + " '36070a2428e62b96368d072eb2a8fc1b',\n", |
| 258 | + " '7f665900c6b0879f4b9bda43b93fefe3',\n", |
| 259 | + " '8e8ba9993d52388539d06a46710c1dbc',\n", |
| 260 | + " 'b301c2170c1ca49bbc1a9b600cccf643',\n", |
| 261 | + " '9373f5b0de977a718dea50fd90443619',\n", |
| 262 | + " '8415b3949147c9dc3d5ceb37863440b1',\n", |
| 263 | + " '984f598419c1d0830ef4618d495c1bd7',\n", |
| 264 | + " '815e4dd921cdcb61ad2dbb1ca5f08a39'\n", |
| 265 | + "]" |
244 | 266 | ]
|
245 | 267 | },
|
246 | 268 | {
|
247 | 269 | "cell_type": "code",
|
248 | 270 | "execution_count": null,
|
249 |
| - "id": "e979265a-780d-496e-b3b0-195cc5058d2b", |
| 271 | + "id": "5a219a60-4ea2-45b0-9fa9-3a11f326b8a0", |
250 | 272 | "metadata": {},
|
251 | 273 | "outputs": [],
|
252 | 274 | "source": [
|
253 |
| - "df3[df3.can_be_fixed==1].trip_instance_key.unique()[:5]" |
| 275 | + "df2 = pd.merge(\n", |
| 276 | + " df,\n", |
| 277 | + " stop_arrivals,\n", |
| 278 | + " on = [\"trip_instance_key\", \"stop_sequence\"],\n", |
| 279 | + " how = \"inner\"\n", |
| 280 | + ")" |
254 | 281 | ]
|
255 | 282 | },
|
256 | 283 | {
|
|
260 | 287 | "metadata": {},
|
261 | 288 | "outputs": [],
|
262 | 289 | "source": [
|
263 |
| - "df3[df3.trip_instance_key==\"00019686e6c7bf335148c8d290feb285\"]" |
| 290 | + "df2[df2.trip_instance_key==\"00019686e6c7bf335148c8d290feb285\"]" |
264 | 291 | ]
|
265 | 292 | },
|
266 | 293 | {
|
|
270 | 297 | "metadata": {},
|
271 | 298 | "outputs": [],
|
272 | 299 | "source": [
|
273 |
| - "df3[df3.trip_instance_key==\"0001ad7e1ef246cf6d68599de0fdcaad\"\n", |
| 300 | + "df2[df2.trip_instance_key==\"0001ad7e1ef246cf6d68599de0fdcaad\"\n", |
274 | 301 | " ].tail(10)"
|
275 | 302 | ]
|
276 | 303 | },
|
|
309 | 336 | " .mark_line()\n",
|
310 | 337 | " .encode(\n",
|
311 | 338 | " x=\"stop_sequence\",\n",
|
312 |
| - " y=\"error_arrival_order\"\n", |
| 339 | + " y=\"nearest_vp_idx_monotonic\"\n", |
313 | 340 | " ).properties(title=f\"{t}\")\n",
|
314 | 341 | " )\n",
|
315 | 342 | " display(chart)\n",
|
316 | 343 | " \n",
|
317 |
| - " chart2 = (alt.Chart(subset_df[subset_df.error_arrival_order == 0])\n", |
| 344 | + " chart2 = (alt.Chart(subset_df[subset_df.nearest_vp_idx_monotonic == True])\n", |
318 | 345 | " .mark_line()\n",
|
319 | 346 | " .encode(\n",
|
320 | 347 | " x=\"stop_sequence\",\n",
|
321 |
| - " y=\"error_same_endpoints\"\n", |
| 348 | + " y=\"stop_meters_monotonic\"\n", |
322 | 349 | " )\n",
|
323 | 350 | " )\n",
|
324 | 351 | " display(chart2)"
|
|
696 | 723 | "outputs": [],
|
697 | 724 | "source": [
|
698 | 725 | "df = pd.read_parquet(\n",
|
699 |
| - " f\"{SEGMENT_GCS}stop_arrivals_speed_{analysis_date}_2.parquet\")" |
| 726 | + " f\"{SEGMENT_GCS}speed_stop_segments_{analysis_date}.parquet\")" |
700 | 727 | ]
|
701 | 728 | },
|
702 | 729 | {
|
|
0 commit comments