Merge pull request #941 from cal-itp/find-stop-arrival-errors

tiffanychu90 · web-flow · commit 64afd3b9ed1e · 2023-11-01T14:46:44.000-07:00
Find stop arrival errors
diff --git a/rt_segment_speeds/25_interpolation_issues.ipynb b/rt_segment_speeds/25_interpolation_issues.ipynb
@@ -22,6 +22,7 @@
    "outputs": [],
    "source": [
     "import dask.dataframe as dd\n",
+    "import geopandas as gpd\n",
     "import numpy as np\n",
     "import pandas as pd\n",
     "\n",
@@ -41,7 +42,7 @@
    "id": "c5f369bb-68bf-46a2-86ad-6279872859b1",
    "metadata": {},
    "source": [
-    "## Between stops, how to find stops behaving not as expected\n",
+    "## Between stops, arrival times not behaving as expected\n",
     "There are erroneous calculations here.\n",
     "\n",
     "Prior arrival time can't take place **after** arrival time. \n",
@@ -69,188 +70,214 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ccf433cf-69e7-476c-a64a-8c999a53858b",
+   "id": "d26bb970-8d32-4036-b5f1-8852e5ed4eda",
    "metadata": {},
    "outputs": [],
    "source": [
-    "stop_arrivals = pd.read_parquet(\n",
-    "    f\"{SEGMENT_GCS}{STOP_ARRIVALS}.parquet\",\n",
-    "    columns = [\"trip_instance_key\", \"stop_sequence\", \"arrival_time\"]\n",
-    ")"
+    "df.columns"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c3077ade-87c1-4b9d-8cf7-bbb743a03823",
+   "id": "8556a0d8-3f05-4726-9fb2-5dd8864fe751",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df.error_arrival_order.value_counts()"
+    "df.head()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3d0373a4-80fc-49e1-bac3-2edd8c5ae4d0",
+   "id": "4e1001f7-32db-427e-859a-9987e499c327",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df.error_same_endpoints.value_counts()"
+    "pd.crosstab(df.nearest_vp_idx_monotonic, \n",
+    "            df.stop_meters_monotonic)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "11d2a032-83db-43b7-a7fb-9254a10ae524",
+   "id": "53219886-827b-44e0-a764-45970bf194d0",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df[(df.error_same_endpoints==1) & \n",
-    "   (df.error_arrival_order==1)].shape"
+    "pd.crosstab(df.nearest_vp_idx_monotonic, \n",
+    "            df.stop_meters_monotonic, normalize=True)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2ea55941-8d10-49f2-8765-6b2faba4080d",
+   "id": "7578edfa-e7a1-4607-8fa5-1d086efdef1c",
    "metadata": {},
    "outputs": [],
    "source": [
-    "trip_stats = (df.groupby(\"trip_instance_key\", \n",
-    "                         observed=True, group_keys=False)\n",
-    "              .agg({\n",
-    "                  \"error_same_endpoints\": \"mean\",\n",
-    "                  \"error_arrival_order\": \"mean\"\n",
-    "              }).reset_index()\n",
-    "             )"
+    "# Case 1: this is the largest group of errors, and \n",
+    "# should be easier to fix"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "83fde1e4-29b1-43ab-b30c-f98ec63a87c8",
+   "id": "b923bfdf-df03-49cf-94bf-f5ee2270714d",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Very few trips are completely error-free\n",
-    "trip_stats[(trip_stats.error_same_endpoints==0) & \n",
-    "           (trip_stats.error_arrival_order==0)].shape"
+    "df[(df.nearest_vp_idx_monotonic==False) &\n",
+    "    (df.stop_meters_monotonic==True)]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1bd9d1b1-48f0-46e1-9875-181ea20df66a",
+   "id": "689e546b-7df6-49e3-bbcc-7594a41fd32e",
    "metadata": {},
    "outputs": [],
    "source": [
-    "#trip_stats.sample(10).trip_instance_key.unique()\n",
-    "subset_trip_keys = [\n",
-    "    '9fad69264acd8387150f45b27d4b2d09',\n",
-    "    '44a55d2fa2588a479065ef7702475ef1',\n",
-    "    '36070a2428e62b96368d072eb2a8fc1b',\n",
-    "    '7f665900c6b0879f4b9bda43b93fefe3',\n",
-    "    '8e8ba9993d52388539d06a46710c1dbc',\n",
-    "    'b301c2170c1ca49bbc1a9b600cccf643',\n",
-    "    '9373f5b0de977a718dea50fd90443619',\n",
-    "    '8415b3949147c9dc3d5ceb37863440b1',\n",
-    "    '984f598419c1d0830ef4618d495c1bd7',\n",
-    "    '815e4dd921cdcb61ad2dbb1ca5f08a39'\n",
-    "]"
+    "df[df.stop_meters_monotonic==False][[\n",
+    "    \"stop_sequence\", \n",
+    "    \"nearest_vp_idx\",\n",
+    "    \"rolling_nearest_vp_idx\", \"nearest_vp_idx_monotonic\", \n",
+    "    \"stop_meters\", \"rolling_stop_meters\", \n",
+    "    \"stop_meters_monotonic\"\n",
+    "]]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "212eaa5d-735c-4332-b25d-e1883ee48f15",
+   "id": "ccf433cf-69e7-476c-a64a-8c999a53858b",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def check_if_surrounding_points_are_ok(df: pd.DataFrame):\n",
-    "    grouped_df = df.groupby(\"trip_instance_key\", \n",
-    "                            observed=True, group_keys=False\n",
-    "                           )\n",
-    "    df = df.assign(\n",
-    "        prior_error = (grouped_df\n",
-    "                       .error_arrival_order\n",
-    "                       .shift(1)\n",
-    "                      ),\n",
-    "        subseq_error = (grouped_df\n",
-    "                        .error_arrival_order\n",
-    "                        .shift(-1)\n",
-    "                       )\n",
-    "    )\n",
-    "    \n",
-    "    df = df.assign(\n",
-    "        can_be_fixed = df.apply(\n",
-    "            lambda x:\n",
-    "            1 if (x.error_arrival_order==1) and\n",
-    "            (x.prior_error==0) and (x.subseq_error==0)\n",
-    "            else 0, axis=1\n",
-    "        )\n",
-    "    )\n",
+    "# How to use stop arrivals to constrain the wrong arrival times that occur\n",
+    "# in the middle of the trip?\n",
+    "stop_arrivals = pd.read_parquet(\n",
+    "    f\"{SEGMENT_GCS}{STOP_ARRIVALS}.parquet\",\n",
+    "    columns = [\"trip_instance_key\", \"stop_sequence\", \"arrival_time\"]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f80f709-1a58-4608-af7e-4295ad647bdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trips_monotonicity = (stop_arrivals.groupby(\"trip_instance_key\")\n",
+    "                       .arrival_time\n",
+    "                       .is_monotonic_increasing\n",
+    "                      ).to_frame().reset_index()\n",
     "\n",
-    "    return df\n",
-    "    "
+    "trips_monotonicity"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5a219a60-4ea2-45b0-9fa9-3a11f326b8a0",
+   "id": "3d6adbc5-3959-448f-ae35-c1fca40848c7",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df2 = pd.merge(\n",
-    "    df,\n",
-    "    stop_arrivals,\n",
-    "    on = [\"trip_instance_key\", \"stop_sequence\"],\n",
-    "    how = \"inner\"\n",
-    ")"
+    "trips_monotonicity.arrival_time.value_counts()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c5b1f434-cb45-4425-aa8f-7a85c87d3e8d",
+   "id": "3d8da9cc-957f-4d4f-8e15-b84f1b9f70b1",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df3 = check_if_surrounding_points_are_ok(df2)"
+    "fail_trips = trips_monotonicity[\n",
+    "    trips_monotonicity.arrival_time==False\n",
+    "].sample(25).trip_instance_key.unique()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4c47138d-9129-43ff-b73a-4c494f5be58a",
+   "id": "8f96f7eb-1083-4eb6-a76d-70debce26884",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df3[df3.error_arrival_order==1].shape"
+    "stop_arrivals[stop_arrivals.trip_instance_key==fail_trips[7]]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d913f05d-2df8-4a92-bc6c-3dd3d2e78a37",
+   "id": "b06b3f04-73eb-4eb3-b8fe-9ae49c8a1c3a",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df3[(df3.error_arrival_order==1) & \n",
-    "    (df3.prior_error==0) & \n",
-    "    (df3.subseq_error==0)\n",
-    "   ].shape"
+    "import altair as alt\n",
+    "\n",
+    "def plot_stop_arrivals(df, one_trip):\n",
+    "    chart = (alt.Chart(df[df.trip_instance_key==one_trip])\n",
+    "             .mark_line()\n",
+    "             .encode(\n",
+    "                 x=\"stop_sequence\",\n",
+    "                 y=\"arrival_time:T\",\n",
+    "                 tooltip=[\"stop_sequence\", \"arrival_time\"]\n",
+    "             ).properties(title=one_trip)\n",
+    "             .interactive()\n",
+    "            )\n",
+    "    \n",
+    "    display(chart)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "58ecf1aa-8833-44f4-a286-e5c87e67b440",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for t in fail_trips:\n",
+    "    plot_stop_arrivals(stop_arrivals, t)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1bd9d1b1-48f0-46e1-9875-181ea20df66a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#trip_stats.sample(10).trip_instance_key.unique()\n",
+    "subset_trip_keys = [\n",
+    "    '9fad69264acd8387150f45b27d4b2d09',\n",
+    "    '44a55d2fa2588a479065ef7702475ef1',\n",
+    "    '36070a2428e62b96368d072eb2a8fc1b',\n",
+    "    '7f665900c6b0879f4b9bda43b93fefe3',\n",
+    "    '8e8ba9993d52388539d06a46710c1dbc',\n",
+    "    'b301c2170c1ca49bbc1a9b600cccf643',\n",
+    "    '9373f5b0de977a718dea50fd90443619',\n",
+    "    '8415b3949147c9dc3d5ceb37863440b1',\n",
+    "    '984f598419c1d0830ef4618d495c1bd7',\n",
+    "    '815e4dd921cdcb61ad2dbb1ca5f08a39'\n",
+    "]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "e979265a-780d-496e-b3b0-195cc5058d2b",
+   "id": "5a219a60-4ea2-45b0-9fa9-3a11f326b8a0",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df3[df3.can_be_fixed==1].trip_instance_key.unique()[:5]"
+    "df2 = pd.merge(\n",
+    "    df,\n",
+    "    stop_arrivals,\n",
+    "    on = [\"trip_instance_key\", \"stop_sequence\"],\n",
+    "    how = \"inner\"\n",
+    ")"
    ]
   },
   {
@@ -260,7 +287,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df3[df3.trip_instance_key==\"00019686e6c7bf335148c8d290feb285\"]"
+    "df2[df2.trip_instance_key==\"00019686e6c7bf335148c8d290feb285\"]"
    ]
   },
   {
@@ -270,7 +297,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df3[df3.trip_instance_key==\"0001ad7e1ef246cf6d68599de0fdcaad\"\n",
+    "df2[df2.trip_instance_key==\"0001ad7e1ef246cf6d68599de0fdcaad\"\n",
     "   ].tail(10)"
    ]
   },
@@ -309,16 +336,16 @@
     "             .mark_line()\n",
     "             .encode(\n",
     "                 x=\"stop_sequence\",\n",
-    "                 y=\"error_arrival_order\"\n",
+    "                 y=\"nearest_vp_idx_monotonic\"\n",
     "             ).properties(title=f\"{t}\")\n",
     "    )\n",
     "    display(chart)\n",
     "    \n",
-    "    chart2 = (alt.Chart(subset_df[subset_df.error_arrival_order == 0])\n",
+    "    chart2 = (alt.Chart(subset_df[subset_df.nearest_vp_idx_monotonic == True])\n",
     "              .mark_line()\n",
     "              .encode(\n",
     "                  x=\"stop_sequence\",\n",
-    "                  y=\"error_same_endpoints\"\n",
+    "                  y=\"stop_meters_monotonic\"\n",
     "              )\n",
     "    )\n",
     "    display(chart2)"
@@ -696,7 +723,7 @@
    "outputs": [],
    "source": [
     "df = pd.read_parquet(\n",
-    "    f\"{SEGMENT_GCS}stop_arrivals_speed_{analysis_date}_2.parquet\")"
+    "    f\"{SEGMENT_GCS}speed_stop_segments_{analysis_date}.parquet\")"
    ]
   },
   {
diff --git a/rt_segment_speeds/logs/interpolate_stop_arrival.log b/rt_segment_speeds/logs/interpolate_stop_arrival.log
@@ -1,4 +1,4 @@
-2023-10-31 12:12:52.626 | INFO     | __main__:<module>:99 - Analysis date: 2023-09-13
-2023-10-31 12:14:03.894 | INFO     | __main__:<module>:134 - set up df with nearest / subseq vp info: 0:01:11.267039
-2023-10-31 12:14:57.365 | INFO     | __main__:<module>:139 - interpolate stop arrival: 0:00:53.471494
-2023-10-31 12:15:05.266 | INFO     | __main__:<module>:145 - execution time: 0:02:12.638916
+2023-10-31 18:10:00.239 | INFO     | __main__:<module>:99 - Analysis date: 2023-09-13
+2023-10-31 18:11:18.958 | INFO     | __main__:<module>:134 - set up df with nearest / subseq vp info: 0:01:18.690602
+2023-10-31 18:12:06.833 | INFO     | __main__:<module>:139 - interpolate stop arrival: 0:00:47.874819
+2023-10-31 18:12:14.756 | INFO     | __main__:<module>:145 - execution time: 0:02:14.488207
diff --git a/rt_segment_speeds/logs/nearest_vp.log b/rt_segment_speeds/logs/nearest_vp.log
@@ -1,8 +1,7 @@
-2023-10-31 at 09:34:59 | INFO | Analysis date: 2023-09-13
-2023-10-31 09:39:45.702 | INFO     | __main__:<module>:261 - map partitions to transform vp: 0:04:46.103748
-2023-10-31 at 09:39:45 | INFO | map partitions to transform vp: 0:04:46.103748
-2023-10-31 09:39:46.981 | INFO     | __main__:<module>:293 - map partitions to find nearest vp to stop: 0:00:01.279908
-2023-10-31 at 09:39:46 | INFO | map partitions to find nearest vp to stop: 0:00:01.2799082023-10-31 09:46:23.878 | INFO     | __main__:<module>:316 - Analysis date: 2023-09-13
-2023-10-31 09:51:11.125 | INFO     | __main__:find_nearest_vp_to_stop:261 - map partitions to transform vp: 0:04:47.246718
-2023-10-31 09:51:11.894 | INFO     | __main__:find_nearest_vp_to_stop:293 - map partitions to find nearest vp to stop: 0:00:00.768417
-2023-10-31 09:57:34.934 | INFO     | __main__:<module>:323 - execution time: 0:11:11.055258
+2023-10-31 17:45:52.135 | INFO     | __main__:<module>:332 - Analysis date: 2023-09-13
+2023-10-31 17:51:23.974 | INFO     | __main__:find_nearest_vp_to_stop:277 - map partitions to transform vp: 0:05:31.838490
+2023-10-31 17:51:25.093 | INFO     | __main__:find_nearest_vp_to_stop:309 - map partitions to find nearest vp to stop: 0:00:01.118975
+2023-10-31 17:57:10.858 | INFO     | __main__:<module>:337 - Analysis date: 2023-09-13
+2023-10-31 18:03:30.506 | INFO     | __main__:find_nearest_vp_to_stop:282 - map partitions to transform vp: 0:06:19.646465
+2023-10-31 18:03:31.676 | INFO     | __main__:find_nearest_vp_to_stop:314 - map partitions to find nearest vp to stop: 0:00:01.170538
+2023-10-31 18:08:58.296 | INFO     | __main__:<module>:344 - execution time: 0:11:47.436826
diff --git a/rt_segment_speeds/logs/speeds_by_segment_trip.log b/rt_segment_speeds/logs/speeds_by_segment_trip.log
@@ -8,3 +8,5 @@
 2023-10-17 18:34:34.838 | INFO     | __main__:<module>:378 - execution time: 0:10:16.928330
 2023-10-31 12:29:06.200 | INFO     | __main__:<module>:23 - Analysis date: 2023-09-13
 2023-10-31 12:29:29.129 | INFO     | __main__:<module>:69 - execution time: 0:00:22.926565
+2023-10-31 18:12:34.943 | INFO     | __main__:<module>:23 - Analysis date: 2023-09-13
+2023-10-31 18:12:57.436 | INFO     | __main__:<module>:69 - execution time: 0:00:22.465316
diff --git a/rt_segment_speeds/scripts/handle_common_errors.py b/rt_segment_speeds/scripts/handle_common_errors.py
diff --git a/rt_segment_speeds/scripts/nearest_vp_to_stop.py b/rt_segment_speeds/scripts/nearest_vp_to_stop.py