diff --git a/rt_segment_speeds/21_segment_comparison.ipynb b/rt_segment_speeds/21_segment_comparison.ipynb new file mode 100644 index 000000000..1871a6ad9 --- /dev/null +++ b/rt_segment_speeds/21_segment_comparison.ipynb @@ -0,0 +1,331 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1b6a55f7-2ffd-4c4c-bbee-106403d9f27c", + "metadata": {}, + "source": [ + "# Select a couple of simpler trips to compare\n", + "\n", + "* Compare methodologies, which differ when handling more complex shapes\n", + "* But is it different even in simpler shapes?\n", + "* Start with Big Blue Bus and LA Metro" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bd13ebd1-69b0-4fc2-8202-cc34eacb6e9e", + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "import pandas as pd\n", + "\n", + "from shared_utils import rt_dates, rt_utils\n", + "from segment_speed_utils.project_vars import SEGMENT_GCS\n", + "\n", + "from prep_comparison import map_one_trip\n", + "\n", + "analysis_date = rt_dates.DATES[\"sep2023\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "369c795d-5a7a-4471-9432-c0338f430b27", + "metadata": {}, + "outputs": [], + "source": [ + "df_eric = gpd.read_parquet(f\"{SEGMENT_GCS}speeds_eric_{analysis_date}.parquet\")\n", + "df_tiff = gpd.read_parquet(f\"{SEGMENT_GCS}speeds_tiff_{analysis_date}.parquet\")\n", + "speed_df = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}speeds_comparison_{analysis_date}.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "3c362785-d64b-489b-ab5d-071525dd488d", + "metadata": {}, + "source": [ + "## Side-by-Side Maps" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9b08e29e-b792-4187-9c2b-6c44b0c72ee9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((151014, 23), (155314, 24))" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_eric.shape, df_tiff.shape " + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "79c973ab-d9b9-4da5-bf6b-30daaded0e84", + "metadata": {}, + "outputs": [], + "source": [ + "#df_tiff.route_id.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58421989-2f26-44de-8431-b11185246d69", + "metadata": {}, + "outputs": [], + "source": [ + "#one_route = \"96-13168\"\n", + "#df_tiff[df_tiff.route_id==one_route].explore(\n", + "# \"route_id\", tiles = \"CartoDB Positron\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f3c7cc5-b186-41db-9777-6df4da95486f", + "metadata": {}, + "outputs": [], + "source": [ + "#df_tiff[df_tiff.route_id==one_route].trip_id.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fcaa69c2-72e3-4314-b92b-38ef939c2d3b", + "metadata": {}, + "outputs": [], + "source": [ + "trips_to_try = {\n", + " \"metro_720\": \"10720012750651-JUNE23\", #route_id: 720-13168\n", + " \"metro_901\": \"10901000590843-JUNE23\", #route_id: 901-13168\n", + " \"metro_550\": \"10550001350610-JUNE23\", # route_id: 550-13168\n", + " \"metro_230\": \"10230000830600-JUNE23\", # route_id: 230-13168\n", + " \"metro_96\": \"10096002510743-JUNE23\", # route_id: 96-13168\n", + " \"bbb1\": \"908521\", # route_id: 3639\n", + " #\"bbb2\": \"\", #route_id\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f20cce34-b4e9-40a1-8845-259b8d5b4e41", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "map_one_trip(df_eric, trips_to_try[\"metro_720\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2feb428a-a676-413c-8777-3306de7cca67", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "map_one_trip(df_tiff, trips_to_try[\"metro_720\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "320ad850-c27c-4e8a-894e-2155aaece14b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "map_one_trip(df_eric, trips_to_try[\"metro_901\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e884a2a8-4473-4e28-9d03-76746a0014c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "map_one_trip(df_tiff, trips_to_try[\"metro_901\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db2c3e5d-eee4-4384-ab1d-058727175f61", + "metadata": {}, + "outputs": [], + "source": [ + "map_one_trip(df_eric, trips_to_try[\"metro_550\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "300a67c3-fe11-4b2c-a3e2-8e76f3df27b5", + "metadata": {}, + "outputs": [], + "source": [ + "map_one_trip(df_tiff, trips_to_try[\"metro_550\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34219b56-0b5f-4530-bfa1-3d5aad3ae8da", + "metadata": {}, + "outputs": [], + "source": [ + "map_one_trip(df_eric, trips_to_try[\"metro_230\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "114a5b60-f405-4eff-965a-51d7d463fcdc", + "metadata": {}, + "outputs": [], + "source": [ + "map_one_trip(df_tiff, trips_to_try[\"metro_230\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1786d445-5e5e-4eb9-b116-028a3c3814b1", + "metadata": {}, + "outputs": [], + "source": [ + "map_one_trip(df_eric, trips_to_try[\"metro_96\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12c49bcb-5c75-4726-bd93-608ce1d73029", + "metadata": {}, + "outputs": [], + "source": [ + "map_one_trip(df_tiff, trips_to_try[\"metro_96\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aec93764-f09f-41b2-9e59-41f881cb107f", + "metadata": {}, + "outputs": [], + "source": [ + "map_one_trip(df_eric, trips_to_try[\"bbb1\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "314b7816-951c-4712-9e1b-a23197618acf", + "metadata": {}, + "outputs": [], + "source": [ + "map_one_trip(df_tiff, trips_to_try[\"bbb1\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"32052ff7-5f7d-4b73-864b-2e7cebc1b925",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/rt_segment_speeds/prep_comparison.py b/rt_segment_speeds/prep_comparison.py
new file mode 100644
index 000000000..e439a791d
--- /dev/null
+++ b/rt_segment_speeds/prep_comparison.py
@@ -0,0 +1,191 @@
+"""
+Prep the data used to compare segment methodologies
+between Eric and Tiffany.
+
+Get at why speeds are coming out differently.
+
+  - segments do not exactly match
+  - points over which speeds are calculated aren't exactly the same,
+    since understanding of direction is not exactly the same
+  - peel all that back and start at the trip-level to see what's going into
+    averages, start with simpler shapes (no loop, no inlining)
+"""
+import geopandas as gpd
+import pandas as pd
+
+from shared_utils import rt_dates, rt_utils
+from segment_speed_utils import helpers
+from segment_speed_utils.project_vars import SEGMENT_GCS, GCS_FILE_PATH
+from calitp_data_analysis import utils
+
+RT_DELAY_GCS = f"{GCS_FILE_PATH}rt_delay/v2_segment_speed_views/"
+analysis_date = rt_dates.DATES["sep2023"]
+
+
+def prep_eric_data(analysis_date: str) -> gpd.GeoDataFrame:
+    """
+    Import Eric's segment speeds for the selected operators.
+
+    Reads one parquet per (itp_id, time-of-day) combination from
+    RT_DELAY_GCS and stacks them into a single GeoDataFrame.
+
+    Args:
+        analysis_date: date string used in the parquet file names.
+
+    Returns:
+        All of the parquets concatenated row-wise, with a fresh index.
+    """
+    # NOTE(review): presumably LA Metro and Big Blue Bus, the operators
+    # named in 21_segment_comparison.ipynb -- confirm the ID mapping.
+    itp_ids = [
+        182,
+        300,
+    ]
+
+    # Don't narrow down time-of-day yet, we might select a trip from any
+    # of these
+    time_of_day = [
+        "AM_Peak", "Midday", "PM_Peak"
+    ]
+
+    # Every operator needs every time-of-day. zip() would pair the lists
+    # element-wise (182/AM_Peak, 300/Midday) and drop PM_Peak entirely.
+    eric_dfs = [
+        gpd.read_parquet(
+            f"{RT_DELAY_GCS}{itp_id}_{analysis_date}_{time_period}.parquet")
+        for itp_id in itp_ids
+        for time_period in time_of_day
+    ]
+
+    df_eric = pd.concat(eric_dfs, axis=0).reset_index(drop=True)
+
+    return df_eric
+
+
+def prep_tiff_data(
+    analysis_date: str,
+    subset_df: gpd.GeoDataFrame
+) -> gpd.GeoDataFrame:
+    """
+    Import Tiffany's stop-segment speeds for the trips in subset_df.
+
+    The shape_id / trip_id pairs in subset_df are matched to scheduled
+    trips to find their trip_instance_key and shape_array_key, and those
+    keys are used to filter the segments and speeds parquets while reading.
+
+    Args:
+        analysis_date: date string used in the parquet file names.
+        subset_df: table with shape_id and trip_id columns for the trips
+            to keep (in this script, the output of prep_eric_data).
+
+    Returns:
+        Stop segments inner-merged with the trip-level speeds for the
+        selected trips.
+    """
+    shape_trips = subset_df[["shape_id", "trip_id"]].drop_duplicates()
+
+    scheduled_trips = helpers.import_scheduled_trips(
+        analysis_date,
+        columns = [
+            "gtfs_dataset_key", "name",
+            "trip_id", "trip_instance_key",
+            "shape_id", "shape_array_key",
+            "route_id", "direction_id"],
+        get_pandas = True
+    ).rename(columns = {"gtfs_dataset_key": "schedule_gtfs_dataset_key"})
+
+    # Grab the trip_instance_keys we need and use it
+    # to filter the speeds parquet down
+    subset_trips = scheduled_trips.merge(
+        shape_trips,
+        on = ["shape_id", "trip_id"],
+        how = "inner"
+    )
+
+    trip_instances = subset_trips.trip_instance_key.unique().tolist()
+    subset_shapes = subset_trips.shape_array_key.unique().tolist()
+
+    segments = gpd.read_parquet(
+        f"{SEGMENT_GCS}stop_segments_{analysis_date}.parquet",
+        filters = [[("shape_array_key", "in", subset_shapes)]]
+    ).drop(columns = ["geometry_arrowized", "district_name"])
+
+    filtered_trip_speeds = pd.read_parquet(
+        f"{SEGMENT_GCS}speeds_stop_segments_{analysis_date}.parquet",
+        filters = [[("trip_instance_key", "in", trip_instances)]]
+    ).merge(
+        subset_trips,
+        on = ["trip_instance_key", "shape_array_key"],
+        how = "inner"
+    )
+
+    df_tiff = pd.merge(
+        segments,
+        filtered_trip_speeds,
+        on = ["schedule_gtfs_dataset_key", "shape_array_key", "stop_sequence"],
+        how = "inner"
+    )
+
+    return df_tiff
+
+
+def map_one_trip(gdf: gpd.GeoDataFrame, one_trip: str):
+    """
+    Map the segments of a single trip, colored by speed_mph.
+
+    Args:
+        gdf: segment speeds with trip_id, speed_mph and a geometry column.
+        one_trip: trip_id to display.
+
+    Returns:
+        The interactive map returned by GeoDataFrame.explore().
+    """
+    gdf2 = gdf[gdf.trip_id==one_trip]
+
+    m1 = gdf2.explore(
+        "speed_mph",
+        tiles = "CartoDB Positron",
+        cmap = rt_utils.ZERO_THIRTY_COLORSCALE
+    )
+
+    return m1
+
+
+if __name__ == "__main__":
+    df_eric = prep_eric_data(analysis_date)
+    df_tiff = prep_tiff_data(analysis_date, df_eric)
+
+    utils.geoparquet_gcs_export(
+        df_eric,
+        SEGMENT_GCS,
+        f"speeds_eric_{analysis_date}"
+    )
+
+    utils.geoparquet_gcs_export(
+        df_tiff,
+        SEGMENT_GCS,
+        f"speeds_tiff_{analysis_date}"
+    )
+
+    # stop_sequence doesn't exactly merge, but that's fine,
+    # since Eric cuts shorter segments, so stop_sequence can have
+    # values like 1.25, 1.50, etc.
+    # Leave it in the merge for now, and allow left_only merges
+    identifier_cols = [
+        "trip_id", "shape_id", "stop_id", "stop_sequence",
+        "route_id", "direction_id",
+    ]
+
+    # Keep every one of Eric's rows; the _merge indicator column shows
+    # which ones found a matching row in Tiffany's data.
+    speed_df = pd.merge(
+        df_eric[identifier_cols + ["speed_mph"]].rename(
+            columns = {"speed_mph": "eric_speed_mph"}),
+        df_tiff[identifier_cols + ["speed_mph"]].rename(
+            columns = {"speed_mph": "tiff_speed_mph"}),
+        on = identifier_cols,
+        how = "left",
+        indicator = True
+    )
+
+    speed_df.to_parquet(f"{SEGMENT_GCS}speeds_comparison_{analysis_date}.parquet")
\ No newline at end of file