diff --git a/gtfs_funnel/stop_times_with_direction.py b/gtfs_funnel/stop_times_with_direction.py index 57a4679fb..d8d78c982 100644 --- a/gtfs_funnel/stop_times_with_direction.py +++ b/gtfs_funnel/stop_times_with_direction.py @@ -9,6 +9,8 @@ import numpy as np import pandas as pd +from typing import Literal + from calitp_data_analysis import utils from shared_utils import rt_utils from segment_speed_utils import helpers, wrangle_shapes @@ -79,15 +81,31 @@ def find_prior_stop( .groupby("trip_instance_key") .stop_sequence .shift(1) + ), + subseq_stop_sequence = ( + prior_stop + .sort_values(["trip_instance_key", "stop_sequence"]) + .groupby("trip_instance_key") + .stop_sequence + .shift(-1) ) ) - prior_stop_geom = stop_times[ - ["trip_instance_key", "stop_sequence", "geometry"] - ].rename(columns = { - "stop_sequence": "prior_stop_sequence", - "geometry": "prior_geometry" - }).set_geometry("prior_geometry").repartition(npartitions=1) + def renamed_geom_stop_times( + stop_times: dd.DataFrame, + suffix: Literal["prior", "subseq"] + ) -> dd.DataFrame: + + renamed_stop_geom = stop_times[ + ["trip_instance_key", "stop_sequence", "geometry"] + ].rename(columns = { + "stop_sequence": f"{suffix}_stop_sequence", + "geometry": f"{suffix}_geometry" + }).set_geometry(f"{suffix}_geometry").repartition(npartitions=1) + + return renamed_stop_geom + + prior_stop_geom = renamed_geom_stop_times(stop_times, suffix="prior") stop_times_with_prior = dd.merge( stop_times, @@ -101,7 +119,10 @@ def find_prior_stop( prior_stop_geom, on = ["trip_instance_key", "prior_stop_sequence"], how = "left" - ).astype({"prior_stop_sequence": "Int64"}) + ).astype({ + "prior_stop_sequence": "Int64", + "subseq_stop_sequence": "Int64" + }) return stop_times_with_prior_geom diff --git a/gtfs_funnel/update_vars.py b/gtfs_funnel/update_vars.py index 8faa8bb12..8ccbc59cd 100644 --- a/gtfs_funnel/update_vars.py +++ b/gtfs_funnel/update_vars.py @@ -3,12 +3,15 @@ from shared_utils import rt_dates months = [ + "mar", "apr", "may", + "jun", "jul", "aug", + "sep", "oct" ] analysis_date_list = [ - rt_dates.DATES["oct2023a"], - rt_dates.DATES["oct2023b"] + #rt_dates.DATES["sep2023"], + rt_dates.DATES[f"{m}2023"] for m in months ] CONFIG_PATH = Path("config.yml") diff --git a/rt_segment_speeds/23_project_all_vp_explore.ipynb b/rt_segment_speeds/23_project_all_vp_explore.ipynb index 190bc613d..4f682dd7f 100644 --- a/rt_segment_speeds/23_project_all_vp_explore.ipynb +++ b/rt_segment_speeds/23_project_all_vp_explore.ipynb @@ -17,13 +17,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "73ced776-d521-4467-beb4-8d67d147aea4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "DataTransformerRegistry.enable('default')" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import altair as alt\n", - "import dask.dataframe as dd\n", "import geopandas as gpd\n", "import numpy as np\n", "import pandas as pd\n", @@ -31,13 +41,16 @@ "from segment_speed_utils import helpers\n", "from segment_speed_utils.project_vars import SEGMENT_GCS, PROJECT_CRS\n", "from shared_utils import rt_dates\n", + "from calitp_data_analysis import calitp_color_palette as cp\n", + "\n", + "analysis_date = rt_dates.DATES[\"sep2023\"]\n", "\n", - "analysis_date = rt_dates.DATES[\"sep2023\"]" + "alt.data_transformers.disable_max_rows()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "c289f689-c0fe-4ea2-9e14-4e80f77566ae", 
"metadata": {}, "outputs": [], @@ -62,7 +75,8 @@ "# Find whether it's loop or inlining\n", "shapes_loop_inlining = pd.read_parquet(\n", " f\"{SEGMENT_GCS}stop_segments_{analysis_date}.parquet\",\n", - " columns = [\"shape_array_key\", \"loop_or_inlining\"]\n", + " columns = [\"shape_array_key\", \"loop_or_inlining\"],\n", + " filters = [[(\"loop_or_inlining\", \"==\", 1)]]\n", ").drop_duplicates().merge(\n", " trip_to_shape,\n", " on = \"shape_array_key\",\n", @@ -72,185 +86,1171 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "1b0ba620-e24e-4197-b665-c6111ad03307", + "execution_count": 3, + "id": "b95e35da-cf34-4696-80a1-0ab3a9e99ebf", "metadata": {}, "outputs": [], "source": [ - "ok_trips = (shapes_loop_inlining[\n", - " shapes_loop_inlining.loop_or_inlining==0]\n", - " .sample(25).trip_instance_key.tolist()\n", - " )\n", - "\n", - "ok_trips" + "loopy_trips = shapes_loop_inlining.trip_instance_key.tolist()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "b95e35da-cf34-4696-80a1-0ab3a9e99ebf", + "execution_count": 4, + "id": "20eb7d88-63e5-4695-82fd-141e8712396d", "metadata": {}, "outputs": [], "source": [ - "loopy_trips = (shapes_loop_inlining[\n", - " shapes_loop_inlining.loop_or_inlining==1]\n", - " .sample(25).trip_instance_key.tolist()\n", - " )\n", + "def merge_vp_with_projected_shape_meters(\n", + " analysis_date,\n", + " vp_filters: tuple\n", + "):\n", + " projected_shape_meters = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}projection/vp_projected_{analysis_date}.parquet\"\n", + " )\n", "\n", - "loopy_trips" + " vp = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}vp_usable_{analysis_date}\",\n", + " filters = vp_filters,\n", + " columns = [\"gtfs_dataset_key\", \"gtfs_dataset_name\", \n", + " \"vp_idx\", \"trip_instance_key\", \"trip_id\"]\n", + " ).merge(\n", + " projected_shape_meters,\n", + " on = \"vp_idx\",\n", + " how = \"inner\"\n", + " )\n", + " \n", + " return vp" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "276cab58-94ff-4f72-a651-88a4f1ea890a", "metadata": {}, "outputs": [], "source": [ - "subset_trips = ok_trips + loopy_trips\n", - "\n", - "projected_shape_meters = pd.read_parquet(\n", - " f\"{SEGMENT_GCS}projection/vp_projected_{analysis_date}.parquet\"\n", - ")\n", - "\n", - "vp = pd.read_parquet(\n", - " f\"{SEGMENT_GCS}vp_usable_{analysis_date}\",\n", - " filters = [[(\"trip_instance_key\", \"in\", subset_trips)]]\n", - ").merge(\n", - " projected_shape_meters,\n", - " on = \"vp_idx\",\n", - " how = \"inner\"\n", - ").drop(columns = \"location_timestamp\")" + "vp = merge_vp_with_projected_shape_meters(\n", + " analysis_date,\n", + " vp_filters = [[(\"trip_instance_key\", \"in\", loopy_trips)]]\n", + ")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "9d8f339f-f832-45f2-9f2c-f96125799a38", + "execution_count": 6, + "id": "1fbfe823-6a98-40b8-8c71-87cf686e5830", "metadata": {}, "outputs": [], "source": [ - "def plot_shape_meters(df: pd.DataFrame, one_trip: str):\n", - " \"\"\"\n", - " Plot how the projected shape meters looks for one trip.\n", - " \n", - " vp_idx is ordered by timestamp, use as x.\n", - " \"\"\"\n", - " subset_df = df[df.trip_instance_key==one_trip]\n", - " \n", - " print(f\"{subset_df.gtfs_dataset_name.iloc[0]}\")\n", - " print(f\"trip_instance_key: {one_trip}, trip_id: {subset_df.trip_id.iloc[0]}\")\n", + "#https://stackoverflow.com/questions/10226551/whats-the-most-pythonic-way-to-calculate-percentage-changes-on-a-list-of-numbers\n", + "def pct_change(arr):\n", + " if 
not isinstance(arr, np.ndarray):\n", + " arr = np.asarray(arr)\n", + " \n", + " pct = np.zeros_like(arr)\n", + " pct[1:] = np.diff(arr) / np.abs(arr[:-1])\n", " \n", - " chart = (alt.Chart(subset_df)\n", - " .mark_line()\n", - " .encode(\n", - " x=\"vp_idx\",\n", - " y=\"shape_meters:Q\"\n", - " )\n", - " )\n", - " \n", - " display(chart)\n" + " return pct * 100\n", + "\n", + "def number_of_elements_over_threshold(\n", + " arr, \n", + " pct_change_threshold\n", + "):\n", + " return len((pct_change(arr) > pct_change_threshold).nonzero()[0])" ] }, { "cell_type": "code", - "execution_count": null, - "id": "6aaf6652-307b-41ba-9059-72ae010b7928", + "execution_count": 7, + "id": "de934f1c-f5db-4c89-ba5a-d84a48b18ce3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_871/1096090739.py:7: RuntimeWarning: divide by zero encountered in divide\n", + " pct[1:] = np.diff(arr) / np.abs(arr[:-1])\n", + "/tmp/ipykernel_871/1096090739.py:7: RuntimeWarning: invalid value encountered in divide\n", + " pct[1:] = np.diff(arr) / np.abs(arr[:-1])\n" + ] + } + ], "source": [ - "for t in ok_trips:\n", - " plot_shape_meters(vp, t)" + "# Find most jumpy trips\n", + "vp2 = (vp.groupby(\"trip_instance_key\")\n", + " .agg({\"shape_meters\": lambda x: list(x)})\n", + " .reset_index()\n", + " )\n", + "\n", + "vp2 = vp2.assign(\n", + " big_jumps = vp2.apply(\n", + " lambda x: \n", + " number_of_elements_over_threshold(\n", + " x.shape_meters, 20), axis=1\n", + " )\n", + ")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "284fb053-ffc9-411a-ab46-82f6cfb08af1", + "execution_count": 27, + "id": "29a91576-ae53-4464-b0fa-40291431aa16", "metadata": {}, "outputs": [], "source": [ - "for t in loopy_trips:\n", - " plot_shape_meters(vp, t)" + "all_colors = (cp.CALITP_CATEGORY_BRIGHT_COLORS + cp.CALITP_CATEGORY_BOLD_COLORS + \n", + " cp.CALITP_DIVERGING_COLORS + cp.CALITP_SEQUENTIAL_COLORS\n", + " )\n", + "\n", + "\n", + "final_trip_keys = {\n", + " \"afaf17f1c2816652f0e4522a5c7f206b\": cp.CALITP_CATEGORY_BRIGHT_COLORS[1], # anaheim orange\n", + " \"a880d82a382929aa1de15be733f10a51\": cp.CALITP_CATEGORY_BOLD_COLORS[2], # kings green\n", + " \"d628de22f56dbb4c0e3f8242a2fe78d3\": cp.CALITP_CATEGORY_BRIGHT_COLORS[0], # vine blue\n", + " \"f1a0a79baa78db2c26b3248a30662a7b\": cp.CALITP_CATEGORY_BRIGHT_COLORS[5], # presidgo purple\n", + " \"26146503b5bf0235c0c8fe98dcd8d90b\": cp.CALITP_SEQUENTIAL_COLORS[4], # dumbarton navy\n", + " \"4f76b1c357ee534ac931a5c1bd1cbb87\": cp.CALITP_CATEGORY_BOLD_COLORS[2], # santa maria yellow\n", + " \"e5bcf460be0899f437b1e53b7f3feced\": cp.CALITP_CATEGORY_BOLD_COLORS[4], # tcrta gray blue\n", + " \"18cc0764a463566e8690f0d44c32c199\": cp.CALITP_CATEGORY_BRIGHT_COLORS[3], # sd green\n", + " \"45d8634a92be1fa10ae4f4aa5aa6d5b9\": cp.CALITP_CATEGORY_BRIGHT_COLORS[4], # foothill light blue\n", + " \"aa851696959462180fe04f189dc75584\": cp.CALITP_CATEGORY_BRIGHT_COLORS[3], # big blue bus green\n", + "}" ] }, { "cell_type": "code", - "execution_count": null, - "id": "ced6dbff-008e-4ccf-b284-cc1d79d3e801", + "execution_count": 28, + "id": "6c7f04b1-9966-4c6c-980d-a63e61c814f1", "metadata": {}, "outputs": [], "source": [ - "speed = pd.read_parquet(\n", - " f\"{SEGMENT_GCS}speeds_comparison_{analysis_date}.parquet\"\n", + "shapes = helpers.import_scheduled_shapes(\n", + " analysis_date, \n", + " columns = [\"shape_array_key\", \"shape_id\", \"geometry\"],\n", + " get_pandas = True,\n", + " crs = \"EPSG:4326\"\n", + 
").merge(\n", + " shapes_loop_inlining[\n", + " shapes_loop_inlining.trip_instance_key.isin(final_trip_keys.keys())],\n", + " on = \"shape_array_key\",\n", + " how = \"inner\"\n", ")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "8243b13f-323b-4b0f-8fff-75cd2837d165", - "metadata": {}, - "outputs": [], - "source": [ - "trip = \"10096002510743-JUNE23\"\n", - "speed[speed.trip_id==trip]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "394956b9-a02c-4b7d-b604-3775c4e55a51", + "execution_count": 29, + "id": "2339c9ab-2241-4087-abf8-cee4d1b2a9cc", "metadata": {}, "outputs": [], "source": [ - "metro_trip = helpers.import_scheduled_trips(\n", - " analysis_date,\n", - " columns = [\"trip_instance_key\", \"trip_id\"],\n", - " filters = [[(\"trip_id\", \"==\", trip)]],\n", - " get_pandas = True\n", + "gdf = pd.merge(\n", + " shapes,\n", + " vp,\n", + " on = \"trip_instance_key\",\n", + " how = \"inner\"\n", ")\n", "\n", - "trip_key = metro_trip.trip_instance_key.iloc[0]" + "# assign colors\n", + "gdf = gdf.assign(\n", + " color = gdf.trip_instance_key.map(final_trip_keys),\n", + " gtfs_dataset_name = gdf.gtfs_dataset_name.str.replace(\"VehiclePositions\", \"Vehicle Positions\")\n", + ")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "5c5b8f1d-6571-4072-a9ea-dc1c36173453", + "execution_count": 30, + "id": "9d8f339f-f832-45f2-9f2c-f96125799a38", "metadata": {}, "outputs": [], "source": [ - "vp_pared = pd.read_parquet(\n", - " f\"{SEGMENT_GCS}vp_pared_stops_{analysis_date}\",\n", - " filters = [[(\"trip_instance_key\", \"==\", trip_key)]])" + "def make_chart_map(df: pd.DataFrame, one_trip: str):\n", + " \"\"\"\n", + " Plot how the projected shape meters looks for one trip.\n", + " \n", + " vp_idx is ordered by timestamp, use as x.\n", + " \"\"\"\n", + " subset_df = df[df.trip_instance_key==one_trip].drop(columns = \"geometry\")\n", + " subset_gdf = df[df.trip_instance_key==one_trip][[\n", + " \"shape_id\", \"geometry\"]].drop_duplicates()\n", + " \n", + " GTFS_DATA = subset_df.gtfs_dataset_name.iloc[0]\n", + " TRIP_ID = subset_df.trip_id.iloc[0]\n", + " COLOR = subset_df.color.iloc[0]\n", + " \n", + " grid_bool = False\n", + " grid_opacity = 0\n", + " WIDTH = 400\n", + " HEIGHT = 400\n", + " \n", + " chart = (alt.Chart(subset_df)\n", + " .mark_line(color=COLOR)\n", + " .encode(\n", + " x=alt.X(\"vp_idx\", \n", + " title = \"\", \n", + " axis=alt.Axis(labels=False, \n", + " gridOpacity=grid_opacity, \n", + " tickOpacity=grid_opacity)\n", + " ),\n", + " y=alt.Y(\"shape_meters:Q\", \n", + " title = \"\",\n", + " axis=alt.Axis(labels=False, \n", + " gridOpacity=grid_opacity, \n", + " tickOpacity=grid_opacity)\n", + " ),\n", + " ).properties(\n", + " title = {\"text\": [GTFS_DATA], \n", + " \"subtitle\": f\"trip_id: {TRIP_ID}\"}\n", + " )\n", + " )\n", + " \n", + " chart = (chart\n", + " .configure_axis(grid=grid_bool, labelFontSize=0)\n", + " .configure_axis(gridOpacity=grid_opacity, \n", + " domainOpacity=grid_opacity)\n", + " .configure_axisBand(grid=grid_bool)\n", + " .configure_view(strokeOpacity=grid_opacity)\n", + " .properties(width = WIDTH*1.2, height = HEIGHT)\n", + " )\n", + " \n", + " display(chart)\n", + " \n", + " m = subset_gdf.explore(\n", + " \"shape_id\", \n", + " tiles = \"CartoDB Positron\",\n", + " legend = False,\n", + " style_kwds = {\n", + " \"color\": COLOR, \n", + " },\n", + " map_kwds = {\n", + " \"dragging\": False,\n", + " \"scrollWheelZoom\": False,\n", + " \"tileOpacity\": 0\n", + " },\n", + " zoom_control = False,\n", 
+ " width = WIDTH, height = HEIGHT\n", + " )\n", + " \n", + " display(m)\n" ] }, { "cell_type": "code", - "execution_count": null, - "id": "c83f420e-99a2-40d8-b3ef-5d13f68b4226", + "execution_count": 31, + "id": "284fb053-ffc9-411a-ab46-82f6cfb08af1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# 27:13, 27:56 * 29:14, 29:52, * 30:13\n", - "vp_pared[vp_pared.stop_sequence==36]" + "for t in final_trip_keys.keys():\n", + " make_chart_map(gdf, t)" ] }, { "cell_type": "code", "execution_count": null, - "id": "145ac1ce-93a7-4abb-b72b-977541a98163", + "id": "9763b29b-381b-4f76-9e88-a117e3d82a5a", "metadata": {}, "outputs": [], - "source": [ - "metro_trip" - ] + "source": [] }, { "cell_type": "code", "execution_count": null, - "id": "0a950ad3-a1bd-4f6f-b7ff-6ee95ffbf8e2", + "id": "71b15470-8c6d-4370-adab-96f3b3edb78b", "metadata": {}, "outputs": [], "source": [] diff --git a/rt_segment_speeds/logs/avg_speeds.log b/rt_segment_speeds/logs/avg_speeds.log index ca492f4c4..7d59ff9f7 100644 --- a/rt_segment_speeds/logs/avg_speeds.log +++ b/rt_segment_speeds/logs/avg_speeds.log @@ -1,30 +1,43 @@ -2023-08-14 10:45:18.623 | INFO | __main__::143 - Analysis date: 2023-07-12 -2023-08-14 10:52:06.516 | INFO | __main__::165 - execution time: 0:06:47.892861 -2023-08-14 10:54:13.170 | INFO | __main__::143 - Analysis date: 2023-06-14 -2023-08-14 11:01:01.301 | INFO | __main__::165 - execution time: 0:06:48.130654 -2023-08-14 11:12:26.319 | INFO | __main__::143 - Analysis date: 2023-05-17 -2023-08-14 11:20:25.045 | INFO | __main__::165 - execution time: 0:07:58.724683 -2023-08-14 11:21:03.494 | INFO | __main__::143 - Analysis date: 2023-04-12 -2023-08-14 11:28:45.925 | INFO | __main__::165 - execution time: 0:07:42.429806 -2023-08-14 11:33:17.821 | INFO | __main__::143 - Analysis date: 2023-03-15 -2023-08-14 11:40:28.316 | INFO | __main__::165 - execution time: 0:07:10.494542 -2023-08-18 14:48:01.479 | INFO | __main__::143 - Analysis date: 2023-08-16 -2023-08-18 14:53:19.222 | INFO | __main__::165 - execution time: 0:05:17.742302 -2023-08-24 14:56:29.366 | INFO | __main__::143 - Analysis date: 2023-08-15 -2023-08-24 15:02:21.592 | INFO | __main__::165 - execution time: 0:05:52.222770 -2023-09-21 14:37:32.197 | INFO | __main__::167 - Analysis date: 2023-09-13 -2023-09-21 14:43:35.784 | INFO | __main__::189 - execution time: 0:06:03.584427 -2023-09-21 15:49:19.146 | INFO | __main__::167 - Analysis date: 2023-06-14 -2023-09-21 15:54:51.743 | INFO | __main__::189 - execution time: 0:05:32.595873 -2023-09-21 17:54:03.991 | INFO | __main__::167 - Analysis date: 2023-05-17 -2023-09-21 18:00:12.730 | INFO | __main__::189 - execution time: 0:06:08.738161 -2023-09-21 18:40:52.924 | INFO | __main__::159 - Analysis date: 2023-04-12 -2023-09-21 18:47:14.179 | INFO | __main__::181 - execution time: 0:06:21.254666 -2023-09-21 19:28:44.034 | INFO | __main__::159 - Analysis date: 2023-03-15 -2023-09-21 19:34:39.639 | INFO | __main__::181 - execution time: 0:05:55.604394 -2023-09-21 19:36:10.116 | INFO | __main__::159 - Analysis date: 2023-07-12 -2023-09-21 19:41:33.957 | INFO | __main__::181 - execution time: 0:05:23.840096 -2023-09-21 19:43:00.511 | INFO | __main__::159 - Analysis date: 2023-08-15 -2023-09-21 19:49:14.412 | INFO | __main__::181 - execution time: 0:06:13.900161 -2023-10-17 18:34:48.535 | INFO | __main__::159 - Analysis date: 2023-10-11 -2023-10-17 18:40:48.183 | INFO | __main__::181 - execution time: 0:05:59.631201 +2023-11-06 15:57:16.305 | INFO | __main__::249 - Analysis date: 2023-10-11 +2023-11-06 15:59:18.120 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:02:01.813076 +2023-11-06 15:59:26.899 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip 
summary execution time: 0:00:08.655715 +2023-11-06 15:59:26.911 | INFO | __main__::249 - Analysis date: 2023-09-13 +2023-11-06 16:01:24.513 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:01:57.601884 +2023-11-06 16:01:32.899 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip summary execution time: 0:00:08.277026 +2023-11-06 16:01:32.910 | INFO | __main__::249 - Analysis date: 2023-08-15 +2023-11-06 16:03:34.836 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:02:01.925195 +2023-11-06 16:03:43.726 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip summary execution time: 0:00:08.779732 +2023-11-06 16:03:43.739 | INFO | __main__::249 - Analysis date: 2023-07-12 +2023-11-06 16:05:28.192 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:01:44.451632 +2023-11-06 16:05:37.199 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip summary execution time: 0:00:08.899107 +2023-11-06 16:05:37.210 | INFO | __main__::249 - Analysis date: 2023-06-14 +2023-11-06 16:07:20.946 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:01:43.736212 +2023-11-06 16:07:30.071 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip summary execution time: 0:00:09.023577 +2023-11-06 16:07:30.083 | INFO | __main__::249 - Analysis date: 2023-05-17 +2023-11-06 16:09:26.014 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:01:55.930926 +2023-11-06 16:09:35.119 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip summary execution time: 0:00:09.005461 +2023-11-06 16:09:35.131 | INFO | __main__::249 - Analysis date: 2023-04-12 +2023-11-06 16:14:19.095 | INFO | __main__::249 - Analysis date: 2023-10-11 +2023-11-06 16:16:23.123 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:02:04.027868 +2023-11-06 16:16:32.544 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip summary execution time: 0:00:09.299333 +2023-11-06 16:16:32.557 | INFO | __main__::249 - Analysis date: 2023-09-13 +2023-11-06 16:18:31.595 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:01:59.037028 +2023-11-06 16:18:39.862 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip summary execution time: 0:00:08.143427 +2023-11-06 16:18:39.874 | INFO | __main__::249 - Analysis date: 2023-08-15 +2023-11-06 16:20:44.117 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:02:04.241983 +2023-11-06 16:20:52.968 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip summary execution time: 0:00:08.742420 +2023-11-06 16:20:52.980 | INFO | __main__::249 - Analysis date: 2023-07-12 +2023-11-06 16:22:41.130 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:01:48.149524 +2023-11-06 16:22:50.098 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip summary execution time: 0:00:08.871341 +2023-11-06 16:22:50.111 | INFO | __main__::249 - Analysis date: 2023-06-14 +2023-11-06 16:24:38.042 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:01:47.930386 +2023-11-06 16:24:47.008 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip summary execution time: 0:00:08.854655 +2023-11-06 16:24:47.021 | INFO | __main__::249 - Analysis date: 2023-05-17 +2023-11-06 16:26:48.437 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:02:01.414883 
+2023-11-06 16:26:57.608 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip summary execution time: 0:00:09.058372 +2023-11-06 16:26:57.622 | INFO | __main__::249 - Analysis date: 2023-04-12 +2023-11-06 16:29:01.767 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:02:04.143738 +2023-11-06 16:29:10.778 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip summary execution time: 0:00:08.902052 +2023-11-06 16:29:10.791 | INFO | __main__::249 - Analysis date: 2023-03-15 +2023-11-06 16:31:05.067 | INFO | __main__:speeds_with_segment_geom:130 - segment averages execution time: 0:01:54.274974 +2023-11-06 16:31:14.376 | INFO | __main__:avg_trip_speeds_with_time_of_day:231 - trip summary execution time: 0:00:09.212622 diff --git a/rt_segment_speeds/logs/cut_road_segments.log b/rt_segment_speeds/logs/cut_road_segments.log index e935a62ed..a0c97e1bf 100644 --- a/rt_segment_speeds/logs/cut_road_segments.log +++ b/rt_segment_speeds/logs/cut_road_segments.log @@ -1,34 +1,3 @@ -2023-10-04 14:49:11.624 | INFO | __main__::337 - cut primary/secondary roads: 0:05:12.326594 -2023-10-04 14:57:49.674 | INFO | __main__::353 - cut local roads base: 0:08:38.050766 -2023-10-04 14:57:49.675 | INFO | __main__::356 - execution time: 0:13:50.378197 -2023-10-04 15:01:32.642 | INFO | __main__::99 - Analysis date: 2023-09-13 -2023-10-04 15:03:04.641 | INFO | __main__::106 - add local linearids for this month: 0:01:31.998829 -2023-10-04 15:03:55.610 | INFO | __main__::149 - concatenate road segments: 0:00:50.969056 -2023-10-04 15:03:55.611 | INFO | __main__::150 - execution time: 0:02:22.967885 -2023-10-04 15:04:29.207 | INFO | __main__::99 - Analysis date: 2023-08-15 -2023-10-04 15:06:03.121 | INFO | __main__::106 - add local linearids for this month: 0:01:33.913428 -2023-10-04 15:06:53.739 | INFO | __main__::149 - concatenate road segments: 0:00:50.618163 -2023-10-04 15:06:53.741 | INFO | __main__::150 - execution time: 0:02:24.531591 -2023-10-04 15:07:31.183 | INFO | __main__::99 - Analysis date: 2023-07-12 -2023-10-04 15:08:56.917 | INFO | __main__::106 - add local linearids for this month: 0:01:25.733453 -2023-10-04 15:09:47.263 | INFO | __main__::149 - concatenate road segments: 0:00:50.346193 -2023-10-04 15:09:47.265 | INFO | __main__::150 - execution time: 0:02:16.079646 -2023-10-10 13:17:21.899 | INFO | __main__::337 - cut primary/secondary roads: 0:02:55.449737 -2023-10-10 13:21:41.094 | INFO | __main__::353 - cut local roads base: 0:04:19.194981 -2023-10-10 13:21:41.094 | INFO | __main__::356 - execution time: 0:07:14.645388 -2023-10-10 13:26:23.986 | INFO | __main__::99 - Analysis date: 2023-09-13 -2023-10-10 13:27:20.525 | INFO | __main__::106 - add local linearids for this month: 0:00:56.537886 -2023-10-10 13:27:49.172 | INFO | __main__::149 - concatenate road segments: 0:00:28.647259 -2023-10-10 13:27:49.172 | INFO | __main__::150 - execution time: 0:01:25.185145 -2023-10-10 14:49:09.851 | INFO | __main__::99 - Analysis date: 2023-08-15 -2023-10-10 14:50:13.055 | INFO | __main__::106 - add local linearids for this month: 0:01:03.202748 -2023-10-10 14:50:40.658 | INFO | __main__::149 - concatenate road segments: 0:00:27.603282 -2023-10-10 14:50:40.659 | INFO | __main__::150 - execution time: 0:01:30.806030 -2023-10-10 15:06:20.252 | INFO | __main__::96 - Analysis date: 2023-07-12 -2023-10-10 15:07:19.611 | INFO | __main__::103 - add local linearids for this month: 0:00:59.359330 -2023-10-10 15:07:46.994 | INFO | __main__::146 - concatenate road segments: 
0:00:27.382778 -2023-10-10 15:07:46.995 | INFO | __main__::147 - execution time: 0:01:26.742108 -2023-10-12 11:09:05.353 | INFO | __main__::96 - Analysis date: 2023-10-11 -2023-10-12 11:10:33.776 | INFO | __main__::103 - add local linearids for this month: 0:01:28.401366 -2023-10-12 11:11:07.558 | INFO | __main__::146 - concatenate road segments: 0:00:33.781990 -2023-10-12 11:11:07.559 | INFO | __main__::147 - execution time: 0:02:02.183356 +2023-11-01 13:54:23.167 | INFO | __main__::280 - cut primary/secondary roads: 0:02:48.274895 +2023-11-01 13:58:41.143 | INFO | __main__::295 - cut local roads base: 0:04:17.975700 +2023-11-01 13:58:41.143 | INFO | __main__::298 - execution time: 0:07:06.251281 diff --git a/rt_segment_speeds/logs/interpolate_stop_arrival.log b/rt_segment_speeds/logs/interpolate_stop_arrival.log index 6178d8156..c6452bd27 100644 --- a/rt_segment_speeds/logs/interpolate_stop_arrival.log +++ b/rt_segment_speeds/logs/interpolate_stop_arrival.log @@ -2,3 +2,31 @@ 2023-10-31 18:11:18.958 | INFO | __main__::134 - set up df with nearest / subseq vp info: 0:01:18.690602 2023-10-31 18:12:06.833 | INFO | __main__::139 - interpolate stop arrival: 0:00:47.874819 2023-10-31 18:12:14.756 | INFO | __main__::145 - execution time: 0:02:14.488207 +2023-11-03 13:50:00.660 | INFO | __main__::153 - Analysis date: 2023-10-11 +2023-11-03 13:51:28.630 | INFO | __main__:main:124 - set up df with nearest / subseq vp info: 0:01:27.969585 +2023-11-03 13:52:31.928 | INFO | __main__:main:129 - interpolate stop arrival: 0:01:03.298093 +2023-11-03 13:52:41.114 | INFO | __main__:main:135 - execution time: 0:02:40.453737 +2023-11-03 15:59:50.277 | INFO | __main__::153 - Analysis date: 2023-08-15 +2023-11-03 16:01:18.175 | INFO | __main__:main:124 - set up df with nearest / subseq vp info: 0:01:27.896964 +2023-11-03 16:02:21.641 | INFO | __main__:main:129 - interpolate stop arrival: 0:01:03.466104 +2023-11-03 16:02:31.047 | INFO | __main__:main:135 - execution time: 0:02:40.768508 +2023-11-03 16:02:31.231 | INFO | __main__::153 - Analysis date: 2023-07-12 +2023-11-03 16:03:55.074 | INFO | __main__:main:124 - set up df with nearest / subseq vp info: 0:01:23.841921 +2023-11-03 16:04:51.717 | INFO | __main__:main:129 - interpolate stop arrival: 0:00:56.643267 +2023-11-03 16:05:00.003 | INFO | __main__:main:135 - execution time: 0:02:28.771041 +2023-11-03 16:05:00.125 | INFO | __main__::153 - Analysis date: 2023-06-14 +2023-11-03 16:06:28.304 | INFO | __main__:main:124 - set up df with nearest / subseq vp info: 0:01:28.178540 +2023-11-03 16:07:24.310 | INFO | __main__:main:129 - interpolate stop arrival: 0:00:56.005986 +2023-11-03 16:07:33.890 | INFO | __main__:main:135 - execution time: 0:02:33.764494 +2023-11-03 16:07:34.155 | INFO | __main__::153 - Analysis date: 2023-05-17 +2023-11-03 16:08:58.194 | INFO | __main__:main:124 - set up df with nearest / subseq vp info: 0:01:24.038301 +2023-11-03 16:09:56.769 | INFO | __main__:main:129 - interpolate stop arrival: 0:00:58.574171 +2023-11-03 16:10:05.405 | INFO | __main__:main:135 - execution time: 0:02:31.248634 +2023-11-03 16:10:05.597 | INFO | __main__::153 - Analysis date: 2023-04-12 +2023-11-03 16:11:35.069 | INFO | __main__:main:124 - set up df with nearest / subseq vp info: 0:01:29.471108 +2023-11-03 16:12:36.102 | INFO | __main__:main:129 - interpolate stop arrival: 0:01:01.033072 +2023-11-03 16:12:44.655 | INFO | __main__:main:135 - execution time: 0:02:39.056691 +2023-11-03 16:12:44.788 | INFO | __main__::153 - Analysis date: 2023-03-15 +2023-11-03 
16:14:10.249 | INFO | __main__:main:124 - set up df with nearest / subseq vp info: 0:01:25.460793 +2023-11-03 16:15:11.212 | INFO | __main__:main:129 - interpolate stop arrival: 0:01:00.962506 +2023-11-03 16:15:19.953 | INFO | __main__:main:135 - execution time: 0:02:35.163939 diff --git a/rt_segment_speeds/logs/nearest_vp.log b/rt_segment_speeds/logs/nearest_vp.log index d8ce12b9d..7fbeeca39 100644 --- a/rt_segment_speeds/logs/nearest_vp.log +++ b/rt_segment_speeds/logs/nearest_vp.log @@ -1,7 +1,32 @@ -2023-10-31 17:45:52.135 | INFO | __main__::332 - Analysis date: 2023-09-13 -2023-10-31 17:51:23.974 | INFO | __main__:find_nearest_vp_to_stop:277 - map partitions to transform vp: 0:05:31.838490 -2023-10-31 17:51:25.093 | INFO | __main__:find_nearest_vp_to_stop:309 - map partitions to find nearest vp to stop: 0:00:01.118975 2023-10-31 17:57:10.858 | INFO | __main__::337 - Analysis date: 2023-09-13 2023-10-31 18:03:30.506 | INFO | __main__:find_nearest_vp_to_stop:282 - map partitions to transform vp: 0:06:19.646465 2023-10-31 18:03:31.676 | INFO | __main__:find_nearest_vp_to_stop:314 - map partitions to find nearest vp to stop: 0:00:01.170538 2023-10-31 18:08:58.296 | INFO | __main__::344 - execution time: 0:11:47.436826 +2023-11-03 13:36:23.516 | INFO | __main__::337 - Analysis date: 2023-10-11 +2023-11-03 13:43:22.702 | INFO | __main__:find_nearest_vp_to_stop:281 - map partitions to transform vp: 0:06:59.134110 +2023-11-03 13:43:23.916 | INFO | __main__:find_nearest_vp_to_stop:313 - map partitions to find nearest vp to stop: 0:00:01.213424 +2023-11-03 13:49:42.958 | INFO | __main__::344 - execution time: 0:13:19.390402 +2023-11-03 14:46:33.465 | INFO | __main__::339 - Analysis date: 2023-08-15 +2023-11-03 14:52:52.919 | INFO | __main__:find_nearest_vp_to_stop:281 - map partitions to transform vp: 0:06:19.452868 +2023-11-03 14:52:54.211 | INFO | __main__:find_nearest_vp_to_stop:313 - map partitions to find nearest vp to stop: 0:00:01.292626 +2023-11-03 14:58:59.779 | INFO | __main__:find_nearest_vp_to_stop:323 - execution time: 0:12:26.312844 +2023-11-03 14:59:00.212 | INFO | __main__::339 - Analysis date: 2023-07-12 +2023-11-03 15:05:48.714 | INFO | __main__:find_nearest_vp_to_stop:281 - map partitions to transform vp: 0:06:48.501285 +2023-11-03 15:05:49.885 | INFO | __main__:find_nearest_vp_to_stop:313 - map partitions to find nearest vp to stop: 0:00:01.170919 +2023-11-03 15:11:33.748 | INFO | __main__:find_nearest_vp_to_stop:323 - execution time: 0:12:33.535236 +2023-11-03 15:11:34.299 | INFO | __main__::339 - Analysis date: 2023-06-14 +2023-11-03 15:18:00.501 | INFO | __main__:find_nearest_vp_to_stop:281 - map partitions to transform vp: 0:06:26.201462 +2023-11-03 15:18:01.751 | INFO | __main__:find_nearest_vp_to_stop:313 - map partitions to find nearest vp to stop: 0:00:01.250351 +2023-11-03 15:23:48.451 | INFO | __main__:find_nearest_vp_to_stop:323 - execution time: 0:12:14.151951 +2023-11-03 15:23:48.847 | INFO | __main__::339 - Analysis date: 2023-05-17 +2023-11-03 15:30:10.910 | INFO | __main__:find_nearest_vp_to_stop:281 - map partitions to transform vp: 0:06:22.061854 +2023-11-03 15:30:12.094 | INFO | __main__:find_nearest_vp_to_stop:313 - map partitions to find nearest vp to stop: 0:00:01.184575 +2023-11-03 15:35:50.028 | INFO | __main__:find_nearest_vp_to_stop:323 - execution time: 0:12:01.179564 +2023-11-03 15:35:50.589 | INFO | __main__::339 - Analysis date: 2023-04-12 +2023-11-03 15:42:09.950 | INFO | __main__:find_nearest_vp_to_stop:281 - map partitions to transform vp: 
0:06:19.358835 +2023-11-03 15:42:11.157 | INFO | __main__:find_nearest_vp_to_stop:313 - map partitions to find nearest vp to stop: 0:00:01.207248 +2023-11-03 15:47:59.635 | INFO | __main__:find_nearest_vp_to_stop:323 - execution time: 0:12:09.044200 +2023-11-03 15:48:00.252 | INFO | __main__::339 - Analysis date: 2023-03-15 +2023-11-03 15:53:58.273 | INFO | __main__:find_nearest_vp_to_stop:281 - map partitions to transform vp: 0:05:58.019210 +2023-11-03 15:53:59.500 | INFO | __main__:find_nearest_vp_to_stop:313 - map partitions to find nearest vp to stop: 0:00:01.226720 +2023-11-03 15:59:30.250 | INFO | __main__:find_nearest_vp_to_stop:323 - execution time: 0:11:29.995820 diff --git a/rt_segment_speeds/logs/prep_stop_segments.log b/rt_segment_speeds/logs/prep_stop_segments.log index 2598c728c..04b599d8d 100644 --- a/rt_segment_speeds/logs/prep_stop_segments.log +++ b/rt_segment_speeds/logs/prep_stop_segments.log @@ -1,24 +1,24 @@ -2023-10-17 13:14:46.275 | INFO | __main__::256 - Analysis date: 2023-03-15 -2023-10-17 13:15:24.496 | INFO | __main__::263 - Prep stop segment df: 0:00:38.214426 -2023-10-17 13:15:29.098 | INFO | __main__::271 - execution time: 0:00:42.816073 -2023-10-17 13:15:29.099 | INFO | __main__::256 - Analysis date: 2023-04-12 -2023-10-17 13:15:57.485 | INFO | __main__::263 - Prep stop segment df: 0:00:28.385523 -2023-10-17 13:16:02.499 | INFO | __main__::271 - execution time: 0:00:33.400098 -2023-10-17 13:16:02.500 | INFO | __main__::256 - Analysis date: 2023-05-17 -2023-10-17 13:16:31.425 | INFO | __main__::263 - Prep stop segment df: 0:00:28.904607 -2023-10-17 13:16:35.854 | INFO | __main__::271 - execution time: 0:00:33.333007 -2023-10-17 13:16:35.854 | INFO | __main__::256 - Analysis date: 2023-06-14 -2023-10-17 13:17:04.723 | INFO | __main__::263 - Prep stop segment df: 0:00:28.867982 -2023-10-17 13:17:09.278 | INFO | __main__::271 - execution time: 0:00:33.423347 -2023-10-17 13:17:09.279 | INFO | __main__::256 - Analysis date: 2023-07-12 -2023-10-17 13:17:44.243 | INFO | __main__::263 - Prep stop segment df: 0:00:34.964086 -2023-10-17 13:17:48.637 | INFO | __main__::271 - execution time: 0:00:39.358247 -2023-10-17 13:17:48.639 | INFO | __main__::256 - Analysis date: 2023-08-15 -2023-10-17 13:18:36.553 | INFO | __main__::263 - Prep stop segment df: 0:00:47.913322 -2023-10-17 13:18:41.395 | INFO | __main__::271 - execution time: 0:00:52.755389 -2023-10-24 10:03:59.640 | INFO | __main__::252 - Analysis date: 2023-09-13 -2023-10-24 10:04:30.868 | INFO | __main__::259 - Prep stop segment df: 0:00:31.227214 -2023-10-24 10:04:36.329 | INFO | __main__::267 - execution time: 0:00:36.688195 -2023-10-24 10:04:36.330 | INFO | __main__::252 - Analysis date: 2023-10-11 -2023-10-24 10:05:12.397 | INFO | __main__::259 - Prep stop segment df: 0:00:36.067067 -2023-10-24 10:05:18.498 | INFO | __main__::267 - execution time: 0:00:42.167738 +2023-11-01 15:40:53.915 | INFO | __main__::252 - Analysis date: 2023-10-11 +2023-11-01 15:41:23.984 | INFO | __main__::259 - Prep stop segment df: 0:00:30.068163 +2023-11-01 15:41:28.115 | INFO | __main__::267 - execution time: 0:00:34.200081 +2023-11-01 15:41:28.116 | INFO | __main__::252 - Analysis date: 2023-09-13 +2023-11-01 15:41:55.989 | INFO | __main__::259 - Prep stop segment df: 0:00:27.872887 +2023-11-01 15:42:00.094 | INFO | __main__::267 - execution time: 0:00:31.978206 +2023-11-01 15:42:00.095 | INFO | __main__::252 - Analysis date: 2023-08-15 +2023-11-01 15:42:28.584 | INFO | __main__::259 - Prep stop segment df: 0:00:28.488285 
+2023-11-01 15:42:32.818 | INFO | __main__::267 - execution time: 0:00:32.722970 +2023-11-01 15:42:32.819 | INFO | __main__::252 - Analysis date: 2023-07-12 +2023-11-01 15:43:01.069 | INFO | __main__::259 - Prep stop segment df: 0:00:28.250004 +2023-11-01 15:43:04.835 | INFO | __main__::267 - execution time: 0:00:32.016018 +2023-11-01 15:43:04.837 | INFO | __main__::252 - Analysis date: 2023-06-14 +2023-11-01 15:43:32.841 | INFO | __main__::259 - Prep stop segment df: 0:00:28.004113 +2023-11-01 15:43:39.634 | INFO | __main__::267 - execution time: 0:00:34.796661 +2023-11-01 15:43:39.634 | INFO | __main__::252 - Analysis date: 2023-05-17 +2023-11-01 15:44:07.928 | INFO | __main__::259 - Prep stop segment df: 0:00:28.293002 +2023-11-01 15:44:13.436 | INFO | __main__::267 - execution time: 0:00:33.800758 +2023-11-01 15:44:13.436 | INFO | __main__::252 - Analysis date: 2023-04-12 +2023-11-01 15:44:49.328 | INFO | __main__::259 - Prep stop segment df: 0:00:35.870067 +2023-11-01 15:44:53.144 | INFO | __main__::267 - execution time: 0:00:39.686215 +2023-11-01 15:44:53.145 | INFO | __main__::252 - Analysis date: 2023-03-15 +2023-11-01 15:45:20.814 | INFO | __main__::259 - Prep stop segment df: 0:00:27.669192 +2023-11-01 15:45:24.707 | INFO | __main__::267 - execution time: 0:00:31.561927 diff --git a/rt_segment_speeds/logs/shapely_project_vp.log b/rt_segment_speeds/logs/shapely_project_vp.log new file mode 100644 index 000000000..bf632164b --- /dev/null +++ b/rt_segment_speeds/logs/shapely_project_vp.log @@ -0,0 +1,27 @@ +2023-11-03 13:29:37.759 | INFO | __main__::59 - Analysis date: 2023-10-11 +2023-11-03 13:30:34.396 | INFO | __main__::108 - map partitions: 0:00:56.633350 +2023-11-03 13:36:05.124 | INFO | __main__::115 - compute and export: 0:05:30.728361 +2023-11-03 14:06:48.833 | INFO | __main__::125 - Analysis date: 2023-08-15 +2023-11-03 14:07:35.513 | INFO | __main__:project_usable_vp_one_day:98 - map partitions: 0:00:46.678890 +2023-11-03 14:12:57.494 | INFO | __main__:project_usable_vp_one_day:105 - compute and export: 0:05:21.981347 +2023-11-03 14:12:57.497 | INFO | __main__:project_usable_vp_one_day:106 - execution time: 0:06:08.660237 +2023-11-03 14:12:57.511 | INFO | __main__::125 - Analysis date: 2023-07-12 +2023-11-03 14:13:45.518 | INFO | __main__:project_usable_vp_one_day:98 - map partitions: 0:00:48.005825 +2023-11-03 14:19:44.076 | INFO | __main__:project_usable_vp_one_day:105 - compute and export: 0:05:58.557858 +2023-11-03 14:19:44.078 | INFO | __main__:project_usable_vp_one_day:106 - execution time: 0:06:46.563683 +2023-11-03 14:19:44.088 | INFO | __main__::125 - Analysis date: 2023-06-14 +2023-11-03 14:21:06.750 | INFO | __main__:project_usable_vp_one_day:98 - map partitions: 0:01:22.660793 +2023-11-03 14:25:55.035 | INFO | __main__:project_usable_vp_one_day:105 - compute and export: 0:04:48.284841 +2023-11-03 14:25:55.036 | INFO | __main__:project_usable_vp_one_day:106 - execution time: 0:06:10.945634 +2023-11-03 14:25:55.045 | INFO | __main__::125 - Analysis date: 2023-05-17 +2023-11-03 14:26:58.867 | INFO | __main__:project_usable_vp_one_day:98 - map partitions: 0:01:03.820596 +2023-11-03 14:34:19.919 | INFO | __main__:project_usable_vp_one_day:105 - compute and export: 0:07:21.052399 +2023-11-03 14:34:19.919 | INFO | __main__:project_usable_vp_one_day:106 - execution time: 0:08:24.872995 +2023-11-03 14:34:19.928 | INFO | __main__::125 - Analysis date: 2023-04-12 +2023-11-03 14:35:19.182 | INFO | __main__:project_usable_vp_one_day:98 - map partitions: 0:00:59.253906 
+2023-11-03 14:40:20.944 | INFO | __main__:project_usable_vp_one_day:105 - compute and export: 0:05:01.761581 +2023-11-03 14:40:20.944 | INFO | __main__:project_usable_vp_one_day:106 - execution time: 0:06:01.015487 +2023-11-03 14:40:20.953 | INFO | __main__::125 - Analysis date: 2023-03-15 +2023-11-03 14:41:13.621 | INFO | __main__:project_usable_vp_one_day:98 - map partitions: 0:00:52.667374 +2023-11-03 14:46:15.016 | INFO | __main__:project_usable_vp_one_day:105 - compute and export: 0:05:01.394757 +2023-11-03 14:46:15.017 | INFO | __main__:project_usable_vp_one_day:106 - execution time: 0:05:54.062131 diff --git a/rt_segment_speeds/logs/sjoin_shapes_roads.log b/rt_segment_speeds/logs/sjoin_shapes_roads.log new file mode 100644 index 000000000..e69de29bb diff --git a/rt_segment_speeds/logs/sjoin_vp_segments.log b/rt_segment_speeds/logs/sjoin_vp_segments.log deleted file mode 100644 index 2ce93e885..000000000 --- a/rt_segment_speeds/logs/sjoin_vp_segments.log +++ /dev/null @@ -1,69 +0,0 @@ -2023-08-13 15:32:42.979 | INFO | __main__::211 - Analysis date: 2023-07-12 -2023-08-13 15:34:48.942 | INFO | __main__:sjoin_vp_to_segments:176 - prep vp and persist: 0:02:05.928211 -2023-08-13 15:34:49.037 | INFO | __main__:sjoin_vp_to_segments:191 - sjoin with map_partitions: 0:00:00.094836 -2023-08-13 15:41:29.266 | INFO | __main__:sjoin_vp_to_segments:200 - export partitioned results: 0:06:40.228903 -2023-08-13 15:41:33.115 | INFO | __main__::223 - execution time: 0:08:50.105129 -2023-08-13 21:21:05.325 | INFO | __main__::211 - Analysis date: 2023-06-14 -2023-08-13 21:23:03.599 | INFO | __main__:sjoin_vp_to_segments:176 - prep vp and persist: 0:01:58.270024 -2023-08-13 21:23:03.616 | INFO | __main__:sjoin_vp_to_segments:191 - sjoin with map_partitions: 0:00:00.016758 -2023-08-13 21:28:45.980 | INFO | __main__:sjoin_vp_to_segments:200 - export partitioned results: 0:05:42.364145 -2023-08-13 21:28:49.685 | INFO | __main__::223 - execution time: 0:07:44.360210 -2023-08-13 22:04:16.808 | INFO | __main__::211 - Analysis date: 2023-05-17 -2023-08-13 22:06:20.536 | INFO | __main__:sjoin_vp_to_segments:176 - prep vp and persist: 0:02:03.724408 -2023-08-13 22:06:20.559 | INFO | __main__:sjoin_vp_to_segments:191 - sjoin with map_partitions: 0:00:00.022602 -2023-08-13 22:12:19.877 | INFO | __main__:sjoin_vp_to_segments:200 - export partitioned results: 0:05:59.317878 -2023-08-13 22:12:23.653 | INFO | __main__::223 - execution time: 0:08:06.844146 -2023-08-13 22:40:05.238 | INFO | __main__::211 - Analysis date: 2023-04-12 -2023-08-13 22:42:09.037 | INFO | __main__:sjoin_vp_to_segments:176 - prep vp and persist: 0:02:03.793820 -2023-08-13 22:42:09.061 | INFO | __main__:sjoin_vp_to_segments:191 - sjoin with map_partitions: 0:00:00.023704 -2023-08-13 22:48:10.880 | INFO | __main__:sjoin_vp_to_segments:200 - export partitioned results: 0:06:01.819976 -2023-08-13 22:48:14.912 | INFO | __main__::223 - execution time: 0:08:09.673477 -2023-08-14 07:54:28.210 | INFO | __main__::211 - Analysis date: 2023-03-15 -2023-08-14 07:56:58.975 | INFO | __main__:sjoin_vp_to_segments:176 - prep vp and persist: 0:02:30.645154 -2023-08-14 07:56:59.002 | INFO | __main__:sjoin_vp_to_segments:191 - sjoin with map_partitions: 0:00:00.026733 -2023-08-14 08:07:24.889 | INFO | __main__:sjoin_vp_to_segments:200 - export partitioned results: 0:10:25.887360 -2023-08-14 08:07:29.336 | INFO | __main__::223 - execution time: 0:13:01.059573 -2023-08-18 14:12:59.926 | INFO | __main__::211 - Analysis date: 2023-08-16 -2023-08-18 14:15:43.911 | INFO 
| __main__:sjoin_vp_to_segments:176 - prep vp and persist: 0:02:43.951955 -2023-08-18 14:15:43.969 | INFO | __main__:sjoin_vp_to_segments:191 - sjoin with map_partitions: 0:00:00.057488 -2023-08-18 14:22:15.411 | INFO | __main__:sjoin_vp_to_segments:200 - export partitioned results: 0:06:31.441789 -2023-08-18 14:22:19.302 | INFO | __main__::223 - execution time: 0:09:19.346578 -2023-08-24 14:23:28.328 | INFO | __main__::211 - Analysis date: 2023-08-15 -2023-08-24 14:25:38.717 | INFO | __main__:sjoin_vp_to_segments:176 - prep vp and persist: 0:02:10.371530 -2023-08-24 14:25:38.767 | INFO | __main__:sjoin_vp_to_segments:191 - sjoin with map_partitions: 0:00:00.050300 -2023-08-24 14:32:56.013 | INFO | __main__:sjoin_vp_to_segments:200 - export partitioned results: 0:07:17.246282 -2023-08-24 14:33:01.424 | INFO | __main__::223 - execution time: 0:09:33.082155 -2023-09-09 13:26:39.979 | INFO | __main__::457 - Analysis date: 2023-07-12 -2023-09-09 13:34:38.631 | INFO | __main__::465 - remove erroneous sjoin results: 0:07:58.650611 -2023-09-14 12:54:12.157 | INFO | __main__::211 - Analysis date: 2023-09-13 -2023-09-14 12:56:46.821 | INFO | __main__:sjoin_vp_to_segments:176 - prep vp and persist: 0:02:34.658525 -2023-09-14 12:56:46.912 | INFO | __main__:sjoin_vp_to_segments:191 - sjoin with map_partitions: 0:00:00.091034 -2023-09-14 13:06:11.390 | INFO | __main__:sjoin_vp_to_segments:200 - export partitioned results: 0:09:24.477597 -2023-09-14 13:06:15.758 | INFO | __main__::223 - execution time: 0:12:03.599629 -2023-09-14 13:06:31.913 | INFO | __main__::457 - Analysis date: 2023-09-13 -2023-09-14 13:14:08.656 | INFO | __main__::465 - remove erroneous sjoin results: 0:07:36.742381 -2023-09-21 10:58:58.711 | INFO | __main__::457 - Analysis date: 2023-09-13 -2023-09-21 11:05:58.377 | INFO | __main__::465 - remove erroneous sjoin results: 0:06:59.626650 -2023-09-21 12:11:55.875 | INFO | __main__::456 - Analysis date: 2023-08-15 -2023-09-21 12:19:13.216 | INFO | __main__::464 - remove erroneous sjoin results: 0:07:17.337816 -2023-09-21 12:58:44.489 | INFO | __main__::456 - Analysis date: 2023-07-12 -2023-09-21 13:07:33.610 | INFO | __main__::464 - remove erroneous sjoin results: 0:08:49.120980 -2023-09-21 15:22:02.693 | INFO | __main__::456 - Analysis date: 2023-06-14 -2023-09-21 15:29:14.363 | INFO | __main__::464 - remove erroneous sjoin results: 0:07:11.668827 -2023-09-21 17:26:50.733 | INFO | __main__::456 - Analysis date: 2023-05-17 -2023-09-21 17:33:37.420 | INFO | __main__::464 - remove erroneous sjoin results: 0:06:46.686094 -2023-09-21 18:03:52.838 | INFO | __main__::456 - Analysis date: 2023-04-12 -2023-09-21 18:11:04.091 | INFO | __main__::464 - remove erroneous sjoin results: 0:07:11.251722 -2023-09-21 18:55:33.230 | INFO | __main__::456 - Analysis date: 2023-03-15 -2023-09-21 19:05:26.800 | INFO | __main__::464 - remove erroneous sjoin results: 0:09:53.569193 -2023-10-10 16:26:03.401 | INFO | __main__::189 - Analysis date: 2023-09-13 -2023-10-10 16:26:54.759 | INFO | __main__:sjoin_vp_to_segments:111 - import vp and segments: 0:00:51.353998 -2023-10-10 16:26:55.499 | INFO | __main__:sjoin_vp_to_segments:125 - sjoin with map_partitions: 0:00:00.739841 -2023-10-10 16:29:18.592 | INFO | __main__:sjoin_vp_to_segments:136 - export partitioned results: 0:02:23.093345 -2023-10-17 13:33:28.504 | INFO | __main__::259 - Analysis date: 2023-10-11 -2023-10-17 13:35:02.438 | INFO | __main__:sjoin_vp_to_segments:222 - prep vp and persist: 0:01:33.930536 -2023-10-17 13:39:08.674 | INFO | 
__main__:sjoin_vp_to_segments:237 - sjoin with map_partitions: 0:04:06.236607 -2023-10-17 13:39:16.788 | INFO | __main__:sjoin_vp_to_segments:248 - export partitioned results: 0:00:08.113816 -2023-10-17 13:39:16.993 | INFO | __main__::271 - execution time: 0:05:48.489259 -2023-10-17 18:14:35.528 | INFO | __main__::456 - Analysis date: 2023-10-11 -2023-10-17 18:19:27.790 | INFO | __main__::464 - remove erroneous sjoin results: 0:04:52.262114 diff --git a/rt_segment_speeds/logs/speeds_by_segment_trip.log b/rt_segment_speeds/logs/speeds_by_segment_trip.log index e2527e55d..f5654c876 100644 --- a/rt_segment_speeds/logs/speeds_by_segment_trip.log +++ b/rt_segment_speeds/logs/speeds_by_segment_trip.log @@ -1,12 +1,32 @@ -2023-10-17 18:24:17.885 | INFO | __main__::369 - Analysis date: 2023-10-11 -2023-10-17 18:24:23.381 | INFO | __main__:linear_referencing_vp_against_line:58 - set up merged vp with segments: 0:00:00.132646 -2023-10-17 18:24:23.430 | INFO | __main__:linear_referencing_vp_against_line:76 - linear referencing: 0:00:00.049441 -2023-10-17 18:28:11.254 | INFO | __main__:linear_referencing_and_speed_by_segment:302 - linear referencing: 0:03:53.340694 -2023-10-17 18:32:08.241 | INFO | __main__:linear_referencing_and_speed_by_segment:313 - make wide and get initial speeds: 0:03:56.986736 -2023-10-17 18:34:24.377 | INFO | __main__:linear_referencing_and_speed_by_segment:354 - recalculate speeds and get final: 0:02:16.135938 -2023-10-17 18:34:34.838 | INFO | __main__::377 - speeds for stop segments: 0:10:16.927674 -2023-10-17 18:34:34.838 | INFO | __main__::378 - execution time: 0:10:16.928330 -2023-10-31 12:29:06.200 | INFO | __main__::23 - Analysis date: 2023-09-13 -2023-10-31 12:29:29.129 | INFO | __main__::69 - execution time: 0:00:22.926565 2023-10-31 18:12:34.943 | INFO | __main__::23 - Analysis date: 2023-09-13 2023-10-31 18:12:57.436 | INFO | __main__::69 - execution time: 0:00:22.465316 +2023-11-03 13:53:00.175 | INFO | __main__::77 - Analysis date: 2023-10-11 +2023-11-03 13:53:24.360 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:24.139367 +2023-11-03 16:15:37.605 | INFO | __main__::77 - Analysis date: 2023-08-15 +2023-11-03 16:16:03.643 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:26.037613 +2023-11-03 16:16:03.760 | INFO | __main__::77 - Analysis date: 2023-07-12 +2023-11-03 16:16:26.371 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:22.610908 +2023-11-03 16:16:26.436 | INFO | __main__::77 - Analysis date: 2023-06-14 +2023-11-03 16:16:46.858 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:20.420953 +2023-11-03 16:16:46.941 | INFO | __main__::77 - Analysis date: 2023-05-17 +2023-11-03 16:17:08.225 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:21.282876 +2023-11-03 16:17:08.290 | INFO | __main__::77 - Analysis date: 2023-04-12 +2023-11-03 16:17:33.748 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:25.457256 +2023-11-03 16:17:33.827 | INFO | __main__::77 - Analysis date: 2023-03-15 +2023-11-03 16:18:00.624 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:26.796081 +2023-11-06 16:11:26.549 | INFO | __main__::77 - Analysis date: 2023-10-11 +2023-11-06 16:11:47.642 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:21.073421 +2023-11-06 16:11:47.709 | INFO | __main__::77 - Analysis date: 2023-09-13 +2023-11-06 16:12:06.201 | INFO | 
__main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:18.491937 +2023-11-06 16:12:06.258 | INFO | __main__::77 - Analysis date: 2023-08-15 +2023-11-06 16:12:26.412 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:20.153804 +2023-11-06 16:12:26.478 | INFO | __main__::77 - Analysis date: 2023-07-12 +2023-11-06 16:12:45.627 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:19.146422 +2023-11-06 16:12:45.688 | INFO | __main__::77 - Analysis date: 2023-06-14 +2023-11-06 16:13:03.685 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:17.996113 +2023-11-06 16:13:03.740 | INFO | __main__::77 - Analysis date: 2023-05-17 +2023-11-06 16:13:22.611 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:18.869696 +2023-11-06 16:13:22.672 | INFO | __main__::77 - Analysis date: 2023-04-12 +2023-11-06 16:13:42.942 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:20.269320 +2023-11-06 16:13:43.001 | INFO | __main__::77 - Analysis date: 2023-03-15 +2023-11-06 16:14:02.267 | INFO | __main__:calculate_speed_from_stop_arrivals:59 - execution time: 0:00:19.265320 diff --git a/rt_segment_speeds/logs/valid_vehicle_positions.log b/rt_segment_speeds/logs/valid_vehicle_positions.log deleted file mode 100644 index 66aa86804..000000000 --- a/rt_segment_speeds/logs/valid_vehicle_positions.log +++ /dev/null @@ -1,135 +0,0 @@ -2023-08-13 16:04:05.150 | INFO | __main__::349 - Analysis date: 2023-07-12 -2023-08-13 16:12:31.355 | INFO | __main__::363 - pare down vp by stop segments special cases 0:08:26.201267 -2023-08-13 16:12:31.356 | INFO | __main__::366 - execution time: 0:08:26.205295 -2023-08-13 20:45:54.933 | INFO | __main__::160 - Analysis date: 2023-07-12 -2023-08-13 20:48:03.182 | INFO | __main__:pare_down_vp_by_segment:132 - merge usable vp with sjoin results: 0:02:08.244979 -2023-08-13 20:48:03.285 | INFO | __main__:pare_down_vp_by_segment:141 - keep enter/exit points: 0:00:00.103224 -2023-08-13 20:50:54.420 | INFO | __main__:pare_down_vp_by_segment:149 - exported: 0:02:51.134568 -2023-08-13 20:50:54.422 | INFO | __main__::174 - pare down vp by stop segments normal cases 0:04:59.485200 -2023-08-13 20:50:54.423 | INFO | __main__::177 - execution time: 0:04:59.488991 -2023-08-13 20:51:37.688 | INFO | __main__::351 - Analysis date: 2023-07-12 -2023-08-13 21:00:17.993 | INFO | __main__::365 - pare down vp by stop segments special cases 0:08:40.301265 -2023-08-13 21:00:17.994 | INFO | __main__::368 - execution time: 0:08:40.305588 -2023-08-13 21:29:04.380 | INFO | __main__::160 - Analysis date: 2023-06-14 -2023-08-13 21:31:30.681 | INFO | __main__:pare_down_vp_by_segment:132 - merge usable vp with sjoin results: 0:02:26.297493 -2023-08-13 21:31:30.753 | INFO | __main__:pare_down_vp_by_segment:141 - keep enter/exit points: 0:00:00.072037 -2023-08-13 21:34:24.288 | INFO | __main__:pare_down_vp_by_segment:149 - exported: 0:02:53.534893 -2023-08-13 21:34:24.290 | INFO | __main__::174 - pare down vp by stop segments normal cases 0:05:19.906925 -2023-08-13 21:34:24.291 | INFO | __main__::177 - execution time: 0:05:19.910619 -2023-08-13 21:34:38.073 | INFO | __main__::351 - Analysis date: 2023-06-14 -2023-08-13 21:43:18.603 | INFO | __main__::365 - pare down vp by stop segments special cases 0:08:40.525052 -2023-08-13 21:43:18.604 | INFO | __main__::368 - execution time: 0:08:40.529666 -2023-08-13 22:12:36.662 | INFO | __main__::160 - Analysis date: 2023-05-17 
-2023-08-13 22:15:12.892 | INFO | __main__:pare_down_vp_by_segment:132 - merge usable vp with sjoin results: 0:02:36.226611 -2023-08-13 22:15:12.971 | INFO | __main__:pare_down_vp_by_segment:141 - keep enter/exit points: 0:00:00.078677 -2023-08-13 22:18:03.325 | INFO | __main__:pare_down_vp_by_segment:149 - exported: 0:02:50.353998 -2023-08-13 22:18:03.327 | INFO | __main__::174 - pare down vp by stop segments normal cases 0:05:26.661411 -2023-08-13 22:18:03.327 | INFO | __main__::177 - execution time: 0:05:26.664871 -2023-08-13 22:18:18.278 | INFO | __main__::351 - Analysis date: 2023-05-17 -2023-08-13 22:26:56.589 | INFO | __main__::365 - pare down vp by stop segments special cases 0:08:38.303835 -2023-08-13 22:26:56.590 | INFO | __main__::368 - execution time: 0:08:38.310902 -2023-08-13 22:48:28.482 | INFO | __main__::160 - Analysis date: 2023-04-12 -2023-08-13 22:51:34.171 | INFO | __main__:pare_down_vp_by_segment:132 - merge usable vp with sjoin results: 0:03:05.685506 -2023-08-13 22:51:34.253 | INFO | __main__:pare_down_vp_by_segment:141 - keep enter/exit points: 0:00:00.081727 -2023-08-13 22:54:36.123 | INFO | __main__:pare_down_vp_by_segment:149 - exported: 0:03:01.870409 -2023-08-13 22:54:36.125 | INFO | __main__::174 - pare down vp by stop segments normal cases 0:06:07.639993 -2023-08-13 22:54:36.126 | INFO | __main__::177 - execution time: 0:06:07.643654 -2023-08-13 22:54:49.842 | INFO | __main__::351 - Analysis date: 2023-04-12 -2023-08-13 23:03:53.321 | INFO | __main__::365 - pare down vp by stop segments special cases 0:09:03.475660 -2023-08-13 23:03:53.322 | INFO | __main__::368 - execution time: 0:09:03.479617 -2023-08-14 08:07:45.615 | INFO | __main__::160 - Analysis date: 2023-03-15 -2023-08-14 08:10:42.732 | INFO | __main__:pare_down_vp_by_segment:132 - merge usable vp with sjoin results: 0:02:57.076804 -2023-08-14 08:10:42.830 | INFO | __main__:pare_down_vp_by_segment:141 - keep enter/exit points: 0:00:00.097241 -2023-08-14 08:14:47.031 | INFO | __main__:pare_down_vp_by_segment:149 - exported: 0:04:04.201704 -2023-08-14 08:14:47.056 | INFO | __main__::174 - pare down vp by stop segments normal cases 0:07:01.400380 -2023-08-14 08:14:47.058 | INFO | __main__::177 - execution time: 0:07:01.408741 -2023-08-14 08:15:05.189 | INFO | __main__::351 - Analysis date: 2023-03-15 -2023-08-14 08:25:56.940 | INFO | __main__::365 - pare down vp by stop segments special cases 0:10:51.746002 -2023-08-14 08:25:56.966 | INFO | __main__::368 - execution time: 0:10:51.776126 -2023-08-18 14:22:39.116 | INFO | __main__::160 - Analysis date: 2023-08-16 -2023-08-18 14:25:47.568 | INFO | __main__:pare_down_vp_by_segment:132 - merge usable vp with sjoin results: 0:03:08.432750 -2023-08-18 14:25:47.656 | INFO | __main__:pare_down_vp_by_segment:141 - keep enter/exit points: 0:00:00.088305 -2023-08-18 14:29:22.685 | INFO | __main__:pare_down_vp_by_segment:149 - exported: 0:03:35.028937 -2023-08-18 14:29:22.687 | INFO | __main__::174 - pare down vp by stop segments normal cases 0:06:43.551547 -2023-08-18 14:29:22.687 | INFO | __main__::177 - execution time: 0:06:43.557315 -2023-08-18 14:29:38.732 | INFO | __main__::351 - Analysis date: 2023-08-16 -2023-08-18 14:40:02.981 | INFO | __main__::365 - pare down vp by stop segments special cases 0:10:24.243990 -2023-08-18 14:40:02.981 | INFO | __main__::368 - execution time: 0:10:24.248704 -2023-08-24 14:33:21.591 | INFO | __main__::160 - Analysis date: 2023-08-15 -2023-08-24 14:35:42.158 | INFO | __main__:pare_down_vp_by_segment:132 - merge usable vp with 
sjoin results: 0:02:20.558585 -2023-08-24 14:35:42.253 | INFO | __main__:pare_down_vp_by_segment:141 - keep enter/exit points: 0:00:00.094556 -2023-08-24 14:39:07.353 | INFO | __main__:pare_down_vp_by_segment:149 - exported: 0:03:25.100624 -2023-08-24 14:39:07.355 | INFO | __main__::174 - pare down vp by stop segments normal cases 0:05:45.755594 -2023-08-24 14:39:07.356 | INFO | __main__::177 - execution time: 0:05:45.761935 -2023-08-24 14:39:22.403 | INFO | __main__::351 - Analysis date: 2023-08-15 -2023-08-24 14:48:33.172 | INFO | __main__::365 - pare down vp by stop segments special cases 0:09:10.762642 -2023-08-24 14:48:33.173 | INFO | __main__::368 - execution time: 0:09:10.768661 -2023-09-12 10:02:48.637 | INFO | __main__::131 - Analysis date: 2023-07-12 -2023-09-12 10:02:50.085 | INFO | __main__:pare_down_vp_by_segment:103 - merge usable vp with sjoin results: 0:00:01.398546 -2023-09-12 10:02:50.172 | INFO | __main__:pare_down_vp_by_segment:112 - keep enter/exit points: 0:00:00.086938 -2023-09-12 10:05:01.643 | INFO | __main__::131 - Analysis date: 2023-07-12 -2023-09-12 10:05:02.945 | INFO | __main__:pare_down_vp_by_segment:103 - merge usable vp with sjoin results: 0:00:01.297436 -2023-09-12 10:05:03.028 | INFO | __main__:pare_down_vp_by_segment:112 - keep enter/exit points: 0:00:00.083493 -2023-09-12 10:10:26.430 | INFO | __main__:pare_down_vp_by_segment:120 - exported: 0:05:23.402394 -2023-09-12 10:10:26.450 | INFO | __main__::145 - pare down vp by stop segments normal cases 0:05:24.802713 -2023-09-12 10:10:26.451 | INFO | __main__::148 - execution time: 0:05:24.807249 -2023-09-12 10:34:00.671 | INFO | __main__::134 - Analysis date: 2023-07-12 -2023-09-12 10:34:01.998 | INFO | __main__:pare_down_vp_by_segment:103 - merge usable vp with sjoin results: 0:00:01.307426 -2023-09-12 10:34:02.107 | INFO | __main__:pare_down_vp_by_segment:112 - keep enter/exit points: 0:00:00.108987 -2023-09-12 10:39:50.666 | INFO | __main__:pare_down_vp_by_segment:123 - exported: 0:05:48.558950 -2023-09-12 10:39:50.667 | INFO | __main__::148 - pare down vp by stop segments for all cases 0:05:49.976836 -2023-09-12 10:39:50.668 | INFO | __main__::151 - execution time: 0:05:49.981306 -2023-09-14 13:14:30.723 | INFO | __main__::134 - Analysis date: 2023-09-13 -2023-09-14 13:14:31.992 | INFO | __main__:pare_down_vp_by_segment:103 - merge usable vp with sjoin results: 0:00:01.264072 -2023-09-14 13:14:32.090 | INFO | __main__:pare_down_vp_by_segment:112 - keep enter/exit points: 0:00:00.097104 -2023-09-14 13:20:47.965 | INFO | __main__:pare_down_vp_by_segment:123 - exported: 0:06:15.874961 -2023-09-14 13:20:47.966 | INFO | __main__::148 - pare down vp by stop segments for all cases 0:06:17.237883 -2023-09-14 13:20:47.967 | INFO | __main__::151 - execution time: 0:06:17.243094 -2023-09-21 11:06:27.840 | INFO | __main__::134 - Analysis date: 2023-09-13 -2023-09-21 11:06:29.244 | INFO | __main__:pare_down_vp_by_segment:103 - merge usable vp with sjoin results: 0:00:01.386494 -2023-09-21 11:06:29.323 | INFO | __main__:pare_down_vp_by_segment:112 - keep enter/exit points: 0:00:00.079269 -2023-09-21 11:13:01.939 | INFO | __main__:pare_down_vp_by_segment:123 - exported: 0:06:32.615887 -2023-09-21 11:13:01.940 | INFO | __main__::148 - pare down vp by stop segments for all cases 0:06:34.083216 -2023-09-21 11:13:01.941 | INFO | __main__::151 - execution time: 0:06:34.088864 -2023-09-21 12:19:30.995 | INFO | __main__::134 - Analysis date: 2023-08-15 -2023-09-21 12:19:32.432 | INFO | __main__:pare_down_vp_by_segment:103 - 
merge usable vp with sjoin results: 0:00:01.410420 -2023-09-21 12:19:32.535 | INFO | __main__:pare_down_vp_by_segment:112 - keep enter/exit points: 0:00:00.103218 -2023-09-21 12:27:56.376 | INFO | __main__:pare_down_vp_by_segment:123 - exported: 0:08:23.840718 -2023-09-21 12:27:56.378 | INFO | __main__::148 - pare down vp by stop segments for all cases 0:08:25.356390 -2023-09-21 12:27:56.379 | INFO | __main__::151 - execution time: 0:08:25.360301 -2023-09-21 13:07:51.526 | INFO | __main__::134 - Analysis date: 2023-07-12 -2023-09-21 13:07:52.848 | INFO | __main__:pare_down_vp_by_segment:103 - merge usable vp with sjoin results: 0:00:01.297768 -2023-09-21 13:07:52.939 | INFO | __main__:pare_down_vp_by_segment:112 - keep enter/exit points: 0:00:00.091154 -2023-09-21 13:16:20.491 | INFO | __main__:pare_down_vp_by_segment:123 - exported: 0:08:27.552189 -2023-09-21 13:16:20.493 | INFO | __main__::148 - pare down vp by stop segments for all cases 0:08:28.942646 -2023-09-21 13:16:20.493 | INFO | __main__::151 - execution time: 0:08:28.946383 -2023-09-21 15:29:36.092 | INFO | __main__::134 - Analysis date: 2023-06-14 -2023-09-21 15:29:37.363 | INFO | __main__:pare_down_vp_by_segment:103 - merge usable vp with sjoin results: 0:00:01.266538 -2023-09-21 15:29:37.460 | INFO | __main__:pare_down_vp_by_segment:112 - keep enter/exit points: 0:00:00.096562 -2023-09-21 15:36:38.508 | INFO | __main__:pare_down_vp_by_segment:123 - exported: 0:07:01.048429 -2023-09-21 15:36:38.510 | INFO | __main__::148 - pare down vp by stop segments for all cases 0:07:02.413058 -2023-09-21 15:36:38.510 | INFO | __main__::151 - execution time: 0:07:02.417441 -2023-09-21 17:33:53.360 | INFO | __main__::134 - Analysis date: 2023-05-17 -2023-09-21 17:33:54.621 | INFO | __main__:pare_down_vp_by_segment:103 - merge usable vp with sjoin results: 0:00:01.258114 -2023-09-21 17:33:54.702 | INFO | __main__:pare_down_vp_by_segment:112 - keep enter/exit points: 0:00:00.080633 -2023-09-21 17:40:45.339 | INFO | __main__:pare_down_vp_by_segment:123 - exported: 0:06:50.637139 -2023-09-21 17:40:45.341 | INFO | __main__::148 - pare down vp by stop segments for all cases 0:06:51.977448 -2023-09-21 17:40:45.341 | INFO | __main__::151 - execution time: 0:06:51.981031 -2023-09-21 18:11:20.644 | INFO | __main__::134 - Analysis date: 2023-04-12 -2023-09-21 18:11:21.907 | INFO | __main__:pare_down_vp_by_segment:103 - merge usable vp with sjoin results: 0:00:01.253286 -2023-09-21 18:11:21.982 | INFO | __main__:pare_down_vp_by_segment:112 - keep enter/exit points: 0:00:00.074969 -2023-09-21 18:19:17.733 | INFO | __main__:pare_down_vp_by_segment:123 - exported: 0:07:55.751630 -2023-09-21 18:19:17.735 | INFO | __main__::148 - pare down vp by stop segments for all cases 0:07:57.081328 -2023-09-21 18:19:17.735 | INFO | __main__::151 - execution time: 0:07:57.086467 -2023-09-21 19:05:57.111 | INFO | __main__::134 - Analysis date: 2023-03-15 -2023-09-21 19:05:58.586 | INFO | __main__:pare_down_vp_by_segment:103 - merge usable vp with sjoin results: 0:00:01.446362 -2023-09-21 19:05:58.673 | INFO | __main__:pare_down_vp_by_segment:112 - keep enter/exit points: 0:00:00.087041 -2023-09-21 19:14:00.946 | INFO | __main__:pare_down_vp_by_segment:123 - exported: 0:08:02.272685 -2023-09-21 19:14:00.947 | INFO | __main__::148 - pare down vp by stop segments for all cases 0:08:03.807576 -2023-09-21 19:14:00.947 | INFO | __main__::151 - execution time: 0:08:03.812106 -2023-10-17 18:19:45.986 | INFO | __main__::131 - Analysis date: 2023-10-11 -2023-10-17 18:19:47.197 | 
INFO | __main__:pare_down_vp_by_segment:100 - merge usable vp with sjoin results: 0:00:01.184533 -2023-10-17 18:19:47.264 | INFO | __main__:pare_down_vp_by_segment:109 - keep enter/exit points: 0:00:00.067347 -2023-10-17 18:24:04.316 | INFO | __main__:pare_down_vp_by_segment:120 - exported: 0:04:17.051551 -2023-10-17 18:24:04.318 | INFO | __main__::145 - pare down vp by stop segments for all cases 0:04:18.305061 -2023-10-17 18:24:04.318 | INFO | __main__::148 - execution time: 0:04:18.308819 diff --git a/rt_segment_speeds/scripts/A1_sjoin_vp_segments.py b/rt_segment_speeds/scripts/A1_sjoin_vp_segments.py deleted file mode 100644 index f5b0653cc..000000000 --- a/rt_segment_speeds/scripts/A1_sjoin_vp_segments.py +++ /dev/null @@ -1,262 +0,0 @@ -""" -Spatial join vehicle positions to segments. - -Ensure that RT trips can only join to the scheduled shape -for that scheduled trip. -Otherwise, vp on the same road get joined to multiple segments -across shapes. -""" -import dask.dataframe as dd -import dask_geopandas as dg -import datetime -import geopandas as gpd -import pandas as pd -import sys - -from loguru import logger - -from calitp_data_analysis.geography_utils import WGS84 -from segment_speed_utils import helpers, wrangle_shapes -from segment_speed_utils.project_vars import (analysis_date, SEGMENT_GCS, - CONFIG_PATH, PROJECT_CRS) - -def add_grouping_col_to_vp( - vp_file_name: str, - analysis_date: str, - trip_grouping_cols: list -) -> pd.DataFrame: - """ - Import unique trips present in vehicle positions. - Use trip_instance_key to merge RT with schedule. - - Determine trip_grouping_cols, a list of columns to aggregate trip tables - up to how segments are cut. - Can be ["route_id", "direction_id"] or ["shape_array_key"] - - """ - vp_trips = pd.read_parquet( - f"{SEGMENT_GCS}{vp_file_name}_{analysis_date}", - columns = ["trip_instance_key"] - ).drop_duplicates().dropna().reset_index(drop=True) - - trips = helpers.import_scheduled_trips( - analysis_date, - columns = ["trip_instance_key"] + trip_grouping_cols, - get_pandas = True - ) - - vp_with_crosswalk = dd.merge( - vp_trips, - trips, - on = "trip_instance_key", - how = "inner" - ) - - return vp_with_crosswalk - - -def import_segments_and_buffer( - segment_file_name: str, - buffer_size: int, - segment_identifier_cols: list, - **kwargs -) -> gpd.GeoDataFrame: - """ - Import segments , subset certain columns, - and buffer by some specified amount. - """ - if "stop_segments" in segment_file_name: - filename = f"{SEGMENT_GCS}{segment_file_name}.parquet" - - elif "road_segments" in segment_file_name: - filename = f"{SEGMENT_GCS}{segment_file_name}" - - segments = gpd.read_parquet( - filename, - columns = segment_identifier_cols + [ - "seg_idx", "stop_primary_direction", "geometry"], - **kwargs - ).to_crs(PROJECT_CRS) - - # Buffer the segment for vehicle positions (points) to fall in polygons - segments = segments.assign( - geometry = segments.geometry.buffer(buffer_size) - ) - - return segments - - -def get_sjoin_results( - vp: dd.DataFrame, - segments: gpd.GeoDataFrame, - grouping_col: str, - segment_identifier_cols: list, -) -> pd.DataFrame: - """ - Merge all the segments for a shape for that trip, - and check if vp is within. - Export just vp_idx and seg_idx as our "crosswalk" of sjoin results. - If we use dask map_partitions, this is still faster than dask.delayed. 
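A minimal, toy-data sketch of the same-shape check this function applies after the spatial join; the two fake segments and three fake points below are hypothetical and only mirror the pattern, they are not taken from this module:

import geopandas as gpd
from shapely.geometry import LineString, Point

# Two buffered segments belonging to different shapes.
segments = gpd.GeoDataFrame(
    {"shape_array_key": ["a", "b"], "seg_idx": [0, 1]},
    geometry=[
        LineString([(0, 0), (10, 0)]).buffer(1),
        LineString([(0, 5), (10, 5)]).buffer(1),
    ],
)

# Three vp points; vp 2 sits on shape b's segment but belongs to shape a.
vp = gpd.GeoDataFrame(
    {"vp_idx": [1, 2, 3], "shape_array_key": ["a", "a", "b"]},
    geometry=[Point(1, 0), Point(2, 5), Point(3, 5)],
)

joined = gpd.sjoin(vp, segments, how="inner", predicate="within")

# Keep only pairs where the vp's shape matches the segment's shape,
# analogous to the f"{grouping_col}_left == {grouping_col}_right" query.
crosswalk = (
    joined.query("shape_array_key_left == shape_array_key_right")
    [["vp_idx", "seg_idx"]]
    .drop_duplicates()
)
print(crosswalk)   # vp 2 is dropped; vp 1 -> seg 0, vp 3 -> seg 1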
- """ - vp_gddf = gpd.GeoDataFrame( - vp, - geometry = gpd.points_from_xy(vp.x, vp.y, crs=WGS84) - ).to_crs(PROJECT_CRS).drop(columns = ["x", "y"]) - - vp_to_seg = gpd.sjoin( - vp_gddf, - segments, - how = "inner", - predicate = "within" - ).query( - f'{grouping_col}_left == {grouping_col}_right' - ).drop( - columns = f"{grouping_col}_right" - ).rename(columns = {f"{grouping_col}_left": grouping_col}) - - results = (vp_to_seg[["vp_idx"] + segment_identifier_cols] - .drop_duplicates() - .reset_index(drop=True) - ) - - return results - - -def stage_direction_results( - vp: dd.DataFrame, - segments: gpd.GeoDataFrame, - grouping_col: str, - segment_identifier_cols: list, - direction: str -): - opposite = wrangle_shapes.OPPOSITE_DIRECTIONS[direction] - keep_vp = [d for d in wrangle_shapes.ALL_DIRECTIONS if d != opposite] + ["Unknown"] - - # Keep all directions of vp except the ones running in opposite direction - # Esp since buses make turns, a northbound segment can be - # partially westbound and then northbound - vp_subset = vp[vp.vp_primary_direction.isin(keep_vp)].repartition(npartitions=20) - - segments_subset = segments[ - segments.stop_primary_direction==direction - ].reset_index(drop=True) - - seg_id_dtypes = segments[segment_identifier_cols].dtypes.to_dict() - - results_subset = vp_subset.map_partitions( - get_sjoin_results, - segments_subset, - grouping_col, - segment_identifier_cols, - meta = {"vp_idx": "int64", - **seg_id_dtypes}, - align_dataframes = False - ) - - return results_subset - -def sjoin_vp_to_segments( - analysis_date: str, - dict_inputs: dict = {} -): - """ - Spatial join vehicle positions to segments. - Subset by grouping columns. - - Vehicle positions can only join to the relevant segments. - Use route_dir_identifier or shape_array_key to figure out - the relevant segments those vp can be joined to. 
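A hedged sketch of the direction pre-filter applied in stage_direction_results() above: vp running opposite to the segment's direction are dropped before the join, while orthogonal and Unknown directions are kept. The OPPOSITE mapping below is a hypothetical stand-in for wrangle_shapes.OPPOSITE_DIRECTIONS:

import pandas as pd

# Hypothetical stand-in for the opposite-direction lookup.
OPPOSITE = {
    "Northbound": "Southbound", "Southbound": "Northbound",
    "Eastbound": "Westbound", "Westbound": "Eastbound",
}

vp = pd.DataFrame({
    "vp_idx": [1, 2, 3, 4],
    "vp_primary_direction": ["Northbound", "Southbound", "Unknown", "Westbound"],
})

segment_direction = "Northbound"
keep = vp[vp.vp_primary_direction != OPPOSITE[segment_direction]]

# Only vp 2 (Southbound) is excluded: buses turn, so a northbound segment
# can legitimately contain east- or westbound points, and Unknown is kept.
print(keep)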
- """ - INPUT_FILE = dict_inputs["stage1"] - SEGMENT_FILE = dict_inputs["segments_file"] - TRIP_GROUPING_COLS = dict_inputs["trip_grouping_cols"] - GROUPING_COL = dict_inputs["grouping_col"] - SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"] - EXPORT_FILE = dict_inputs["stage2"] - - BUFFER_METERS = 35 - - time0 = datetime.datetime.now() - - # Get a list of trips we need to keep from vp - vp_trips = add_grouping_col_to_vp( - f"{INPUT_FILE}", - analysis_date, - TRIP_GROUPING_COLS - ) - - groups_present = vp_trips[GROUPING_COL].unique().tolist() - - # only import segments whose shape_array_key is associated with a vp_trip - segments = import_segments_and_buffer( - f"{SEGMENT_FILE}_{analysis_date}", - BUFFER_METERS, - SEGMENT_IDENTIFIER_COLS, - filters = [[(GROUPING_COL, "in", groups_present)]] - ) - - # Import vp, keep trips that are usable - vp = dd.read_parquet( - f"{SEGMENT_GCS}{INPUT_FILE}_{analysis_date}/", - columns = [ - "trip_instance_key", "vp_idx", "x", "y", - "vp_primary_direction"] - ).merge( - vp_trips, - on = "trip_instance_key", - how = "inner" - ) - - vp = vp.repartition(npartitions=100).persist() - - time1 = datetime.datetime.now() - logger.info(f"prep vp and persist: {time1 - time0}") - - - results = [ - stage_direction_results( - vp, - segments, - GROUPING_COL, - SEGMENT_IDENTIFIER_COLS, - one_direction - ).persist() for one_direction in wrangle_shapes.ALL_DIRECTIONS - ] - - - time2 = datetime.datetime.now() - logger.info(f"sjoin with map_partitions: {time2 - time1}") - - full_results = dd.multi.concat(results, axis=0).reset_index(drop=True) - full_results = full_results.repartition(npartitions=4) - - full_results.to_parquet( - f"{SEGMENT_GCS}vp_sjoin/{EXPORT_FILE}_{analysis_date}", - overwrite=True - ) - - time3 = datetime.datetime.now() - logger.info(f"export partitioned results: {time3 - time2}") - - -if __name__ == "__main__": - - LOG_FILE = "../logs/sjoin_vp_segments.log" - logger.add(LOG_FILE, retention="3 months") - logger.add(sys.stderr, - format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", - level="INFO") - - logger.info(f"Analysis date: {analysis_date}") - - start = datetime.datetime.now() - - STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") - - sjoin_vp_to_segments( - analysis_date = analysis_date, - dict_inputs = STOP_SEG_DICT - ) - - end = datetime.datetime.now() - logger.info(f"execution time: {end-start}") \ No newline at end of file diff --git a/rt_segment_speeds/scripts/A2_sjoin_postprocessing.py b/rt_segment_speeds/scripts/A2_sjoin_postprocessing.py deleted file mode 100644 index 4bef3b131..000000000 --- a/rt_segment_speeds/scripts/A2_sjoin_postprocessing.py +++ /dev/null @@ -1,465 +0,0 @@ -""" -Spatial join post-processing. -Inlining causes erroneous sjoin results, and we may -keep way too many sjoin pairs. - -Instead of only focusing only loop_or_inlining shapes for -direction check, do it for all shapes. -If there are 2 groupings of vp (non-consecutive) attached to -a segment, check that the vp run the same direction as the segment. -Otherwise, drop. 
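A toy illustration of how a convoluted segment can be flagged: within one segment-trip, sorted vp_idx values that jump by more than 1 signal two separate passes over the segment. The numbers below are invented; the real check lives in find_convoluted_segments() further down:

import pandas as pd

sjoin_results = pd.DataFrame({
    "trip_instance_key": ["t1"] * 6,
    "stop_sequence": [3, 3, 3, 3, 7, 7],
    "vp_idx": [10, 11, 25, 26, 12, 13],
})

sjoin_results = sjoin_results.sort_values(
    ["trip_instance_key", "stop_sequence", "vp_idx"]
).reset_index(drop=True)

# Gap between consecutive vp_idx within each segment-trip.
sjoin_results["gap"] = (
    sjoin_results
    .groupby(["trip_instance_key", "stop_sequence"])["vp_idx"]
    .diff()
)

convoluted = (
    sjoin_results[sjoin_results.gap > 1]
    [["trip_instance_key", "stop_sequence"]]
    .drop_duplicates()
)
print(convoluted)   # stop_sequence 3 is flagged (gap of 14); 7 is not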
-""" -import dask.dataframe as dd -import datetime -import geopandas as gpd -import numpy as np -import pandas as pd -import shapely -import sys - -from loguru import logger - -from calitp_data_analysis.geography_utils import WGS84 -from segment_speed_utils import helpers, segment_calcs, wrangle_shapes -from segment_speed_utils.project_vars import (SEGMENT_GCS, analysis_date, - PROJECT_CRS, CONFIG_PATH) -from A3_valid_vehicle_positions import merge_usable_vp_with_sjoin_vpidx - - -def find_convoluted_segments( - ddf: dd.DataFrame, - segment_identifier_cols: list -) -> pd.DataFrame: - """ - Identify which segments (shape-stop_seq) are prone to - having double sjoin results due to inlining. - - These are particularly complex to resolve, because - even though we have vp_idx as a column, - we cannot simply check whether it is a difference of 1 - compared to previous or subsequent vp. - - Ex: if we're going westbound first, then eastbound, then - both 2 distinct groupings of sorted vp_idx will appear on both segments. - The westbound segment should be associated with smaller vp_idx. - The eastbound segment should be associated with larger vp_idx. - - So, we need to use direction to resolve this. - """ - segment_trip_cols = ["trip_instance_key"] + segment_identifier_cols - - ddf["prior_vp_idx"] = (ddf.groupby(segment_trip_cols, - observed=True, group_keys=False) - ["vp_idx"] - .shift(1, meta = ("prior_vp_idx", "Int64")) - ) - - ddf["subseq_vp_idx"] = (ddf.groupby(segment_trip_cols, - observed=True, group_keys=False) - ["vp_idx"] - .shift(-1, meta = ("subseq_vp_idx", "Int64")) - ) - - ddf = ddf.assign( - change_from_prior = ddf.vp_idx - ddf.prior_vp_idx, - change_to_subseq = ddf.subseq_vp_idx - ddf.vp_idx - ) - - # The segments that are convoluted would have either - # a max(change_from_prior) > 1 or max(change_to_subseq) > 1 - convoluted_segments = ddf.assign( - max_change_from_prior = ( - ddf.groupby(segment_trip_cols, - observed=True, group_keys=False) - .change_from_prior - .transform("max", - meta = ("max_change_from_prior", "Int64")) - ), - max_change_to_subseq = ( - ddf.groupby(segment_trip_cols, - observed=True, group_keys=False) - .change_to_subseq - .transform("max", - meta = ("max_change_to_subseq", "Int64")) - ) - )[segment_trip_cols + [ - "max_change_from_prior", - "max_change_to_subseq"] - ].drop_duplicates().query( - 'max_change_from_prior > 1 or max_change_to_subseq > 1' - ).reset_index(drop=True) - - return (convoluted_segments[segment_trip_cols] - .compute().reset_index(drop=True)) - - -def split_vp_into_groups( - df: dd.DataFrame, - group_cols: list, - col_to_find_groups: str = "location_timestamp_local" -) -> dd.DataFrame: - """ - Within each segment-trip, break up the vp into 2 groups using - vp_idx. Within each group, check direction. - Only correct sjoin results are kept. - - Can use vp_idx, should be simpler than original use of location_timestamp_local. 
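A small sketch of the grouping idea with made-up numbers: within one segment-trip, vp at or below the mean vp_idx go to group 0 and the rest to group 1, so each pass over the segment can be direction-checked separately:

import pandas as pd

df = pd.DataFrame({
    "trip_instance_key": ["t1"] * 4,
    "stop_sequence": [3] * 4,
    "vp_idx": [10, 11, 25, 26],
})

avg = (
    df.groupby(["trip_instance_key", "stop_sequence"])["vp_idx"]
    .transform("mean")   # 18 for every row of this segment-trip
)
df["group"] = (df.vp_idx > avg).astype("int8")
print(df)   # vp 10 and 11 land in group 0; vp 25 and 26 in group 1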
- """ - if col_to_find_groups == "location_timestamp_local": - col = f"{col_to_find_groups}_sec" - - df = segment_calcs.convert_timestamp_to_seconds( - df, [col_to_find_groups]) - else: - col = col_to_find_groups - - mean_df = (df.groupby(group_cols, observed=True, group_keys=False) - .agg({col: "mean"}) - .reset_index() - .rename(columns = {col: "avg"}) - ) - - df2 = dd.merge( - df, - mean_df, - on = group_cols, - ) - - df2 = df2.assign( - group = df2.apply( - lambda x: 0 if x[col] <= x.avg - else 1, axis=1, meta=("group", "int8")) - ).drop(columns = "avg") - - return df2 - - -def get_first_last_position_in_group( - df: dd.DataFrame, - group_cols: list, - col_to_find_groups: str = "location_timestamp_local_sec" -) -> pd.DataFrame: - """ - For each grouping of vp (separated by the mean timestamp) - for a segment-trip, get the first and last vp. - Find the direction each pair of points. - """ - col = col_to_find_groups - trip_group_cols = group_cols + ["group"] - - grouped_df = df.groupby(trip_group_cols, observed=True, group_keys=False) - - first = (grouped_df - .agg({col: "min"}) - .reset_index() - ) - - last = (grouped_df - .agg({col: "max"}) - .reset_index() - ) - - keep_cols = trip_group_cols + [col, "x", "y"] - - pared_down = (dd.multi.concat([first, last], axis=0) - [trip_group_cols + [col]] - .drop_duplicates() - .reset_index(drop=True) - ) - - # get rid of the groups with only 1 obs - # if it has only 1 point (cannot calculate direction vector), - # which means it'll get excluded down the line - more_than_2 = (pared_down - .groupby(trip_group_cols, observed=True, group_keys=False) - [col].size() - .loc[lambda x: x > 1] - .reset_index() - .drop(columns = col) - ) - - pared_down2 = dd.merge( - pared_down, - more_than_2, - on = trip_group_cols - ).reset_index(drop=True) - - # Do subset first, because dask doesn't like subsetting on-the-fly - df2 = df[keep_cols] - df3 = dd.merge( - df2, - pared_down2, - on = trip_group_cols + [col] - ).compute() # compute so we can sort by multiple columns - - # Sorting right before the groupby causes errors - df3 = df3.sort_values( - trip_group_cols + [col] - ).reset_index(drop=True) - - df3 = df3.assign( - obs = (df3.groupby(trip_group_cols, observed=True, - group_keys=False)[col] - .cumcount() + 1 - ).astype("int8") - ) - - return df3 - - -def get_stop_segments_direction_vector( - stop_segments: gpd.GeoDataFrame -) -> pd.DataFrame: - """ - Grab the first and last coordinate points in the stop segment - and turn that into a normalized vector. 
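A hedged, standalone sketch of that reduction with a made-up segment: only the first and last coordinates matter, and the resulting vector is normalized to unit length:

import numpy as np
from shapely.geometry import LineString

segment = LineString([(0, 0), (3, 1), (6, 8)])
coords = np.asarray(segment.coords)

direction = coords[-1] - coords[0]                       # (6, 8)
unit_direction = direction / np.linalg.norm(direction)
print(unit_direction)                                    # [0.6, 0.8]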
- """ - # Take the stop segment geometry and turn it into an array of coords - shape_array = [np.array(shapely.LineString(i).coords) - for i in stop_segments.geometry] - - # Grab the first and last items in the array, - # and turn it back to shapely - subset_shape_array = [ - np.array( - [shapely.geometry.Point(i[0]), - shapely.geometry.Point(i[-1])] - ).flatten() for i in shape_array - ] - - # Get the shape's direction vector and normalize it - direction_vector = [ - wrangle_shapes.distill_array_into_direction_vector(i) - for i in subset_shape_array - ] - - shape_vec = [wrangle_shapes.get_normalized_vector(i) - for i in direction_vector] - - # Assign this vector as a column, drop geometry, since we can - # bring it back for full df later - stop_segments2 = stop_segments.assign( - segments_vector = shape_vec - ).drop(columns = "geometry") - - return stop_segments2 - - -def find_vp_direction_vector( - df: pd.DataFrame, - group_cols: list, - crs: str = PROJECT_CRS -) -> pd.DataFrame: - """ - Get direction vector for first and last vp within segment. - """ - trip_group_cols = group_cols + ["group", "segments_vector"] - keep_cols = trip_group_cols + ["x", "y"] - - first_position = df[df.obs == 1][keep_cols] - last_position = df[df.obs==2][keep_cols] - - # Set this up to be wide so we can compare positions and - # get a vector - df_wide = pd.merge( - first_position, - last_position, - on = trip_group_cols, - suffixes = ('_start', '_end') - ).sort_values(trip_group_cols).reset_index(drop=True) - - # Use 2 geoseries, the first point and the last point - first_series = gpd.points_from_xy( - df_wide.x_start, df_wide.y_start, - crs=WGS84 - ).to_crs(crs) - - last_series = gpd.points_from_xy( - df_wide.x_end, df_wide.y_end, - crs=WGS84 - ).to_crs(crs) - - # Input 2 series to get a directon for each element-pair - direction_vector = [ - wrangle_shapes.get_direction_vector(start, end) - for start, end in zip(first_series, last_series) - ] - - # Normalize vector by Pythagorean Theorem to get values between -1 and 1 - vector_normalized = [wrangle_shapes.get_normalized_vector(i) - for i in direction_vector] - - results = df_wide[trip_group_cols] - results = results.assign( - vp_vector = vector_normalized - ) - - # Take the dot product. - # positive = same direction; 0 = orthogonal; negative = opposite direction - dot_result = [wrangle_shapes.dot_product(vec1, vec2) for vec1, vec2 in - zip(results.segments_vector, results.vp_vector)] - - results = results.assign( - dot_product = dot_result - ) - - return results - -def check_vp_direction_against_segment_direction( - convoluted_sjoin_results: dd.DataFrame, - convoluted_segments: pd.DataFrame, - segment_identifier_cols: list, - grouping_col: str -) -> dd.DataFrame: - """ - Return vp sjoined to segment_identifier_cols that are - should be excluded. 
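A minimal sketch of the dot-product test used to decide which vp group to exclude; the two vectors are invented and already normalized the same way as in find_vp_direction_vector() above:

import numpy as np

def normalize(v: np.ndarray) -> np.ndarray:
    return v / np.linalg.norm(v)

segment_vector = normalize(np.array([6.0, 8.0]))     # segment heads northeast
vp_group_vector = normalize(np.array([-5.0, -7.0]))  # this vp group heads southwest

dot = float(np.dot(segment_vector, vp_group_vector))
drop_group = dot < 0
print(round(dot, 2), drop_group)   # about -1.0, so the group is dropped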
- """ - segment_trip_cols = ["trip_instance_key"] + segment_identifier_cols - - convoluted_vp_grouped = split_vp_into_groups( - convoluted_sjoin_results, - group_cols = segment_trip_cols, - col_to_find_groups = "vp_idx" - ).persist() - - convoluted_vp_first_last = get_first_last_position_in_group( - convoluted_vp_grouped, - group_cols = segment_trip_cols, - col_to_find_groups = "vp_idx" - ) - - shapes_with_error = convoluted_segments[grouping_col].unique().tolist() - - segments_to_fix = gpd.read_parquet( - f"{SEGMENT_GCS}stop_segments_{analysis_date}.parquet", - columns = segment_identifier_cols + ["geometry"], - filters = [[(grouping_col, "in", shapes_with_error)]] - ).merge( - convoluted_segments[segment_identifier_cols], - on = segment_identifier_cols, - how = "inner" - ) - - segments_to_fix = get_stop_segments_direction_vector( - segments_to_fix) - - vp_with_segment_vec = pd.merge( - segments_to_fix, - convoluted_vp_first_last, - on = segment_identifier_cols, - ) - - vp_dot_prod = find_vp_direction_vector( - vp_with_segment_vec, segment_trip_cols) - - vp_to_drop = vp_dot_prod[vp_dot_prod.dot_product < 0][ - segment_trip_cols + ["group"]] - - vp_to_seg_drop = (convoluted_vp_grouped - .merge( - vp_to_drop, - on = segment_trip_cols + ["group"], - how = "inner" - ) - )[segment_identifier_cols + ["vp_idx"]].drop_duplicates() - - return vp_to_seg_drop - - -def remove_erroneous_sjoin_results( - analysis_date: str, - dict_inputs: dict -): - """ - Split the sjoin results into segment-trips that look ok - and ones that look convoluted. - Fix the convoluted sjoins by checking for direction. - Drop the erroneous ones. - Save over the existing sjoin results. - """ - USABLE_VP = dict_inputs["stage1"] - INPUT_FILE_PREFIX = dict_inputs["stage2"] - SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"] - SEGMENT_TRIP_COLS = ["trip_instance_key"] + SEGMENT_IDENTIFIER_COLS - GROUPING_COL = dict_inputs["grouping_col"] - - vp_trip_info = pd.read_parquet( - f"{SEGMENT_GCS}{USABLE_VP}_{analysis_date}/", - columns = ["vp_idx", "trip_instance_key"] - ) - - sjoin_results = pd.read_parquet( - f"{SEGMENT_GCS}vp_sjoin/{INPUT_FILE_PREFIX}_{analysis_date}/", - ).merge( - vp_trip_info, - on = "vp_idx", - how = "inner" - ).sort_values(SEGMENT_TRIP_COLS + ["vp_idx"]).reset_index(drop=True) - - # We can do groupby and shift with ddfs, but - # divisions not known error addressed only with sort=False - ddf = dd.from_pandas(sjoin_results, npartitions=80, sort=False) - - convoluted_segments = find_convoluted_segments( - ddf, SEGMENT_IDENTIFIER_COLS) - - error_shapes = convoluted_segments.shape_array_key.unique().tolist() - - convoluted_sjoin_results = merge_usable_vp_with_sjoin_vpidx( - f"{USABLE_VP}_{analysis_date}", - f"{INPUT_FILE_PREFIX}_{analysis_date}", - sjoin_filtering = [[(GROUPING_COL, "in", error_shapes)]] - ).merge( - convoluted_segments, - on = SEGMENT_TRIP_COLS, - how = "inner" - ) - - convoluted_sjoin_results = convoluted_sjoin_results.repartition(npartitions=5) - - convoluted_sjoin_drop = check_vp_direction_against_segment_direction( - convoluted_sjoin_results, - convoluted_segments, - SEGMENT_IDENTIFIER_COLS, - GROUPING_COL - ) - - all_sjoin_results = dd.read_parquet( - f"{SEGMENT_GCS}vp_sjoin/{INPUT_FILE_PREFIX}_{analysis_date}", - ) - - cleaned_sjoin_results = dd.merge( - all_sjoin_results, - convoluted_sjoin_drop, - on = SEGMENT_IDENTIFIER_COLS + ["vp_idx"], - how = "left", - indicator = True - ).query('_merge=="left_only"').drop(columns = "_merge") - - cleaned_sjoin_results = 
(cleaned_sjoin_results.repartition(npartitions=5) - .reset_index(drop=True)) - - cleaned_sjoin_results.to_parquet( - f"{SEGMENT_GCS}vp_sjoin/{INPUT_FILE_PREFIX}_{analysis_date}" - ) - - -if __name__ == "__main__": - - LOG_FILE = "../logs/sjoin_vp_segments.log" - logger.add(LOG_FILE, retention="3 months") - logger.add(sys.stderr, - format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", - level="INFO") - - logger.info(f"Analysis date: {analysis_date}") - - start = datetime.datetime.now() - - STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") - remove_erroneous_sjoin_results(analysis_date, STOP_SEG_DICT) - - end = datetime.datetime.now() - logger.info(f"remove erroneous sjoin results: {end-start}") - \ No newline at end of file diff --git a/rt_segment_speeds/scripts/A3_valid_vehicle_positions.py b/rt_segment_speeds/scripts/A3_valid_vehicle_positions.py deleted file mode 100644 index 18f3865c2..000000000 --- a/rt_segment_speeds/scripts/A3_valid_vehicle_positions.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -Filter out unusable trips using RT trip diagnostics. - -Keep the enter / exit points for each segment. -""" -import dask.dataframe as dd -import datetime -import numpy as np -import pandas as pd -import sys - -from loguru import logger -from typing import Literal - -from segment_speed_utils import helpers, segment_calcs -from segment_speed_utils.project_vars import (SEGMENT_GCS, analysis_date, - CONFIG_PATH) - -def identify_stop_segment_cases( - analysis_date: str, - grouping_col: str, - loop_or_inlining: Literal[0, 1] -) -> np.ndarray: - """ - Filter based on the column loop_or_inlining in the - stops_projected file. - 1 is special case, can have loops or inlining. - 0 is normal case - """ - shape_cases = pd.read_parquet( - f"{SEGMENT_GCS}stops_projected_{analysis_date}/", - filters = [[("loop_or_inlining", "==", loop_or_inlining)]], - columns = [grouping_col] - )[grouping_col].unique().tolist() - - return shape_cases - - -def merge_usable_vp_with_sjoin_vpidx( - usable_vp_file: str, - sjoin_results_file: str, - sjoin_filtering: tuple = None, - **kwargs -) -> dd.DataFrame: - """ - Grab all the usable vp (with lat/lon columns), filter it down to - normal or special cases, and merge it - against the sjoin results (which only has vp_idx + segment_identifier_cols). - """ - # First, grab all the usable vp (with lat/lon columns) - usable_vp = dd.read_parquet( - f"{SEGMENT_GCS}{usable_vp_file}", - **kwargs - ).repartition(npartitions=100) - - # Grab our results of vp_idx joined to segments - vp_to_seg = dd.read_parquet( - f"{SEGMENT_GCS}vp_sjoin/{sjoin_results_file}", - filters = sjoin_filtering, - ) - - usable_vp_full_info = dd.merge( - usable_vp, - vp_to_seg, - on = "vp_idx", - how = "inner" - ) - - return usable_vp_full_info - - -def pare_down_vp_by_segment( - analysis_date: str, - dict_inputs: dict = {} -): - """ - Pare down vehicle positions that have been joined to segments - to keep the enter / exit timestamps. - Also, exclude any bad batches of trips. 
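A toy-data sketch of the enter/exit paring: for every segment-trip, only the rows holding the earliest and latest timestamps survive (the values below are invented):

import pandas as pd

vp = pd.DataFrame({
    "trip_instance_key": ["t1"] * 4,
    "stop_sequence": [3] * 4,
    "location_timestamp_local": pd.to_datetime([
        "2023-10-11 08:00:05", "2023-10-11 08:00:25",
        "2023-10-11 08:00:45", "2023-10-11 08:01:05",
    ]),
})

group_cols = ["trip_instance_key", "stop_sequence"]
enter = vp.loc[vp.groupby(group_cols).location_timestamp_local.idxmin()]
exit_ = vp.loc[vp.groupby(group_cols).location_timestamp_local.idxmax()]

enter_exit = pd.concat([enter, exit_]).drop_duplicates().sort_index()
print(enter_exit)   # only the 08:00:05 and 08:01:05 rows remain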
- """ - time0 = datetime.datetime.now() - - USABLE_VP = dict_inputs["stage1"] - INPUT_FILE_PREFIX = dict_inputs["stage2"] - SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"] - GROUPING_COL = dict_inputs["grouping_col"] - TIMESTAMP_COL = dict_inputs["timestamp_col"] - EXPORT_FILE = dict_inputs["stage3"] - - # First, grab all the usable vp (with lat/lon columns) - usable_vp = merge_usable_vp_with_sjoin_vpidx( - f"{USABLE_VP}_{analysis_date}", - f"{INPUT_FILE_PREFIX}_{analysis_date}", - sjoin_filtering = None, - columns = ["vp_idx", "trip_instance_key", TIMESTAMP_COL, - "x", "y"] - ) - - time1 = datetime.datetime.now() - logger.info(f"merge usable vp with sjoin results: {time1 - time0}") - - vp_to_keep = segment_calcs.keep_min_max_timestamps_by_segment( - usable_vp, - SEGMENT_IDENTIFIER_COLS + ["trip_instance_key"], - TIMESTAMP_COL - ) - - time2 = datetime.datetime.now() - logger.info(f"keep enter/exit points: {time2 - time1}") - - vp_to_keep = (vp_to_keep.drop_duplicates() - .reset_index(drop=True) - .repartition(npartitions=3) - ) - vp_to_keep.to_parquet( - f"{SEGMENT_GCS}{EXPORT_FILE}_{analysis_date}", - overwrite=True - ) - - logger.info(f"exported: {datetime.datetime.now() - time2}") - - -if __name__ == "__main__": - - LOG_FILE = "../logs/valid_vehicle_positions.log" - logger.add(LOG_FILE, retention="3 months") - logger.add(sys.stderr, - format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", - level="INFO") - - logger.info(f"Analysis date: {analysis_date}") - - start = datetime.datetime.now() - - STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") - - time1 = datetime.datetime.now() - - pare_down_vp_by_segment( - analysis_date, - dict_inputs = STOP_SEG_DICT - ) - - time2 = datetime.datetime.now() - logger.info(f"pare down vp by stop segments for all cases {time2 - time1}") - - end = datetime.datetime.now() - logger.info(f"execution time: {end-start}") \ No newline at end of file diff --git a/rt_segment_speeds/scripts/B1_speeds_by_segment_trip.py b/rt_segment_speeds/scripts/B1_speeds_by_segment_trip.py deleted file mode 100644 index 75b8d7b14..000000000 --- a/rt_segment_speeds/scripts/B1_speeds_by_segment_trip.py +++ /dev/null @@ -1,379 +0,0 @@ -""" -Do linear referencing by segment-trip -and derive speed. -""" -import os -os.environ['USE_PYGEOS'] = '0' - -import dask.dataframe as dd -import dask_geopandas as dg -import datetime -import geopandas as gpd -import numpy as np -import pandas as pd -import sys - -from loguru import logger - -from calitp_data_analysis import geography_utils -from segment_speed_utils import helpers, segment_calcs, wrangle_shapes -from segment_speed_utils.project_vars import (SEGMENT_GCS, analysis_date, - PROJECT_CRS, CONFIG_PATH) -from shared_utils.rt_utils import MPH_PER_MPS - - -def linear_referencing_vp_against_line( - vp: dd.DataFrame, - segments: gpd.GeoDataFrame, - segment_identifier_cols: list, - timestamp_col: str -) -> dd.DataFrame: - """ - Take the vp x,y columns, make into gdf. - Merge in segment geometry and do linear referencing. - Return just the shape_meters result and timestamp converted to seconds. 
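At its core, the linear referencing step is shapely's project(): the distance along the segment to the point nearest the vp. A self-contained sketch with made-up geometry:

from shapely.geometry import LineString, Point

segment = LineString([(0, 0), (100, 0), (100, 50)])
vp_point = Point(100, 20)

shape_meters = segment.project(vp_point)
print(shape_meters)   # 120.0: 100 m along the first leg plus 20 m up the second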
- """ - time0 = datetime.datetime.now() - - # https://stackoverflow.com/questions/71685387/faster-methods-to-create-geodataframe-from-a-dask-or-pandas-dataframe - # https://github.com/geopandas/dask-geopandas/issues/197 - vp_gddf = dg.from_dask_dataframe( - vp, - geometry=dg.points_from_xy(vp, "x", "y") - ).set_crs(geography_utils.WGS84).to_crs(PROJECT_CRS).drop(columns = ["x", "y"]) - - vp_with_seg_geom = dd.merge( - vp_gddf, - segments, - on = segment_identifier_cols, - how = "inner" - ).rename(columns = { - "geometry_x": "vp_geometry", - "geometry_y": "segment_geometry"} - ).set_geometry("vp_geometry") - - vp_with_seg_geom = vp_with_seg_geom.repartition(npartitions=50) - - time1 = datetime.datetime.now() - logger.info(f"set up merged vp with segments: {time1 - time0}") - - shape_meters_series = vp_with_seg_geom.map_partitions( - wrangle_shapes.project_point_geom_onto_linestring, - "segment_geometry", - "vp_geometry", - meta = ("shape_meters", "float") - ) - - vp_with_seg_geom = segment_calcs.convert_timestamp_to_seconds( - vp_with_seg_geom, [timestamp_col]) - - vp_with_seg_geom = vp_with_seg_geom.assign( - shape_meters = shape_meters_series, - segment_meters = vp_with_seg_geom.segment_geometry.length - ) - - time2 = datetime.datetime.now() - logger.info(f"linear referencing: {time2 - time1}") - - drop_cols = [f"{timestamp_col}", "vp_geometry", "segment_geometry"] - vp_with_seg_geom2 = vp_with_seg_geom.drop(columns = drop_cols) - - return vp_with_seg_geom2 - - -def make_wide_get_speed( - df: dd.DataFrame, - group_cols: list, - timestamp_col: str -) -> dd.DataFrame: - """ - Get df wide and set up current vp_idx and get meters/sec_elapsed - against prior and calculate speed. - """ - vp2 = ( - df.groupby(group_cols, - observed=True, group_keys=False) - .agg({"vp_idx": "max"}) - .reset_index() - .merge( - df, - on = group_cols + ["vp_idx"], - how = "inner" - ) - ) - - vp1 = ( - df.groupby(group_cols, - observed=True, group_keys=False) - .agg({"vp_idx": "min"}) - .reset_index() - .merge( - df, - on = group_cols + ["vp_idx"], - how = "inner" - ).rename(columns = { - "vp_idx": "prior_vp_idx", - f"{timestamp_col}_sec": f"prior_{timestamp_col}_sec", - "shape_meters": "prior_shape_meters", - }) - ) - - df_wide = dd.merge( - vp2, - vp1, - on = group_cols, - how = "left" - ) - - speed = segment_calcs.derive_speed( - df_wide, - distance_cols = ("prior_shape_meters", "shape_meters"), - time_cols = (f"prior_{timestamp_col}_sec", f"{timestamp_col}_sec") - ) - - speed = speed.assign( - pct_segment = speed.meters_elapsed.divide(speed.segment_meters) - ) - - return speed - - -def filter_for_unstable_speeds( - df: pd.DataFrame, - pct_segment_threshold: float -) -> tuple[pd.DataFrame]: - ok_speeds = df[df.pct_segment > pct_segment_threshold] - low_speeds = df[df.pct_segment <= pct_segment_threshold] - - return ok_speeds, low_speeds - - -def recalculate_low_speeds_with_straight_distance( - low_speeds_df: pd.DataFrame, - group_cols: list, - timestamp_col: str -): - """ - For low speed segments, select a different vp_idx. - Use the current vp_idx and subtract by 1. - This will fill in something where the segment only had 1 point previously. 
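A hedged sketch of the fallback speed calculation: take the vp one index earlier, measure the straight-line distance between the two points, and divide by the elapsed seconds. The coordinates, timestamps, and the approximate meters-per-second-to-mph factor below are all illustrative:

from shapely.geometry import Point

MPH_PER_MPS = 2.237   # approximate m/s -> mph conversion, for illustration

current_position = Point(200.0, 0.0)   # projected coordinates, in meters
prior_position = Point(0.0, 0.0)       # the vp at vp_idx - 1

straight_meters = current_position.distance(prior_position)   # 200.0
sec_elapsed = 20.0                                             # from the two timestamps

speed_mph = straight_meters / sec_elapsed * MPH_PER_MPS
print(round(speed_mph, 1))   # roughly 22.4 mph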
- """ - keep_cols = group_cols + [ - "vp_idx", "location_timestamp_local_sec", - ] - - df1 = low_speeds_df[keep_cols].drop_duplicates().reset_index(drop=True) - - df1 = df1.assign( - prior_vp_idx = df1.vp_idx - 1 - ) - - usable_vp = dd.read_parquet( - f"{SEGMENT_GCS}vp_usable_{analysis_date}", - columns = ["trip_instance_key", - "vp_idx", timestamp_col, "x", "y"] - ) - - vp_idx_bounds = segment_calcs.get_usable_vp_bounds_by_trip(usable_vp) - - df2 = pd.merge( - df1, - vp_idx_bounds, - on = "trip_instance_key", - how = "inner" - ) - - # Check that the prior_vp_idx actually is on the same trip (must be within bounds) - # If not, select the next point - df2 = df2.assign( - prior_vp_idx = df2.apply( - lambda x: - x.vp_idx + 1 if (x.prior_vp_idx < x.min_vp_idx) and - (x.vp_idx + 1 <= x.max_vp_idx) - else x.prior_vp_idx, - axis=1) - ).drop(columns = ["trip_instance_key", "min_vp_idx", "max_vp_idx"]) - - # We will need point geom again, since we are using straight distance - subset_vp_idx = np.union1d( - df2.vp_idx.unique(), - df2.prior_vp_idx.unique() - ).tolist() - - usable_vp2 = usable_vp[usable_vp.vp_idx.isin(subset_vp_idx)].compute() - - usable_gdf = geography_utils.create_point_geometry( - usable_vp2, - longitude_col = "x", - latitude_col = "y", - crs = PROJECT_CRS - ).drop(columns = ["x", "y"]).reset_index(drop=True) - - usable_gdf2 = segment_calcs.convert_timestamp_to_seconds( - usable_gdf, [timestamp_col]).drop(columns = timestamp_col) - - # Merge in coord for current_vp_idx - # we already have a timestamp_sec for current vp_idx - gdf = pd.merge( - usable_gdf2.drop(columns = f"{timestamp_col}_sec"), - df2, - on = "vp_idx", - how = "inner" - ) - - # Merge in coord for prior_vp_idx - gdf2 = pd.merge( - gdf, - usable_gdf2[ - ["vp_idx", f"{timestamp_col}_sec", "geometry"] - ].add_prefix("prior_"), - on = "prior_vp_idx", - how = "inner" - ) - - # should we do straight distance or interpolate against full shape? - # what if full shape is problematic? - # do we want to do a check against the scale? that's not very robust either though - - gdf2 = gdf2.assign( - straight_distance = gdf2.geometry.distance(gdf2.prior_geometry) - ) - - gdf2 = gdf2.assign( - sec_elapsed = (gdf2[f"{timestamp_col}_sec"] - - gdf2[f"prior_{timestamp_col}_sec"]).abs() - ) - - gdf2 = gdf2.assign( - speed_mph = gdf2.straight_distance.divide(gdf2.sec_elapsed) * MPH_PER_MPS - ) - - drop_cols = ["geometry", "prior_geometry"] - results = gdf2.drop(columns = drop_cols) - - return results - - -def linear_referencing_and_speed_by_segment( - analysis_date: str, - dict_inputs: dict = {} -): - """ - With just enter / exit points on segments, - do the linear referencing to get shape_meters, and then derive speed. - Do a second pass for low speed segments with straight distance. - """ - time0 = datetime.datetime.now() - - VP_FILE = dict_inputs["stage3"] - SEGMENT_FILE = dict_inputs["segments_file"] - SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"] - TIMESTAMP_COL = dict_inputs["timestamp_col"] - EXPORT_FILE = dict_inputs["stage4"] - PCT_SEGMENT_MIN = dict_inputs["pct_segment_minimum"] - - # Keep subset of columns - don't need it all. 
we can get the - # columns dropped through segments file - vp_keep_cols = [ - 'trip_instance_key', - TIMESTAMP_COL, - 'x', 'y', 'vp_idx' - ] + SEGMENT_IDENTIFIER_COLS - - vp = dd.read_parquet( - f"{SEGMENT_GCS}{VP_FILE}_{analysis_date}", - columns = vp_keep_cols - ) - - segments = helpers.import_segments( - SEGMENT_GCS, - f"{SEGMENT_FILE}_{analysis_date}", - columns = SEGMENT_IDENTIFIER_COLS + ["geometry"] - ).dropna(subset="geometry").reset_index(drop=True) - - vp_with_seg_geom = linear_referencing_vp_against_line( - vp, - segments, - SEGMENT_IDENTIFIER_COLS, - TIMESTAMP_COL - ).persist() - - time1 = datetime.datetime.now() - logger.info(f"linear referencing: {time1 - time0}") - - SEGMENT_TRIP_COLS = ["trip_instance_key", - "segment_meters"] + SEGMENT_IDENTIFIER_COLS - - initial_speeds = make_wide_get_speed( - vp_with_seg_geom, SEGMENT_TRIP_COLS, TIMESTAMP_COL - ).compute() - - - time2 = datetime.datetime.now() - logger.info(f"make wide and get initial speeds: {time2 - time1}") - - ok_speeds, low_speeds = filter_for_unstable_speeds( - initial_speeds, - pct_segment_threshold = PCT_SEGMENT_MIN - ) - - low_speeds_recalculated = recalculate_low_speeds_with_straight_distance( - low_speeds, - SEGMENT_TRIP_COLS, - TIMESTAMP_COL - ) - - # Add a flag that tells us speed was recalculated - # Combine columns and rename straight distance as meters_elapsed - low_speeds_recalculated = low_speeds_recalculated.assign( - flag_recalculated = 1, - meters_elapsed = low_speeds_recalculated.straight_distance - ) - - keep_cols = SEGMENT_TRIP_COLS + [ - "vp_idx", "prior_vp_idx", - f"{TIMESTAMP_COL}_sec", f"prior_{TIMESTAMP_COL}_sec", - "meters_elapsed", - "sec_elapsed", - "pct_segment", - "speed_mph", - "flag_recalculated", - ] - - speeds = pd.concat([ - ok_speeds, - low_speeds_recalculated - ], axis=0).sort_values(SEGMENT_IDENTIFIER_COLS + ["trip_instance_key"] - ).reset_index(drop=True) - - speeds = speeds.assign( - flag_recalculated = speeds.flag_recalculated.fillna(0).astype("int8") - )[keep_cols] - - time3 = datetime.datetime.now() - logger.info(f"recalculate speeds and get final: {time3 - time2}") - - speeds.to_parquet( - f"{SEGMENT_GCS}{EXPORT_FILE}_{analysis_date}.parquet", - ) - - -if __name__ == "__main__": - - LOG_FILE = "../logs/speeds_by_segment_trip.log" - logger.add(LOG_FILE, retention="3 months") - logger.add(sys.stderr, - format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", - level="INFO") - - logger.info(f"Analysis date: {analysis_date}") - - start = datetime.datetime.now() - - STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") - - linear_referencing_and_speed_by_segment(analysis_date, STOP_SEG_DICT) - - logger.info(f"speeds for stop segments: {datetime.datetime.now() - start}") - logger.info(f"execution time: {datetime.datetime.now() - start}") - \ No newline at end of file diff --git a/rt_segment_speeds/scripts/B2_avg_speeds_by_segment.py b/rt_segment_speeds/scripts/B2_avg_speeds_by_segment.py deleted file mode 100644 index dc499e5d1..000000000 --- a/rt_segment_speeds/scripts/B2_avg_speeds_by_segment.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -Quick aggregation for speed metrics by segment -""" -import datetime -import geopandas as gpd -import pandas as pd -import sys - -from loguru import logger - -from segment_speed_utils import helpers, sched_rt_utils -from segment_speed_utils.project_vars import (SEGMENT_GCS, analysis_date, - CONFIG_PATH) -from calitp_data_analysis import utils, geography_utils - - -def calculate_avg_speeds( - df: pd.DataFrame, - group_cols: list 
-) -> pd.DataFrame: - """ - Calculate the median, 20th, and 80th percentile speeds - by groups. - """ - # Take the average after dropping unusually high speeds - grouped_df = df.groupby(group_cols, observed=True, group_keys=False) - - - avg = (grouped_df - .agg({ - "speed_mph": "median", - "trip_instance_key": "nunique"}) - .reset_index() - ) - - p20 = (grouped_df - .agg({"speed_mph": lambda x: x.quantile(0.2)}) - .reset_index() - ) - - p80 = (grouped_df - .agg({"speed_mph": lambda x: x.quantile(0.8)}) - .reset_index() - ) - - stats = pd.merge( - avg.rename(columns = {"speed_mph": "p50_mph", - "trip_instance_key": "n_trips"}), - p20.rename(columns = {"speed_mph": "p20_mph"}), - on = group_cols, - how = "left" - ).merge( - p80.rename(columns = {"speed_mph": "p80_mph"}), - on = group_cols, - how = "left" - ) - - # Clean up for map - speed_cols = [c for c in stats.columns if "_mph" in c] - stats[speed_cols] = stats[speed_cols].round(2) - - return stats - -def speeds_with_segment_geom( - analysis_date: str, - max_speed_cutoff: int = 70, - dict_inputs: dict = {}, -) -> gpd.GeoDataFrame: - """ - Import the segment-trip table. - Average the speed_mph across all trips present in the segment. - By default, filter out rows where meters_elapsed covers less than 40% of segment length - """ - SEGMENT_FILE = dict_inputs["segments_file"] - SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"] - SPEEDS_FILE = dict_inputs["stage4"] - - # Load in segment geometry - segment_cols_to_keep = SEGMENT_IDENTIFIER_COLS + [ - "schedule_gtfs_dataset_key", - "stop_id", - "loop_or_inlining", - "geometry", - "district_name" - ] - - segments = helpers.import_segments( - SEGMENT_GCS, - f"{SEGMENT_FILE}_{analysis_date}", - columns = segment_cols_to_keep - ) - - # Read in speeds - df = pd.read_parquet( - f"{SEGMENT_GCS}{SPEEDS_FILE}_{analysis_date}.parquet", - filters = [[ - ("speed_mph", "<=", max_speed_cutoff), - ("meters_elapsed", ">", 0), - ("sec_elapsed", ">", 0) - ]]) - - # Do a merge with segments - df2 = pd.merge( - segments, - df, - on = SEGMENT_IDENTIFIER_COLS, - how = "inner" - ) - - # Keep only segments that have RT data. 
- unique_segments = (df2[segment_cols_to_keep] - .drop_duplicates() - .reset_index(drop = True) - ).to_crs(geography_utils.WGS84) - - time_of_day_df = sched_rt_utils.get_trip_time_buckets(analysis_date) - - df3 = pd.merge( - df2, - time_of_day_df, - on = "trip_instance_key", - how = "inner" - ) - - all_day = calculate_avg_speeds( - df3, - SEGMENT_IDENTIFIER_COLS - ) - peak = calculate_avg_speeds( - df3[df3.time_of_day.isin(["AM Peak", "PM Peak"])], - SEGMENT_IDENTIFIER_COLS - ) - - stats = pd.concat([ - all_day.assign(time_of_day = "all_day"), - peak.assign(time_of_day = "peak") - ], axis=0) - - - # Merge in segment geometry - gdf = pd.merge( - unique_segments, - stats, - on = SEGMENT_IDENTIFIER_COLS, - how = "left" - ).sort_values(SEGMENT_IDENTIFIER_COLS + ["time_of_day"]).reset_index(drop=True) - - return gdf - - -if __name__ == "__main__": - - LOG_FILE = "../logs/avg_speeds.log" - logger.add(LOG_FILE, retention="3 months") - logger.add(sys.stderr, - format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", - level="INFO") - - logger.info(f"Analysis date: {analysis_date}") - - start = datetime.datetime.now() - STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") - EXPORT_FILE = f'{STOP_SEG_DICT["stage5"]}_{analysis_date}' - - MAX_SPEED = 70 - - # Average the speeds for segment for entire day - # Drop speeds above our max cutoff - stop_segment_speeds = speeds_with_segment_geom( - analysis_date, - max_speed_cutoff = MAX_SPEED, - dict_inputs = STOP_SEG_DICT, - ) - - utils.geoparquet_gcs_export( - stop_segment_speeds, - SEGMENT_GCS, - EXPORT_FILE - ) - - logger.info(f"execution time: {datetime.datetime.now() - start}") \ No newline at end of file diff --git a/rt_segment_speeds/scripts/B3_export.py b/rt_segment_speeds/scripts/B3_export.py deleted file mode 100644 index 53333fc6b..000000000 --- a/rt_segment_speeds/scripts/B3_export.py +++ /dev/null @@ -1,149 +0,0 @@ -""" -Attach columns needed for publishing to open data portal. -Suppress certain rows and columns too. -""" -import os -os.environ['USE_PYGEOS'] = '0' - -import dask.dataframe as dd -import dask_geopandas as dg -import datetime -import geopandas as gpd -import pandas as pd - -from shared_utils import schedule_rt_utils, utils -from calitp_data_analysis.geography_utils import WGS84 -from segment_speed_utils import helpers -from segment_speed_utils.project_vars import (SEGMENT_GCS, analysis_date, - CONFIG_PATH) - -def get_operator_natural_identifiers( - df: pd.DataFrame, - analysis_date: str -) -> pd.DataFrame: - """ - For each gtfs_dataset_key-shape_array_key combination, - re-attach the natural identifiers and organizational identifiers. - Return a df that should be merged against speeds_df. 
- """ - operator_shape_df = (df[["schedule_gtfs_dataset_key", "shape_array_key"]] - .drop_duplicates() - .reset_index(drop=True) - .rename(columns = {"schedule_gtfs_dataset_key": "gtfs_dataset_key"}) - ) - - # Get shape_id back - shape_identifiers = helpers.import_scheduled_trips( - analysis_date, - columns = ["shape_array_key", "shape_id"], - get_pandas = True - ) - - df_with_shape = pd.merge( - operator_shape_df, - shape_identifiers, - on = "shape_array_key", - how = "inner" - ) - - # Get base64_url, uri, organization_source_record_id and organization_name - crosswalk = schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk( - df_with_shape, - analysis_date, - quartet_data = "schedule", - dim_gtfs_dataset_cols = [ - "key", - "base64_url", - ], - dim_organization_cols = ["source_record_id", "name"] - ) - - df_with_org = pd.merge( - df_with_shape.rename(columns = {"gtfs_dataset_key": "schedule_gtfs_dataset_key"}), - crosswalk, - on = "schedule_gtfs_dataset_key", - how = "inner" - ) - - return df_with_org - - -def finalize_df_for_export(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: - """ - Sorting, suppressing columns not needed in export. - """ - - RENAME_DICT = { - "organization_source_record_id": "org_id", - "organization_name": "agency", - } - - gdf2 = (gdf.sort_values(["organization_name", - "shape_id", "stop_sequence"]) - .reset_index(drop=True) - .rename(columns = RENAME_DICT) - ) - - return gdf2 - -if __name__ == "__main__": - - start = datetime.datetime.now() - - STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") - INPUT_FILE = f'{STOP_SEG_DICT["stage5"]}_{analysis_date}' - - gdf = gpd.read_parquet( - f"{SEGMENT_GCS}{INPUT_FILE}.parquet" - ) - - operator_identifiers = get_operator_natural_identifiers(gdf, analysis_date) - - time1 = datetime.datetime.now() - print(f"get natural identifiers: {time1 - start}") - - gdf2 = pd.merge( - gdf, - operator_identifiers, - on = ["schedule_gtfs_dataset_key", "shape_array_key"], - how = "inner" - ) - - final_gdf = finalize_df_for_export(gdf2) - - time2 = datetime.datetime.now() - print(f"finalize: {time2 - time1}") - - keep_cols = [ - 'org_id', 'agency', - 'shape_id', 'stop_sequence', 'stop_id', - 'geometry', - 'p50_mph', 'p20_mph', - 'p80_mph', 'n_trips', - 'time_of_day', - 'base64_url', - 'district_name' - ] - - utils.geoparquet_gcs_export( - final_gdf[keep_cols], - f"{SEGMENT_GCS}export/", - INPUT_FILE - ) - - # Keep a tabular version (geom is big to save) for us to compare what's published - # and contains columns we use for internal modeling - # (shape_array_key, gtfs_dataset_key, etc) - final_gdf.drop(columns = "geometry").to_parquet( - f"{SEGMENT_GCS}export/{INPUT_FILE}_tabular.parquet" - ) - - utils.geoparquet_gcs_export( - final_gdf[keep_cols], - f"{SEGMENT_GCS}export/", - "speeds_by_stop_segments" - ) - - end = datetime.datetime.now() - print(f"export: {end - time2}") - print(f"execution time: {end - start}") \ No newline at end of file diff --git a/rt_segment_speeds/scripts/C2_triangulate_vp.py b/rt_segment_speeds/scripts/C2_triangulate_vp.py deleted file mode 100644 index daba8dbfb..000000000 --- a/rt_segment_speeds/scripts/C2_triangulate_vp.py +++ /dev/null @@ -1,184 +0,0 @@ -""" -Starting from the vp that were spatially joined -to segments, pick a subset of these. -Triangulate these by picking 5 points to better -calculate speeds for the entire trip. - -If we pick only 2 points, for a looping route, origin/destination -are basically the same. 
If we pick 3 points, this is better -for triangulating the distance traveled. -""" -import os -os.environ['USE_PYGEOS'] = '0' - -import dask.dataframe as dd -import datetime -import numpy as np -import pandas as pd - -from typing import Literal - -from segment_speed_utils import helpers -from segment_speed_utils.project_vars import (SEGMENT_GCS, analysis_date, - CONFIG_PATH) - -from A3_valid_vehicle_positions import merge_usable_vp_with_sjoin_vpidx - - -def triangulate_vp( - ddf: dd.DataFrame, - group_cols: list = ["trip_instance_key"] -) -> np.ndarray: - """ - Grab a sample of vehicle positions for each trip to triangulate distance. - These vp already sjoined onto the shape. - Roughly pick vp at equally spaced intervals. - - Dask aggregation can't group and use lambda to create list of possible - vp_idx. - """ - grouped_ddf = ddf.groupby(group_cols, observed=True, group_keys=False) - - min_df = (grouped_ddf - .agg({"vp_idx": "min"}) - .rename(columns = {"vp_idx": "min_vp_idx"}) - ) - - max_df = (grouped_ddf - .agg({"vp_idx": "max"}) - .rename(columns = {"vp_idx": "max_vp_idx"}) - ) - - vp_range = dd.merge( - min_df, - max_df, - left_index = True, - right_index = True, - how = "inner" - ) - - vp_range = vp_range.persist() - - vp_range["range_diff"] = vp_range.max_vp_idx - vp_range.min_vp_idx - - vp_range = vp_range.assign( - p25_vp_idx = (vp_range.range_diff * 0.25 + vp_range.min_vp_idx - ).round(0).astype("int64"), - p50_vp_idx = (vp_range.range_diff * 0.5 + vp_range.min_vp_idx - ).round(0).astype("int64"), - p75_vp_idx = (vp_range.range_diff * 0.75 + vp_range.min_vp_idx - ).round(0).astype("int64"), - ) - - vp_idx_cols = [ - "min_vp_idx", - "p25_vp_idx", - "p50_vp_idx", - "p75_vp_idx", - "max_vp_idx" - ] - - results = vp_range[vp_idx_cols].compute().to_numpy().flatten() - - return results - - -def subset_usable_vp(dict_inputs: dict) -> np.ndarray: - """ - Subset all the usable vp and keep a sample of triangulated - vp per trip. - """ - SEGMENT_FILE = f'{dict_inputs["segments_file"]}_{analysis_date}' - SJOIN_FILE = f'{dict_inputs["stage2"]}_{analysis_date}' - USABLE_FILE = f'{dict_inputs["stage1"]}_{analysis_date}' - GROUPING_COL = dict_inputs["grouping_col"] - - all_shapes = pd.read_parquet( - f"{SEGMENT_GCS}{SEGMENT_FILE}.parquet", - columns = ["shape_array_key"] - ).shape_array_key.unique().tolist() - - # Use this function to attach the crosswalk of sjoin results - # back to usable_vp - ddf = merge_usable_vp_with_sjoin_vpidx( - USABLE_FILE, - SJOIN_FILE, - sjoin_filtering = [(GROUPING_COL, "in", all_shapes)], - columns = ["trip_instance_key", "vp_idx"] - ) - - # Results are just vp_idx as np array - results = triangulate_vp( - ddf, - ["trip_instance_key"] - ) - - return results - - -def merge_rt_scheduled_trips( - rt_trips: dd.DataFrame, - analysis_date: str, - group_cols: list = ["trip_instance_key"]) -> dd.DataFrame: - """ - Merge RT trips (vehicle positions) to scheduled trips - to get the shape_array_key. - Don't pull other scheduled trip columns now, wait until - after aggregation is done. 
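The sampling in triangulate_vp() above boils down to choosing five roughly evenly spaced vp_idx values between each trip's minimum and maximum; a toy sketch of that arithmetic (one fake trip with vp_idx 100 through 120):

import numpy as np
import pandas as pd

vp = pd.DataFrame({
    "trip_instance_key": ["t1"] * 21,
    "vp_idx": np.arange(100, 121),
})

bounds = vp.groupby("trip_instance_key").vp_idx.agg(["min", "max"])
range_diff = bounds["max"] - bounds["min"]

picks = pd.DataFrame({
    q: (bounds["min"] + range_diff * q).round().astype("int64")
    for q in (0.0, 0.25, 0.5, 0.75, 1.0)
})
print(picks.loc["t1"].tolist())   # [100, 105, 110, 115, 120]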
- """ - trips = helpers.import_scheduled_trips( - analysis_date, - columns = group_cols + ["shape_array_key"], - get_pandas = True - ) - - df = dd.merge( - rt_trips, - trips, - on = group_cols, - how = "left", - ) - - return df - - -if __name__ == "__main__": - - start = datetime.datetime.now() - - STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") - - results = subset_usable_vp(STOP_SEG_DICT) - - time1 = datetime.datetime.now() - print(f"compute results: {time1 - start}") - - # Use these vp_idx and filter the vp with all the columns - vp_idx_list = results.tolist() - - USABLE_FILE = f'{STOP_SEG_DICT["stage1"]}_{analysis_date}' - - vp_results = dd.read_parquet( - f"{SEGMENT_GCS}{USABLE_FILE}/", - columns = [ - "gtfs_dataset_key", "trip_instance_key", - "location_timestamp_local", - "x", "y", "vp_idx"], - filters = [[("vp_idx", "in", vp_idx_list)]] - ).compute() - - vp_with_sched = ( - merge_rt_scheduled_trips( - vp_results, - analysis_date, - group_cols = ["trip_instance_key"] - ).sort_values("vp_idx") - .reset_index(drop=True) - ) - - vp_with_sched.to_parquet( - f"{SEGMENT_GCS}trip_summary/vp_subset_{analysis_date}.parquet", - ) - - end = datetime.datetime.now() - print(f"execution time: {end - start}") diff --git a/rt_segment_speeds/scripts/C3_trip_route_speed.py b/rt_segment_speeds/scripts/C3_trip_route_speed.py deleted file mode 100644 index 46795c36d..000000000 --- a/rt_segment_speeds/scripts/C3_trip_route_speed.py +++ /dev/null @@ -1,353 +0,0 @@ -""" -Use triangulated points, 5 sample vp, per trip -and calculate distance and seconds elapsed. -For each trip, take the sum of the change in distances, change in time, -and calculate speed. - -Aggregate trip speeds into route-direction averages by time-of-day. -""" -import dask.dataframe as dd -import datetime -import geopandas as gpd -import numpy as np -import pandas as pd - -from shared_utils.rt_utils import MPH_PER_MPS -from calitp_data_analysis.geography_utils import WGS84 -from calitp_data_analysis import utils -from shared_utils import portfolio_utils, schedule_rt_utils -from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes -from segment_speed_utils.project_vars import (SEGMENT_GCS, analysis_date, - PROJECT_CRS) - -def distance_and_seconds_elapsed( - df: pd.DataFrame, - group_cols: list -) -> pd.DataFrame: - """ - If every trip has 3 vp, we want the change in time and distance - between 1st and 2nd, 2nd and 3rd. - Then, sum up the change in time and change by trip. 
- """ - dist_col = "shape_meters" - time_col = "location_timestamp_local" - sort_cols = group_cols + ["vp_idx"] - - # Get the change in distance, time for each row - df = df.assign( - prior_dist = (df.sort_values(sort_cols) - .groupby(group_cols, - observed=True, group_keys=False) - [dist_col] - .apply(lambda x: x.shift(1)) - ), - prior_time = (df.sort_values(sort_cols) - .groupby(group_cols, - observed=True, group_keys=False) - [time_col] - .apply(lambda x: x.shift(1)) - ) - ) - - # distance should be positive, but sometimes it's not, - # so use absolute value - df = df.assign( - change_meters = abs(df[dist_col] - df.prior_dist), - change_sec = (df[time_col] - df.prior_time).divide( - np.timedelta64(1, 's')) - ) - - # For a trip, sum up the total change in distance and time - # Easier to calculate the speed this way, than - # taking a weighted average later - df2 = (df.groupby(group_cols, - observed=True, group_keys=False) - .agg({"change_meters": "sum", - "change_sec": "sum"}) - .reset_index() - ) - - df2 = df2.assign( - speed_mph = (df2.change_meters.divide(df2.change_sec) * - MPH_PER_MPS) - ) - - return df2 - - -def add_scheduled_trip_columns( - rt_trips: pd.DataFrame, - analysis_date: str, - group_cols: list = ["trip_instance_key"]) -> pd.DataFrame: - """ - Merge RT trips (vehicle positions) to scheduled trips. - Add in the needed scheduled trip columns to take - route-direction-time_of_day averages. - """ - keep_cols = [ - "gtfs_dataset_key", - "direction_id", - "route_id", "route_short_name", "route_long_name", "route_desc", - ] + group_cols - - crosswalk = helpers.import_scheduled_trips( - analysis_date, - columns = keep_cols, - get_pandas = True - ) - - common_shape = sched_rt_utils.most_common_shape_by_route_direction(analysis_date) - - crosswalk2 = pd.merge( - crosswalk, - common_shape, - on = ["schedule_gtfs_dataset_key", "route_id", "direction_id"], - how = "inner" - ).astype({"direction_id": "Int64"}) - - - time_of_day = sched_rt_utils.get_trip_time_buckets(analysis_date) - - # Clean up route name - crosswalk2 = portfolio_utils.add_route_name( - crosswalk2 - ).drop(columns = ["route_short_name", "route_long_name", "route_desc"]) - - df = dd.merge( - rt_trips, - crosswalk2, - on = group_cols, - how = "left", - ).merge( - time_of_day, - on = group_cols, - how = "left" - ) - - return df - - -def drop_extremely_low_and_high_speeds( - df: pd.DataFrame, - speed_range: tuple -) -> pd.DataFrame: - """ - Descriptives show the 5th percentile is around 5 mph, - and 95th percentile is around 25 mph. - - There are some weird calculations for <3 mph, and even - some negative values, so let's exclude those...maybe - the vp is not traveling across the entirety of the shape. - - Exclude unusually high speeds, over 70 mph. - """ - low, high = speed_range - - df2 = df[(df.speed_mph >= low) & - (df.speed_mph <= high) - ].reset_index(drop=True) - - return df2 - - -def avg_route_speeds_by_time_of_day( - df: pd.DataFrame, - group_cols: list, - speed_range: tuple = (3, 70) -) -> pd.DataFrame: - """ - Keep trips with average speeds at least LOWER_BOUND_SPEED - and less than or equal to UPPER_BOUND_SPEED. - - Take the average by route-direction-time_of_day. 
- Also include averages for scheduled trip service_minutes vs - rt trip approximated-service-minutes - """ - df2 = drop_extremely_low_and_high_speeds(df, speed_range = (3, 70)) - - df3 = (df2.groupby(group_cols, - observed = True, group_keys = False) - .agg({ - "speed_mph": "mean", - "service_minutes": "mean", - "change_sec": "mean", - "trip_instance_key": "count" - }).reset_index() - ) - - df3 = df3.assign( - avg_rt_trip_min = df3.change_sec.divide(60).round(1), - service_minutes = df3.service_minutes.round(1), - speed_mph = df3.speed_mph.round(1), - ).rename(columns = { - "service_minutes": "avg_sched_trip_min", - "trip_instance_key": "n_trips", - "route_name_used": "route_name", - }).drop(columns = "change_sec") - - return df3 - - -def final_cleaning_for_export( - df: pd.DataFrame, - analysis_date: str -) -> gpd.GeoDataFrame: - """ - Attach shape geometry to most common shape_id. - """ - # Attach org name and source_record_id - org_crosswalk = ( - schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk( - df, - analysis_date, - quartet_data = "vehicle_positions", - dim_gtfs_dataset_cols = ["key", "base64_url"], - dim_organization_cols = ["source_record_id", - "name", "caltrans_district"]) - ) - - df_with_org = pd.merge( - df, - org_crosswalk.rename(columns = { - "vehicle_positions_gtfs_dataset_key": "gtfs_dataset_key"}), - on = "gtfs_dataset_key", - how = "inner" - ) - - # Attach shape geometry and make sure it's in WGS84 - shapes = helpers.import_scheduled_shapes( - analysis_date, - columns = ["shape_array_key", "geometry"], - get_pandas = True, - crs = WGS84 - ) - - df_with_shape = pd.merge( - shapes, - df_with_org, - on = "shape_array_key", # once merged, can drop shape_array_key - how = "inner" - ) - - agency_cols = ['organization_source_record_id', 'organization_name'] - route_cols = ['route_id', 'route_name', - 'direction_id', 'common_shape_id'] - - col_order = agency_cols + route_cols + [ - 'time_of_day', - 'speed_mph', 'n_trips', - 'avg_sched_trip_min', 'avg_rt_trip_min', - 'base64_url', 'caltrans_district', - 'geometry' - ] - - final_df = df_with_shape.reindex(columns = col_order).rename( - columns = {"organization_source_record_id": "org_id", - "organization_name": "agency", - "caltrans_district": "district_name" - }) - - return final_df - - -if __name__ == "__main__": - - start = datetime.datetime.now() - - # Merge in the subset of vp to the shape geometry - vp = pd.read_parquet( - f"{SEGMENT_GCS}trip_summary/vp_subset_{analysis_date}.parquet", - ) - - vp = gpd.GeoDataFrame( - vp, - geometry = gpd.points_from_xy(vp.x, vp.y, crs=WGS84) - ).to_crs(PROJECT_CRS).drop(columns = ["x", "y"]) - - # in case there are fewer shapes to grab - shapes_list = vp.shape_array_key.unique().tolist() - - # to_crs() takes a long time when os.environ["USE_PYGEOS"] = '0', - # so keep pygeos on - shapes = helpers.import_scheduled_shapes( - analysis_date, - columns = ["shape_array_key","geometry"], - filters = [[("shape_array_key", "in", shapes_list)]], - get_pandas = True, - crs = PROJECT_CRS - ) - - df = pd.merge( - vp, - shapes, - on = "shape_array_key", - how = "inner" - ).rename(columns = {"geometry_x": "vp_geometry", - "geometry_y": "shape_geometry"} - ).set_geometry("vp_geometry") - - # project the vp geometry onto the shape geometry and get shape_meters - shape_meters_geoseries = wrangle_shapes.project_point_geom_onto_linestring( - df, - "shape_geometry", - "vp_geometry", - ) - - df["shape_meters"] = shape_meters_geoseries - - time1 = datetime.datetime.now() - print(f"linear ref: 
{time1 - start}") - - # Get trip-level speed - speed = distance_and_seconds_elapsed( - df, - group_cols = ["gtfs_dataset_key", "trip_instance_key"] - ) - - # Attach scheduled trip columns, like route, direction, time_of_day - speed2 = add_scheduled_trip_columns( - speed, - analysis_date, - group_cols = ["trip_instance_key"] - ) - - time2 = datetime.datetime.now() - print(f"calculate speed: {time2 - time1}") - - speed2.to_parquet( - f"{SEGMENT_GCS}trip_summary/trip_speeds_{analysis_date}.parquet" - ) - - speed2 = pd.read_parquet( - f"{SEGMENT_GCS}trip_summary/trip_speeds_{analysis_date}.parquet") - - # Take the average across route-direction-time_of_day - avg_speeds = avg_route_speeds_by_time_of_day( - speed2, - group_cols = [ - "gtfs_dataset_key", "time_of_day", - "route_id", "direction_id", - "route_name_used", - "common_shape_id", "shape_array_key" - ] - ) - - avg_speeds2 = final_cleaning_for_export(avg_speeds, analysis_date) - - utils.geoparquet_gcs_export( - avg_speeds2, - f"{SEGMENT_GCS}trip_summary/", - f"route_speeds_{analysis_date}" - ) - - utils.geoparquet_gcs_export( - avg_speeds2, - f"{SEGMENT_GCS}export/", - "speeds_by_route_time_of_day" - ) - - time3 = datetime.datetime.now() - print(f"route-direction average speeds: {time3 - time2}") - - end = datetime.datetime.now() - print(f"execution time: {end - start}") \ No newline at end of file diff --git a/rt_segment_speeds/scripts/Makefile b/rt_segment_speeds/scripts/Makefile index 8f79d19d8..9c8d4d87b 100644 --- a/rt_segment_speeds/scripts/Makefile +++ b/rt_segment_speeds/scripts/Makefile @@ -5,28 +5,18 @@ segmentize: python cut_special_stop_segments.py python concatenate_stop_segments.py #python cut_road_segments.py #cut once creates files needed to run in concat_road_segments - python concatenate_road_segments.py - - -speeds_pipeline: - python A1_sjoin_vp_segments.py - python A2_sjoin_postprocessing.py - python A3_valid_vehicle_positions.py - python B1_speeds_by_segment_trip.py - python B2_avg_speeds_by_segment.py - python B3_export.py - python C2_triangulate_vp.py - python C3_trip_route_speed.py + python concatenate_road_segments.py -new_pipeline: - python prep_stop_segments.py + +speeds_pipeline: python shapely_project_vp.py python nearest_vp_to_stop.py python interpolate_stop_arrival.py python stop_arrivals_to_speed.py - #python handle_common_errors.py + python handle_common_errors.py + python avg_speeds_by_segment.py + python export.py - download_roads: #pip install esridump diff --git a/rt_segment_speeds/scripts/avg_speeds_by_segment.py b/rt_segment_speeds/scripts/avg_speeds_by_segment.py new file mode 100644 index 000000000..8dba1e57c --- /dev/null +++ b/rt_segment_speeds/scripts/avg_speeds_by_segment.py @@ -0,0 +1,260 @@ +""" +Quick aggregation for speed metrics by segment +""" +import datetime +import geopandas as gpd +import numpy as np +import pandas as pd +import sys + +from loguru import logger + +from segment_speed_utils import helpers, sched_rt_utils +from segment_speed_utils.project_vars import SEGMENT_GCS, CONFIG_PATH +from calitp_data_analysis import utils, geography_utils +from shared_utils import portfolio_utils, rt_utils + + +def calculate_avg_speeds( + df: pd.DataFrame, + group_cols: list +) -> pd.DataFrame: + """ + Calculate the median, 20th, and 80th percentile speeds + by groups. 
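For reference when reading the percentile code that follows: np.percentile takes q on a 0-100 scale, so the 20th/50th/80th percentile speeds of a toy list of trip speeds are computed like this:

import numpy as np

# np.percentile expects q on a 0-100 scale.
speed_mph_list = [8.0, 11.5, 12.0, 14.2, 16.8, 21.0, 25.5]

p20_mph = np.percentile(speed_mph_list, 20)
p50_mph = np.percentile(speed_mph_list, 50)   # the median, 14.2 here
p80_mph = np.percentile(speed_mph_list, 80)
n_trips = len(speed_mph_list)

print(round(p20_mph, 2), round(p50_mph, 2), round(p80_mph, 2), n_trips)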
+ """ + # pd.groupby and pd.quantile is so slow + # create our own list of speeds and use np + df2 = (df.groupby(group_cols, + observed=True, group_keys=False) + .agg({"speed_mph": lambda x: sorted(list(x))}) + .reset_index() + .rename(columns = {"speed_mph": "speed_mph_list"}) + ) + + df2 = df2.assign( + p50_mph = df2.apply(lambda x: np.percentile(x.speed_mph_list, 0.5), axis=1), + n_trips = df2.apply(lambda x: len(x.speed_mph_list), axis=1).astype("int"), + p20_mph = df2.apply(lambda x: np.percentile(x.speed_mph_list, 0.2), axis=1), + p80_mph = df2.apply(lambda x: np.percentile(x.speed_mph_list, 0.8), axis=1), + ) + + stats = df2.drop(columns = "speed_mph_list") + + # Clean up for map + speed_cols = [c for c in stats.columns if "_mph" in c] + stats[speed_cols] = stats[speed_cols].round(2) + + return stats + + +def speeds_with_segment_geom( + analysis_date: str, + dict_inputs: dict = {}, +): + """ + Import the segment-trip table. + Average the speed_mph across all trips present in the segment. + """ + start = datetime.datetime.now() + + SEGMENT_FILE = f'{dict_inputs["segments_file"]}_{analysis_date}' + SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"] + SPEEDS_FILE = f'{dict_inputs["stage4"]}_{analysis_date}' + EXPORT_FILE = f'{dict_inputs["stage5"]}_{analysis_date}' + MAX_SPEED = dict_inputs["max_speed"] + + # Read in speeds and attach time-of-day + df = pd.read_parquet( + f"{SEGMENT_GCS}{SPEEDS_FILE}.parquet", + filters = [[("speed_mph", "<=", MAX_SPEED)]] + ) + + time_of_day_df = sched_rt_utils.get_trip_time_buckets(analysis_date) + + df2 = pd.merge( + df, + time_of_day_df, + on = "trip_instance_key", + how = "inner" + ) + + subset_shape_keys = df2.shape_array_key.unique().tolist() + + # Load in segment geometry, keep shapes present in speeds + segments = gpd.read_parquet( + f"{SEGMENT_GCS}{SEGMENT_FILE}.parquet", + columns = SEGMENT_IDENTIFIER_COLS + [ + "schedule_gtfs_dataset_key", + "stop_id", + "loop_or_inlining", + "geometry", + "district_name" + ], + filters = [[("shape_array_key", "in", subset_shape_keys)]] + ).to_crs(geography_utils.WGS84) + + all_day = calculate_avg_speeds( + df2, + SEGMENT_IDENTIFIER_COLS, + ) + + peak = calculate_avg_speeds( + df2[df2.time_of_day.isin(["AM Peak", "PM Peak"])], + SEGMENT_IDENTIFIER_COLS, + ) + + stats = pd.concat([ + all_day.assign(time_of_day = "all_day"), + peak.assign(time_of_day = "peak") + ], axis=0) + + + # Merge in segment geometry + gdf = pd.merge( + segments, + stats, + on = SEGMENT_IDENTIFIER_COLS, + how = "left" + ).sort_values( + SEGMENT_IDENTIFIER_COLS + ["time_of_day"] + ).reset_index(drop=True) + + utils.geoparquet_gcs_export( + gdf, + SEGMENT_GCS, + EXPORT_FILE + ) + + end = datetime.datetime.now() + logger.info(f"segment averages execution time: {end - start}") + + return + + +def add_scheduled_trip_columns( + rt_trips: pd.DataFrame, + analysis_date: str, + group_cols: list = ["trip_instance_key"]) -> pd.DataFrame: + """ + Merge RT trips (vehicle positions) to scheduled trips. + Add in the needed scheduled trip columns to take + route-direction-time_of_day averages. 
+ """ + keep_cols = [ + "gtfs_dataset_key", + "direction_id", + "route_id", "route_short_name", "route_long_name", "route_desc", + ] + group_cols + + crosswalk = helpers.import_scheduled_trips( + analysis_date, + columns = keep_cols, + get_pandas = True + ) + + common_shape = sched_rt_utils.most_common_shape_by_route_direction(analysis_date) + + crosswalk2 = pd.merge( + crosswalk, + common_shape, + on = ["schedule_gtfs_dataset_key", "route_id", "direction_id"], + how = "inner" + ).astype({"direction_id": "Int64"}) + + time_of_day = sched_rt_utils.get_trip_time_buckets(analysis_date) + + # Clean up route name + crosswalk2 = portfolio_utils.add_route_name( + crosswalk2 + ).drop(columns = ["route_short_name", "route_long_name", "route_desc"]) + + df = pd.merge( + rt_trips, + crosswalk2, + on = group_cols, + how = "left", + ).merge( + time_of_day, + on = group_cols, + how = "left" + ) + + return df + + +def avg_trip_speeds_with_time_of_day( + analysis_date: str, + dict_inputs: dict, +) -> pd.DataFrame: + """ + Get trip-level speeds, scheduled trip service_minutes + and rt trip approximated-service_minutes. + """ + start = datetime.datetime.now() + + SPEEDS_FILE = f'{dict_inputs["stage4"]}_{analysis_date}' + EXPORT_FILE = f'{dict_inputs["stage6"]}_{analysis_date}' + MAX_SPEED = dict_inputs["max_speed"] + + by_segment = pd.read_parquet( + f"{SEGMENT_GCS}{SPEEDS_FILE}.parquet", + columns = ["trip_instance_key", "meters_elapsed", "sec_elapsed"] + ) + + by_trip = (by_segment.groupby("trip_instance_key") + .agg({ + "meters_elapsed": "sum", + "sec_elapsed": "sum" + }) + .reset_index() + ) + + by_trip = by_trip.assign( + speed_mph = (by_trip.meters_elapsed.divide(by_trip.sec_elapsed) + ) * rt_utils.MPH_PER_MPS, + rt_trip_min = by_trip.sec_elapsed.divide(60) + ).query('speed_mph <= @MAX_SPEED') + + + df = add_scheduled_trip_columns( + by_trip, + analysis_date, + group_cols = ["trip_instance_key"] + ) + + df.to_parquet( + f"{SEGMENT_GCS}trip_summary/{EXPORT_FILE}.parquet" + ) + + end = datetime.datetime.now() + logger.info(f"trip summary execution time: {end - start}") + + return + + +if __name__ == "__main__": + + from segment_speed_utils.project_vars import analysis_date_list + + LOG_FILE = "../logs/avg_speeds.log" + logger.add(LOG_FILE, retention="3 months") + logger.add(sys.stderr, + format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", + level="INFO") + + STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") + + for analysis_date in analysis_date_list: + logger.info(f"Analysis date: {analysis_date}") + + speeds_with_segment_geom( + analysis_date, + STOP_SEG_DICT + ) + + avg_trip_speeds_with_time_of_day( + analysis_date, + STOP_SEG_DICT + ) + diff --git a/rt_segment_speeds/scripts/concatenate_stop_segments.py b/rt_segment_speeds/scripts/concatenate_stop_segments.py index 3cd2a883f..6f46a32c9 100644 --- a/rt_segment_speeds/scripts/concatenate_stop_segments.py +++ b/rt_segment_speeds/scripts/concatenate_stop_segments.py @@ -96,11 +96,8 @@ def spatial_join_to_caltrans_districts( gdf2 = spatial_join_to_caltrans_districts(gdf) - arrowized_segments = wrangle_shapes.add_arrowized_geometry( - gdf2) - utils.geoparquet_gcs_export( - arrowized_segments, + gdf2, SEGMENT_GCS, f"{EXPORT_FILE}_{analysis_date}" ) \ No newline at end of file diff --git a/rt_segment_speeds/scripts/config.yml b/rt_segment_speeds/scripts/config.yml index ac1d96bdb..022e28e65 100644 --- a/rt_segment_speeds/scripts/config.yml +++ b/rt_segment_speeds/scripts/config.yml @@ -2,11 +2,16 @@ stop_segments: stage1: 
"vp_usable" stage2: "nearest_vp" stage3: "stop_arrivals" - stage4: "speed_stop_segments" + stage4: "speeds_stop_segments" + stage5: "avg_speeds_stop_segments" + stage6: "trip_speeds" + stage7: "route_speeds" + segments_file: "stop_segments" segment_identifier_cols: ["shape_array_key", "stop_sequence"] timestamp_col: "location_timestamp_local" time_min_cutoff: 10 pct_segment_minimum: 0.3 + max_speed: 80 road_segments: stage1: "vp_usable" stage2: "vp_road_segment" diff --git a/rt_segment_speeds/scripts/cut_road_segments.py b/rt_segment_speeds/scripts/cut_road_segments.py index 443907456..6d585e856 100644 --- a/rt_segment_speeds/scripts/cut_road_segments.py +++ b/rt_segment_speeds/scripts/cut_road_segments.py @@ -87,6 +87,7 @@ def load_roads(filtering: tuple) -> gpd.GeoDataFrame: return df2 + """ Primary/Secondary Roads """ @@ -246,7 +247,7 @@ def add_segment_direction(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: rt_utils.primary_cardinal_direction(x.origin, x.destination), axis=1, ) - ).drop(columns = ["origin", "destination"]) + ) return df @@ -267,7 +268,7 @@ def add_segment_direction(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: roads = load_roads(filtering = [("MTFCC", "in", road_type_values)]) primary_secondary_roads = cut_primary_secondary_roads( - roads, ROAD_SEGMENT_METERS).drop(columns = "road_length") + roads, ROAD_SEGMENT_METERS) utils.geoparquet_gcs_export( primary_secondary_roads, diff --git a/rt_segment_speeds/scripts/export.py b/rt_segment_speeds/scripts/export.py new file mode 100644 index 000000000..4c50d6c11 --- /dev/null +++ b/rt_segment_speeds/scripts/export.py @@ -0,0 +1,275 @@ +""" +Attach columns needed for publishing to open data portal. +Suppress certain rows and columns too. +""" +import datetime +import geopandas as gpd +import pandas as pd + +from shared_utils import schedule_rt_utils, utils +from calitp_data_analysis import utils, geography_utils +from segment_speed_utils import helpers +from segment_speed_utils.project_vars import SEGMENT_GCS, CONFIG_PATH + +def get_operator_natural_identifiers( + df: pd.DataFrame, + analysis_date: str +) -> pd.DataFrame: + """ + For each gtfs_dataset_key-shape_array_key combination, + re-attach the natural identifiers and organizational identifiers. + Return a df that should be merged against speeds_df. + """ + operator_shape_df = (df[["schedule_gtfs_dataset_key", "shape_array_key"]] + .drop_duplicates() + .reset_index(drop=True) + .rename(columns = { + "schedule_gtfs_dataset_key": "gtfs_dataset_key"}) + ) + + # Get shape_id back + shape_identifiers = helpers.import_scheduled_trips( + analysis_date, + columns = ["shape_array_key", "shape_id"], + get_pandas = True + ) + + df_with_shape = pd.merge( + operator_shape_df, + shape_identifiers, + on = "shape_array_key", + how = "inner" + ) + + # Get base64_url, uri, organization_source_record_id and organization_name + crosswalk = schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk( + df_with_shape, + analysis_date, + quartet_data = "schedule", + dim_gtfs_dataset_cols = [ + "key", + "base64_url", + ], + dim_organization_cols = ["source_record_id", "name"] + ) + + df_with_org = pd.merge( + df_with_shape.rename( + columns = {"gtfs_dataset_key": "schedule_gtfs_dataset_key"}), + crosswalk, + on = "schedule_gtfs_dataset_key", + how = "inner" + ) + + return df_with_org + + +def finalize_df_for_export(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: + """ + Sorting, suppressing columns not needed in export. 
+ """ + + RENAME_DICT = { + "organization_source_record_id": "org_id", + "organization_name": "agency", + } + + gdf2 = (gdf.sort_values(["organization_name", + "shape_id", "stop_sequence"]) + .reset_index(drop=True) + .rename(columns = RENAME_DICT) + ) + + return gdf2 + + +def export_average_speeds( + analysis_date: str, + dict_inputs: dict +): + start = datetime.datetime.now() + INPUT_FILE = f'{dict_inputs["stage5"]}_{analysis_date}' + + gdf = gpd.read_parquet( + f"{SEGMENT_GCS}{INPUT_FILE}.parquet" + ) + + operator_identifiers = get_operator_natural_identifiers(gdf, analysis_date) + + time1 = datetime.datetime.now() + print(f"get natural identifiers: {time1 - start}") + + gdf2 = pd.merge( + gdf, + operator_identifiers, + on = ["schedule_gtfs_dataset_key", "shape_array_key"], + how = "inner" + ) + + final_gdf = finalize_df_for_export(gdf2) + + time2 = datetime.datetime.now() + print(f"finalize: {time2 - time1}") + + keep_cols = [ + 'org_id', 'agency', + 'shape_id', 'stop_sequence', 'stop_id', + 'geometry', + 'p50_mph', 'p20_mph', + 'p80_mph', 'n_trips', + 'time_of_day', + 'base64_url', + 'district_name' + ] + + utils.geoparquet_gcs_export( + final_gdf[keep_cols], + f"{SEGMENT_GCS}export/", + INPUT_FILE + ) + + # Keep a tabular version (geom is big to save) for us to compare what's published + # and contains columns we use for internal modeling + # (shape_array_key, gtfs_dataset_key, etc) + final_gdf.drop(columns = "geometry").to_parquet( + f"{SEGMENT_GCS}export/{INPUT_FILE}_tabular.parquet" + ) + + utils.geoparquet_gcs_export( + final_gdf[keep_cols], + f"{SEGMENT_GCS}export/", + "speeds_by_stop_segments" + ) + + end = datetime.datetime.now() + print(f"execution time: {end - start}") + + return + + +def average_route_speeds_for_export( + analysis_date: str, + dict_inputs: dict +) -> gpd.GeoDataFrame: + """ + Aggregate trip speeds to route-direction. + Attach shape geometry to most common shape_id. 
+ """ + SPEEDS_FILE = f'{dict_inputs["stage6"]}_{analysis_date}' + EXPORT_FILE = f'{dict_inputs["stage7"]}_{analysis_date}' + MAX_SPEED = dict_inputs["max_speed"] + + df = pd.read_parquet( + f"{SEGMENT_GCS}trip_summary/{SPEEDS_FILE}.parquet", + filters = [[("speed_mph", "<=", MAX_SPEED)]] + ) + + # Aggregate by route-direction + route_cols = [ + "schedule_gtfs_dataset_key", "time_of_day", + "route_id", "direction_id", + "route_name_used", + "common_shape_id", "shape_array_key" + ] + + df2 = (df.groupby(route_cols, + observed = True, group_keys = False) + .agg({ + "service_minutes": "mean", + "rt_trip_min": "mean", + "speed_mph": "mean", + "trip_instance_key": "count" + }).reset_index() + ) + + df3 = df2.assign( + rt_trip_min = df2.rt_trip_min.round(1), + service_minutes = df2.service_minutes.round(1), + speed_mph = df2.speed_mph.round(1) + ).rename(columns = { + "service_minutes": "avg_sched_trip_min", + "rt_trip_min": "avg_rt_trip_min", + "trip_instance_key": "n_trips", + "route_name_used": "route_name", + "schedule_gtfs_dataset_key": "gtfs_dataset_key" + }) + + # Attach org name and source_record_id + org_crosswalk = ( + schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk( + df3, + analysis_date, + quartet_data = "schedule", + dim_gtfs_dataset_cols = ["key", "base64_url"], + dim_organization_cols = ["source_record_id", + "name", "caltrans_district"]) + ) + + df_with_org = pd.merge( + df3, + org_crosswalk.rename(columns = { + "schedule_gtfs_dataset_key": "gtfs_dataset_key"}), + on = "gtfs_dataset_key", + how = "inner" + ) + + # Attach shape geometry and make sure it's in WGS84 + shapes = helpers.import_scheduled_shapes( + analysis_date, + columns = ["shape_array_key", "geometry"], + get_pandas = True, + crs = geography_utils.WGS84 + ) + + df_with_shape = pd.merge( + shapes, + df_with_org, + on = "shape_array_key", # once merged, can drop shape_array_key + how = "inner" + ) + + agency_cols = ['organization_source_record_id', 'organization_name'] + route_cols = ['route_id', 'route_name', + 'direction_id', 'common_shape_id'] + + col_order = agency_cols + route_cols + [ + 'time_of_day', + 'speed_mph', 'n_trips', + 'avg_sched_trip_min', 'avg_rt_trip_min', + 'base64_url', 'caltrans_district', + 'geometry' + ] + + final_df = df_with_shape.reindex(columns = col_order).rename( + columns = {"organization_source_record_id": "org_id", + "organization_name": "agency", + "caltrans_district": "district_name" + }) + + + utils.geoparquet_gcs_export( + final_df, + f"{SEGMENT_GCS}trip_summary/", + EXPORT_FILE + ) + + utils.geoparquet_gcs_export( + final_df, + f"{SEGMENT_GCS}export/", + "speeds_by_route_time_of_day" + ) + + return + + +if __name__ == "__main__": + + from segment_speed_utils.project_vars import analysis_date_list + + STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") + + for analysis_date in analysis_date_list: + export_average_speeds(analysis_date, STOP_SEG_DICT) + average_route_speeds_for_export(analysis_date, STOP_SEG_DICT) + + diff --git a/rt_segment_speeds/scripts/interpolate_stop_arrival.py b/rt_segment_speeds/scripts/interpolate_stop_arrival.py index 4460d8538..9045f54d4 100644 --- a/rt_segment_speeds/scripts/interpolate_stop_arrival.py +++ b/rt_segment_speeds/scripts/interpolate_stop_arrival.py @@ -12,7 +12,6 @@ from segment_speed_utils import helpers from segment_speed_utils.project_vars import (SEGMENT_GCS, PROJECT_CRS, CONFIG_PATH) -from shared_utils import rt_dates def attach_vp_shape_meters_with_timestamp( @@ -82,22 +81,13 @@ def 
get_stop_arrivals(df: pd.DataFrame) -> pd.DataFrame: return df -if __name__ == "__main__": - - LOG_FILE = "../logs/interpolate_stop_arrival.log" - logger.add(LOG_FILE, retention="3 months") - logger.add(sys.stderr, - format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", - level="INFO") +def main( + analysis_date: str, + dict_inputs: dict +): + NEAREST_VP = f"{dict_inputs['stage2']}_{analysis_date}" + STOP_ARRIVALS_FILE = f"{dict_inputs['stage3']}_{analysis_date}" - analysis_date = rt_dates.DATES["sep2023"] - STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") - - NEAREST_VP = f"{STOP_SEG_DICT['stage2']}_{analysis_date}" - STOP_ARRIVALS_FILE = f"{STOP_SEG_DICT['stage3']}_{analysis_date}" - - logger.info(f"Analysis date: {analysis_date}") - start = datetime.datetime.now() vp_pared = pd.read_parquet( @@ -143,3 +133,22 @@ def get_stop_arrivals(df: pd.DataFrame) -> pd.DataFrame: end = datetime.datetime.now() logger.info(f"execution time: {end - start}") + + return + + +if __name__ == "__main__": + + from segment_speed_utils.project_vars import analysis_date_list + + LOG_FILE = "../logs/interpolate_stop_arrival.log" + logger.add(LOG_FILE, retention="3 months") + logger.add(sys.stderr, + format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", + level="INFO") + + STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") + + for analysis_date in analysis_date_list: + logger.info(f"Analysis date: {analysis_date}") + main(analysis_date, STOP_SEG_DICT) diff --git a/rt_segment_speeds/scripts/loop_utils.py b/rt_segment_speeds/scripts/loop_utils.py deleted file mode 100644 index d296089bc..000000000 --- a/rt_segment_speeds/scripts/loop_utils.py +++ /dev/null @@ -1,105 +0,0 @@ -import dask.dataframe as dd -import folium -import geopandas as gpd -import numpy as np -import pandas as pd - -import prep_stop_segments -from segment_speed_utils import (gtfs_schedule_wrangling, helpers, - wrangle_shapes) -from segment_speed_utils.project_vars import analysis_date - -def grab_loop_trips(analysis_date: str) -> pd.DataFrame: - """ - Use stop_times table to grab the trips that - visit the same stop_id at least twice. - """ - stop_times = helpers.import_scheduled_stop_times( - analysis_date, - columns = [ - "feed_key", "trip_id", - "stop_id", "stop_sequence", - ] - ).drop_duplicates() - - stop_visits = (stop_times.groupby( - ["feed_key", "trip_id", "stop_id"]) - .agg({"stop_sequence": "count"}) - #nunique doesn't work in dask - .reset_index() - ) - - loop_trips = (stop_visits[stop_visits.stop_sequence > 1] - [["feed_key", "trip_id"]] - .drop_duplicates() - .reset_index(drop=True) - .compute() - ) - return loop_trips - - -def grab_loop_shapes(analysis_date: str) -> gpd.GeoDataFrame: - - loop_trips = grab_loop_trips(analysis_date) - - trips_with_geom = gtfs_schedule_wrangling.get_trips_with_geom( - analysis_date).compute() - - loop_trips_with_geom = pd.merge( - trips_with_geom, - loop_trips, - on = ["feed_key", "trip_id"], - how = "inner" - ) - - return loop_trips_with_geom - - -def assign_visits_to_stop(df: pd.DataFrame): - """ - Groupby shape and stop_id and count how many times it's being visited - and which number visit it is. 
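The loop detection that loop_utils (removed above) performed rests on a simple count: a trip is loop-like if any stop_id appears under more than one stop_sequence. A self-contained pandas version with toy data:

import pandas as pd

# Hypothetical stop_times rows: trip "A" visits stop "s1" twice, trip "B" does not.
stop_times = pd.DataFrame({
    "feed_key": ["f"] * 7,
    "trip_id": ["A", "A", "A", "A", "B", "B", "B"],
    "stop_id": ["s1", "s2", "s3", "s1", "s1", "s2", "s3"],
    "stop_sequence": [1, 2, 3, 4, 1, 2, 3],
})

visits = (
    stop_times.groupby(["feed_key", "trip_id", "stop_id"])
    .stop_sequence.count()
    .reset_index(name="n_visits")
)

loop_trips = (
    visits[visits.n_visits > 1][["feed_key", "trip_id"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
print(loop_trips)   # only trip "A" qualifies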
- """ - df = df.assign( - num_visits = df.groupby(["shape_array_key", "stop_id"]) - .stop_sequence.transform("nunique") - ) - - return df - - -def plot_segments_and_stops( - segment: gpd.GeoSeries, - stops: gpd.GeoSeries -): - m = segment.explore(tiles="CartoDB Positron", name="segment") - m = stops.explore(m=m, name="stops") - - folium.LayerControl().add_to(m) - return m - - -def stop_segment_components_to_geoseries( - subset_shape_geom_array: np.ndarray, - subset_stop_geom_array: np.ndarray = [], - crs: str = "EPSG:3310" -) -> tuple:#[gpd.GeoDataFrame]: - """ - Turn segments and stops into geoseries so we can plot it easily. - """ - stop_segment = wrangle_shapes.array_to_geoseries( - subset_shape_geom_array, - geom_type="line", - crs=crs - )#.to_frame(name="stop_segment") - - if len(subset_stop_geom_array) > 0: - related_stops = wrangle_shapes.array_to_geoseries( - subset_stop_geom_array, - geom_type="point", - crs=crs - )#.to_frame(name="surrounding_stops_geom") - - return stop_segment, related_stops - else: - return stop_segment \ No newline at end of file diff --git a/rt_segment_speeds/scripts/nearest_vp_to_stop.py b/rt_segment_speeds/scripts/nearest_vp_to_stop.py index 847f6db30..a6b52ee16 100644 --- a/rt_segment_speeds/scripts/nearest_vp_to_stop.py +++ b/rt_segment_speeds/scripts/nearest_vp_to_stop.py @@ -16,7 +16,6 @@ from segment_speed_utils import helpers, segment_calcs, wrangle_shapes from segment_speed_utils.project_vars import (SEGMENT_GCS, PROJECT_CRS, CONFIG_PATH) -from shared_utils import rt_dates def rt_trips_to_shape(analysis_date: str) -> pd.DataFrame: @@ -320,25 +319,23 @@ def find_nearest_vp_to_stop( fixed_results.to_parquet( f"{SEGMENT_GCS}projection/{EXPORT_FILE}.parquet") - + end = datetime.datetime.now() + logger.info(f"execution time: {end - start}") if __name__ == "__main__": + from segment_speed_utils.project_vars import analysis_date_list + LOG_FILE = "../logs/nearest_vp.log" logger.add(LOG_FILE, retention="3 months") logger.add(sys.stderr, format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", level="INFO") - analysis_date = rt_dates.DATES["sep2023"] STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") - logger.info(f"Analysis date: {analysis_date}") + for analysis_date in analysis_date_list: + logger.info(f"Analysis date: {analysis_date}") - start = datetime.datetime.now() - - find_nearest_vp_to_stop(analysis_date, STOP_SEG_DICT) - - end = datetime.datetime.now() - logger.info(f"execution time: {end - start}") + find_nearest_vp_to_stop(analysis_date, STOP_SEG_DICT) \ No newline at end of file diff --git a/rt_segment_speeds/scripts/prep_stop_segments.py b/rt_segment_speeds/scripts/prep_stop_segments.py index 0f3b739e4..d71198b9f 100644 --- a/rt_segment_speeds/scripts/prep_stop_segments.py +++ b/rt_segment_speeds/scripts/prep_stop_segments.py @@ -231,7 +231,7 @@ def prep_stop_segments(analysis_date: str) -> gpd.GeoDataFrame: 1 if x.shape_array_key in loopy_inlining else 0, axis=1, ).astype("int8") - ) + ).sort_values(["shape_array_key", "stop_sequence"]).reset_index(drop=True) return stop_times_with_geom2 diff --git a/rt_segment_speeds/scripts/shapely_project_vp.py b/rt_segment_speeds/scripts/shapely_project_vp.py index 5539fb31c..260b9e9a9 100644 --- a/rt_segment_speeds/scripts/shapely_project_vp.py +++ b/rt_segment_speeds/scripts/shapely_project_vp.py @@ -11,7 +11,6 @@ from loguru import logger from calitp_data_analysis.geography_utils import WGS84 -from shared_utils import rt_dates from segment_speed_utils import helpers from 
segment_speed_utils.project_vars import (SEGMENT_GCS, PROJECT_CRS, CONFIG_PATH) @@ -47,14 +46,13 @@ def project_vp_to_shape( return vp_projected_result -if __name__ == "__main__": - - analysis_date = rt_dates.DATES["sep2023"] - STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") - +def project_usable_vp_one_day( + analysis_date: str, + dict_inputs: dict = {} +): start = datetime.datetime.now() - USABLE_VP = f'{STOP_SEG_DICT["stage1"]}_{analysis_date}' + USABLE_VP = f'{dict_inputs["stage1"]}_{analysis_date}' trips = helpers.import_scheduled_trips( analysis_date, @@ -104,4 +102,27 @@ def project_vp_to_shape( f"{SEGMENT_GCS}projection/vp_projected_{analysis_date}.parquet") end = datetime.datetime.now() - logger.info(f"compute and export: {end - time1}") \ No newline at end of file + logger.info(f"compute and export: {end - time1}") + logger.info(f"execution time: {end - start}") + + return + + +if __name__ == "__main__": + + from segment_speed_utils.project_vars import analysis_date_list + + LOG_FILE = "../logs/shapely_project_vp.log" + logger.add(LOG_FILE, retention="3 months") + logger.add(sys.stderr, + format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", + level="INFO") + + + STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") + + for analysis_date in analysis_date_list: + logger.info(f"Analysis date: {analysis_date}") + + project_usable_vp_one_day(analysis_date, STOP_SEG_DICT) + \ No newline at end of file diff --git a/rt_segment_speeds/scripts/shapes_roads_crosswalk.py b/rt_segment_speeds/scripts/shapes_roads_crosswalk.py new file mode 100644 index 000000000..cf3c3b1b0 --- /dev/null +++ b/rt_segment_speeds/scripts/shapes_roads_crosswalk.py @@ -0,0 +1,101 @@ +""" +""" +import dask.dataframe as dd +import dask_geopandas as dg +import datetime +import geopandas as gpd +import pandas as pd +import sys + +from loguru import logger + +from segment_speed_utils import helpers +from segment_speed_utils.project_vars import (SEGMENT_GCS, + CONFIG_PATH, PROJECT_CRS) + + +def sjoin_shapes_to_roads( + road_segments: gpd.GeoDataFrame, + shapes: gpd.GeoDataFrame +) -> pd.DataFrame: + + keep_cols = ["shape_array_key", "linearid", + "mtfcc", "segment_sequence"] + + shapes = shapes.assign( + geometry = shapes.geometry.buffer(25) + ) + + shapes_to_roads = gpd.sjoin( + shapes, + road_segments, + how = "inner", + predicate = "intersects" + )[keep_cols].drop_duplicates() + + return shapes_to_roads + + +def main(analysis_date: str, dict_inputs: dict): + + start = datetime.datetime.now() + + shapes = helpers.import_scheduled_shapes( + analysis_date, + columns = ["shape_array_key", "geometry"], + get_pandas = True, + crs = PROJECT_CRS + ).pipe( + helpers.remove_shapes_outside_ca + ).drop(columns = "index_right") + + keep_road_cols = ["linearid", "mtfcc", "segment_sequence"] + + road_segments = dg.read_parquet( + f"{SEGMENT_GCS}road_segments_{analysis_date}", + columns = keep_road_cols + ["geometry"] + ).repartition(npartitions=5) + + keep_shape_cols = ["shape_array_key"] + + shape_cols_dtypes = shapes[keep_shape_cols].dtypes.to_dict() + road_cols_dtypes = road_segments[keep_road_cols].dtypes.to_dict() + + sjoin_results = road_segments.map_partitions( + sjoin_shapes_to_roads, + shapes, + meta = { + **shape_cols_dtypes, + **road_cols_dtypes, + }, + align_dataframes = False + ) + + results = sjoin_results.compute() + results.to_parquet( + f"{SEGMENT_GCS}shape_road_crosswalk_{analysis_date}.parquet" + ) + + end = datetime.datetime.now() + logger.info(f"execution time: {end - 
start}") + + + +if __name__ == "__main__": + + from segment_speed_utils.project_vars import analysis_date_list + + LOG_FILE = "../logs/sjoin_shapes_roads.log" + + logger.add(LOG_FILE, retention="3 months") + logger.add(sys.stderr, + format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", + level="INFO") + + ROAD_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "road_segments") + + for analysis_date in analysis_date_list: + logger.info(f"Analysis date: {analysis_date}") + main(analysis_date, ROAD_SEG_DICT) + + diff --git a/rt_segment_speeds/scripts/sjoin_vp_roads.py b/rt_segment_speeds/scripts/sjoin_vp_roads.py deleted file mode 100644 index 5a4cf567d..000000000 --- a/rt_segment_speeds/scripts/sjoin_vp_roads.py +++ /dev/null @@ -1,150 +0,0 @@ -import dask.dataframe as dd -import dask_geopandas as dg -import datetime -import geopandas as gpd -import numpy as np -import pandas as pd -import sys - -from loguru import logger - -import A1_sjoin_vp_segments as A1 -from calitp_data_analysis.geography_utils import WGS84 -from segment_speed_utils import helpers -from segment_speed_utils.project_vars import (analysis_date, SEGMENT_GCS, - CONFIG_PATH, PROJECT_CRS) - - - -def single_direction_spatial_join( - vp: dd.DataFrame, - segments: gpd.GeoDataFrame, - segment_identifer_cols: list, - direction: str -) -> dd.DataFrame: - """ - Merge all the segments for a shape for that trip, - and check if vp is within. - Use map partitions, which treats each partition as df or gdf. - """ - vp_gdf = gpd.GeoDataFrame( - vp, - geometry = gpd.points_from_xy(vp.x, vp.y, crs=WGS84) - ).to_crs(PROJECT_CRS).drop(columns = ["x", "y"]) - - vp_to_seg = gpd.sjoin( - vp_gdf, - segments, - how = "inner", - predicate = "within" - )[["vp_idx"] + segment_identifer_cols] - - results = (vp_to_seg - .drop_duplicates() - .reset_index(drop=True) - ) - - return results - - -def stage_direction_results( - vp: dd.DataFrame, - segments: gpd.GeoDataFrame, - segment_identifier_cols: list, - direction: str -): - keep_vp = [direction, "Unknown"] - - vp_subset = vp[vp.vp_primary_direction.isin(keep_vp)].repartition(npartitions=20) - segments_subset = segments[ - segments.primary_direction==direction].reset_index(drop=True) - - seg_id_dtypes = segments[segment_identifier_cols].dtypes.to_dict() - - results_subset = vp_subset.map_partitions( - single_direction_spatial_join, - segments_subset, - segment_identifier_cols, - direction, - meta = {"vp_idx": "int64", - **seg_id_dtypes}, - align_dataframes = False - ) - - return results_subset - - -def sjoin_vp_to_segments( - analysis_date: str, - dict_inputs: dict = {} -): - """ - Spatial join vehicle positions to segments. - Subset by grouping columns. - - Vehicle positions can only join to the relevant segments. - Use route_dir_identifier or shape_array_key to figure out - the relevant segments those vp can be joined to. 
- """ - INPUT_FILE = dict_inputs["stage1"] - SEGMENT_FILE = dict_inputs["segments_file"] - SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"] - EXPORT_FILE = dict_inputs["stage2"] - - BUFFER_METERS = 35 - - time0 = datetime.datetime.now() - - # Import vp, keep trips that are usable - vp = dd.read_parquet( - f"{SEGMENT_GCS}{INPUT_FILE}_{analysis_date}/", - columns = ["vp_idx", "x", "y", "vp_primary_direction"], - ).repartition(npartitions=100) - - segments = A1.import_segments_and_buffer( - f"{SEGMENT_FILE}_{analysis_date}", - BUFFER_METERS, - SEGMENT_IDENTIFIER_COLS, - ) - - time1 = datetime.datetime.now() - logger.info(f"import vp and segments: {time1 - time0}") - - all_directions = ["Northbound", "Southbound", "Eastbound", "Westbound"] - - results = [ - stage_direction_results( - vp, - segments, - SEGMENT_IDENTIFIER_COLS, - one_direction - ) for one_direction in all_directions - ] - - time2 = datetime.datetime.now() - logger.info(f"sjoin with map_partitions: {time2 - time1}") - - full_results = dd.multi.concat(results, axis=0).reset_index(drop=True) - full_results = full_results.repartition(npartitions=4) - - full_results.to_parquet( - f"{SEGMENT_GCS}vp_sjoin/{EXPORT_FILE}_{analysis_date}", - overwrite = True - ) - - time3 = datetime.datetime.now() - logger.info(f"export partitioned results: {time3 - time2}") - - -if __name__ == "__main__": - LOG_FILE = "../logs/sjoin_vp_segments.log" - logger.add(LOG_FILE, retention="3 months") - logger.add(sys.stderr, - format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", - level="INFO") - - logger.info(f"Analysis date: {analysis_date}") - - ROAD_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "road_segments") - - sjoin_vp_to_segments(analysis_date, ROAD_SEG_DICT) \ No newline at end of file diff --git a/rt_segment_speeds/scripts/stop_arrivals_to_speed.py b/rt_segment_speeds/scripts/stop_arrivals_to_speed.py index be9b0e677..46bca3828 100644 --- a/rt_segment_speeds/scripts/stop_arrivals_to_speed.py +++ b/rt_segment_speeds/scripts/stop_arrivals_to_speed.py @@ -7,26 +7,16 @@ from loguru import logger -from shared_utils import rt_dates from segment_speed_utils import helpers, segment_calcs from segment_speed_utils.project_vars import SEGMENT_GCS, CONFIG_PATH -if __name__ == "__main__": - - LOG_FILE = "../logs/speeds_by_segment_trip.log" - logger.add(LOG_FILE, retention="3 months") - logger.add(sys.stderr, - format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", - level="INFO") +def calculate_speed_from_stop_arrivals( + analysis_date: str, + dict_inputs: dict +): - analysis_date = rt_dates.DATES["sep2023"] - logger.info(f"Analysis date: {analysis_date}") - - - STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") - - STOP_ARRIVALS_FILE = f"{STOP_SEG_DICT['stage3']}_{analysis_date}" - SPEED_FILE = f"{STOP_SEG_DICT['stage4']}_{analysis_date}" + STOP_ARRIVALS_FILE = f"{dict_inputs['stage3']}_{analysis_date}" + SPEED_FILE = f"{dict_inputs['stage4']}_{analysis_date}" start = datetime.datetime.now() @@ -66,4 +56,24 @@ f"{SEGMENT_GCS}{SPEED_FILE}.parquet") end = datetime.datetime.now() - logger.info(f"execution time: {end - start}") \ No newline at end of file + logger.info(f"execution time: {end - start}") + + return + + +if __name__ == "__main__": + + from segment_speed_utils.project_vars import analysis_date_list + + LOG_FILE = "../logs/speeds_by_segment_trip.log" + logger.add(LOG_FILE, retention="3 months") + logger.add(sys.stderr, + format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", + level="INFO") + + 
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments") + + for analysis_date in analysis_date_list: + logger.info(f"Analysis date: {analysis_date}") + + calculate_speed_from_stop_arrivals(analysis_date, STOP_SEG_DICT) diff --git a/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py b/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py index f46123b27..dd51bb0be 100644 --- a/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py +++ b/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py @@ -68,29 +68,6 @@ def get_trips_with_geom( return trips_with_geom -def merge_shapes_to_stop_times( - trips_with_shape_geom: Union[dg.GeoDataFrame, gpd.GeoDataFrame], - stop_times: dd.DataFrame, -) -> dg.GeoDataFrame: - """ - Merge stop_times with trips (with shape_geom) attached. - """ - st_with_shape = dd.merge( - trips_with_shape_geom, - stop_times, - on = ["feed_key", "trip_id"], - how = "inner", - ).drop_duplicates() - - if isinstance(trips_with_shape_geom, (gpd.GeoDataFrame, dg.GeoDataFrame)): - geometry_col = trips_with_shape_geom.geometry.name - # Sometimes, geometry is lost...need to set it so it remains dg.GeoDataFrame - st_with_shape = st_with_shape.set_geometry(geometry_col) - st_with_shape = st_with_shape.set_crs(trips_with_shape_geom.crs) - - return st_with_shape - - def stop_arrivals_per_stop( stop_times: pd.DataFrame, group_cols: list, diff --git a/rt_segment_speeds/segment_speed_utils/project_vars.py b/rt_segment_speeds/segment_speed_utils/project_vars.py index 792b9ac29..14b26a905 100644 --- a/rt_segment_speeds/segment_speed_utils/project_vars.py +++ b/rt_segment_speeds/segment_speed_utils/project_vars.py @@ -9,9 +9,9 @@ SHARED_GCS = f"{GCS_FILE_PATH}shared_data/" analysis_date = rt_dates.DATES["oct2023"] + analysis_date_list = [ - rt_dates.DATES["sep2023"], - rt_dates.DATES["oct2023"] + rt_dates.DATES["oct2023"] ] PROJECT_CRS = "EPSG:3310" diff --git a/rt_segment_speeds/segment_speed_utils/segment_calcs.py b/rt_segment_speeds/segment_speed_utils/segment_calcs.py index 3dd8b306b..bf36a79d2 100644 --- a/rt_segment_speeds/segment_speed_utils/segment_calcs.py +++ b/rt_segment_speeds/segment_speed_utils/segment_calcs.py @@ -8,59 +8,12 @@ from shared_utils import rt_utils -# https://stackoverflow.com/questions/58145700/using-groupby-to-store-value-counts-in-new-column-in-dask-dataframe -# https://github.com/dask/dask/pull/5327 -def keep_min_max_timestamps_by_segment( - vp_to_seg: dd.DataFrame, - segment_identifier_cols: list, - timestamp_col: str = "location_timestamp_local" -) -> dd.DataFrame: - """ - For each segment-trip combination, throw away excess points, just - keep the enter/exit points for the segment. - """ - # a groupby including gtfs_dataset_key explodes the groupers, and - # a lot of NaNs result...why? 
- # gtfs_dataset_key or name sometimes is category dtype...so we must use groupby(observed=True) - # shape_array_key will uniquely identify feed_key + shape_id, so should be ok - dtypes_map = vp_to_seg[segment_identifier_cols + [timestamp_col]].dtypes.to_dict() - - # https://stackoverflow.com/questions/52552066/dask-compute-gives-attributeerror-series-object-has-no-attribute-encode - grouped_df = vp_to_seg.groupby(segment_identifier_cols, - observed=True, group_keys=False) - - enter = (grouped_df - [timestamp_col].min() - .reset_index() - # we lose the dtypes for int16 in dask, set it again - .astype(dtypes_map) - ) - - exit = (grouped_df - [timestamp_col].max() - .reset_index() - .astype(dtypes_map) - ) - - enter_exit = dd.multi.concat([enter, exit], axis=0) - - # Merge back in with original to only keep the min/max timestamps - # dask can't sort by multiple columns to drop - vp_full_info = dd.merge( - vp_to_seg, - enter_exit, - on = segment_identifier_cols + [timestamp_col], - how = "inner" - ).reset_index(drop=True) - - return vp_full_info - - def derive_speed( df: pd.DataFrame, distance_cols: tuple = ("prior_shape_meters", "shape_meters"), - time_cols: tuple = ("prior_location_timestamp_local_sec", "location_timestamp_local_sec") + time_cols: tuple = ("prior_location_timestamp_local_sec", + "location_timestamp_local_sec") ) -> pd.DataFrame: """ Derive meters and sec elapsed to calculate speed_mph. @@ -91,58 +44,6 @@ def derive_speed( return df - -def calculate_speed_by_segment_trip( - df: dd.DataFrame, - segment_identifier_cols: list, - timestamp_col: str -) -> dd.DataFrame: - """ - For each segment-trip pair, calculate find the min/max timestamp - and min/max shape_meters. Use that to derive speed column. - """ - segment_trip_cols = [ - "gtfs_dataset_key", "gtfs_dataset_name", - "trip_id", "trip_instance_key", "schedule_gtfs_dataset_key" - ] + segment_identifier_cols - - grouped_df = df.groupby(segment_trip_cols, observed=True, group_keys=False) - - min_time_dist = (grouped_df - .agg({timestamp_col: "min", - "shape_meters": "min"}) - .reset_index() - .rename(columns = { - timestamp_col: "min_time", - "shape_meters": "min_dist"}) - ) - - max_time_dist = (grouped_df - .agg({timestamp_col: "max", - "shape_meters": "max"}) - .reset_index() - .rename(columns = { - timestamp_col: "max_time", - "shape_meters": "max_dist"}) - ) - - segment_trip_agg = pd.merge( - min_time_dist, - max_time_dist, - on = segment_trip_cols, - how = "left" - ) - - #segment_trip_agg = dd.from_pandas(segment_trip_agg, npartitions=3) - - segment_speeds = derive_speed( - segment_trip_agg, - distance_cols = ("min_dist", "max_dist"), - time_cols = ("min_time", "max_time") - ) - - return segment_speeds - def convert_timestamp_to_seconds( df: pd.DataFrame, diff --git a/rt_segment_speeds/segment_speed_utils/wrangle_shapes.py b/rt_segment_speeds/segment_speed_utils/wrangle_shapes.py index acc7f1448..8a4c5be1e 100644 --- a/rt_segment_speeds/segment_speed_utils/wrangle_shapes.py +++ b/rt_segment_speeds/segment_speed_utils/wrangle_shapes.py @@ -84,31 +84,6 @@ def add_arrowized_geometry(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: return gdf -def project_point_geom_onto_linestring( - vp_with_seg_geom: dg.GeoDataFrame, - shape_geoseries: str = "segment_geometry", - point_geoseries: str = "vp_geometry" -): - """ - Use shapely.project to turn point coordinates into numeric. - The point coordinates will be converted to the distance along the linestring. 
- https://shapely.readthedocs.io/en/stable/manual.html?highlight=project#object.project - https://gis.stackexchange.com/questions/306838/snap-points-shapefile-to-line-shapefile-using-shapely - - From Eric: projecting the stop's point geom onto the shape_id's line geom - https://github.com/cal-itp/data-analyses/blob/f4c9c3607069da6ea96e70c485d0ffe1af6d7a47/rt_delay/rt_analysis/rt_parser.py#L102-L103 - """ - shape_meters_series = vp_with_seg_geom.apply( - lambda x: x[shape_geoseries].project(x[point_geoseries]), - axis=1, - ) - - # To add this as a column to a dask df - # https://www.appsloveworld.com/coding/dataframe/6/add-a-dask-array-column-to-a-dask-dataframe - - return shape_meters_series - - def array_to_geoseries( array: np.ndarray, geom_type: Literal["point", "line", "polygon"], @@ -132,6 +107,7 @@ def array_to_geoseries( return gdf + def get_direction_vector( start: shapely.geometry.Point, end: shapely.geometry.Point @@ -146,6 +122,7 @@ def get_direction_vector( """ return ((end.x - start.x), (end.y - start.y)) + def distill_array_into_direction_vector(array: np.ndarray) -> tuple: """ Given an array of n items, let's take the start/end of that. diff --git a/rt_segment_speeds/setup.py b/rt_segment_speeds/setup.py index e9d338620..41c79e829 100644 --- a/rt_segment_speeds/setup.py +++ b/rt_segment_speeds/setup.py @@ -3,7 +3,7 @@ setup( name="segment_speed_utils", packages=find_packages(), - version="0.2.1", + version="1.0.0", description="Utility functions for GTFS RT segment speeds", author="Cal-ITP", license="Apache",
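The linear referencing that project_point_geom_onto_linestring (removed from wrangle_shapes above) wrapped is plain shapely: LineString.project() returns the distance along the line at which a point falls, which is what the pipeline stores as shape_meters. A minimal standalone example with toy coordinates:

from shapely.geometry import LineString, Point

# A 300 m straight shape in a projected CRS (toy coordinates).
shape_geom = LineString([(0, 0), (300, 0)])

# A GPS ping slightly off the shape.
vp_geom = Point(120, 5)

shape_meters = shape_geom.project(vp_geom)
print(shape_meters)   # 120.0 -> the vp snaps 120 m along the shape

# interpolate() is the inverse: turn a distance back into a point on the shape.
print(shape_geom.interpolate(shape_meters))   # POINT (120 0)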