diff --git a/gtfs_digest/03_report.ipynb b/gtfs_digest/03_report.ipynb index de0335f01..b2d89dd9d 100644 --- a/gtfs_digest/03_report.ipynb +++ b/gtfs_digest/03_report.ipynb @@ -54,9 +54,8 @@ "# Comment out and leave this cell right below pandas\n", "# organization_name = \"Marin County Transit District\"\n", "# organization_name = \"Monterey-Salinas Transit\"\n", - "# organization_name = \"City of Visalia\"\n", - "# organization_name = \"City of Simi Valley\"\n", - "# organization_name = \"Curry Public Transit\"" + "organization_name = \"City of Santa Maria\"\n", + "# organization_name = \"Capitol Corridor Joint Powers Authority\"" ] }, { @@ -68,8 +67,8 @@ }, "outputs": [], "source": [ - "%%capture_parameters\n", - "organization_name" + "# %%capture_parameters\n", + "# organization_name" ] }, { @@ -539,6 +538,24 @@ "except:\n", " display(Markdown(f\"\"\"{organization_name} only has schedule data.\"\"\"))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcb3b64e-15df-49a2-bdf2-b9d2132fa49f", + "metadata": {}, + "outputs": [], + "source": [ + "display(section2.filtered_route(sched_vp_df))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf0776b1-6812-4c29-a036-4d218aade386", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/gtfs_digest/43_missing_routes.ipynb b/gtfs_digest/43_missing_routes.ipynb index a1d750d62..42f673b49 100644 --- a/gtfs_digest/43_missing_routes.ipynb +++ b/gtfs_digest/43_missing_routes.ipynb @@ -8,7 +8,11 @@ "## Find Missing Routes: 2 operators. \n", "* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capital Corridor doesn't have any rail routes. 
\n", "* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)\n", - "* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`" + "* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`\n", + "\n", + "To-Do\n", + "* Move all the code here to the proper file.\n", + "* Rerun all the scripts that create the underlying dataframes for November date (`df_sched`,`df_avg_speeds`,`df_rt_sched`) and merge them using `gtfs_digest/merge_data.merge_data_sources_by_route_direction()`" ] }, { @@ -22,281 +26,167 @@ "import merge_data\n", "import numpy as np\n", "import pandas as pd\n", - "from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils\n", - "from shared_utils import catalog_utils, rt_dates, rt_utils\n", - "from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS" + "from segment_speed_utils import (\n", + " gtfs_schedule_wrangling,\n", + " helpers,\n", + " metrics,\n", + " segment_calcs,\n", + " time_series_utils,\n", + ")\n", + "from shared_utils import (\n", + " catalog_utils,\n", + " portfolio_utils,\n", + " rt_dates,\n", + " rt_utils,\n", + " time_helpers,\n", + ")\n", + "from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS" ] }, { "cell_type": "code", "execution_count": 2, - "id": "74eaf3a5-711d-447d-a945-93cc24dd6f14", + "id": "f1ff9b22-b6cf-47d5-bc20-138f992a9519", "metadata": {}, "outputs": [], "source": [ - "pd.options.display.max_columns = 100\n", - "pd.options.display.float_format = \"{:.2f}\".format\n", - "pd.set_option(\"display.max_rows\", None)\n", - "pd.set_option(\"display.max_colwidth\", None)" + "from shared_utils.rt_utils import METERS_PER_MILE" ] }, { "cell_type": "code", "execution_count": 3, - "id": "cb99b4b5-7745-422c-a6c5-153f02ffc244", + "id": 
"74eaf3a5-711d-447d-a945-93cc24dd6f14", "metadata": {}, "outputs": [], "source": [ - "OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles\n", - "OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map" + "pd.options.display.max_columns = 100\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "pd.set_option(\"display.max_rows\", None)\n", + "pd.set_option(\"display.max_colwidth\", None)" ] }, { "cell_type": "code", "execution_count": 4, - "id": "55faff71-f82c-46fc-a99d-dcc40205e100", + "id": "1da55301-1cb1-4187-a90c-9ed3d1c39706", "metadata": {}, "outputs": [], "source": [ - "operator_route_gdf = gpd.read_parquet(\n", - " f\"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet\",\n", - ")" + "org_name_lists = [\"Capitol Corridor Joint Powers Authority\", \"City of Santa Maria\"]" ] }, { "cell_type": "code", "execution_count": 5, - "id": "9b2c852f-f053-406a-8274-8b4f015f10c9", + "id": "370e6e0d-edb8-40ab-8a27-b299ea9c279e", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',\n", - " 'direction_id', 'route_key', 'route_length', 'route_length_miles',\n", - " 'is_downtown_local', 'is_local', 'is_coverage', 'is_rapid',\n", - " 'is_express', 'is_rail', 'organization_source_record_id',\n", - " 'organization_name', 'service_date', 'name', 'route_long_name',\n", - " 'route_short_name', 'route_combined_name', 'route_id'],\n", - " dtype='object')" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "operator_route_gdf.columns" + "analysis_date_list = [\"2024-11-13\"]" ] }, { "cell_type": "code", "execution_count": 6, - "id": "1da55301-1cb1-4187-a90c-9ed3d1c39706", + "id": "05bd6fee-c007-4d01-a29e-05c30c478fcb", "metadata": {}, "outputs": [], "source": [ - "org_name_lists = [\"Capitol Corridor Joint Powers Authority\", \"City of Santa Maria\"]" + "one_analysis_date = \"2024-11-13\"" ] }, { 
"cell_type": "code", "execution_count": 7, - "id": "b164eae4-f657-49e3-ada1-e059362e4689", + "id": "29688d1c-3239-4bc0-935d-8947a426d02d", "metadata": {}, "outputs": [], "source": [ - "operator_route_gdf2 = operator_route_gdf.loc[\n", - " operator_route_gdf.organization_name.isin(org_name_lists)\n", + "schd_keys = [\n", + " \"5a8721fe96786fcd25fba1f8a0ee6358\",\n", + " \"73105f2d1cabc8170ab066d96863c5d5\",\n", + " \"f5a749dd65924e025b1293c58f95f8d6\",\n", "]" ] }, { "cell_type": "code", "execution_count": 8, - "id": "89ccde0b-736c-4fc9-a294-8a12116823a8", + "id": "b4a5fd8f-b4ed-42b4-ab3a-199e0ce779ae", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',\n", - " 'direction_id', 'route_key', 'route_length', 'route_length_miles',\n", - " 'is_downtown_local', 'is_local', 'is_coverage', 'is_rapid',\n", - " 'is_express', 'is_rail', 'organization_source_record_id',\n", - " 'organization_name', 'service_date', 'name', 'route_long_name',\n", - " 'route_short_name', 'route_combined_name', 'route_id'],\n", - " dtype='object')" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "operator_route_gdf2.columns" + "import sys\n", + "\n", + "sys.path.append(\"../gtfs_funnel/\")\n", + "import operator_scheduled_stats\n", + "import schedule_stats_by_route_direction" ] }, { "cell_type": "code", "execution_count": 9, - "id": "295aaf35-9ade-4f9e-bc4d-5b8ef95a1569", + "id": "62b562a2-9422-4d56-8baf-9d0a87d0b5da", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "41" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "len(operator_route_gdf2)" + "def preview(df):\n", + " df2 = df[\n", + " [\"schedule_gtfs_dataset_key\", \"route_id\", \"direction_id\"]\n", + " ].drop_duplicates()\n", + " display(df2)" ] }, { - "cell_type": "code", - 
"execution_count": 10, - "id": "5630aaaa-dc8b-4917-b9fa-ae0924999720", + "cell_type": "markdown", + "id": "1739c2de-8d1c-4ec2-8bbf-a05838fb803e", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 41\n", - "Name: is_rail, dtype: int64" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "operator_route_gdf2.is_rail.value_counts()" + "### Fix `schd_vp_url`" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "3ecc56aa-63ce-402b-8136-a847fd5c0d11", + "execution_count": 10, + "id": "38069e57-1172-4261-9312-c9e7da14619f", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Capitol Corridor Joint Powers Authority 21\n", - "City of Santa Maria 20\n", - "Name: organization_name, dtype: int64" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "operator_route_gdf2.organization_name.value_counts()" + "schd_vp_url = f\"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet\"" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "5798606e-2ea4-4ab0-a6d8-a5597a51e66f", + "execution_count": 11, + "id": "7b60298d-23a9-4a9a-8086-153f0dc8a0e9", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['5a8721fe96786fcd25fba1f8a0ee6358',\n", - " '73105f2d1cabc8170ab066d96863c5d5',\n", - " 'f5a749dd65924e025b1293c58f95f8d6'], dtype=object)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "operator_route_gdf2.schedule_gtfs_dataset_key.unique()" + "schd_vp_df = pd.read_parquet(schd_vp_url)" ] }, { - "cell_type": "markdown", - "id": "26d11950-fca8-4f5b-8d17-2b9fa0aa368c", + "cell_type": "code", + "execution_count": 12, + "id": "36152890-03ba-47a6-9bdf-89489be23410", "metadata": {}, + "outputs": [], "source": [ - "### Why does City of Santa Maria have multiple 
schedule_gtfs_dataset_keys?" + "schd_vp_df2 = schd_vp_df.loc[schd_vp_df.organization_name.isin(org_name_lists)]" ] }, { "cell_type": "code", "execution_count": 13, - "id": "81fbd586-cc2d-4a70-97a6-5b25228684b8", + "id": "a674a033-5a27-4a34-98ed-ad86c37e6416", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
route_short_name
organization_nameschedule_gtfs_dataset_key
Capitol Corridor Joint Powers Authorityf5a749dd65924e025b1293c58f95f8d61
City of Santa Maria5a8721fe96786fcd25fba1f8a0ee63581
73105f2d1cabc8170ab066d96863c5d51
\n", - "
" - ], "text/plain": [ - " route_short_name\n", - "organization_name schedule_gtfs_dataset_key \n", - "Capitol Corridor Joint Powers Authority f5a749dd65924e025b1293c58f95f8d6 1\n", - "City of Santa Maria 5a8721fe96786fcd25fba1f8a0ee6358 1\n", - " 73105f2d1cabc8170ab066d96863c5d5 1" + "array(['b3848f93-d26b-48a9-b6a6-5de22a4eab47', '5', 'Shuttle', 'CC'],\n", + " dtype=object)" ] }, "execution_count": 13, @@ -305,27 +195,23 @@ } ], "source": [ - "operator_route_gdf2.groupby([\"organization_name\", \"schedule_gtfs_dataset_key\"]).agg(\n", - " {\"route_short_name\": \"nunique\"}\n", - ")" + "schd_vp_df2.route_id.unique()" ] }, { "cell_type": "code", "execution_count": 14, - "id": "f989221a-19f9-4f4f-8655-df4f68e7ca15", + "id": "e37b2103-d050-42a2-8a51-646beb6873bb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',\n", - " 'direction_id', 'route_key', 'route_length', 'route_length_miles',\n", - " 'is_downtown_local', 'is_local', 'is_coverage', 'is_rapid',\n", - " 'is_express', 'is_rail', 'organization_source_record_id',\n", - " 'organization_name', 'service_date', 'name', 'route_long_name',\n", - " 'route_short_name', 'route_combined_name', 'route_id'],\n", - " dtype='object')" + "Shuttle 132\n", + "CC 84\n", + "5 67\n", + "b3848f93-d26b-48a9-b6a6-5de22a4eab47 6\n", + "Name: route_id, dtype: int64" ] }, "execution_count": 14, @@ -334,22 +220,19 @@ } ], "source": [ - "operator_route_gdf2.columns" + "schd_vp_df2.route_id.value_counts()" ] }, { "cell_type": "code", "execution_count": 15, - "id": "568e2a00-8f8c-451c-8b6d-ae331d18471c", + "id": "74d096a3-ff5c-42cd-a5f4-5faf3ae83ffe", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], "text/plain": [ - "" + "array(['all_day', 'offpeak', 'peak', None], dtype=object)" ] }, "execution_count": 15, @@ -358,60 +241,56 @@ } ], "source": [ - "operator_route_gdf2.drop(columns=[\"service_date\"]).explore(\"organization_name\")" + "schd_vp_df2.time_period.unique()" ] }, { - "cell_type": "code", - "execution_count": 16, - "id": "bd466515-a3cd-473a-a01a-2e73f9507104", + "cell_type": "markdown", + "id": "00214565-1faa-44c2-8b8e-f95aeff43e0c", "metadata": {}, - "outputs": [], "source": [ - "# operator_route_gdf2.drop(columns = [\"service_date\"]).explore(\"shape_array_key\")" + "### DONE Check out `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling`\n", + "* https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py \n", + "* Tiffany: Can you try specifying the dropna argument inside pandas groupby? Our pandas version has gone through upgrades, from 0.25 to now 1.5 ), and this argument was introduced in 1.1 and since it's dropna=True, that's probably what's driving the the row behavior.\n", + "* It worked! Now time to rerun stuff further down the pipeline and see what happens." ] }, { - "cell_type": "markdown", - "id": "b1ddfdee-292e-4d57-bb1e-17248e87fce8", + "cell_type": "code", + "execution_count": 16, + "id": "74a4ce67-82af-40cb-9a9e-02464ff0e512", "metadata": {}, + "outputs": [], "source": [ - "### Find longest_shape_array_key [here](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/gtfs_funnel/operator_scheduled_stats.py#L148)\n", - "* There aren't any routes for Santa Maria\n", - "* Routes are showing for Capital Corridor." 
+ "common_shape_test = gtfs_schedule_wrangling.most_common_shape_by_route_direction(\n", + " one_analysis_date\n", + ")" ] }, { "cell_type": "code", "execution_count": 17, - "id": "22587dd0-886d-475b-a101-f23816f396cb", + "id": "d04f9154-4842-439b-8a24-f0e084a2e31a", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'digest/operator_routes'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "OPERATOR_ROUTE" + "common_shape_test2 = common_shape_test.loc[\n", + " common_shape_test.schedule_gtfs_dataset_key.isin(schd_keys)\n", + "]" ] }, { "cell_type": "code", "execution_count": 18, - "id": "2aac59b0-7cfb-4796-baf3-b99d5b5db14e", + "id": "c84d2bba-e499-4f4f-af7b-f77e2f2cf378", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'operator_profiles/operator_routes'" + "array(['7', '6', 'CC', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',\n", + " '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', 'SF',\n", + " '5', '4', '9', '1'], dtype=object)" ] }, "execution_count": 18, @@ -420,1189 +299,9566 @@ } ], "source": [ - "GTFS_DATA_DICT.schedule_tables.operator_routes" + "common_shape_test2.route_id.unique()" ] }, { - "cell_type": "code", - "execution_count": 19, - "id": "d14199f0-63e5-466c-a122-51b2c2abaa75", + "cell_type": "markdown", + "id": "acd1679e-c5e0-4c75-aa0c-5ae2a5a5d5c5", "metadata": {}, - "outputs": [], "source": [ - "analysis_date = \"2024-11-13\"" + "### Breakdown `gtfs_digest/merge_data.`" ] }, { - "cell_type": "code", - "execution_count": 20, - "id": "a31bc07a-7f16-4b32-8f1a-639914c1eeea", + "cell_type": "markdown", + "id": "1bdcce76-6b7a-4bc6-9953-1ca8cceaca13", "metadata": {}, - "outputs": [], "source": [ - "route_cols = [\"schedule_gtfs_dataset_key\", \"route_id\"]" + "#### Line 294:DONE making all the changes to the original files. `df_sched` is already missing a lot of the routes." 
] }, { "cell_type": "code", - "execution_count": 21, - "id": "0a83573f-6fca-403b-a3ec-2b944efcfabd", + "execution_count": 19, + "id": "b164eae4-f657-49e3-ada1-e059362e4689", "metadata": {}, "outputs": [], "source": [ - "longest_shape_gdf = (\n", - " gtfs_schedule_wrangling.longest_shape_by_route_direction(analysis_date)\n", - " .sort_values(\n", - " route_cols + [\"route_length\"], ascending=[True for i in route_cols] + [False]\n", - " )\n", - " .drop_duplicates(subset=route_cols)\n", - " .reset_index(drop=True)\n", - ")" + "# Get cardinal direction for each route\n", + "df_sched_og = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "7c57add2-f72c-4c22-9ca6-e5efe879cab3", + "execution_count": 20, + "id": "b5c8be82-af6c-4255-a2c5-487acdb30e52", "metadata": {}, "outputs": [], "source": [ - "schd_keys = list(operator_route_gdf2.schedule_gtfs_dataset_key.unique())" + "df_sched2_og = df_sched_og.loc[df_sched_og.schedule_gtfs_dataset_key.isin(schd_keys)]" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "7a76026d-88e6-49a3-83f8-b20836b70d7a", + "execution_count": 21, + "id": "5d50ad8e-7536-4187-812d-2591d3589d15", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['5a8721fe96786fcd25fba1f8a0ee6358',\n", - " '73105f2d1cabc8170ab066d96863c5d5',\n", - " 'f5a749dd65924e025b1293c58f95f8d6']" + "Shuttle 6\n", + "5 3\n", + "Name: route_id, dtype: int64" ] }, - "execution_count": 23, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "schd_keys" + "df_sched2_og.route_id.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "e9ece006-fc30-462c-b8f3-abdc3293075e", + "metadata": {}, + "source": [ + "##### Go back to `gtfs_funnel/schedule_stats_by_route_direction`\n", + "* https://github.com/cal-itp/data-analyses/blob/1ba0f544a01f99966a6e210dd11666b4fe4a146e/gtfs_funnel/schedule_stats_by_route_direction.py#L190\n", + "* **Filled 
in `direction_id` with 0**" + ] + }, + { + "cell_type": "markdown", + "id": "24845d09-e06a-4f89-a0e2-e2a40274708c", + "metadata": {}, + "source": [ + "##### `assemble_scheduled_trip_metrics`: nothing is missing but `direction_id` is missing a lot of values." ] }, { "cell_type": "code", - "execution_count": 24, - "id": "3bcb40ca-7e6a-432e-a70c-e1817f7eebe9", + "execution_count": 22, + "id": "6472c0ff-a1d8-4882-91fd-cb15c0dd3c48", "metadata": {}, "outputs": [], "source": [ - "longest_shape_gdf2 = longest_shape_gdf.loc[\n", - " longest_shape_gdf.schedule_gtfs_dataset_key.isin(schd_keys)\n", - "]" + "trip_metrics = schedule_stats_by_route_direction.assemble_scheduled_trip_metrics(\n", + " one_analysis_date, GTFS_DATA_DICT\n", + ")" ] }, { "cell_type": "code", - "execution_count": 25, - "id": "86963a9f-3456-48d5-a386-05c211fe93f4", + "execution_count": 23, + "id": "da37674c-b332-456e-adcd-8af0fdf8fa94", + "metadata": {}, + "outputs": [], + "source": [ + "trip_metrics2 = trip_metrics.loc[trip_metrics.schedule_gtfs_dataset_key.isin(schd_keys)]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "11ebad2e-53e0-4899-a22c-681d11bf54d4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',\n", - " 'route_id', 'direction_id', 'route_key', 'route_length'],\n", + "Index(['schedule_gtfs_dataset_key', 'trip_instance_key', 'median_stop_meters',\n", + " 'time_of_day', 'scheduled_service_minutes', 'route_id', 'direction_id'],\n", " dtype='object')" ] }, - "execution_count": 25, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "longest_shape_gdf2.columns" + "trip_metrics2.columns" ] }, { "cell_type": "code", - "execution_count": 26, - "id": "b256ef9b-82c1-4832-ac54-19ca9319bdc4", + "execution_count": 25, + "id": "e3548c12-ecad-4196-afdc-b0539b6f6cd3", "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": 
"stream", "text": [ - "\n", - "Int64Index: 20 entries, 1061 to 2588\n", - "Data columns (total 8 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 shape_array_key 20 non-null object \n", - " 1 geometry 20 non-null geometry\n", - " 2 feed_key 20 non-null object \n", - " 3 schedule_gtfs_dataset_key 20 non-null object \n", - " 4 route_id 20 non-null object \n", - " 5 direction_id 4 non-null float64 \n", - " 6 route_key 20 non-null object \n", - " 7 route_length 20 non-null float64 \n", - "dtypes: float64(2), geometry(1), object(5)\n", - "memory usage: 1.4+ KB\n" + "/tmp/ipykernel_2800/3236449391.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " trip_metrics2.direction_id = trip_metrics2.direction_id.fillna(0)\n" ] } ], "source": [ - "longest_shape_gdf2.info()" + "trip_metrics2.direction_id = trip_metrics2.direction_id.fillna(0)" ] }, { "cell_type": "code", - "execution_count": 27, - "id": "6db42351-2a52-4e00-a265-33e5743cdea2", + "execution_count": 26, + "id": "fbb27f9d-29d0-4997-b04d-c26e59a2154a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(335, 7)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trip_metrics2.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "d9026014-1e37-4792-a4fb-3bba5dfa20fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['PM Peak', 'Midday', 'AM Peak', 'Early AM', 'Evening'],\n", + " dtype=object)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trip_metrics2.time_of_day.unique()" + ] + }, + { + "cell_type": 
"markdown", + "id": "be2084cc-6482-4aef-a43b-7508e7952d0e", + "metadata": {}, + "source": [ + "##### Each row is populated." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "7aeb5b42-0a9f-4b00-a1fa-d2a656640118", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
median_stop_meterstime_of_dayscheduled_service_minutesroute_iddirection_id
49725405.04Midday35.0030.00
49729178.05Midday14.98Mall0.00
49731451.15Midday41.00110.00
49736361.12Midday30.001B0.00
49737357.22Midday30.3710.00
49738444.75Midday40.0090.00
49741440.62Midday41.0040.00
49742989.61Midday56.0012X0.00
49744437.51Midday42.0050.00
49754477.41Midday53.0013X0.00
49756512.01Midday43.0080.00
49780589.78Midday36.0070.00
497819208.29Midday120.00200.00
49784462.04Midday49.0020.00
497951953.92Midday173.00300.00
49802619.55Midday38.0060.00
49804407.39Midday41.658a7c42f9-51e4-4848-bf88-30c210f149ad0.00
11328421434.28Midday85.00Shuttle0.00
11329610770.70Midday30.00SF1.00
11330310770.70Midday30.00SF0.00
11330510988.09Midday191.00CC0.00
11332314985.48Midday117.00CC1.00
\n", + "
" + ], + "text/plain": [ + " median_stop_meters time_of_day scheduled_service_minutes \\\n", + "49725 405.04 Midday 35.00 \n", + "49729 178.05 Midday 14.98 \n", + "49731 451.15 Midday 41.00 \n", + "49736 361.12 Midday 30.00 \n", + "49737 357.22 Midday 30.37 \n", + "49738 444.75 Midday 40.00 \n", + "49741 440.62 Midday 41.00 \n", + "49742 989.61 Midday 56.00 \n", + "49744 437.51 Midday 42.00 \n", + "49754 477.41 Midday 53.00 \n", + "49756 512.01 Midday 43.00 \n", + "49780 589.78 Midday 36.00 \n", + "49781 9208.29 Midday 120.00 \n", + "49784 462.04 Midday 49.00 \n", + "49795 1953.92 Midday 173.00 \n", + "49802 619.55 Midday 38.00 \n", + "49804 407.39 Midday 41.65 \n", + "113284 21434.28 Midday 85.00 \n", + "113296 10770.70 Midday 30.00 \n", + "113303 10770.70 Midday 30.00 \n", + "113305 10988.09 Midday 191.00 \n", + "113323 14985.48 Midday 117.00 \n", + "\n", + " route_id direction_id \n", + "49725 3 0.00 \n", + "49729 Mall 0.00 \n", + "49731 11 0.00 \n", + "49736 1B 0.00 \n", + "49737 1 0.00 \n", + "49738 9 0.00 \n", + "49741 4 0.00 \n", + "49742 12X 0.00 \n", + "49744 5 0.00 \n", + "49754 13X 0.00 \n", + "49756 8 0.00 \n", + "49780 7 0.00 \n", + "49781 20 0.00 \n", + "49784 2 0.00 \n", + "49795 30 0.00 \n", + "49802 6 0.00 \n", + "49804 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n", + "113284 Shuttle 0.00 \n", + "113296 SF 1.00 \n", + "113303 SF 0.00 \n", + "113305 CC 0.00 \n", + "113323 CC 1.00 " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trip_metrics2.loc[trip_metrics2.time_of_day == \"Midday\"].drop_duplicates(\n", + " subset=[\"schedule_gtfs_dataset_key\", \"route_id\", \"direction_id\"]\n", + ").drop(columns=[\"schedule_gtfs_dataset_key\", \"trip_instance_key\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "571f4ec3-966d-4412-8d05-9f50ea7c159d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_id
4972473105f2d1cabc8170ab066d96863c5d5300.00
4972573105f2d1cabc8170ab066d96863c5d530.00
4972773105f2d1cabc8170ab066d96863c5d5200.00
4972873105f2d1cabc8170ab066d96863c5d540.00
4972973105f2d1cabc8170ab066d96863c5d5Mall0.00
4973073105f2d1cabc8170ab066d96863c5d550.00
4973173105f2d1cabc8170ab066d96863c5d5110.00
4973273105f2d1cabc8170ab066d96863c5d570.00
4973373105f2d1cabc8170ab066d96863c5d590.00
4973573105f2d1cabc8170ab066d96863c5d510.00
4973673105f2d1cabc8170ab066d96863c5d51B0.00
4974273105f2d1cabc8170ab066d96863c5d512X0.00
4974573105f2d1cabc8170ab066d96863c5d560.00
4975173105f2d1cabc8170ab066d96863c5d520.00
4975473105f2d1cabc8170ab066d96863c5d513X0.00
4975673105f2d1cabc8170ab066d96863c5d580.00
4979973105f2d1cabc8170ab066d96863c5d58a7c42f9-51e4-4848-bf88-30c210f149ad0.00
113284f5a749dd65924e025b1293c58f95f8d6Shuttle0.00
113285f5a749dd65924e025b1293c58f95f8d6SF1.00
113286f5a749dd65924e025b1293c58f95f8d6SF0.00
113289f5a749dd65924e025b1293c58f95f8d6CC1.00
113292f5a749dd65924e025b1293c58f95f8d6CC0.00
113307f5a749dd65924e025b1293c58f95f8d6Shuttle1.00
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key \\\n", + "49724 73105f2d1cabc8170ab066d96863c5d5 \n", + "49725 73105f2d1cabc8170ab066d96863c5d5 \n", + "49727 73105f2d1cabc8170ab066d96863c5d5 \n", + "49728 73105f2d1cabc8170ab066d96863c5d5 \n", + "49729 73105f2d1cabc8170ab066d96863c5d5 \n", + "49730 73105f2d1cabc8170ab066d96863c5d5 \n", + "49731 73105f2d1cabc8170ab066d96863c5d5 \n", + "49732 73105f2d1cabc8170ab066d96863c5d5 \n", + "49733 73105f2d1cabc8170ab066d96863c5d5 \n", + "49735 73105f2d1cabc8170ab066d96863c5d5 \n", + "49736 73105f2d1cabc8170ab066d96863c5d5 \n", + "49742 73105f2d1cabc8170ab066d96863c5d5 \n", + "49745 73105f2d1cabc8170ab066d96863c5d5 \n", + "49751 73105f2d1cabc8170ab066d96863c5d5 \n", + "49754 73105f2d1cabc8170ab066d96863c5d5 \n", + "49756 73105f2d1cabc8170ab066d96863c5d5 \n", + "49799 73105f2d1cabc8170ab066d96863c5d5 \n", + "113284 f5a749dd65924e025b1293c58f95f8d6 \n", + "113285 f5a749dd65924e025b1293c58f95f8d6 \n", + "113286 f5a749dd65924e025b1293c58f95f8d6 \n", + "113289 f5a749dd65924e025b1293c58f95f8d6 \n", + "113292 f5a749dd65924e025b1293c58f95f8d6 \n", + "113307 f5a749dd65924e025b1293c58f95f8d6 \n", + "\n", + " route_id direction_id \n", + "49724 30 0.00 \n", + "49725 3 0.00 \n", + "49727 20 0.00 \n", + "49728 4 0.00 \n", + "49729 Mall 0.00 \n", + "49730 5 0.00 \n", + "49731 11 0.00 \n", + "49732 7 0.00 \n", + "49733 9 0.00 \n", + "49735 1 0.00 \n", + "49736 1B 0.00 \n", + "49742 12X 0.00 \n", + "49745 6 0.00 \n", + "49751 2 0.00 \n", + "49754 13X 0.00 \n", + "49756 8 0.00 \n", + "49799 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n", + "113284 Shuttle 0.00 \n", + "113285 SF 1.00 \n", + "113286 SF 0.00 \n", + "113289 CC 1.00 \n", + "113292 CC 0.00 \n", + "113307 Shuttle 1.00 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "preview(trip_metrics2)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "2309939b-4531-4548-aa18-d85ea147880d", + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 335 entries, 49724 to 113340\n", + "Data columns (total 7 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 schedule_gtfs_dataset_key 335 non-null object \n", + " 1 trip_instance_key 335 non-null object \n", + " 2 median_stop_meters 335 non-null float64\n", + " 3 time_of_day 335 non-null object \n", + " 4 scheduled_service_minutes 335 non-null float64\n", + " 5 route_id 335 non-null object \n", + " 6 direction_id 335 non-null float64\n", + "dtypes: float64(3), object(4)\n", + "memory usage: 20.9+ KB\n" + ] + } + ], + "source": [ + "trip_metrics2.info()" + ] + }, + { + "cell_type": "markdown", + "id": "72613ac3-2ff9-4026-bf76-601b2b4ec0ca", + "metadata": {}, + "source": [ + "##### DONE`gtfs_funnel/schedule_stats_by_route_direction/schedule_metrics_by_route_direction` \n", + "* **updated to `dropna=False` and also filled in `time_period` with `peak_offpeak`**" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "a8f01bbf-831d-4ca6-ac60-0851413d8df3", + "metadata": {}, + "outputs": [], + "source": [ + "def schedule_metrics_by_route_direction(\n", + " df: pd.DataFrame,\n", + " analysis_date: str,\n", + " group_merge_cols: list,\n", + ") -> pd.DataFrame:\n", + " \"\"\"\n", + " Aggregate trip-level metrics to route-direction, and\n", + " attach shape geometry for common_shape_id.\n", + " \"\"\"\n", + " service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(\n", + " df, group_merge_cols, long_or_wide=\"long\"\n", + " )\n", + "\n", + " metrics_df = (\n", + " df.groupby(group_merge_cols, observed=True, group_keys=False, dropna=False)\n", + " .agg(\n", + " {\n", + " \"median_stop_meters\": \"mean\",\n", + " # take mean of the median stop spacing for trip\n", + " # does this make sense?\n", + " # median is the single boiled down metric at the trip-level\n", + " \"scheduled_service_minutes\": \"mean\",\n", + 
" }\n", + " )\n", + " .reset_index()\n", + " .rename(\n", + " columns={\n", + " \"median_stop_meters\": \"avg_stop_meters\",\n", + " \"scheduled_service_minutes\": \"avg_scheduled_service_minutes\",\n", + " }\n", + " )\n", + " )\n", + "\n", + " metrics_df = metrics_df.assign(\n", + " avg_stop_miles=metrics_df.avg_stop_meters.divide(METERS_PER_MILE).round(2)\n", + " ).drop(columns=[\"avg_stop_meters\"])\n", + "\n", + " round_me = [\"avg_stop_miles\", \"avg_scheduled_service_minutes\"]\n", + " metrics_df[round_me] = metrics_df[round_me].round(2)\n", + "\n", + " common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(\n", + " analysis_date\n", + " ).pipe(helpers.remove_shapes_outside_ca)\n", + "\n", + " df = pd.merge(common_shape, metrics_df, on=group_merge_cols, how=\"inner\").merge(\n", + " service_freq_df, on=group_merge_cols, how=\"inner\"\n", + " )\n", + "\n", + " df.time_period = df.time_period.fillna(df.peak_offpeak)\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "8391d0c3-f887-4aee-9088-f839186238a4", + "metadata": {}, + "outputs": [], + "source": [ + "route_group_merge_cols = [\"schedule_gtfs_dataset_key\", \"route_id\", \"direction_id\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "a389a761-d6dd-42b9-a8aa-2e91bad1bc1f", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "route_dir_metrics = schedule_metrics_by_route_direction(\n", + " trip_metrics2, one_analysis_date, route_group_merge_cols\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "e708fa79-b533-4d28-8c53-1d5509645c0f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_id
073105f2d1cabc8170ab066d96863c5d570.00
373105f2d1cabc8170ab066d96863c5d560.00
673105f2d1cabc8170ab066d96863c5d580.00
973105f2d1cabc8170ab066d96863c5d5Mall0.00
1273105f2d1cabc8170ab066d96863c5d512X0.00
1573105f2d1cabc8170ab066d96863c5d513X0.00
1873105f2d1cabc8170ab066d96863c5d5110.00
2173105f2d1cabc8170ab066d96863c5d5300.00
24f5a749dd65924e025b1293c58f95f8d6Shuttle1.00
27f5a749dd65924e025b1293c58f95f8d6Shuttle0.00
3073105f2d1cabc8170ab066d96863c5d58a7c42f9-51e4-4848-bf88-30c210f149ad0.00
3373105f2d1cabc8170ab066d96863c5d520.00
3673105f2d1cabc8170ab066d96863c5d530.00
3973105f2d1cabc8170ab066d96863c5d51B0.00
4273105f2d1cabc8170ab066d96863c5d5200.00
4573105f2d1cabc8170ab066d96863c5d550.00
4873105f2d1cabc8170ab066d96863c5d540.00
5173105f2d1cabc8170ab066d96863c5d590.00
5473105f2d1cabc8170ab066d96863c5d510.00
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id \\\n", + "0 73105f2d1cabc8170ab066d96863c5d5 7 \n", + "3 73105f2d1cabc8170ab066d96863c5d5 6 \n", + "6 73105f2d1cabc8170ab066d96863c5d5 8 \n", + "9 73105f2d1cabc8170ab066d96863c5d5 Mall \n", + "12 73105f2d1cabc8170ab066d96863c5d5 12X \n", + "15 73105f2d1cabc8170ab066d96863c5d5 13X \n", + "18 73105f2d1cabc8170ab066d96863c5d5 11 \n", + "21 73105f2d1cabc8170ab066d96863c5d5 30 \n", + "24 f5a749dd65924e025b1293c58f95f8d6 Shuttle \n", + "27 f5a749dd65924e025b1293c58f95f8d6 Shuttle \n", + "30 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n", + "33 73105f2d1cabc8170ab066d96863c5d5 2 \n", + "36 73105f2d1cabc8170ab066d96863c5d5 3 \n", + "39 73105f2d1cabc8170ab066d96863c5d5 1B \n", + "42 73105f2d1cabc8170ab066d96863c5d5 20 \n", + "45 73105f2d1cabc8170ab066d96863c5d5 5 \n", + "48 73105f2d1cabc8170ab066d96863c5d5 4 \n", + "51 73105f2d1cabc8170ab066d96863c5d5 9 \n", + "54 73105f2d1cabc8170ab066d96863c5d5 1 \n", + "\n", + " direction_id \n", + "0 0.00 \n", + "3 0.00 \n", + "6 0.00 \n", + "9 0.00 \n", + "12 0.00 \n", + "15 0.00 \n", + "18 0.00 \n", + "21 0.00 \n", + "24 1.00 \n", + "27 0.00 \n", + "30 0.00 \n", + "33 0.00 \n", + "36 0.00 \n", + "39 0.00 \n", + "42 0.00 \n", + "45 0.00 \n", + "48 0.00 \n", + "51 0.00 \n", + "54 0.00 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "preview(route_dir_metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "3877741e-ba60-41d9-ada5-6f7dd02e9cf1", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
route_iddirection_idroute_nameavg_scheduled_service_minutesavg_stop_milesn_tripstime_periodpeak_offpeakfrequency
070.00Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.36.000.3719all_dayNaN0.79
170.00Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.36.000.379offpeakoffpeak0.38
270.00Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.36.000.3710peakpeak0.42
360.00Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound38.000.3818all_dayNaN0.75
460.00Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound38.000.387offpeakoffpeak0.29
560.00Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound38.000.3811peakpeak0.46
680.00Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.43.000.3216all_dayNaN0.67
780.00Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.43.000.328offpeakoffpeak0.33
880.00Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.43.000.328peakpeak0.33
9Mall0.00Mall Shuttle14.980.1128all_dayNaN1.17
10Mall0.00Mall Shuttle14.980.1114offpeakoffpeak0.58
11Mall0.00Mall Shuttle14.980.1114peakpeak0.58
1212X0.0012X Broadway/Orcutt Express56.000.6111all_dayNaN0.46
1312X0.0012X Broadway/Orcutt Express56.000.616offpeakoffpeak0.25
1412X0.0012X Broadway/Orcutt Express56.000.615peakpeak0.21
1513X0.0013X Transit Center/PVHS/N. Broadway50.820.2911all_dayNaN0.46
1613X0.0013X Transit Center/PVHS/N. Broadway50.820.296offpeakoffpeak0.25
1713X0.0013X Transit Center/PVHS/N. Broadway50.820.295peakpeak0.21
18110.00R11. Transit Center to Gov't Center via S. Broadway41.000.2822all_dayNaN0.92
19110.00R11. Transit Center to Gov't Center via S. Broadway41.000.2810offpeakoffpeak0.42
20110.00R11. Transit Center to Gov't Center via S. Broadway41.000.2812peakpeak0.50
21300.00Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc166.331.229all_dayNaN0.38
22300.00Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc166.331.225offpeakoffpeak0.21
23300.00Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc166.331.224peakpeak0.17
24Shuttle1.00Shuttle to Auburn72.0013.745all_dayNaN0.21
25Shuttle1.00Shuttle to Auburn72.0013.742offpeakoffpeak0.08
26Shuttle1.00Shuttle to Auburn72.0013.743peakpeak0.12
27Shuttle0.00Shuttle to Auburn70.0011.785all_dayNaN0.21
28Shuttle0.00Shuttle to Auburn70.0011.783offpeakoffpeak0.12
29Shuttle0.00Shuttle to Auburn70.0011.782peakpeak0.08
308a7c42f9-51e4-4848-bf88-30c210f149ad0.00Rt 11. Transit Center to Gov't Center via S. Broadway41.650.2518all_dayNaN0.75
318a7c42f9-51e4-4848-bf88-30c210f149ad0.00Rt 11. Transit Center to Gov't Center via S. Broadway41.650.258offpeakoffpeak0.33
328a7c42f9-51e4-4848-bf88-30c210f149ad0.00Rt 11. Transit Center to Gov't Center via S. Broadway41.650.2510peakpeak0.42
3320.00Rt 2. Transit Center to PVH School via Western., Donovan Rd53.240.2917all_dayNaN0.71
3420.00Rt 2. Transit Center to PVH School via Western., Donovan Rd53.240.296offpeakoffpeak0.25
3520.00Rt 2. Transit Center to PVH School via Western., Donovan Rd53.240.2911peakpeak0.46
3630.00Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln.35.110.2618all_dayNaN0.75
3730.00Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln.35.110.268offpeakoffpeak0.33
3830.00Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln.35.110.2610peakpeak0.42
391B0.00Rt 1. Transit Ctr to Preisker Park Via N. Broadway30.000.2212all_dayNaN0.50
401B0.00Rt 1. Transit Ctr to Preisker Park Via N. Broadway30.000.225offpeakoffpeak0.21
411B0.00Rt 1. Transit Ctr to Preisker Park Via N. Broadway30.000.227peakpeak0.29
42200.00Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB119.175.726all_dayNaN0.25
43200.00Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB119.175.723offpeakoffpeak0.12
44200.00Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB119.175.723peakpeak0.12
4550.00Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way42.000.2718all_dayNaN0.75
4650.00Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way42.000.278offpeakoffpeak0.33
4750.00Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way42.000.2710peakpeak0.42
4840.00Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd.41.060.2718all_dayNaN0.75
4940.00Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd.41.060.278offpeakoffpeak0.33
5040.00Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd.41.060.2710peakpeak0.42
5190.00Rt 9. Transit Center to PVH via Alvin Ave.40.000.2818all_dayNaN0.75
5290.00Rt 9. Transit Center to PVH via Alvin Ave.40.000.288offpeakoffpeak0.33
5390.00Rt 9. Transit Center to PVH via Alvin Ave.40.000.2810peakpeak0.42
5410.00Rt 1. Transit Ctr to Preisker Park Via N. Broadway30.370.2219all_dayNaN0.79
5510.00Rt 1. Transit Ctr to Preisker Park Via N. Broadway30.370.229offpeakoffpeak0.38
5610.00Rt 1. Transit Ctr to Preisker Park Via N. Broadway30.370.2210peakpeak0.42
\n", + "
" + ], + "text/plain": [ + " route_id direction_id \\\n", + "0 7 0.00 \n", + "1 7 0.00 \n", + "2 7 0.00 \n", + "3 6 0.00 \n", + "4 6 0.00 \n", + "5 6 0.00 \n", + "6 8 0.00 \n", + "7 8 0.00 \n", + "8 8 0.00 \n", + "9 Mall 0.00 \n", + "10 Mall 0.00 \n", + "11 Mall 0.00 \n", + "12 12X 0.00 \n", + "13 12X 0.00 \n", + "14 12X 0.00 \n", + "15 13X 0.00 \n", + "16 13X 0.00 \n", + "17 13X 0.00 \n", + "18 11 0.00 \n", + "19 11 0.00 \n", + "20 11 0.00 \n", + "21 30 0.00 \n", + "22 30 0.00 \n", + "23 30 0.00 \n", + "24 Shuttle 1.00 \n", + "25 Shuttle 1.00 \n", + "26 Shuttle 1.00 \n", + "27 Shuttle 0.00 \n", + "28 Shuttle 0.00 \n", + "29 Shuttle 0.00 \n", + "30 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n", + "31 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n", + "32 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n", + "33 2 0.00 \n", + "34 2 0.00 \n", + "35 2 0.00 \n", + "36 3 0.00 \n", + "37 3 0.00 \n", + "38 3 0.00 \n", + "39 1B 0.00 \n", + "40 1B 0.00 \n", + "41 1B 0.00 \n", + "42 20 0.00 \n", + "43 20 0.00 \n", + "44 20 0.00 \n", + "45 5 0.00 \n", + "46 5 0.00 \n", + "47 5 0.00 \n", + "48 4 0.00 \n", + "49 4 0.00 \n", + "50 4 0.00 \n", + "51 9 0.00 \n", + "52 9 0.00 \n", + "53 9 0.00 \n", + "54 1 0.00 \n", + "55 1 0.00 \n", + "56 1 0.00 \n", + "\n", + " route_name \\\n", + "0 Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd. \n", + "1 Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd. \n", + "2 Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd. \n", + "3 Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound \n", + "4 Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound \n", + "5 Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound \n", + "6 Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln. \n", + "7 Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln. \n", + "8 Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln. 
\n", + "9 Mall Shuttle \n", + "10 Mall Shuttle \n", + "11 Mall Shuttle \n", + "12 12X Broadway/Orcutt Express \n", + "13 12X Broadway/Orcutt Express \n", + "14 12X Broadway/Orcutt Express \n", + "15 13X Transit Center/PVHS/N. Broadway \n", + "16 13X Transit Center/PVHS/N. Broadway \n", + "17 13X Transit Center/PVHS/N. Broadway \n", + "18 R11. Transit Center to Gov't Center via S. Broadway \n", + "19 R11. Transit Center to Gov't Center via S. Broadway \n", + "20 R11. Transit Center to Gov't Center via S. Broadway \n", + "21 Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc \n", + "22 Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc \n", + "23 Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc \n", + "24 Shuttle to Auburn \n", + "25 Shuttle to Auburn \n", + "26 Shuttle to Auburn \n", + "27 Shuttle to Auburn \n", + "28 Shuttle to Auburn \n", + "29 Shuttle to Auburn \n", + "30 Rt 11. Transit Center to Gov't Center via S. Broadway \n", + "31 Rt 11. Transit Center to Gov't Center via S. Broadway \n", + "32 Rt 11. Transit Center to Gov't Center via S. Broadway \n", + "33 Rt 2. Transit Center to PVH School via Western., Donovan Rd \n", + "34 Rt 2. Transit Center to PVH School via Western., Donovan Rd \n", + "35 Rt 2. Transit Center to PVH School via Western., Donovan Rd \n", + "36 Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln. \n", + "37 Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln. \n", + "38 Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln. \n", + "39 Rt 1. Transit Ctr to Preisker Park Via N. Broadway \n", + "40 Rt 1. Transit Ctr to Preisker Park Via N. Broadway \n", + "41 Rt 1. Transit Ctr to Preisker Park Via N. 
Broadway \n", + "42 Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB \n", + "43 Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB \n", + "44 Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB \n", + "45 Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way \n", + "46 Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way \n", + "47 Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way \n", + "48 Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd. \n", + "49 Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd. \n", + "50 Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd. \n", + "51 Rt 9. Transit Center to PVH via Alvin Ave. \n", + "52 Rt 9. Transit Center to PVH via Alvin Ave. \n", + "53 Rt 9. Transit Center to PVH via Alvin Ave. \n", + "54 Rt 1. Transit Ctr to Preisker Park Via N. Broadway \n", + "55 Rt 1. Transit Ctr to Preisker Park Via N. Broadway \n", + "56 Rt 1. Transit Ctr to Preisker Park Via N. 
Broadway \n", + "\n", + " avg_scheduled_service_minutes avg_stop_miles n_trips time_period \\\n", + "0 36.00 0.37 19 all_day \n", + "1 36.00 0.37 9 offpeak \n", + "2 36.00 0.37 10 peak \n", + "3 38.00 0.38 18 all_day \n", + "4 38.00 0.38 7 offpeak \n", + "5 38.00 0.38 11 peak \n", + "6 43.00 0.32 16 all_day \n", + "7 43.00 0.32 8 offpeak \n", + "8 43.00 0.32 8 peak \n", + "9 14.98 0.11 28 all_day \n", + "10 14.98 0.11 14 offpeak \n", + "11 14.98 0.11 14 peak \n", + "12 56.00 0.61 11 all_day \n", + "13 56.00 0.61 6 offpeak \n", + "14 56.00 0.61 5 peak \n", + "15 50.82 0.29 11 all_day \n", + "16 50.82 0.29 6 offpeak \n", + "17 50.82 0.29 5 peak \n", + "18 41.00 0.28 22 all_day \n", + "19 41.00 0.28 10 offpeak \n", + "20 41.00 0.28 12 peak \n", + "21 166.33 1.22 9 all_day \n", + "22 166.33 1.22 5 offpeak \n", + "23 166.33 1.22 4 peak \n", + "24 72.00 13.74 5 all_day \n", + "25 72.00 13.74 2 offpeak \n", + "26 72.00 13.74 3 peak \n", + "27 70.00 11.78 5 all_day \n", + "28 70.00 11.78 3 offpeak \n", + "29 70.00 11.78 2 peak \n", + "30 41.65 0.25 18 all_day \n", + "31 41.65 0.25 8 offpeak \n", + "32 41.65 0.25 10 peak \n", + "33 53.24 0.29 17 all_day \n", + "34 53.24 0.29 6 offpeak \n", + "35 53.24 0.29 11 peak \n", + "36 35.11 0.26 18 all_day \n", + "37 35.11 0.26 8 offpeak \n", + "38 35.11 0.26 10 peak \n", + "39 30.00 0.22 12 all_day \n", + "40 30.00 0.22 5 offpeak \n", + "41 30.00 0.22 7 peak \n", + "42 119.17 5.72 6 all_day \n", + "43 119.17 5.72 3 offpeak \n", + "44 119.17 5.72 3 peak \n", + "45 42.00 0.27 18 all_day \n", + "46 42.00 0.27 8 offpeak \n", + "47 42.00 0.27 10 peak \n", + "48 41.06 0.27 18 all_day \n", + "49 41.06 0.27 8 offpeak \n", + "50 41.06 0.27 10 peak \n", + "51 40.00 0.28 18 all_day \n", + "52 40.00 0.28 8 offpeak \n", + "53 40.00 0.28 10 peak \n", + "54 30.37 0.22 19 all_day \n", + "55 30.37 0.22 9 offpeak \n", + "56 30.37 0.22 10 peak \n", + "\n", + " peak_offpeak frequency \n", + "0 NaN 0.79 \n", + "1 offpeak 0.38 \n", + "2 peak 0.42 \n", + 
"3 NaN 0.75 \n", + "4 offpeak 0.29 \n", + "5 peak 0.46 \n", + "6 NaN 0.67 \n", + "7 offpeak 0.33 \n", + "8 peak 0.33 \n", + "9 NaN 1.17 \n", + "10 offpeak 0.58 \n", + "11 peak 0.58 \n", + "12 NaN 0.46 \n", + "13 offpeak 0.25 \n", + "14 peak 0.21 \n", + "15 NaN 0.46 \n", + "16 offpeak 0.25 \n", + "17 peak 0.21 \n", + "18 NaN 0.92 \n", + "19 offpeak 0.42 \n", + "20 peak 0.50 \n", + "21 NaN 0.38 \n", + "22 offpeak 0.21 \n", + "23 peak 0.17 \n", + "24 NaN 0.21 \n", + "25 offpeak 0.08 \n", + "26 peak 0.12 \n", + "27 NaN 0.21 \n", + "28 offpeak 0.12 \n", + "29 peak 0.08 \n", + "30 NaN 0.75 \n", + "31 offpeak 0.33 \n", + "32 peak 0.42 \n", + "33 NaN 0.71 \n", + "34 offpeak 0.25 \n", + "35 peak 0.46 \n", + "36 NaN 0.75 \n", + "37 offpeak 0.33 \n", + "38 peak 0.42 \n", + "39 NaN 0.50 \n", + "40 offpeak 0.21 \n", + "41 peak 0.29 \n", + "42 NaN 0.25 \n", + "43 offpeak 0.12 \n", + "44 peak 0.12 \n", + "45 NaN 0.75 \n", + "46 offpeak 0.33 \n", + "47 peak 0.42 \n", + "48 NaN 0.75 \n", + "49 offpeak 0.33 \n", + "50 peak 0.42 \n", + "51 NaN 0.75 \n", + "52 offpeak 0.33 \n", + "53 peak 0.42 \n", + "54 NaN 0.79 \n", + "55 offpeak 0.38 \n", + "56 peak 0.42 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "route_dir_metrics.drop(\n", + " columns=[\n", + " \"geometry\",\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"common_shape_id\",\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "689e66e2-bf0f-4f18-b8b6-793805692d9d", + "metadata": {}, + "source": [ + "##### Still in `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling` \n", + "* **Updated `dropna=False` in `groupby`**\n", + "* **Filled in `time_period` with `peak_offpeak`**" + ] + }, + { + "cell_type": "markdown", + "id": "10bb80e6-042a-42a7-8fbd-653fff05f674", + "metadata": {}, + "source": [ + "##### In `if __name__ == \"__main__\"` in `gtfs_funnel/schedule_stats_by_route`" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + 
"id": "b906d052-a2a2-4ebc-9884-6ed9f4965487", + "metadata": {}, + "outputs": [], + "source": [ + "ROUTE_TYPOLOGIES = GTFS_DATA_DICT.schedule_tables.route_typologies\n", + "route_typologies = pd.read_parquet(\n", + " f\"{SCHED_GCS}{ROUTE_TYPOLOGIES}_{one_analysis_date}.parquet\",\n", + " columns=route_group_merge_cols\n", + " + [\n", + " \"is_coverage\",\n", + " \"is_downtown_local\",\n", + " \"is_local\",\n", + " \"is_rapid\",\n", + " \"is_express\",\n", + " \"is_rail\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "1a7000bf-9693-4f14-b1d1-244b1fe5a18d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_idis_coverageis_downtown_localis_localis_rapidis_expressis_rail
1416f5a749dd65924e025b1293c58f95f8d6Shuttle1.00100000
357273105f2d1cabc8170ab066d96863c5d513X0.00100100
357373105f2d1cabc8170ab066d96863c5d5200.00100000
357473105f2d1cabc8170ab066d96863c5d512X0.00100110
357573105f2d1cabc8170ab066d96863c5d5300.00100100
357673105f2d1cabc8170ab066d96863c5d520.00010100
357773105f2d1cabc8170ab066d96863c5d510.00100100
357873105f2d1cabc8170ab066d96863c5d51B0.00100100
357973105f2d1cabc8170ab066d96863c5d540.00100100
358073105f2d1cabc8170ab066d96863c5d570.00100100
358173105f2d1cabc8170ab066d96863c5d580.00100100
358273105f2d1cabc8170ab066d96863c5d58a7c42f9-51e4-4848-bf88-30c210f149ad0.00100100
358373105f2d1cabc8170ab066d96863c5d590.00001100
358473105f2d1cabc8170ab066d96863c5d560.00100100
358573105f2d1cabc8170ab066d96863c5d5Mall0.00001100
358673105f2d1cabc8170ab066d96863c5d530.00100100
358773105f2d1cabc8170ab066d96863c5d550.00100100
358873105f2d1cabc8170ab066d96863c5d5110.00100100
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id \\\n", + "1416 f5a749dd65924e025b1293c58f95f8d6 Shuttle \n", + "3572 73105f2d1cabc8170ab066d96863c5d5 13X \n", + "3573 73105f2d1cabc8170ab066d96863c5d5 20 \n", + "3574 73105f2d1cabc8170ab066d96863c5d5 12X \n", + "3575 73105f2d1cabc8170ab066d96863c5d5 30 \n", + "3576 73105f2d1cabc8170ab066d96863c5d5 2 \n", + "3577 73105f2d1cabc8170ab066d96863c5d5 1 \n", + "3578 73105f2d1cabc8170ab066d96863c5d5 1B \n", + "3579 73105f2d1cabc8170ab066d96863c5d5 4 \n", + "3580 73105f2d1cabc8170ab066d96863c5d5 7 \n", + "3581 73105f2d1cabc8170ab066d96863c5d5 8 \n", + "3582 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n", + "3583 73105f2d1cabc8170ab066d96863c5d5 9 \n", + "3584 73105f2d1cabc8170ab066d96863c5d5 6 \n", + "3585 73105f2d1cabc8170ab066d96863c5d5 Mall \n", + "3586 73105f2d1cabc8170ab066d96863c5d5 3 \n", + "3587 73105f2d1cabc8170ab066d96863c5d5 5 \n", + "3588 73105f2d1cabc8170ab066d96863c5d5 11 \n", + "\n", + " direction_id is_coverage is_downtown_local is_local is_rapid \\\n", + "1416 1.00 1 0 0 0 \n", + "3572 0.00 1 0 0 1 \n", + "3573 0.00 1 0 0 0 \n", + "3574 0.00 1 0 0 1 \n", + "3575 0.00 1 0 0 1 \n", + "3576 0.00 0 1 0 1 \n", + "3577 0.00 1 0 0 1 \n", + "3578 0.00 1 0 0 1 \n", + "3579 0.00 1 0 0 1 \n", + "3580 0.00 1 0 0 1 \n", + "3581 0.00 1 0 0 1 \n", + "3582 0.00 1 0 0 1 \n", + "3583 0.00 0 0 1 1 \n", + "3584 0.00 1 0 0 1 \n", + "3585 0.00 0 0 1 1 \n", + "3586 0.00 1 0 0 1 \n", + "3587 0.00 1 0 0 1 \n", + "3588 0.00 1 0 0 1 \n", + "\n", + " is_express is_rail \n", + "1416 0 0 \n", + "3572 0 0 \n", + "3573 0 0 \n", + "3574 1 0 \n", + "3575 0 0 \n", + "3576 0 0 \n", + "3577 0 0 \n", + "3578 0 0 \n", + "3579 0 0 \n", + "3580 0 0 \n", + "3581 0 0 \n", + "3582 0 0 \n", + "3583 0 0 \n", + "3584 0 0 \n", + "3585 0 0 \n", + "3586 0 0 \n", + "3587 0 0 \n", + "3588 0 0 " + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"route_typologies.loc[route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)]" + ] + }, + { + "cell_type": "markdown", + "id": "96a8aae2-ad7a-4183-9476-d1e7f7acfadc", + "metadata": {}, + "source": [ + "##### `cardinal_direction_for_route_direction` also gets rid of a lot of stuff -> Fix this" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "59453366-e8a1-4758-813f-7feeedba18ed", + "metadata": {}, + "outputs": [], + "source": [ + "STOP_TIMES_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "872dda1a-c190-4dcf-8513-e0c2b52aee8c", + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_df = pd.read_parquet(\n", + " f\"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{one_analysis_date}.parquet\",\n", + " filters=[[(\"stop_primary_direction\", \"!=\", \"Unknown\")]],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "80428679-11b3-4c9a-9a69-6fae3e376580", + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_df2 = stop_times_df.loc[\n", + " stop_times_df.schedule_gtfs_dataset_key.isin(schd_keys)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "47cea347-de35-49cd-b5a7-8bdbe0c87a06", + "metadata": {}, + "outputs": [], + "source": [ + "trip_scheduled_col = [\n", + " \"route_id\",\n", + " \"trip_instance_key\",\n", + " \"gtfs_dataset_key\",\n", + " \"shape_array_key\",\n", + " \"direction_id\",\n", + " \"route_long_name\",\n", + " \"route_short_name\",\n", + " \"route_desc\",\n", + " \"name\",\n", + "]\n", + "\n", + "trips_df = helpers.import_scheduled_trips(\n", + " one_analysis_date, columns=trip_scheduled_col, get_pandas=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "b5aa4d89-791c-4a51-b538-27739a966e90", + "metadata": {}, + "outputs": [], + "source": [ + "merge_cols = [\"trip_instance_key\", \"schedule_gtfs_dataset_key\", \"shape_array_key\"]" + ] + }, + { + 
"cell_type": "code", + "execution_count": 43, + "id": "9de81f13-d613-4c01-8e73-fecb92d957f6", + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_with_trip = pd.merge(stop_times_df2, trips_df, on=merge_cols)" + ] + }, + { + "cell_type": "markdown", + "id": "38a3b647-05af-4ffb-a48e-abd60904dbf6", + "metadata": {}, + "source": [ + "##### Fill in `direction_id`" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "d4c73c2f-11e9-43b4-a6e2-25ff80bd3bbf", + "metadata": {}, + "outputs": [], + "source": [ + "stop_times_with_trip.direction_id = stop_times_with_trip.direction_id.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "a05fd233-fb33-4fc7-821f-1fe8ea14c049", + "metadata": {}, + "outputs": [], + "source": [ + "main_cols = [\"route_id\", \"schedule_gtfs_dataset_key\", \"direction_id\"]" + ] + }, + { + "cell_type": "markdown", + "id": "eb9a923e-ecb9-4163-ac7d-946ae8eab9c1", + "metadata": {}, + "source": [ + "##### Done: changed to `dropna=False` here too" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "6a79cf47-731a-470b-8639-59d3ceef9d2c", + "metadata": {}, + "outputs": [], + "source": [ + "agg1 = (\n", + " stop_times_with_trip.groupby(main_cols + [\"stop_primary_direction\"], dropna=False)\n", + " .agg({\"stop_sequence\": \"count\"})\n", + " .reset_index()\n", + " .rename(columns={\"stop_sequence\": \"total_stops\"})\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "2d88f775-4ca9-4889-a03f-9e8a2518dbfd", + "metadata": {}, + "outputs": [], + "source": [ + "agg2 = agg1.sort_values(\n", + " by=main_cols + [\"total_stops\"],\n", + " ascending=[True, True, True, False],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "625d2776-e1e6-466f-b38b-c202460fdd14", + "metadata": {}, + "source": [ + "##### There are values for `route_primary_direction`, but because `direction_id` is missing, they get dropped? 
\n", + "* AH: testing to see if filling `direction_id` with something will change things." + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "cd8096f3-35e9-4b45-8a28-13f66bb61991", + "metadata": {}, + "outputs": [], + "source": [ + "cardinal_dir_df = (\n", + " agg2.drop_duplicates(subset=main_cols)\n", + " .reset_index(drop=True)\n", + " .drop(columns=[\"total_stops\"])\n", + " .rename(columns={\"stop_primary_direction\": \"route_primary_direction\"})\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "52f702c5-b7eb-4fbe-a92b-e4eaadf19e80", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
route_idschedule_gtfs_dataset_keydirection_idroute_primary_direction
0173105f2d1cabc8170ab066d96863c5d50.00Northbound
11173105f2d1cabc8170ab066d96863c5d50.00Northbound
212X73105f2d1cabc8170ab066d96863c5d50.00Northbound
313X73105f2d1cabc8170ab066d96863c5d50.00Westbound
41B73105f2d1cabc8170ab066d96863c5d50.00Northbound
5273105f2d1cabc8170ab066d96863c5d50.00Westbound
62073105f2d1cabc8170ab066d96863c5d50.00Eastbound
7373105f2d1cabc8170ab066d96863c5d50.00Eastbound
83073105f2d1cabc8170ab066d96863c5d50.00Southbound
9473105f2d1cabc8170ab066d96863c5d50.00Southbound
10573105f2d1cabc8170ab066d96863c5d50.00Northbound
11673105f2d1cabc8170ab066d96863c5d50.00Northbound
12773105f2d1cabc8170ab066d96863c5d50.00Southbound
13873105f2d1cabc8170ab066d96863c5d50.00Eastbound
148a7c42f9-51e4-4848-bf88-30c210f149ad73105f2d1cabc8170ab066d96863c5d50.00Northbound
15973105f2d1cabc8170ab066d96863c5d50.00Westbound
16CCf5a749dd65924e025b1293c58f95f8d60.00Northbound
17CCf5a749dd65924e025b1293c58f95f8d61.00Southbound
18Mall73105f2d1cabc8170ab066d96863c5d50.00Eastbound
19SFf5a749dd65924e025b1293c58f95f8d60.00Eastbound
20SFf5a749dd65924e025b1293c58f95f8d61.00Westbound
21Shuttlef5a749dd65924e025b1293c58f95f8d60.00Eastbound
22Shuttlef5a749dd65924e025b1293c58f95f8d61.00Westbound
\n", + "
" + ], + "text/plain": [ + " route_id schedule_gtfs_dataset_key \\\n", + "0 1 73105f2d1cabc8170ab066d96863c5d5 \n", + "1 11 73105f2d1cabc8170ab066d96863c5d5 \n", + "2 12X 73105f2d1cabc8170ab066d96863c5d5 \n", + "3 13X 73105f2d1cabc8170ab066d96863c5d5 \n", + "4 1B 73105f2d1cabc8170ab066d96863c5d5 \n", + "5 2 73105f2d1cabc8170ab066d96863c5d5 \n", + "6 20 73105f2d1cabc8170ab066d96863c5d5 \n", + "7 3 73105f2d1cabc8170ab066d96863c5d5 \n", + "8 30 73105f2d1cabc8170ab066d96863c5d5 \n", + "9 4 73105f2d1cabc8170ab066d96863c5d5 \n", + "10 5 73105f2d1cabc8170ab066d96863c5d5 \n", + "11 6 73105f2d1cabc8170ab066d96863c5d5 \n", + "12 7 73105f2d1cabc8170ab066d96863c5d5 \n", + "13 8 73105f2d1cabc8170ab066d96863c5d5 \n", + "14 8a7c42f9-51e4-4848-bf88-30c210f149ad 73105f2d1cabc8170ab066d96863c5d5 \n", + "15 9 73105f2d1cabc8170ab066d96863c5d5 \n", + "16 CC f5a749dd65924e025b1293c58f95f8d6 \n", + "17 CC f5a749dd65924e025b1293c58f95f8d6 \n", + "18 Mall 73105f2d1cabc8170ab066d96863c5d5 \n", + "19 SF f5a749dd65924e025b1293c58f95f8d6 \n", + "20 SF f5a749dd65924e025b1293c58f95f8d6 \n", + "21 Shuttle f5a749dd65924e025b1293c58f95f8d6 \n", + "22 Shuttle f5a749dd65924e025b1293c58f95f8d6 \n", + "\n", + " direction_id route_primary_direction \n", + "0 0.00 Northbound \n", + "1 0.00 Northbound \n", + "2 0.00 Northbound \n", + "3 0.00 Westbound \n", + "4 0.00 Northbound \n", + "5 0.00 Westbound \n", + "6 0.00 Eastbound \n", + "7 0.00 Eastbound \n", + "8 0.00 Southbound \n", + "9 0.00 Southbound \n", + "10 0.00 Northbound \n", + "11 0.00 Northbound \n", + "12 0.00 Southbound \n", + "13 0.00 Eastbound \n", + "14 0.00 Northbound \n", + "15 0.00 Westbound \n", + "16 0.00 Northbound \n", + "17 1.00 Southbound \n", + "18 0.00 Eastbound \n", + "19 0.00 Eastbound \n", + "20 1.00 Westbound \n", + "21 0.00 Eastbound \n", + "22 1.00 Westbound " + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cardinal_dir_df" + ] + }, + { + "cell_type": 
"markdown", + "id": "b16e169e-fb71-49de-bd17-9769e96a83ce", + "metadata": {}, + "source": [ + "##### Returning to the `if __name__ == \"__main__\"` portion of `gtfs_funnel/schedule_stats_by_route`" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "37ad64d9-1a55-4925-90b6-c5a6425915f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_idis_coverageis_downtown_localis_localis_rapidis_expressis_rail
01770249a5a2e770ca90628434d4934b134070.00100100
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id direction_id is_coverage \\\n", + "0 1770249a5a2e770ca90628434d4934b1 3407 0.00 1 \n", + "\n", + " is_downtown_local is_local is_rapid is_express is_rail \n", + "0 0 0 1 0 0 " + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "route_typologies.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "d6f63dfb-9b18-41be-94a3-acdba99191f9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['schedule_gtfs_dataset_key', 'route_id', 'direction_id']" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "route_group_merge_cols" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "f6365ee6-6960-4988-a35e-fa8e787e1d3f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_id
073105f2d1cabc8170ab066d96863c5d570.00
373105f2d1cabc8170ab066d96863c5d560.00
673105f2d1cabc8170ab066d96863c5d580.00
973105f2d1cabc8170ab066d96863c5d5Mall0.00
1273105f2d1cabc8170ab066d96863c5d512X0.00
1573105f2d1cabc8170ab066d96863c5d513X0.00
1873105f2d1cabc8170ab066d96863c5d5110.00
2173105f2d1cabc8170ab066d96863c5d5300.00
24f5a749dd65924e025b1293c58f95f8d6Shuttle1.00
27f5a749dd65924e025b1293c58f95f8d6Shuttle0.00
3073105f2d1cabc8170ab066d96863c5d58a7c42f9-51e4-4848-bf88-30c210f149ad0.00
3373105f2d1cabc8170ab066d96863c5d520.00
3673105f2d1cabc8170ab066d96863c5d530.00
3973105f2d1cabc8170ab066d96863c5d51B0.00
4273105f2d1cabc8170ab066d96863c5d5200.00
4573105f2d1cabc8170ab066d96863c5d550.00
4873105f2d1cabc8170ab066d96863c5d540.00
5173105f2d1cabc8170ab066d96863c5d590.00
5473105f2d1cabc8170ab066d96863c5d510.00
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id \\\n", + "0 73105f2d1cabc8170ab066d96863c5d5 7 \n", + "3 73105f2d1cabc8170ab066d96863c5d5 6 \n", + "6 73105f2d1cabc8170ab066d96863c5d5 8 \n", + "9 73105f2d1cabc8170ab066d96863c5d5 Mall \n", + "12 73105f2d1cabc8170ab066d96863c5d5 12X \n", + "15 73105f2d1cabc8170ab066d96863c5d5 13X \n", + "18 73105f2d1cabc8170ab066d96863c5d5 11 \n", + "21 73105f2d1cabc8170ab066d96863c5d5 30 \n", + "24 f5a749dd65924e025b1293c58f95f8d6 Shuttle \n", + "27 f5a749dd65924e025b1293c58f95f8d6 Shuttle \n", + "30 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n", + "33 73105f2d1cabc8170ab066d96863c5d5 2 \n", + "36 73105f2d1cabc8170ab066d96863c5d5 3 \n", + "39 73105f2d1cabc8170ab066d96863c5d5 1B \n", + "42 73105f2d1cabc8170ab066d96863c5d5 20 \n", + "45 73105f2d1cabc8170ab066d96863c5d5 5 \n", + "48 73105f2d1cabc8170ab066d96863c5d5 4 \n", + "51 73105f2d1cabc8170ab066d96863c5d5 9 \n", + "54 73105f2d1cabc8170ab066d96863c5d5 1 \n", + "\n", + " direction_id \n", + "0 0.00 \n", + "3 0.00 \n", + "6 0.00 \n", + "9 0.00 \n", + "12 0.00 \n", + "15 0.00 \n", + "18 0.00 \n", + "21 0.00 \n", + "24 1.00 \n", + "27 0.00 \n", + "30 0.00 \n", + "33 0.00 \n", + "36 0.00 \n", + "39 0.00 \n", + "42 0.00 \n", + "45 0.00 \n", + "48 0.00 \n", + "51 0.00 \n", + "54 0.00 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "preview(route_dir_metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "0b0fedf8-43d2-4ee6-8097-1e15a58b6d27", + "metadata": {}, + "outputs": [], + "source": [ + "route_dir_metrics2 = pd.merge(\n", + " route_dir_metrics, route_typologies, on=route_group_merge_cols, how=\"left\"\n", + ").merge(cardinal_dir_df, on=route_group_merge_cols, how=\"left\")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "0fa520ce-3308-4b4c-a67c-a1a79f16696e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['7', '6', '8', 'Mall', 
'12X', '13X', '11', '30', 'Shuttle',\n", + " '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', '5',\n", + " '4', '9', '1'], dtype=object)" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "route_dir_metrics2.route_id.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "58dc075e-61eb-457c-af0a-ef6861ef7db2", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "' route_dir_metrics2.drop(\\n columns=[\\n \"geometry\",\\n \"common_shape_id\",\\n \"geometry\",\\n \"route_name\",\\n \"is_coverage\",\\n \"is_downtown_local\",\\n \"is_local\",\\n \"is_rapid\",\\n \"is_express\",\\n \"is_rail\",\\n \"schedule_gtfs_dataset_key\"\\n ]\\n).sort_values(by=[\"route_id\",\"direction_id\"])'" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\" route_dir_metrics2.drop(\n", + " columns=[\n", + " \"geometry\",\n", + " \"common_shape_id\",\n", + " \"geometry\",\n", + " \"route_name\",\n", + " \"is_coverage\",\n", + " \"is_downtown_local\",\n", + " \"is_local\",\n", + " \"is_rapid\",\n", + " \"is_express\",\n", + " \"is_rail\",\n", + " \"schedule_gtfs_dataset_key\"\n", + " ]\n", + ").sort_values(by=[\"route_id\",\"direction_id\"])\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "bc23d1cc-a2fb-494a-bd6c-3adac7781763", + "metadata": {}, + "source": [ + "##### Double check that the columns are the same." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "542a0761-c5c9-4189-a620-b89da1e3bc5d", + "metadata": {}, + "outputs": [], + "source": [ + "og_nov_url = \"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-11-13.parquet\"" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "ed839532-8f41-46a2-a42b-66ec7454c94e", + "metadata": {}, + "outputs": [], + "source": [ + "df_sched_og = gpd.read_parquet(og_nov_url)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "f9fa795e-0793-4786-b589-fa79f713d95d", + "metadata": {}, + "outputs": [], + "source": [ + "df_sched_og = df_sched_og.loc[df_sched_og.schedule_gtfs_dataset_key.isin(schd_keys)]" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "c95c9ca7-ab97-4421-878b-97c964b72663", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
route_iddirection_idtime_periodpeak_offpeak
224850.00all_dayNone
224950.00Noneoffpeak
225050.00Nonepeak
1849Shuttle0.00all_dayNone
1850Shuttle0.00Noneoffpeak
1851Shuttle0.00Nonepeak
1846Shuttle1.00all_dayNone
1847Shuttle1.00Noneoffpeak
1848Shuttle1.00Nonepeak
\n", + "
" + ], + "text/plain": [ + " route_id direction_id time_period peak_offpeak\n", + "2248 5 0.00 all_day None\n", + "2249 5 0.00 None offpeak\n", + "2250 5 0.00 None peak\n", + "1849 Shuttle 0.00 all_day None\n", + "1850 Shuttle 0.00 None offpeak\n", + "1851 Shuttle 0.00 None peak\n", + "1846 Shuttle 1.00 all_day None\n", + "1847 Shuttle 1.00 None offpeak\n", + "1848 Shuttle 1.00 None peak" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_sched_og[[\"route_id\", \"direction_id\", \"time_period\", \"peak_offpeak\"]].sort_values(\n", + " by=[\n", + " \"route_id\",\n", + " \"direction_id\",\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "b579cb53-8b01-4418-a2f7-8f597cf852fd", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
route_iddirection_idtime_periodpeak_offpeak
5410.00all_dayNaN
5510.00offpeakoffpeak
5610.00peakpeak
18110.00all_dayNaN
19110.00offpeakoffpeak
20110.00peakpeak
1212X0.00all_dayNaN
1312X0.00offpeakoffpeak
1412X0.00peakpeak
1513X0.00all_dayNaN
1613X0.00offpeakoffpeak
1713X0.00peakpeak
391B0.00all_dayNaN
401B0.00offpeakoffpeak
411B0.00peakpeak
3320.00all_dayNaN
3420.00offpeakoffpeak
3520.00peakpeak
42200.00all_dayNaN
43200.00offpeakoffpeak
44200.00peakpeak
3630.00all_dayNaN
3730.00offpeakoffpeak
3830.00peakpeak
21300.00all_dayNaN
22300.00offpeakoffpeak
23300.00peakpeak
4840.00all_dayNaN
4940.00offpeakoffpeak
5040.00peakpeak
4550.00all_dayNaN
4650.00offpeakoffpeak
4750.00peakpeak
360.00all_dayNaN
460.00offpeakoffpeak
560.00peakpeak
070.00all_dayNaN
170.00offpeakoffpeak
270.00peakpeak
680.00all_dayNaN
780.00offpeakoffpeak
880.00peakpeak
308a7c42f9-51e4-4848-bf88-30c210f149ad0.00all_dayNaN
318a7c42f9-51e4-4848-bf88-30c210f149ad0.00offpeakoffpeak
328a7c42f9-51e4-4848-bf88-30c210f149ad0.00peakpeak
5190.00all_dayNaN
5290.00offpeakoffpeak
5390.00peakpeak
9Mall0.00all_dayNaN
10Mall0.00offpeakoffpeak
11Mall0.00peakpeak
27Shuttle0.00all_dayNaN
28Shuttle0.00offpeakoffpeak
29Shuttle0.00peakpeak
24Shuttle1.00all_dayNaN
25Shuttle1.00offpeakoffpeak
26Shuttle1.00peakpeak
\n", + "
" + ], + "text/plain": [ + " route_id direction_id time_period \\\n", + "54 1 0.00 all_day \n", + "55 1 0.00 offpeak \n", + "56 1 0.00 peak \n", + "18 11 0.00 all_day \n", + "19 11 0.00 offpeak \n", + "20 11 0.00 peak \n", + "12 12X 0.00 all_day \n", + "13 12X 0.00 offpeak \n", + "14 12X 0.00 peak \n", + "15 13X 0.00 all_day \n", + "16 13X 0.00 offpeak \n", + "17 13X 0.00 peak \n", + "39 1B 0.00 all_day \n", + "40 1B 0.00 offpeak \n", + "41 1B 0.00 peak \n", + "33 2 0.00 all_day \n", + "34 2 0.00 offpeak \n", + "35 2 0.00 peak \n", + "42 20 0.00 all_day \n", + "43 20 0.00 offpeak \n", + "44 20 0.00 peak \n", + "36 3 0.00 all_day \n", + "37 3 0.00 offpeak \n", + "38 3 0.00 peak \n", + "21 30 0.00 all_day \n", + "22 30 0.00 offpeak \n", + "23 30 0.00 peak \n", + "48 4 0.00 all_day \n", + "49 4 0.00 offpeak \n", + "50 4 0.00 peak \n", + "45 5 0.00 all_day \n", + "46 5 0.00 offpeak \n", + "47 5 0.00 peak \n", + "3 6 0.00 all_day \n", + "4 6 0.00 offpeak \n", + "5 6 0.00 peak \n", + "0 7 0.00 all_day \n", + "1 7 0.00 offpeak \n", + "2 7 0.00 peak \n", + "6 8 0.00 all_day \n", + "7 8 0.00 offpeak \n", + "8 8 0.00 peak \n", + "30 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 all_day \n", + "31 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 offpeak \n", + "32 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 peak \n", + "51 9 0.00 all_day \n", + "52 9 0.00 offpeak \n", + "53 9 0.00 peak \n", + "9 Mall 0.00 all_day \n", + "10 Mall 0.00 offpeak \n", + "11 Mall 0.00 peak \n", + "27 Shuttle 0.00 all_day \n", + "28 Shuttle 0.00 offpeak \n", + "29 Shuttle 0.00 peak \n", + "24 Shuttle 1.00 all_day \n", + "25 Shuttle 1.00 offpeak \n", + "26 Shuttle 1.00 peak \n", + "\n", + " peak_offpeak \n", + "54 NaN \n", + "55 offpeak \n", + "56 peak \n", + "18 NaN \n", + "19 offpeak \n", + "20 peak \n", + "12 NaN \n", + "13 offpeak \n", + "14 peak \n", + "15 NaN \n", + "16 offpeak \n", + "17 peak \n", + "39 NaN \n", + "40 offpeak \n", + "41 peak \n", + "33 NaN \n", + "34 offpeak \n", + "35 peak \n", + "42 NaN 
\n", + "43 offpeak \n", + "44 peak \n", + "36 NaN \n", + "37 offpeak \n", + "38 peak \n", + "21 NaN \n", + "22 offpeak \n", + "23 peak \n", + "48 NaN \n", + "49 offpeak \n", + "50 peak \n", + "45 NaN \n", + "46 offpeak \n", + "47 peak \n", + "3 NaN \n", + "4 offpeak \n", + "5 peak \n", + "0 NaN \n", + "1 offpeak \n", + "2 peak \n", + "6 NaN \n", + "7 offpeak \n", + "8 peak \n", + "30 NaN \n", + "31 offpeak \n", + "32 peak \n", + "51 NaN \n", + "52 offpeak \n", + "53 peak \n", + "9 NaN \n", + "10 offpeak \n", + "11 peak \n", + "27 NaN \n", + "28 offpeak \n", + "29 peak \n", + "24 NaN \n", + "25 offpeak \n", + "26 peak " + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "route_dir_metrics2[\n", + " [\"route_id\", \"direction_id\", \"time_period\", \"peak_offpeak\"]\n", + "].sort_values(\n", + " by=[\n", + " \"route_id\",\n", + " \"direction_id\",\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8b09dce-9d6a-48f9-974e-06c7c5fff1d5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "b4f948fa-91d7-47d0-a3e7-3dd37dce9bbc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ True, True, True, True, True, True, True, True, True,\n", + " True, True, True, True, True, True, True, True, True,\n", + " True])" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "route_dir_metrics2.columns == df_sched_og.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "b791a48e-102c-4cae-81f2-2c109ef9a3b4", + "metadata": {}, + "outputs": [], + "source": [ + "df_sched = route_dir_metrics2.copy()" + ] + }, + { + "cell_type": "markdown", + "id": "995a5884-ee8c-4797-b664-0e09a94235ad", + "metadata": {}, + "source": [ + "#### DONE `gtfs_digest/merge_data` line 300 `df_avg_speeds` is also missing a lot of routes.\n", + 
"* [File `rt_segment_speeds/scripts/average_summary_speeds.py`](https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/average_summary_speeds.py)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "b4a704ba-b34c-496b-bff9-452e3ae124cd", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "sys.path.append(\"../rt_segment_speeds/scripts/\")\n", + "import average_segment_speeds\n", + "import average_summary_speeds\n", + "from segment_speed_utils import (\n", + " gtfs_schedule_wrangling,\n", + " helpers,\n", + " metrics,\n", + " segment_calcs,\n", + " time_series_utils,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "2f364b44-3fbd-4514-9f9c-74e3dd5d0903", + "metadata": {}, + "outputs": [], + "source": [ + "df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "762a40b1-a498-41c3-95ef-0b2527c9bc71", + "metadata": {}, + "outputs": [], + "source": [ + "df_avg_speeds2 = df_avg_speeds.loc[\n", + " df_avg_speeds.schedule_gtfs_dataset_key.isin(schd_keys)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "2c3dfd99-f530-4b0f-b7d6-2c87f47b3e7d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5 3\n", + "Name: route_id, dtype: int64" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_avg_speeds2.route_id.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "7d82ddad-4e51-4be7-9850-e9547dfaec0a", + "metadata": {}, + "source": [ + "##### See what is in `rt_segment_speeds/scripts/average_segment_speeds.concatenate_trip_segment_speeds`" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "6763f5cf-421e-43e1-a504-4dd15cbc3038", + "metadata": {}, + "outputs": [], + "source": [ + "segment_type = \"stop_segments\"" + ] + }, + { + "cell_type": "code", + 
"execution_count": 68, + "id": "78cd0972-c79e-47fb-8972-bfa0e233985e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "concatenated files\n" + ] + } + ], + "source": [ + "df = average_segment_speeds.concatenate_trip_segment_speeds(\n", + " analysis_date_list, segment_type\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "870cf0c1-ac4d-4dfc-9132-333aaea46bdf", + "metadata": { + "tags": [] + }, + "source": [ + "##### Done. Amanda: filled in `nans` with 0." + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "5bfa6408-51be-4127-abb6-cb9d1f384a35", + "metadata": {}, + "outputs": [], + "source": [ + "df.direction_id = df.direction_id.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "4cedd5c3-6321-4025-b126-9b1f36b68848", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)]" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "41b802f0-e199-4c39-b11d-32365469a99a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['30', '3', '20', '4', '5', '11', '7', '9', '1', '12X', '6', '2',\n", + " '8', '8a7c42f9-51e4-4848-bf88-30c210f149ad', 'CC'], dtype=object)" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.route_id.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "b6478bf3-b6e0-433d-a54d-a061620f379c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3543, 17)" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "5dc4ba07-6b4d-4532-a368-390ea0c55364", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 3543 entries, 159381 to 2656608\n", + "Data 
columns (total 17 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 schedule_gtfs_dataset_key 3543 non-null object \n", + " 1 shape_array_key 3543 non-null object \n", + " 2 shape_id 3543 non-null object \n", + " 3 stop_sequence 3543 non-null int64 \n", + " 4 route_id 3543 non-null object \n", + " 5 direction_id 3543 non-null float64 \n", + " 6 stop_pair 3543 non-null object \n", + " 7 stop_pair_name 3543 non-null object \n", + " 8 trip_instance_key 3543 non-null object \n", + " 9 speed_mph 3543 non-null float64 \n", + " 10 meters_elapsed 3543 non-null float64 \n", + " 11 sec_elapsed 3543 non-null float64 \n", + " 12 time_of_day 3543 non-null object \n", + " 13 arrival_time 3543 non-null datetime64[ns]\n", + " 14 service_date 3543 non-null datetime64[ns]\n", + " 15 peak_offpeak 3543 non-null object \n", + " 16 weekday_weekend 3543 non-null object \n", + "dtypes: datetime64[ns](2), float64(4), int64(1), object(10)\n", + "memory usage: 498.2+ KB\n" + ] + } + ], + "source": [ + "df2.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "4942f2a9-fa29-488a-b307-3a084cfaad2c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['PM Peak', 'Early AM', 'Midday', 'AM Peak', 'Evening'],\n", + " dtype=object)" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.time_of_day.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "54f81eed-ee1f-4061-be2d-523d6a009547", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['peak', 'offpeak'], dtype=object)" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.peak_offpeak.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "1305f738-38d6-4fa3-95f5-f7f35f65f435", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stop_pair
route_iddirection_id
10.0023
110.0019
12X0.0014
20.0031
200.007
30.0020
300.0027
40.0025
50.0034
60.0029
70.0014
80.0018
8a7c42f9-51e4-4848-bf88-30c210f149ad0.0019
90.0026
CC0.003
1.002
\n", + "
" + ], + "text/plain": [ + " stop_pair\n", + "route_id direction_id \n", + "1 0.00 23\n", + "11 0.00 19\n", + "12X 0.00 14\n", + "2 0.00 31\n", + "20 0.00 7\n", + "3 0.00 20\n", + "30 0.00 27\n", + "4 0.00 25\n", + "5 0.00 34\n", + "6 0.00 29\n", + "7 0.00 14\n", + "8 0.00 18\n", + "8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 19\n", + "9 0.00 26\n", + "CC 0.00 3\n", + " 1.00 2" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.groupby([\"route_id\", \"direction_id\"]).agg({\"stop_pair\": \"nunique\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "5d83b85b-fbbb-4a3a-8954-91acfd40f5a9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
159381159382159383159384
schedule_gtfs_dataset_key73105f2d1cabc8170ab066d96863c5d573105f2d1cabc8170ab066d96863c5d573105f2d1cabc8170ab066d96863c5d573105f2d1cabc8170ab066d96863c5d5
shape_array_keyc6e9cda0db8bf76bc535f590ca1fccb5c6e9cda0db8bf76bc535f590ca1fccb5c6e9cda0db8bf76bc535f590ca1fccb5c6e9cda0db8bf76bc535f590ca1fccb5
shape_id8746730d-27f9-4fb2-9f52-987afe3569298746730d-27f9-4fb2-9f52-987afe3569298746730d-27f9-4fb2-9f52-987afe3569298746730d-27f9-4fb2-9f52-987afe356929
stop_sequence2233
route_id30303030
direction_id0.000.000.000.00
stop_pairf09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406cf09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c47def414-f158-496a-91cb-5f3fb0aa406c__a94160c1-bd99-4898-921f-941aa748ce6f47def414-f158-496a-91cb-5f3fb0aa406c__a94160c1-bd99-4898-921f-941aa748ce6f
stop_pair_nameBroadway at Stowell__Betteravia at Miller (Panda Express)Broadway at Stowell__Betteravia at Miller (Panda Express)Betteravia at Miller (Panda Express)__McCoy at Broadway (Outbound)Betteravia at Miller (Panda Express)__McCoy at Broadway (Outbound)
trip_instance_key005bb393ed8b22ca4d8e7cc8d7895231217b90defbc6c69f05e19d16e96d1e3f005bb393ed8b22ca4d8e7cc8d7895231217b90defbc6c69f05e19d16e96d1e3f
speed_mph13.2113.8918.8817.04
meters_elapsed1930.841930.841409.451409.45
sec_elapsed327.00311.00167.00185.00
time_of_dayPM PeakEarly AMPM PeakEarly AM
arrival_time2024-11-13 15:23:452024-11-13 06:21:232024-11-13 15:29:122024-11-13 06:26:34
service_date2024-11-13 00:00:002024-11-13 00:00:002024-11-13 00:00:002024-11-13 00:00:00
peak_offpeakpeakoffpeakpeakoffpeak
weekday_weekendweekdayweekdayweekdayweekday
\n", + "
" + ], + "text/plain": [ + " 159381 \\\n", + "schedule_gtfs_dataset_key 73105f2d1cabc8170ab066d96863c5d5 \n", + "shape_array_key c6e9cda0db8bf76bc535f590ca1fccb5 \n", + "shape_id 8746730d-27f9-4fb2-9f52-987afe356929 \n", + "stop_sequence 2 \n", + "route_id 30 \n", + "direction_id 0.00 \n", + "stop_pair f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c \n", + "stop_pair_name Broadway at Stowell__Betteravia at Miller (Panda Express) \n", + "trip_instance_key 005bb393ed8b22ca4d8e7cc8d7895231 \n", + "speed_mph 13.21 \n", + "meters_elapsed 1930.84 \n", + "sec_elapsed 327.00 \n", + "time_of_day PM Peak \n", + "arrival_time 2024-11-13 15:23:45 \n", + "service_date 2024-11-13 00:00:00 \n", + "peak_offpeak peak \n", + "weekday_weekend weekday \n", + "\n", + " 159382 \\\n", + "schedule_gtfs_dataset_key 73105f2d1cabc8170ab066d96863c5d5 \n", + "shape_array_key c6e9cda0db8bf76bc535f590ca1fccb5 \n", + "shape_id 8746730d-27f9-4fb2-9f52-987afe356929 \n", + "stop_sequence 2 \n", + "route_id 30 \n", + "direction_id 0.00 \n", + "stop_pair f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c \n", + "stop_pair_name Broadway at Stowell__Betteravia at Miller (Panda Express) \n", + "trip_instance_key 217b90defbc6c69f05e19d16e96d1e3f \n", + "speed_mph 13.89 \n", + "meters_elapsed 1930.84 \n", + "sec_elapsed 311.00 \n", + "time_of_day Early AM \n", + "arrival_time 2024-11-13 06:21:23 \n", + "service_date 2024-11-13 00:00:00 \n", + "peak_offpeak offpeak \n", + "weekday_weekend weekday \n", + "\n", + " 159383 \\\n", + "schedule_gtfs_dataset_key 73105f2d1cabc8170ab066d96863c5d5 \n", + "shape_array_key c6e9cda0db8bf76bc535f590ca1fccb5 \n", + "shape_id 8746730d-27f9-4fb2-9f52-987afe356929 \n", + "stop_sequence 3 \n", + "route_id 30 \n", + "direction_id 0.00 \n", + "stop_pair 47def414-f158-496a-91cb-5f3fb0aa406c__a94160c1-bd99-4898-921f-941aa748ce6f \n", + "stop_pair_name Betteravia at Miller (Panda Express)__McCoy at Broadway (Outbound) \n", + 
"trip_instance_key 005bb393ed8b22ca4d8e7cc8d7895231 \n", + "speed_mph 18.88 \n", + "meters_elapsed 1409.45 \n", + "sec_elapsed 167.00 \n", + "time_of_day PM Peak \n", + "arrival_time 2024-11-13 15:29:12 \n", + "service_date 2024-11-13 00:00:00 \n", + "peak_offpeak peak \n", + "weekday_weekend weekday \n", + "\n", + " 159384 \n", + "schedule_gtfs_dataset_key 73105f2d1cabc8170ab066d96863c5d5 \n", + "shape_array_key c6e9cda0db8bf76bc535f590ca1fccb5 \n", + "shape_id 8746730d-27f9-4fb2-9f52-987afe356929 \n", + "stop_sequence 3 \n", + "route_id 30 \n", + "direction_id 0.00 \n", + "stop_pair 47def414-f158-496a-91cb-5f3fb0aa406c__a94160c1-bd99-4898-921f-941aa748ce6f \n", + "stop_pair_name Betteravia at Miller (Panda Express)__McCoy at Broadway (Outbound) \n", + "trip_instance_key 217b90defbc6c69f05e19d16e96d1e3f \n", + "speed_mph 17.04 \n", + "meters_elapsed 1409.45 \n", + "sec_elapsed 185.00 \n", + "time_of_day Early AM \n", + "arrival_time 2024-11-13 06:26:34 \n", + "service_date 2024-11-13 00:00:00 \n", + "peak_offpeak offpeak \n", + "weekday_weekend weekday " + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.head(4).T" + ] + }, + { + "cell_type": "markdown", + "id": "beb7e517-e8e8-4257-ac60-c55e755a81a9", + "metadata": {}, + "source": [ + "##### Now moving onto the function `rt_segment_speeds/scripts/average_segment_speeds/segment_averages()`" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "6504f04f-fe58-47d8-aaf6-788d064dd03c", + "metadata": {}, + "outputs": [], + "source": [ + "dict_inputs = GTFS_DATA_DICT[segment_type]" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "8c0e863f-66da-4026-839e-25fcd23d5ef6", + "metadata": {}, + "outputs": [], + "source": [ + "OPERATOR_COLS = [\n", + " \"schedule_gtfs_dataset_key\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "0a40a434-cd05-4ba3-acb2-08063e1bc3b7", + "metadata": {}, + 
"outputs": [], + "source": [ + "ROUTE_DIR_COLS = [*dict_inputs[\"route_dir_cols\"]]\n", + "STOP_PAIR_COLS = [*dict_inputs[\"stop_pair_cols\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "460b4f1a-0253-4a13-8a97-3ee365d2da83", + "metadata": {}, + "outputs": [], + "source": [ + "group_cols = OPERATOR_COLS + ROUTE_DIR_COLS + STOP_PAIR_COLS" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "55a6a69a-98d4-4e0e-a5e8-2143ea15bfa0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['schedule_gtfs_dataset_key',\n", + " 'route_id',\n", + " 'direction_id',\n", + " 'stop_pair',\n", + " 'stop_pair_name']" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "group_cols" + ] + }, + { + "cell_type": "markdown", + "id": "6577b12e-541a-473a-9ae4-c2d763f0b383", + "metadata": {}, + "source": [ + "##### Done. Added `dropna=False` to `rt_segment_speeds/segment_speed_utils/segment_calcs.calculate_avg_speeds`" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "09c1fea8-8efb-42f0-9229-42f6cb63ebdb", + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_avg_speeds(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:\n", + " \"\"\"\n", + " Calculate the median, 20th, and 80th percentile speeds\n", + " by groups.\n", + " \"\"\"\n", + " # pd.groupby and pd.quantile is so slow\n", + " # create our own list of speeds and use np\n", + " df2 = (\n", + " df.groupby(group_cols, observed=True, group_keys=False, dropna=False)\n", + " .agg({\"speed_mph\": lambda x: sorted(list(x))})\n", + " .reset_index()\n", + " .rename(columns={\"speed_mph\": \"speed_mph_list\"})\n", + " )\n", + "\n", + " df2 = df2.assign(\n", + " p50_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=50), axis=1),\n", + " n_trips=df2.apply(lambda x: len(x.speed_mph_list), axis=1).astype(\"int16\"),\n", + " p20_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, 
q=20), axis=1),\n", + " p80_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=80), axis=1),\n", + " )\n", + "\n", + " stats = df2.drop(columns=\"speed_mph_list\")\n", + "\n", + " # Clean up for map\n", + " speed_cols = [c for c in stats.columns if \"_mph\" in c]\n", + " stats[speed_cols] = stats[speed_cols].round(2)\n", + "\n", + " return stats" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "2e7944c7-211d-4098-aa6d-15150ff8d3c0", + "metadata": {}, + "outputs": [], + "source": [ + "avg_speeds = calculate_avg_speeds(\n", + " df2,\n", + " group_cols + [\"time_of_day\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "2efce226-fb81-4353-822d-5044f7b4f164", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_idstop_pairstop_pair_nametime_of_dayp50_mphn_tripsp20_mphp80_mph
073105f2d1cabc8170ab066d96863c5d510.001c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644Broadway at Hermosa__Broadway at FeslerAM Peak32.35410.4151.44
173105f2d1cabc8170ab066d96863c5d510.001c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644Broadway at Hermosa__Broadway at FeslerEarly AM20.20120.2020.20
273105f2d1cabc8170ab066d96863c5d510.001c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644Broadway at Hermosa__Broadway at FeslerEvening13.37113.3713.37
373105f2d1cabc8170ab066d96863c5d510.001c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644Broadway at Hermosa__Broadway at FeslerMidday15.9661.0020.20
473105f2d1cabc8170ab066d96863c5d510.001c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644Broadway at Hermosa__Broadway at FeslerPM Peak14.6650.9316.33
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id direction_id \\\n", + "0 73105f2d1cabc8170ab066d96863c5d5 1 0.00 \n", + "1 73105f2d1cabc8170ab066d96863c5d5 1 0.00 \n", + "2 73105f2d1cabc8170ab066d96863c5d5 1 0.00 \n", + "3 73105f2d1cabc8170ab066d96863c5d5 1 0.00 \n", + "4 73105f2d1cabc8170ab066d96863c5d5 1 0.00 \n", + "\n", + " stop_pair \\\n", + "0 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 \n", + "1 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 \n", + "2 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 \n", + "3 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 \n", + "4 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 \n", + "\n", + " stop_pair_name time_of_day p50_mph n_trips \\\n", + "0 Broadway at Hermosa__Broadway at Fesler AM Peak 32.35 4 \n", + "1 Broadway at Hermosa__Broadway at Fesler Early AM 20.20 1 \n", + "2 Broadway at Hermosa__Broadway at Fesler Evening 13.37 1 \n", + "3 Broadway at Hermosa__Broadway at Fesler Midday 15.96 6 \n", + "4 Broadway at Hermosa__Broadway at Fesler PM Peak 14.66 5 \n", + "\n", + " p20_mph p80_mph \n", + "0 10.41 51.44 \n", + "1 20.20 20.20 \n", + "2 13.37 13.37 \n", + "3 1.00 20.20 \n", + "4 0.93 16.33 " + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "avg_speeds.head()" + ] + }, + { + "cell_type": "markdown", + "id": "1d64908f-c9cc-42a9-a9e2-11ca0b517cb0", + "metadata": {}, + "source": [ + "##### Go back to `rt_segment_speeds/scripts/average_segment_speeds.segment_averages()`" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "64313006-18d1-4722-9919-97a0c338e8a0", + "metadata": {}, + "outputs": [], + "source": [ + "avg_speeds2 = avg_speeds.pipe(\n", + " gtfs_schedule_wrangling.merge_operator_identifiers,\n", + " analysis_date_list,\n", + " columns=average_segment_speeds.CROSSWALK_COLS,\n", + 
")" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "6b9547c7-e6cf-4a42-b735-6dcb6b0e5d83", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 1120 entries, 0 to 1119\n", + "Data columns (total 15 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 schedule_gtfs_dataset_key 1120 non-null object \n", + " 1 route_id 1120 non-null object \n", + " 2 direction_id 1120 non-null float64\n", + " 3 stop_pair 1120 non-null object \n", + " 4 stop_pair_name 1120 non-null object \n", + " 5 time_of_day 1120 non-null object \n", + " 6 p50_mph 1120 non-null float64\n", + " 7 n_trips 1120 non-null int16 \n", + " 8 p20_mph 1120 non-null float64\n", + " 9 p80_mph 1120 non-null float64\n", + " 10 name 1120 non-null object \n", + " 11 caltrans_district 1120 non-null object \n", + " 12 organization_source_record_id 1120 non-null object \n", + " 13 organization_name 1120 non-null object \n", + " 14 base64_url 1120 non-null object \n", + "dtypes: float64(4), int16(1), object(10)\n", + "memory usage: 133.4+ KB\n" + ] + } + ], + "source": [ + "avg_speeds2.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "cb5e175d-9836-4497-9cc0-b4397f63afc5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_id
073105f2d1cabc8170ab066d96863c5d510.00
9173105f2d1cabc8170ab066d96863c5d5110.00
14873105f2d1cabc8170ab066d96863c5d512X0.00
20473105f2d1cabc8170ab066d96863c5d520.00
31573105f2d1cabc8170ab066d96863c5d5200.00
33473105f2d1cabc8170ab066d96863c5d530.00
40973105f2d1cabc8170ab066d96863c5d5300.00
49273105f2d1cabc8170ab066d96863c5d540.00
58473105f2d1cabc8170ab066d96863c5d550.00
71773105f2d1cabc8170ab066d96863c5d560.00
82973105f2d1cabc8170ab066d96863c5d570.00
88773105f2d1cabc8170ab066d96863c5d580.00
95273105f2d1cabc8170ab066d96863c5d58a7c42f9-51e4-4848-bf88-30c210f149ad0.00
102873105f2d1cabc8170ab066d96863c5d590.00
1115f5a749dd65924e025b1293c58f95f8d6CC0.00
1118f5a749dd65924e025b1293c58f95f8d6CC1.00
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id \\\n", + "0 73105f2d1cabc8170ab066d96863c5d5 1 \n", + "91 73105f2d1cabc8170ab066d96863c5d5 11 \n", + "148 73105f2d1cabc8170ab066d96863c5d5 12X \n", + "204 73105f2d1cabc8170ab066d96863c5d5 2 \n", + "315 73105f2d1cabc8170ab066d96863c5d5 20 \n", + "334 73105f2d1cabc8170ab066d96863c5d5 3 \n", + "409 73105f2d1cabc8170ab066d96863c5d5 30 \n", + "492 73105f2d1cabc8170ab066d96863c5d5 4 \n", + "584 73105f2d1cabc8170ab066d96863c5d5 5 \n", + "717 73105f2d1cabc8170ab066d96863c5d5 6 \n", + "829 73105f2d1cabc8170ab066d96863c5d5 7 \n", + "887 73105f2d1cabc8170ab066d96863c5d5 8 \n", + "952 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n", + "1028 73105f2d1cabc8170ab066d96863c5d5 9 \n", + "1115 f5a749dd65924e025b1293c58f95f8d6 CC \n", + "1118 f5a749dd65924e025b1293c58f95f8d6 CC \n", + "\n", + " direction_id \n", + "0 0.00 \n", + "91 0.00 \n", + "148 0.00 \n", + "204 0.00 \n", + "315 0.00 \n", + "334 0.00 \n", + "409 0.00 \n", + "492 0.00 \n", + "584 0.00 \n", + "717 0.00 \n", + "829 0.00 \n", + "887 0.00 \n", + "952 0.00 \n", + "1028 0.00 \n", + "1115 0.00 \n", + "1118 1.00 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "preview(avg_speeds2)" + ] + }, + { + "cell_type": "markdown", + "id": "71124cd8-df34-42e6-99b2-1e72d1c888fa", + "metadata": {}, + "source": [ + "##### Move on to `rt_segment_speeds/scripts/average_segment_speeds/merge_in_segment_geometry()`\n", + "* Original function=only 3 routes showing...Check it out." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "41f1b87c-5a5f-4909-bf13-63f6a430a16c", + "metadata": {}, + "outputs": [], + "source": [ + "avg_speeds_with_geom = average_segment_speeds.merge_in_segment_geometry(\n", + " avg_speeds2, one_analysis_date, segment_type\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "4a371afd-a828-41a0-afc6-f581bfffe175", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_id
073105f2d1cabc8170ab066d96863c5d5300.00
11073105f2d1cabc8170ab066d96863c5d530.00
20273105f2d1cabc8170ab066d96863c5d5200.00
22173105f2d1cabc8170ab066d96863c5d540.00
33973105f2d1cabc8170ab066d96863c5d550.00
47273105f2d1cabc8170ab066d96863c5d5110.00
52973105f2d1cabc8170ab066d96863c5d570.00
58773105f2d1cabc8170ab066d96863c5d590.00
69273105f2d1cabc8170ab066d96863c5d510.00
79073105f2d1cabc8170ab066d96863c5d512X0.00
84673105f2d1cabc8170ab066d96863c5d560.00
96573105f2d1cabc8170ab066d96863c5d520.00
110173105f2d1cabc8170ab066d96863c5d580.00
1166f5a749dd65924e025b1293c58f95f8d6CC1.00
1169f5a749dd65924e025b1293c58f95f8d6CC0.00
118773105f2d1cabc8170ab066d96863c5d58a7c42f9-51e4-4848-bf88-30c210f149ad0.00
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id \\\n", + "0 73105f2d1cabc8170ab066d96863c5d5 30 \n", + "110 73105f2d1cabc8170ab066d96863c5d5 3 \n", + "202 73105f2d1cabc8170ab066d96863c5d5 20 \n", + "221 73105f2d1cabc8170ab066d96863c5d5 4 \n", + "339 73105f2d1cabc8170ab066d96863c5d5 5 \n", + "472 73105f2d1cabc8170ab066d96863c5d5 11 \n", + "529 73105f2d1cabc8170ab066d96863c5d5 7 \n", + "587 73105f2d1cabc8170ab066d96863c5d5 9 \n", + "692 73105f2d1cabc8170ab066d96863c5d5 1 \n", + "790 73105f2d1cabc8170ab066d96863c5d5 12X \n", + "846 73105f2d1cabc8170ab066d96863c5d5 6 \n", + "965 73105f2d1cabc8170ab066d96863c5d5 2 \n", + "1101 73105f2d1cabc8170ab066d96863c5d5 8 \n", + "1166 f5a749dd65924e025b1293c58f95f8d6 CC \n", + "1169 f5a749dd65924e025b1293c58f95f8d6 CC \n", + "1187 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n", + "\n", + " direction_id \n", + "0 0.00 \n", + "110 0.00 \n", + "202 0.00 \n", + "221 0.00 \n", + "339 0.00 \n", + "472 0.00 \n", + "529 0.00 \n", + "587 0.00 \n", + "692 0.00 \n", + "790 0.00 \n", + "846 0.00 \n", + "965 0.00 \n", + "1101 0.00 \n", + "1166 1.00 \n", + "1169 0.00 \n", + "1187 0.00 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "preview(avg_speeds_with_geom)" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "4ad1b0ef-e48f-4606-aca8-4bfcf1fd18b8", + "metadata": {}, + "outputs": [], + "source": [ + "from calitp_data_analysis.geography_utils import WGS84" + ] + }, + { + "cell_type": "markdown", + "id": "fb05005e-676a-404e-8b98-b639e364729e", + "metadata": {}, + "source": [ + "##### Down another rabbit hole: this `SEGMENT_FILE` doesn't contain values for direction_id \n", + "* Need to find out where it's originally made.\n", + "* Done **Fill in `direction_id` with 0.**" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "458212b0-0f3f-46ef-b698-1d00f6a285c9", + "metadata": {}, + "outputs": [], + "source": [ + "SEGMENT_FILE = 
GTFS_DATA_DICT[segment_type].segments_file\n", + "\n", + "segment_geom = gpd.read_parquet(\n", + " f\"{SEGMENT_GCS}{SEGMENT_FILE}_{one_analysis_date}.parquet\",\n", + ").to_crs(WGS84)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "1d7886f9-e9ec-467d-a6e6-15df89fdc970", + "metadata": {}, + "outputs": [], + "source": [ + "segment_geom.direction_id = segment_geom.direction_id.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "b809e2e7-12c2-4232-8019-8516d84fb20f", + "metadata": {}, + "outputs": [], + "source": [ + "segment_geom2 = segment_geom.loc[segment_geom.schedule_gtfs_dataset_key.isin(schd_keys)]" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "2da0e84d-df1a-41a6-8192-7ebbaf4386e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_id
18178073105f2d1cabc8170ab066d96863c5d5300.00
57076073105f2d1cabc8170ab066d96863c5d530.00
161329673105f2d1cabc8170ab066d96863c5d5200.00
196919873105f2d1cabc8170ab066d96863c5d540.00
208306673105f2d1cabc8170ab066d96863c5d550.00
211228473105f2d1cabc8170ab066d96863c5d5110.00
216591173105f2d1cabc8170ab066d96863c5d570.00
221518073105f2d1cabc8170ab066d96863c5d590.00
238609873105f2d1cabc8170ab066d96863c5d510.00
272053773105f2d1cabc8170ab066d96863c5d512X0.00
279499973105f2d1cabc8170ab066d96863c5d560.00
290380973105f2d1cabc8170ab066d96863c5d520.00
297340073105f2d1cabc8170ab066d96863c5d580.00
2986372f5a749dd65924e025b1293c58f95f8d6CC1.00
3065284f5a749dd65924e025b1293c58f95f8d6CC0.00
310706373105f2d1cabc8170ab066d96863c5d58a7c42f9-51e4-4848-bf88-30c210f149ad0.00
317038273105f2d1cabc8170ab066d96863c5d51B0.00
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key \\\n", + "181780 73105f2d1cabc8170ab066d96863c5d5 \n", + "570760 73105f2d1cabc8170ab066d96863c5d5 \n", + "1613296 73105f2d1cabc8170ab066d96863c5d5 \n", + "1969198 73105f2d1cabc8170ab066d96863c5d5 \n", + "2083066 73105f2d1cabc8170ab066d96863c5d5 \n", + "2112284 73105f2d1cabc8170ab066d96863c5d5 \n", + "2165911 73105f2d1cabc8170ab066d96863c5d5 \n", + "2215180 73105f2d1cabc8170ab066d96863c5d5 \n", + "2386098 73105f2d1cabc8170ab066d96863c5d5 \n", + "2720537 73105f2d1cabc8170ab066d96863c5d5 \n", + "2794999 73105f2d1cabc8170ab066d96863c5d5 \n", + "2903809 73105f2d1cabc8170ab066d96863c5d5 \n", + "2973400 73105f2d1cabc8170ab066d96863c5d5 \n", + "2986372 f5a749dd65924e025b1293c58f95f8d6 \n", + "3065284 f5a749dd65924e025b1293c58f95f8d6 \n", + "3107063 73105f2d1cabc8170ab066d96863c5d5 \n", + "3170382 73105f2d1cabc8170ab066d96863c5d5 \n", + "\n", + " route_id direction_id \n", + "181780 30 0.00 \n", + "570760 3 0.00 \n", + "1613296 20 0.00 \n", + "1969198 4 0.00 \n", + "2083066 5 0.00 \n", + "2112284 11 0.00 \n", + "2165911 7 0.00 \n", + "2215180 9 0.00 \n", + "2386098 1 0.00 \n", + "2720537 12X 0.00 \n", + "2794999 6 0.00 \n", + "2903809 2 0.00 \n", + "2973400 8 0.00 \n", + "2986372 CC 1.00 \n", + "3065284 CC 0.00 \n", + "3107063 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n", + "3170382 1B 0.00 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "preview(segment_geom2)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "a4b2dece-6078-43a6-8a90-4b9cd04ada08", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_instance_keyshape_array_keystop_id1stop_sequencestop_id2segment_idstop_pairschedule_gtfs_dataset_keyroute_iddirection_idst_trip_instance_keysegment_uuid
181780005bb393ed8b22ca4d8e7cc8d7895231c6e9cda0db8bf76bc535f590ca1fccb5120f2635-ec31-435e-a089-225b26965f121f09af637-87de-4bdb-bf49-660539686c97120f2635-ec31-435e-a089-225b26965f12-f09af637-87de-4bdb-bf49-660539686c97-1120f2635-ec31-435e-a089-225b26965f12__f09af637-87de-4bdb-bf49-660539686c9773105f2d1cabc8170ab066d96863c5d5300.00005bb393ed8b22ca4d8e7cc8d789523173105f2d1cabc8170ab066d96863c5d5__30__nan__120f2635-ec31-435e-a089-225b26965f12-f09af637-87de-4bdb-bf49-660539686c97-1
181781005bb393ed8b22ca4d8e7cc8d7895231c6e9cda0db8bf76bc535f590ca1fccb5f09af637-87de-4bdb-bf49-660539686c97247def414-f158-496a-91cb-5f3fb0aa406cf09af637-87de-4bdb-bf49-660539686c97-47def414-f158-496a-91cb-5f3fb0aa406c-1f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c73105f2d1cabc8170ab066d96863c5d5300.00005bb393ed8b22ca4d8e7cc8d789523173105f2d1cabc8170ab066d96863c5d5__30__nan__f09af637-87de-4bdb-bf49-660539686c97-47def414-f158-496a-91cb-5f3fb0aa406c-1
\n", + "
" + ], + "text/plain": [ + " trip_instance_key shape_array_key \\\n", + "181780 005bb393ed8b22ca4d8e7cc8d7895231 c6e9cda0db8bf76bc535f590ca1fccb5 \n", + "181781 005bb393ed8b22ca4d8e7cc8d7895231 c6e9cda0db8bf76bc535f590ca1fccb5 \n", + "\n", + " stop_id1 stop_sequence \\\n", + "181780 120f2635-ec31-435e-a089-225b26965f12 1 \n", + "181781 f09af637-87de-4bdb-bf49-660539686c97 2 \n", + "\n", + " stop_id2 \\\n", + "181780 f09af637-87de-4bdb-bf49-660539686c97 \n", + "181781 47def414-f158-496a-91cb-5f3fb0aa406c \n", + "\n", + " segment_id \\\n", + "181780 120f2635-ec31-435e-a089-225b26965f12-f09af637-87de-4bdb-bf49-660539686c97-1 \n", + "181781 f09af637-87de-4bdb-bf49-660539686c97-47def414-f158-496a-91cb-5f3fb0aa406c-1 \n", + "\n", + " stop_pair \\\n", + "181780 120f2635-ec31-435e-a089-225b26965f12__f09af637-87de-4bdb-bf49-660539686c97 \n", + "181781 f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c \n", + "\n", + " schedule_gtfs_dataset_key route_id direction_id \\\n", + "181780 73105f2d1cabc8170ab066d96863c5d5 30 0.00 \n", + "181781 73105f2d1cabc8170ab066d96863c5d5 30 0.00 \n", + "\n", + " st_trip_instance_key \\\n", + "181780 005bb393ed8b22ca4d8e7cc8d7895231 \n", + "181781 005bb393ed8b22ca4d8e7cc8d7895231 \n", + "\n", + " segment_uuid \n", + "181780 73105f2d1cabc8170ab066d96863c5d5__30__nan__120f2635-ec31-435e-a089-225b26965f12-f09af637-87de-4bdb-bf49-660539686c97-1 \n", + "181781 73105f2d1cabc8170ab066d96863c5d5__30__nan__f09af637-87de-4bdb-bf49-660539686c97-47def414-f158-496a-91cb-5f3fb0aa406c-1 " + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "segment_geom2.drop(columns=[\"geometry\"]).head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "acec3a1a-30f3-43b5-b7e5-efba1699924c", + "metadata": {}, + "source": [ + "##### Continue on with the rest of `merge_in_segment_geometry` in `rt_segment_speeds/scripts/average_segment_speeds`" + ] + }, + { + "cell_type": "code", + 
"execution_count": 97, + "id": "202a6a24-fab9-4204-82ee-dfdab76f5628", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'rollup_singleday/speeds_route_dir_segments'" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dict_inputs[\"route_dir_single_segment\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "03f32706-608d-4ff8-bcce-6aebb8c59820", + "metadata": {}, + "outputs": [], + "source": [ + "geom_file_cols = segment_geom2.columns.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "1030f6a4-eeb1-404b-aab1-329d67bb8cf8", + "metadata": {}, + "outputs": [], + "source": [ + "col_order = [c for c in avg_speeds2.columns]" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "36b45cfb-8aa8-4f52-a1e8-101f491cc2c8", + "metadata": {}, + "outputs": [], + "source": [ + "merge_cols = list(set(col_order).intersection(geom_file_cols))" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "8e3a312c-2ec4-4820-a306-c32920b3333e", + "metadata": {}, + "outputs": [], + "source": [ + "gdf = (\n", + " pd.merge(\n", + " segment_geom2[merge_cols + [\"geometry\"]].drop_duplicates(),\n", + " avg_speeds2,\n", + " on=merge_cols,\n", + " )\n", + " .reset_index(drop=True)\n", + " .reindex(columns=col_order + [\"geometry\"])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "f32499ad-55e5-446d-a9e1-8dab5b3d52be", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 1266 entries, 0 to 1265\n", + "Data columns (total 16 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 schedule_gtfs_dataset_key 1266 non-null object \n", + " 1 route_id 1266 non-null object \n", + " 2 direction_id 1266 non-null float64 \n", + " 3 stop_pair 1266 non-null object \n", + " 4 stop_pair_name 1266 non-null object 
\n", + " 5 time_of_day 1266 non-null object \n", + " 6 p50_mph 1266 non-null float64 \n", + " 7 n_trips 1266 non-null int16 \n", + " 8 p20_mph 1266 non-null float64 \n", + " 9 p80_mph 1266 non-null float64 \n", + " 10 name 1266 non-null object \n", + " 11 caltrans_district 1266 non-null object \n", + " 12 organization_source_record_id 1266 non-null object \n", + " 13 organization_name 1266 non-null object \n", + " 14 base64_url 1266 non-null object \n", + " 15 geometry 1266 non-null geometry\n", + "dtypes: float64(4), geometry(1), int16(1), object(10)\n", + "memory usage: 151.0+ KB\n" + ] + } + ], + "source": [ + "gdf.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "5079633b-1029-4370-878d-346c30369036", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_idstop_pairstop_pair_nametime_of_dayp50_mphn_tripsp20_mphp80_mphnamecaltrans_district
116273105f2d1cabc8170ab066d96863c5d580.00712b4000-441b-4b64-8a8e-36ec38bbbce1__ae050555-4c98-44e7-ad1a-d536b91d2012Carmen ln at Trinity (Wesgate)(Outbound)__Carmen Ln at Carmelia Ln.AM Peak24.39320.9931.30Santa Maria Schedule05 - San Luis Obispo
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id direction_id \\\n", + "1162 73105f2d1cabc8170ab066d96863c5d5 8 0.00 \n", + "\n", + " stop_pair \\\n", + "1162 712b4000-441b-4b64-8a8e-36ec38bbbce1__ae050555-4c98-44e7-ad1a-d536b91d2012 \n", + "\n", + " stop_pair_name \\\n", + "1162 Carmen ln at Trinity (Wesgate)(Outbound)__Carmen Ln at Carmelia Ln. \n", + "\n", + " time_of_day p50_mph n_trips p20_mph p80_mph name \\\n", + "1162 AM Peak 24.39 3 20.99 31.30 Santa Maria Schedule \n", + "\n", + " caltrans_district \n", + "1162 05 - San Luis Obispo " + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gdf.drop(\n", + " columns=[\n", + " \"geometry\",\n", + " \"organization_source_record_id\",\n", + " \"organization_name\",\n", + " \"base64_url\",\n", + " ]\n", + ").sample()" + ] + }, + { + "cell_type": "markdown", + "id": "b1636a93-4c57-489a-a40f-f2a605e92afc", + "metadata": {}, + "source": [ + "##### `rt_segment_speeds/scripts/average_segment_speeds` gives me the speeds by stop for a route. However, in `gtfs_digest/merge_data`, we want the speeds for the entire route, which are summarized from `average_segment_speeds` in `rt_segment_speeds/scripts/average_summary_speeds`." + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "ea8acdaf-c827-42e5-a410-75cd85119949", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'rollup_singleday/speeds_route_dir_segments'" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dict_inputs[\"route_dir_single_segment\"]" + ] + }, + { + "cell_type": "markdown", + "id": "09e6f8f2-06e2-4280-afeb-7309e1eb6aae", + "metadata": {}, + "source": [ + "##### **This file below is used in `gtfs_digest/merge_data`. 
Need to breakout `average_summary_speeds`**\n", + "* gs://calitp-analytics-data/data-analyses/rt_segment_speeds/ and rollup_singleday/speeds_route_dir_AH_TEST_2024-11-13" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "a2c77444-4705-46cb-80c1-f704d316ad73", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'rollup_singleday/speeds_route_dir'" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "GTFS_DATA_DICT.rt_stop_times.route_dir_single_summary" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "0851effa-872b-4b43-855d-09a08f836ab3", + "metadata": {}, + "outputs": [], + "source": [ + "dict_inputs = GTFS_DATA_DICT[segment_type]" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "4f0b4f78-93ee-4687-ab55-29c628e2ae93", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "avg_summary_speeds_url = \"gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_route_dir_AH_TEST_2024-11-13.parquet\"" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "8c35d5ca-4fec-4912-9869-c889befa71ae", + "metadata": {}, + "outputs": [], + "source": [ + "avg_summary_speeds_df = gpd.read_parquet(avg_summary_speeds_url)" + ] + }, + { + "cell_type": "markdown", + "id": "55e502c4-6fca-4378-b100-da4c1e5c14d7", + "metadata": {}, + "source": [ + "##### Only one route is showing!" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "55b2425d-c8ab-45da-9af8-0aedc4c8437b", + "metadata": {}, + "outputs": [], + "source": [ + "avg_summary_speeds_df2 = avg_summary_speeds_df.loc[\n", + " avg_summary_speeds_df.schedule_gtfs_dataset_key.isin(schd_keys)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "19209be1-d849-442c-ab84-d6a3d780a2f0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_id
100273105f2d1cabc8170ab066d96863c5d550.00
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id direction_id\n", + "1002 73105f2d1cabc8170ab066d96863c5d5 5 0.00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "preview(avg_summary_speeds_df2)" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "id": "3413b59d-4dee-4d7c-bf39-6732a8189bb4", + "metadata": {}, + "outputs": [], + "source": [ + "common_shape_geom = gtfs_schedule_wrangling.most_common_shape_by_route_direction(\n", + " one_analysis_date\n", + ").to_crs(WGS84)" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "id": "43ac9053-9287-4ecf-9699-67a07eeede41", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',\n", + " 'common_shape_id', 'route_name'],\n", + " dtype='object')" + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "common_shape_geom.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "29f05db8-dc82-4f8b-89b4-0fde6ab4c23f", + "metadata": {}, + "outputs": [], + "source": [ + "common_shape_geom2 = common_shape_geom.loc[\n", + " common_shape_geom.schedule_gtfs_dataset_key.isin(schd_keys)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "id": "6338c87c-827a-4ee8-a1d8-352abcb0ad09", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 23 entries, 167 to 1098\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 geometry 23 non-null geometry\n", + " 1 schedule_gtfs_dataset_key 23 non-null object \n", + " 2 route_id 23 non-null object \n", + " 3 direction_id 23 non-null float64 \n", + " 4 common_shape_id 23 non-null object \n", + " 5 route_name 23 non-null object \n", + "dtypes: float64(1), geometry(1), object(4)\n", + "memory usage: 
1.3+ KB\n" + ] + } + ], + "source": [ + "common_shape_geom2.info()" + ] + }, + { + "cell_type": "markdown", + "id": "e4a1c69b-b1ec-4474-b998-182e536939ba", + "metadata": {}, + "source": [ + "##### DONE. This `concatenate_trip_segment_speeds` is from `rt_segment_speeds/scripts/average_segment_speeds`" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "id": "d70f10af-fe25-4931-9f2c-2e60fe5b3248", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "concatenated files\n" + ] + } + ], + "source": [ + "df = average_summary_speeds.concatenate_trip_segment_speeds(\n", + " analysis_date_list, segment_type\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "id": "99349065-4151-4254-9c88-bf21537e7f27", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)]" + ] + }, + { + "cell_type": "markdown", + "id": "51467373-8bf3-4cd4-bd0a-4c47c2c54462", + "metadata": {}, + "source": [ + "##### DONE **Filled in `direction_id` with 0. 
Should actually go back to `average_summary_speeds.concatenate_trip_segment_speeds` and fill it in there**" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "id": "65a254fc-b944-4d4b-834d-a2d5547b5fc7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2800/3692506384.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df2.direction_id = df2.direction_id.fillna(0)\n" + ] + } + ], + "source": [ + "df2.direction_id = df2.direction_id.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "id": "7aacc425-3a17-4aca-9e62-371215494557", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 3543 entries, 159381 to 2656608\n", + "Data columns (total 17 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 schedule_gtfs_dataset_key 3543 non-null object \n", + " 1 shape_array_key 3543 non-null object \n", + " 2 shape_id 3543 non-null object \n", + " 3 stop_sequence 3543 non-null int64 \n", + " 4 route_id 3543 non-null object \n", + " 5 direction_id 3543 non-null float64 \n", + " 6 stop_pair 3543 non-null object \n", + " 7 stop_pair_name 3543 non-null object \n", + " 8 trip_instance_key 3543 non-null object \n", + " 9 speed_mph 3543 non-null float64 \n", + " 10 meters_elapsed 3543 non-null float64 \n", + " 11 sec_elapsed 3543 non-null float64 \n", + " 12 time_of_day 3543 non-null object \n", + " 13 arrival_time 3543 non-null datetime64[ns]\n", + " 14 service_date 3543 non-null datetime64[ns]\n", + " 15 peak_offpeak 3543 non-null object \n", + " 16 weekday_weekend 3543 non-null object \n", + 
"dtypes: datetime64[ns](2), float64(4), int64(1), object(10)\n", + "memory usage: 498.2+ KB\n" + ] + } + ], + "source": [ + "df2.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "afd48aa0-2404-4d73-946d-2621be91bb05", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_id
15938173105f2d1cabc8170ab066d96863c5d5300.00
47213173105f2d1cabc8170ab066d96863c5d530.00
132098073105f2d1cabc8170ab066d96863c5d5200.00
162728473105f2d1cabc8170ab066d96863c5d540.00
172799673105f2d1cabc8170ab066d96863c5d550.00
175412273105f2d1cabc8170ab066d96863c5d5110.00
180142373105f2d1cabc8170ab066d96863c5d570.00
183809173105f2d1cabc8170ab066d96863c5d590.00
198682573105f2d1cabc8170ab066d96863c5d510.00
227758473105f2d1cabc8170ab066d96863c5d512X0.00
234144373105f2d1cabc8170ab066d96863c5d560.00
243180073105f2d1cabc8170ab066d96863c5d520.00
249147173105f2d1cabc8170ab066d96863c5d580.00
260081973105f2d1cabc8170ab066d96863c5d58a7c42f9-51e4-4848-bf88-30c210f149ad0.00
2615442f5a749dd65924e025b1293c58f95f8d6CC0.00
2656607f5a749dd65924e025b1293c58f95f8d6CC1.00
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key \\\n", + "159381 73105f2d1cabc8170ab066d96863c5d5 \n", + "472131 73105f2d1cabc8170ab066d96863c5d5 \n", + "1320980 73105f2d1cabc8170ab066d96863c5d5 \n", + "1627284 73105f2d1cabc8170ab066d96863c5d5 \n", + "1727996 73105f2d1cabc8170ab066d96863c5d5 \n", + "1754122 73105f2d1cabc8170ab066d96863c5d5 \n", + "1801423 73105f2d1cabc8170ab066d96863c5d5 \n", + "1838091 73105f2d1cabc8170ab066d96863c5d5 \n", + "1986825 73105f2d1cabc8170ab066d96863c5d5 \n", + "2277584 73105f2d1cabc8170ab066d96863c5d5 \n", + "2341443 73105f2d1cabc8170ab066d96863c5d5 \n", + "2431800 73105f2d1cabc8170ab066d96863c5d5 \n", + "2491471 73105f2d1cabc8170ab066d96863c5d5 \n", + "2600819 73105f2d1cabc8170ab066d96863c5d5 \n", + "2615442 f5a749dd65924e025b1293c58f95f8d6 \n", + "2656607 f5a749dd65924e025b1293c58f95f8d6 \n", + "\n", + " route_id direction_id \n", + "159381 30 0.00 \n", + "472131 3 0.00 \n", + "1320980 20 0.00 \n", + "1627284 4 0.00 \n", + "1727996 5 0.00 \n", + "1754122 11 0.00 \n", + "1801423 7 0.00 \n", + "1838091 9 0.00 \n", + "1986825 1 0.00 \n", + "2277584 12X 0.00 \n", + "2341443 6 0.00 \n", + "2431800 2 0.00 \n", + "2491471 8 0.00 \n", + "2600819 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n", + "2615442 CC 0.00 \n", + "2656607 CC 1.00 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "preview(df2)" + ] + }, + { + "cell_type": "markdown", + "id": "4effb93c-6b15-46a2-8365-232989cc563e", + "metadata": {}, + "source": [ + "##### Continuing on with `average_summary_speeds`" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "3a7190ff-b431-4eea-884f-6b0175dcbe69", + "metadata": {}, + "outputs": [], + "source": [ + "trip_group_cols = OPERATOR_COLS + ROUTE_DIR_COLS" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "6e2d9095-51c9-4507-be42-393f0abbc194", + "metadata": {}, + "outputs": [], + "source": [ + "trip_avg = (\n", + " 
metrics.weighted_average_speeds_across_segments(\n", + " df2,\n", + " trip_group_cols + [\"peak_offpeak\"],\n", + " )\n", + " .pipe(\n", + " gtfs_schedule_wrangling.merge_operator_identifiers,\n", + " analysis_date_list,\n", + " columns=average_segment_speeds.CROSSWALK_COLS,\n", + " )\n", + " .reset_index(drop=True)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "id": "907dad4a-f806-4090-adbe-3d24eef62348", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_idpeak_offpeakmeters_elapsedsec_elapsedspeed_mphnamecaltrans_districtorganization_source_record_idorganization_namebase64_url
073105f2d1cabc8170ab066d96863c5d510.00offpeak355890.8860001.0013.27Santa Maria Schedule05 - San Luis Obisporec9zGMJgNnes75K1City of Santa MariaaHR0cHM6Ly9zbXJ0LnRyaXBzaG90LmNvbS92MS9ndGZzLnppcD9yZWdpb25JZD1DQTU1OEREQy1EN0YyLTRCNDgtOUNBQy1ERUVBMTEzNEY4MjA=
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id direction_id peak_offpeak \\\n", + "0 73105f2d1cabc8170ab066d96863c5d5 1 0.00 offpeak \n", + "\n", + " meters_elapsed sec_elapsed speed_mph name \\\n", + "0 355890.88 60001.00 13.27 Santa Maria Schedule \n", + "\n", + " caltrans_district organization_source_record_id organization_name \\\n", + "0 05 - San Luis Obispo rec9zGMJgNnes75K1 City of Santa Maria \n", + "\n", + " base64_url \n", + "0 aHR0cHM6Ly9zbXJ0LnRyaXBzaG90LmNvbS92MS9ndGZzLnppcD9yZWdpb25JZD1DQTU1OEREQy1EN0YyLTRCNDgtOUNBQy1ERUVBMTEzNEY4MjA= " + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trip_avg.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "db139167-e446-49f9-9285-14011f5bad7e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 30 entries, 0 to 29\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 schedule_gtfs_dataset_key 30 non-null object \n", + " 1 route_id 30 non-null object \n", + " 2 direction_id 30 non-null float64\n", + " 3 peak_offpeak 30 non-null object \n", + " 4 meters_elapsed 30 non-null float64\n", + " 5 sec_elapsed 30 non-null float64\n", + " 6 speed_mph 30 non-null float64\n", + " 7 name 30 non-null object \n", + " 8 caltrans_district 30 non-null object \n", + " 9 organization_source_record_id 30 non-null object \n", + " 10 organization_name 30 non-null object \n", + " 11 base64_url 30 non-null object \n", + "dtypes: float64(4), object(8)\n", + "memory usage: 2.9+ KB\n" + ] + } + ], + "source": [ + "trip_avg.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "id": "a6aae002-e115-4de5-bd36-b2aed09a3f16", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_id
073105f2d1cabc8170ab066d96863c5d510.00
273105f2d1cabc8170ab066d96863c5d5110.00
473105f2d1cabc8170ab066d96863c5d512X0.00
673105f2d1cabc8170ab066d96863c5d520.00
873105f2d1cabc8170ab066d96863c5d5200.00
1073105f2d1cabc8170ab066d96863c5d530.00
1273105f2d1cabc8170ab066d96863c5d5300.00
1473105f2d1cabc8170ab066d96863c5d540.00
1673105f2d1cabc8170ab066d96863c5d550.00
1873105f2d1cabc8170ab066d96863c5d560.00
2073105f2d1cabc8170ab066d96863c5d570.00
2273105f2d1cabc8170ab066d96863c5d580.00
2473105f2d1cabc8170ab066d96863c5d58a7c42f9-51e4-4848-bf88-30c210f149ad0.00
2673105f2d1cabc8170ab066d96863c5d590.00
28f5a749dd65924e025b1293c58f95f8d6CC0.00
29f5a749dd65924e025b1293c58f95f8d6CC1.00
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id \\\n", + "0 73105f2d1cabc8170ab066d96863c5d5 1 \n", + "2 73105f2d1cabc8170ab066d96863c5d5 11 \n", + "4 73105f2d1cabc8170ab066d96863c5d5 12X \n", + "6 73105f2d1cabc8170ab066d96863c5d5 2 \n", + "8 73105f2d1cabc8170ab066d96863c5d5 20 \n", + "10 73105f2d1cabc8170ab066d96863c5d5 3 \n", + "12 73105f2d1cabc8170ab066d96863c5d5 30 \n", + "14 73105f2d1cabc8170ab066d96863c5d5 4 \n", + "16 73105f2d1cabc8170ab066d96863c5d5 5 \n", + "18 73105f2d1cabc8170ab066d96863c5d5 6 \n", + "20 73105f2d1cabc8170ab066d96863c5d5 7 \n", + "22 73105f2d1cabc8170ab066d96863c5d5 8 \n", + "24 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n", + "26 73105f2d1cabc8170ab066d96863c5d5 9 \n", + "28 f5a749dd65924e025b1293c58f95f8d6 CC \n", + "29 f5a749dd65924e025b1293c58f95f8d6 CC \n", + "\n", + " direction_id \n", + "0 0.00 \n", + "2 0.00 \n", + "4 0.00 \n", + "6 0.00 \n", + "8 0.00 \n", + "10 0.00 \n", + "12 0.00 \n", + "14 0.00 \n", + "16 0.00 \n", + "18 0.00 \n", + "20 0.00 \n", + "22 0.00 \n", + "24 0.00 \n", + "26 0.00 \n", + "28 0.00 \n", + "29 1.00 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "preview(trip_avg)" + ] + }, + { + "cell_type": "markdown", + "id": "05c11760-6000-4165-9d9c-3be8de68e087", + "metadata": {}, + "source": [ + "##### Skipping this part because I can't find `MIN_TRIP_SECONDS` and `MAX_TRIP_SECONDS` in `dict_input`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7aa47719-0302-440e-b15a-0443bb7e18dd", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\" trip_avg_filtered = trip_avg[\n", + " (trip_avg.meters_elapsed >= average_summary_speeds.METERS_CUTOFF) & \n", + " (trip_avg.sec_elapsed >= average_summary_speeds.MIN_TRIP_SECONDS) & \n", + " (trip_avg.sec_elapsed <= average_summary_speeds.MAX_TRIP_SECONDS)\n", + " ]\n", + " \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": 
"a5a2d768-cd9d-40cd-903b-a04c3e8ad33e", + "metadata": {}, + "outputs": [], + "source": [ + "group_cols = OPERATOR_COLS + ROUTE_DIR_COLS" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "id": "e7408973-da0a-4d0f-9b8f-a178310bd5b5", + "metadata": {}, + "outputs": [], + "source": [ + "avg_speeds = (\n", + " metrics.concatenate_peak_offpeak_allday_averages(\n", + " trip_avg, group_cols, metric_type=\"summary_speeds\"\n", + " )\n", + " .pipe(\n", + " gtfs_schedule_wrangling.merge_operator_identifiers,\n", + " analysis_date_list,\n", + " columns=average_segment_speeds.CROSSWALK_COLS,\n", + " )\n", + " .reset_index(drop=True)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "id": "55a0be8a-b1bc-4138-8a90-49885512cf53", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_id
073105f2d1cabc8170ab066d96863c5d510.00
273105f2d1cabc8170ab066d96863c5d5110.00
473105f2d1cabc8170ab066d96863c5d512X0.00
673105f2d1cabc8170ab066d96863c5d520.00
873105f2d1cabc8170ab066d96863c5d5200.00
1073105f2d1cabc8170ab066d96863c5d530.00
1273105f2d1cabc8170ab066d96863c5d5300.00
1473105f2d1cabc8170ab066d96863c5d540.00
1673105f2d1cabc8170ab066d96863c5d550.00
1873105f2d1cabc8170ab066d96863c5d560.00
2073105f2d1cabc8170ab066d96863c5d570.00
2273105f2d1cabc8170ab066d96863c5d580.00
2473105f2d1cabc8170ab066d96863c5d58a7c42f9-51e4-4848-bf88-30c210f149ad0.00
2673105f2d1cabc8170ab066d96863c5d590.00
42f5a749dd65924e025b1293c58f95f8d6CC0.00
43f5a749dd65924e025b1293c58f95f8d6CC1.00
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id \\\n", + "0 73105f2d1cabc8170ab066d96863c5d5 1 \n", + "2 73105f2d1cabc8170ab066d96863c5d5 11 \n", + "4 73105f2d1cabc8170ab066d96863c5d5 12X \n", + "6 73105f2d1cabc8170ab066d96863c5d5 2 \n", + "8 73105f2d1cabc8170ab066d96863c5d5 20 \n", + "10 73105f2d1cabc8170ab066d96863c5d5 3 \n", + "12 73105f2d1cabc8170ab066d96863c5d5 30 \n", + "14 73105f2d1cabc8170ab066d96863c5d5 4 \n", + "16 73105f2d1cabc8170ab066d96863c5d5 5 \n", + "18 73105f2d1cabc8170ab066d96863c5d5 6 \n", + "20 73105f2d1cabc8170ab066d96863c5d5 7 \n", + "22 73105f2d1cabc8170ab066d96863c5d5 8 \n", + "24 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n", + "26 73105f2d1cabc8170ab066d96863c5d5 9 \n", + "42 f5a749dd65924e025b1293c58f95f8d6 CC \n", + "43 f5a749dd65924e025b1293c58f95f8d6 CC \n", + "\n", + " direction_id \n", + "0 0.00 \n", + "2 0.00 \n", + "4 0.00 \n", + "6 0.00 \n", + "8 0.00 \n", + "10 0.00 \n", + "12 0.00 \n", + "14 0.00 \n", + "16 0.00 \n", + "18 0.00 \n", + "20 0.00 \n", + "22 0.00 \n", + "24 0.00 \n", + "26 0.00 \n", + "42 0.00 \n", + "43 1.00 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "preview(avg_speeds)" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "287deabd-5043-4d7c-bcee-5952a2b0ad30", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 46 entries, 0 to 45\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 schedule_gtfs_dataset_key 46 non-null object \n", + " 1 route_id 46 non-null object \n", + " 2 direction_id 46 non-null float64\n", + " 3 time_period 46 non-null object \n", + " 4 meters_elapsed 46 non-null float64\n", + " 5 sec_elapsed 46 non-null float64\n", + " 6 speed_mph 46 non-null float64\n", + " 7 name 46 non-null object \n", + " 8 caltrans_district 46 non-null object \n", + " 9 
organization_source_record_id 46 non-null object \n", + " 10 organization_name 46 non-null object \n", + " 11 base64_url 46 non-null object \n", + "dtypes: float64(4), object(8)\n", + "memory usage: 4.4+ KB\n" + ] + } + ], + "source": [ + "avg_speeds.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "01efe186-e8f1-4c6d-941b-faf332a77281", + "metadata": {}, + "outputs": [], + "source": [ + "avg_speeds_with_geom = average_summary_speeds.merge_in_common_shape_geometry(\n", + " avg_speeds, one_analysis_date\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "32228a19-0133-4d23-91e5-f60b34907b2a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_id
073105f2d1cabc8170ab066d96863c5d570.00
373105f2d1cabc8170ab066d96863c5d560.00
6f5a749dd65924e025b1293c58f95f8d6CC1.00
8f5a749dd65924e025b1293c58f95f8d6CC0.00
1073105f2d1cabc8170ab066d96863c5d580.00
1373105f2d1cabc8170ab066d96863c5d512X0.00
1673105f2d1cabc8170ab066d96863c5d5110.00
1973105f2d1cabc8170ab066d96863c5d5300.00
2273105f2d1cabc8170ab066d96863c5d58a7c42f9-51e4-4848-bf88-30c210f149ad0.00
2573105f2d1cabc8170ab066d96863c5d520.00
2873105f2d1cabc8170ab066d96863c5d530.00
3173105f2d1cabc8170ab066d96863c5d5200.00
3473105f2d1cabc8170ab066d96863c5d550.00
3773105f2d1cabc8170ab066d96863c5d540.00
4073105f2d1cabc8170ab066d96863c5d590.00
4373105f2d1cabc8170ab066d96863c5d510.00
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id \\\n", + "0 73105f2d1cabc8170ab066d96863c5d5 7 \n", + "3 73105f2d1cabc8170ab066d96863c5d5 6 \n", + "6 f5a749dd65924e025b1293c58f95f8d6 CC \n", + "8 f5a749dd65924e025b1293c58f95f8d6 CC \n", + "10 73105f2d1cabc8170ab066d96863c5d5 8 \n", + "13 73105f2d1cabc8170ab066d96863c5d5 12X \n", + "16 73105f2d1cabc8170ab066d96863c5d5 11 \n", + "19 73105f2d1cabc8170ab066d96863c5d5 30 \n", + "22 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n", + "25 73105f2d1cabc8170ab066d96863c5d5 2 \n", + "28 73105f2d1cabc8170ab066d96863c5d5 3 \n", + "31 73105f2d1cabc8170ab066d96863c5d5 20 \n", + "34 73105f2d1cabc8170ab066d96863c5d5 5 \n", + "37 73105f2d1cabc8170ab066d96863c5d5 4 \n", + "40 73105f2d1cabc8170ab066d96863c5d5 9 \n", + "43 73105f2d1cabc8170ab066d96863c5d5 1 \n", + "\n", + " direction_id \n", + "0 0.00 \n", + "3 0.00 \n", + "6 1.00 \n", + "8 0.00 \n", + "10 0.00 \n", + "13 0.00 \n", + "16 0.00 \n", + "19 0.00 \n", + "22 0.00 \n", + "25 0.00 \n", + "28 0.00 \n", + "31 0.00 \n", + "34 0.00 \n", + "37 0.00 \n", + "40 0.00 \n", + "43 0.00 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "preview(avg_speeds_with_geom)" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "id": "7ce0e0b6-7dbf-4822-a4f1-5419e27539c9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 46 entries, 0 to 45\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 schedule_gtfs_dataset_key 46 non-null object \n", + " 1 route_id 46 non-null object \n", + " 2 direction_id 46 non-null float64 \n", + " 3 time_period 46 non-null object \n", + " 4 meters_elapsed 46 non-null float64 \n", + " 5 sec_elapsed 46 non-null float64 \n", + " 6 speed_mph 46 non-null float64 \n", + " 7 name 46 non-null object \n", + " 8 caltrans_district 46 non-null 
object \n", + " 9 organization_source_record_id 46 non-null object \n", + " 10 organization_name 46 non-null object \n", + " 11 base64_url 46 non-null object \n", + " 12 route_name 46 non-null object \n", + " 13 geometry 46 non-null geometry\n", + "dtypes: float64(4), geometry(1), object(9)\n", + "memory usage: 5.2+ KB\n" + ] + } + ], + "source": [ + "avg_speeds_with_geom.info()" + ] + }, + { + "cell_type": "markdown", + "id": "5c1ce92d-dceb-4ae4-affa-cc1daecd5f89", + "metadata": {}, + "source": [ + "##### Double check that my work matches what's in `gtfs_digest/merge_data`" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "cbc4f65b-fdbd-4a33-ac39-36b753c5171c", + "metadata": {}, + "outputs": [], + "source": [ + "df_avg_speeds_og = pd.read_parquet(\n", + " \"gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_route_dir_2024-11-13.parquet\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "id": "978581b2-15fb-419f-af04-ce4c26b2f59a", + "metadata": {}, + "outputs": [], + "source": [ + "df_avg_speeds_og = df_avg_speeds_og.loc[\n", + " df_avg_speeds_og.schedule_gtfs_dataset_key.isin(schd_keys)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "b77336a5-eb1e-4a1a-b82f-75d9f63334a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_id
100073105f2d1cabc8170ab066d96863c5d550.00
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id direction_id\n", + "1000 73105f2d1cabc8170ab066d96863c5d5 5 0.00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "preview(df_avg_speeds_og)" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "id": "a1191e33-c311-4d5a-b907-db2540d53c59", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',\n", + " 'meters_elapsed', 'sec_elapsed', 'speed_mph', 'name',\n", + " 'caltrans_district', 'organization_source_record_id',\n", + " 'organization_name', 'base64_url', 'route_name', 'geometry'],\n", + " dtype='object')" + ] + }, + "execution_count": 135, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_avg_speeds_og.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "id": "586d9d8f-df66-43a6-ba70-00b8652e7697", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',\n", + " 'meters_elapsed', 'sec_elapsed', 'speed_mph', 'name',\n", + " 'caltrans_district', 'organization_source_record_id',\n", + " 'organization_name', 'base64_url', 'route_name', 'geometry'],\n", + " dtype='object')" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "avg_speeds_with_geom.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09f5b44b-5c7b-433d-aab7-3ccb4f86718f", + "metadata": {}, + "outputs": [], + "source": [ + "df_avg_speeds = avg_speeds_with_geom.copy()" + ] + }, + { + "cell_type": "markdown", + "id": "6bc456bb-8f26-429e-acc9-0cf80b380f76", + "metadata": {}, + "source": [ + "#### Dataframe in line 307 `df_rt_sched` in `gtfs_digest/merge_data`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17ac16fc-791a-4b6d-ade1-73bd278bfbe3", + 
"metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(\n", + " analysis_date_list\n", + ").astype({\"direction_id\": \"float\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00fa91ac-5933-47e8-9254-6e0200b51b23", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched2 = df_rt_sched.loc[df_rt_sched.schedule_gtfs_dataset_key.isin(schd_keys)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef91794e-3a0d-40d2-9b0b-45f86f9fc636", + "metadata": {}, + "outputs": [], + "source": [ + "preview(df_rt_sched2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fb3db36-9f49-4c0a-a725-b631f83ff1fd", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched2.head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "37e4e50d-e658-4759-861d-32f123d7d7db", + "metadata": {}, + "source": [ + "##### `df_rt_sched` is created using [`rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`](https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8134248e-c1c4-42eb-b609-dd7cf266fc6c", + "metadata": {}, + "outputs": [], + "source": [ + "[*GTFS_DATA_DICT[\"stop_segments\"][\"route_dir_cols\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a8b3cc0-24c0-4a07-affc-797542ba6d03", + "metadata": {}, + "outputs": [], + "source": [ + "dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables" + ] + }, + { + "cell_type": "markdown", + "id": "d0e6de74-24ad-4c09-8de5-cc25e6dea8e9", + "metadata": {}, + "source": [ + "##### `route_metrics` in `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f40d89aa-5b32-481d-a86c-eb18e0c94e63", + "metadata": {}, + "outputs": [], + "source": [ + "TRIP_EXPORT = dict_inputs.vp_trip_metrics" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "117a707c-28c1-4f38-ab07-c331ae35c916", + "metadata": {}, + "outputs": [], + "source": [ + "ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics" + ] + }, + { + "cell_type": "markdown", + "id": "db216b8a-5ee0-4ea6-8238-a0e89a8b4a11", + "metadata": {}, + "source": [ + "##### DONE **Everything is available in `trip_df`. Fill in Direction_id with 0.**\n", + "* Where is `trip_df` created again?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2affb77-895f-406d-a6c9-b9c595347156", + "metadata": {}, + "outputs": [], + "source": [ + "trip_df = pd.read_parquet(f\"{RT_SCHED_GCS}{TRIP_EXPORT}_{one_analysis_date}.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbbcd9b4-52b1-4860-9d67-d87b00087b4b", + "metadata": {}, + "outputs": [], + "source": [ + "trip_df2 = trip_df.loc[trip_df.schedule_gtfs_dataset_key.isin(schd_keys)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5424c44f-a4c8-45c8-8f84-e386a97b35d9", + "metadata": {}, + "outputs": [], + "source": [ + "trip_df2.direction_id = trip_df2.direction_id.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64df0dca-b76e-4024-9b98-4ad58bae2f9b", + "metadata": {}, + "outputs": [], + "source": [ + "trip_df2.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e7853f9-deba-43c4-97e9-733b0e4b5668", + "metadata": {}, + "outputs": [], + "source": [ + "preview(trip_df2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b61d5a44-a403-42ab-b7e4-b61e4ac133e4", + "metadata": {}, + "outputs": [], + "source": [ + "trip_df2.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ce4e0d1-f639-4f6e-8062-b3513feb071f", + "metadata": {}, + "outputs": [], + "source": [ + "trip_df2.loc[trip_df2.time_of_day == \"AM Peak\"].drop(\n", + " columns=[\"schedule_gtfs_dataset_key\", \"trip_instance_key\"]\n", + 
").sort_values(by=[\"route_id\"]).drop_duplicates(\n", + " subset=[\n", + " \"route_id\",\n", + " \"direction_id\",\n", + " ]\n", + ").T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "749f4347-3a7f-45a5-9d5b-5bb159c8baba", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "sys.path.append(\"../rt_scheduled_v_ran/scripts\")\n", + "import rt_v_scheduled_routes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4abfe0c-d6ac-4912-8452-c67f361930f0", + "metadata": {}, + "outputs": [], + "source": [ + "ROUTE_DIR_COLS = [*GTFS_DATA_DICT[\"stop_segments\"][\"route_dir_cols\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6936d9d7-a8e4-4029-9214-ccf3df2c409b", + "metadata": {}, + "outputs": [], + "source": [ + "crosswalk_cols = [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"name\",\n", + " \"schedule_source_record_id\",\n", + " \"base64_url\",\n", + " \"organization_source_record_id\",\n", + " \"organization_name\",\n", + " \"caltrans_district\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "d00aa8e3-a0b5-4401-8995-e087871994b2", + "metadata": {}, + "source": [ + "##### Have to break out `metrics.concatenate_peak_offpeak_allday_averages` which is in `rt_segment_speeds/segment_speed_utils/` because all of the routes are missing." 
def calculate_weighted_average_vp_schedule_metrics(
    df: pd.DataFrame,
    group_cols: list,
) -> pd.DataFrame:
    """
    Aggregate trip-level RT-vs-schedule inputs up to `group_cols`.

    The service-minute, vehicle-position and timeliness columns are summed,
    and `trip_instance_key` is counted and returned as `n_vp_trips`.

    This is the notebook copy of the function in
    `rt_segment_speeds/segment_speed_utils/metrics`, with `dropna=False`
    added so that groups whose key columns hold NaN (e.g. a missing
    `direction_id`) are kept instead of being silently dropped.
    """
    sum_cols = [
        "minutes_atleast1_vp",
        "minutes_atleast2_vp",
        "rt_service_minutes",
        "scheduled_service_minutes",
        "total_vp",
        "vp_in_shape",
        "is_early",
        "is_ontime",
        "is_late",
    ]

    count_cols = ["trip_instance_key"]

    df2 = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg({**{e: "sum" for e in sum_cols}, **{e: "count" for e in count_cols}})
        .reset_index()
        .rename(columns={"trip_instance_key": "n_vp_trips"})
    )

    return df2


def weighted_average_speeds_across_segments(
    df: pd.DataFrame, group_cols: list
) -> pd.DataFrame:
    """
    Sum `meters_elapsed` and `sec_elapsed` within `group_cols`, then pass
    the result to `segment_calcs.speed_from_meters_elapsed_sec_elapsed`.

    We can use our segments and the deltas within a trip
    to calculate the trip-level average speed, or
    the route-direction-level average speed.
    But, we want a weighted average, using the raw deltas
    instead of mean(speed_mph), since segments can be varying lengths.

    Notebook copy of the function in
    `rt_segment_speeds/segment_speed_utils/metrics`, with `dropna=False`
    added so groups with NaN keys are retained.
    """
    avg_speeds = (
        df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
        .agg(
            {
                "meters_elapsed": "sum",
                "sec_elapsed": "sum",
            }
        )
        .reset_index()
    ).pipe(segment_calcs.speed_from_meters_elapsed_sec_elapsed)

    return avg_speeds


def concatenate_peak_offpeak_allday_averages(
    df: pd.DataFrame, group_cols: list, metric_type: str
) -> pd.DataFrame:
    """
    Calculate averages for all day and for each `peak_offpeak` value,
    then concatenate them so the same time periods are always present.

    `metric_type` selects the aggregation:
      * "segment_speeds"  -> `calculate_avg_speeds`
      * "summary_speeds"  -> `weighted_average_speeds_across_segments`
      * "rt_vs_schedule"  -> `calculate_weighted_average_vp_schedule_metrics`

    The `peak_offpeak` column is renamed to `time_period` in the output;
    all-day rows carry the value "all_day".

    Raises:
        ValueError: if `metric_type` is not one of the three values above.
    """
    if metric_type == "segment_speeds":
        # NOTE(review): `calculate_avg_speeds` is not defined in this part of
        # the notebook; it must already be in scope when this branch runs.
        avg_peak = calculate_avg_speeds(df, group_cols + ["peak_offpeak"])

        avg_allday = calculate_avg_speeds(df, group_cols).assign(peak_offpeak="all_day")

    elif metric_type == "summary_speeds":
        avg_peak = weighted_average_speeds_across_segments(
            df, group_cols + ["peak_offpeak"]
        )

        avg_allday = weighted_average_speeds_across_segments(df, group_cols).assign(
            peak_offpeak="all_day"
        )

    elif metric_type == "rt_vs_schedule":
        avg_peak = calculate_weighted_average_vp_schedule_metrics(
            df, group_cols + ["peak_offpeak"]
        )

        avg_allday = calculate_weighted_average_vp_schedule_metrics(
            df, group_cols
        ).assign(peak_offpeak="all_day")

    else:
        # Fail loudly here: previously this branch only printed and execution
        # fell through to pd.concat with avg_peak / avg_allday unbound.
        raise ValueError(
            f"Invalid metric_type {metric_type!r}. "
            "Valid metric types: ['segment_speeds', 'summary_speeds', 'rt_vs_schedule']"
        )

    # Concatenate so that every segment has 3 time periods: peak, offpeak, and all_day
    avg_metrics = pd.concat([avg_peak, avg_allday], axis=0, ignore_index=True).rename(
        columns={"peak_offpeak": "time_period"}
    )

    return avg_metrics
+ "outputs": [], + "source": [ + "preview(route_metrics_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30b4fe2b-47b4-4bef-9c03-ee8d8e163452", + "metadata": {}, + "outputs": [], + "source": [ + "route_metrics_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c9011ce-c7f9-4eee-bf89-406f97b3dddc", + "metadata": {}, + "outputs": [], + "source": [ + "route_metrics_df.route_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a02de0-168a-420c-ae0f-063e0824eea7", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "route_metrics_df.loc[route_metrics_df.route_id == \"CC\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce66eab0-999b-48d2-9f5c-c54c9e1d09de", + "metadata": {}, + "outputs": [], + "source": [ + "route_metrics_df = route_metrics_df.pipe(metrics.derive_rt_vs_schedule_metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfe011a8-52ac-4240-a094-8bf270fdb420", + "metadata": {}, + "outputs": [], + "source": [ + "route_metrics_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1378f515-fa23-44b4-b556-f3eaaeaf077b", + "metadata": {}, + "outputs": [], + "source": [ + "route_metrics_df2 = route_metrics_df.pipe(rt_v_scheduled_routes.average_rt_trip_times)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca5edbfd-cd6b-4bc9-9074-4e14a80be2dd", + "metadata": {}, + "outputs": [], + "source": [ + "route_metrics_df2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44fd4153-2f1f-41fa-8262-cf7ea5200f97", + "metadata": {}, + "outputs": [], + "source": [ + "route_metrics_df3 = gtfs_schedule_wrangling.merge_operator_identifiers(\n", + " route_metrics_df2, [one_analysis_date], columns=crosswalk_cols\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"ef670a71-4496-44fe-9d66-8c6e51315993", + "metadata": {}, + "outputs": [], + "source": [ + "route_metrics_df3.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c3af40a-c9d4-475a-b22e-687a07617994", + "metadata": {}, + "outputs": [], + "source": [ + "route_metrics_df.loc[route_metrics_df.time_period == \"peak\"].drop(\n", + " columns=[\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"schedule_source_record_id\",\n", + " \"base64_url\",\n", + " \"organization_name\",\n", + " \"organization_source_record_id\",\n", + " \"caltrans_district\",\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "34f85df9-e8f8-432a-9c88-075418b41250", + "metadata": {}, + "source": [ + "##### Check that the rebuilt `df_rt_sched` matches the original `df_rt_sched_og`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6b611fa-f06d-4a35-a5bb-d8101bc4e867", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched_og = merge_data.concatenate_rt_vs_schedule_by_route_direction(\n", + " analysis_date_list\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2521cbd7-2434-42d9-b84d-c22cbe5bf518", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched = route_metrics_df3.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65f3085b-dd9a-432b-ba12-d0f916c142e5", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched_og.columns" + ] + }, + { + "cell_type": "markdown", + "id": "431622bf-0f4c-486b-8eea-83451892aa2e", + "metadata": {}, + "source": [ + "##### All these columns pop up around the step of `gtfs_schedule_wrangling.merge_operator_identifiers` because the extra columns match what is in `crosswalk_cols`?? 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "238bd321-c642-4e32-b327-85a684e7f4b2", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "152826be-d2d7-4d25-aa5f-1c48a99c0c25", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched = df_rt_sched.drop(\n", + " columns=[\n", + " \"base64_url\",\n", + " \"organization_source_record_id\",\n", + " \"organization_name\",\n", + " \"caltrans_district\",\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8365d9b3-8598-4405-b31c-0431a1fb2b39", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched_og.loc[\n", + " df_rt_sched_og.schedule_gtfs_dataset_key.isin(schd_keys)\n", + "].route_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c1572f2-017c-44f4-a138-b5341c03cefa", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched.route_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21691913-e7a2-448a-86aa-cc34c967a66c", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched.shape" + ] + }, + { + "cell_type": "markdown", + "id": "682b194e-29cb-4a6d-b321-35f7f0177774", + "metadata": {}, + "source": [ + "#### `gtfs_digest/merge_data/` line 316: `df_crosswalk`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64c14f30-577f-41ec-80e9-2bf6759f074f", + "metadata": {}, + "outputs": [], + "source": [ + "df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)" + ] + }, + { + "cell_type": "markdown", + "id": "09f4f8d9-0398-4b9d-a2eb-28289fc6d145", + "metadata": {}, + "source": [ + "#### `gtfs_digest/merge_data/merge_data_sources_by_route_direction`\n", + "* Have to make some tweaks since `df_avg_speeds2` is missing a lot of routes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63679d40-8d3e-45e8-9b10-fa391787a4f1", + "metadata": {}, + "outputs": [], + "source": [ + "service_date_datetime = pd.to_datetime(\"2024-11-13T00:00:00.000000000\")" + ] + }, + { + "cell_type": "markdown", + "id": "7932a990-d3ee-42f9-b7f3-0ca9fc8fd4d8", + "metadata": {}, + "source": [ + "##### Why are the `time_period` / `peak_offpeak` values different between `df_sched` and `df_rt_sched`?\n", + "* Something is wrong with `df_sched` because a lot of `time_period` values are missing.\n", + "##### Amanda, test: fill in the NaNs in `time_period` with `peak_offpeak`\n", + "* Might this also explain why all the routes are missing in Nov/Dec?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bb1555f-cc0b-411d-bfc2-167c4f8d8859", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "df_rt_sched[[\"route_id\", \"time_period\", \"direction_id\"]].drop_duplicates().sort_values(\n", + " by=[\"route_id\", \"direction_id\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d676de3a-0322-43d6-847a-e4df00e59c32", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "df_sched[[\"route_id\", \"time_period\", \"direction_id\"]].drop_duplicates().sort_values(\n", + " by=[\"route_id\", \"direction_id\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39fa369e-2482-4337-b130-08b8a799b5d8", + "metadata": {}, + "outputs": [], + "source": [ + "df_sched[\"service_date\"] = service_date_datetime" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "825126f3-4363-45d4-9e09-ac80f850f19e", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched[\"service_date\"] = service_date_datetime" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8ac692d-38e7-46c4-bffb-83663f81aa96", + "metadata": {}, + "outputs": [], + "source": [ + 
"df_avg_speeds[\"service_date\"] = service_date_datetime" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "febaf0e2-4328-4f27-b698-0b3ad6824369", + "metadata": {}, + "outputs": [], + "source": [ + "# merge1 = merge_data.merge_data_sources_by_route_direction(\n", + "# route_dir_metrics2,\n", + "# df_rt_sched,\n", + "# df_avg_speeds2,\n", + "# df_crosswalk\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd262bde-0d33-430d-aab9-71c5bcab2741", + "metadata": {}, + "outputs": [], + "source": [ + "primary_typology = merge_data.set_primary_typology(route_dir_metrics2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14979024-1101-472f-909a-5e3aca484f9c", + "metadata": {}, + "outputs": [], + "source": [ + "primary_typology.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6fc2e2c-5525-4af3-8c8f-0f9046fc54a6", + "metadata": {}, + "outputs": [], + "source": [ + "route_time_cols = [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"route_id\",\n", + " \"direction_id\",\n", + " \"time_period\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d46d5ef4-81b5-45c5-8aaa-398e70472aca", + "metadata": {}, + "outputs": [], + "source": [ + "df_schedule2 = pd.merge(df_sched, primary_typology, on=route_time_cols, how=\"left\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "881ccf54-00a2-4de9-b660-bfb83a749da4", + "metadata": {}, + "outputs": [], + "source": [ + "df_schedule2.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8902f56-2f31-4c6a-be94-f3da7ef66645", + "metadata": {}, + "outputs": [], + "source": [ + "route_time_cols" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8979573-6e86-4660-ac9d-8dfe2cb8a624", + "metadata": {}, + "outputs": [], + "source": [ + "df_schedule2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"5345717b-819a-43a9-b757-5770795dc75f", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "007774e5-d6ab-43aa-8cc8-ecda0ac30e6d", + "metadata": {}, + "outputs": [], + "source": [ + "df_avg_speeds.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b403a4ab-1dbd-4e6b-bfe8-f4ab9fdf4e2a", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.merge(\n", + " df_schedule2,\n", + " df_rt_sched,\n", + " on=route_time_cols + [\"service_date\"],\n", + " how=\"outer\",\n", + " indicator=\"sched_rt_category\",\n", + ").merge(\n", + " df_avg_speeds,\n", + " on=route_time_cols + [\"service_date\"],\n", + " how=\"outer\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9e74a489-63e9-44ce-bd04-7fa5b140b1f9", + "metadata": {}, + "source": [ + "##### Check that all the routes are here." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcf2314a-31ae-4b73-93b7-68d683dae795", + "metadata": {}, + "outputs": [], + "source": [ + "df.route_id.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe2a49fc-947e-4daf-86fd-b09062cfb9ab", + "metadata": {}, + "outputs": [], + "source": [ + "df.route_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e25dab9a-13ae-4dce-ae13-ba4eaaa2307d", + "metadata": {}, + "outputs": [], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce2255dd-ca78-490a-9bb4-9926a8c18d12", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = df.assign(\n", + " sched_rt_category=df.sched_rt_category.map(\n", + " gtfs_schedule_wrangling.sched_rt_category_dict\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12d1522c-4338-4355-8185-50ab74745d77", + "metadata": {}, + "outputs": [], + "source": [ + "df2.columns" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "c7460296-a3fd-46f2-b450-835d1705076d", + "metadata": {}, + "outputs": [], + "source": [ + "df3 = df2.pipe(\n", + " merge_data.merge_in_standardized_route_names,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91241657-7be4-4967-9e55-7b18a8659529", + "metadata": {}, + "outputs": [], + "source": [ + "df3.columns" + ] + }, + { + "cell_type": "markdown", + "id": "9220b9cc-7683-479b-a288-1832d414ea17", + "metadata": {}, + "source": [ + "###### Extra columns are popping up?? Detailed below. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a289464-c5bc-4691-b87b-ce88a8960f07", + "metadata": {}, + "outputs": [], + "source": [ + "drop_cols = [\n", + " \"schedule_source_record_id\",\n", + " \"base64_url\",\n", + " \"organization_source_record_id\",\n", + " \"organization_name\",\n", + " \"caltrans_district\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47440343-04d9-4f00-adfc-22a6ad288763", + "metadata": {}, + "outputs": [], + "source": [ + "df4 = pd.merge(\n", + " df3.drop(columns=drop_cols),\n", + " df_crosswalk,\n", + " on=[\"schedule_gtfs_dataset_key\", \"name\", \"service_date\"],\n", + " how=\"left\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73244d16-3ca8-4d70-934b-4d2c6b8907f6", + "metadata": {}, + "outputs": [], + "source": [ + "df4.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b3ef87b-f4b6-49b3-8cbe-0f7011ee022f", + "metadata": {}, + "outputs": [], + "source": [ + "df4.route_id.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "2319efc6-c98e-4f3e-93ec-6160381af404", + "metadata": {}, + "source": [ + "##### Lots of repeated columns...why!!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94eb667e-ebd4-489c-80b7-4419eb670a45", + "metadata": {}, + "outputs": [], + "source": [ + "df4.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37d5c006-b5cb-4626-b22a-5e8d557ea6be", + "metadata": {}, + "outputs": [], + "source": [ + "df5 = df4.pipe(\n", + " # Find the most common cardinal direction\n", + " gtfs_schedule_wrangling.top_cardinal_direction\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1515317b-2747-4658-abcf-495a520b511f", + "metadata": {}, + "source": [ + "#### Observations\n", + "* There are no typologies for these previously missing routes.\n", + "* `route_primary_direction` and `direction_id` are empty for all of City of Santa Maria.\n", + "* `route_id` values are repeated; something got messed up during the merges.\n", + "* I have an extra column for `peak_offpeak` and `time_period`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d6b6853-10cf-4a99-8b34-63e8704e4874", + "metadata": {}, + "outputs": [], + "source": [ + "df5.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbddc7d0-9c03-4563-b799-36529a257203", + "metadata": {}, + "outputs": [], + "source": [ + "df[[\"time_period\", \"route_id\"]].drop_duplicates().sort_values(by=[\"route_id\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77c95343-ceb5-4e3d-b87c-a75b822ba4cf", + "metadata": {}, + "outputs": [], + "source": [ + "df5.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb1b1ea0-5aff-46ee-8c23-47d218759ecc", + "metadata": {}, + "outputs": [], + "source": [ + "preview_cols = [\n", + " \"organization_name\",\n", + " \"route_id\",\n", + " \"sched_rt_category\",\n", + " \"direction_id\",\n", + " \"route_primary_direction\",\n", + " \"avg_scheduled_service_minutes\",\n", + " \"avg_stop_miles\",\n", + " \"n_trips\",\n", + " \"time_period\",\n", + " \"frequency\",\n", + " 
\"typology\",\n", + " \"minutes_atleast1_vp\",\n", + " \"minutes_atleast2_vp\",\n", + " \"total_rt_service_minutes\",\n", + " \"total_scheduled_service_minutes\",\n", + " \"total_vp\",\n", + " \"vp_in_shape\",\n", + " \"is_early\",\n", + " \"is_ontime\",\n", + " \"is_late\",\n", + " \"n_vp_trips\",\n", + " \"vp_per_minute\",\n", + " \"pct_in_shape\",\n", + " \"pct_rt_journey_atleast1_vp\",\n", + " \"pct_rt_journey_atleast2_vp\",\n", + " \"pct_sched_journey_atleast1_vp\",\n", + " \"pct_sched_journey_atleast2_vp\",\n", + " \"rt_sched_journey_ratio\",\n", + " \"avg_rt_service_minutes\",\n", + " \"speed_mph\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5854cf7b-c98f-4a36-9c86-e17234f299ea", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "df5.groupby(\n", + " [\n", + " \"route_id\",\n", + " \"sched_rt_category\",\n", + " ]\n", + ").agg({\"organization_name\": \"count\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ef1f3e6-df31-4141-89a4-9b71f51bc1c7", + "metadata": {}, + "outputs": [], + "source": [ + "df5.loc[df5.route_id == \"1B\"][preview_cols].sort_values(\n", + " by=[\"organization_name\", \"route_id\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "712eb63d-5352-40ad-a856-1a3066f13b96", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df5.loc[df5.time_period == \"peak\"][preview_cols].sort_values(\n", + " by=[\"organization_name\", \"route_id\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9df0ccd6-5573-42c5-b31a-0fb6e95b506c", + "metadata": {}, + "outputs": [], + "source": [ + "stop" + ] + }, + { + "cell_type": "markdown", + "id": "46093153-c813-4684-a6a0-1c163589c41f", + "metadata": {}, + "source": [ + "### Fix `ROUTE_TYPOLOGIES` in `gtfs_funnel/route_typologies.py`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"229e3c18-dedc-4baa-b583-72679b06b7b6", + "metadata": {}, + "outputs": [], + "source": [ + "ROUTE_TYPOLOGIES" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86d73125-6624-45b1-8463-7d3cd4c6f613", + "metadata": {}, + "outputs": [], + "source": [ + "GTFS_DATA_DICT.schedule_tables.route_typologies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54d371c5-1143-4649-bc21-280188257722", + "metadata": {}, + "outputs": [], + "source": [ + "GTFS_DATA_DICT.schedule_tables.route_typologies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88fe3778-c61e-4b35-89dd-2d8593a739a8", + "metadata": {}, + "outputs": [], + "source": [ + "route_typologies2 = route_typologies.loc[\n", + " route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e9693a0-053e-4d9d-a250-434144929cb5", + "metadata": {}, + "outputs": [], + "source": [ + "route_typologies2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d04a785f-ee9a-4bd5-8490-aefed10e34b5", + "metadata": {}, + "outputs": [], + "source": [ + "route_dir_cols = [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"route_id\",\n", + " \"direction_id\",\n", + " \"common_shape_id\",\n", + " \"route_name\",\n", + " \"route_meters\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "98a3fbab-daba-423c-8d55-e9008d427baf", + "metadata": {}, + "source": [ + "##### Amanda: in `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling`, I filled `nan` rows in `direction_id`. 
Then I commented out parts of `gtfs_funnel/route_typologies`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95bbd6b2-9e0a-4ba4-b953-f5eb047828f9", + "metadata": {}, + "outputs": [], + "source": [ + "common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(\n", + " one_analysis_date\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f701bf69-7c72-4ec7-a13a-2aea089cd71b", + "metadata": {}, + "outputs": [], + "source": [ + "common_shape2 = common_shape.loc[common_shape.schedule_gtfs_dataset_key.isin(schd_keys)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "982eea55-9177-4031-81d7-fcc20a5b988e", + "metadata": {}, + "outputs": [], + "source": [ + "nov_typology_ah_test_df = pd.read_parquet(\n", + " \"gs://calitp-analytics-data/data-analyses/gtfs_schedule/nacto_typologies/route_typologies_AH_TESTING_2024-11-13.parquet\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bf465f8-4775-47c9-b864-857724ad739d", + "metadata": {}, + "outputs": [], + "source": [ + "nov_typology_ah_test_df.loc[\n", + " nov_typology_ah_test_df.schedule_gtfs_dataset_key.isin(schd_keys)\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "77a3f68d-1244-4306-920c-27aed2f543bd", + "metadata": { + "tags": [] + }, + "source": [ + "### Fix Map: `gtfs_digest/merge_operator_data`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb99b4b5-7745-422c-a6c5-153f02ffc244", + "metadata": {}, + "outputs": [], + "source": [ + "OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles\n", + "OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55faff71-f82c-46fc-a99d-dcc40205e100", + "metadata": {}, + "outputs": [], + "source": [ + "operator_route_gdf = gpd.read_parquet(\n", + " f\"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet\",\n", + ")" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "9b2c852f-f053-406a-8274-8b4f015f10c9", + "metadata": {}, + "outputs": [], + "source": [ + "operator_route_gdf.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89ccde0b-736c-4fc9-a294-8a12116823a8", + "metadata": {}, + "outputs": [], + "source": [ + "operator_route_gdf2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "295aaf35-9ade-4f9e-bc4d-5b8ef95a1569", + "metadata": {}, + "outputs": [], + "source": [ + "len(operator_route_gdf2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5630aaaa-dc8b-4917-b9fa-ae0924999720", + "metadata": {}, + "outputs": [], + "source": [ + "operator_route_gdf2.is_rail.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ecc56aa-63ce-402b-8136-a847fd5c0d11", + "metadata": {}, + "outputs": [], + "source": [ + "operator_route_gdf2.organization_name.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5798606e-2ea4-4ab0-a6d8-a5597a51e66f", + "metadata": {}, + "outputs": [], + "source": [ + "operator_route_gdf2.schedule_gtfs_dataset_key.unique()" + ] + }, + { + "cell_type": "markdown", + "id": "26d11950-fca8-4f5b-8d17-2b9fa0aa368c", + "metadata": {}, + "source": [ + "#### Why does City of Santa Maria have multiple schedule_gtfs_dataset_keys?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81fbd586-cc2d-4a70-97a6-5b25228684b8", + "metadata": {}, + "outputs": [], + "source": [ + "operator_route_gdf2.groupby([\"organization_name\", \"schedule_gtfs_dataset_key\"]).agg(\n", + " {\"route_short_name\": \"nunique\"}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "568e2a00-8f8c-451c-8b6d-ae331d18471c", + "metadata": {}, + "outputs": [], + "source": [ + "operator_route_gdf2.drop(columns=[\"service_date\"]).explore(\"organization_name\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd466515-a3cd-473a-a01a-2e73f9507104", "metadata": {}, "outputs": [], "source": [ - "longest_shape_gdf2 = longest_shape_gdf2.dropna()" + "# operator_route_gdf2.drop(columns = [\"service_date\"]).explore(\"shape_array_key\")" + ] + }, + { + "cell_type": "markdown", + "id": "b1ddfdee-292e-4d57-bb1e-17248e87fce8", + "metadata": {}, + "source": [ + "#### Starting from here [`gtfs_funnel/operator_scheduled_stats`](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/gtfs_funnel/operator_scheduled_stats.py#L148)" ] }, { "cell_type": "code", - "execution_count": 28, - "id": "50af7a1e-4e0c-4e5e-9755-f9ffbab99a8b", + "execution_count": null, + "id": "d14199f0-63e5-466c-a122-51b2c2abaa75", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "longest_shape_gdf2.shape_array_key.nunique()" + "analysis_date = \"2024-11-13\"" ] }, { "cell_type": "code", - "execution_count": 29, - "id": "94bc9b1f-8a73-4c7a-a773-cc61b843b6a7", + "execution_count": null, + "id": "7a908db1-ddaa-41f9-b0dd-41b0a6046ad6", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "f5a749dd65924e025b1293c58f95f8d6 3\n", - "73105f2d1cabc8170ab066d96863c5d5 1\n", - "Name: schedule_gtfs_dataset_key, dtype: 
int64" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c57add2-f72c-4c22-9ca6-e5efe879cab3", + "metadata": {}, + "outputs": [], "source": [ - "longest_shape_gdf2.schedule_gtfs_dataset_key.value_counts()" + "schd_keys = list(operator_route_gdf2.schedule_gtfs_dataset_key.unique())" ] }, { "cell_type": "code", - "execution_count": 30, - "id": "19012a85-7ef7-4188-870b-251eb600034f", + "execution_count": null, + "id": "7a76026d-88e6-49a3-83f8-b20836b70d7a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "a97faf62-2c23-428a-a2fa-23cb8fb7f11e", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['73105f2d1cabc8170ab066d96863c5d5',\n", - " 'f5a749dd65924e025b1293c58f95f8d6'], dtype=object)" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "longest_shape_gdf2.schedule_gtfs_dataset_key.unique()" + "#### Longest shape does have all the routes..." ] }, { "cell_type": "code", - "execution_count": 31, - "id": "1eae8307-2d99-41e4-b541-ca2fc1c68b02", + "execution_count": null, + "id": "33484c6b-1422-42f8-918e-a7aa70531aa3", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "longest_shape_gdf2.explore(\"schedule_gtfs_dataset_key\")" + "longest_shape_gdf = operator_scheduled_stats.longest_shape_by_route(analysis_date)" ] }, { - "cell_type": "markdown", - "id": "0706fb58-e04e-4d40-b49b-d505da875262", + "cell_type": "code", + "execution_count": null, + "id": "3bcb40ca-7e6a-432e-a70c-e1817f7eebe9", "metadata": {}, + "outputs": [], "source": [ - "### Step back before finding the longest shape [here](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py#L365)\n", - "#### Amanda: deleted `direction_id` b/c I discovered City of Santa Maria doesn't have values for the column `direction_id`" + "longest_shape_gdf2 = longest_shape_gdf.loc[\n", + " longest_shape_gdf.schedule_gtfs_dataset_key.isin(schd_keys)\n", + "]" ] }, { "cell_type": "code", - "execution_count": 32, - "id": "0f1f1160-fd82-4e35-adaa-914aac83ac85", + "execution_count": null, + "id": "86963a9f-3456-48d5-a386-05c211fe93f4", "metadata": {}, "outputs": [], "source": [ - "route_dir_cols = [\"gtfs_dataset_key\", \"route_id\", ]\n", - "\n", - "keep_trip_cols = route_dir_cols + [\"trip_instance_key\", \"shape_id\", \"shape_array_key\"]" + "longest_shape_gdf2.columns" ] }, { "cell_type": "code", - "execution_count": 33, - "id": "f332d9b6-d1f8-456d-b3bf-495651b17214", + "execution_count": null, + "id": "b256ef9b-82c1-4832-ac54-19ca9319bdc4", "metadata": {}, "outputs": [], "source": [ - "trips = helpers.import_scheduled_trips(\n", - " analysis_date, columns=keep_trip_cols, get_pandas=True\n", - ").rename(columns={\"schedule_gtfs_dataset_key\": \"gtfs_dataset_key\"})" + "longest_shape_gdf2.info()" ] }, { "cell_type": "code", - "execution_count": 34, - "id": "5e133db6-9df8-42d4-ad84-6d158514f045", + "execution_count": null, + "id": 
"6db42351-2a52-4e00-a265-33e5743cdea2", "metadata": {}, "outputs": [], "source": [ - "sorting_order = [True for i in route_dir_cols]" + "longest_shape_gdf2.route_id.value_counts()" ] }, { "cell_type": "code", - "execution_count": 35, - "id": "2407cc5f-b8c7-416e-b38b-8dc54a90ed30", + "execution_count": null, + "id": "ca9a6950-6b83-4ec7-bc55-cdc79f3a0843", "metadata": {}, "outputs": [], "source": [ - "# Grab only relevant schedule_gtfs_dataset_keys\n", - "trips2 = trips.loc[trips.gtfs_dataset_key.isin(schd_keys)].reset_index(drop=True)" + "# longest_shape_gdf2.explore(\"schedule_gtfs_dataset_key\")" ] }, { "cell_type": "code", - "execution_count": 54, - "id": "373584a8-a3b4-414a-8d95-7e1e08c61fa7", + "execution_count": null, + "id": "6dfe2c94-098a-4816-8255-278b85a43f0b", + "metadata": {}, + "outputs": [], + "source": [ + "longest_shape_gdf2.groupby([\"schedule_gtfs_dataset_key\", \"route_id\"]).agg(\n", + " {\"route_length_miles\": \"max\"}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "69d6cb38-2073-4119-bff6-bdb777038b43", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 335 entries, 0 to 334\n", - "Data columns (total 5 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 gtfs_dataset_key 335 non-null object\n", - " 1 route_id 335 non-null object\n", - " 2 trip_instance_key 335 non-null object\n", - " 3 shape_id 335 non-null object\n", - " 4 shape_array_key 335 non-null object\n", - "dtypes: object(5)\n", - "memory usage: 13.2+ KB\n" - ] - } - ], "source": [ - "trips2.info()" + "#### Somewhere along the way the routes are cut...maybe b/c of `direction_id`" ] }, { "cell_type": "code", - "execution_count": 55, - "id": "521a90b8-c28e-42eb-b61d-231b11594db6", + "execution_count": null, + "id": "859ad004-4b41-41d3-9da7-0c5524daa98e", "metadata": {}, "outputs": [], "source": [ - "direction_id_kept = 
[\"direction_id\",\"gtfs_dataset_key\",\"route_id\", \"trip_instance_key\", \"shape_id\", \"shape_array_key\"]" + "OPERATOR_EXPORT = GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats" ] }, { "cell_type": "code", - "execution_count": 56, - "id": "2b0a5d50-b5df-4e3e-b4af-6cee0ceb9498", + "execution_count": null, + "id": "c6e5a1f3-4aba-40ec-811b-7ea6c1e01655", "metadata": {}, "outputs": [], "source": [ - "trips_w_direction_id = helpers.import_scheduled_trips(\n", - " analysis_date, columns=direction_id_kept, get_pandas=True\n", - ").rename(columns={\"schedule_gtfs_dataset_key\": \"gtfs_dataset_key\"})" + "SCHED_GCS" ] }, { "cell_type": "code", - "execution_count": 58, - "id": "24feb45b-154d-4f0c-8c1e-0b5af20d8926", + "execution_count": null, + "id": "537949dd-a008-4643-bbd1-de0dc142026b", "metadata": {}, "outputs": [], "source": [ - "# Grab only relevant schedule_gtfs_dataset_keys\n", - "trips_w_direction_id2 = trips_w_direction_id.loc[trips_w_direction_id.gtfs_dataset_key.isin(schd_keys)].reset_index(drop=True)" + "GTFS_DATA_DICT.schedule_tables.operator_routes" ] }, { "cell_type": "code", - "execution_count": 59, - "id": "5964848f-970b-48bc-868e-0d1ff9e0c1e5", + "execution_count": null, + "id": "4ac524dc-d27e-43b5-94ad-e8390d5c7f0f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 335 entries, 0 to 334\n", - "Data columns (total 6 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 direction_id 75 non-null float64\n", - " 1 gtfs_dataset_key 335 non-null object \n", - " 2 route_id 335 non-null object \n", - " 3 trip_instance_key 335 non-null object \n", - " 4 shape_id 335 non-null object \n", - " 5 shape_array_key 335 non-null object \n", - "dtypes: float64(1), object(5)\n", - "memory usage: 15.8+ KB\n" - ] - } - ], + "outputs": [], "source": [ - "trips_w_direction_id2.info()" + "dec_url = 
\"gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11.parquet\"" ] }, { "cell_type": "code", - "execution_count": 36, - "id": "9beb666c-5e6d-4db8-94fb-e7faf717074c", + "execution_count": null, + "id": "e8b5d8f7-f6cf-4b24-a77b-685bfd444966", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "335" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "len(trips2)" + "dec_df = gpd.read_parquet(dec_url)" ] }, { "cell_type": "code", - "execution_count": 37, - "id": "c154dd13-8561-4909-a77c-cec19fc963c6", + "execution_count": null, + "id": "bb5c8db7-baa1-4b4a-a018-60c419e48343", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "73105f2d1cabc8170ab066d96863c5d5 278\n", - "f5a749dd65924e025b1293c58f95f8d6 57\n", - "Name: gtfs_dataset_key, dtype: int64" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "trips2.gtfs_dataset_key.value_counts()" + "dec_df.organization_name.value_counts().head()" ] }, { "cell_type": "code", - "execution_count": 39, - "id": "f6b3b67f-2173-453c-879a-e9c9fa515466", + "execution_count": null, + "id": "e4037d38-d6f7-4471-875c-3471d0219bfe", "metadata": {}, "outputs": [], "source": [ - "most_common_shape = (\n", - " trips2.groupby(\n", - " route_dir_cols + [\"shape_id\", \"shape_array_key\"],\n", - " observed=True,\n", - " group_keys=False,\n", - " )\n", - " .agg({\"trip_instance_key\": \"count\"})\n", - " .reset_index()\n", - " .sort_values(\n", - " route_dir_cols + [\"trip_instance_key\"], ascending=sorting_order + [False]\n", - " )\n", - " .drop_duplicates(subset=route_dir_cols)\n", - " .reset_index(drop=True)[route_dir_cols + [\"shape_id\", \"shape_array_key\"]]\n", - ").rename(\n", - " columns={\n", - " \"gtfs_dataset_key\": \"schedule_gtfs_dataset_key\",\n", - " \"shape_id\": \"common_shape_id\",\n", - " }\n", - ")" + 
"dec_df.loc[\n", + " dec_df.organization_name == \"Alameda-Contra Costa Transit District\"\n", + "].head().drop(columns=[\"geometry\"]).T" ] }, { "cell_type": "code", - "execution_count": 40, - "id": "26c4100c-33bb-4075-baf5-a02a8c791bf6", + "execution_count": null, + "id": "19012a85-7ef7-4188-870b-251eb600034f", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pandas.core.frame.DataFrame" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "type(most_common_shape)" + "dec_df2 = dec_df.loc[dec_df.schedule_gtfs_dataset_key.isin(schd_keys)]" ] }, { "cell_type": "code", - "execution_count": 41, - "id": "dcfa378c-1417-4be2-bfd6-dbafdb6c771a", + "execution_count": null, + "id": "1eae8307-2d99-41e4-b541-ca2fc1c68b02", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "20" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "len(most_common_shape)" + "dec_df2.shape" ] }, { "cell_type": "code", - "execution_count": 43, - "id": "3385337d-35cf-4bf7-a56b-f9663b9f27d5", + "execution_count": null, + "id": "5c31f929-f77e-4fb3-a571-3b0577c6d3f5", "metadata": {}, "outputs": [], "source": [ - "shape_geom = helpers.import_scheduled_shapes(\n", - " analysis_date,\n", - " columns=[\"shape_array_key\", \"geometry\"],\n", - ")" + "type(dec_df2)" ] }, { "cell_type": "code", - "execution_count": 44, - "id": "3d8b4211-aa87-458f-a1fa-3a777b888657", + "execution_count": null, + "id": "0751e612-789f-4dcc-b771-0b6af7960ff7", "metadata": {}, "outputs": [], "source": [ - "common_shape_geom = pd.merge(\n", - " shape_geom, most_common_shape, on=\"shape_array_key\", how=\"inner\"\n", - ").drop(columns=\"shape_array_key\")" + "dec_df2.drop(columns=[\"geometry\"]).T" ] }, { "cell_type": "code", - "execution_count": 45, - "id": "992b5d9c-f5b7-4270-8d3d-ee2e7079e5e9", + "execution_count": null, + "id": 
"45eb690e-e5a8-4798-885a-a5a738bc8062", + "metadata": {}, + "outputs": [], + "source": [ + "# dec_df2.explore()" + ] + }, + { + "cell_type": "markdown", + "id": "ec619dd6-f042-492f-8b87-4adaf435241d", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(geopandas.geodataframe.GeoDataFrame, 20)" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "type(common_shape_geom), len(common_shape_geom)" + "#### Find where in `gtfs_funnel` all the routes disappear" ] }, { "cell_type": "code", - "execution_count": 46, - "id": "28a37ddd-52e8-4a91-9b9d-8be0f22f8e5e", + "execution_count": null, + "id": "b9e70fd7-3fa0-4d9f-899b-e1e3eea03151", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'common_shape_id'], dtype='object')" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "common_shape_geom.columns" + "group_cols = [\"schedule_gtfs_dataset_key\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fbc60bb-62bc-4a66-956e-7a4c5ea22371", + "metadata": {}, + "outputs": [], + "source": [ + "longest_shape_gdf2.info()" ] }, { - "cell_type": "code", - "execution_count": 47, - "id": "aa870e11-8b3a-421a-a304-409faa9bcede", + "cell_type": "markdown", + "id": "f2c71b31-c05c-40e2-a70b-d1f1276ecf99", "metadata": {}, - "outputs": [], "source": [ - "from shared_utils import portfolio_utils" + "#### something is going on in `operator_scheduled_stats.schedule_stats_by_operator`" ] }, { "cell_type": "code", - "execution_count": 48, - "id": "20e4a0e3-f9c0-497e-96ff-97262cc21ff3", + "execution_count": null, + "id": "6a006462-1cd0-4b9e-9209-e8658964adc7", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "73105f2d1cabc8170ab066d96863c5d5 17\n", - "f5a749dd65924e025b1293c58f95f8d6 3\n", - "Name: schedule_gtfs_dataset_key, dtype: int64" 
- ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "common_shape_geom.schedule_gtfs_dataset_key.value_counts()" + "ROUTE_TYPOLOGY = GTFS_DATA_DICT.schedule_tables.route_typologies" ] }, { "cell_type": "code", - "execution_count": 49, - "id": "ac28e91a-a805-4c29-a54e-14a4c27c46fd", + "execution_count": null, + "id": "9a330bd8-450f-4a84-8a74-3e07bbffdcf1", "metadata": {}, "outputs": [], "source": [ - "route_info = (\n", - " helpers.import_scheduled_trips(\n", - " analysis_date,\n", - " columns=[\n", - " \"gtfs_dataset_key\",\n", - " \"route_id\",\n", - " \"route_long_name\",\n", - " \"route_short_name\",\n", - " \"route_desc\",\n", - " ],\n", - " )\n", - " .drop_duplicates()\n", - " .pipe(portfolio_utils.add_route_name)\n", - " .drop(columns=[\"route_long_name\", \"route_short_name\", \"route_desc\"])\n", - ")" + "route_typology = pd.read_parquet(f\"{SCHED_GCS}{ROUTE_TYPOLOGY}_{analysis_date}.parquet\")" ] }, { "cell_type": "code", - "execution_count": 50, - "id": "97d0c027-a95a-44c9-969a-ffa3a4e6b1c6", + "execution_count": null, + "id": "2e334d8c-5377-4ec3-8f78-a28c112429c7", "metadata": {}, "outputs": [], "source": [ - "common_shape_geom2 = pd.merge(\n", - " common_shape_geom,\n", - " route_info.rename(columns={\"route_name_used\": \"route_name\"}),\n", - " on=[\"schedule_gtfs_dataset_key\", \"route_id\"],\n", - ")" + "from route_typologies import route_typologies" ] }, { "cell_type": "code", - "execution_count": 51, - "id": "83f42ec6-2235-47db-a687-dd80bd24ffcd", + "execution_count": null, + "id": "d89002ae-c699-46df-8e1f-cd3eb74c3158", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
schedule_gtfs_dataset_keyroute_idcommon_shape_idroute_name
073105f2d1cabc8170ab066d96863c5d57715be44b-4dee-4c56-83f8-b1970d6133cfRt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.
173105f2d1cabc8170ab066d96863c5d56de042d01-f50a-4b67-ba25-4628643021faRt 6. Oak Knolls to Old Orcutt-East to West-Outbound
273105f2d1cabc8170ab066d96863c5d58882010e7-d331-4518-b31f-3944c689ac17Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.
373105f2d1cabc8170ab066d96863c5d5Malldf0838ab-9999-4118-a599-852164ed2471Mall Shuttle
473105f2d1cabc8170ab066d96863c5d512X14db961b-0cc1-4916-b366-ba0784592fb812X Broadway/Orcutt Express
5f5a749dd65924e025b1293c58f95f8d6CC5cf6811a-2f53-4199-b315-4408eb816e82Daily train service between Auburn, Sacramento, Oakland and San Jose
673105f2d1cabc8170ab066d96863c5d513X077be56b-8745-4f65-acec-eda2e39cccf713X Transit Center/PVHS/N. Broadway
773105f2d1cabc8170ab066d96863c5d511fe7d3b5b-6aed-4f53-9f9c-b582942157dbR11. Transit Center to Gov't Center via S. Broadway
873105f2d1cabc8170ab066d96863c5d53021e2ec94-9952-4f8e-8515-8332c94e8b55Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc
9f5a749dd65924e025b1293c58f95f8d6Shuttlee70e22aa-8bb7-44eb-b12c-f98aeb8f61b7Shuttle to Auburn
1073105f2d1cabc8170ab066d96863c5d58a7c42f9-51e4-4848-bf88-30c210f149adab03b79f-f4a9-4a61-895c-f9e98311322fRt 11. Transit Center to Gov't Center via S. Broadway
1173105f2d1cabc8170ab066d96863c5d5233e31c53-87d1-4cae-930a-d0c26ed8d9e7Rt 2. Transit Center to PVH School via Western., Donovan Rd
1273105f2d1cabc8170ab066d96863c5d53eb560457-7bcf-4989-a293-d134546cc289Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln.
1373105f2d1cabc8170ab066d96863c5d51B6cdb20fb-9413-4ed1-abc3-4d6b3bbe2f02Rt 1. Transit Ctr to Preisker Park Via N. Broadway
1473105f2d1cabc8170ab066d96863c5d5200f836575-8fe7-4d67-8ee1-86a0d86c57b7Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB
15f5a749dd65924e025b1293c58f95f8d6SFejnnShuttle to San Francisco Transbay Terminal
1673105f2d1cabc8170ab066d96863c5d55fd9d7de5-ae77-4fa8-8545-a1dc02117126Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way
1773105f2d1cabc8170ab066d96863c5d54709dca08-c50f-489b-9814-9a220627172fRt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd.
1873105f2d1cabc8170ab066d96863c5d59d05481c2-ba1b-484f-a859-36fdaa827487Rt 9. Transit Center to PVH via Alvin Ave.
1973105f2d1cabc8170ab066d96863c5d516341a660-d9c2-45d7-aee9-fcef64b4fa3bRt 1. Transit Ctr to Preisker Park Via N. Broadway
\n", - "
" - ], - "text/plain": [ - " schedule_gtfs_dataset_key route_id \\\n", - "0 73105f2d1cabc8170ab066d96863c5d5 7 \n", - "1 73105f2d1cabc8170ab066d96863c5d5 6 \n", - "2 73105f2d1cabc8170ab066d96863c5d5 8 \n", - "3 73105f2d1cabc8170ab066d96863c5d5 Mall \n", - "4 73105f2d1cabc8170ab066d96863c5d5 12X \n", - "5 f5a749dd65924e025b1293c58f95f8d6 CC \n", - "6 73105f2d1cabc8170ab066d96863c5d5 13X \n", - "7 73105f2d1cabc8170ab066d96863c5d5 11 \n", - "8 73105f2d1cabc8170ab066d96863c5d5 30 \n", - "9 f5a749dd65924e025b1293c58f95f8d6 Shuttle \n", - "10 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n", - "11 73105f2d1cabc8170ab066d96863c5d5 2 \n", - "12 73105f2d1cabc8170ab066d96863c5d5 3 \n", - "13 73105f2d1cabc8170ab066d96863c5d5 1B \n", - "14 73105f2d1cabc8170ab066d96863c5d5 20 \n", - "15 f5a749dd65924e025b1293c58f95f8d6 SF \n", - "16 73105f2d1cabc8170ab066d96863c5d5 5 \n", - "17 73105f2d1cabc8170ab066d96863c5d5 4 \n", - "18 73105f2d1cabc8170ab066d96863c5d5 9 \n", - "19 73105f2d1cabc8170ab066d96863c5d5 1 \n", - "\n", - " common_shape_id \\\n", - "0 715be44b-4dee-4c56-83f8-b1970d6133cf \n", - "1 de042d01-f50a-4b67-ba25-4628643021fa \n", - "2 882010e7-d331-4518-b31f-3944c689ac17 \n", - "3 df0838ab-9999-4118-a599-852164ed2471 \n", - "4 14db961b-0cc1-4916-b366-ba0784592fb8 \n", - "5 5cf6811a-2f53-4199-b315-4408eb816e82 \n", - "6 077be56b-8745-4f65-acec-eda2e39cccf7 \n", - "7 fe7d3b5b-6aed-4f53-9f9c-b582942157db \n", - "8 21e2ec94-9952-4f8e-8515-8332c94e8b55 \n", - "9 e70e22aa-8bb7-44eb-b12c-f98aeb8f61b7 \n", - "10 ab03b79f-f4a9-4a61-895c-f9e98311322f \n", - "11 33e31c53-87d1-4cae-930a-d0c26ed8d9e7 \n", - "12 eb560457-7bcf-4989-a293-d134546cc289 \n", - "13 6cdb20fb-9413-4ed1-abc3-4d6b3bbe2f02 \n", - "14 0f836575-8fe7-4d67-8ee1-86a0d86c57b7 \n", - "15 ejnn \n", - "16 fd9d7de5-ae77-4fa8-8545-a1dc02117126 \n", - "17 709dca08-c50f-489b-9814-9a220627172f \n", - "18 d05481c2-ba1b-484f-a859-36fdaa827487 \n", - "19 6341a660-d9c2-45d7-aee9-fcef64b4fa3b \n", - "\n", - " 
route_name \n", - "0 Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd. \n", - "1 Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound \n", - "2 Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln. \n", - "3 Mall Shuttle \n", - "4 12X Broadway/Orcutt Express \n", - "5 Daily train service between Auburn, Sacramento, Oakland and San Jose \n", - "6 13X Transit Center/PVHS/N. Broadway \n", - "7 R11. Transit Center to Gov't Center via S. Broadway \n", - "8 Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc \n", - "9 Shuttle to Auburn \n", - "10 Rt 11. Transit Center to Gov't Center via S. Broadway \n", - "11 Rt 2. Transit Center to PVH School via Western., Donovan Rd \n", - "12 Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln. \n", - "13 Rt 1. Transit Ctr to Preisker Park Via N. Broadway \n", - "14 Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB \n", - "15 Shuttle to San Francisco Transbay Terminal \n", - "16 Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way \n", - "17 Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd. \n", - "18 Rt 9. Transit Center to PVH via Alvin Ave. \n", - "19 Rt 1. Transit Ctr to Preisker Park Via N. Broadway " - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "common_shape_geom2.drop(columns=[\"geometry\"])" + "route_typology_grouped = (\n", + " route_typology.groupby([\"schedule_gtfs_dataset_key\", \"route_id\"])\n", + " .agg({**{f\"is_{c}\": \"sum\" for c in route_typologies}})\n", + " .reset_index()\n", + ")" ] }, { "cell_type": "code", - "execution_count": 53, - "id": "3fb5cbf9-16fa-45b9-b870-1ceaea9801ba", + "execution_count": null, + "id": "82df1d3d-3afa-4fc9-bb16-578d6580a351", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "common_shape_geom2.explore(\"route_id\")" + "route_typology_grouped2 = route_typology_grouped.loc[\n", + " route_typology_grouped.schedule_gtfs_dataset_key.isin(schd_keys)\n", + "]" ] }, { "cell_type": "markdown", - "id": "55924475-3c80-4fb5-bca7-c6e1566b4af6", + "id": "a63499cd-d3dc-4425-b62c-31fe43341f38", "metadata": {}, "source": [ - "### Don't look at most common shape, just load trips.\n", - "* `f5a749dd65924e025b1293c58f95f8d6` is Amtrak\n", - "* 73105f2d1cabc8170ab066d96863c5d5 is the City of Santa Maria" + "#### Routes are missing for Santa Maria and Capitol Corridor in `ROUTE_TYPOLOGY`" ] }, { "cell_type": "code", "execution_count": null, - "id": "d66d0644-60d4-479b-b702-282fdbb31bd2", + "id": "de4c9c7f-0f3a-472b-b255-68518cbd6ddf", "metadata": {}, "outputs": [], "source": [ - "len(trips2)" + "route_typology_grouped2.T" ] }, { "cell_type": "code", "execution_count": null, - "id": "2c1c03b1-04cd-4ee5-b446-f46c7c0b7eba", + "id": "b8f73717-1c5d-4b6a-8498-cd908e6302f3", "metadata": {}, "outputs": [], "source": [ - "trips2.head(2)" + "route_gdf = longest_shape_gdf2.merge(\n", + " route_typology_grouped2, on=[\"schedule_gtfs_dataset_key\", \"route_id\"], how=\"outer\"\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "1ac9ce22-d308-4a80-b2fd-7eea7d1c4571", + "id": "fe956bad-5a37-4f3c-9e93-0e3fc86b1927", "metadata": {}, "outputs": [], "source": [ - "trips2.gtfs_dataset_key.value_counts()" + "route_gdf.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "fae140d9-aa52-43d7-a6dc-350e21db21a6", + "id": "72db2ec7-427f-4cab-b2eb-e75a2b23d856", "metadata": {}, "outputs": [], "source": [ - "test2 = pd.merge(shape_geom, trips2, on=\"shape_array_key\", how=\"inner\")" + "route_gdf.drop(columns=[\"geometry\"])" ] }, { "cell_type": "code", "execution_count": null, - "id": 
"1049f135-ff88-4ceb-a054-a412f01fe42d", + "id": "d5395dad-8d87-45aa-90fb-8d2da3a8c591", "metadata": {}, "outputs": [], "source": [ - "len(test2)" + "# route_gdf2.explore(\"schedule_gtfs_dataset_key\")" + ] + }, + { + "cell_type": "markdown", + "id": "1798ab2b-8847-4111-bcd1-9421bbfc2a4a", + "metadata": {}, + "source": [ + "#### Change merge from `inner` to `left`" ] }, { "cell_type": "code", "execution_count": null, - "id": "cd777b6b-2566-4680-afeb-a5ad99ef94a2", + "id": "e4224b34-f08c-4cc5-8853-1a588f6c59ef", "metadata": {}, "outputs": [], "source": [ - "# test2.head(1)" + "f\"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "687db4f0-df8c-4232-9c5d-dc4b6d8a5a6b", + "id": "5becb3cf-d295-4c85-9dc9-716c657ea19c", "metadata": {}, "outputs": [], "source": [ - "route_dir_cols" + "SCHED_GCS" ] }, { - "cell_type": "markdown", - "id": "81adac11-8b42-4fcf-895c-f3fc61b08fc8", + "cell_type": "code", + "execution_count": null, + "id": "6146956f-0745-4844-8a84-51ce772fb0e3", "metadata": {}, + "outputs": [], "source": [ - "### City of Santa Maria has many rows without a `direction_id` value. That is why so few routes are appearing." 
+ "GTFS_DATA_DICT.schedule_tables.operator_routes" ] }, { "cell_type": "code", "execution_count": null, - "id": "fafcfddc-660e-4bbc-bbf3-849ea5ee07c4", + "id": "017c78bf-9bbf-4a99-a867-c606c6f55858", "metadata": {}, "outputs": [], "source": [ - "test2.info()" + "my_test_url = \"gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11_AH.parquet\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "45719a9b-6e6d-4f12-bab4-19ef9a5c9b88", - "metadata": { - "scrolled": true, - "tags": [] - }, + "id": "eea9fb55-53e1-4e83-8228-fdf9a5c2cfac", + "metadata": {}, "outputs": [], "source": [ - "test2.groupby(['gtfs_dataset_key', 'route_id',\"shape_id\", \"shape_array_key\"],\n", - ").agg({\"trip_instance_key\": \"count\"})" + "f\"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet\"" ] }, { - "cell_type": "markdown", - "id": "a4052089-30a5-4448-a45d-8eddb6fe41ff", + "cell_type": "code", + "execution_count": null, + "id": "3ce8f60a-d25c-4004-9c34-c7c86bd56079", "metadata": {}, + "outputs": [], "source": [ - "### Determine common shape " + "test_gdf = gpd.read_parquet(my_test_url)" ] }, { "cell_type": "code", "execution_count": null, - "id": "68a8309b-44a0-4a83-8edf-89e52378c0ee", + "id": "890fd31a-987f-45de-85e7-e1535d7c1cf2", "metadata": {}, "outputs": [], "source": [ - "route_dir_cols.remove('direction_id')" + "test_gdf2 = test_gdf.loc[test_gdf.schedule_gtfs_dataset_key.isin(schd_keys)]" ] }, { "cell_type": "code", "execution_count": null, - "id": "6e114cae-d748-47c8-9cae-f452ec76dc12", + "id": "47514189-ff4a-43f7-9de6-85c1c1cb79a0", "metadata": {}, "outputs": [], "source": [ - "sorting_order" + "test_gdf2.explore(\"route_id\")" + ] + }, + { + "cell_type": "markdown", + "id": "648e80d6-619d-424d-8615-23d2c11f8e01", + "metadata": {}, + "source": [ + "#### Test with all the dates." 
] }, { "cell_type": "code", "execution_count": null, - "id": "49979bf5-cf8b-4e36-99c7-438a7fef3b2b", + "id": "aeaa5d4c-f34d-4a56-895b-058940878dfa", "metadata": {}, "outputs": [], "source": [ - "test3 = (\n", - " test2.groupby(\n", - " route_dir_cols + [\"shape_id\", \"shape_array_key\"],\n", - " observed=True,\n", - " group_keys=False,\n", - " )\n", - " .agg({\"trip_instance_key\": \"count\"})\n", - " .reset_index()\n", - " .sort_values(\n", - " route_dir_cols + [\"trip_instance_key\"], ascending=[True, True] + [False]\n", - " )\n", - " .drop_duplicates(subset=route_dir_cols)\n", - " .reset_index(drop=True)[route_dir_cols + [\"shape_id\", \"shape_array_key\"]]\n", - ").rename(\n", - " columns={\n", - " \"gtfs_dataset_key\": \"schedule_gtfs_dataset_key\",\n", - " \"shape_id\": \"common_shape_id\",\n", - " }\n", - ")" + "GTFS_DATA_DICT.schedule_tables.operator_routes" ] }, { "cell_type": "code", "execution_count": null, - "id": "44cb60d3-6c95-469c-8ba5-0f98ea0609f9", + "id": "a4d04121-1df4-4b78-8193-99d2843e6e89", "metadata": {}, "outputs": [], "source": [ - "test3" + "RT_SCHED_GCS" ] }, { - "cell_type": "markdown", - "id": "ef31cc38-bcab-46bd-8992-92c5cb43dd18", + "cell_type": "code", + "execution_count": null, + "id": "59e431ef-d831-408f-8f94-93434438e3ad", "metadata": {}, + "outputs": [], "source": [ - "### Drop duplicates based on route_id: lots of routes show up for Santa Maria now." 
+ "f\"{OPERATOR_ROUTE}_AH_test\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "5b4ff0ec-dc39-4ff1-ac93-2d16db6dbe26", + "id": "7a562a65-a68c-4d3b-8b35-5039b4757e6f", "metadata": {}, "outputs": [], "source": [ - "test3 = test2.drop_duplicates(subset=[\"gtfs_dataset_key\", \"route_id\"]).reset_index(\n", - " drop=True\n", - ")" + "f\"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "603e9d27-9d40-47a5-bfd1-6e061f219090", + "id": "a5a00166-be9e-46a1-bb35-ba233162ef5b", "metadata": {}, "outputs": [], "source": [ - "test3.explore(\"route_id\", style_kwds={\"weight\": 5})" + "test_df = gpd.read_parquet(\n", + " \"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_routes_AH_test.parquet\"\n", + ")" ] }, { - "cell_type": "markdown", - "id": "5e6b2d85-81f8-463f-8ce8-2461ecf3197b", + "cell_type": "code", + "execution_count": null, + "id": "62dc4282-1b72-4768-969d-48d3e8250a8a", "metadata": {}, + "outputs": [], "source": [ - "### Try dropping duplicates again" + "test_df.columns" ] }, { "cell_type": "code", "execution_count": null, - "id": "fca9ad78-4763-4410-9f09-50e806f19c07", + "id": "f83624e0-be98-44e7-9ede-895748bc0f96", "metadata": {}, "outputs": [], "source": [ - "route_dir_cols" + "op_routes_gdf = test_df.loc[test_df.organization_name.isin(org_name_lists)]" ] }, { "cell_type": "code", "execution_count": null, - "id": "1f07937e-003e-40a4-929e-5140220f15e6", + "id": "928454c5-0cf1-4887-97b0-bd1931366178", "metadata": {}, "outputs": [], "source": [ - "trips2.groupby(\n", - " route_dir_cols + [\"shape_id\", \"shape_array_key\"],\n", - " observed=True,\n", - " group_keys=False,\n", - ").agg({\"trip_instance_key\": \"count\"})" + "# Find the most recent geography for each route.\n", + "op_routes_gdf = op_routes_gdf.sort_values(by=[\"service_date\"], ascending=False)\n", + "\n", + "# Keep only the most recent row.\n", + 
"op_routes_gdf = op_routes_gdf.drop_duplicates(\n", + " subset=[\"route_long_name\", \"route_short_name\", \"route_combined_name\"]\n", + ")\n", + "\n", + "# Drop service_dates\n", + "op_routes_gdf = op_routes_gdf.drop(columns=[\"service_date\"])" ] }, { "cell_type": "code", "execution_count": null, - "id": "72e89120-297c-4f67-bb34-e5c257df4794", + "id": "fdec7a7e-8de9-4817-969d-dc867ed1605e", "metadata": {}, "outputs": [], "source": [ - "duplicates2 = (\n", - " trips2.groupby(\n", - " route_dir_cols + [\"shape_id\", \"shape_array_key\"],\n", - " observed=True,\n", - " group_keys=False,\n", - " )\n", - " .agg({\"trip_instance_key\": \"count\"})\n", - " .reset_index()\n", - ")" + "op_routes_gdf.organization_name.value_counts()" ] }, { "cell_type": "code", "execution_count": null, - "id": "dc9633e7-8c0b-4503-b046-e039d831fd11", + "id": "ec771a75-0095-4f0e-a2c0-0de5e886492d", "metadata": {}, "outputs": [], "source": [ - "duplicates2" + "op_routes_gdf.loc[op_routes_gdf.organization_name == \"City of Santa Maria\"].explore(\n", + " \"route_long_name\"\n", + ")" ] } ], diff --git a/gtfs_digest/44_debugging_dec2024.ipynb b/gtfs_digest/44_debugging_dec2024.ipynb index c5e47c500..da9d9ec16 100644 --- a/gtfs_digest/44_debugging_dec2024.ipynb +++ b/gtfs_digest/44_debugging_dec2024.ipynb @@ -12,12 +12,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "ac7ce931-86fe-418c-95d7-5d2f85000bee", "metadata": {}, "outputs": [], "source": [ + "import _section2_utils as section2\n", "import geopandas as gpd\n", + "import merge_operator_data\n", "import merge_data\n", "import numpy as np\n", "import pandas as pd\n", @@ -28,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "27d67993-3143-4a78-acbc-d36078569db8", "metadata": {}, "outputs": [], @@ -41,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "d91ded3e-4959-43d4-b90e-45df7ac60883", "metadata": {}, "outputs": 
[], @@ -51,70 +53,389 @@ }, { "cell_type": "markdown", - "id": "ea7bc262-afa1-4193-8580-831587a78c0b", + "id": "8154301a-e454-41b8-af51-6f61fb420843", "metadata": {}, "source": [ - "### Op Profiles\n", - "* The code for `gtfs_digest/merge_operator.py` stopped working because one of the column names changed. I went into `gtfs_funnel/crosswalk-gtfs_dataset_key` to fix that. \n", - "* Operator Profiles: are from September 2024 when it's Dec 2024.\n", - " * Fixed: was still referencing one of my old testing profiles." + "### Metrics for All Routes\n", + "* March 2023 has two values for some operators.\n", + "* Some operators have many rows that are repeating, causing their charts to go above 100. " + ] + }, + { + "cell_type": "markdown", + "id": "ba30a005-66c9-4eb3-9e3e-8bfe3ce1c297", + "metadata": {}, + "source": [ + "#### Look at the metrics dataframes first.\n", + "* I think `op_rt_sched_metrics` is the reason why there are duplicative values.\n", + "* Temp fix: in `section2_utils.load_operator_metrics()` drop duplicates based on `service_date`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a79db03-a8f4-4fd9-bcb7-d4dbd8c8befa", + "metadata": {}, + "outputs": [], + "source": [ + "op_sched_metrics = merge_operator_data.concatenate_schedule_operator_metrics(analysis_date_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "619c554f-32d4-4d79-b0f6-e788370a85de", + "metadata": {}, + "outputs": [], + "source": [ + "op_sched_metrics_dec = op_sched_metrics.loc[op_sched_metrics.service_date ==\n", + " '2024-12-11T00:00:00.000000000']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7968c4ab-b0a9-44ab-9332-f2cd59e6d733", + "metadata": {}, + "outputs": [], + "source": [ + "op_sched_metrics_dec.schedule_gtfs_dataset_key.value_counts().head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4321e393-990c-42cc-af36-ba92de71c80e", + "metadata": {}, + "outputs": [], + "source": [ + "op_rt_sched_metrics = merge_operator_data.concatenate_rt_vs_schedule_operator_metrics(analysis_date_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a2cea75-a1e5-422b-8cf0-669c88a42b60", + "metadata": {}, + "outputs": [], + "source": [ + "op_rt_sched_metrics_dec = op_rt_sched_metrics.loc[op_rt_sched_metrics.service_date ==\n", + " '2024-12-11T00:00:00.000000000']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "208f3c21-0b46-4216-a2e6-95fc1c7e37cd", + "metadata": {}, + "outputs": [], + "source": [ + "op_rt_sched_metrics_dec.organization_name.value_counts().head(15)" + ] + }, + { + "cell_type": "markdown", + "id": "697a0b0c-20ee-417c-95bd-abd53d356295", + "metadata": {}, + "source": [ + "* There is the rail versus the bus schedule." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f0f0033-1702-4677-9d3b-1df79fdeff24", + "metadata": {}, + "outputs": [], + "source": [ + "op_rt_sched_metrics_dec.loc[\n", + " op_rt_sched_metrics_dec.organization_name\n", + " == \"Los Angeles County Metropolitan Transportation Authority\"\n", + "].T" + ] + }, + { + "cell_type": "markdown", + "id": "7ce9ee27-a434-4e94-ad3a-3503234291e1", + "metadata": {}, + "source": [ + "#### How do you know which one is correct?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46a4a34c-9e56-4305-ae0c-685a799a3b64", + "metadata": {}, + "outputs": [], + "source": [ + "op_rt_sched_metrics_dec.loc[\n", + " op_rt_sched_metrics_dec.organization_name\n", + " == \"Transit Joint Powers Authority for Merced County\"\n", + "].T" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "d513ad73-b50c-4bd6-b00b-3df0ed166c4c", + "execution_count": null, + "id": "a866f8ee-0150-40ad-99d4-b114041dd9b5", "metadata": {}, "outputs": [], "source": [ - "import merge_operator_data" + "op_rt_sched_metrics_dec.loc[\n", + " op_rt_sched_metrics_dec.organization_name\n", + " == \"City of Santa Monica\"\n", + "].T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15db8be8-f949-4a6a-b298-62a2b162d1eb", + "metadata": {}, + "outputs": [], + "source": [ + "op_rt_sched_metrics_dec.loc[\n", + " op_rt_sched_metrics_dec.organization_name\n", + " == \"Tahoe Transportation District\"\n", + "].T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0627b80-ac4c-4407-bad3-12f94a0dac50", + "metadata": {}, + "outputs": [], + "source": [ + "op_rt_sched_metrics_dec.loc[\n", + " op_rt_sched_metrics_dec.organization_name\n", + " == \"City of Lawndale\"\n", + "].T" + ] + }, + { + "cell_type": "markdown", + "id": "3b45cc80-ac39-4d1c-ae8c-9132c6ec7619", + "metadata": {}, + "source": [ + "#### Dataframe from `merge_operator_data.concatenate_rt_vs_schedule_operator_metrics` is created 
[here at `gtfs_funnel/operator_scheduled_stats.py`](https://github.com/cal-itp/data-analyses/blob/1ba0f544a01f99966a6e210dd11666b4fe4a146e/gtfs_funnel/operator_scheduled_stats.py#L147)\n", + "* The data is grouped by `gtfs_schedule_dataset_key` and an `organization_name` can have multiple, which is why some organizations have multiple entries." + ] + }, + { + "cell_type": "markdown", + "id": "704e7d04-1dd8-4ab2-8b59-588649ca9905", + "metadata": {}, + "source": [ + "#### Other attempts to look at Operator Profiles" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, + "id": "fba829d7-7dad-4ba8-8f58-a55a290b71fb", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_profiles.parquet\"\n", + "operator_profile_df = pd.read_parquet(url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ae95240-58a7-4e6e-957d-a30400216452", + "metadata": {}, + "outputs": [], + "source": [ + "operator_profile_df.service_date.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bee1206-44f4-4304-b6c2-d248a397ec86", + "metadata": {}, + "outputs": [], + "source": [ + "march_2023 = operator_profile_df.loc[\n", + " operator_profile_df.service_date == \"2023-03-15T00:00:00.000000000\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bc3a0ce-f864-48b2-8abe-22cfef2c77a5", + "metadata": {}, + "outputs": [], + "source": [ + "dec_2024 = operator_profile_df.loc[\n", + " operator_profile_df.service_date == \"2024-12-11T00:00:00.000000000\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "142a2233-a259-4a2e-8d18-5e14ecb1bf1d", + "metadata": {}, + "outputs": [], + "source": [ + "march_2023.organization_name.value_counts().head(12)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e533058a-e30b-469d-a4b4-dd9487c476c9", + "metadata": {}, + "outputs": [], + 
"source": [ + "dec_2024.organization_name.value_counts().head(12)" + ] + }, + { + "cell_type": "markdown", + "id": "a4406245-e9a1-4a7b-9996-30aafca141ea", + "metadata": {}, + "source": [ + "#### How does Los Angeles County Metropolitan Transportation Authority have two different values?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0afa87ac-4a45-455e-8c4d-05514bf8f0b4", + "metadata": {}, + "outputs": [], + "source": [ + "dec_2024.loc[\n", + " dec_2024.organization_name\n", + " == \"Basin Transit\"\n", + "].T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "482c959c-1abe-4d24-ad75-ee2e26fe3a72", + "metadata": {}, + "outputs": [], + "source": [ + "dec_2024.loc[\n", + " dec_2024.organization_name\n", + " == \"Los Angeles County Metropolitan Transportation Authority\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "061d47a6-ffd8-4d39-8848-db4588b4004d", + "metadata": {}, + "outputs": [], + "source": [ + "dec_2024.loc[\n", + " dec_2024.organization_name == \"Transit Joint Powers Authority for Merced County\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60d6540-7290-4540-93a2-925b98fcf101", + "metadata": {}, + "outputs": [], + "source": [ + "dec_2024.loc[dec_2024.organization_name == \"City of Lawndale\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ceb37389-7ad9-4968-9f18-5f24183f07f6", + "metadata": {}, + "outputs": [], + "source": [ + "dec_2024.loc[dec_2024.organization_name == \"Palo Verde Valley Transit Agency\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4804ece-08c0-4e6b-8b6b-62216d6abffd", + "metadata": {}, + "outputs": [], + "source": [ + "dec_2024.loc[dec_2024.organization_name == \"City of San Luis Obispo\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2949c9b9-154b-44b1-a4bd-88f41ea192b4", + "metadata": {}, + "outputs": [], + "source": [ + "crosswalk_df 
= merge_operator_data.concatenate_crosswalks(analysis_date_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96da557f-a1d4-4c14-a67e-03b45466daa1", + "metadata": {}, + "outputs": [], + "source": [ + "crosswalk_df.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "367adf95-face-4c66-bec7-7cb8fd8eaaa9", + "metadata": {}, + "outputs": [], + "source": [ + "march_crosswalk_df = crosswalk_df.loc[]" + ] + }, + { + "cell_type": "markdown", + "id": "ea7bc262-afa1-4193-8580-831587a78c0b", + "metadata": {}, + "source": [ + "### Op Profiles\n", + "* The code for `gtfs_digest/merge_operator.py` stopped working because one of the column names changed. I went into `crosswalk_gtfs_dataset_key_to_organization` to fix that. \n", + "* Operator Profiles: are from September 2024 when it's Dec 2024.\n", + " * Fixed: was still referencing one of my old testing profiles." + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "cf9b7d10-625b-4c76-bca9-8116aa77c93a", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'gs://calitp-analytics-data/data-analyses/gtfs_schedule/'" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "SCHED_GCS" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "0cdda265-423c-430e-8685-04dc7cb356cd", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'crosswalk/gtfs_key_organization'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "f\"{GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk}\"" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "3a880fdb-2730-4978-ad7d-e557698d8e70", "metadata": {}, "outputs": [], @@ -124,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "b843e240-07ee-4f1a-b29a-3f97e9be8b0e", 
"metadata": {}, "outputs": [], @@ -134,7 +455,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "2809d41a-1911-45e4-a78a-4e72e73d1f9a", "metadata": {}, "outputs": [], @@ -144,7 +465,41 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, + "id": "0cddd845-f206-41b5-97dd-fff179a211df", + "metadata": {}, + "outputs": [], + "source": [ + "dec_crosswalk_df.organization_name.value_counts().head(25)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bab1c6d1-2421-4f84-9eaa-42bf6b245611", + "metadata": {}, + "outputs": [], + "source": [ + "dec_crosswalk_df.loc[\n", + " dec_crosswalk_df.organization_name == \"City of South San Francisco\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e18e82dd-5a87-4f49-a374-4bf48c4527ef", + "metadata": {}, + "outputs": [], + "source": [ + "dec_crosswalk_df.loc[\n", + " dec_crosswalk_df.organization_name == \"City and County of San Francisco\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "f08b5be9-d49d-44b9-8b6a-a74d10682aa7", "metadata": {}, "outputs": [], @@ -154,7 +509,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "2bca5311-669c-4d3e-bc41-d3ebc7a69c3c", "metadata": {}, "outputs": [], @@ -166,7 +521,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "01e983f2-0929-42b8-9026-4509cf033aeb", "metadata": {}, "outputs": [], @@ -178,70 +533,37 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "3450162d-11a9-47ac-8d55-a883f51b023a", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "set()" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "nov_cols - sept_cols" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "809724b7-aea1-4862-b828-5f4a9b5ec6f6", 
"metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "set()" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "sept_cols - dec_cols" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "5e59113f-4dc5-4f57-9ea5-26c09199d706", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "set()" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dec_cols - sept_cols" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "57f2bca0-1f1b-4940-875b-958783bd941f", "metadata": {}, "outputs": [], @@ -253,62 +575,17 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "5e651996-43f3-4c02-9631-0f8e44537961", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
primary_uza_codeprimary_uza_name
1NoneOxnard--San Buenaventura (Ventura), CA
\n", - "
" - ], - "text/plain": [ - " primary_uza_code primary_uza_name\n", - "1 None Oxnard--San Buenaventura (Ventura), CA" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "ventura_dec[[\"primary_uza_code\", \"primary_uza_name\"]].drop_duplicates()" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "9de016d3-9f85-48b4-a763-5a5b89dd3ad3", "metadata": {}, "outputs": [], @@ -320,62 +597,17 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "4bfb30ba-cc1f-41d8-afe8-6fb4ccfc40bf", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
primary_uza_codeprimary_uza_name
1NoneOxnard--San Buenaventura (Ventura), CA
\n", - "
" - ], - "text/plain": [ - " primary_uza_code primary_uza_name\n", - "1 None Oxnard--San Buenaventura (Ventura), CA" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ventura_sept[['primary_uza_code', 'primary_uza_name']].drop_duplicates()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, + "outputs": [], + "source": [ + "ventura_sept[[\"primary_uza_code\", \"primary_uza_name\"]].drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "81d75c3c-07d4-4997-9774-b7c6e86d4d7a", "metadata": {}, "outputs": [], @@ -385,49 +617,27 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "cb934b4e-63bc-4015-9363-84152be93c5e", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['2023-03-15T00:00:00.000000000', '2023-04-12T00:00:00.000000000',\n", - " '2023-05-17T00:00:00.000000000', '2023-06-14T00:00:00.000000000',\n", - " '2023-07-12T00:00:00.000000000', '2023-08-15T00:00:00.000000000',\n", - " '2023-09-13T00:00:00.000000000', '2023-10-11T00:00:00.000000000',\n", - " '2023-11-15T00:00:00.000000000', '2023-12-13T00:00:00.000000000',\n", - " '2024-01-17T00:00:00.000000000', '2024-02-14T00:00:00.000000000',\n", - " '2024-03-13T00:00:00.000000000', '2024-04-17T00:00:00.000000000',\n", - " '2024-05-22T00:00:00.000000000', '2024-06-12T00:00:00.000000000',\n", - " '2024-07-17T00:00:00.000000000', '2024-08-14T00:00:00.000000000',\n", - " '2024-09-18T00:00:00.000000000', '2024-10-16T00:00:00.000000000',\n", - " '2024-11-13T00:00:00.000000000', '2024-12-11T00:00:00.000000000'],\n", - " dtype='datetime64[ns]')" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "crosswalk_df.service_date.unique()" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "813b89c6-9e26-439b-9bbd-841bd1b53e28", "metadata": {}, "outputs": [], 
"source": [ - "import _section1_utils " + "import _section1_utils" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "a34c2067-ec22-4576-8ae7-bf28d8b1f433", "metadata": {}, "outputs": [], @@ -437,7 +647,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "2beb41b8-7ce5-4022-9aa3-8ab268ff3102", "metadata": {}, "outputs": [], @@ -447,134 +657,10 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "5f2a10ad-32ad-4f59-968e-5d74003f2aea", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
schedule_gtfs_dataset_keyVP per Minute (All Routes)Spatial Accuracy (All Routes)Date# Routes# Trips# Shapes# Stops# ArrivalsOperator Service MilesAvg Arrivals per Stop# Downtown Local Route Types# Local Route Types# Coverage Route Types# Rapid Route Types# Express Route Types# Rail Route TypesTransit OperatorOrganization IDOrganizationDistrictcounties_servedservice_area_sq_mileshq_cityservice_area_poporganization_typeprimary_uza_namereporter_type
169809d3f8121513057bc5cb8de7b54ce21.9489.902024-12-1134.001036.0070.00919.0023141.00467.6025.1827.0018.0039.0030.001.000.00Monterey Salinas SchedulereceZJ9sEnP9vy3g0Monterey-Salinas Transit05 - San Luis ObispoMonterey159Monterey437325Independent Public Agency or Authority of Transit ServiceSeaside--Monterey--Pacific Grove, CAFull Reporter
\n", - "
" - ], - "text/plain": [ - " schedule_gtfs_dataset_key VP per Minute (All Routes) \\\n", - "16 9809d3f8121513057bc5cb8de7b54ce2 1.94 \n", - "\n", - " Spatial Accuracy (All Routes) Date # Routes # Trips # Shapes \\\n", - "16 89.90 2024-12-11 34.00 1036.00 70.00 \n", - "\n", - " # Stops # Arrivals Operator Service Miles Avg Arrivals per Stop \\\n", - "16 919.00 23141.00 467.60 25.18 \n", - "\n", - " # Downtown Local Route Types # Local Route Types # Coverage Route Types \\\n", - "16 27.00 18.00 39.00 \n", - "\n", - " # Rapid Route Types # Express Route Types # Rail Route Types \\\n", - "16 30.00 1.00 0.00 \n", - "\n", - " Transit Operator Organization ID Organization \\\n", - "16 Monterey Salinas Schedule receZJ9sEnP9vy3g0 Monterey-Salinas Transit \n", - "\n", - " District counties_served service_area_sq_miles hq_city \\\n", - "16 05 - San Luis Obispo Monterey 159 Monterey \n", - "\n", - " service_area_pop \\\n", - "16 437325 \n", - "\n", - " organization_type \\\n", - "16 Independent Public Agency or Authority of Transit Service \n", - "\n", - " primary_uza_name reporter_type \n", - "16 Seaside--Monterey--Pacific Grove, CA Full Reporter " - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "ntd_profile" ] @@ -589,7 +675,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "b91470ac-fa05-4083-8352-f5adf73712ed", "metadata": {}, "outputs": [], @@ -599,339 +685,88 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "id": "7c847cb9-4d2e-483d-a8c2-af3be35af44c", "metadata": {}, "outputs": [], "source": [ "# Keep only rows that are found in both schedule and real time data\n", - "schd_vp_df = (pd.read_parquet(schd_vp_url, \n", - " filters=[[(\"organization_name\", \"==\", organization_name),\n", - " (\"sched_rt_category\", \"==\", \"schedule_and_vp\")]])\n", - " )" + "schd_vp_df = pd.read_parquet(\n", + " schd_vp_url,\n", + " 
filters=[\n", + " [\n", + " (\"organization_name\", \"==\", organization_name),\n", + " (\"sched_rt_category\", \"==\", \"schedule_and_vp\"),\n", + " ]\n", + " ],\n", + ")" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": null, "id": "a5ff575d-9722-49c2-b27f-a7fa8488f9b9", "metadata": {}, "outputs": [], "source": [ - "schd_vp_df_gtfskeys = schd_vp_df[[\"schedule_gtfs_dataset_key\",\"service_date\"]].drop_duplicates()" + "schd_vp_df_gtfskeys = schd_vp_df[\n", + " [\"schedule_gtfs_dataset_key\", \"service_date\"]\n", + "].drop_duplicates()" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "d10abab2-4994-42b6-a745-d9bf792e8e8b", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
schedule_gtfs_dataset_keydirection_idtime_periodavg_scheduled_service_minutesavg_stop_milesn_scheduled_tripsfrequencyis_expressis_rapidis_railis_coverageis_downtown_localis_localservice_datetypologyminutes_atleast1_vpminutes_atleast2_vptotal_rt_service_minutestotal_scheduled_service_minutestotal_vpvp_in_shapeis_earlyis_ontimeis_laten_vp_tripsvp_per_minutepct_in_shapepct_rt_journey_atleast1_vppct_rt_journey_atleast2_vppct_sched_journey_atleast1_vppct_sched_journey_atleast2_vprt_sched_journey_ratioavg_rt_service_minutesschedule_source_record_id_xsched_rt_categoryspeed_mphnameroute_long_nameroute_short_nameroute_combined_nameroute_idschedule_source_record_id_ybase64_urlorganization_source_record_idorganization_namecaltrans_districtroute_primary_direction
16256288d9aa978e4ca97e5ba1dbbc20f3fc190.00all_day22.000.13120.500.000.000.001.001.000.002023-03-15downtown_local241210259.08264.00464427543121.790.920.930.810.910.800.9821.59Noneschedule_and_vp13.38Monterey Salinas ScheduleMonterey - PG via Asilomar11 Monterey - PG via Asilomar001recysP9m9kjCJwHZeaHR0cHM6Ly93d3cubXN0Lm9yZy9nb29nbGUvZ29vZ2xlX3RyYW5zaXQuemlwreceZJ9sEnP9vy3g0Monterey-Salinas Transit05 - San Luis ObispoEastbound
16256388d9aa978e4ca97e5ba1dbbc20f3fc190.00all_day22.000.13120.500.000.000.001.001.000.002023-03-15downtown_local241210259.08264.00464427543121.790.920.930.810.910.800.9821.59Noneschedule_and_vp13.38Monterey Salinas ScheduleMonterey - PG via Asilomar11 Monterey - PG via Asilomar001recysP9m9kjCJwHZeaHR0cHM6Ly93d3cubXN0Lm9yZy9nb29nbGUvZ29vZ2xlX3RyYW5zaXQuemlwreceZJ9sEnP9vy3g0Monterey-Salinas Transit05 - San Luis ObispoEastbound
\n", - "
" - ], - "text/plain": [ - " schedule_gtfs_dataset_key direction_id time_period \\\n", - "162562 88d9aa978e4ca97e5ba1dbbc20f3fc19 0.00 all_day \n", - "162563 88d9aa978e4ca97e5ba1dbbc20f3fc19 0.00 all_day \n", - "\n", - " avg_scheduled_service_minutes avg_stop_miles n_scheduled_trips \\\n", - "162562 22.00 0.13 12 \n", - "162563 22.00 0.13 12 \n", - "\n", - " frequency is_express is_rapid is_rail is_coverage \\\n", - "162562 0.50 0.00 0.00 0.00 1.00 \n", - "162563 0.50 0.00 0.00 0.00 1.00 \n", - "\n", - " is_downtown_local is_local service_date typology \\\n", - "162562 1.00 0.00 2023-03-15 downtown_local \n", - "162563 1.00 0.00 2023-03-15 downtown_local \n", - "\n", - " minutes_atleast1_vp minutes_atleast2_vp total_rt_service_minutes \\\n", - "162562 241 210 259.08 \n", - "162563 241 210 259.08 \n", - "\n", - " total_scheduled_service_minutes total_vp vp_in_shape is_early \\\n", - "162562 264.00 464 427 5 \n", - "162563 264.00 464 427 5 \n", - "\n", - " is_ontime is_late n_vp_trips vp_per_minute pct_in_shape \\\n", - "162562 4 3 12 1.79 0.92 \n", - "162563 4 3 12 1.79 0.92 \n", - "\n", - " pct_rt_journey_atleast1_vp pct_rt_journey_atleast2_vp \\\n", - "162562 0.93 0.81 \n", - "162563 0.93 0.81 \n", - "\n", - " pct_sched_journey_atleast1_vp pct_sched_journey_atleast2_vp \\\n", - "162562 0.91 0.80 \n", - "162563 0.91 0.80 \n", - "\n", - " rt_sched_journey_ratio avg_rt_service_minutes \\\n", - "162562 0.98 21.59 \n", - "162563 0.98 21.59 \n", - "\n", - " schedule_source_record_id_x sched_rt_category speed_mph \\\n", - "162562 None schedule_and_vp 13.38 \n", - "162563 None schedule_and_vp 13.38 \n", - "\n", - " name route_long_name \\\n", - "162562 Monterey Salinas Schedule Monterey - PG via Asilomar \n", - "162563 Monterey Salinas Schedule Monterey - PG via Asilomar \n", - "\n", - " route_short_name route_combined_name route_id \\\n", - "162562 1 1 Monterey - PG via Asilomar 001 \n", - "162563 1 1 Monterey - PG via Asilomar 001 \n", - "\n", - " 
schedule_source_record_id_y \\\n", - "162562 recysP9m9kjCJwHZe \n", - "162563 recysP9m9kjCJwHZe \n", - "\n", - " base64_url \\\n", - "162562 aHR0cHM6Ly93d3cubXN0Lm9yZy9nb29nbGUvZ29vZ2xlX3RyYW5zaXQuemlw \n", - "162563 aHR0cHM6Ly93d3cubXN0Lm9yZy9nb29nbGUvZ29vZ2xlX3RyYW5zaXQuemlw \n", - "\n", - " organization_source_record_id organization_name \\\n", - "162562 receZJ9sEnP9vy3g0 Monterey-Salinas Transit \n", - "162563 receZJ9sEnP9vy3g0 Monterey-Salinas Transit \n", - "\n", - " caltrans_district route_primary_direction \n", - "162562 05 - San Luis Obispo Eastbound \n", - "162563 05 - San Luis Obispo Eastbound " - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "schd_vp_df.head(2)" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": null, "id": "13dfc48a-6f10-4f7e-9307-686f64c8fcfc", "metadata": {}, "outputs": [], "source": [ "schedule_by_route = merge_data.concatenate_schedule_by_route_direction(\n", - " analysis_date_list\n", - " )" + " analysis_date_list\n", + ")" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": null, "id": "2cce00c2-a8e4-4c11-b157-f8e98b9018d3", "metadata": {}, "outputs": [], "source": [ - "schedule_by_route_gtfskeys = schedule_by_route[[\"schedule_gtfs_dataset_key\",\"service_date\"]].drop_duplicates()" + "schedule_by_route_gtfskeys = schedule_by_route[\n", + " [\"schedule_gtfs_dataset_key\", \"service_date\"]\n", + "].drop_duplicates()" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": null, "id": "be70ebf2-b2f5-4070-a7ee-954952d9674a", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "_merge \n", - "right_only 1675\n", - "both 1593\n", - "left_only 55\n", - "dtype: int64" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "pd.merge(df_avg_speeds_gtfskeys, schedule_by_route_gtfskeys, on = 
[\"schedule_gtfs_dataset_key\",\"service_date\"],\n", - " how = \"outer\", indicator = True)[[\"_merge\"]].value_counts()" + "pd.merge(\n", + " df_avg_speeds_gtfskeys,\n", + " schedule_by_route_gtfskeys,\n", + " on=[\"schedule_gtfs_dataset_key\", \"service_date\"],\n", + " how=\"outer\",\n", + " indicator=True,\n", + ")[[\"_merge\"]].value_counts()" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "dbd0f246-4616-4da5-9c51-b53abbcc8c9a", "metadata": {}, "outputs": [], @@ -941,7 +776,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "40700f2b-2ccd-46e2-982b-e4306d734654", "metadata": {}, "outputs": [], @@ -961,192 +796,60 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": null, "id": "16d50718-c09a-4e4f-bab7-90c7b6ea3f16", "metadata": {}, "outputs": [], "source": [ - "df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(\n", - " analysis_date_list\n", - " )" + "df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": null, "id": "9cff0c4b-50bf-4e5f-8ad5-8eab93b6431a", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['2023-04-12T00:00:00.000000000', '2023-05-17T00:00:00.000000000',\n", - " '2023-06-14T00:00:00.000000000', '2023-07-12T00:00:00.000000000',\n", - " '2023-08-15T00:00:00.000000000', '2023-09-13T00:00:00.000000000',\n", - " '2023-10-11T00:00:00.000000000', '2023-11-15T00:00:00.000000000',\n", - " '2023-12-13T00:00:00.000000000', '2024-01-17T00:00:00.000000000',\n", - " '2024-02-14T00:00:00.000000000', '2024-03-13T00:00:00.000000000',\n", - " '2024-04-17T00:00:00.000000000', '2024-05-22T00:00:00.000000000',\n", - " '2024-06-12T00:00:00.000000000', '2024-07-17T00:00:00.000000000',\n", - " '2024-08-14T00:00:00.000000000', '2024-09-18T00:00:00.000000000',\n", - " '2024-10-16T00:00:00.000000000', 
'2024-11-13T00:00:00.000000000',\n", - " '2024-12-11T00:00:00.000000000', '2023-03-15T00:00:00.000000000'],\n", - " dtype='datetime64[ns]')" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df_avg_speeds.service_date.unique()" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": null, "id": "9268e7bd-5f99-46de-975b-327fe7e72c9b", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
schedule_gtfs_dataset_keyroute_iddirection_idtime_periodspeed_mphservice_date
0015d67d5b75b5cf2b710bbadadfb75f5170.00all_day16.632023-04-12
1015d67d5b75b5cf2b710bbadadfb75f5170.00all_day15.722023-05-17
2015d67d5b75b5cf2b710bbadadfb75f5170.00all_day15.172023-06-14
3015d67d5b75b5cf2b710bbadadfb75f5170.00all_day15.412023-07-12
4015d67d5b75b5cf2b710bbadadfb75f5170.00all_day15.062023-08-15
\n", - "
" - ], - "text/plain": [ - " schedule_gtfs_dataset_key route_id direction_id time_period \\\n", - "0 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n", - "1 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n", - "2 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n", - "3 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n", - "4 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n", - "\n", - " speed_mph service_date \n", - "0 16.63 2023-04-12 \n", - "1 15.72 2023-05-17 \n", - "2 15.17 2023-06-14 \n", - "3 15.41 2023-07-12 \n", - "4 15.06 2023-08-15 " - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df_avg_speeds.head()" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": null, "id": "670ec966-66d9-4405-9887-03cec3340e45", "metadata": {}, "outputs": [], "source": [ - "df_avg_speeds_gtfskeys = df_avg_speeds[[\"schedule_gtfs_dataset_key\",\"service_date\"]].drop_duplicates()" + "df_avg_speeds_gtfskeys = df_avg_speeds[\n", + " [\"schedule_gtfs_dataset_key\", \"service_date\"]\n", + "].drop_duplicates()" ] }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "id": "6187a364-c90c-462c-b1cd-e3171a5651f4", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "_merge \n", - "left_only 1626\n", - "both 22\n", - "right_only 0\n", - "dtype: int64" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.merge(df_avg_speeds_gtfskeys, schd_vp_df_gtfskeys, on = [\"schedule_gtfs_dataset_key\",\"service_date\"],\n", - " how = \"outer\", indicator = True)[[\"_merge\"]].value_counts()" + "outputs": [], + "source": [ + "pd.merge(\n", + " df_avg_speeds_gtfskeys,\n", + " schd_vp_df_gtfskeys,\n", + " on=[\"schedule_gtfs_dataset_key\", \"service_date\"],\n", + " how=\"outer\",\n", + " indicator=True,\n", + ")[[\"_merge\"]].value_counts()" ] } ], diff --git 
a/gtfs_digest/45_missing_routes2.ipynb b/gtfs_digest/45_missing_routes2.ipynb new file mode 100644 index 000000000..9488b17e9 --- /dev/null +++ b/gtfs_digest/45_missing_routes2.ipynb @@ -0,0 +1,1001 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "df9388c2-122f-470c-8b96-4f7cbffea26f", + "metadata": {}, + "source": [ + "## Finding Missing Routes\n", + "* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capital Corridor doesn't have any rail routes. \n", + "* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)\n", + "* Rerun all the scripts that create the underlying dataframes for November date (`df_sched`,`df_avg_speeds`,`df_rt_sched`) and merge them using `gtfs_digest/merge_data.merge_data_sources_by_route_direction()`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d898cfa3-466e-4ca2-8484-e381b6fc4ce1", + "metadata": {}, + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "invalid syntax (_section2_utils.py, line 896)", + "output_type": "error", + "traceback": [ + "Traceback \u001b[0;36m(most recent call last)\u001b[0m:\n", + "\u001b[0m File \u001b[1;32m/opt/conda/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3508\u001b[0m in \u001b[1;35mrun_code\u001b[0m\n exec(code_obj, self.user_global_ns, self.user_ns)\u001b[0m\n", + "\u001b[0;36m Cell \u001b[0;32mIn[1], line 1\u001b[0;36m\n\u001b[0;31m import _section2_utils\u001b[0;36m\n", + "\u001b[0;36m File \u001b[0;32m~/data-analyses/gtfs_digest/_section2_utils.py:896\u001b[0;36m\u001b[0m\n\u001b[0;31m y_col = \"Speed (MPH)\",\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" + ] + } + ], + "source": [ + "import _section2_utils\n", + "import geopandas as gpd\n", + "import merge_data\n", + "import numpy as np\n", + "import pandas as pd\n", + "from segment_speed_utils import gtfs_schedule_wrangling\n", + "from update_vars import 
GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a5a0cb2-d314-47aa-886c-5ebdf143905b", + "metadata": {}, + "outputs": [], + "source": [ + "pd.options.display.max_columns = 100\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "pd.set_option(\"display.max_rows\", None)\n", + "pd.set_option(\"display.max_colwidth\", None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4d1d951-101a-4bed-8774-d2c3ff1605e9", + "metadata": {}, + "outputs": [], + "source": [ + "org_name_lists = [\"Capitol Corridor Joint Powers Authority\", \"City of Santa Maria\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "efde4bc7-fd20-4c73-9ec4-6982d4643e39", + "metadata": {}, + "outputs": [], + "source": [ + "analysis_date_list = [\"2024-11-13\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "93682bff-3d64-4d60-83a6-98234cc2bbdd", + "metadata": {}, + "outputs": [], + "source": [ + "one_analysis_date = \"2024-11-13\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "69a13fbc-7af0-408d-b4f1-1a78a35ffa86", + "metadata": {}, + "outputs": [], + "source": [ + "schd_keys = [\n", + " \"5a8721fe96786fcd25fba1f8a0ee6358\",\n", + " \"73105f2d1cabc8170ab066d96863c5d5\",\n", + " \"f5a749dd65924e025b1293c58f95f8d6\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "b3197201-0f2c-471e-bc84-fb518e9a2c93", + "metadata": {}, + "source": [ + "### Run the scripts that create the following dataframes for November.\n", + "* `df_sched`: `gtfs_funnel/schedule_stats_by_route_direction`\n", + "* `df_rt_sched`: `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`\n", + "* `df_avg_speeds`: `rt_segment_speeds/script/average_summary_speed`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c1bfd907-5907-4f08-a841-27ff992b10fb", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 
'RT_SCHED_GCS' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# df_sched\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mRT_SCHED_GCS\u001b[49m\n", + "\u001b[0;31mNameError\u001b[0m: name 'RT_SCHED_GCS' is not defined" + ] + } + ], + "source": [ + "# df_sched\n", + "RT_SCHED_GCS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d377f69c-b363-4b1d-889b-941a88eede10", + "metadata": {}, + "outputs": [], + "source": [ + "ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae25396c-3eb6-4c2a-b036-ffda5c481b5a", + "metadata": {}, + "outputs": [], + "source": [ + "ROUTE_DIR_EXPORT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e3a136e-f4a0-4943-b603-0435a759bfbe", + "metadata": {}, + "outputs": [], + "source": [ + "df_schedule = pd.read_parquet(\n", + " \"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-11-13.parquet\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "266efbce-6c7a-4cc4-84ee-f82de18cd0c6", + "metadata": {}, + "outputs": [], + "source": [ + "df_schedule.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7dd1435-60ee-4894-bcd1-69b1ad8c1c41", + "metadata": {}, + "outputs": [], + "source": [ + "filtered_df_schedule = df_schedule.loc[\n", + " df_schedule.schedule_gtfs_dataset_key.isin(schd_keys)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "440e76d9-43f2-495b-9f97-bc0f77c44435", + "metadata": {}, + "outputs": [], + "source": [ + "filtered_df_schedule.route_id.unique()" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "c7b96e3b-d0ef-4c56-b163-292745a9e7e5", + "metadata": {}, + "outputs": [], + "source": [ + "# df_avg_speeds\n", + "segment_type = \"rt_stop_times\"\n", + "\n", + "dict_inputs = GTFS_DATA_DICT[segment_type]\n", + "ROUTE_DIR_FILE = dict_inputs[\"route_dir_single_summary\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9b29c6e-6255-43ef-9409-94c152acfa93", + "metadata": {}, + "outputs": [], + "source": [ + "SEGMENT_GCS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca8c3fc6-a08f-4c49-b020-266c50b9a49e", + "metadata": {}, + "outputs": [], + "source": [ + "ROUTE_DIR_FILE" + ] + }, + { + "cell_type": "markdown", + "id": "3e58057d-4851-4da5-b30b-873031266279", + "metadata": {}, + "source": [ + "#### Average speeds is missing a lot of stuff" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b012e83-e2c1-4198-a35f-8147a20dc6c6", + "metadata": {}, + "outputs": [], + "source": [ + "df_avg_speeds = pd.read_parquet(\n", + " \"gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_route_dir_2024-11-13.parquet\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8741de0f-531d-4a47-9c7d-fae01fe91c1f", + "metadata": {}, + "outputs": [], + "source": [ + "filtered_df_avg_speeds = df_avg_speeds.loc[\n", + " df_avg_speeds.schedule_gtfs_dataset_key.isin(schd_keys)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fb0dfbe-8574-4427-87a5-b514aa77c753", + "metadata": {}, + "outputs": [], + "source": [ + "filtered_df_avg_speeds.route_id.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdd75c43-d2e7-49bd-972e-4a28c05feedb", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "df_avg_speeds.loc[\n", + " df_avg_speeds.organization_name == \"Marin County Transit District\"\n", + 
"].drop(columns=[\"geometry\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2defe91-ac5c-44dd-8ab2-44ae06a22a61", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# filtered_df_avg_speeds[[ 'route_id', 'direction_id', 'time_period','speed_mph']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b31aa6db-06cc-4af2-b27e-ad18b58f45bb", + "metadata": {}, + "outputs": [], + "source": [ + "# df_rt_sched\n", + "RT_SCHED_GCS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4aaed52-8c8d-4368-892c-a4a6ffbc2a3a", + "metadata": {}, + "outputs": [], + "source": [ + "GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26a12c09-82a5-4199-801f-e374bb20b361", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched = pd.read_parquet(\n", + " \"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/vp_route_dir/route_direction_metrics_2024-11-13.parquet\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2dbe65fb-f60d-431e-b8f4-143db6cfa5da", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched.columns" + ] + }, + { + "cell_type": "markdown", + "id": "a8209979-ea65-44e6-92bc-94c1d43e4e57", + "metadata": {}, + "source": [ + "### Open up original file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4df5906-9ef9-49e6-875b-95b36afa4063", + "metadata": {}, + "outputs": [], + "source": [ + "schd_vp_url = f\"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "185e680c-6027-47fa-b488-cb0b20e27a71", + "metadata": {}, + "outputs": [], + "source": [ + "schd_vp_url" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "534a48bf-de90-460c-9515-3d0e5519274d", + "metadata": {}, + "outputs": [], 
+ "source": [ + "schd_vp_df = pd.read_parquet(schd_vp_url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20bf983a-bcc5-49ff-be63-0ef4207b801f", + "metadata": {}, + "outputs": [], + "source": [ + "schd_vp_df2 = schd_vp_df.loc[schd_vp_df.organization_name.isin(org_name_lists)]" + ] + }, + { + "cell_type": "markdown", + "id": "3becbc8b-4098-4b16-9bae-6aa50bd658f7", + "metadata": {}, + "source": [ + "### Merge all the files based on `gtfs_digest/merge_data`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d6a52e5-1981-4eca-b166-76abe1420dfc", + "metadata": {}, + "outputs": [], + "source": [ + "service_date_datetime = pd.to_datetime(\"2024-11-13T00:00:00.000000000\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87556b98-3ac3-46ee-b9ef-eb0a10f29dab", + "metadata": {}, + "outputs": [], + "source": [ + "df_schedule[\"service_date\"] = service_date_datetime" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34d44c3d-ee9e-4fc5-b16c-7e2448b81d40", + "metadata": {}, + "outputs": [], + "source": [ + "df_rt_sched[\"service_date\"] = service_date_datetime" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a67a9a2a-9045-44de-84f5-7d5c1a678dfc", + "metadata": {}, + "outputs": [], + "source": [ + "df_avg_speeds[\"service_date\"] = service_date_datetime" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "305c73b1-41f3-4237-ac0c-156097237e42", + "metadata": {}, + "outputs": [], + "source": [ + "df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "314c4209-f8f9-4c14-ba02-5b77705721a8", + "metadata": {}, + "outputs": [], + "source": [ + "route_time_cols = [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"route_id\",\n", + " \"direction_id\",\n", + " \"time_period\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "c4506f3c-3ed7-4648-a6bf-9faaa15cfcf2", + "metadata": {}, + "outputs": [], + "source": [ + "primary_typology = merge_data.set_primary_typology(df_schedule)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09fc686a-6773-4bab-a0f3-c0342ab382db", + "metadata": {}, + "outputs": [], + "source": [ + "df_schedule2 = pd.merge(df_schedule, primary_typology, on=route_time_cols, how=\"left\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "185d615b-cf0d-45ae-ab89-7e5ab1bab7c8", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d35b1214-e648-4d58-80aa-baca192bcbf4", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.merge(\n", + " df_schedule2,\n", + " df_rt_sched,\n", + " on=route_time_cols + [\"service_date\"],\n", + " how=\"outer\",\n", + " indicator=\"sched_rt_category\",\n", + ").merge(\n", + " df_avg_speeds,\n", + " on=route_time_cols + [\"service_date\"],\n", + " how=\"outer\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a31a83c-c7f0-4d9d-a30a-12672bd5de54", + "metadata": {}, + "outputs": [], + "source": [ + "df = (\n", + " df.assign(\n", + " sched_rt_category=df.sched_rt_category.map(\n", + " gtfs_schedule_wrangling.sched_rt_category_dict\n", + " )\n", + " )\n", + " .pipe(\n", + " merge_data.merge_in_standardized_route_names,\n", + " )\n", + " .merge(\n", + " df_crosswalk,\n", + " on=[\"schedule_gtfs_dataset_key\", \"name\", \"service_date\"],\n", + " how=\"left\",\n", + " )\n", + " .pipe(\n", + " # Find the most common cardinal direction\n", + " gtfs_schedule_wrangling.top_cardinal_direction\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a51500f-6155-4c50-b484-3f05767d47ca", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.rename(columns={\"n_trips\": \"n_scheduled_trips\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "67050044-6890-4230-bad8-e7eead2e890c", + "metadata": {}, + "outputs": [], + "source": [ + "integrify = [\n", + " \"n_scheduled_trips\",\n", + " \"n_vp_trips\",\n", + " \"minutes_atleast1_vp\",\n", + " \"minutes_atleast2_vp\",\n", + " \"total_vp\",\n", + " \"vp_in_shape\",\n", + " \"is_early\",\n", + " \"is_ontime\",\n", + " \"is_late\",\n", + "]\n", + "\n", + "df[integrify] = df[integrify].fillna(0).astype(\"int\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88bfc756-435d-48a9-8ac0-81c49ef96933", + "metadata": {}, + "outputs": [], + "source": [ + "repeated_y_cols = list([col for col in df.columns if \"_y\" in col.lower()])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47042af9-1976-4498-88d3-7211fd1fbd05", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.drop(columns=repeated_y_cols)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70781682-a47c-448d-b634-047fcb60abf9", + "metadata": {}, + "outputs": [], + "source": [ + "repeated_x_cols = list([col for col in df.columns if \"_x\" in col.lower()])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3fd6d14-9f4e-4954-b32e-bb81382b7299", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.drop(columns=repeated_x_cols)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f1ab64c-1da2-4898-a4d5-6534f1580a0e", + "metadata": {}, + "outputs": [], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbb26e1d-3325-4b90-865a-668d777aaa49", + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)].route_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f461c76c-b5fa-453c-ae1a-c90ec5e6a437", + "metadata": {}, + "outputs": [], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"2b4c1331-43e7-4a72-8bef-760114faf9a7", + "metadata": {}, + "outputs": [], + "source": [ + "df.sched_rt_category.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36d41d75-3390-4a32-a8c6-6da23d675862", + "metadata": {}, + "outputs": [], + "source": [ + "filtered_df = df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f16bbe7a-c1db-41ac-8d52-81627218da4c", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "filtered_df[\n", + " [\n", + " \"organization_name\",\n", + " \"route_combined_name\",\n", + " \"sched_rt_category\",\n", + " \"speed_mph\",\n", + " \"frequency\",\n", + " \"direction_id\",\n", + " ]\n", + "].drop_duplicates()" + ] + }, + { + "cell_type": "markdown", + "id": "df87ba57-ed93-40b0-9177-512d89d7995e", + "metadata": {}, + "source": [ + "### Save this temporarily " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d669b728-d3cd-4e74-941b-3f82e87d071c", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_parquet(\n", + " \"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics_AH_TESTING.parquet\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2cd6bf6f-24f4-4b5e-aa83-2dc6f7266304", + "metadata": {}, + "source": [ + "### Check for speeds again" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5606dd1-caf2-48ba-935d-6ffa24a76b1a", + "metadata": {}, + "outputs": [], + "source": [ + "organization_name = \"Marin County Transit District\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9808f1d8-c2f7-4276-a46b-c495dea1fcde", + "metadata": {}, + "outputs": [], + "source": [ + "y_col = \"Speed (MPH)\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "270007af-fc3c-4bba-af03-a754f9f972ef", + "metadata": {}, + "outputs": [], + "source": [ + "marin_county = 
_section2_utils.load_schedule_vp_metrics(organization_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da4738aa-1dd7-42ba-b51e-2fcb94031d37", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "marin_county[\n", + " [\"GTFS Availability\", \"Route\", \"Route ID\", \"Direction\", \"Period\", \"Speed (MPH)\"]\n", + "].sort_values(by=\"Route ID\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40813f67-aad6-4a17-b6fb-aac0c543457e", + "metadata": {}, + "outputs": [], + "source": [ + "marin_county_route_29 = marin_county.loc[\n", + " marin_county.Route == \"29 Downtown San Rafael - E. Corte Madera\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbd8bccd-4815-43a5-a8ed-1b8ca1a89501", + "metadata": {}, + "outputs": [], + "source": [ + "import altair as alt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "229abd52-438d-4ceb-bb88-e07ca6eb00d2", + "metadata": {}, + "outputs": [], + "source": [ + "routes_list = marin_county[\"Route\"].unique().tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1e259f9-1a20-41d4-9e9a-408f055495e3", + "metadata": {}, + "outputs": [], + "source": [ + "_section2_utils.base_facet_line(marin_county_route_29, y_col, \"Testing\", \"Testing\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "125dad9f-289d-4330-add4-6be8a9e48694", + "metadata": {}, + "outputs": [], + "source": [ + "max_y = _section2_utils.set_y_axis(marin_county_route_29, y_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b989d1c1-36cc-4227-a51d-f6052f6da959", + "metadata": {}, + "outputs": [], + "source": [ + "max_y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ef27ac7-4723-43ad-8024-5ea120cc72c6", + "metadata": {}, + "outputs": [], + "source": [ + "marin_county_route_29 = 
_section2_utils.clean_data_charts(marin_county_route_29, y_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f7929cc-a6a4-47e9-b949-6c8473aca2a4", + "metadata": {}, + "outputs": [], + "source": [ + "marin_county_route_29[[\"dir_0_1\", \"Direction\", \"Period\", \"Speed (MPH)\", \"Date\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f267e527-56da-403e-a3ed-ba191ae62760", + "metadata": {}, + "outputs": [], + "source": [ + "import _report_utils" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55256f17-5726-4ec6-ba80-955e79fa14be", + "metadata": {}, + "outputs": [], + "source": [ + "import yaml\n", + "\n", + "with open(\"color_palettes.yml\") as f:\n", + " color_dict = yaml.safe_load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bd0dc9f-063b-4756-8106-5e3a9af90068", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"readable.yml\") as f:\n", + " readable_dict = yaml.safe_load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d05e4bda-c521-480c-ab93-a95a72df00e0", + "metadata": {}, + "outputs": [], + "source": [ + "readable_dict[\"frequency_graph\"][\"title\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b9b7787-7dba-4004-a0d6-63966d00f7a8", + "metadata": {}, + "outputs": [], + "source": [ + "(readable_dict[\"frequency_graph\"][\"title\"] + \" Test\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88e2a952-54ed-4c00-98e9-8cfa52a3b6bb", + "metadata": {}, + "outputs": [], + "source": [ + "alt.Chart(\n", + " marin_county_route_29.loc[marin_county_route_29.dir_0_1 == 1]\n", + ").mark_bar(size=10).encode(\n", + " x=\"yearmonthdate(Date):O\",\n", + " y=\"Speed (MPH):Q\",\n", + " color=alt.Color(\n", + " \"Period:N\",\n", + " title=_report_utils.labeling(\"Period\"),\n", + " scale=alt.Scale(range=color_dict[\"tri_color\"]),\n", + " ),\n", + 
").facet(column=alt.Column(\"Period:N\", title=_report_utils.labeling(\"Direction\")),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a4c41fe-2f46-4116-a956-3fc57cae4732", + "metadata": {}, + "outputs": [], + "source": [ + "readable_dict[\"speed_graph\"][\"title\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2966000b-29fa-4f54-b8c9-4c024811671c", + "metadata": {}, + "outputs": [], + "source": [ + "_section2_utils.grouped_bar_chart(\n", + " df = marin_county_route_29.loc[marin_county_route_29.dir_0_1 == 1],\n", + " color_col = \"Period\",\n", + " y_col = \"Speed (MPH)\",\n", + " offset_col = \"Period\",\n", + " title=readable_dict[\"speed_graph\"][\"title\"],\n", + " subtitle= readable_dict[\"speed_graph\"][\"subtitle\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32fa65ef-40f1-4289-a816-659f8b882a43", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " alt.Chart(marin_county_route_29.loc[marin_county_route_29.dir_0_1 == 0])\n", + " .mark_line(size=3)\n", + " .encode(\n", + " x=alt.X(\n", + " \"yearmonthdate(Date):O\",\n", + " title=\"Date\",\n", + " axis=alt.Axis(labelAngle=-45, format=\"%b %Y\"),\n", + " ),\n", + " y=alt.Y(\n", + " f\"{y_col}:Q\",\n", + " title=_report_utils.labeling(y_col),\n", + " scale=alt.Scale(domain=[0, max_y]),\n", + " ),\n", + " color=alt.Color(\n", + " \"Period:N\",\n", + " title=_report_utils.labeling(\"Period\"),\n", + " scale=alt.Scale(range=color_dict[\"tri_color\"]),\n", + " ),\n", + " )\n", + ").properties(width=200, height=250)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2036267-3a9d-4688-bdcb-062d45a48eca", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", 
+ "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/gtfs_digest/README.md b/gtfs_digest/README.md index 274b6f533..e1dc97236 100644 --- a/gtfs_digest/README.md +++ b/gtfs_digest/README.md @@ -1,9 +1,11 @@ # General Transit Feed Specification (GTFS) Digest -The goal of this website is to give you an overview of transit operators that produce GTFS schedule and/or real-time data. We use data from the [National Transit Database](https://www.transit.dot.gov/ntd), [National Association of City Transportation Officials's Transit Route Types](https://nacto.org/publication/transit-street-design-guide/introduction/service-context/transit-route-types/), and [GTFS feeds](https://gtfs.org/) to deliver key insights. You can find details such as the types of routes and the total scheduled hours of public transit service for which an operator runs. +The goal of this website is to give you an overview of transit operators that produce GTFS schedule and/or real-time data at the individual operator, Caltrans district, or legislative district level. -For operators who produce real-time data, we also calculate additional performance metrics for all of their routes. Examples include displaying the number of on-time, early, and late trips, the average speed, and the headway for a route. +We use data from the [National Transit Database](https://www.transit.dot.gov/ntd), [National Association of City Transportation Officials' Transit Route Types](https://nacto.org/publication/transit-street-design-guide/introduction/service-context/transit-route-types/), and [GTFS feeds](https://gtfs.org/) to deliver key insights. You can find details such as the types of routes and the total scheduled hours of public transit service for which an operator runs. -GTFS Digest will continue to evolve as we dive into our own data warehouse! 
+For operators who produce real-time data, we also calculate additional performance metrics for all their routes. Examples include displaying the number of on-time, early, and late trips, the average speed, and the headway for a route. + +GTFS Digest will continue to evolve as we dive into our own data warehouse! ## Definitions and Methodology To read about the methodology behind and the definitions of terms used throughout our work, please visit [here](https://github.com/cal-itp/data-analyses/blob/main/gtfs_digest/methodology.md). diff --git a/gtfs_digest/_section2_utils.py b/gtfs_digest/_section2_utils.py index 04aa433c4..6db3fa39a 100644 --- a/gtfs_digest/_section2_utils.py +++ b/gtfs_digest/_section2_utils.py @@ -18,7 +18,9 @@ # Data Dictionary GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") + import yaml + with open("readable.yml") as f: readable_dict = yaml.safe_load(f) @@ -34,11 +36,11 @@ def load_schedule_vp_metrics(organization:str)->pd.DataFrame: Load schedule versus realtime file. 
""" schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet" - + # schd_vp_url = "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics_AH_TESTING.parquet" # Keep only rows that are found in both schedule and real time data df = (pd.read_parquet(schd_vp_url, filters=[[("organization_name", "==", organization), - ("sched_rt_category", "==", "schedule_and_vp")]]) + ("sched_rt_category", "in", ["schedule_and_vp"])]]) ) # Delete duplicates @@ -76,6 +78,8 @@ def load_operator_metrics(organization_name:str)->pd.DataFrame: df = pd.read_parquet(url, filters=[[(("organization_name", "==", organization_name))]]) + df = df.drop_duplicates(subset = ["service_date"]).reset_index(drop = True) + # Rename dataframe df = _report_utils.replace_column_names(df) @@ -288,11 +292,11 @@ def grouped_bar_chart( chart = ( alt.Chart(df) - .mark_bar(size=8) + .mark_bar(size=5) .encode( x=alt.X( "yearmonthdate(Date):O", - title=["Grouped by Direction ID", "Date"], + title=["Date"], axis=alt.Axis(labelAngle=-45, format="%b %Y"), ), y=alt.Y(f"{y_col}:Q", title=_report_utils.labeling(y_col)), @@ -359,15 +363,17 @@ def base_facet_line( ) ) - chart = chart.properties(width=200, height=250) - chart = chart.facet( - column=alt.Column("Direction:N", title=_report_utils.labeling("Direction")), - ).properties( + chart = chart.properties(width=200, height=250).properties( title={ "text": [title], "subtitle": [subtitle], } ) + """ + chart = chart.facet( + column=alt.Column("Direction:N", title=_report_utils.labeling("Direction")), + ) + """ return chart def base_facet_circle( @@ -883,12 +889,26 @@ def filtered_route( .transform_filter(xcol_param) ) - speed_graph = ( - base_facet_line( - df, + speed_graph_dir_0 = ( + grouped_bar_chart( + df.loc[df.dir_0_1 == 0], + "Period", "Speed (MPH)", - readable_dict["speed_graph"]["title"], - readable_dict["speed_graph"]["subtitle"], + "Period", + 
readable_dict["speed_graph_dir_0"]["title"], + readable_dict["speed_graph_dir_0"]["subtitle"], + ) + .add_params(xcol_param) + .transform_filter(xcol_param) + ) + speed_graph_dir_1 = ( + grouped_bar_chart( + df.loc[df.dir_0_1 == 1], + "Period", + "Speed (MPH)", + "Period", + readable_dict["speed_graph_dir_1"]["title"], + readable_dict["speed_graph_dir_0"]["subtitle"], ) .add_params(xcol_param) .transform_filter(xcol_param) @@ -964,7 +984,8 @@ def filtered_route( timeliness_trips_dir_1, frequency_graph_dir_0, frequency_graph_dir_1, - speed_graph, + speed_graph_dir_0, + speed_graph_dir_1, data_quality, vp_per_min_graph, sched_vp_per_min, diff --git a/gtfs_digest/merge_operator_data.py b/gtfs_digest/merge_operator_data.py index 8516fc479..1eccb4ddd 100644 --- a/gtfs_digest/merge_operator_data.py +++ b/gtfs_digest/merge_operator_data.py @@ -32,7 +32,7 @@ def concatenate_operator_routes( date_list: list ) -> gpd.GeoDataFrame: FILE = GTFS_DATA_DICT.schedule_tables.operator_routes - + df = time_series_utils.concatenate_datasets_across_dates( SCHED_GCS, FILE, @@ -191,13 +191,22 @@ def operator_category_counts_by_date() -> pd.DataFrame: ) # Drop duplicates created after merging + # Add more stringent drop duplicate criteria + + duplicate_cols = ["schedule_gtfs_dataset_key", + "vp_per_min_agency", + "spatial_accuracy_agency", + "service_date", + "organization_name", + "caltrans_district"] + op_profiles_df3 = ( op_profiles_df2 .pipe( publish_utils.exclude_private_datasets, col = "schedule_gtfs_dataset_key", public_gtfs_dataset_keys = public_feeds - ).drop_duplicates(subset = list(op_profiles_df2.columns)) + ).drop_duplicates(subset = duplicate_cols) .reset_index(drop = True)) op_profiles_df3.to_parquet( diff --git a/gtfs_digest/readable.yml b/gtfs_digest/readable.yml index 57e4f9569..b3d38e9c2 100644 --- a/gtfs_digest/readable.yml +++ b/gtfs_digest/readable.yml @@ -108,9 +108,11 @@ timeliness_trips_graph: frequency_graph: title: "Frequency of Trips in Minutes" subtitle: 
"Understanding how often a trip comes. If the bar says 120 minutes, that means a trip will pass that particular direction once every 2 hours." -speed_graph: - title: "Average Speed (MPH)" +speed_graph_dir_0: + title: "Average Speed (MPH) for Direction 0" subtitle: "The average miles per hour the bus travels by direction and time of day." +speed_graph_dir_1: + title: "Average Speed (MPH) for Direction 1" vp_per_min_graph: title: "Vehicle Positions per Minute" subtitle: "Trips should have 2+ VPs per minute. This metric reflects the accuracy of the temporal data collected." diff --git a/gtfs_funnel/crosswalk_gtfs_dataset_key_to_organization.py b/gtfs_funnel/crosswalk_gtfs_dataset_key_to_organization.py index 662977aa6..f9ac6fed8 100644 --- a/gtfs_funnel/crosswalk_gtfs_dataset_key_to_organization.py +++ b/gtfs_funnel/crosswalk_gtfs_dataset_key_to_organization.py @@ -205,6 +205,7 @@ def merge_ntd_mobility(year:int)->pd.DataFrame: # Drop ntd_id from ntd_df to avoid confusion crosswalk_df = crosswalk_df.drop(columns = ["ntd_id_2022"]) + # Drop duplicates since we're getting a lot. 
crosswalk_df.to_parquet( f"{SCHED_GCS}{EXPORT}_{analysis_date}.parquet" ) diff --git a/gtfs_funnel/operator_scheduled_stats.py b/gtfs_funnel/operator_scheduled_stats.py index 4363c5801..d3b5c7919 100644 --- a/gtfs_funnel/operator_scheduled_stats.py +++ b/gtfs_funnel/operator_scheduled_stats.py @@ -192,7 +192,7 @@ def operator_typology_breakdown(df: pd.DataFrame) -> pd.DataFrame: ).merge( route_typology_grouped, on = ["schedule_gtfs_dataset_key", "route_id"], - how = "inner" + how = "left" ).merge( crosswalk, on = "schedule_gtfs_dataset_key", diff --git a/gtfs_funnel/route_typologies.py b/gtfs_funnel/route_typologies.py index b27aa864e..03153bce8 100644 --- a/gtfs_funnel/route_typologies.py +++ b/gtfs_funnel/route_typologies.py @@ -394,6 +394,7 @@ def reconcile_route_and_nacto_typologies( df3.to_parquet( f"{SCHED_GCS}{EXPORT}_{analysis_date}.parquet") + time1 = datetime.datetime.now() print(f"route typologies {analysis_date}: {time1 - time0}") diff --git a/gtfs_funnel/schedule_stats_by_route_direction.py b/gtfs_funnel/schedule_stats_by_route_direction.py index 41bee317b..38e0b6bfb 100644 --- a/gtfs_funnel/schedule_stats_by_route_direction.py +++ b/gtfs_funnel/schedule_stats_by_route_direction.py @@ -23,6 +23,7 @@ def cardinal_direction_for_route_direction(analysis_date:str, dict_inputs:dict): filters=[[("stop_primary_direction", "!=", "Unknown")] ]) + trip_scheduled_col = [ "route_id", "trip_instance_key", @@ -49,6 +50,10 @@ def cardinal_direction_for_route_direction(analysis_date:str, dict_inputs:dict): stop_times_with_trip = pd.merge(stop_times_df, trips_df, on = merge_cols) + # AH: temporarily fill in direction_id rows with nans + # should go back to the script that creates stop_times_df + stop_times_with_trip.direction_id = stop_times_with_trip.direction_id.fillna(0) + main_cols = [ "route_id", "schedule_gtfs_dataset_key", @@ -57,7 +62,8 @@ def cardinal_direction_for_route_direction(analysis_date:str, dict_inputs:dict): agg1 = ( stop_times_with_trip.groupby( - 
main_cols + ["stop_primary_direction"] + main_cols + ["stop_primary_direction"], + dropna=False ) .agg({"stop_sequence": "count"}) .reset_index() @@ -136,31 +142,37 @@ def schedule_metrics_by_route_direction( group_merge_cols: list, ) -> pd.DataFrame: """ - Aggregate trip-level metrics to route-direction, and + Aggregate trip-level metrics to route-direction, and attach shape geometry for common_shape_id. """ service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak( - df, group_merge_cols, long_or_wide = "long") - - metrics_df = (df.groupby(group_merge_cols, - observed=True, group_keys=False) - .agg({ - "median_stop_meters": "mean", - # take mean of the median stop spacing for trip - # does this make sense? - # median is the single boiled down metric at the trip-level - "scheduled_service_minutes": "mean", - }).reset_index() - .rename(columns = { - "median_stop_meters": "avg_stop_meters", - "scheduled_service_minutes": "avg_scheduled_service_minutes" - }) - ) - + df, group_merge_cols, long_or_wide="long" + ) + + metrics_df = ( + df.groupby(group_merge_cols, observed=True, group_keys=False, dropna=False) + .agg( + { + "median_stop_meters": "mean", + # take mean of the median stop spacing for trip + # does this make sense? 
+ # median is the single boiled down metric at the trip-level + "scheduled_service_minutes": "mean", + } + ) + .reset_index() + .rename( + columns={ + "median_stop_meters": "avg_stop_meters", + "scheduled_service_minutes": "avg_scheduled_service_minutes", + } + ) + ) + metrics_df = metrics_df.assign( - avg_stop_miles = metrics_df.avg_stop_meters.divide(METERS_PER_MILE).round(2) - ).drop(columns = ["avg_stop_meters"]) - + avg_stop_miles=metrics_df.avg_stop_meters.divide(METERS_PER_MILE).round(2) + ).drop(columns=["avg_stop_meters"]) + round_me = ["avg_stop_miles", "avg_scheduled_service_minutes"] metrics_df[round_me] = metrics_df[round_me].round(2) @@ -168,17 +180,11 @@ def schedule_metrics_by_route_direction( analysis_date ).pipe(helpers.remove_shapes_outside_ca) - df = pd.merge( - common_shape, - metrics_df, - on = group_merge_cols, - how = "inner" - ).merge( - service_freq_df, - on = group_merge_cols, - how = "inner" + df = pd.merge(common_shape, metrics_df, on=group_merge_cols, how="inner").merge( + service_freq_df, on=group_merge_cols, how="inner" ) - + + df.time_period = df.time_period.fillna(df.peak_offpeak) return df @@ -195,7 +201,9 @@ def schedule_metrics_by_route_direction( # Find metrics on the trip grain trip_metrics = assemble_scheduled_trip_metrics(date, GTFS_DATA_DICT) - + + trip_metrics.direction_id = trip_metrics.direction_id.fillna(0) + trip_metrics.to_parquet( f"{RT_SCHED_GCS}{TRIP_EXPORT}_{date}.parquet") @@ -235,4 +243,4 @@ def schedule_metrics_by_route_direction( ) end = datetime.datetime.now() - print(f"schedule stats for {date}: {end - start}") + print(f"schedule stats for {date}: {end - start}") \ No newline at end of file diff --git a/gtfs_funnel/update_vars.py b/gtfs_funnel/update_vars.py index ae625f3cc..1d2d5aac3 100644 --- a/gtfs_funnel/update_vars.py +++ b/gtfs_funnel/update_vars.py @@ -12,7 +12,7 @@ ) -# analysis_date_list = [rt_dates.DATES["dec2024"]] +# analysis_date_list = [rt_dates.DATES["dec2024"]] + [rt_dates.DATES['nov2024']] 
analysis_date_list = all_dates GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") diff --git a/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log b/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log index 5966dcaed..1352bee1b 100644 --- a/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log +++ b/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log @@ -80,3 +80,26 @@ 2024-11-20 10:54:08.184 | INFO | __main__:route_metrics:85 - route aggregation 2024-11-13: 0:00:05.430277 2024-12-17 15:40:02.618 | INFO | __main__:route_metrics:85 - route aggregation 2024-12-11: 0:00:02.999985 2024-12-19 09:53:33.513 | INFO | __main__:route_metrics:85 - route aggregation 2024-12-11: 0:00:03.161056 +2025-01-15 15:11:15.095 | INFO | __main__:route_metrics:88 - route aggregation 2024-11-13: 0:00:02.958490 +2025-01-16 15:49:19.011 | INFO | __main__:route_metrics:88 - route aggregation 2024-01-17: 0:00:03.055709 +2025-01-16 15:49:21.746 | INFO | __main__:route_metrics:88 - route aggregation 2024-02-14: 0:00:02.700508 +2025-01-16 15:49:24.061 | INFO | __main__:route_metrics:88 - route aggregation 2024-03-13: 0:00:02.309990 +2025-01-16 15:49:26.168 | INFO | __main__:route_metrics:88 - route aggregation 2024-04-17: 0:00:02.101453 +2025-01-16 15:49:28.852 | INFO | __main__:route_metrics:88 - route aggregation 2024-05-22: 0:00:02.679262 +2025-01-16 15:49:31.068 | INFO | __main__:route_metrics:88 - route aggregation 2024-06-12: 0:00:02.211054 +2025-01-16 15:49:32.952 | INFO | __main__:route_metrics:88 - route aggregation 2024-07-17: 0:00:01.879407 +2025-01-16 15:49:34.738 | INFO | __main__:route_metrics:88 - route aggregation 2024-08-14: 0:00:01.781556 +2025-01-16 15:49:36.550 | INFO | __main__:route_metrics:88 - route aggregation 2024-09-18: 0:00:01.806138 +2025-01-16 15:49:38.451 | INFO | __main__:route_metrics:88 - route aggregation 2024-10-16: 0:00:01.896576 +2025-01-16 15:49:40.292 | INFO | __main__:route_metrics:88 - route aggregation 2024-11-13: 
0:00:01.836643 +2025-01-16 15:49:41.992 | INFO | __main__:route_metrics:88 - route aggregation 2024-12-11: 0:00:01.695409 +2025-01-16 15:49:43.743 | INFO | __main__:route_metrics:88 - route aggregation 2023-03-15: 0:00:01.741539 +2025-01-16 15:49:45.597 | INFO | __main__:route_metrics:88 - route aggregation 2023-04-12: 0:00:01.849813 +2025-01-16 15:49:47.350 | INFO | __main__:route_metrics:88 - route aggregation 2023-05-17: 0:00:01.749005 +2025-01-16 15:49:49.083 | INFO | __main__:route_metrics:88 - route aggregation 2023-06-14: 0:00:01.725776 +2025-01-16 15:49:50.855 | INFO | __main__:route_metrics:88 - route aggregation 2023-07-12: 0:00:01.768527 +2025-01-16 15:49:52.712 | INFO | __main__:route_metrics:88 - route aggregation 2023-08-15: 0:00:01.851221 +2025-01-16 15:49:54.532 | INFO | __main__:route_metrics:88 - route aggregation 2023-09-13: 0:00:01.815201 +2025-01-16 15:49:56.361 | INFO | __main__:route_metrics:88 - route aggregation 2023-10-11: 0:00:01.825395 +2025-01-16 15:49:58.178 | INFO | __main__:route_metrics:88 - route aggregation 2023-11-15: 0:00:01.812722 +2025-01-16 15:50:00.055 | INFO | __main__:route_metrics:88 - route aggregation 2023-12-13: 0:00:01.873527 diff --git a/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py b/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py index 176c0db68..621457dd2 100644 --- a/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py +++ b/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py @@ -52,6 +52,9 @@ def route_metrics( f"{RT_SCHED_GCS}{TRIP_EXPORT}_{analysis_date}.parquet" ) + # Fill in trip_export with direction_id here temporarily + trip_df.direction_id = trip_df.direction_id.fillna(0) + crosswalk_cols = [ "schedule_gtfs_dataset_key", "name", diff --git a/rt_scheduled_v_ran/scripts/update_vars.py b/rt_scheduled_v_ran/scripts/update_vars.py index 11f5f64ff..c000ab20a 100644 --- a/rt_scheduled_v_ran/scripts/update_vars.py +++ b/rt_scheduled_v_ran/scripts/update_vars.py @@ -7,8 +7,8 @@ oct2024_week = 
rt_dates.get_week("oct2024", exclude_wed=True) -analysis_date_list = [rt_dates.DATES["dec2024"]] -# analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates +# analysis_date_list = [rt_dates.DATES["nov2024"]] +analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") diff --git a/rt_segment_speeds/logs/avg_speeds.log b/rt_segment_speeds/logs/avg_speeds.log index abffd9563..26dd87007 100644 --- a/rt_segment_speeds/logs/avg_speeds.log +++ b/rt_segment_speeds/logs/avg_speeds.log @@ -584,3 +584,97 @@ 2024-12-18 15:06:31.961 | INFO | average_segment_speeds:segment_averages_detail:249 - speedmap_segments detailed segment averaging for ['2024-12-11'] execution time: 0:07:17.222047 2024-12-18 15:12:50.801 | INFO | average_segment_speeds:segment_averages:185 - speedmap_segments segment averaging for ['2024-12-11'] execution time: 0:06:18.665542 2024-12-18 15:17:58.646 | INFO | average_segment_speeds:segment_averages:185 - speedmap_segments segment averaging for ['2024-12-11'] execution time: 0:05:07.636255 +2025-01-15 14:36:17.067 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:18.845972 +2025-01-15 14:36:31.001 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-11-13'] execution time: 0:00:32.780041 +2025-01-16 16:04:20.713 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:16.934088 +2025-01-16 16:04:32.821 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-01-17'] execution time: 0:00:29.042487 +2025-01-16 16:04:46.545 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:13.629139 +2025-01-16 16:04:57.111 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-02-14'] execution time: 0:00:24.194859 +2025-01-16 16:05:11.281 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:14.052910 +2025-01-16 16:05:20.619 | INFO | 
__main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-03-13'] execution time: 0:00:23.391635 +2025-01-16 16:05:34.034 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:13.328255 +2025-01-16 16:05:43.981 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-04-17'] execution time: 0:00:23.274450 +2025-01-16 16:05:56.572 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.484715 +2025-01-16 16:06:06.317 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-05-22'] execution time: 0:00:22.229829 +2025-01-16 16:06:17.425 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.002452 +2025-01-16 16:06:25.857 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-06-12'] execution time: 0:00:19.433568 +2025-01-16 16:06:38.063 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.129066 +2025-01-16 16:06:46.950 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-07-17'] execution time: 0:00:21.016595 +2025-01-16 16:06:59.293 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.254732 +2025-01-16 16:07:08.575 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-08-14'] execution time: 0:00:21.535979 +2025-01-16 16:07:21.188 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.529217 +2025-01-16 16:07:30.842 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-09-18'] execution time: 0:00:22.183636 +2025-01-16 16:07:43.077 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.145614 +2025-01-16 16:07:52.741 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-16'] execution time: 0:00:21.809547 +2025-01-16 16:08:06.064 | INFO | __main__:summary_average_speeds:120 - trip avg 
0:00:13.208644 +2025-01-16 16:08:16.431 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-11-13'] execution time: 0:00:23.576042 +2025-01-16 16:08:29.763 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:13.232394 +2025-01-16 16:08:40.304 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-12-11'] execution time: 0:00:23.773742 +2025-01-16 16:08:52.689 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.282962 +2025-01-16 16:09:03.966 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-03-15'] execution time: 0:00:23.560709 +2025-01-16 16:09:16.346 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.293681 +2025-01-16 16:09:27.403 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-12'] execution time: 0:00:23.350808 +2025-01-16 16:09:39.044 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.520841 +2025-01-16 16:09:49.899 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-05-17'] execution time: 0:00:22.376414 +2025-01-16 16:10:01.160 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.178880 +2025-01-16 16:10:11.350 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-06-14'] execution time: 0:00:21.369181 +2025-01-16 16:10:23.073 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.634097 +2025-01-16 16:10:33.139 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-07-12'] execution time: 0:00:21.699396 +2025-01-16 16:10:45.143 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.916611 +2025-01-16 16:10:56.045 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-08-15'] execution time: 0:00:22.818622 +2025-01-16 16:11:07.561 | INFO 
| __main__:summary_average_speeds:120 - trip avg 0:00:11.420412 +2025-01-16 16:11:18.107 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-09-13'] execution time: 0:00:21.966925 +2025-01-16 16:11:30.453 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.253862 +2025-01-16 16:11:41.081 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-11'] execution time: 0:00:22.881764 +2025-01-16 16:11:53.690 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.516304 +2025-01-16 16:12:04.782 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-11-15'] execution time: 0:00:23.608095 +2025-01-16 16:12:17.336 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.461608 +2025-01-16 16:12:27.033 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-12-13'] execution time: 0:00:22.158792 +2025-01-16 16:12:38.972 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.817331 +2025-01-16 16:12:48.760 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-14'] execution time: 0:00:21.605632 +2025-01-16 16:13:00.707 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.855366 +2025-01-16 16:13:10.306 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-15'] execution time: 0:00:21.454589 +2025-01-16 16:13:22.517 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.118255 +2025-01-16 16:13:32.147 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-17'] execution time: 0:00:21.748474 +2025-01-16 16:13:44.222 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.973277 +2025-01-16 16:13:54.036 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-18'] execution 
time: 0:00:21.787478 +2025-01-16 16:14:03.231 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:09.096869 +2025-01-16 16:14:10.890 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-19'] execution time: 0:00:16.755838 +2025-01-16 16:14:19.475 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:08.520944 +2025-01-16 16:14:27.037 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-20'] execution time: 0:00:16.082530 +2025-01-16 16:22:23.209 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:15.830816 +2025-01-16 16:22:35.360 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-04-15'] execution time: 0:00:27.982591 +2025-01-16 16:22:49.048 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:13.587776 +2025-01-16 16:23:00.716 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-04-16'] execution time: 0:00:25.255428 +2025-01-16 16:23:14.773 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:13.934353 +2025-01-16 16:23:24.388 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-04-18'] execution time: 0:00:23.549288 +2025-01-16 16:23:36.612 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.119526 +2025-01-16 16:23:46.011 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-04-19'] execution time: 0:00:21.518818 +2025-01-16 16:23:54.762 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:08.666832 +2025-01-16 16:24:02.242 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-04-20'] execution time: 0:00:16.145971 +2025-01-16 16:24:11.687 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:09.381356 +2025-01-16 16:24:19.204 | INFO | __main__:summary_average_speeds:154 - rt_stop_times 
summary speed averaging for ['2024-04-21'] execution time: 0:00:16.898038 +2025-01-16 16:24:31.946 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.679766 +2025-01-16 16:24:43.239 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-09'] execution time: 0:00:23.972403 +2025-01-16 16:24:57.759 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:14.395392 +2025-01-16 16:25:10.122 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-10'] execution time: 0:00:26.758428 +2025-01-16 16:25:22.503 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.292461 +2025-01-16 16:25:33.968 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-12'] execution time: 0:00:23.756984 +2025-01-16 16:25:46.305 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.247253 +2025-01-16 16:25:57.723 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-13'] execution time: 0:00:23.665107 +2025-01-16 16:26:08.382 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:10.573562 +2025-01-16 16:26:19.198 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-14'] execution time: 0:00:21.388992 +2025-01-16 16:26:27.808 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:08.541491 +2025-01-16 16:26:36.529 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-15'] execution time: 0:00:17.262397 +2025-01-16 16:26:48.305 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.712182 +2025-01-16 16:26:59.835 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-10'] execution time: 0:00:23.242696 +2025-01-16 16:27:12.344 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.418244 +2025-01-16 16:27:24.277 | INFO | 
__main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-11'] execution time: 0:00:24.351473 +2025-01-16 16:27:36.863 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.501083 +2025-01-16 16:27:48.867 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-13'] execution time: 0:00:24.505543 +2025-01-16 16:28:02.931 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:13.978393 +2025-01-16 16:28:14.047 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-14'] execution time: 0:00:25.093847 +2025-01-16 16:28:22.438 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:08.302166 +2025-01-16 16:28:31.739 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-15'] execution time: 0:00:17.603240 +2025-01-16 16:28:40.309 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:08.508140 +2025-01-16 16:28:49.151 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-16'] execution time: 0:00:17.350031 diff --git a/rt_segment_speeds/scripts/average_segment_speeds.py b/rt_segment_speeds/scripts/average_segment_speeds.py index 0a9fea9e0..c0f89426d 100644 --- a/rt_segment_speeds/scripts/average_segment_speeds.py +++ b/rt_segment_speeds/scripts/average_segment_speeds.py @@ -72,12 +72,20 @@ def concatenate_trip_segment_speeds( ).pipe( gtfs_schedule_wrangling.add_peak_offpeak_column ) + """ + Amanda: There's already a `service_date` column. df = df.rename(columns={'arrival_time':'service_date'} ).pipe( gtfs_schedule_wrangling.add_weekday_weekend_column ) # drop service_date? print("concatenated files") + """ + df = df.pipe( + gtfs_schedule_wrangling.add_weekday_weekend_column + ) # drop service_date? 
+ df.direction_id = df.direction_id.fillna(0) + print("concatenated files") return df @@ -96,6 +104,9 @@ def merge_in_segment_geometry( f"{SEGMENT_GCS}{SEGMENT_FILE}_{analysis_date}.parquet", ).to_crs(WGS84) + # Amanda: go back to the script that creates segment_geom to fill in nans + segment_geom.direction_id = segment_geom.direction_id.fillna(0) + col_order = [c for c in speeds_by_segment.columns] # The merge columns list should be all the columns that are in common @@ -134,6 +145,9 @@ def segment_averages( get_pandas = False ) + # Amanda, temporarily filling in direction id here + df.direction_id = df.direction_id.fillna(0) + if weighted_averages: avg_speeds = delayed(metrics.concatenate_peak_offpeak_allday_averages)( df, diff --git a/rt_segment_speeds/scripts/average_summary_speeds.py b/rt_segment_speeds/scripts/average_summary_speeds.py index d3c3ad25e..a7d389b82 100644 --- a/rt_segment_speeds/scripts/average_summary_speeds.py +++ b/rt_segment_speeds/scripts/average_summary_speeds.py @@ -149,7 +149,6 @@ def summary_average_speeds( f"{export_file}_{time_span_str}" ) - end = datetime.datetime.now() logger.info( @@ -177,7 +176,6 @@ def summary_average_speeds( ROUTE_DIR_COLS = [*dict_inputs["route_dir_cols"]] ROUTE_DIR_FILE = dict_inputs["route_dir_single_summary"] - for analysis_date in analysis_date_list: summary_average_speeds( @@ -186,7 +184,6 @@ def summary_average_speeds( group_cols = OPERATOR_COLS + ROUTE_DIR_COLS, export_file = ROUTE_DIR_FILE ) - ''' from segment_speed_utils.project_vars import weeks_available diff --git a/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py b/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py index 2e1316a5e..0d61fa7aa 100644 --- a/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py +++ b/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py @@ -127,20 +127,21 @@ def add_weekday_weekend_column(df: pd.DataFrame, category_dict: dict = time_help ) return df - + def 
count_trips_by_group(df: pd.DataFrame, group_cols: list): """ - Given a df with trip_instance_key and an arbitrary list of + Given a df with trip_instance_key and an arbitrary list of group_cols, return trip counts by group. """ assert "trip_instance_key" in df.columns - df = (df.groupby(group_cols) - .agg({"trip_instance_key": "count"}) - .reset_index() - ) - df = df.rename(columns = {"trip_instance_key": "n_trips"}) + df = ( + df.groupby(group_cols, dropna=False) + .agg({"trip_instance_key": "count"}) + .reset_index() + ) + df = df.rename(columns={"trip_instance_key": "n_trips"}) return df - + def aggregate_time_of_day_to_peak_offpeak( df: pd.DataFrame, group_cols: list, @@ -388,7 +389,7 @@ def most_common_shape_by_route_direction(analysis_date: str) -> gpd.GeoDataFrame most_common_shape = ( trips.groupby(route_dir_cols + ["shape_id", "shape_array_key"], - observed=True, group_keys = False) + observed=True, group_keys = False, dropna= False) .agg({"trip_instance_key": "count"}) .reset_index() .sort_values(route_dir_cols + ["trip_instance_key"], @@ -429,6 +430,8 @@ def most_common_shape_by_route_direction(analysis_date: str) -> gpd.GeoDataFrame on = ["schedule_gtfs_dataset_key", "route_id"] ) + # Amanda: test + common_shape_geom2.direction_id = common_shape_geom2.direction_id.fillna(0) return common_shape_geom2 diff --git a/rt_segment_speeds/segment_speed_utils/metrics.py b/rt_segment_speeds/segment_speed_utils/metrics.py index 1b3ac554c..8abe8d100 100644 --- a/rt_segment_speeds/segment_speed_utils/metrics.py +++ b/rt_segment_speeds/segment_speed_utils/metrics.py @@ -9,9 +9,8 @@ from segment_speed_utils import segment_calcs def weighted_average_speeds_across_segments( - df: pd.DataFrame, - group_cols: list -) -> pd.DataFrame: + df: pd.DataFrame, group_cols: list +) -> pd.DataFrame: """ We can use our segments and the deltas within a trip to calculate the trip-level average speed, or @@ -19,15 +18,16 @@ def weighted_average_speeds_across_segments( But, we want a 
weighted average, using the raw deltas instead of mean(speed_mph), since segments can be varying lengths. """ - avg_speeds = (df.groupby(group_cols, - observed=True, group_keys=False) - .agg({ - "meters_elapsed": "sum", - "sec_elapsed": "sum", - }).reset_index() - ).pipe( - segment_calcs.speed_from_meters_elapsed_sec_elapsed - ) + avg_speeds = ( + df.groupby(group_cols, observed=True, group_keys=False, dropna=False) + .agg( + { + "meters_elapsed": "sum", + "sec_elapsed": "sum", + } + ) + .reset_index() + ).pipe(segment_calcs.speed_from_meters_elapsed_sec_elapsed) return avg_speeds @@ -112,10 +112,10 @@ def derive_trip_comparison_metrics( def calculate_weighted_average_vp_schedule_metrics( - df: pd.DataFrame, + df: pd.DataFrame, group_cols: list, ) -> pd.DataFrame: - + sum_cols = [ "minutes_atleast1_vp", "minutes_atleast2_vp", @@ -123,21 +123,20 @@ def calculate_weighted_average_vp_schedule_metrics( "scheduled_service_minutes", "total_vp", "vp_in_shape", - "is_early", "is_ontime", "is_late" + "is_early", + "is_ontime", + "is_late", ] count_cols = ["trip_instance_key"] - + df2 = ( - df.groupby(group_cols, - observed=True, group_keys=False) - .agg({ - **{e: "sum" for e in sum_cols}, - **{e: "count" for e in count_cols}} - ).reset_index() - .rename(columns = {"trip_instance_key": "n_vp_trips"}) + df.groupby(group_cols, observed=True, group_keys=False, dropna=False) + .agg({**{e: "sum" for e in sum_cols}, **{e: "count" for e in count_cols}}) + .reset_index() + .rename(columns={"trip_instance_key": "n_vp_trips"}) ) - + return df2 diff --git a/rt_segment_speeds/segment_speed_utils/project_vars.py b/rt_segment_speeds/segment_speed_utils/project_vars.py index 6d28427eb..cae8f5759 100644 --- a/rt_segment_speeds/segment_speed_utils/project_vars.py +++ b/rt_segment_speeds/segment_speed_utils/project_vars.py @@ -11,13 +11,14 @@ SHARED_GCS = GTFS_DATA_DICT.gcs_paths.SHARED_GCS PUBLIC_GCS = GTFS_DATA_DICT.gcs_paths.PUBLIC_GCS -analysis_date = rt_dates.DATES["dec2024"] +# 
analysis_date = rt_dates.DATES["nov2024"] oct2023_week = rt_dates.get_week("oct2023", exclude_wed=True) apr2023_week = rt_dates.get_week("apr2023", exclude_wed=True) apr2024_week = rt_dates.get_week("apr2024", exclude_wed=True) oct2024_week = rt_dates.get_week("oct2024", exclude_wed=True) +# One file wasn't found for October 21 2024 all_dates = ( rt_dates.y2024_dates + rt_dates.y2023_dates + oct2024_week + apr2024_week + oct2023_week + apr2023_week @@ -28,8 +29,7 @@ rt_dates.oct2023_week, rt_dates.apr2023_week, ] - -analysis_date_list = [analysis_date] +analysis_date_list = apr2024_week + oct2023_week + apr2023_week PROJECT_CRS = "EPSG:3310" diff --git a/rt_segment_speeds/segment_speed_utils/segment_calcs.py b/rt_segment_speeds/segment_speed_utils/segment_calcs.py index 4fb21c5b8..bd15cbd5c 100644 --- a/rt_segment_speeds/segment_speed_utils/segment_calcs.py +++ b/rt_segment_speeds/segment_speed_utils/segment_calcs.py @@ -68,11 +68,10 @@ def calculate_avg_speeds( # pd.groupby and pd.quantile is so slow # create our own list of speeds and use np df2 = (df.groupby(group_cols, - observed=True, group_keys=False) + observed=True, group_keys=False, dropna=False) .agg({"speed_mph": lambda x: sorted(list(x))}) .reset_index() - .rename(columns = {"speed_mph": "speed_mph_list"}) - ) + .rename(columns = {"speed_mph": "speed_mph_list"})) df2 = df2.assign( p50_mph = df2.apply(lambda x: np.percentile(x.speed_mph_list, q=50), axis=1),