diff --git a/gtfs_digest/03_report.ipynb b/gtfs_digest/03_report.ipynb
index de0335f01..b2d89dd9d 100644
--- a/gtfs_digest/03_report.ipynb
+++ b/gtfs_digest/03_report.ipynb
@@ -54,9 +54,8 @@
"# Comment out and leave this cell right below pandas\n",
"# organization_name = \"Marin County Transit District\"\n",
"# organization_name = \"Monterey-Salinas Transit\"\n",
- "# organization_name = \"City of Visalia\"\n",
- "# organization_name = \"City of Simi Valley\"\n",
- "# organization_name = \"Curry Public Transit\""
+ "organization_name = \"City of Santa Maria\"\n",
+ "# organization_name = \"Capitol Corridor Joint Powers Authority\""
]
},
{
@@ -68,8 +67,8 @@
},
"outputs": [],
"source": [
- "%%capture_parameters\n",
- "organization_name"
+ "# %%capture_parameters\n",
+ "# organization_name"
]
},
{
@@ -539,6 +538,24 @@
"except:\n",
" display(Markdown(f\"\"\"{organization_name} only has schedule data.\"\"\"))"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dcb3b64e-15df-49a2-bdf2-b9d2132fa49f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display(section2.filtered_route(sched_vp_df))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bf0776b1-6812-4c29-a036-4d218aade386",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
diff --git a/gtfs_digest/43_missing_routes.ipynb b/gtfs_digest/43_missing_routes.ipynb
index a1d750d62..42f673b49 100644
--- a/gtfs_digest/43_missing_routes.ipynb
+++ b/gtfs_digest/43_missing_routes.ipynb
@@ -8,7 +8,11 @@
"## Find Missing Routes: 2 operators. \n",
"* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capital Corridor doesn't have any rail routes. \n",
"* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)\n",
- "* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`"
+ "* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`\n",
+ "\n",
+ "To-Do\n",
+ "* Move all the code here to the proper file.\n",
+ "* Rerun all the scripts that create the underlying dataframes for the November date (`df_sched`, `df_avg_speeds`, `df_rt_sched`) and merge them using `gtfs_digest/merge_data.merge_data_sources_by_route_direction()`."
]
},
{
@@ -22,281 +26,167 @@
"import merge_data\n",
"import numpy as np\n",
"import pandas as pd\n",
- "from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils\n",
- "from shared_utils import catalog_utils, rt_dates, rt_utils\n",
- "from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS"
+ "from segment_speed_utils import (\n",
+ " gtfs_schedule_wrangling,\n",
+ " helpers,\n",
+ " metrics,\n",
+ " segment_calcs,\n",
+ " time_series_utils,\n",
+ ")\n",
+ "from shared_utils import (\n",
+ " catalog_utils,\n",
+ " portfolio_utils,\n",
+ " rt_dates,\n",
+ " rt_utils,\n",
+ " time_helpers,\n",
+ ")\n",
+ "from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS"
]
},
{
"cell_type": "code",
"execution_count": 2,
- "id": "74eaf3a5-711d-447d-a945-93cc24dd6f14",
+ "id": "f1ff9b22-b6cf-47d5-bc20-138f992a9519",
"metadata": {},
"outputs": [],
"source": [
- "pd.options.display.max_columns = 100\n",
- "pd.options.display.float_format = \"{:.2f}\".format\n",
- "pd.set_option(\"display.max_rows\", None)\n",
- "pd.set_option(\"display.max_colwidth\", None)"
+ "from shared_utils.rt_utils import METERS_PER_MILE"
]
},
{
"cell_type": "code",
"execution_count": 3,
- "id": "cb99b4b5-7745-422c-a6c5-153f02ffc244",
+ "id": "74eaf3a5-711d-447d-a945-93cc24dd6f14",
"metadata": {},
"outputs": [],
"source": [
- "OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles\n",
- "OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map"
+ "pd.options.display.max_columns = 100\n",
+ "pd.options.display.float_format = \"{:.2f}\".format\n",
+ "pd.set_option(\"display.max_rows\", None)\n",
+ "pd.set_option(\"display.max_colwidth\", None)"
]
},
{
"cell_type": "code",
"execution_count": 4,
- "id": "55faff71-f82c-46fc-a99d-dcc40205e100",
+ "id": "1da55301-1cb1-4187-a90c-9ed3d1c39706",
"metadata": {},
"outputs": [],
"source": [
- "operator_route_gdf = gpd.read_parquet(\n",
- " f\"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet\",\n",
- ")"
+ "org_name_lists = [\"Capitol Corridor Joint Powers Authority\", \"City of Santa Maria\"]"
]
},
{
"cell_type": "code",
"execution_count": 5,
- "id": "9b2c852f-f053-406a-8274-8b4f015f10c9",
+ "id": "370e6e0d-edb8-40ab-8a27-b299ea9c279e",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',\n",
- " 'direction_id', 'route_key', 'route_length', 'route_length_miles',\n",
- " 'is_downtown_local', 'is_local', 'is_coverage', 'is_rapid',\n",
- " 'is_express', 'is_rail', 'organization_source_record_id',\n",
- " 'organization_name', 'service_date', 'name', 'route_long_name',\n",
- " 'route_short_name', 'route_combined_name', 'route_id'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "operator_route_gdf.columns"
+ "analysis_date_list = [\"2024-11-13\"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
- "id": "1da55301-1cb1-4187-a90c-9ed3d1c39706",
+ "id": "05bd6fee-c007-4d01-a29e-05c30c478fcb",
"metadata": {},
"outputs": [],
"source": [
- "org_name_lists = [\"Capitol Corridor Joint Powers Authority\", \"City of Santa Maria\"]"
+ "one_analysis_date = \"2024-11-13\""
]
},
{
"cell_type": "code",
"execution_count": 7,
- "id": "b164eae4-f657-49e3-ada1-e059362e4689",
+ "id": "29688d1c-3239-4bc0-935d-8947a426d02d",
"metadata": {},
"outputs": [],
"source": [
- "operator_route_gdf2 = operator_route_gdf.loc[\n",
- " operator_route_gdf.organization_name.isin(org_name_lists)\n",
+ "schd_keys = [\n",
+ " \"5a8721fe96786fcd25fba1f8a0ee6358\",\n",
+ " \"73105f2d1cabc8170ab066d96863c5d5\",\n",
+ " \"f5a749dd65924e025b1293c58f95f8d6\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 8,
- "id": "89ccde0b-736c-4fc9-a294-8a12116823a8",
+ "id": "b4a5fd8f-b4ed-42b4-ab3a-199e0ce779ae",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',\n",
- " 'direction_id', 'route_key', 'route_length', 'route_length_miles',\n",
- " 'is_downtown_local', 'is_local', 'is_coverage', 'is_rapid',\n",
- " 'is_express', 'is_rail', 'organization_source_record_id',\n",
- " 'organization_name', 'service_date', 'name', 'route_long_name',\n",
- " 'route_short_name', 'route_combined_name', 'route_id'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "operator_route_gdf2.columns"
+ "import sys\n",
+ "\n",
+ "sys.path.append(\"../gtfs_funnel/\")\n",
+ "import operator_scheduled_stats\n",
+ "import schedule_stats_by_route_direction"
]
},
{
"cell_type": "code",
"execution_count": 9,
- "id": "295aaf35-9ade-4f9e-bc4d-5b8ef95a1569",
+ "id": "62b562a2-9422-4d56-8baf-9d0a87d0b5da",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "41"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "len(operator_route_gdf2)"
+ "def preview(df):\n",
+ " df2 = df[\n",
+ " [\"schedule_gtfs_dataset_key\", \"route_id\", \"direction_id\"]\n",
+ " ].drop_duplicates()\n",
+ " display(df2)"
]
},
{
- "cell_type": "code",
- "execution_count": 10,
- "id": "5630aaaa-dc8b-4917-b9fa-ae0924999720",
+ "cell_type": "markdown",
+ "id": "1739c2de-8d1c-4ec2-8bbf-a05838fb803e",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 41\n",
- "Name: is_rail, dtype: int64"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "operator_route_gdf2.is_rail.value_counts()"
+ "### Fix `schd_vp_url`"
]
},
{
"cell_type": "code",
- "execution_count": 11,
- "id": "3ecc56aa-63ce-402b-8136-a847fd5c0d11",
+ "execution_count": 10,
+ "id": "38069e57-1172-4261-9312-c9e7da14619f",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Capitol Corridor Joint Powers Authority 21\n",
- "City of Santa Maria 20\n",
- "Name: organization_name, dtype: int64"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "operator_route_gdf2.organization_name.value_counts()"
+ "schd_vp_url = f\"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet\""
]
},
{
"cell_type": "code",
- "execution_count": 12,
- "id": "5798606e-2ea4-4ab0-a6d8-a5597a51e66f",
+ "execution_count": 11,
+ "id": "7b60298d-23a9-4a9a-8086-153f0dc8a0e9",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['5a8721fe96786fcd25fba1f8a0ee6358',\n",
- " '73105f2d1cabc8170ab066d96863c5d5',\n",
- " 'f5a749dd65924e025b1293c58f95f8d6'], dtype=object)"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "operator_route_gdf2.schedule_gtfs_dataset_key.unique()"
+ "schd_vp_df = pd.read_parquet(schd_vp_url)"
]
},
{
- "cell_type": "markdown",
- "id": "26d11950-fca8-4f5b-8d17-2b9fa0aa368c",
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "36152890-03ba-47a6-9bdf-89489be23410",
"metadata": {},
+ "outputs": [],
"source": [
- "### Why does City of Santa Maria have multiple schedule_gtfs_dataset_keys?"
+ "schd_vp_df2 = schd_vp_df.loc[schd_vp_df.organization_name.isin(org_name_lists)]"
]
},
{
"cell_type": "code",
"execution_count": 13,
- "id": "81fbd586-cc2d-4a70-97a6-5b25228684b8",
+ "id": "a674a033-5a27-4a34-98ed-ad86c37e6416",
"metadata": {},
"outputs": [
{
"data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " | \n",
- " route_short_name | \n",
- "
\n",
- " \n",
- " organization_name | \n",
- " schedule_gtfs_dataset_key | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " Capitol Corridor Joint Powers Authority | \n",
- " f5a749dd65924e025b1293c58f95f8d6 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " City of Santa Maria | \n",
- " 5a8721fe96786fcd25fba1f8a0ee6358 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
"text/plain": [
- " route_short_name\n",
- "organization_name schedule_gtfs_dataset_key \n",
- "Capitol Corridor Joint Powers Authority f5a749dd65924e025b1293c58f95f8d6 1\n",
- "City of Santa Maria 5a8721fe96786fcd25fba1f8a0ee6358 1\n",
- " 73105f2d1cabc8170ab066d96863c5d5 1"
+ "array(['b3848f93-d26b-48a9-b6a6-5de22a4eab47', '5', 'Shuttle', 'CC'],\n",
+ " dtype=object)"
]
},
"execution_count": 13,
@@ -305,27 +195,23 @@
}
],
"source": [
- "operator_route_gdf2.groupby([\"organization_name\", \"schedule_gtfs_dataset_key\"]).agg(\n",
- " {\"route_short_name\": \"nunique\"}\n",
- ")"
+ "schd_vp_df2.route_id.unique()"
]
},
{
"cell_type": "code",
"execution_count": 14,
- "id": "f989221a-19f9-4f4f-8655-df4f68e7ca15",
+ "id": "e37b2103-d050-42a2-8a51-646beb6873bb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',\n",
- " 'direction_id', 'route_key', 'route_length', 'route_length_miles',\n",
- " 'is_downtown_local', 'is_local', 'is_coverage', 'is_rapid',\n",
- " 'is_express', 'is_rail', 'organization_source_record_id',\n",
- " 'organization_name', 'service_date', 'name', 'route_long_name',\n",
- " 'route_short_name', 'route_combined_name', 'route_id'],\n",
- " dtype='object')"
+ "Shuttle 132\n",
+ "CC 84\n",
+ "5 67\n",
+ "b3848f93-d26b-48a9-b6a6-5de22a4eab47 6\n",
+ "Name: route_id, dtype: int64"
]
},
"execution_count": 14,
@@ -334,22 +220,19 @@
}
],
"source": [
- "operator_route_gdf2.columns"
+ "schd_vp_df2.route_id.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 15,
- "id": "568e2a00-8f8c-451c-8b6d-ae331d18471c",
+ "id": "74d096a3-ff5c-42cd-a5f4-5faf3ae83ffe",
"metadata": {},
"outputs": [
{
"data": {
- "text/html": [
- "Make this Notebook Trusted to load map: File -> Trust Notebook
"
- ],
"text/plain": [
- ""
+ "array(['all_day', 'offpeak', 'peak', None], dtype=object)"
]
},
"execution_count": 15,
@@ -358,60 +241,56 @@
}
],
"source": [
- "operator_route_gdf2.drop(columns=[\"service_date\"]).explore(\"organization_name\")"
+ "schd_vp_df2.time_period.unique()"
]
},
{
- "cell_type": "code",
- "execution_count": 16,
- "id": "bd466515-a3cd-473a-a01a-2e73f9507104",
+ "cell_type": "markdown",
+ "id": "00214565-1faa-44c2-8b8e-f95aeff43e0c",
"metadata": {},
- "outputs": [],
"source": [
- "# operator_route_gdf2.drop(columns = [\"service_date\"]).explore(\"shape_array_key\")"
+ "### DONE Check out `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling`\n",
+ "* https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py \n",
+ "* Tiffany: Can you try specifying the `dropna` argument inside pandas `groupby`? Our pandas version has gone through upgrades (from 0.25 to 1.5 now); this argument was introduced in 1.1, and since it defaults to `dropna=True`, that's probably what's driving the row behavior.\n",
+ "* It worked! Now time to rerun stuff further down the pipeline and see what happens."
]
},
{
- "cell_type": "markdown",
- "id": "b1ddfdee-292e-4d57-bb1e-17248e87fce8",
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "74a4ce67-82af-40cb-9a9e-02464ff0e512",
"metadata": {},
+ "outputs": [],
"source": [
- "### Find longest_shape_array_key [here](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/gtfs_funnel/operator_scheduled_stats.py#L148)\n",
- "* There aren't any routes for Santa Maria\n",
- "* Routes are showing for Capital Corridor."
+ "common_shape_test = gtfs_schedule_wrangling.most_common_shape_by_route_direction(\n",
+ " one_analysis_date\n",
+ ")"
]
},
{
"cell_type": "code",
"execution_count": 17,
- "id": "22587dd0-886d-475b-a101-f23816f396cb",
+ "id": "d04f9154-4842-439b-8a24-f0e084a2e31a",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'digest/operator_routes'"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "OPERATOR_ROUTE"
+ "common_shape_test2 = common_shape_test.loc[\n",
+ " common_shape_test.schedule_gtfs_dataset_key.isin(schd_keys)\n",
+ "]"
]
},
{
"cell_type": "code",
"execution_count": 18,
- "id": "2aac59b0-7cfb-4796-baf3-b99d5b5db14e",
+ "id": "c84d2bba-e499-4f4f-af7b-f77e2f2cf378",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'operator_profiles/operator_routes'"
+ "array(['7', '6', 'CC', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',\n",
+ " '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', 'SF',\n",
+ " '5', '4', '9', '1'], dtype=object)"
]
},
"execution_count": 18,
@@ -420,1189 +299,9566 @@
}
],
"source": [
- "GTFS_DATA_DICT.schedule_tables.operator_routes"
+ "common_shape_test2.route_id.unique()"
]
},
{
- "cell_type": "code",
- "execution_count": 19,
- "id": "d14199f0-63e5-466c-a122-51b2c2abaa75",
+ "cell_type": "markdown",
+ "id": "acd1679e-c5e0-4c75-aa0c-5ae2a5a5d5c5",
"metadata": {},
- "outputs": [],
"source": [
- "analysis_date = \"2024-11-13\""
+ "### Breakdown `gtfs_digest/merge_data.`"
]
},
{
- "cell_type": "code",
- "execution_count": 20,
- "id": "a31bc07a-7f16-4b32-8f1a-639914c1eeea",
+ "cell_type": "markdown",
+ "id": "1bdcce76-6b7a-4bc6-9953-1ca8cceaca13",
"metadata": {},
- "outputs": [],
"source": [
- "route_cols = [\"schedule_gtfs_dataset_key\", \"route_id\"]"
+ "#### Line 294: DONE making all the changes to the original files. `df_sched` is already missing a lot of the routes."
]
},
{
"cell_type": "code",
- "execution_count": 21,
- "id": "0a83573f-6fca-403b-a3ec-2b944efcfabd",
+ "execution_count": 19,
+ "id": "b164eae4-f657-49e3-ada1-e059362e4689",
"metadata": {},
"outputs": [],
"source": [
- "longest_shape_gdf = (\n",
- " gtfs_schedule_wrangling.longest_shape_by_route_direction(analysis_date)\n",
- " .sort_values(\n",
- " route_cols + [\"route_length\"], ascending=[True for i in route_cols] + [False]\n",
- " )\n",
- " .drop_duplicates(subset=route_cols)\n",
- " .reset_index(drop=True)\n",
- ")"
+ "# Get cardinal direction for each route\n",
+ "df_sched_og = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)"
]
},
{
"cell_type": "code",
- "execution_count": 22,
- "id": "7c57add2-f72c-4c22-9ca6-e5efe879cab3",
+ "execution_count": 20,
+ "id": "b5c8be82-af6c-4255-a2c5-487acdb30e52",
"metadata": {},
"outputs": [],
"source": [
- "schd_keys = list(operator_route_gdf2.schedule_gtfs_dataset_key.unique())"
+ "df_sched2_og = df_sched_og.loc[df_sched_og.schedule_gtfs_dataset_key.isin(schd_keys)]"
]
},
{
"cell_type": "code",
- "execution_count": 23,
- "id": "7a76026d-88e6-49a3-83f8-b20836b70d7a",
+ "execution_count": 21,
+ "id": "5d50ad8e-7536-4187-812d-2591d3589d15",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "['5a8721fe96786fcd25fba1f8a0ee6358',\n",
- " '73105f2d1cabc8170ab066d96863c5d5',\n",
- " 'f5a749dd65924e025b1293c58f95f8d6']"
+ "Shuttle 6\n",
+ "5 3\n",
+ "Name: route_id, dtype: int64"
]
},
- "execution_count": 23,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "schd_keys"
+ "df_sched2_og.route_id.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e9ece006-fc30-462c-b8f3-abdc3293075e",
+ "metadata": {},
+ "source": [
+ "##### Go back to `gtfs_funnel/schedule_stats_by_route_direction`\n",
+ "* https://github.com/cal-itp/data-analyses/blob/1ba0f544a01f99966a6e210dd11666b4fe4a146e/gtfs_funnel/schedule_stats_by_route_direction.py#L190\n",
+ "* **Filled in `direction_id` with 0**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "24845d09-e06a-4f89-a0e2-e2a40274708c",
+ "metadata": {},
+ "source": [
+ "##### `assemble_scheduled_trip_metrics`: no routes are missing, but `direction_id` is missing a lot of values."
]
},
{
"cell_type": "code",
- "execution_count": 24,
- "id": "3bcb40ca-7e6a-432e-a70c-e1817f7eebe9",
+ "execution_count": 22,
+ "id": "6472c0ff-a1d8-4882-91fd-cb15c0dd3c48",
"metadata": {},
"outputs": [],
"source": [
- "longest_shape_gdf2 = longest_shape_gdf.loc[\n",
- " longest_shape_gdf.schedule_gtfs_dataset_key.isin(schd_keys)\n",
- "]"
+ "trip_metrics = schedule_stats_by_route_direction.assemble_scheduled_trip_metrics(\n",
+ " one_analysis_date, GTFS_DATA_DICT\n",
+ ")"
]
},
{
"cell_type": "code",
- "execution_count": 25,
- "id": "86963a9f-3456-48d5-a386-05c211fe93f4",
+ "execution_count": 23,
+ "id": "da37674c-b332-456e-adcd-8af0fdf8fa94",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trip_metrics2 = trip_metrics.loc[trip_metrics.schedule_gtfs_dataset_key.isin(schd_keys)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "11ebad2e-53e0-4899-a22c-681d11bf54d4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',\n",
- " 'route_id', 'direction_id', 'route_key', 'route_length'],\n",
+ "Index(['schedule_gtfs_dataset_key', 'trip_instance_key', 'median_stop_meters',\n",
+ " 'time_of_day', 'scheduled_service_minutes', 'route_id', 'direction_id'],\n",
" dtype='object')"
]
},
- "execution_count": 25,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "longest_shape_gdf2.columns"
+ "trip_metrics2.columns"
]
},
{
"cell_type": "code",
- "execution_count": 26,
- "id": "b256ef9b-82c1-4832-ac54-19ca9319bdc4",
+ "execution_count": 25,
+ "id": "e3548c12-ecad-4196-afdc-b0539b6f6cd3",
"metadata": {},
"outputs": [
{
- "name": "stdout",
+ "name": "stderr",
"output_type": "stream",
"text": [
- "\n",
- "Int64Index: 20 entries, 1061 to 2588\n",
- "Data columns (total 8 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 shape_array_key 20 non-null object \n",
- " 1 geometry 20 non-null geometry\n",
- " 2 feed_key 20 non-null object \n",
- " 3 schedule_gtfs_dataset_key 20 non-null object \n",
- " 4 route_id 20 non-null object \n",
- " 5 direction_id 4 non-null float64 \n",
- " 6 route_key 20 non-null object \n",
- " 7 route_length 20 non-null float64 \n",
- "dtypes: float64(2), geometry(1), object(5)\n",
- "memory usage: 1.4+ KB\n"
+ "/tmp/ipykernel_2800/3236449391.py:1: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " trip_metrics2.direction_id = trip_metrics2.direction_id.fillna(0)\n"
]
}
],
"source": [
- "longest_shape_gdf2.info()"
+ "trip_metrics2.direction_id = trip_metrics2.direction_id.fillna(0)"
]
},
{
"cell_type": "code",
- "execution_count": 27,
- "id": "6db42351-2a52-4e00-a265-33e5743cdea2",
+ "execution_count": 26,
+ "id": "fbb27f9d-29d0-4997-b04d-c26e59a2154a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(335, 7)"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trip_metrics2.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "d9026014-1e37-4792-a4fb-3bba5dfa20fd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['PM Peak', 'Midday', 'AM Peak', 'Early AM', 'Evening'],\n",
+ " dtype=object)"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trip_metrics2.time_of_day.unique()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "be2084cc-6482-4aef-a43b-7508e7952d0e",
+ "metadata": {},
+ "source": [
+ "##### Each row is populated."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "7aeb5b42-0a9f-4b00-a1fa-d2a656640118",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " median_stop_meters | \n",
+ " time_of_day | \n",
+ " scheduled_service_minutes | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 49725 | \n",
+ " 405.04 | \n",
+ " Midday | \n",
+ " 35.00 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49729 | \n",
+ " 178.05 | \n",
+ " Midday | \n",
+ " 14.98 | \n",
+ " Mall | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49731 | \n",
+ " 451.15 | \n",
+ " Midday | \n",
+ " 41.00 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49736 | \n",
+ " 361.12 | \n",
+ " Midday | \n",
+ " 30.00 | \n",
+ " 1B | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49737 | \n",
+ " 357.22 | \n",
+ " Midday | \n",
+ " 30.37 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49738 | \n",
+ " 444.75 | \n",
+ " Midday | \n",
+ " 40.00 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49741 | \n",
+ " 440.62 | \n",
+ " Midday | \n",
+ " 41.00 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49742 | \n",
+ " 989.61 | \n",
+ " Midday | \n",
+ " 56.00 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49744 | \n",
+ " 437.51 | \n",
+ " Midday | \n",
+ " 42.00 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49754 | \n",
+ " 477.41 | \n",
+ " Midday | \n",
+ " 53.00 | \n",
+ " 13X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49756 | \n",
+ " 512.01 | \n",
+ " Midday | \n",
+ " 43.00 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49780 | \n",
+ " 589.78 | \n",
+ " Midday | \n",
+ " 36.00 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49781 | \n",
+ " 9208.29 | \n",
+ " Midday | \n",
+ " 120.00 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49784 | \n",
+ " 462.04 | \n",
+ " Midday | \n",
+ " 49.00 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49795 | \n",
+ " 1953.92 | \n",
+ " Midday | \n",
+ " 173.00 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49802 | \n",
+ " 619.55 | \n",
+ " Midday | \n",
+ " 38.00 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49804 | \n",
+ " 407.39 | \n",
+ " Midday | \n",
+ " 41.65 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 113284 | \n",
+ " 21434.28 | \n",
+ " Midday | \n",
+ " 85.00 | \n",
+ " Shuttle | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 113296 | \n",
+ " 10770.70 | \n",
+ " Midday | \n",
+ " 30.00 | \n",
+ " SF | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " 113303 | \n",
+ " 10770.70 | \n",
+ " Midday | \n",
+ " 30.00 | \n",
+ " SF | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 113305 | \n",
+ " 10988.09 | \n",
+ " Midday | \n",
+ " 191.00 | \n",
+ " CC | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 113323 | \n",
+ " 14985.48 | \n",
+ " Midday | \n",
+ " 117.00 | \n",
+ " CC | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " median_stop_meters time_of_day scheduled_service_minutes \\\n",
+ "49725 405.04 Midday 35.00 \n",
+ "49729 178.05 Midday 14.98 \n",
+ "49731 451.15 Midday 41.00 \n",
+ "49736 361.12 Midday 30.00 \n",
+ "49737 357.22 Midday 30.37 \n",
+ "49738 444.75 Midday 40.00 \n",
+ "49741 440.62 Midday 41.00 \n",
+ "49742 989.61 Midday 56.00 \n",
+ "49744 437.51 Midday 42.00 \n",
+ "49754 477.41 Midday 53.00 \n",
+ "49756 512.01 Midday 43.00 \n",
+ "49780 589.78 Midday 36.00 \n",
+ "49781 9208.29 Midday 120.00 \n",
+ "49784 462.04 Midday 49.00 \n",
+ "49795 1953.92 Midday 173.00 \n",
+ "49802 619.55 Midday 38.00 \n",
+ "49804 407.39 Midday 41.65 \n",
+ "113284 21434.28 Midday 85.00 \n",
+ "113296 10770.70 Midday 30.00 \n",
+ "113303 10770.70 Midday 30.00 \n",
+ "113305 10988.09 Midday 191.00 \n",
+ "113323 14985.48 Midday 117.00 \n",
+ "\n",
+ " route_id direction_id \n",
+ "49725 3 0.00 \n",
+ "49729 Mall 0.00 \n",
+ "49731 11 0.00 \n",
+ "49736 1B 0.00 \n",
+ "49737 1 0.00 \n",
+ "49738 9 0.00 \n",
+ "49741 4 0.00 \n",
+ "49742 12X 0.00 \n",
+ "49744 5 0.00 \n",
+ "49754 13X 0.00 \n",
+ "49756 8 0.00 \n",
+ "49780 7 0.00 \n",
+ "49781 20 0.00 \n",
+ "49784 2 0.00 \n",
+ "49795 30 0.00 \n",
+ "49802 6 0.00 \n",
+ "49804 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n",
+ "113284 Shuttle 0.00 \n",
+ "113296 SF 1.00 \n",
+ "113303 SF 0.00 \n",
+ "113305 CC 0.00 \n",
+ "113323 CC 1.00 "
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trip_metrics2.loc[trip_metrics2.time_of_day == \"Midday\"].drop_duplicates(\n",
+ " subset=[\"schedule_gtfs_dataset_key\", \"route_id\", \"direction_id\"]\n",
+ ").drop(columns=[\"schedule_gtfs_dataset_key\", \"trip_instance_key\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "571f4ec3-966d-4412-8d05-9f50ea7c159d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 49724 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49725 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49727 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49728 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49729 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " Mall | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49730 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49731 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49732 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49733 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49735 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49736 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1B | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49742 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49745 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49751 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49754 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 13X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49756 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 49799 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 113284 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " Shuttle | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 113285 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " SF | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " 113286 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " SF | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 113289 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " 113292 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 113307 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " Shuttle | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key \\\n",
+ "49724 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49725 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49727 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49728 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49729 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49730 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49731 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49732 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49733 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49735 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49736 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49742 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49745 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49751 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49754 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49756 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "49799 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "113284 f5a749dd65924e025b1293c58f95f8d6 \n",
+ "113285 f5a749dd65924e025b1293c58f95f8d6 \n",
+ "113286 f5a749dd65924e025b1293c58f95f8d6 \n",
+ "113289 f5a749dd65924e025b1293c58f95f8d6 \n",
+ "113292 f5a749dd65924e025b1293c58f95f8d6 \n",
+ "113307 f5a749dd65924e025b1293c58f95f8d6 \n",
+ "\n",
+ " route_id direction_id \n",
+ "49724 30 0.00 \n",
+ "49725 3 0.00 \n",
+ "49727 20 0.00 \n",
+ "49728 4 0.00 \n",
+ "49729 Mall 0.00 \n",
+ "49730 5 0.00 \n",
+ "49731 11 0.00 \n",
+ "49732 7 0.00 \n",
+ "49733 9 0.00 \n",
+ "49735 1 0.00 \n",
+ "49736 1B 0.00 \n",
+ "49742 12X 0.00 \n",
+ "49745 6 0.00 \n",
+ "49751 2 0.00 \n",
+ "49754 13X 0.00 \n",
+ "49756 8 0.00 \n",
+ "49799 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n",
+ "113284 Shuttle 0.00 \n",
+ "113285 SF 1.00 \n",
+ "113286 SF 0.00 \n",
+ "113289 CC 1.00 \n",
+ "113292 CC 0.00 \n",
+ "113307 Shuttle 1.00 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "preview(trip_metrics2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "2309939b-4531-4548-aa18-d85ea147880d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 335 entries, 49724 to 113340\n",
+ "Data columns (total 7 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 schedule_gtfs_dataset_key 335 non-null object \n",
+ " 1 trip_instance_key 335 non-null object \n",
+ " 2 median_stop_meters 335 non-null float64\n",
+ " 3 time_of_day 335 non-null object \n",
+ " 4 scheduled_service_minutes 335 non-null float64\n",
+ " 5 route_id 335 non-null object \n",
+ " 6 direction_id 335 non-null float64\n",
+ "dtypes: float64(3), object(4)\n",
+ "memory usage: 20.9+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "trip_metrics2.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "72613ac3-2ff9-4026-bf76-601b2b4ec0ca",
+ "metadata": {},
+ "source": [
+ "##### DONE: `gtfs_funnel/schedule_stats_by_route_direction/schedule_metrics_by_route_direction` \n",
+ "* **Updated the `groupby` to use `dropna=False` and filled missing `time_period` values with `peak_offpeak`**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "a8f01bbf-831d-4ca6-ac60-0851413d8df3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def schedule_metrics_by_route_direction(\n",
+ " df: pd.DataFrame,\n",
+ " analysis_date: str,\n",
+ " group_merge_cols: list,\n",
+ ") -> pd.DataFrame:\n",
+ " \"\"\"\n",
+ " Aggregate trip-level metrics to route-direction, and\n",
+ " attach shape geometry for common_shape_id.\n",
+ " \"\"\"\n",
+ " service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(\n",
+ " df, group_merge_cols, long_or_wide=\"long\"\n",
+ " )\n",
+ "\n",
+ " metrics_df = (\n",
+ " df.groupby(group_merge_cols, observed=True, group_keys=False, dropna=False)\n",
+ " .agg(\n",
+ " {\n",
+ " \"median_stop_meters\": \"mean\",\n",
+ " # take mean of the median stop spacing for trip\n",
+ " # does this make sense?\n",
+ " # median is the single boiled down metric at the trip-level\n",
+ " \"scheduled_service_minutes\": \"mean\",\n",
+ " }\n",
+ " )\n",
+ " .reset_index()\n",
+ " .rename(\n",
+ " columns={\n",
+ " \"median_stop_meters\": \"avg_stop_meters\",\n",
+ " \"scheduled_service_minutes\": \"avg_scheduled_service_minutes\",\n",
+ " }\n",
+ " )\n",
+ " )\n",
+ "\n",
+ " metrics_df = metrics_df.assign(\n",
+ " avg_stop_miles=metrics_df.avg_stop_meters.divide(METERS_PER_MILE).round(2)\n",
+ " ).drop(columns=[\"avg_stop_meters\"])\n",
+ "\n",
+ " round_me = [\"avg_stop_miles\", \"avg_scheduled_service_minutes\"]\n",
+ " metrics_df[round_me] = metrics_df[round_me].round(2)\n",
+ "\n",
+ " common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(\n",
+ " analysis_date\n",
+ " ).pipe(helpers.remove_shapes_outside_ca)\n",
+ "\n",
+ " df = pd.merge(common_shape, metrics_df, on=group_merge_cols, how=\"inner\").merge(\n",
+ " service_freq_df, on=group_merge_cols, how=\"inner\"\n",
+ " )\n",
+ "\n",
+ " df.time_period = df.time_period.fillna(df.peak_offpeak)\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "8391d0c3-f887-4aee-9088-f839186238a4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_group_merge_cols = [\"schedule_gtfs_dataset_key\", \"route_id\", \"direction_id\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "a389a761-d6dd-42b9-a8aa-2e91bad1bc1f",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "route_dir_metrics = schedule_metrics_by_route_direction(\n",
+ " trip_metrics2, one_analysis_date, route_group_merge_cols\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "e708fa79-b533-4d28-8c53-1d5509645c0f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " Mall | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 13X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " Shuttle | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " Shuttle | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 30 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 33 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 36 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 39 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1B | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 42 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 45 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 48 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 51 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 54 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id \\\n",
+ "0 73105f2d1cabc8170ab066d96863c5d5 7 \n",
+ "3 73105f2d1cabc8170ab066d96863c5d5 6 \n",
+ "6 73105f2d1cabc8170ab066d96863c5d5 8 \n",
+ "9 73105f2d1cabc8170ab066d96863c5d5 Mall \n",
+ "12 73105f2d1cabc8170ab066d96863c5d5 12X \n",
+ "15 73105f2d1cabc8170ab066d96863c5d5 13X \n",
+ "18 73105f2d1cabc8170ab066d96863c5d5 11 \n",
+ "21 73105f2d1cabc8170ab066d96863c5d5 30 \n",
+ "24 f5a749dd65924e025b1293c58f95f8d6 Shuttle \n",
+ "27 f5a749dd65924e025b1293c58f95f8d6 Shuttle \n",
+ "30 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n",
+ "33 73105f2d1cabc8170ab066d96863c5d5 2 \n",
+ "36 73105f2d1cabc8170ab066d96863c5d5 3 \n",
+ "39 73105f2d1cabc8170ab066d96863c5d5 1B \n",
+ "42 73105f2d1cabc8170ab066d96863c5d5 20 \n",
+ "45 73105f2d1cabc8170ab066d96863c5d5 5 \n",
+ "48 73105f2d1cabc8170ab066d96863c5d5 4 \n",
+ "51 73105f2d1cabc8170ab066d96863c5d5 9 \n",
+ "54 73105f2d1cabc8170ab066d96863c5d5 1 \n",
+ "\n",
+ " direction_id \n",
+ "0 0.00 \n",
+ "3 0.00 \n",
+ "6 0.00 \n",
+ "9 0.00 \n",
+ "12 0.00 \n",
+ "15 0.00 \n",
+ "18 0.00 \n",
+ "21 0.00 \n",
+ "24 1.00 \n",
+ "27 0.00 \n",
+ "30 0.00 \n",
+ "33 0.00 \n",
+ "36 0.00 \n",
+ "39 0.00 \n",
+ "42 0.00 \n",
+ "45 0.00 \n",
+ "48 0.00 \n",
+ "51 0.00 \n",
+ "54 0.00 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "preview(route_dir_metrics)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "3877741e-ba60-41d9-ada5-6f7dd02e9cf1",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " route_name | \n",
+ " avg_scheduled_service_minutes | \n",
+ " avg_stop_miles | \n",
+ " n_trips | \n",
+ " time_period | \n",
+ " peak_offpeak | \n",
+ " frequency | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ " Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd. | \n",
+ " 36.00 | \n",
+ " 0.37 | \n",
+ " 19 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.79 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ "##### Inside the `if __name__ == \"__main__\"` block of `gtfs_funnel/schedule_stats_by_route`"
+ " 0.00 | \n",
+ " Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd. | \n",
+ " 36.00 | \n",
+ " 0.37 | \n",
+ " 9 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.38 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ " Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd. | \n",
+ " 36.00 | \n",
+ " 0.37 | \n",
+ " 10 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.42 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ " Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound | \n",
+ " 38.00 | \n",
+ " 0.38 | \n",
+ " 18 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.75 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ " Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound | \n",
+ " 38.00 | \n",
+ " 0.38 | \n",
+ " 7 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.29 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ " Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound | \n",
+ " 38.00 | \n",
+ " 0.38 | \n",
+ " 11 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.46 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ " Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln. | \n",
+ " 43.00 | \n",
+ " 0.32 | \n",
+ " 16 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.67 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ " Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln. | \n",
+ " 43.00 | \n",
+ " 0.32 | \n",
+ " 8 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.33 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ " Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln. | \n",
+ " 43.00 | \n",
+ " 0.32 | \n",
+ " 8 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.33 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Mall | \n",
+ " 0.00 | \n",
+ " Mall Shuttle | \n",
+ " 14.98 | \n",
+ " 0.11 | \n",
+ " 28 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 1.17 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Mall | \n",
+ " 0.00 | \n",
+ " Mall Shuttle | \n",
+ " 14.98 | \n",
+ " 0.11 | \n",
+ " 14 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.58 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Mall | \n",
+ " 0.00 | \n",
+ " Mall Shuttle | \n",
+ " 14.98 | \n",
+ " 0.11 | \n",
+ " 14 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.58 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ " 12X Broadway/Orcutt Express | \n",
+ " 56.00 | \n",
+ " 0.61 | \n",
+ " 11 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.46 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ " 12X Broadway/Orcutt Express | \n",
+ " 56.00 | \n",
+ " 0.61 | \n",
+ " 6 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.25 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ " 12X Broadway/Orcutt Express | \n",
+ " 56.00 | \n",
+ " 0.61 | \n",
+ " 5 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.21 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 13X | \n",
+ " 0.00 | \n",
+ " 13X Transit Center/PVHS/N. Broadway | \n",
+ " 50.82 | \n",
+ " 0.29 | \n",
+ " 11 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.46 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 13X | \n",
+ " 0.00 | \n",
+ " 13X Transit Center/PVHS/N. Broadway | \n",
+ " 50.82 | \n",
+ " 0.29 | \n",
+ " 6 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.25 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 13X | \n",
+ " 0.00 | \n",
+ " 13X Transit Center/PVHS/N. Broadway | \n",
+ " 50.82 | \n",
+ " 0.29 | \n",
+ " 5 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.21 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ " R11. Transit Center to Gov't Center via S. Broadway | \n",
+ " 41.00 | \n",
+ " 0.28 | \n",
+ " 22 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.92 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ " R11. Transit Center to Gov't Center via S. Broadway | \n",
+ " 41.00 | \n",
+ " 0.28 | \n",
+ " 10 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.42 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ " R11. Transit Center to Gov't Center via S. Broadway | \n",
+ " 41.00 | \n",
+ " 0.28 | \n",
+ " 12 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.50 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ " Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc | \n",
+ " 166.33 | \n",
+ " 1.22 | \n",
+ " 9 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.38 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ " Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc | \n",
+ " 166.33 | \n",
+ " 1.22 | \n",
+ " 5 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.21 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ " Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc | \n",
+ " 166.33 | \n",
+ " 1.22 | \n",
+ " 4 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.17 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " Shuttle | \n",
+ " 1.00 | \n",
+ " Shuttle to Auburn | \n",
+ " 72.00 | \n",
+ " 13.74 | \n",
+ " 5 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.21 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " Shuttle | \n",
+ " 1.00 | \n",
+ " Shuttle to Auburn | \n",
+ " 72.00 | \n",
+ " 13.74 | \n",
+ " 2 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.08 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " Shuttle | \n",
+ " 1.00 | \n",
+ " Shuttle to Auburn | \n",
+ " 72.00 | \n",
+ " 13.74 | \n",
+ " 3 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.12 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " Shuttle | \n",
+ " 0.00 | \n",
+ " Shuttle to Auburn | \n",
+ " 70.00 | \n",
+ " 11.78 | \n",
+ " 5 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.21 | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " Shuttle | \n",
+ " 0.00 | \n",
+ " Shuttle to Auburn | \n",
+ " 70.00 | \n",
+ " 11.78 | \n",
+ " 3 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.12 | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " Shuttle | \n",
+ " 0.00 | \n",
+ " Shuttle to Auburn | \n",
+ " 70.00 | \n",
+ " 11.78 | \n",
+ " 2 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.08 | \n",
+ "
\n",
+ " \n",
+ " 30 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ " Rt 11. Transit Center to Gov't Center via S. Broadway | \n",
+ " 41.65 | \n",
+ " 0.25 | \n",
+ " 18 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.75 | \n",
+ "
\n",
+ " \n",
+ " 31 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ " Rt 11. Transit Center to Gov't Center via S. Broadway | \n",
+ " 41.65 | \n",
+ " 0.25 | \n",
+ " 8 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.33 | \n",
+ "
\n",
+ " \n",
+ " 32 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ " Rt 11. Transit Center to Gov't Center via S. Broadway | \n",
+ " 41.65 | \n",
+ " 0.25 | \n",
+ " 10 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.42 | \n",
+ "
\n",
+ " \n",
+ " 33 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ " Rt 2. Transit Center to PVH School via Western., Donovan Rd | \n",
+ " 53.24 | \n",
+ " 0.29 | \n",
+ " 17 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.71 | \n",
+ "
\n",
+ " \n",
+ " 34 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ " Rt 2. Transit Center to PVH School via Western., Donovan Rd | \n",
+ " 53.24 | \n",
+ " 0.29 | \n",
+ " 6 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.25 | \n",
+ "
\n",
+ " \n",
+ " 35 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ " Rt 2. Transit Center to PVH School via Western., Donovan Rd | \n",
+ " 53.24 | \n",
+ " 0.29 | \n",
+ " 11 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.46 | \n",
+ "
\n",
+ " \n",
+ " 36 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ " Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln. | \n",
+ " 35.11 | \n",
+ " 0.26 | \n",
+ " 18 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.75 | \n",
+ "
\n",
+ " \n",
+ " 37 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ " Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln. | \n",
+ " 35.11 | \n",
+ " 0.26 | \n",
+ " 8 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.33 | \n",
+ "
\n",
+ " \n",
+ " 38 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ " Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln. | \n",
+ " 35.11 | \n",
+ " 0.26 | \n",
+ " 10 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.42 | \n",
+ "
\n",
+ " \n",
+ " 39 | \n",
+ " 1B | \n",
+ " 0.00 | \n",
+ " Rt 1. Transit Ctr to Preisker Park Via N. Broadway | \n",
+ " 30.00 | \n",
+ " 0.22 | \n",
+ " 12 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.50 | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " 1B | \n",
+ " 0.00 | \n",
+ " Rt 1. Transit Ctr to Preisker Park Via N. Broadway | \n",
+ " 30.00 | \n",
+ " 0.22 | \n",
+ " 5 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.21 | \n",
+ "
\n",
+ " \n",
+ " 41 | \n",
+ " 1B | \n",
+ " 0.00 | \n",
+ " Rt 1. Transit Ctr to Preisker Park Via N. Broadway | \n",
+ " 30.00 | \n",
+ " 0.22 | \n",
+ " 7 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.29 | \n",
+ "
\n",
+ " \n",
+ " 42 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ " Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB | \n",
+ " 119.17 | \n",
+ " 5.72 | \n",
+ " 6 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.25 | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ " Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB | \n",
+ " 119.17 | \n",
+ " 5.72 | \n",
+ " 3 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.12 | \n",
+ "
\n",
+ " \n",
+ " 44 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ " Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB | \n",
+ " 119.17 | \n",
+ " 5.72 | \n",
+ " 3 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.12 | \n",
+ "
\n",
+ " \n",
+ " 45 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ " Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way | \n",
+ " 42.00 | \n",
+ " 0.27 | \n",
+ " 18 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.75 | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ " Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way | \n",
+ " 42.00 | \n",
+ " 0.27 | \n",
+ " 8 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.33 | \n",
+ "
\n",
+ " \n",
+ " 47 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ " Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way | \n",
+ " 42.00 | \n",
+ " 0.27 | \n",
+ " 10 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.42 | \n",
+ "
\n",
+ " \n",
+ " 48 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ " Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd. | \n",
+ " 41.06 | \n",
+ " 0.27 | \n",
+ " 18 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.75 | \n",
+ "
\n",
+ " \n",
+ " 49 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ " Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd. | \n",
+ " 41.06 | \n",
+ " 0.27 | \n",
+ " 8 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.33 | \n",
+ "
\n",
+ " \n",
+ " 50 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ " Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd. | \n",
+ " 41.06 | \n",
+ " 0.27 | \n",
+ " 10 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.42 | \n",
+ "
\n",
+ " \n",
+ " 51 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ " Rt 9. Transit Center to PVH via Alvin Ave. | \n",
+ " 40.00 | \n",
+ " 0.28 | \n",
+ " 18 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.75 | \n",
+ "
\n",
+ " \n",
+ " 52 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ " Rt 9. Transit Center to PVH via Alvin Ave. | \n",
+ " 40.00 | \n",
+ " 0.28 | \n",
+ " 8 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.33 | \n",
+ "
\n",
+ " \n",
+ " 53 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ " Rt 9. Transit Center to PVH via Alvin Ave. | \n",
+ " 40.00 | \n",
+ " 0.28 | \n",
+ " 10 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.42 | \n",
+ "
\n",
+ " \n",
+ " 54 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " Rt 1. Transit Ctr to Preisker Park Via N. Broadway | \n",
+ " 30.37 | \n",
+ " 0.22 | \n",
+ " 19 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ " 0.79 | \n",
+ "
\n",
+ " \n",
+ " 55 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " Rt 1. Transit Ctr to Preisker Park Via N. Broadway | \n",
+ " 30.37 | \n",
+ " 0.22 | \n",
+ " 9 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ " 0.38 | \n",
+ "
\n",
+ " \n",
+ " 56 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " Rt 1. Transit Ctr to Preisker Park Via N. Broadway | \n",
+ " 30.37 | \n",
+ " 0.22 | \n",
+ " 10 | \n",
+ " peak | \n",
+ " peak | \n",
+ " 0.42 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " route_id direction_id \\\n",
+ "0 7 0.00 \n",
+ "1 7 0.00 \n",
+ "2 7 0.00 \n",
+ "3 6 0.00 \n",
+ "4 6 0.00 \n",
+ "5 6 0.00 \n",
+ "6 8 0.00 \n",
+ "7 8 0.00 \n",
+ "8 8 0.00 \n",
+ "9 Mall 0.00 \n",
+ "10 Mall 0.00 \n",
+ "11 Mall 0.00 \n",
+ "12 12X 0.00 \n",
+ "13 12X 0.00 \n",
+ "14 12X 0.00 \n",
+ "15 13X 0.00 \n",
+ "16 13X 0.00 \n",
+ "17 13X 0.00 \n",
+ "18 11 0.00 \n",
+ "19 11 0.00 \n",
+ "20 11 0.00 \n",
+ "21 30 0.00 \n",
+ "22 30 0.00 \n",
+ "23 30 0.00 \n",
+ "24 Shuttle 1.00 \n",
+ "25 Shuttle 1.00 \n",
+ "26 Shuttle 1.00 \n",
+ "27 Shuttle 0.00 \n",
+ "28 Shuttle 0.00 \n",
+ "29 Shuttle 0.00 \n",
+ "30 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n",
+ "31 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n",
+ "32 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n",
+ "33 2 0.00 \n",
+ "34 2 0.00 \n",
+ "35 2 0.00 \n",
+ "36 3 0.00 \n",
+ "37 3 0.00 \n",
+ "38 3 0.00 \n",
+ "39 1B 0.00 \n",
+ "40 1B 0.00 \n",
+ "41 1B 0.00 \n",
+ "42 20 0.00 \n",
+ "43 20 0.00 \n",
+ "44 20 0.00 \n",
+ "45 5 0.00 \n",
+ "46 5 0.00 \n",
+ "47 5 0.00 \n",
+ "48 4 0.00 \n",
+ "49 4 0.00 \n",
+ "50 4 0.00 \n",
+ "51 9 0.00 \n",
+ "52 9 0.00 \n",
+ "53 9 0.00 \n",
+ "54 1 0.00 \n",
+ "55 1 0.00 \n",
+ "56 1 0.00 \n",
+ "\n",
+ " route_name \\\n",
+ "0 Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd. \n",
+ "1 Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd. \n",
+ "2 Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd. \n",
+ "3 Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound \n",
+ "4 Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound \n",
+ "5 Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound \n",
+ "6 Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln. \n",
+ "7 Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln. \n",
+ "8 Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln. \n",
+ "9 Mall Shuttle \n",
+ "10 Mall Shuttle \n",
+ "11 Mall Shuttle \n",
+ "12 12X Broadway/Orcutt Express \n",
+ "13 12X Broadway/Orcutt Express \n",
+ "14 12X Broadway/Orcutt Express \n",
+ "15 13X Transit Center/PVHS/N. Broadway \n",
+ "16 13X Transit Center/PVHS/N. Broadway \n",
+ "17 13X Transit Center/PVHS/N. Broadway \n",
+ "18 R11. Transit Center to Gov't Center via S. Broadway \n",
+ "19 R11. Transit Center to Gov't Center via S. Broadway \n",
+ "20 R11. Transit Center to Gov't Center via S. Broadway \n",
+ "21 Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc \n",
+ "22 Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc \n",
+ "23 Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc \n",
+ "24 Shuttle to Auburn \n",
+ "25 Shuttle to Auburn \n",
+ "26 Shuttle to Auburn \n",
+ "27 Shuttle to Auburn \n",
+ "28 Shuttle to Auburn \n",
+ "29 Shuttle to Auburn \n",
+ "30 Rt 11. Transit Center to Gov't Center via S. Broadway \n",
+ "31 Rt 11. Transit Center to Gov't Center via S. Broadway \n",
+ "32 Rt 11. Transit Center to Gov't Center via S. Broadway \n",
+ "33 Rt 2. Transit Center to PVH School via Western., Donovan Rd \n",
+ "34 Rt 2. Transit Center to PVH School via Western., Donovan Rd \n",
+ "35 Rt 2. Transit Center to PVH School via Western., Donovan Rd \n",
+ "36 Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln. \n",
+ "37 Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln. \n",
+ "38 Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln. \n",
+ "39 Rt 1. Transit Ctr to Preisker Park Via N. Broadway \n",
+ "40 Rt 1. Transit Ctr to Preisker Park Via N. Broadway \n",
+ "41 Rt 1. Transit Ctr to Preisker Park Via N. Broadway \n",
+ "42 Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB \n",
+ "43 Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB \n",
+ "44 Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB \n",
+ "45 Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way \n",
+ "46 Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way \n",
+ "47 Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way \n",
+ "48 Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd. \n",
+ "49 Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd. \n",
+ "50 Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd. \n",
+ "51 Rt 9. Transit Center to PVH via Alvin Ave. \n",
+ "52 Rt 9. Transit Center to PVH via Alvin Ave. \n",
+ "53 Rt 9. Transit Center to PVH via Alvin Ave. \n",
+ "54 Rt 1. Transit Ctr to Preisker Park Via N. Broadway \n",
+ "55 Rt 1. Transit Ctr to Preisker Park Via N. Broadway \n",
+ "56 Rt 1. Transit Ctr to Preisker Park Via N. Broadway \n",
+ "\n",
+ " avg_scheduled_service_minutes avg_stop_miles n_trips time_period \\\n",
+ "0 36.00 0.37 19 all_day \n",
+ "1 36.00 0.37 9 offpeak \n",
+ "2 36.00 0.37 10 peak \n",
+ "3 38.00 0.38 18 all_day \n",
+ "4 38.00 0.38 7 offpeak \n",
+ "5 38.00 0.38 11 peak \n",
+ "6 43.00 0.32 16 all_day \n",
+ "7 43.00 0.32 8 offpeak \n",
+ "8 43.00 0.32 8 peak \n",
+ "9 14.98 0.11 28 all_day \n",
+ "10 14.98 0.11 14 offpeak \n",
+ "11 14.98 0.11 14 peak \n",
+ "12 56.00 0.61 11 all_day \n",
+ "13 56.00 0.61 6 offpeak \n",
+ "14 56.00 0.61 5 peak \n",
+ "15 50.82 0.29 11 all_day \n",
+ "16 50.82 0.29 6 offpeak \n",
+ "17 50.82 0.29 5 peak \n",
+ "18 41.00 0.28 22 all_day \n",
+ "19 41.00 0.28 10 offpeak \n",
+ "20 41.00 0.28 12 peak \n",
+ "21 166.33 1.22 9 all_day \n",
+ "22 166.33 1.22 5 offpeak \n",
+ "23 166.33 1.22 4 peak \n",
+ "24 72.00 13.74 5 all_day \n",
+ "25 72.00 13.74 2 offpeak \n",
+ "26 72.00 13.74 3 peak \n",
+ "27 70.00 11.78 5 all_day \n",
+ "28 70.00 11.78 3 offpeak \n",
+ "29 70.00 11.78 2 peak \n",
+ "30 41.65 0.25 18 all_day \n",
+ "31 41.65 0.25 8 offpeak \n",
+ "32 41.65 0.25 10 peak \n",
+ "33 53.24 0.29 17 all_day \n",
+ "34 53.24 0.29 6 offpeak \n",
+ "35 53.24 0.29 11 peak \n",
+ "36 35.11 0.26 18 all_day \n",
+ "37 35.11 0.26 8 offpeak \n",
+ "38 35.11 0.26 10 peak \n",
+ "39 30.00 0.22 12 all_day \n",
+ "40 30.00 0.22 5 offpeak \n",
+ "41 30.00 0.22 7 peak \n",
+ "42 119.17 5.72 6 all_day \n",
+ "43 119.17 5.72 3 offpeak \n",
+ "44 119.17 5.72 3 peak \n",
+ "45 42.00 0.27 18 all_day \n",
+ "46 42.00 0.27 8 offpeak \n",
+ "47 42.00 0.27 10 peak \n",
+ "48 41.06 0.27 18 all_day \n",
+ "49 41.06 0.27 8 offpeak \n",
+ "50 41.06 0.27 10 peak \n",
+ "51 40.00 0.28 18 all_day \n",
+ "52 40.00 0.28 8 offpeak \n",
+ "53 40.00 0.28 10 peak \n",
+ "54 30.37 0.22 19 all_day \n",
+ "55 30.37 0.22 9 offpeak \n",
+ "56 30.37 0.22 10 peak \n",
+ "\n",
+ " peak_offpeak frequency \n",
+ "0 NaN 0.79 \n",
+ "1 offpeak 0.38 \n",
+ "2 peak 0.42 \n",
+ "3 NaN 0.75 \n",
+ "4 offpeak 0.29 \n",
+ "5 peak 0.46 \n",
+ "6 NaN 0.67 \n",
+ "7 offpeak 0.33 \n",
+ "8 peak 0.33 \n",
+ "9 NaN 1.17 \n",
+ "10 offpeak 0.58 \n",
+ "11 peak 0.58 \n",
+ "12 NaN 0.46 \n",
+ "13 offpeak 0.25 \n",
+ "14 peak 0.21 \n",
+ "15 NaN 0.46 \n",
+ "16 offpeak 0.25 \n",
+ "17 peak 0.21 \n",
+ "18 NaN 0.92 \n",
+ "19 offpeak 0.42 \n",
+ "20 peak 0.50 \n",
+ "21 NaN 0.38 \n",
+ "22 offpeak 0.21 \n",
+ "23 peak 0.17 \n",
+ "24 NaN 0.21 \n",
+ "25 offpeak 0.08 \n",
+ "26 peak 0.12 \n",
+ "27 NaN 0.21 \n",
+ "28 offpeak 0.12 \n",
+ "29 peak 0.08 \n",
+ "30 NaN 0.75 \n",
+ "31 offpeak 0.33 \n",
+ "32 peak 0.42 \n",
+ "33 NaN 0.71 \n",
+ "34 offpeak 0.25 \n",
+ "35 peak 0.46 \n",
+ "36 NaN 0.75 \n",
+ "37 offpeak 0.33 \n",
+ "38 peak 0.42 \n",
+ "39 NaN 0.50 \n",
+ "40 offpeak 0.21 \n",
+ "41 peak 0.29 \n",
+ "42 NaN 0.25 \n",
+ "43 offpeak 0.12 \n",
+ "44 peak 0.12 \n",
+ "45 NaN 0.75 \n",
+ "46 offpeak 0.33 \n",
+ "47 peak 0.42 \n",
+ "48 NaN 0.75 \n",
+ "49 offpeak 0.33 \n",
+ "50 peak 0.42 \n",
+ "51 NaN 0.75 \n",
+ "52 offpeak 0.33 \n",
+ "53 peak 0.42 \n",
+ "54 NaN 0.79 \n",
+ "55 offpeak 0.38 \n",
+ "56 peak 0.42 "
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "route_dir_metrics.drop(\n",
+ " columns=[\n",
+ " \"geometry\",\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"common_shape_id\",\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "689e66e2-bf0f-4f18-b8b6-793805692d9d",
+ "metadata": {},
+ "source": [
+ "##### Still in `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling` \n",
+ "* **Updated `dropna=False` in `groupby`**\n",
+ "* **Filled in `time_period` with `peak_offpeak`**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "10bb80e6-042a-42a7-8fbd-653fff05f674",
+ "metadata": {},
+ "source": [
+ "##### In `if __name__ == \"__main__\"` in `gtfs_funnel/schedule_stats_by_route`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "b906d052-a2a2-4ebc-9884-6ed9f4965487",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ROUTE_TYPOLOGIES = GTFS_DATA_DICT.schedule_tables.route_typologies\n",
+ "route_typologies = pd.read_parquet(\n",
+ " f\"{SCHED_GCS}{ROUTE_TYPOLOGIES}_{one_analysis_date}.parquet\",\n",
+ " columns=route_group_merge_cols\n",
+ " + [\n",
+ " \"is_coverage\",\n",
+ " \"is_downtown_local\",\n",
+ " \"is_local\",\n",
+ " \"is_rapid\",\n",
+ " \"is_express\",\n",
+ " \"is_rail\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "1a7000bf-9693-4f14-b1d1-244b1fe5a18d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " is_coverage | \n",
+ " is_downtown_local | \n",
+ " is_local | \n",
+ " is_rapid | \n",
+ " is_express | \n",
+ " is_rail | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1416 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " Shuttle | \n",
+ " 1.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3572 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 13X | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3573 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3574 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3575 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3576 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3577 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3578 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1B | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3579 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3580 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3581 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3582 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3583 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3584 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3585 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " Mall | \n",
+ " 0.00 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3586 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3587 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3588 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id \\\n",
+ "1416 f5a749dd65924e025b1293c58f95f8d6 Shuttle \n",
+ "3572 73105f2d1cabc8170ab066d96863c5d5 13X \n",
+ "3573 73105f2d1cabc8170ab066d96863c5d5 20 \n",
+ "3574 73105f2d1cabc8170ab066d96863c5d5 12X \n",
+ "3575 73105f2d1cabc8170ab066d96863c5d5 30 \n",
+ "3576 73105f2d1cabc8170ab066d96863c5d5 2 \n",
+ "3577 73105f2d1cabc8170ab066d96863c5d5 1 \n",
+ "3578 73105f2d1cabc8170ab066d96863c5d5 1B \n",
+ "3579 73105f2d1cabc8170ab066d96863c5d5 4 \n",
+ "3580 73105f2d1cabc8170ab066d96863c5d5 7 \n",
+ "3581 73105f2d1cabc8170ab066d96863c5d5 8 \n",
+ "3582 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n",
+ "3583 73105f2d1cabc8170ab066d96863c5d5 9 \n",
+ "3584 73105f2d1cabc8170ab066d96863c5d5 6 \n",
+ "3585 73105f2d1cabc8170ab066d96863c5d5 Mall \n",
+ "3586 73105f2d1cabc8170ab066d96863c5d5 3 \n",
+ "3587 73105f2d1cabc8170ab066d96863c5d5 5 \n",
+ "3588 73105f2d1cabc8170ab066d96863c5d5 11 \n",
+ "\n",
+ " direction_id is_coverage is_downtown_local is_local is_rapid \\\n",
+ "1416 1.00 1 0 0 0 \n",
+ "3572 0.00 1 0 0 1 \n",
+ "3573 0.00 1 0 0 0 \n",
+ "3574 0.00 1 0 0 1 \n",
+ "3575 0.00 1 0 0 1 \n",
+ "3576 0.00 0 1 0 1 \n",
+ "3577 0.00 1 0 0 1 \n",
+ "3578 0.00 1 0 0 1 \n",
+ "3579 0.00 1 0 0 1 \n",
+ "3580 0.00 1 0 0 1 \n",
+ "3581 0.00 1 0 0 1 \n",
+ "3582 0.00 1 0 0 1 \n",
+ "3583 0.00 0 0 1 1 \n",
+ "3584 0.00 1 0 0 1 \n",
+ "3585 0.00 0 0 1 1 \n",
+ "3586 0.00 1 0 0 1 \n",
+ "3587 0.00 1 0 0 1 \n",
+ "3588 0.00 1 0 0 1 \n",
+ "\n",
+ " is_express is_rail \n",
+ "1416 0 0 \n",
+ "3572 0 0 \n",
+ "3573 0 0 \n",
+ "3574 1 0 \n",
+ "3575 0 0 \n",
+ "3576 0 0 \n",
+ "3577 0 0 \n",
+ "3578 0 0 \n",
+ "3579 0 0 \n",
+ "3580 0 0 \n",
+ "3581 0 0 \n",
+ "3582 0 0 \n",
+ "3583 0 0 \n",
+ "3584 0 0 \n",
+ "3585 0 0 \n",
+ "3586 0 0 \n",
+ "3587 0 0 \n",
+ "3588 0 0 "
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "route_typologies.loc[route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "96a8aae2-ad7a-4183-9476-d1e7f7acfadc",
+ "metadata": {},
+ "source": [
+ "##### `cardinal_direction_for_route_direction` also drops a lot of routes -> fix this"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "59453366-e8a1-4758-813f-7feeedba18ed",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "STOP_TIMES_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "872dda1a-c190-4dcf-8513-e0c2b52aee8c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stop_times_df = pd.read_parquet(\n",
+ " f\"{RT_SCHED_GCS}{STOP_TIMES_FILE}_{one_analysis_date}.parquet\",\n",
+ " filters=[[(\"stop_primary_direction\", \"!=\", \"Unknown\")]],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "80428679-11b3-4c9a-9a69-6fae3e376580",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stop_times_df2 = stop_times_df.loc[\n",
+ " stop_times_df.schedule_gtfs_dataset_key.isin(schd_keys)\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "47cea347-de35-49cd-b5a7-8bdbe0c87a06",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trip_scheduled_col = [\n",
+ " \"route_id\",\n",
+ " \"trip_instance_key\",\n",
+ " \"gtfs_dataset_key\",\n",
+ " \"shape_array_key\",\n",
+ " \"direction_id\",\n",
+ " \"route_long_name\",\n",
+ " \"route_short_name\",\n",
+ " \"route_desc\",\n",
+ " \"name\",\n",
+ "]\n",
+ "\n",
+ "trips_df = helpers.import_scheduled_trips(\n",
+ " one_analysis_date, columns=trip_scheduled_col, get_pandas=True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "b5aa4d89-791c-4a51-b538-27739a966e90",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merge_cols = [\"trip_instance_key\", \"schedule_gtfs_dataset_key\", \"shape_array_key\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "9de81f13-d613-4c01-8e73-fecb92d957f6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stop_times_with_trip = pd.merge(stop_times_df2, trips_df, on=merge_cols)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "38a3b647-05af-4ffb-a48e-abd60904dbf6",
+ "metadata": {},
+ "source": [
+ "##### Fill in `direction_id`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "d4c73c2f-11e9-43b4-a6e2-25ff80bd3bbf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stop_times_with_trip.direction_id = stop_times_with_trip.direction_id.fillna(0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "a05fd233-fb33-4fc7-821f-1fe8ea14c049",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "main_cols = [\"route_id\", \"schedule_gtfs_dataset_key\", \"direction_id\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "eb9a923e-ecb9-4163-ac7d-946ae8eab9c1",
+ "metadata": {},
+ "source": [
+ "##### Done: changed to `dropna=False` here too"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "6a79cf47-731a-470b-8639-59d3ceef9d2c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agg1 = (\n",
+ " stop_times_with_trip.groupby(main_cols + [\"stop_primary_direction\"], dropna=False)\n",
+ " .agg({\"stop_sequence\": \"count\"})\n",
+ " .reset_index()\n",
+ " .rename(columns={\"stop_sequence\": \"total_stops\"})\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "2d88f775-4ca9-4889-a03f-9e8a2518dbfd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agg2 = agg1.sort_values(\n",
+ " by=main_cols + [\"total_stops\"],\n",
+ " ascending=[True, True, True, False],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "625d2776-e1e6-466f-b38b-c202460fdd14",
+ "metadata": {},
+ "source": [
+ "##### There are values for `route_primary_direction`, but they seem to get dropped when `direction_id` is missing? \n",
+ "* AH: testing whether filling in missing `direction_id` values (here with 0) changes the result."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "cd8096f3-35e9-4b45-8a28-13f66bb61991",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cardinal_dir_df = (\n",
+ " agg2.drop_duplicates(subset=main_cols)\n",
+ " .reset_index(drop=True)\n",
+ " .drop(columns=[\"total_stops\"])\n",
+ " .rename(columns={\"stop_primary_direction\": \"route_primary_direction\"})\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "52f702c5-b7eb-4fbe-a92b-e4eaadf19e80",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " route_id | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " direction_id | \n",
+ " route_primary_direction | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Northbound | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 11 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Northbound | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 12X | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Northbound | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 13X | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Westbound | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1B | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Northbound | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 2 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Westbound | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 20 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Eastbound | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 3 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Eastbound | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 30 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Southbound | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 4 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Southbound | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 5 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Northbound | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 6 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Northbound | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 7 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Southbound | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 8 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Eastbound | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Northbound | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 9 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Westbound | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " CC | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " 0.00 | \n",
+ " Northbound | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " CC | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " 1.00 | \n",
+ " Southbound | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Mall | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 0.00 | \n",
+ " Eastbound | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " SF | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " 0.00 | \n",
+ " Eastbound | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " SF | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " 1.00 | \n",
+ " Westbound | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " Shuttle | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " 0.00 | \n",
+ " Eastbound | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " Shuttle | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " 1.00 | \n",
+ " Westbound | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " route_id schedule_gtfs_dataset_key \\\n",
+ "0 1 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "1 11 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2 12X 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "3 13X 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "4 1B 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "5 2 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "6 20 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "7 3 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "8 30 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "9 4 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "10 5 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "11 6 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "12 7 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "13 8 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "14 8a7c42f9-51e4-4848-bf88-30c210f149ad 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "15 9 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "16 CC f5a749dd65924e025b1293c58f95f8d6 \n",
+ "17 CC f5a749dd65924e025b1293c58f95f8d6 \n",
+ "18 Mall 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "19 SF f5a749dd65924e025b1293c58f95f8d6 \n",
+ "20 SF f5a749dd65924e025b1293c58f95f8d6 \n",
+ "21 Shuttle f5a749dd65924e025b1293c58f95f8d6 \n",
+ "22 Shuttle f5a749dd65924e025b1293c58f95f8d6 \n",
+ "\n",
+ " direction_id route_primary_direction \n",
+ "0 0.00 Northbound \n",
+ "1 0.00 Northbound \n",
+ "2 0.00 Northbound \n",
+ "3 0.00 Westbound \n",
+ "4 0.00 Northbound \n",
+ "5 0.00 Westbound \n",
+ "6 0.00 Eastbound \n",
+ "7 0.00 Eastbound \n",
+ "8 0.00 Southbound \n",
+ "9 0.00 Southbound \n",
+ "10 0.00 Northbound \n",
+ "11 0.00 Northbound \n",
+ "12 0.00 Southbound \n",
+ "13 0.00 Eastbound \n",
+ "14 0.00 Northbound \n",
+ "15 0.00 Westbound \n",
+ "16 0.00 Northbound \n",
+ "17 1.00 Southbound \n",
+ "18 0.00 Eastbound \n",
+ "19 0.00 Eastbound \n",
+ "20 1.00 Westbound \n",
+ "21 0.00 Eastbound \n",
+ "22 1.00 Westbound "
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cardinal_dir_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b16e169e-fb71-49de-bd17-9769e96a83ce",
+ "metadata": {},
+ "source": [
+ "##### Returning to the `if __name__ == \"__main__\"` portion of `gtfs_funnel/schedule_stats_by_route`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "37ad64d9-1a55-4925-90b6-c5a6425915f5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " is_coverage | \n",
+ " is_downtown_local | \n",
+ " is_local | \n",
+ " is_rapid | \n",
+ " is_express | \n",
+ " is_rail | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1770249a5a2e770ca90628434d4934b1 | \n",
+ " 3407 | \n",
+ " 0.00 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id direction_id is_coverage \\\n",
+ "0 1770249a5a2e770ca90628434d4934b1 3407 0.00 1 \n",
+ "\n",
+ " is_downtown_local is_local is_rapid is_express is_rail \n",
+ "0 0 0 1 0 0 "
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "route_typologies.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "d6f63dfb-9b18-41be-94a3-acdba99191f9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['schedule_gtfs_dataset_key', 'route_id', 'direction_id']"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "route_group_merge_cols"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "f6365ee6-6960-4988-a35e-fa8e787e1d3f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " Mall | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 13X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " Shuttle | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " Shuttle | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 30 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 33 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 36 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 39 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1B | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 42 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 20 | \n",
+ "#### DONE: in `gtfs_digest/merge_data` (line 300), `df_avg_speeds` is also missing a lot of routes.\n",
+ "
\n",
+ " \n",
+ " 45 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 48 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 51 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 54 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id \\\n",
+ "0 73105f2d1cabc8170ab066d96863c5d5 7 \n",
+ "3 73105f2d1cabc8170ab066d96863c5d5 6 \n",
+ "6 73105f2d1cabc8170ab066d96863c5d5 8 \n",
+ "9 73105f2d1cabc8170ab066d96863c5d5 Mall \n",
+ "12 73105f2d1cabc8170ab066d96863c5d5 12X \n",
+ "15 73105f2d1cabc8170ab066d96863c5d5 13X \n",
+ "18 73105f2d1cabc8170ab066d96863c5d5 11 \n",
+ "21 73105f2d1cabc8170ab066d96863c5d5 30 \n",
+ "24 f5a749dd65924e025b1293c58f95f8d6 Shuttle \n",
+ "27 f5a749dd65924e025b1293c58f95f8d6 Shuttle \n",
+ "30 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n",
+ "33 73105f2d1cabc8170ab066d96863c5d5 2 \n",
+ "36 73105f2d1cabc8170ab066d96863c5d5 3 \n",
+ "39 73105f2d1cabc8170ab066d96863c5d5 1B \n",
+ "42 73105f2d1cabc8170ab066d96863c5d5 20 \n",
+ "45 73105f2d1cabc8170ab066d96863c5d5 5 \n",
+ "48 73105f2d1cabc8170ab066d96863c5d5 4 \n",
+ "51 73105f2d1cabc8170ab066d96863c5d5 9 \n",
+ "54 73105f2d1cabc8170ab066d96863c5d5 1 \n",
+ "\n",
+ " direction_id \n",
+ "0 0.00 \n",
+ "3 0.00 \n",
+ "6 0.00 \n",
+ "9 0.00 \n",
+ "12 0.00 \n",
+ "15 0.00 \n",
+ "18 0.00 \n",
+ "21 0.00 \n",
+ "24 1.00 \n",
+ "27 0.00 \n",
+ "30 0.00 \n",
+ "33 0.00 \n",
+ "36 0.00 \n",
+ "39 0.00 \n",
+ "42 0.00 \n",
+ "45 0.00 \n",
+ "48 0.00 \n",
+ "51 0.00 \n",
+ "54 0.00 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "preview(route_dir_metrics)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "0b0fedf8-43d2-4ee6-8097-1e15a58b6d27",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_dir_metrics2 = pd.merge(\n",
+ " route_dir_metrics, route_typologies, on=route_group_merge_cols, how=\"left\"\n",
+ ").merge(cardinal_dir_df, on=route_group_merge_cols, how=\"left\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "id": "0fa520ce-3308-4b4c-a67c-a1a79f16696e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['7', '6', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',\n",
+ " '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', '5',\n",
+ " '4', '9', '1'], dtype=object)"
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "route_dir_metrics2.route_id.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "58dc075e-61eb-457c-af0a-ef6861ef7db2",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "' route_dir_metrics2.drop(\\n columns=[\\n \"geometry\",\\n \"common_shape_id\",\\n \"geometry\",\\n \"route_name\",\\n \"is_coverage\",\\n \"is_downtown_local\",\\n \"is_local\",\\n \"is_rapid\",\\n \"is_express\",\\n \"is_rail\",\\n \"schedule_gtfs_dataset_key\"\\n ]\\n).sort_values(by=[\"route_id\",\"direction_id\"])'"
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\"\"\" route_dir_metrics2.drop(\n",
+ " columns=[\n",
+ " \"geometry\",\n",
+ " \"common_shape_id\",\n",
+ " \"geometry\",\n",
+ " \"route_name\",\n",
+ " \"is_coverage\",\n",
+ " \"is_downtown_local\",\n",
+ " \"is_local\",\n",
+ " \"is_rapid\",\n",
+ " \"is_express\",\n",
+ " \"is_rail\",\n",
+ " \"schedule_gtfs_dataset_key\"\n",
+ " ]\n",
+ ").sort_values(by=[\"route_id\",\"direction_id\"])\"\"\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bc23d1cc-a2fb-494a-bd6c-3adac7781763",
+ "metadata": {},
+ "source": [
+ "##### Double check that the columns are the same."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "542a0761-c5c9-4189-a620-b89da1e3bc5d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "og_nov_url = \"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-11-13.parquet\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "ed839532-8f41-46a2-a42b-66ec7454c94e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_sched_og = gpd.read_parquet(og_nov_url)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "f9fa795e-0793-4786-b589-fa79f713d95d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_sched_og = df_sched_og.loc[df_sched_og.schedule_gtfs_dataset_key.isin(schd_keys)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "c95c9ca7-ab97-4421-878b-97c964b72663",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " time_period | \n",
+ " peak_offpeak | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2248 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 2249 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ " None | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 2250 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ " None | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 1849 | \n",
+ " Shuttle | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 1850 | \n",
+ " Shuttle | \n",
+ " 0.00 | \n",
+ " None | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 1851 | \n",
+ " Shuttle | \n",
+ " 0.00 | \n",
+ " None | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 1846 | \n",
+ " Shuttle | \n",
+ " 1.00 | \n",
+ " all_day | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 1847 | \n",
+ " Shuttle | \n",
+ " 1.00 | \n",
+ " None | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 1848 | \n",
+ " Shuttle | \n",
+ " 1.00 | \n",
+ " None | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " route_id direction_id time_period peak_offpeak\n",
+ "2248 5 0.00 all_day None\n",
+ "2249 5 0.00 None offpeak\n",
+ "2250 5 0.00 None peak\n",
+ "1849 Shuttle 0.00 all_day None\n",
+ "1850 Shuttle 0.00 None offpeak\n",
+ "1851 Shuttle 0.00 None peak\n",
+ "1846 Shuttle 1.00 all_day None\n",
+ "1847 Shuttle 1.00 None offpeak\n",
+ "1848 Shuttle 1.00 None peak"
+ ]
+ },
+ "execution_count": 59,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_sched_og[[\"route_id\", \"direction_id\", \"time_period\", \"peak_offpeak\"]].sort_values(\n",
+ " by=[\n",
+ " \"route_id\",\n",
+ " \"direction_id\",\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "b579cb53-8b01-4418-a2f7-8f597cf852fd",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " time_period | \n",
+ " peak_offpeak | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 54 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 55 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 56 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 13X | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 13X | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 13X | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 39 | \n",
+ " 1B | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " 1B | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 41 | \n",
+ " 1B | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 33 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 34 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 35 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 42 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 44 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 36 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 37 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 38 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 48 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 49 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 50 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 45 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 47 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 30 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 31 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 32 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 51 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 52 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 53 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Mall | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Mall | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Mall | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " Shuttle | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " Shuttle | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " Shuttle | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " Shuttle | \n",
+ " 1.00 | \n",
+ " all_day | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " Shuttle | \n",
+ " 1.00 | \n",
+ " offpeak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " Shuttle | \n",
+ " 1.00 | \n",
+ " peak | \n",
+ " peak | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " route_id direction_id time_period \\\n",
+ "54 1 0.00 all_day \n",
+ "55 1 0.00 offpeak \n",
+ "56 1 0.00 peak \n",
+ "18 11 0.00 all_day \n",
+ "19 11 0.00 offpeak \n",
+ "20 11 0.00 peak \n",
+ "12 12X 0.00 all_day \n",
+ "13 12X 0.00 offpeak \n",
+ "14 12X 0.00 peak \n",
+ "15 13X 0.00 all_day \n",
+ "16 13X 0.00 offpeak \n",
+ "17 13X 0.00 peak \n",
+ "39 1B 0.00 all_day \n",
+ "40 1B 0.00 offpeak \n",
+ "41 1B 0.00 peak \n",
+ "33 2 0.00 all_day \n",
+ "34 2 0.00 offpeak \n",
+ "35 2 0.00 peak \n",
+ "42 20 0.00 all_day \n",
+ "43 20 0.00 offpeak \n",
+ "44 20 0.00 peak \n",
+ "36 3 0.00 all_day \n",
+ "37 3 0.00 offpeak \n",
+ "38 3 0.00 peak \n",
+ "21 30 0.00 all_day \n",
+ "22 30 0.00 offpeak \n",
+ "23 30 0.00 peak \n",
+ "48 4 0.00 all_day \n",
+ "49 4 0.00 offpeak \n",
+ "50 4 0.00 peak \n",
+ "45 5 0.00 all_day \n",
+ "46 5 0.00 offpeak \n",
+ "47 5 0.00 peak \n",
+ "3 6 0.00 all_day \n",
+ "4 6 0.00 offpeak \n",
+ "5 6 0.00 peak \n",
+ "0 7 0.00 all_day \n",
+ "1 7 0.00 offpeak \n",
+ "2 7 0.00 peak \n",
+ "6 8 0.00 all_day \n",
+ "7 8 0.00 offpeak \n",
+ "8 8 0.00 peak \n",
+ "30 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 all_day \n",
+ "31 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 offpeak \n",
+ "32 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 peak \n",
+ "51 9 0.00 all_day \n",
+ "52 9 0.00 offpeak \n",
+ "53 9 0.00 peak \n",
+ "9 Mall 0.00 all_day \n",
+ "10 Mall 0.00 offpeak \n",
+ "11 Mall 0.00 peak \n",
+ "27 Shuttle 0.00 all_day \n",
+ "28 Shuttle 0.00 offpeak \n",
+ "29 Shuttle 0.00 peak \n",
+ "24 Shuttle 1.00 all_day \n",
+ "25 Shuttle 1.00 offpeak \n",
+ "26 Shuttle 1.00 peak \n",
+ "\n",
+ " peak_offpeak \n",
+ "54 NaN \n",
+ "55 offpeak \n",
+ "56 peak \n",
+ "18 NaN \n",
+ "19 offpeak \n",
+ "20 peak \n",
+ "12 NaN \n",
+ "13 offpeak \n",
+ "14 peak \n",
+ "15 NaN \n",
+ "16 offpeak \n",
+ "17 peak \n",
+ "39 NaN \n",
+ "40 offpeak \n",
+ "41 peak \n",
+ "33 NaN \n",
+ "34 offpeak \n",
+ "35 peak \n",
+ "42 NaN \n",
+ "43 offpeak \n",
+ "44 peak \n",
+ "36 NaN \n",
+ "37 offpeak \n",
+ "38 peak \n",
+ "21 NaN \n",
+ "22 offpeak \n",
+ "23 peak \n",
+ "48 NaN \n",
+ "49 offpeak \n",
+ "50 peak \n",
+ "45 NaN \n",
+ "46 offpeak \n",
+ "47 peak \n",
+ "3 NaN \n",
+ "4 offpeak \n",
+ "5 peak \n",
+ "0 NaN \n",
+ "1 offpeak \n",
+ "2 peak \n",
+ "6 NaN \n",
+ "7 offpeak \n",
+ "8 peak \n",
+ "30 NaN \n",
+ "31 offpeak \n",
+ "32 peak \n",
+ "51 NaN \n",
+ "52 offpeak \n",
+ "53 peak \n",
+ "9 NaN \n",
+ "10 offpeak \n",
+ "11 peak \n",
+ "27 NaN \n",
+ "28 offpeak \n",
+ "29 peak \n",
+ "24 NaN \n",
+ "25 offpeak \n",
+ "26 peak "
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "route_dir_metrics2[\n",
+ " [\"route_id\", \"direction_id\", \"time_period\", \"peak_offpeak\"]\n",
+ "].sort_values(\n",
+ " by=[\n",
+ " \"route_id\",\n",
+ " \"direction_id\",\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a8b09dce-9d6a-48f9-974e-06c7c5fff1d5",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "b4f948fa-91d7-47d0-a3e7-3dd37dce9bbc",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([ True, True, True, True, True, True, True, True, True,\n",
+ " True, True, True, True, True, True, True, True, True,\n",
+ " True])"
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "route_dir_metrics2.columns == df_sched_og.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "id": "b791a48e-102c-4cae-81f2-2c109ef9a3b4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_sched = route_dir_metrics2.copy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "995a5884-ee8c-4797-b664-0e09a94235ad",
+ "metadata": {},
+ "source": [
+ "#### DONE `gtfs_digest/merge_data` line 300 `df_avg_speeds` is also missing a lot of routes.\n",
+ "* [File `rt_segment_speeds/scripts/average_summary_speeds.py`](https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/average_summary_speeds.py)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "id": "b4a704ba-b34c-496b-bff9-452e3ae124cd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "\n",
+ "sys.path.append(\"../rt_segment_speeds/scripts/\")\n",
+ "import average_segment_speeds\n",
+ "import average_summary_speeds\n",
+ "from segment_speed_utils import (\n",
+ " gtfs_schedule_wrangling,\n",
+ " helpers,\n",
+ " metrics,\n",
+ " segment_calcs,\n",
+ " time_series_utils,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "2f364b44-3fbd-4514-9f9c-74e3dd5d0903",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "id": "762a40b1-a498-41c3-95ef-0b2527c9bc71",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_avg_speeds2 = df_avg_speeds.loc[\n",
+ " df_avg_speeds.schedule_gtfs_dataset_key.isin(schd_keys)\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "id": "2c3dfd99-f530-4b0f-b7d6-2c87f47b3e7d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "5 3\n",
+ "Name: route_id, dtype: int64"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_avg_speeds2.route_id.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7d82ddad-4e51-4be7-9850-e9547dfaec0a",
+ "metadata": {},
+ "source": [
+ "##### See what is in `rt_segment_speeds/scripts/average_segment_speeds.concatenate_trip_segment_speeds`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "6763f5cf-421e-43e1-a504-4dd15cbc3038",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "segment_type = \"stop_segments\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "id": "78cd0972-c79e-47fb-8972-bfa0e233985e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "concatenated files\n"
+ ]
+ }
+ ],
+ "source": [
+ "df = average_segment_speeds.concatenate_trip_segment_speeds(\n",
+ " analysis_date_list, segment_type\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "870cf0c1-ac4d-4dfc-9132-333aaea46bdf",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "##### Done. Amanda: filled in NaN values of `direction_id` with 0."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "id": "5bfa6408-51be-4127-abb6-cb9d1f384a35",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.direction_id = df.direction_id.fillna(0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "id": "4cedd5c3-6321-4025-b126-9b1f36b68848",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2 = df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "id": "41b802f0-e199-4c39-b11d-32365469a99a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['30', '3', '20', '4', '5', '11', '7', '9', '1', '12X', '6', '2',\n",
+ " '8', '8a7c42f9-51e4-4848-bf88-30c210f149ad', 'CC'], dtype=object)"
+ ]
+ },
+ "execution_count": 71,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df2.route_id.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "id": "b6478bf3-b6e0-433d-a54d-a061620f379c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(3543, 17)"
+ ]
+ },
+ "execution_count": 72,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df2.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "id": "5dc4ba07-6b4d-4532-a368-390ea0c55364",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 3543 entries, 159381 to 2656608\n",
+ "Data columns (total 17 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 schedule_gtfs_dataset_key 3543 non-null object \n",
+ " 1 shape_array_key 3543 non-null object \n",
+ " 2 shape_id 3543 non-null object \n",
+ " 3 stop_sequence 3543 non-null int64 \n",
+ " 4 route_id 3543 non-null object \n",
+ " 5 direction_id 3543 non-null float64 \n",
+ " 6 stop_pair 3543 non-null object \n",
+ " 7 stop_pair_name 3543 non-null object \n",
+ " 8 trip_instance_key 3543 non-null object \n",
+ " 9 speed_mph 3543 non-null float64 \n",
+ " 10 meters_elapsed 3543 non-null float64 \n",
+ " 11 sec_elapsed 3543 non-null float64 \n",
+ " 12 time_of_day 3543 non-null object \n",
+ " 13 arrival_time 3543 non-null datetime64[ns]\n",
+ " 14 service_date 3543 non-null datetime64[ns]\n",
+ " 15 peak_offpeak 3543 non-null object \n",
+ " 16 weekday_weekend 3543 non-null object \n",
+ "dtypes: datetime64[ns](2), float64(4), int64(1), object(10)\n",
+ "memory usage: 498.2+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df2.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "id": "4942f2a9-fa29-488a-b307-3a084cfaad2c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['PM Peak', 'Early AM', 'Midday', 'AM Peak', 'Evening'],\n",
+ " dtype=object)"
+ ]
+ },
+ "execution_count": 74,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df2.time_of_day.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "id": "54f81eed-ee1f-4061-be2d-523d6a009547",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['peak', 'offpeak'], dtype=object)"
+ ]
+ },
+ "execution_count": 75,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df2.peak_offpeak.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "id": "1305f738-38d6-4fa3-95f5-f7f35f65f435",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " stop_pair | \n",
+ "
\n",
+ " \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ " 31 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ " 20 | \n",
+ "
\n",
+ " \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ " 25 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " CC | \n",
+ " 0.00 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 1.00 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " stop_pair\n",
+ "route_id direction_id \n",
+ "1 0.00 23\n",
+ "11 0.00 19\n",
+ "12X 0.00 14\n",
+ "2 0.00 31\n",
+ "20 0.00 7\n",
+ "3 0.00 20\n",
+ "30 0.00 27\n",
+ "4 0.00 25\n",
+ "5 0.00 34\n",
+ "6 0.00 29\n",
+ "7 0.00 14\n",
+ "8 0.00 18\n",
+ "8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 19\n",
+ "9 0.00 26\n",
+ "CC 0.00 3\n",
+ " 1.00 2"
+ ]
+ },
+ "execution_count": 76,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df2.groupby([\"route_id\", \"direction_id\"]).agg({\"stop_pair\": \"nunique\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "id": "5d83b85b-fbbb-4a3a-8954-91acfd40f5a9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 159381 | \n",
+ " 159382 | \n",
+ " 159383 | \n",
+ " 159384 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " schedule_gtfs_dataset_key | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ "
\n",
+ " \n",
+ " shape_array_key | \n",
+ " c6e9cda0db8bf76bc535f590ca1fccb5 | \n",
+ " c6e9cda0db8bf76bc535f590ca1fccb5 | \n",
+ " c6e9cda0db8bf76bc535f590ca1fccb5 | \n",
+ " c6e9cda0db8bf76bc535f590ca1fccb5 | \n",
+ "
\n",
+ " \n",
+ " shape_id | \n",
+ " 8746730d-27f9-4fb2-9f52-987afe356929 | \n",
+ " 8746730d-27f9-4fb2-9f52-987afe356929 | \n",
+ " 8746730d-27f9-4fb2-9f52-987afe356929 | \n",
+ " 8746730d-27f9-4fb2-9f52-987afe356929 | \n",
+ "
\n",
+ " \n",
+ " stop_sequence | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " route_id | \n",
+ " 30 | \n",
+ " 30 | \n",
+ " 30 | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " direction_id | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " stop_pair | \n",
+ " f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c | \n",
+ " f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c | \n",
+ " 47def414-f158-496a-91cb-5f3fb0aa406c__a94160c1-bd99-4898-921f-941aa748ce6f | \n",
+ " 47def414-f158-496a-91cb-5f3fb0aa406c__a94160c1-bd99-4898-921f-941aa748ce6f | \n",
+ "
\n",
+ " \n",
+ " stop_pair_name | \n",
+ " Broadway at Stowell__Betteravia at Miller (Panda Express) | \n",
+ " Broadway at Stowell__Betteravia at Miller (Panda Express) | \n",
+ " Betteravia at Miller (Panda Express)__McCoy at Broadway (Outbound) | \n",
+ " Betteravia at Miller (Panda Express)__McCoy at Broadway (Outbound) | \n",
+ "
\n",
+ " \n",
+ " trip_instance_key | \n",
+ " 005bb393ed8b22ca4d8e7cc8d7895231 | \n",
+ " 217b90defbc6c69f05e19d16e96d1e3f | \n",
+ " 005bb393ed8b22ca4d8e7cc8d7895231 | \n",
+ " 217b90defbc6c69f05e19d16e96d1e3f | \n",
+ "
\n",
+ " \n",
+ " speed_mph | \n",
+ " 13.21 | \n",
+ " 13.89 | \n",
+ " 18.88 | \n",
+ " 17.04 | \n",
+ "
\n",
+ " \n",
+ " meters_elapsed | \n",
+ " 1930.84 | \n",
+ " 1930.84 | \n",
+ " 1409.45 | \n",
+ " 1409.45 | \n",
+ "
\n",
+ " \n",
+ " sec_elapsed | \n",
+ " 327.00 | \n",
+ " 311.00 | \n",
+ " 167.00 | \n",
+ " 185.00 | \n",
+ "
\n",
+ " \n",
+ " time_of_day | \n",
+ " PM Peak | \n",
+ " Early AM | \n",
+ " PM Peak | \n",
+ " Early AM | \n",
+ "
\n",
+ " \n",
+ " arrival_time | \n",
+ " 2024-11-13 15:23:45 | \n",
+ " 2024-11-13 06:21:23 | \n",
+ " 2024-11-13 15:29:12 | \n",
+ " 2024-11-13 06:26:34 | \n",
+ "
\n",
+ " \n",
+ " service_date | \n",
+ " 2024-11-13 00:00:00 | \n",
+ " 2024-11-13 00:00:00 | \n",
+ " 2024-11-13 00:00:00 | \n",
+ " 2024-11-13 00:00:00 | \n",
+ "
\n",
+ " \n",
+ " peak_offpeak | \n",
+ " peak | \n",
+ " offpeak | \n",
+ " peak | \n",
+ " offpeak | \n",
+ "
\n",
+ " \n",
+ " weekday_weekend | \n",
+ " weekday | \n",
+ " weekday | \n",
+ " weekday | \n",
+ " weekday | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 159381 \\\n",
+ "schedule_gtfs_dataset_key 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "shape_array_key c6e9cda0db8bf76bc535f590ca1fccb5 \n",
+ "shape_id 8746730d-27f9-4fb2-9f52-987afe356929 \n",
+ "stop_sequence 2 \n",
+ "route_id 30 \n",
+ "direction_id 0.00 \n",
+ "stop_pair f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c \n",
+ "stop_pair_name Broadway at Stowell__Betteravia at Miller (Panda Express) \n",
+ "trip_instance_key 005bb393ed8b22ca4d8e7cc8d7895231 \n",
+ "speed_mph 13.21 \n",
+ "meters_elapsed 1930.84 \n",
+ "sec_elapsed 327.00 \n",
+ "time_of_day PM Peak \n",
+ "arrival_time 2024-11-13 15:23:45 \n",
+ "service_date 2024-11-13 00:00:00 \n",
+ "peak_offpeak peak \n",
+ "weekday_weekend weekday \n",
+ "\n",
+ " 159382 \\\n",
+ "schedule_gtfs_dataset_key 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "shape_array_key c6e9cda0db8bf76bc535f590ca1fccb5 \n",
+ "shape_id 8746730d-27f9-4fb2-9f52-987afe356929 \n",
+ "stop_sequence 2 \n",
+ "route_id 30 \n",
+ "direction_id 0.00 \n",
+ "stop_pair f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c \n",
+ "stop_pair_name Broadway at Stowell__Betteravia at Miller (Panda Express) \n",
+ "trip_instance_key 217b90defbc6c69f05e19d16e96d1e3f \n",
+ "speed_mph 13.89 \n",
+ "meters_elapsed 1930.84 \n",
+ "sec_elapsed 311.00 \n",
+ "time_of_day Early AM \n",
+ "arrival_time 2024-11-13 06:21:23 \n",
+ "service_date 2024-11-13 00:00:00 \n",
+ "peak_offpeak offpeak \n",
+ "weekday_weekend weekday \n",
+ "\n",
+ " 159383 \\\n",
+ "schedule_gtfs_dataset_key 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "shape_array_key c6e9cda0db8bf76bc535f590ca1fccb5 \n",
+ "shape_id 8746730d-27f9-4fb2-9f52-987afe356929 \n",
+ "stop_sequence 3 \n",
+ "route_id 30 \n",
+ "direction_id 0.00 \n",
+ "stop_pair 47def414-f158-496a-91cb-5f3fb0aa406c__a94160c1-bd99-4898-921f-941aa748ce6f \n",
+ "stop_pair_name Betteravia at Miller (Panda Express)__McCoy at Broadway (Outbound) \n",
+ "trip_instance_key 005bb393ed8b22ca4d8e7cc8d7895231 \n",
+ "speed_mph 18.88 \n",
+ "meters_elapsed 1409.45 \n",
+ "sec_elapsed 167.00 \n",
+ "time_of_day PM Peak \n",
+ "arrival_time 2024-11-13 15:29:12 \n",
+ "service_date 2024-11-13 00:00:00 \n",
+ "peak_offpeak peak \n",
+ "weekday_weekend weekday \n",
+ "\n",
+ " 159384 \n",
+ "schedule_gtfs_dataset_key 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "shape_array_key c6e9cda0db8bf76bc535f590ca1fccb5 \n",
+ "shape_id 8746730d-27f9-4fb2-9f52-987afe356929 \n",
+ "stop_sequence 3 \n",
+ "route_id 30 \n",
+ "direction_id 0.00 \n",
+ "stop_pair 47def414-f158-496a-91cb-5f3fb0aa406c__a94160c1-bd99-4898-921f-941aa748ce6f \n",
+ "stop_pair_name Betteravia at Miller (Panda Express)__McCoy at Broadway (Outbound) \n",
+ "trip_instance_key 217b90defbc6c69f05e19d16e96d1e3f \n",
+ "speed_mph 17.04 \n",
+ "meters_elapsed 1409.45 \n",
+ "sec_elapsed 185.00 \n",
+ "time_of_day Early AM \n",
+ "arrival_time 2024-11-13 06:26:34 \n",
+ "service_date 2024-11-13 00:00:00 \n",
+ "peak_offpeak offpeak \n",
+ "weekday_weekend weekday "
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df2.head(4).T"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "beb7e517-e8e8-4257-ac60-c55e755a81a9",
+ "metadata": {},
+ "source": [
+ "##### Now moving onto the function `rt_segment_speeds/scripts/average_segment_speeds/segment_averages()`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "id": "6504f04f-fe58-47d8-aaf6-788d064dd03c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dict_inputs = GTFS_DATA_DICT[segment_type]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "id": "8c0e863f-66da-4026-839e-25fcd23d5ef6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "OPERATOR_COLS = [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "id": "0a40a434-cd05-4ba3-acb2-08063e1bc3b7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ROUTE_DIR_COLS = [*dict_inputs[\"route_dir_cols\"]]\n",
+ "STOP_PAIR_COLS = [*dict_inputs[\"stop_pair_cols\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "id": "460b4f1a-0253-4a13-8a97-3ee365d2da83",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "group_cols = OPERATOR_COLS + ROUTE_DIR_COLS + STOP_PAIR_COLS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "id": "55a6a69a-98d4-4e0e-a5e8-2143ea15bfa0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['schedule_gtfs_dataset_key',\n",
+ " 'route_id',\n",
+ " 'direction_id',\n",
+ " 'stop_pair',\n",
+ " 'stop_pair_name']"
+ ]
+ },
+ "execution_count": 82,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "group_cols"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6577b12e-541a-473a-9ae4-c2d763f0b383",
+ "metadata": {},
+ "source": [
+ "##### Done. Added `dropna=False` to `rt_segment_speeds/segment_speed_utils/segment_calcs.calculate_avg_speeds`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "id": "09c1fea8-8efb-42f0-9229-42f6cb63ebdb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def calculate_avg_speeds(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:\n",
+ " \"\"\"\n",
+ " Calculate the median, 20th, and 80th percentile speeds\n",
+ " by groups.\n",
+ " \"\"\"\n",
+ " # pd.groupby and pd.quantile is so slow\n",
+ " # create our own list of speeds and use np\n",
+ " df2 = (\n",
+ " df.groupby(group_cols, observed=True, group_keys=False, dropna=False)\n",
+ " .agg({\"speed_mph\": lambda x: sorted(list(x))})\n",
+ " .reset_index()\n",
+ " .rename(columns={\"speed_mph\": \"speed_mph_list\"})\n",
+ " )\n",
+ "\n",
+ " df2 = df2.assign(\n",
+ " p50_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=50), axis=1),\n",
+ " n_trips=df2.apply(lambda x: len(x.speed_mph_list), axis=1).astype(\"int16\"),\n",
+ " p20_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=20), axis=1),\n",
+ " p80_mph=df2.apply(lambda x: np.percentile(x.speed_mph_list, q=80), axis=1),\n",
+ " )\n",
+ "\n",
+ " stats = df2.drop(columns=\"speed_mph_list\")\n",
+ "\n",
+ " # Clean up for map\n",
+ " speed_cols = [c for c in stats.columns if \"_mph\" in c]\n",
+ " stats[speed_cols] = stats[speed_cols].round(2)\n",
+ "\n",
+ " return stats"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "id": "2e7944c7-211d-4098-aa6d-15150ff8d3c0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "avg_speeds = calculate_avg_speeds(\n",
+ " df2,\n",
+ " group_cols + [\"time_of_day\"],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "id": "2efce226-fb81-4353-822d-5044f7b4f164",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " stop_pair | \n",
+ " stop_pair_name | \n",
+ " time_of_day | \n",
+ " p50_mph | \n",
+ " n_trips | \n",
+ " p20_mph | \n",
+ " p80_mph | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 | \n",
+ " Broadway at Hermosa__Broadway at Fesler | \n",
+ " AM Peak | \n",
+ " 32.35 | \n",
+ " 4 | \n",
+ " 10.41 | \n",
+ " 51.44 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 | \n",
+ " Broadway at Hermosa__Broadway at Fesler | \n",
+ " Early AM | \n",
+ " 20.20 | \n",
+ " 1 | \n",
+ " 20.20 | \n",
+ " 20.20 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 | \n",
+ " Broadway at Hermosa__Broadway at Fesler | \n",
+ " Evening | \n",
+ " 13.37 | \n",
+ " 1 | \n",
+ " 13.37 | \n",
+ " 13.37 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 | \n",
+ " Broadway at Hermosa__Broadway at Fesler | \n",
+ " Midday | \n",
+ " 15.96 | \n",
+ " 6 | \n",
+ " 1.00 | \n",
+ " 20.20 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 | \n",
+ " Broadway at Hermosa__Broadway at Fesler | \n",
+ " PM Peak | \n",
+ " 14.66 | \n",
+ " 5 | \n",
+ " 0.93 | \n",
+ " 16.33 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id direction_id \\\n",
+ "0 73105f2d1cabc8170ab066d96863c5d5 1 0.00 \n",
+ "1 73105f2d1cabc8170ab066d96863c5d5 1 0.00 \n",
+ "2 73105f2d1cabc8170ab066d96863c5d5 1 0.00 \n",
+ "3 73105f2d1cabc8170ab066d96863c5d5 1 0.00 \n",
+ "4 73105f2d1cabc8170ab066d96863c5d5 1 0.00 \n",
+ "\n",
+ " stop_pair \\\n",
+ "0 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 \n",
+ "1 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 \n",
+ "2 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 \n",
+ "3 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 \n",
+ "4 1c236429-e252-40c0-8287-4fe38145f5ae__c65b7144-737c-42ae-87e8-fc676f19c644 \n",
+ "\n",
+ " stop_pair_name time_of_day p50_mph n_trips \\\n",
+ "0 Broadway at Hermosa__Broadway at Fesler AM Peak 32.35 4 \n",
+ "1 Broadway at Hermosa__Broadway at Fesler Early AM 20.20 1 \n",
+ "2 Broadway at Hermosa__Broadway at Fesler Evening 13.37 1 \n",
+ "3 Broadway at Hermosa__Broadway at Fesler Midday 15.96 6 \n",
+ "4 Broadway at Hermosa__Broadway at Fesler PM Peak 14.66 5 \n",
+ "\n",
+ " p20_mph p80_mph \n",
+ "0 10.41 51.44 \n",
+ "1 20.20 20.20 \n",
+ "2 13.37 13.37 \n",
+ "3 1.00 20.20 \n",
+ "4 0.93 16.33 "
+ ]
+ },
+ "execution_count": 85,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "avg_speeds.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1d64908f-c9cc-42a9-a9e2-11ca0b517cb0",
+ "metadata": {},
+ "source": [
+ "##### Go back to `rt_segment_speeds/scripts/average_segment_speeds.segment_averages()`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "id": "64313006-18d1-4722-9919-97a0c338e8a0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "avg_speeds2 = avg_speeds.pipe(\n",
+ " gtfs_schedule_wrangling.merge_operator_identifiers,\n",
+ " analysis_date_list,\n",
+ " columns=average_segment_speeds.CROSSWALK_COLS,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "id": "6b9547c7-e6cf-4a42-b735-6dcb6b0e5d83",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 1120 entries, 0 to 1119\n",
+ "Data columns (total 15 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 schedule_gtfs_dataset_key 1120 non-null object \n",
+ " 1 route_id 1120 non-null object \n",
+ " 2 direction_id 1120 non-null float64\n",
+ " 3 stop_pair 1120 non-null object \n",
+ " 4 stop_pair_name 1120 non-null object \n",
+ " 5 time_of_day 1120 non-null object \n",
+ " 6 p50_mph 1120 non-null float64\n",
+ " 7 n_trips 1120 non-null int16 \n",
+ " 8 p20_mph 1120 non-null float64\n",
+ " 9 p80_mph 1120 non-null float64\n",
+ " 10 name 1120 non-null object \n",
+ " 11 caltrans_district 1120 non-null object \n",
+ " 12 organization_source_record_id 1120 non-null object \n",
+ " 13 organization_name 1120 non-null object \n",
+ " 14 base64_url 1120 non-null object \n",
+ "dtypes: float64(4), int16(1), object(10)\n",
+ "memory usage: 133.4+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "avg_speeds2.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "id": "cb5e175d-9836-4497-9cc0-b4397f63afc5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 91 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 148 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 204 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 315 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 334 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 409 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 492 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 584 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 717 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 829 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 887 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 952 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1028 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1115 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1118 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id \\\n",
+ "0 73105f2d1cabc8170ab066d96863c5d5 1 \n",
+ "91 73105f2d1cabc8170ab066d96863c5d5 11 \n",
+ "148 73105f2d1cabc8170ab066d96863c5d5 12X \n",
+ "204 73105f2d1cabc8170ab066d96863c5d5 2 \n",
+ "315 73105f2d1cabc8170ab066d96863c5d5 20 \n",
+ "334 73105f2d1cabc8170ab066d96863c5d5 3 \n",
+ "409 73105f2d1cabc8170ab066d96863c5d5 30 \n",
+ "492 73105f2d1cabc8170ab066d96863c5d5 4 \n",
+ "584 73105f2d1cabc8170ab066d96863c5d5 5 \n",
+ "717 73105f2d1cabc8170ab066d96863c5d5 6 \n",
+ "829 73105f2d1cabc8170ab066d96863c5d5 7 \n",
+ "887 73105f2d1cabc8170ab066d96863c5d5 8 \n",
+ "952 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n",
+ "1028 73105f2d1cabc8170ab066d96863c5d5 9 \n",
+ "1115 f5a749dd65924e025b1293c58f95f8d6 CC \n",
+ "1118 f5a749dd65924e025b1293c58f95f8d6 CC \n",
+ "\n",
+ " direction_id \n",
+ "0 0.00 \n",
+ "91 0.00 \n",
+ "148 0.00 \n",
+ "204 0.00 \n",
+ "315 0.00 \n",
+ "334 0.00 \n",
+ "409 0.00 \n",
+ "492 0.00 \n",
+ "584 0.00 \n",
+ "717 0.00 \n",
+ "829 0.00 \n",
+ "887 0.00 \n",
+ "952 0.00 \n",
+ "1028 0.00 \n",
+ "1115 0.00 \n",
+ "1118 1.00 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "preview(avg_speeds2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "71124cd8-df34-42e6-99b2-1e72d1c888fa",
+ "metadata": {},
+ "source": [
+ "##### Move on to `rt_segment_speeds/scripts/average_segment_speeds/merge_in_segment_geometry()`\n",
+ "* With the original function, only 3 routes show up. Check it out."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "id": "41f1b87c-5a5f-4909-bf13-63f6a430a16c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "avg_speeds_with_geom = average_segment_speeds.merge_in_segment_geometry(\n",
+ " avg_speeds2, one_analysis_date, segment_type\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "id": "4a371afd-a828-41a0-afc6-f581bfffe175",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 110 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 202 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 221 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 339 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 472 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 529 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 587 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 692 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 790 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 846 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 965 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1101 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1166 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " 1169 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1187 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id \\\n",
+ "0 73105f2d1cabc8170ab066d96863c5d5 30 \n",
+ "110 73105f2d1cabc8170ab066d96863c5d5 3 \n",
+ "202 73105f2d1cabc8170ab066d96863c5d5 20 \n",
+ "221 73105f2d1cabc8170ab066d96863c5d5 4 \n",
+ "339 73105f2d1cabc8170ab066d96863c5d5 5 \n",
+ "472 73105f2d1cabc8170ab066d96863c5d5 11 \n",
+ "529 73105f2d1cabc8170ab066d96863c5d5 7 \n",
+ "587 73105f2d1cabc8170ab066d96863c5d5 9 \n",
+ "692 73105f2d1cabc8170ab066d96863c5d5 1 \n",
+ "790 73105f2d1cabc8170ab066d96863c5d5 12X \n",
+ "846 73105f2d1cabc8170ab066d96863c5d5 6 \n",
+ "965 73105f2d1cabc8170ab066d96863c5d5 2 \n",
+ "1101 73105f2d1cabc8170ab066d96863c5d5 8 \n",
+ "1166 f5a749dd65924e025b1293c58f95f8d6 CC \n",
+ "1169 f5a749dd65924e025b1293c58f95f8d6 CC \n",
+ "1187 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n",
+ "\n",
+ " direction_id \n",
+ "0 0.00 \n",
+ "110 0.00 \n",
+ "202 0.00 \n",
+ "221 0.00 \n",
+ "339 0.00 \n",
+ "472 0.00 \n",
+ "529 0.00 \n",
+ "587 0.00 \n",
+ "692 0.00 \n",
+ "790 0.00 \n",
+ "846 0.00 \n",
+ "965 0.00 \n",
+ "1101 0.00 \n",
+ "1166 1.00 \n",
+ "1169 0.00 \n",
+ "1187 0.00 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "preview(avg_speeds_with_geom)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "id": "4ad1b0ef-e48f-4606-aca8-4bfcf1fd18b8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from calitp_data_analysis.geography_utils import WGS84"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fb05005e-676a-404e-8b98-b639e364729e",
+ "metadata": {},
+ "source": [
+ "##### Down another rabbit hole: this `SEGMENT_FILE` doesn't contain values for direction_id \n",
+ "* Need to find out where it's originally made.\n",
+ "* Done **Fill in `direction_id` with 0.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "id": "458212b0-0f3f-46ef-b698-1d00f6a285c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "SEGMENT_FILE = GTFS_DATA_DICT[segment_type].segments_file\n",
+ "\n",
+ "segment_geom = gpd.read_parquet(\n",
+ " f\"{SEGMENT_GCS}{SEGMENT_FILE}_{one_analysis_date}.parquet\",\n",
+ ").to_crs(WGS84)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "id": "1d7886f9-e9ec-467d-a6e6-15df89fdc970",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "segment_geom.direction_id = segment_geom.direction_id.fillna(0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "id": "b809e2e7-12c2-4232-8019-8516d84fb20f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "segment_geom2 = segment_geom.loc[segment_geom.schedule_gtfs_dataset_key.isin(schd_keys)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "id": "2da0e84d-df1a-41a6-8192-7ebbaf4386e0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 181780 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 570760 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1613296 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1969198 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2083066 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2112284 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2165911 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2215180 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2386098 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2720537 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2794999 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2903809 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2973400 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2986372 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " 3065284 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 3107063 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 3170382 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1B | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key \\\n",
+ "181780 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "570760 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "1613296 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "1969198 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2083066 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2112284 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2165911 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2215180 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2386098 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2720537 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2794999 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2903809 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2973400 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2986372 f5a749dd65924e025b1293c58f95f8d6 \n",
+ "3065284 f5a749dd65924e025b1293c58f95f8d6 \n",
+ "3107063 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "3170382 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "\n",
+ " route_id direction_id \n",
+ "181780 30 0.00 \n",
+ "570760 3 0.00 \n",
+ "1613296 20 0.00 \n",
+ "1969198 4 0.00 \n",
+ "2083066 5 0.00 \n",
+ "2112284 11 0.00 \n",
+ "2165911 7 0.00 \n",
+ "2215180 9 0.00 \n",
+ "2386098 1 0.00 \n",
+ "2720537 12X 0.00 \n",
+ "2794999 6 0.00 \n",
+ "2903809 2 0.00 \n",
+ "2973400 8 0.00 \n",
+ "2986372 CC 1.00 \n",
+ "3065284 CC 0.00 \n",
+ "3107063 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n",
+ "3170382 1B 0.00 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "preview(segment_geom2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "id": "a4b2dece-6078-43a6-8a90-4b9cd04ada08",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trip_instance_key | \n",
+ " shape_array_key | \n",
+ " stop_id1 | \n",
+ " stop_sequence | \n",
+ " stop_id2 | \n",
+ " segment_id | \n",
+ " stop_pair | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " st_trip_instance_key | \n",
+ " segment_uuid | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 181780 | \n",
+ " 005bb393ed8b22ca4d8e7cc8d7895231 | \n",
+ " c6e9cda0db8bf76bc535f590ca1fccb5 | \n",
+ " 120f2635-ec31-435e-a089-225b26965f12 | \n",
+ " 1 | \n",
+ " f09af637-87de-4bdb-bf49-660539686c97 | \n",
+ " 120f2635-ec31-435e-a089-225b26965f12-f09af637-87de-4bdb-bf49-660539686c97-1 | \n",
+ " 120f2635-ec31-435e-a089-225b26965f12__f09af637-87de-4bdb-bf49-660539686c97 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ " 005bb393ed8b22ca4d8e7cc8d7895231 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5__30__nan__120f2635-ec31-435e-a089-225b26965f12-f09af637-87de-4bdb-bf49-660539686c97-1 | \n",
+ "
\n",
+ " \n",
+ " 181781 | \n",
+ " 005bb393ed8b22ca4d8e7cc8d7895231 | \n",
+ " c6e9cda0db8bf76bc535f590ca1fccb5 | \n",
+ " f09af637-87de-4bdb-bf49-660539686c97 | \n",
+ " 2 | \n",
+ " 47def414-f158-496a-91cb-5f3fb0aa406c | \n",
+ " f09af637-87de-4bdb-bf49-660539686c97-47def414-f158-496a-91cb-5f3fb0aa406c-1 | \n",
+ " f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ " 005bb393ed8b22ca4d8e7cc8d7895231 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5__30__nan__f09af637-87de-4bdb-bf49-660539686c97-47def414-f158-496a-91cb-5f3fb0aa406c-1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_instance_key shape_array_key \\\n",
+ "181780 005bb393ed8b22ca4d8e7cc8d7895231 c6e9cda0db8bf76bc535f590ca1fccb5 \n",
+ "181781 005bb393ed8b22ca4d8e7cc8d7895231 c6e9cda0db8bf76bc535f590ca1fccb5 \n",
+ "\n",
+ " stop_id1 stop_sequence \\\n",
+ "181780 120f2635-ec31-435e-a089-225b26965f12 1 \n",
+ "181781 f09af637-87de-4bdb-bf49-660539686c97 2 \n",
+ "\n",
+ " stop_id2 \\\n",
+ "181780 f09af637-87de-4bdb-bf49-660539686c97 \n",
+ "181781 47def414-f158-496a-91cb-5f3fb0aa406c \n",
+ "\n",
+ " segment_id \\\n",
+ "181780 120f2635-ec31-435e-a089-225b26965f12-f09af637-87de-4bdb-bf49-660539686c97-1 \n",
+ "181781 f09af637-87de-4bdb-bf49-660539686c97-47def414-f158-496a-91cb-5f3fb0aa406c-1 \n",
+ "\n",
+ " stop_pair \\\n",
+ "181780 120f2635-ec31-435e-a089-225b26965f12__f09af637-87de-4bdb-bf49-660539686c97 \n",
+ "181781 f09af637-87de-4bdb-bf49-660539686c97__47def414-f158-496a-91cb-5f3fb0aa406c \n",
+ "\n",
+ " schedule_gtfs_dataset_key route_id direction_id \\\n",
+ "181780 73105f2d1cabc8170ab066d96863c5d5 30 0.00 \n",
+ "181781 73105f2d1cabc8170ab066d96863c5d5 30 0.00 \n",
+ "\n",
+ " st_trip_instance_key \\\n",
+ "181780 005bb393ed8b22ca4d8e7cc8d7895231 \n",
+ "181781 005bb393ed8b22ca4d8e7cc8d7895231 \n",
+ "\n",
+ " segment_uuid \n",
+ "181780 73105f2d1cabc8170ab066d96863c5d5__30__nan__120f2635-ec31-435e-a089-225b26965f12-f09af637-87de-4bdb-bf49-660539686c97-1 \n",
+ "181781 73105f2d1cabc8170ab066d96863c5d5__30__nan__f09af637-87de-4bdb-bf49-660539686c97-47def414-f158-496a-91cb-5f3fb0aa406c-1 "
+ ]
+ },
+ "execution_count": 96,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "segment_geom2.drop(columns=[\"geometry\"]).head(2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "acec3a1a-30f3-43b5-b7e5-efba1699924c",
+ "metadata": {},
+ "source": [
+ "##### Continue on with the rest of `merge_in_segment_geometry` in `rt_segment_speeds/scripts/average_segment_speeds`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "id": "202a6a24-fab9-4204-82ee-dfdab76f5628",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'rollup_singleday/speeds_route_dir_segments'"
+ ]
+ },
+ "execution_count": 97,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dict_inputs[\"route_dir_single_segment\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "id": "03f32706-608d-4ff8-bcce-6aebb8c59820",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "geom_file_cols = segment_geom2.columns.tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "id": "1030f6a4-eeb1-404b-aab1-329d67bb8cf8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "col_order = [c for c in avg_speeds2.columns]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 100,
+ "id": "36b45cfb-8aa8-4f52-a1e8-101f491cc2c8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merge_cols = list(set(col_order).intersection(geom_file_cols))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "id": "8e3a312c-2ec4-4820-a306-c32920b3333e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gdf = (\n",
+ " pd.merge(\n",
+ " segment_geom2[merge_cols + [\"geometry\"]].drop_duplicates(),\n",
+ " avg_speeds2,\n",
+ " on=merge_cols,\n",
+ " )\n",
+ " .reset_index(drop=True)\n",
+ " .reindex(columns=col_order + [\"geometry\"])\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "id": "f32499ad-55e5-446d-a9e1-8dab5b3d52be",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 1266 entries, 0 to 1265\n",
+ "Data columns (total 16 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 schedule_gtfs_dataset_key 1266 non-null object \n",
+ " 1 route_id 1266 non-null object \n",
+ " 2 direction_id 1266 non-null float64 \n",
+ " 3 stop_pair 1266 non-null object \n",
+ " 4 stop_pair_name 1266 non-null object \n",
+ " 5 time_of_day 1266 non-null object \n",
+ " 6 p50_mph 1266 non-null float64 \n",
+ " 7 n_trips 1266 non-null int16 \n",
+ " 8 p20_mph 1266 non-null float64 \n",
+ " 9 p80_mph 1266 non-null float64 \n",
+ " 10 name 1266 non-null object \n",
+ " 11 caltrans_district 1266 non-null object \n",
+ " 12 organization_source_record_id 1266 non-null object \n",
+ " 13 organization_name 1266 non-null object \n",
+ " 14 base64_url 1266 non-null object \n",
+ " 15 geometry 1266 non-null geometry\n",
+ "dtypes: float64(4), geometry(1), int16(1), object(10)\n",
+ "memory usage: 151.0+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "gdf.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "id": "5079633b-1029-4370-878d-346c30369036",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " stop_pair | \n",
+ " stop_pair_name | \n",
+ " time_of_day | \n",
+ " p50_mph | \n",
+ " n_trips | \n",
+ " p20_mph | \n",
+ " p80_mph | \n",
+ " name | \n",
+ " caltrans_district | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1162 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ " 712b4000-441b-4b64-8a8e-36ec38bbbce1__ae050555-4c98-44e7-ad1a-d536b91d2012 | \n",
+ " Carmen ln at Trinity (Wesgate)(Outbound)__Carmen Ln at Carmelia Ln. | \n",
+ " AM Peak | \n",
+ " 24.39 | \n",
+ " 3 | \n",
+ " 20.99 | \n",
+ " 31.30 | \n",
+ " Santa Maria Schedule | \n",
+ " 05 - San Luis Obispo | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id direction_id \\\n",
+ "1162 73105f2d1cabc8170ab066d96863c5d5 8 0.00 \n",
+ "\n",
+ " stop_pair \\\n",
+ "1162 712b4000-441b-4b64-8a8e-36ec38bbbce1__ae050555-4c98-44e7-ad1a-d536b91d2012 \n",
+ "\n",
+ " stop_pair_name \\\n",
+ "1162 Carmen ln at Trinity (Wesgate)(Outbound)__Carmen Ln at Carmelia Ln. \n",
+ "\n",
+ " time_of_day p50_mph n_trips p20_mph p80_mph name \\\n",
+ "1162 AM Peak 24.39 3 20.99 31.30 Santa Maria Schedule \n",
+ "\n",
+ " caltrans_district \n",
+ "1162 05 - San Luis Obispo "
+ ]
+ },
+ "execution_count": 103,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "gdf.drop(\n",
+ " columns=[\n",
+ " \"geometry\",\n",
+ " \"organization_source_record_id\",\n",
+ " \"organization_name\",\n",
+ " \"base64_url\",\n",
+ " ]\n",
+ ").sample()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b1636a93-4c57-489a-a40f-f2a605e92afc",
+ "metadata": {},
+ "source": [
+ "##### `rt_segment_speeds/scripts/average_segment_speeds` gives me the speeds by stop for a route. However, in `gtfs_digest/merge_data`, we want the speeds for the entire route; the output of `average_segment_speeds` is summarized in `rt_segment_speeds/scripts/average_summary_speeds`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "id": "ea8acdaf-c827-42e5-a410-75cd85119949",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'rollup_singleday/speeds_route_dir_segments'"
+ ]
+ },
+ "execution_count": 104,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dict_inputs[\"route_dir_single_segment\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "09e6f8f2-06e2-4280-afeb-7309e1eb6aae",
+ "metadata": {},
+ "source": [
+ "##### **This file below is used in `gtfs_digest/merge_data`. Need to break out `average_summary_speeds`**\n",
+ "* gs://calitp-analytics-data/data-analyses/rt_segment_speeds/ and rollup_singleday/speeds_route_dir_AH_TEST_2024-11-13"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "id": "a2c77444-4705-46cb-80c1-f704d316ad73",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'rollup_singleday/speeds_route_dir'"
+ ]
+ },
+ "execution_count": 105,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "GTFS_DATA_DICT.rt_stop_times.route_dir_single_summary"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "id": "0851effa-872b-4b43-855d-09a08f836ab3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dict_inputs = GTFS_DATA_DICT[segment_type]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "id": "4f0b4f78-93ee-4687-ab55-29c628e2ae93",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "avg_summary_speeds_url = \"gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_route_dir_AH_TEST_2024-11-13.parquet\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "id": "8c35d5ca-4fec-4912-9869-c889befa71ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "avg_summary_speeds_df = gpd.read_parquet(avg_summary_speeds_url)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "55e502c4-6fca-4378-b100-da4c1e5c14d7",
+ "metadata": {},
+ "source": [
+ "##### Only one route is showing!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "id": "55b2425d-c8ab-45da-9af8-0aedc4c8437b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "avg_summary_speeds_df2 = avg_summary_speeds_df.loc[\n",
+ " avg_summary_speeds_df.schedule_gtfs_dataset_key.isin(schd_keys)\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "id": "19209be1-d849-442c-ab84-d6a3d780a2f0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1002 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id direction_id\n",
+ "1002 73105f2d1cabc8170ab066d96863c5d5 5 0.00"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "preview(avg_summary_speeds_df2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 111,
+ "id": "3413b59d-4dee-4d7c-bf39-6732a8189bb4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "common_shape_geom = gtfs_schedule_wrangling.most_common_shape_by_route_direction(\n",
+ " one_analysis_date\n",
+ ").to_crs(WGS84)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "id": "43ac9053-9287-4ecf-9699-67a07eeede41",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',\n",
+ " 'common_shape_id', 'route_name'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 112,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "common_shape_geom.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 113,
+ "id": "29f05db8-dc82-4f8b-89b4-0fde6ab4c23f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "common_shape_geom2 = common_shape_geom.loc[\n",
+ " common_shape_geom.schedule_gtfs_dataset_key.isin(schd_keys)\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 114,
+ "id": "6338c87c-827a-4ee8-a1d8-352abcb0ad09",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 23 entries, 167 to 1098\n",
+ "Data columns (total 6 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 geometry 23 non-null geometry\n",
+ " 1 schedule_gtfs_dataset_key 23 non-null object \n",
+ " 2 route_id 23 non-null object \n",
+ " 3 direction_id 23 non-null float64 \n",
+ " 4 common_shape_id 23 non-null object \n",
+ " 5 route_name 23 non-null object \n",
+ "dtypes: float64(1), geometry(1), object(4)\n",
+ "memory usage: 1.3+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "common_shape_geom2.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e4a1c69b-b1ec-4474-b998-182e536939ba",
+ "metadata": {},
+ "source": [
+ "##### DONE. This `concatenate_trip_segment_speeds` is from `rt_segment_speeds/scripts/average_segment_speeds`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 115,
+ "id": "d70f10af-fe25-4931-9f2c-2e60fe5b3248",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "concatenated files\n"
+ ]
+ }
+ ],
+ "source": [
+ "df = average_summary_speeds.concatenate_trip_segment_speeds(\n",
+ " analysis_date_list, segment_type\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 116,
+ "id": "99349065-4151-4254-9c88-bf21537e7f27",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2 = df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "51467373-8bf3-4cd4-bd0a-4c47c2c54462",
+ "metadata": {},
+ "source": [
+ "##### DONE **Filled in `direction_id` with 0. Should actually go back to `average_summary_speeds.concatenate_trip_segment_speeds` and fill it in there**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 117,
+ "id": "65a254fc-b944-4d4b-834d-a2d5547b5fc7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_2800/3692506384.py:1: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df2.direction_id = df2.direction_id.fillna(0)\n"
+ ]
+ }
+ ],
+ "source": [
+ "df2.direction_id = df2.direction_id.fillna(0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 118,
+ "id": "7aacc425-3a17-4aca-9e62-371215494557",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 3543 entries, 159381 to 2656608\n",
+ "Data columns (total 17 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 schedule_gtfs_dataset_key 3543 non-null object \n",
+ " 1 shape_array_key 3543 non-null object \n",
+ " 2 shape_id 3543 non-null object \n",
+ " 3 stop_sequence 3543 non-null int64 \n",
+ " 4 route_id 3543 non-null object \n",
+ " 5 direction_id 3543 non-null float64 \n",
+ " 6 stop_pair 3543 non-null object \n",
+ " 7 stop_pair_name 3543 non-null object \n",
+ " 8 trip_instance_key 3543 non-null object \n",
+ " 9 speed_mph 3543 non-null float64 \n",
+ " 10 meters_elapsed 3543 non-null float64 \n",
+ " 11 sec_elapsed 3543 non-null float64 \n",
+ " 12 time_of_day 3543 non-null object \n",
+ " 13 arrival_time 3543 non-null datetime64[ns]\n",
+ " 14 service_date 3543 non-null datetime64[ns]\n",
+ " 15 peak_offpeak 3543 non-null object \n",
+ " 16 weekday_weekend 3543 non-null object \n",
+ "dtypes: datetime64[ns](2), float64(4), int64(1), object(10)\n",
+ "memory usage: 498.2+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df2.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 119,
+ "id": "afd48aa0-2404-4d73-946d-2621be91bb05",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 159381 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 472131 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1320980 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1627284 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1727996 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1754122 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1801423 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1838091 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 1986825 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2277584 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2341443 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2431800 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2491471 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2600819 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2615442 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2656607 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key \\\n",
+ "159381 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "472131 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "1320980 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "1627284 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "1727996 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "1754122 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "1801423 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "1838091 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "1986825 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2277584 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2341443 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2431800 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2491471 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2600819 73105f2d1cabc8170ab066d96863c5d5 \n",
+ "2615442 f5a749dd65924e025b1293c58f95f8d6 \n",
+ "2656607 f5a749dd65924e025b1293c58f95f8d6 \n",
+ "\n",
+ " route_id direction_id \n",
+ "159381 30 0.00 \n",
+ "472131 3 0.00 \n",
+ "1320980 20 0.00 \n",
+ "1627284 4 0.00 \n",
+ "1727996 5 0.00 \n",
+ "1754122 11 0.00 \n",
+ "1801423 7 0.00 \n",
+ "1838091 9 0.00 \n",
+ "1986825 1 0.00 \n",
+ "2277584 12X 0.00 \n",
+ "2341443 6 0.00 \n",
+ "2431800 2 0.00 \n",
+ "2491471 8 0.00 \n",
+ "2600819 8a7c42f9-51e4-4848-bf88-30c210f149ad 0.00 \n",
+ "2615442 CC 0.00 \n",
+ "2656607 CC 1.00 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "preview(df2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4effb93c-6b15-46a2-8365-232989cc563e",
+ "metadata": {},
+ "source": [
+ "##### Continuing on with `average_summary_speeds`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 120,
+ "id": "3a7190ff-b431-4eea-884f-6b0175dcbe69",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trip_group_cols = OPERATOR_COLS + ROUTE_DIR_COLS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 121,
+ "id": "6e2d9095-51c9-4507-be42-393f0abbc194",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trip_avg = (\n",
+ " metrics.weighted_average_speeds_across_segments(\n",
+ " df2,\n",
+ " trip_group_cols + [\"peak_offpeak\"],\n",
+ " )\n",
+ " .pipe(\n",
+ " gtfs_schedule_wrangling.merge_operator_identifiers,\n",
+ " analysis_date_list,\n",
+ " columns=average_segment_speeds.CROSSWALK_COLS,\n",
+ " )\n",
+ " .reset_index(drop=True)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 122,
+ "id": "907dad4a-f806-4090-adbe-3d24eef62348",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " peak_offpeak | \n",
+ " meters_elapsed | \n",
+ " sec_elapsed | \n",
+ " speed_mph | \n",
+ " name | \n",
+ " caltrans_district | \n",
+ " organization_source_record_id | \n",
+ " organization_name | \n",
+ " base64_url | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " 355890.88 | \n",
+ " 60001.00 | \n",
+ " 13.27 | \n",
+ " Santa Maria Schedule | \n",
+ " 05 - San Luis Obispo | \n",
+ " rec9zGMJgNnes75K1 | \n",
+ " City of Santa Maria | \n",
+ " aHR0cHM6Ly9zbXJ0LnRyaXBzaG90LmNvbS92MS9ndGZzLnppcD9yZWdpb25JZD1DQTU1OEREQy1EN0YyLTRCNDgtOUNBQy1ERUVBMTEzNEY4MjA= | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id direction_id peak_offpeak \\\n",
+ "0 73105f2d1cabc8170ab066d96863c5d5 1 0.00 offpeak \n",
+ "\n",
+ " meters_elapsed sec_elapsed speed_mph name \\\n",
+ "0 355890.88 60001.00 13.27 Santa Maria Schedule \n",
+ "\n",
+ " caltrans_district organization_source_record_id organization_name \\\n",
+ "0 05 - San Luis Obispo rec9zGMJgNnes75K1 City of Santa Maria \n",
+ "\n",
+ " base64_url \n",
+ "0 aHR0cHM6Ly9zbXJ0LnRyaXBzaG90LmNvbS92MS9ndGZzLnppcD9yZWdpb25JZD1DQTU1OEREQy1EN0YyLTRCNDgtOUNBQy1ERUVBMTEzNEY4MjA= "
+ ]
+ },
+ "execution_count": 122,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trip_avg.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 123,
+ "id": "db139167-e446-49f9-9285-14011f5bad7e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 30 entries, 0 to 29\n",
+ "Data columns (total 12 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 schedule_gtfs_dataset_key 30 non-null object \n",
+ " 1 route_id 30 non-null object \n",
+ " 2 direction_id 30 non-null float64\n",
+ " 3 peak_offpeak 30 non-null object \n",
+ " 4 meters_elapsed 30 non-null float64\n",
+ " 5 sec_elapsed 30 non-null float64\n",
+ " 6 speed_mph 30 non-null float64\n",
+ " 7 name 30 non-null object \n",
+ " 8 caltrans_district 30 non-null object \n",
+ " 9 organization_source_record_id 30 non-null object \n",
+ " 10 organization_name 30 non-null object \n",
+ " 11 base64_url 30 non-null object \n",
+ "dtypes: float64(4), object(8)\n",
+ "memory usage: 2.9+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "trip_avg.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 124,
+ "id": "a6aae002-e115-4de5-bd36-b2aed09a3f16",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id \\\n",
+ "0 73105f2d1cabc8170ab066d96863c5d5 1 \n",
+ "2 73105f2d1cabc8170ab066d96863c5d5 11 \n",
+ "4 73105f2d1cabc8170ab066d96863c5d5 12X \n",
+ "6 73105f2d1cabc8170ab066d96863c5d5 2 \n",
+ "8 73105f2d1cabc8170ab066d96863c5d5 20 \n",
+ "10 73105f2d1cabc8170ab066d96863c5d5 3 \n",
+ "12 73105f2d1cabc8170ab066d96863c5d5 30 \n",
+ "14 73105f2d1cabc8170ab066d96863c5d5 4 \n",
+ "16 73105f2d1cabc8170ab066d96863c5d5 5 \n",
+ "18 73105f2d1cabc8170ab066d96863c5d5 6 \n",
+ "20 73105f2d1cabc8170ab066d96863c5d5 7 \n",
+ "22 73105f2d1cabc8170ab066d96863c5d5 8 \n",
+ "24 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n",
+ "26 73105f2d1cabc8170ab066d96863c5d5 9 \n",
+ "28 f5a749dd65924e025b1293c58f95f8d6 CC \n",
+ "29 f5a749dd65924e025b1293c58f95f8d6 CC \n",
+ "\n",
+ " direction_id \n",
+ "0 0.00 \n",
+ "2 0.00 \n",
+ "4 0.00 \n",
+ "6 0.00 \n",
+ "8 0.00 \n",
+ "10 0.00 \n",
+ "12 0.00 \n",
+ "14 0.00 \n",
+ "16 0.00 \n",
+ "18 0.00 \n",
+ "20 0.00 \n",
+ "22 0.00 \n",
+ "24 0.00 \n",
+ "26 0.00 \n",
+ "28 0.00 \n",
+ "29 1.00 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "preview(trip_avg)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "05c11760-6000-4165-9d9c-3be8de68e087",
+ "metadata": {},
+ "source": [
+ "##### Skipping this part because I can't find `MIN_TRIP_SECONDS` and `MAX_TRIP_SECONDS` in `dict_input`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7aa47719-0302-440e-b15a-0443bb7e18dd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\"\"\" trip_avg_filtered = trip_avg[\n",
+ " (trip_avg.meters_elapsed >= average_summary_speeds.METERS_CUTOFF) & \n",
+ " (trip_avg.sec_elapsed >= average_summary_speeds.MIN_TRIP_SECONDS) & \n",
+ " (trip_avg.sec_elapsed <= average_summary_speeds.MAX_TRIP_SECONDS)\n",
+ " ]\n",
+ " \"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 125,
+ "id": "a5a2d768-cd9d-40cd-903b-a04c3e8ad33e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "group_cols = OPERATOR_COLS + ROUTE_DIR_COLS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 126,
+ "id": "e7408973-da0a-4d0f-9b8f-a178310bd5b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "avg_speeds = (\n",
+ " metrics.concatenate_peak_offpeak_allday_averages(\n",
+ " trip_avg, group_cols, metric_type=\"summary_speeds\"\n",
+ " )\n",
+ " .pipe(\n",
+ " gtfs_schedule_wrangling.merge_operator_identifiers,\n",
+ " analysis_date_list,\n",
+ " columns=average_segment_speeds.CROSSWALK_COLS,\n",
+ " )\n",
+ " .reset_index(drop=True)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 127,
+ "id": "55a0be8a-b1bc-4138-8a90-49885512cf53",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 42 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id \\\n",
+ "0 73105f2d1cabc8170ab066d96863c5d5 1 \n",
+ "2 73105f2d1cabc8170ab066d96863c5d5 11 \n",
+ "4 73105f2d1cabc8170ab066d96863c5d5 12X \n",
+ "6 73105f2d1cabc8170ab066d96863c5d5 2 \n",
+ "8 73105f2d1cabc8170ab066d96863c5d5 20 \n",
+ "10 73105f2d1cabc8170ab066d96863c5d5 3 \n",
+ "12 73105f2d1cabc8170ab066d96863c5d5 30 \n",
+ "14 73105f2d1cabc8170ab066d96863c5d5 4 \n",
+ "16 73105f2d1cabc8170ab066d96863c5d5 5 \n",
+ "18 73105f2d1cabc8170ab066d96863c5d5 6 \n",
+ "20 73105f2d1cabc8170ab066d96863c5d5 7 \n",
+ "22 73105f2d1cabc8170ab066d96863c5d5 8 \n",
+ "24 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n",
+ "26 73105f2d1cabc8170ab066d96863c5d5 9 \n",
+ "42 f5a749dd65924e025b1293c58f95f8d6 CC \n",
+ "43 f5a749dd65924e025b1293c58f95f8d6 CC \n",
+ "\n",
+ " direction_id \n",
+ "0 0.00 \n",
+ "2 0.00 \n",
+ "4 0.00 \n",
+ "6 0.00 \n",
+ "8 0.00 \n",
+ "10 0.00 \n",
+ "12 0.00 \n",
+ "14 0.00 \n",
+ "16 0.00 \n",
+ "18 0.00 \n",
+ "20 0.00 \n",
+ "22 0.00 \n",
+ "24 0.00 \n",
+ "26 0.00 \n",
+ "42 0.00 \n",
+ "43 1.00 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "preview(avg_speeds)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 128,
+ "id": "287deabd-5043-4d7c-bcee-5952a2b0ad30",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 46 entries, 0 to 45\n",
+ "Data columns (total 12 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 schedule_gtfs_dataset_key 46 non-null object \n",
+ " 1 route_id 46 non-null object \n",
+ " 2 direction_id 46 non-null float64\n",
+ " 3 time_period 46 non-null object \n",
+ " 4 meters_elapsed 46 non-null float64\n",
+ " 5 sec_elapsed 46 non-null float64\n",
+ " 6 speed_mph 46 non-null float64\n",
+ " 7 name 46 non-null object \n",
+ " 8 caltrans_district 46 non-null object \n",
+ " 9 organization_source_record_id 46 non-null object \n",
+ " 10 organization_name 46 non-null object \n",
+ " 11 base64_url 46 non-null object \n",
+ "dtypes: float64(4), object(8)\n",
+ "memory usage: 4.4+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "avg_speeds.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 129,
+ "id": "01efe186-e8f1-4c6d-941b-faf332a77281",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "avg_speeds_with_geom = average_summary_speeds.merge_in_common_shape_geometry(\n",
+ " avg_speeds, one_analysis_date\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 130,
+ "id": "32228a19-0133-4d23-91e5-f60b34907b2a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 7 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 6 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " f5a749dd65924e025b1293c58f95f8d6 | \n",
+ " CC | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 12X | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 11 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 30 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 2 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 3 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 31 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 20 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 34 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 37 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 4 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 9 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id \\\n",
+ "0 73105f2d1cabc8170ab066d96863c5d5 7 \n",
+ "3 73105f2d1cabc8170ab066d96863c5d5 6 \n",
+ "6 f5a749dd65924e025b1293c58f95f8d6 CC \n",
+ "8 f5a749dd65924e025b1293c58f95f8d6 CC \n",
+ "10 73105f2d1cabc8170ab066d96863c5d5 8 \n",
+ "13 73105f2d1cabc8170ab066d96863c5d5 12X \n",
+ "16 73105f2d1cabc8170ab066d96863c5d5 11 \n",
+ "19 73105f2d1cabc8170ab066d96863c5d5 30 \n",
+ "22 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n",
+ "25 73105f2d1cabc8170ab066d96863c5d5 2 \n",
+ "28 73105f2d1cabc8170ab066d96863c5d5 3 \n",
+ "31 73105f2d1cabc8170ab066d96863c5d5 20 \n",
+ "34 73105f2d1cabc8170ab066d96863c5d5 5 \n",
+ "37 73105f2d1cabc8170ab066d96863c5d5 4 \n",
+ "40 73105f2d1cabc8170ab066d96863c5d5 9 \n",
+ "43 73105f2d1cabc8170ab066d96863c5d5 1 \n",
+ "\n",
+ " direction_id \n",
+ "0 0.00 \n",
+ "3 0.00 \n",
+ "6 1.00 \n",
+ "8 0.00 \n",
+ "10 0.00 \n",
+ "13 0.00 \n",
+ "16 0.00 \n",
+ "19 0.00 \n",
+ "22 0.00 \n",
+ "25 0.00 \n",
+ "28 0.00 \n",
+ "31 0.00 \n",
+ "34 0.00 \n",
+ "37 0.00 \n",
+ "40 0.00 \n",
+ "43 0.00 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "preview(avg_speeds_with_geom)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 131,
+ "id": "7ce0e0b6-7dbf-4822-a4f1-5419e27539c9",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 46 entries, 0 to 45\n",
+ "Data columns (total 14 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 schedule_gtfs_dataset_key 46 non-null object \n",
+ " 1 route_id 46 non-null object \n",
+ " 2 direction_id 46 non-null float64 \n",
+ " 3 time_period 46 non-null object \n",
+ " 4 meters_elapsed 46 non-null float64 \n",
+ " 5 sec_elapsed 46 non-null float64 \n",
+ " 6 speed_mph 46 non-null float64 \n",
+ " 7 name 46 non-null object \n",
+ " 8 caltrans_district 46 non-null object \n",
+ " 9 organization_source_record_id 46 non-null object \n",
+ " 10 organization_name 46 non-null object \n",
+ " 11 base64_url 46 non-null object \n",
+ " 12 route_name 46 non-null object \n",
+ " 13 geometry 46 non-null geometry\n",
+ "dtypes: float64(4), geometry(1), object(9)\n",
+ "memory usage: 5.2+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "avg_speeds_with_geom.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5c1ce92d-dceb-4ae4-affa-cc1daecd5f89",
+ "metadata": {},
+ "source": [
+ "##### Double check that my work matches what's in `gtfs_digest/merge_data`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 132,
+ "id": "cbc4f65b-fdbd-4a33-ac39-36b753c5171c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_avg_speeds_og = pd.read_parquet(\n",
+ " \"gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_route_dir_2024-11-13.parquet\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 133,
+ "id": "978581b2-15fb-419f-af04-ce4c26b2f59a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_avg_speeds_og = df_avg_speeds_og.loc[\n",
+ " df_avg_speeds_og.schedule_gtfs_dataset_key.isin(schd_keys)\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 134,
+ "id": "b77336a5-eb1e-4a1a-b82f-75d9f63334a3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1000 | \n",
+ " 73105f2d1cabc8170ab066d96863c5d5 | \n",
+ " 5 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id direction_id\n",
+ "1000 73105f2d1cabc8170ab066d96863c5d5 5 0.00"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "preview(df_avg_speeds_og)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 135,
+ "id": "a1191e33-c311-4d5a-b907-db2540d53c59",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',\n",
+ " 'meters_elapsed', 'sec_elapsed', 'speed_mph', 'name',\n",
+ " 'caltrans_district', 'organization_source_record_id',\n",
+ " 'organization_name', 'base64_url', 'route_name', 'geometry'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 135,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_avg_speeds_og.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 136,
+ "id": "586d9d8f-df66-43a6-ba70-00b8652e7697",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',\n",
+ " 'meters_elapsed', 'sec_elapsed', 'speed_mph', 'name',\n",
+ " 'caltrans_district', 'organization_source_record_id',\n",
+ " 'organization_name', 'base64_url', 'route_name', 'geometry'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 136,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "avg_speeds_with_geom.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "09f5b44b-5c7b-433d-aab7-3ccb4f86718f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_avg_speeds = avg_speeds_with_geom.copy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6bc456bb-8f26-429e-acc9-0cf80b380f76",
+ "metadata": {},
+ "source": [
+ "#### Dataframe in line 307 `df_rt_sched` in `gtfs_digest/merge_data`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "17ac16fc-791a-4b6d-ade1-73bd278bfbe3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(\n",
+ " analysis_date_list\n",
+ ").astype({\"direction_id\": \"float\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "00fa91ac-5933-47e8-9254-6e0200b51b23",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched2 = df_rt_sched.loc[df_rt_sched.schedule_gtfs_dataset_key.isin(schd_keys)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ef91794e-3a0d-40d2-9b0b-45f86f9fc636",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preview(df_rt_sched2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5fb3db36-9f49-4c0a-a725-b631f83ff1fd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched2.head(2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "37e4e50d-e658-4759-861d-32f123d7d7db",
+ "metadata": {},
+ "source": [
+ "##### `df_rt_sched` is created using [`rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`](https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8134248e-c1c4-42eb-b609-dd7cf266fc6c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "[*GTFS_DATA_DICT[\"stop_segments\"][\"route_dir_cols\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9a8b3cc0-24c0-4a07-affc-797542ba6d03",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d0e6de74-24ad-4c09-8de5-cc25e6dea8e9",
+ "metadata": {},
+ "source": [
+ "##### `route_metrics` in `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f40d89aa-5b32-481d-a86c-eb18e0c94e63",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "TRIP_EXPORT = dict_inputs.vp_trip_metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "117a707c-28c1-4f38-ab07-c331ae35c916",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "db216b8a-5ee0-4ea6-8238-a0e89a8b4a11",
+ "metadata": {},
+ "source": [
+ "##### DONE **Everything is available in `trip_df`. Fill in missing `direction_id` values with 0.**\n",
+ "* Where is `trip_df` created again?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d2affb77-895f-406d-a6c9-b9c595347156",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trip_df = pd.read_parquet(f\"{RT_SCHED_GCS}{TRIP_EXPORT}_{one_analysis_date}.parquet\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dbbcd9b4-52b1-4860-9d67-d87b00087b4b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Take an explicit copy of the filtered rows: `direction_id` is overwritten on\n",
+ "# this frame in a later cell, and assigning into a `.loc` slice of `trip_df`\n",
+ "# triggers pandas' SettingWithCopyWarning.\n",
+ "trip_df2 = trip_df.loc[trip_df.schedule_gtfs_dataset_key.isin(schd_keys)].copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5424c44f-a4c8-45c8-8f84-e386a97b35d9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trip_df2.direction_id = trip_df2.direction_id.fillna(0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "64df0dca-b76e-4024-9b98-4ad58bae2f9b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trip_df2.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9e7853f9-deba-43c4-97e9-733b0e4b5668",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preview(trip_df2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b61d5a44-a403-42ab-b7e4-b61e4ac133e4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trip_df2.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0ce4e0d1-f639-4f6e-8062-b3513feb071f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trip_df2.loc[trip_df2.time_of_day == \"AM Peak\"].drop(\n",
+ " columns=[\"schedule_gtfs_dataset_key\", \"trip_instance_key\"]\n",
+ ").sort_values(by=[\"route_id\"]).drop_duplicates(\n",
+ " subset=[\n",
+ " \"route_id\",\n",
+ " \"direction_id\",\n",
+ " ]\n",
+ ").T"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "749f4347-3a7f-45a5-9d5b-5bb159c8baba",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "\n",
+ "sys.path.append(\"../rt_scheduled_v_ran/scripts\")\n",
+ "import rt_v_scheduled_routes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f4abfe0c-d6ac-4912-8452-c67f361930f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ROUTE_DIR_COLS = [*GTFS_DATA_DICT[\"stop_segments\"][\"route_dir_cols\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6936d9d7-a8e4-4029-9214-ccf3df2c409b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "crosswalk_cols = [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"name\",\n",
+ " \"schedule_source_record_id\",\n",
+ " \"base64_url\",\n",
+ " \"organization_source_record_id\",\n",
+ " \"organization_name\",\n",
+ " \"caltrans_district\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d00aa8e3-a0b5-4401-8995-e087871994b2",
+ "metadata": {},
+ "source": [
+ "##### Have to break out `metrics.concatenate_peak_offpeak_allday_averages` which is in `rt_segment_speeds/segment_speed_utils/` because all of the routes are missing."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2930b17e-6e1f-4a6a-b429-6aa4f8ca38fe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_df = (\n",
+ " metrics.concatenate_peak_offpeak_allday_averages(\n",
+ " trip_df2,\n",
+ " group_cols=[\"schedule_gtfs_dataset_key\"] + ROUTE_DIR_COLS,\n",
+ " metric_type=\"rt_vs_schedule\",\n",
+ " )\n",
+ " .pipe(metrics.derive_rt_vs_schedule_metrics)\n",
+ " .pipe(rt_v_scheduled_routes.average_rt_trip_times)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "805ac735-490a-4f47-a67a-327792f670a9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preview(route_df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e8549cb3-776e-4b1b-bff8-358ded0f4134",
+ "metadata": {},
+ "source": [
+ "`calculate_avg_speeds` is from `rt_segment_speeds/segment_speed_utils/segment_calcs.py` -> added `dropna=False`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "33059d74-1776-46b0-8fff-386667fa8332",
+ "metadata": {},
+ "source": [
+ "DONE `calculate_weighted_average_vp_schedule_metrics` is from `rt_segment_speeds/segment_speed_utils/metrics` -> added `dropna=False`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "942a86d6-3ed0-4edb-a4d0-5577164464af",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def calculate_weighted_average_vp_schedule_metrics(\n",
+ "    df: pd.DataFrame,\n",
+ "    group_cols: list,\n",
+ ") -> pd.DataFrame:\n",
+ "    \"\"\"\n",
+ "    Aggregate trip-level rows up to `group_cols`.\n",
+ "\n",
+ "    Local copy of the function in\n",
+ "    `rt_segment_speeds/segment_speed_utils/metrics` with `dropna=False`\n",
+ "    added to the groupby, so groups whose keys contain NaN are kept.\n",
+ "    The service-minute, vp and early/ontime/late columns are summed, and\n",
+ "    non-null `trip_instance_key` values are counted into `n_vp_trips`.\n",
+ "    \"\"\"\n",
+ "    # Every metric column is summed; the trip key is counted instead.\n",
+ "    aggregations = dict.fromkeys(\n",
+ "        [\n",
+ "            \"minutes_atleast1_vp\",\n",
+ "            \"minutes_atleast2_vp\",\n",
+ "            \"rt_service_minutes\",\n",
+ "            \"scheduled_service_minutes\",\n",
+ "            \"total_vp\",\n",
+ "            \"vp_in_shape\",\n",
+ "            \"is_early\",\n",
+ "            \"is_ontime\",\n",
+ "            \"is_late\",\n",
+ "        ],\n",
+ "        \"sum\",\n",
+ "    )\n",
+ "    aggregations[\"trip_instance_key\"] = \"count\"\n",
+ "\n",
+ "    grouped = df.groupby(group_cols, observed=True, group_keys=False, dropna=False)\n",
+ "    totals = grouped.agg(aggregations).reset_index()\n",
+ "\n",
+ "    return totals.rename(columns={\"trip_instance_key\": \"n_vp_trips\"})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "99c92a5c-a7f0-41b3-be41-4376a72d1140",
+ "metadata": {},
+ "source": [
+ "DONE `weighted_average_speeds_across_segments` is from `rt_segment_speeds/segment_speed_utils/metrics` -> added `dropna=False`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7ba76d14-9458-481f-a59e-c69a944000b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def weighted_average_speeds_across_segments(\n",
+ "    df: pd.DataFrame, group_cols: list\n",
+ ") -> pd.DataFrame:\n",
+ "    \"\"\"\n",
+ "    Roll segment-level deltas up to `group_cols` (e.g. trip or\n",
+ "    route-direction) and derive an average speed from the totals.\n",
+ "\n",
+ "    The raw `meters_elapsed` and `sec_elapsed` deltas are summed rather\n",
+ "    than taking mean(speed_mph), because segments can have varying\n",
+ "    lengths and the average should be weighted accordingly.\n",
+ "    `dropna=False` keeps groups whose keys contain NaN.\n",
+ "    \"\"\"\n",
+ "    elapsed_totals = {\n",
+ "        \"meters_elapsed\": \"sum\",\n",
+ "        \"sec_elapsed\": \"sum\",\n",
+ "    }\n",
+ "\n",
+ "    grouped = df.groupby(group_cols, observed=True, group_keys=False, dropna=False)\n",
+ "    summed = grouped.agg(elapsed_totals).reset_index()\n",
+ "\n",
+ "    # Speed is derived from the summed deltas by the shared helper.\n",
+ "    return segment_calcs.speed_from_meters_elapsed_sec_elapsed(summed)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b3bb0bb3-9065-40c2-9f0c-52806fc00e86",
+ "metadata": {},
+ "source": [
+ "`concatenate_peak_offpeak_allday_averages` is from `rt_segment_speeds/segment_speed_utils/metrics`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a913f3fd-cfea-4956-8e9e-8d01974b3b85",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def concatenate_peak_offpeak_allday_averages(\n",
+ "    df: pd.DataFrame, group_cols: list, metric_type: str\n",
+ ") -> pd.DataFrame:\n",
+ "    \"\"\"\n",
+ "    Calculate averages for all day and for each `peak_offpeak` value,\n",
+ "    then concatenate them so metrics are always reported for the same\n",
+ "    3 time periods (peak, offpeak, all_day) under a `time_period` column.\n",
+ "\n",
+ "    `metric_type` selects the aggregation:\n",
+ "    'segment_speeds', 'summary_speeds' or 'rt_vs_schedule'.\n",
+ "\n",
+ "    Raises ValueError for any other `metric_type`. Previously this case\n",
+ "    only printed a hint and then failed on the unbound `avg_peak`.\n",
+ "    \"\"\"\n",
+ "    # Resolve the aggregator lazily (if/elif, not a dict) so that only the\n",
+ "    # helper for the requested metric_type has to be defined in the notebook.\n",
+ "    if metric_type == \"segment_speeds\":\n",
+ "        aggregate = calculate_avg_speeds\n",
+ "    elif metric_type == \"summary_speeds\":\n",
+ "        aggregate = weighted_average_speeds_across_segments\n",
+ "    elif metric_type == \"rt_vs_schedule\":\n",
+ "        aggregate = calculate_weighted_average_vp_schedule_metrics\n",
+ "    else:\n",
+ "        raise ValueError(\n",
+ "            f\"Invalid metric_type {metric_type!r}. \"\n",
+ "            \"Valid metric types: ['segment_speeds', 'summary_speeds', 'rt_vs_schedule']\"\n",
+ "        )\n",
+ "\n",
+ "    avg_peak = aggregate(df, group_cols + [\"peak_offpeak\"])\n",
+ "    avg_allday = aggregate(df, group_cols).assign(peak_offpeak=\"all_day\")\n",
+ "\n",
+ "    # Concatenate so that every group has 3 time periods: peak, offpeak, and all_day\n",
+ "    avg_metrics = pd.concat([avg_peak, avg_allday], axis=0, ignore_index=True).rename(\n",
+ "        columns={\"peak_offpeak\": \"time_period\"}\n",
+ "    )\n",
+ "\n",
+ "    return avg_metrics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a91fa61a-0e3f-4d08-a3de-90afc9e7ea1c",
+ "metadata": {},
+ "source": [
+ "##### Going back to `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "84ba1eca-7b33-4822-8771-ec2137a2415a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_metrics_df = concatenate_peak_offpeak_allday_averages(\n",
+ " trip_df2,\n",
+ " group_cols=[\"schedule_gtfs_dataset_key\"] + ROUTE_DIR_COLS,\n",
+ " metric_type=\"rt_vs_schedule\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5adb841a-436b-4a04-a9dc-3b061d01c875",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preview(route_metrics_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "30b4fe2b-47b4-4bef-9c03-ee8d8e163452",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_metrics_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c9011ce-c7f9-4eee-bf89-406f97b3dddc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_metrics_df.route_id.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f7a02de0-168a-420c-ae0f-063e0824eea7",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "route_metrics_df.loc[route_metrics_df.route_id == \"CC\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce66eab0-999b-48d2-9f5c-c54c9e1d09de",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_metrics_df = route_metrics_df.pipe(metrics.derive_rt_vs_schedule_metrics)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dfe011a8-52ac-4240-a094-8bf270fdb420",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_metrics_df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1378f515-fa23-44b4-b556-f3eaaeaf077b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_metrics_df2 = route_metrics_df.pipe(rt_v_scheduled_routes.average_rt_trip_times)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ca5edbfd-cd6b-4bc9-9074-4e14a80be2dd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_metrics_df2.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "44fd4153-2f1f-41fa-8262-cf7ea5200f97",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_metrics_df3 = gtfs_schedule_wrangling.merge_operator_identifiers(\n",
+ " route_metrics_df2, [one_analysis_date], columns=crosswalk_cols\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ef670a71-4496-44fe-9d66-8c6e51315993",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_metrics_df3.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8c3af40a-c9d4-475a-b22e-687a07617994",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Inspect the peak rows without the operator identifier columns.\n",
+ "# Those columns are only merged in by `merge_operator_identifiers`, so this\n",
+ "# has to use `route_metrics_df3`; dropping them from `route_metrics_df`\n",
+ "# would raise a KeyError.\n",
+ "route_metrics_df3.loc[route_metrics_df3.time_period == \"peak\"].drop(\n",
+ "    columns=[\n",
+ "        \"schedule_gtfs_dataset_key\",\n",
+ "        \"schedule_source_record_id\",\n",
+ "        \"base64_url\",\n",
+ "        \"organization_name\",\n",
+ "        \"organization_source_record_id\",\n",
+ "        \"caltrans_district\",\n",
+ "    ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "34f85df9-e8f8-432a-9c88-075418b41250",
+ "metadata": {},
+ "source": [
+ "##### Check that the rebuilt `df_rt_sched` equals the original `df_rt_sched_og`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a6b611fa-f06d-4a35-a5bb-d8101bc4e867",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched_og = merge_data.concatenate_rt_vs_schedule_by_route_direction(\n",
+ " analysis_date_list\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2521cbd7-2434-42d9-b84d-c22cbe5bf518",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched = route_metrics_df3.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "65f3085b-dd9a-432b-ba12-d0f916c142e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched_og.columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "431622bf-0f4c-486b-8eea-83451892aa2e",
+ "metadata": {},
+ "source": [
+ "##### All these columns pop up around the step of `gtfs_schedule_wrangling.merge_operator_identifiers` because the extra columns match what is in `crosswalk_cols`?? "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "238bd321-c642-4e32-b327-85a684e7f4b2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "152826be-d2d7-4d25-aa5f-1c48a99c0c25",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched = df_rt_sched.drop(\n",
+ " columns=[\n",
+ " \"base64_url\",\n",
+ " \"organization_source_record_id\",\n",
+ " \"organization_name\",\n",
+ " \"caltrans_district\",\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8365d9b3-8598-4405-b31c-0431a1fb2b39",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched_og.loc[\n",
+ " df_rt_sched_og.schedule_gtfs_dataset_key.isin(schd_keys)\n",
+ "].route_id.nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2c1572f2-017c-44f4-a138-b5341c03cefa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched.route_id.nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "21691913-e7a2-448a-86aa-cc34c967a66c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "682b194e-29cb-4a6d-b321-35f7f0177774",
+ "metadata": {},
+ "source": [
+ "#### `gtfs_digest/merge_data/` line 316: `df_crosswalk`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "64c14f30-577f-41ec-80e9-2bf6759f074f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "09f4f8d9-0398-4b9d-a2eb-28289fc6d145",
+ "metadata": {},
+ "source": [
+ "#### `gtfs_digest/merge_data/merge_data_sources_by_route_direction`\n",
+ "* Have to make some tweaks since `df_avg_speeds2` is missing a lot of routes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "63679d40-8d3e-45e8-9b10-fa391787a4f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "service_date_datetime = pd.to_datetime(\"2024-11-13T00:00:00.000000000\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7932a990-d3ee-42f9-b7f3-0ca9fc8fd4d8",
+ "metadata": {},
+ "source": [
+ "##### Why are `time_period` and `peak_offpeak` different between `df_sched` and `df_rt_sched`?\n",
+ "* Something is wrong with `df_sched` because a lot of `time_period` values are missing.\n",
+ "##### Amanda, test: fill in `nans` in `time_period` with `peak_offpeak`\n",
+ "* This might solve why all the routes are missing in Nov/Dec too?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1bb1555f-cc0b-411d-bfc2-167c4f8d8859",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "df_rt_sched[[\"route_id\", \"time_period\", \"direction_id\"]].drop_duplicates().sort_values(\n",
+ " by=[\"route_id\", \"direction_id\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d676de3a-0322-43d6-847a-e4df00e59c32",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "df_sched[[\"route_id\", \"time_period\", \"direction_id\"]].drop_duplicates().sort_values(\n",
+ " by=[\"route_id\", \"direction_id\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "39fa369e-2482-4337-b130-08b8a799b5d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_sched[\"service_date\"] = service_date_datetime"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "825126f3-4363-45d4-9e09-ac80f850f19e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched[\"service_date\"] = service_date_datetime"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b8ac692d-38e7-46c4-bffb-83663f81aa96",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_avg_speeds[\"service_date\"] = service_date_datetime"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "febaf0e2-4328-4f27-b698-0b3ad6824369",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# merge1 = merge_data.merge_data_sources_by_route_direction(\n",
+ "# route_dir_metrics2,\n",
+ "# df_rt_sched,\n",
+ "# df_avg_speeds2,\n",
+ "# df_crosswalk\n",
+ "# )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cd262bde-0d33-430d-aab9-71c5bcab2741",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "primary_typology = merge_data.set_primary_typology(route_dir_metrics2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "14979024-1101-472f-909a-5e3aca484f9c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "primary_typology.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a6fc2e2c-5525-4af3-8c8f-0f9046fc54a6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_time_cols = [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"route_id\",\n",
+ " \"direction_id\",\n",
+ " \"time_period\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d46d5ef4-81b5-45c5-8aaa-398e70472aca",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_schedule2 = pd.merge(df_sched, primary_typology, on=route_time_cols, how=\"left\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "881ccf54-00a2-4de9-b660-bfb83a749da4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_schedule2.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e8902f56-2f31-4c6a-be94-f3da7ef66645",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_time_cols"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c8979573-6e86-4660-ac9d-8dfe2cb8a624",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_schedule2.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5345717b-819a-43a9-b757-5770795dc75f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "007774e5-d6ab-43aa-8cc8-ecda0ac30e6d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_avg_speeds.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b403a4ab-1dbd-4e6b-bfe8-f4ab9fdf4e2a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.merge(\n",
+ " df_schedule2,\n",
+ " df_rt_sched,\n",
+ " on=route_time_cols + [\"service_date\"],\n",
+ " how=\"outer\",\n",
+ " indicator=\"sched_rt_category\",\n",
+ ").merge(\n",
+ " df_avg_speeds,\n",
+ " on=route_time_cols + [\"service_date\"],\n",
+ " how=\"outer\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9e74a489-63e9-44ce-bd04-7fa5b140b1f9",
+ "metadata": {},
+ "source": [
+ "##### Check that all the routes are here."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bcf2314a-31ae-4b73-93b7-68d683dae795",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.route_id.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fe2a49fc-947e-4daf-86fd-b09062cfb9ab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.route_id.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e25dab9a-13ae-4dce-ae13-ba4eaaa2307d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce2255dd-ca78-490a-9bb4-9926a8c18d12",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2 = df.assign(\n",
+ " sched_rt_category=df.sched_rt_category.map(\n",
+ " gtfs_schedule_wrangling.sched_rt_category_dict\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "12d1522c-4338-4355-8185-50ab74745d77",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c7460296-a3fd-46f2-b450-835d1705076d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df3 = df2.pipe(\n",
+ " merge_data.merge_in_standardized_route_names,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "91241657-7be4-4967-9e55-7b18a8659529",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df3.columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9220b9cc-7683-479b-a288-1832d414ea17",
+ "metadata": {},
+ "source": [
+ "###### Extra columns are popping up; they are detailed below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2a289464-c5bc-4691-b87b-ce88a8960f07",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "drop_cols = [\n",
+ " \"schedule_source_record_id\",\n",
+ " \"base64_url\",\n",
+ " \"organization_source_record_id\",\n",
+ " \"organization_name\",\n",
+ " \"caltrans_district\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "47440343-04d9-4f00-adfc-22a6ad288763",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df4 = pd.merge(\n",
+ " df3.drop(columns=drop_cols),\n",
+ " df_crosswalk,\n",
+ " on=[\"schedule_gtfs_dataset_key\", \"name\", \"service_date\"],\n",
+ " how=\"left\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "73244d16-3ca8-4d70-934b-4d2c6b8907f6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df4.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3b3ef87b-f4b6-49b3-8cbe-0f7011ee022f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df4.route_id.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2319efc6-c98e-4f3e-93ec-6160381af404",
+ "metadata": {},
+ "source": [
+ "##### Lots of repeated columns. Why?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "94eb667e-ebd4-489c-80b7-4419eb670a45",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df4.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "37d5c006-b5cb-4626-b22a-5e8d557ea6be",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df5 = df4.pipe(\n",
+ " # Find the most common cardinal direction\n",
+ " gtfs_schedule_wrangling.top_cardinal_direction\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1515317b-2747-4658-abcf-495a520b511f",
+ "metadata": {},
+ "source": [
+ "#### Observations\n",
+ "* There are no typologies for these previously missing routes.\n",
+ "* `route_primary_direction` and `direction_id` are empty for all of City of Santa Maria.\n",
+ "* `route_id` values are repeated; something went wrong during the merges.\n",
+ "* I have an extra column for `peak_offpeak` and `time_period`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4d6b6853-10cf-4a99-8b34-63e8704e4874",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df5.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dbddc7d0-9c03-4563-b799-36529a257203",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df[[\"time_period\", \"route_id\"]].drop_duplicates().sort_values(by=[\"route_id\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "77c95343-ceb5-4e3d-b87c-a75b822ba4cf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df5.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eb1b1ea0-5aff-46ee-8c23-47d218759ecc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preview_cols = [\n",
+ " \"organization_name\",\n",
+ " \"route_id\",\n",
+ " \"sched_rt_category\",\n",
+ " \"direction_id\",\n",
+ " \"route_primary_direction\",\n",
+ " \"avg_scheduled_service_minutes\",\n",
+ " \"avg_stop_miles\",\n",
+ " \"n_trips\",\n",
+ " \"time_period\",\n",
+ " \"frequency\",\n",
+ " \"typology\",\n",
+ " \"minutes_atleast1_vp\",\n",
+ " \"minutes_atleast2_vp\",\n",
+ " \"total_rt_service_minutes\",\n",
+ " \"total_scheduled_service_minutes\",\n",
+ " \"total_vp\",\n",
+ " \"vp_in_shape\",\n",
+ " \"is_early\",\n",
+ " \"is_ontime\",\n",
+ " \"is_late\",\n",
+ " \"n_vp_trips\",\n",
+ " \"vp_per_minute\",\n",
+ " \"pct_in_shape\",\n",
+ " \"pct_rt_journey_atleast1_vp\",\n",
+ " \"pct_rt_journey_atleast2_vp\",\n",
+ " \"pct_sched_journey_atleast1_vp\",\n",
+ " \"pct_sched_journey_atleast2_vp\",\n",
+ " \"rt_sched_journey_ratio\",\n",
+ " \"avg_rt_service_minutes\",\n",
+ " \"speed_mph\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5854cf7b-c98f-4a36-9c86-e17234f299ea",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "df5.groupby(\n",
+ " [\n",
+ " \"route_id\",\n",
+ " \"sched_rt_category\",\n",
+ " ]\n",
+ ").agg({\"organization_name\": \"count\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7ef1f3e6-df31-4141-89a4-9b71f51bc1c7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df5.loc[df5.route_id == \"1B\"][preview_cols].sort_values(\n",
+ " by=[\"organization_name\", \"route_id\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "712eb63d-5352-40ad-a856-1a3066f13b96",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "df5.loc[df5.time_period == \"peak\"][preview_cols].sort_values(\n",
+ " by=[\"organization_name\", \"route_id\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9df0ccd6-5573-42c5-b31a-0fb6e95b506c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stop"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "46093153-c813-4684-a6a0-1c163589c41f",
+ "metadata": {},
+ "source": [
+ "### Fix `ROUTE_TYPOLOGIES` in `gtfs_funnel/route_typologies.py`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "229e3c18-dedc-4baa-b583-72679b06b7b6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ROUTE_TYPOLOGIES"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "86d73125-6624-45b1-8463-7d3cd4c6f613",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "GTFS_DATA_DICT.schedule_tables.route_typologies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "54d371c5-1143-4649-bc21-280188257722",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "GTFS_DATA_DICT.schedule_tables.route_typologies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "88fe3778-c61e-4b35-89dd-2d8593a739a8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_typologies2 = route_typologies.loc[\n",
+ " route_typologies.schedule_gtfs_dataset_key.isin(schd_keys)\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e9693a0-053e-4d9d-a250-434144929cb5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_typologies2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d04a785f-ee9a-4bd5-8490-aefed10e34b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_dir_cols = [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"route_id\",\n",
+ " \"direction_id\",\n",
+ " \"common_shape_id\",\n",
+ " \"route_name\",\n",
+ " \"route_meters\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "98a3fbab-daba-423c-8d55-e9008d427baf",
+ "metadata": {},
+ "source": [
+ "##### Amanda: in `rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling`, I filled `nan` rows in `direction_id`. Then I commented out parts of `gtfs_funnel/route_typologies`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "95bbd6b2-9e0a-4ba4-b953-f5eb047828f9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(\n",
+ " one_analysis_date\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f701bf69-7c72-4ec7-a13a-2aea089cd71b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "common_shape2 = common_shape.loc[common_shape.schedule_gtfs_dataset_key.isin(schd_keys)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "982eea55-9177-4031-81d7-fcc20a5b988e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nov_typology_ah_test_df = pd.read_parquet(\n",
+ " \"gs://calitp-analytics-data/data-analyses/gtfs_schedule/nacto_typologies/route_typologies_AH_TESTING_2024-11-13.parquet\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6bf465f8-4775-47c9-b864-857724ad739d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nov_typology_ah_test_df.loc[\n",
+ " nov_typology_ah_test_df.schedule_gtfs_dataset_key.isin(schd_keys)\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "77a3f68d-1244-4306-920c-27aed2f543bd",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Fix Map: `gtfs_digest/merge_operator_data`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cb99b4b5-7745-422c-a6c5-153f02ffc244",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles\n",
+ "OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "55faff71-f82c-46fc-a99d-dcc40205e100",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "operator_route_gdf = gpd.read_parquet(\n",
+ " f\"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9b2c852f-f053-406a-8274-8b4f015f10c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "operator_route_gdf.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "89ccde0b-736c-4fc9-a294-8a12116823a8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "operator_route_gdf2.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "295aaf35-9ade-4f9e-bc4d-5b8ef95a1569",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "len(operator_route_gdf2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5630aaaa-dc8b-4917-b9fa-ae0924999720",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "operator_route_gdf2.is_rail.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3ecc56aa-63ce-402b-8136-a847fd5c0d11",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "operator_route_gdf2.organization_name.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5798606e-2ea4-4ab0-a6d8-a5597a51e66f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "operator_route_gdf2.schedule_gtfs_dataset_key.unique()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "26d11950-fca8-4f5b-8d17-2b9fa0aa368c",
+ "metadata": {},
+ "source": [
+ "#### Why does City of Santa Maria have multiple schedule_gtfs_dataset_keys?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "81fbd586-cc2d-4a70-97a6-5b25228684b8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "operator_route_gdf2.groupby([\"organization_name\", \"schedule_gtfs_dataset_key\"]).agg(\n",
+ " {\"route_short_name\": \"nunique\"}\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "568e2a00-8f8c-451c-8b6d-ae331d18471c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "operator_route_gdf2.drop(columns=[\"service_date\"]).explore(\"organization_name\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bd466515-a3cd-473a-a01a-2e73f9507104",
"metadata": {},
"outputs": [],
"source": [
- "longest_shape_gdf2 = longest_shape_gdf2.dropna()"
+ "# operator_route_gdf2.drop(columns = [\"service_date\"]).explore(\"shape_array_key\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b1ddfdee-292e-4d57-bb1e-17248e87fce8",
+ "metadata": {},
+ "source": [
+ "#### Starting from here [`gtfs_funnel/operator_scheduled_stats`](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/gtfs_funnel/operator_scheduled_stats.py#L148)"
]
},
{
"cell_type": "code",
- "execution_count": 28,
- "id": "50af7a1e-4e0c-4e5e-9755-f9ffbab99a8b",
+ "execution_count": null,
+ "id": "d14199f0-63e5-466c-a122-51b2c2abaa75",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "4"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "longest_shape_gdf2.shape_array_key.nunique()"
+ "analysis_date = \"2024-11-13\""
]
},
{
"cell_type": "code",
- "execution_count": 29,
- "id": "94bc9b1f-8a73-4c7a-a773-cc61b843b6a7",
+ "execution_count": null,
+ "id": "7a908db1-ddaa-41f9-b0dd-41b0a6046ad6",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "f5a749dd65924e025b1293c58f95f8d6 3\n",
- "73105f2d1cabc8170ab066d96863c5d5 1\n",
- "Name: schedule_gtfs_dataset_key, dtype: int64"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7c57add2-f72c-4c22-9ca6-e5efe879cab3",
+ "metadata": {},
+ "outputs": [],
"source": [
- "longest_shape_gdf2.schedule_gtfs_dataset_key.value_counts()"
+ "schd_keys = list(operator_route_gdf2.schedule_gtfs_dataset_key.unique())"
]
},
{
"cell_type": "code",
- "execution_count": 30,
- "id": "19012a85-7ef7-4188-870b-251eb600034f",
+ "execution_count": null,
+ "id": "7a76026d-88e6-49a3-83f8-b20836b70d7a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a97faf62-2c23-428a-a2fa-23cb8fb7f11e",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['73105f2d1cabc8170ab066d96863c5d5',\n",
- " 'f5a749dd65924e025b1293c58f95f8d6'], dtype=object)"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "longest_shape_gdf2.schedule_gtfs_dataset_key.unique()"
+ "#### Longest shape does have all the routes..."
]
},
{
"cell_type": "code",
- "execution_count": 31,
- "id": "1eae8307-2d99-41e4-b541-ca2fc1c68b02",
+ "execution_count": null,
+ "id": "33484c6b-1422-42f8-918e-a7aa70531aa3",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "Make this Notebook Trusted to load map: File -> Trust Notebook
"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "longest_shape_gdf2.explore(\"schedule_gtfs_dataset_key\")"
+ "longest_shape_gdf = operator_scheduled_stats.longest_shape_by_route(analysis_date)"
]
},
{
- "cell_type": "markdown",
- "id": "0706fb58-e04e-4d40-b49b-d505da875262",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3bcb40ca-7e6a-432e-a70c-e1817f7eebe9",
"metadata": {},
+ "outputs": [],
"source": [
- "### Step back before finding the longest shape [here](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py#L365)\n",
- "#### Amanda: deleted `direction_id` b/c I discovered City of Santa Maria doesn't have values for the column `direction_id`"
+ "longest_shape_gdf2 = longest_shape_gdf.loc[\n",
+ " longest_shape_gdf.schedule_gtfs_dataset_key.isin(schd_keys)\n",
+ "]"
]
},
{
"cell_type": "code",
- "execution_count": 32,
- "id": "0f1f1160-fd82-4e35-adaa-914aac83ac85",
+ "execution_count": null,
+ "id": "86963a9f-3456-48d5-a386-05c211fe93f4",
"metadata": {},
"outputs": [],
"source": [
- "route_dir_cols = [\"gtfs_dataset_key\", \"route_id\", ]\n",
- "\n",
- "keep_trip_cols = route_dir_cols + [\"trip_instance_key\", \"shape_id\", \"shape_array_key\"]"
+ "longest_shape_gdf2.columns"
]
},
{
"cell_type": "code",
- "execution_count": 33,
- "id": "f332d9b6-d1f8-456d-b3bf-495651b17214",
+ "execution_count": null,
+ "id": "b256ef9b-82c1-4832-ac54-19ca9319bdc4",
"metadata": {},
"outputs": [],
"source": [
- "trips = helpers.import_scheduled_trips(\n",
- " analysis_date, columns=keep_trip_cols, get_pandas=True\n",
- ").rename(columns={\"schedule_gtfs_dataset_key\": \"gtfs_dataset_key\"})"
+ "longest_shape_gdf2.info()"
]
},
{
"cell_type": "code",
- "execution_count": 34,
- "id": "5e133db6-9df8-42d4-ad84-6d158514f045",
+ "execution_count": null,
+ "id": "6db42351-2a52-4e00-a265-33e5743cdea2",
"metadata": {},
"outputs": [],
"source": [
- "sorting_order = [True for i in route_dir_cols]"
+ "longest_shape_gdf2.route_id.value_counts()"
]
},
{
"cell_type": "code",
- "execution_count": 35,
- "id": "2407cc5f-b8c7-416e-b38b-8dc54a90ed30",
+ "execution_count": null,
+ "id": "ca9a6950-6b83-4ec7-bc55-cdc79f3a0843",
"metadata": {},
"outputs": [],
"source": [
- "# Grab only relevant schedule_gtfs_dataset_keys\n",
- "trips2 = trips.loc[trips.gtfs_dataset_key.isin(schd_keys)].reset_index(drop=True)"
+ "# longest_shape_gdf2.explore(\"schedule_gtfs_dataset_key\")"
]
},
{
"cell_type": "code",
- "execution_count": 54,
- "id": "373584a8-a3b4-414a-8d95-7e1e08c61fa7",
+ "execution_count": null,
+ "id": "6dfe2c94-098a-4816-8255-278b85a43f0b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "longest_shape_gdf2.groupby([\"schedule_gtfs_dataset_key\", \"route_id\"]).agg(\n",
+ " {\"route_length_miles\": \"max\"}\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "69d6cb38-2073-4119-bff6-bdb777038b43",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 335 entries, 0 to 334\n",
- "Data columns (total 5 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 gtfs_dataset_key 335 non-null object\n",
- " 1 route_id 335 non-null object\n",
- " 2 trip_instance_key 335 non-null object\n",
- " 3 shape_id 335 non-null object\n",
- " 4 shape_array_key 335 non-null object\n",
- "dtypes: object(5)\n",
- "memory usage: 13.2+ KB\n"
- ]
- }
- ],
"source": [
- "trips2.info()"
+ "#### Somewhere along the way the routes are cut, maybe because of `direction_id`"
]
},
{
"cell_type": "code",
- "execution_count": 55,
- "id": "521a90b8-c28e-42eb-b61d-231b11594db6",
+ "execution_count": null,
+ "id": "859ad004-4b41-41d3-9da7-0c5524daa98e",
"metadata": {},
"outputs": [],
"source": [
- "direction_id_kept = [\"direction_id\",\"gtfs_dataset_key\",\"route_id\", \"trip_instance_key\", \"shape_id\", \"shape_array_key\"]"
+ "OPERATOR_EXPORT = GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats"
]
},
{
"cell_type": "code",
- "execution_count": 56,
- "id": "2b0a5d50-b5df-4e3e-b4af-6cee0ceb9498",
+ "execution_count": null,
+ "id": "c6e5a1f3-4aba-40ec-811b-7ea6c1e01655",
"metadata": {},
"outputs": [],
"source": [
- "trips_w_direction_id = helpers.import_scheduled_trips(\n",
- " analysis_date, columns=direction_id_kept, get_pandas=True\n",
- ").rename(columns={\"schedule_gtfs_dataset_key\": \"gtfs_dataset_key\"})"
+ "SCHED_GCS"
]
},
{
"cell_type": "code",
- "execution_count": 58,
- "id": "24feb45b-154d-4f0c-8c1e-0b5af20d8926",
+ "execution_count": null,
+ "id": "537949dd-a008-4643-bbd1-de0dc142026b",
"metadata": {},
"outputs": [],
"source": [
- "# Grab only relevant schedule_gtfs_dataset_keys\n",
- "trips_w_direction_id2 = trips_w_direction_id.loc[trips_w_direction_id.gtfs_dataset_key.isin(schd_keys)].reset_index(drop=True)"
+ "GTFS_DATA_DICT.schedule_tables.operator_routes"
]
},
{
"cell_type": "code",
- "execution_count": 59,
- "id": "5964848f-970b-48bc-868e-0d1ff9e0c1e5",
+ "execution_count": null,
+ "id": "4ac524dc-d27e-43b5-94ad-e8390d5c7f0f",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 335 entries, 0 to 334\n",
- "Data columns (total 6 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 direction_id 75 non-null float64\n",
- " 1 gtfs_dataset_key 335 non-null object \n",
- " 2 route_id 335 non-null object \n",
- " 3 trip_instance_key 335 non-null object \n",
- " 4 shape_id 335 non-null object \n",
- " 5 shape_array_key 335 non-null object \n",
- "dtypes: float64(1), object(5)\n",
- "memory usage: 15.8+ KB\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "trips_w_direction_id2.info()"
+ "dec_url = \"gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11.parquet\""
]
},
{
"cell_type": "code",
- "execution_count": 36,
- "id": "9beb666c-5e6d-4db8-94fb-e7faf717074c",
+ "execution_count": null,
+ "id": "e8b5d8f7-f6cf-4b24-a77b-685bfd444966",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "335"
- ]
- },
- "execution_count": 36,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "len(trips2)"
+ "dec_df = gpd.read_parquet(dec_url)"
]
},
{
"cell_type": "code",
- "execution_count": 37,
- "id": "c154dd13-8561-4909-a77c-cec19fc963c6",
+ "execution_count": null,
+ "id": "bb5c8db7-baa1-4b4a-a018-60c419e48343",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "73105f2d1cabc8170ab066d96863c5d5 278\n",
- "f5a749dd65924e025b1293c58f95f8d6 57\n",
- "Name: gtfs_dataset_key, dtype: int64"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "trips2.gtfs_dataset_key.value_counts()"
+ "dec_df.organization_name.value_counts().head()"
]
},
{
"cell_type": "code",
- "execution_count": 39,
- "id": "f6b3b67f-2173-453c-879a-e9c9fa515466",
+ "execution_count": null,
+ "id": "e4037d38-d6f7-4471-875c-3471d0219bfe",
"metadata": {},
"outputs": [],
"source": [
- "most_common_shape = (\n",
- " trips2.groupby(\n",
- " route_dir_cols + [\"shape_id\", \"shape_array_key\"],\n",
- " observed=True,\n",
- " group_keys=False,\n",
- " )\n",
- " .agg({\"trip_instance_key\": \"count\"})\n",
- " .reset_index()\n",
- " .sort_values(\n",
- " route_dir_cols + [\"trip_instance_key\"], ascending=sorting_order + [False]\n",
- " )\n",
- " .drop_duplicates(subset=route_dir_cols)\n",
- " .reset_index(drop=True)[route_dir_cols + [\"shape_id\", \"shape_array_key\"]]\n",
- ").rename(\n",
- " columns={\n",
- " \"gtfs_dataset_key\": \"schedule_gtfs_dataset_key\",\n",
- " \"shape_id\": \"common_shape_id\",\n",
- " }\n",
- ")"
+ "dec_df.loc[\n",
+ " dec_df.organization_name == \"Alameda-Contra Costa Transit District\"\n",
+ "].head().drop(columns=[\"geometry\"]).T"
]
},
{
"cell_type": "code",
- "execution_count": 40,
- "id": "26c4100c-33bb-4075-baf5-a02a8c791bf6",
+ "execution_count": null,
+ "id": "19012a85-7ef7-4188-870b-251eb600034f",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "pandas.core.frame.DataFrame"
- ]
- },
- "execution_count": 40,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "type(most_common_shape)"
+ "dec_df2 = dec_df.loc[dec_df.schedule_gtfs_dataset_key.isin(schd_keys)]"
]
},
{
"cell_type": "code",
- "execution_count": 41,
- "id": "dcfa378c-1417-4be2-bfd6-dbafdb6c771a",
+ "execution_count": null,
+ "id": "1eae8307-2d99-41e4-b541-ca2fc1c68b02",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "20"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "len(most_common_shape)"
+ "dec_df2.shape"
]
},
{
"cell_type": "code",
- "execution_count": 43,
- "id": "3385337d-35cf-4bf7-a56b-f9663b9f27d5",
+ "execution_count": null,
+ "id": "5c31f929-f77e-4fb3-a571-3b0577c6d3f5",
"metadata": {},
"outputs": [],
"source": [
- "shape_geom = helpers.import_scheduled_shapes(\n",
- " analysis_date,\n",
- " columns=[\"shape_array_key\", \"geometry\"],\n",
- ")"
+ "type(dec_df2)"
]
},
{
"cell_type": "code",
- "execution_count": 44,
- "id": "3d8b4211-aa87-458f-a1fa-3a777b888657",
+ "execution_count": null,
+ "id": "0751e612-789f-4dcc-b771-0b6af7960ff7",
"metadata": {},
"outputs": [],
"source": [
- "common_shape_geom = pd.merge(\n",
- " shape_geom, most_common_shape, on=\"shape_array_key\", how=\"inner\"\n",
- ").drop(columns=\"shape_array_key\")"
+ "dec_df2.drop(columns=[\"geometry\"]).T"
]
},
{
"cell_type": "code",
- "execution_count": 45,
- "id": "992b5d9c-f5b7-4270-8d3d-ee2e7079e5e9",
+ "execution_count": null,
+ "id": "45eb690e-e5a8-4798-885a-a5a738bc8062",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# dec_df2.explore()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ec619dd6-f042-492f-8b87-4adaf435241d",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(geopandas.geodataframe.GeoDataFrame, 20)"
- ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "type(common_shape_geom), len(common_shape_geom)"
+ "#### Find where in `gtfs_funnel` all the routes disappear"
]
},
{
"cell_type": "code",
- "execution_count": 46,
- "id": "28a37ddd-52e8-4a91-9b9d-8be0f22f8e5e",
+ "execution_count": null,
+ "id": "b9e70fd7-3fa0-4d9f-899b-e1e3eea03151",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'common_shape_id'], dtype='object')"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "common_shape_geom.columns"
+ "group_cols = [\"schedule_gtfs_dataset_key\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1fbc60bb-62bc-4a66-956e-7a4c5ea22371",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "longest_shape_gdf2.info()"
]
},
{
- "cell_type": "code",
- "execution_count": 47,
- "id": "aa870e11-8b3a-421a-a304-409faa9bcede",
+ "cell_type": "markdown",
+ "id": "f2c71b31-c05c-40e2-a70b-d1f1276ecf99",
"metadata": {},
- "outputs": [],
"source": [
- "from shared_utils import portfolio_utils"
+ "#### Something is going on in `operator_scheduled_stats.schedule_stats_by_operator`"
]
},
{
"cell_type": "code",
- "execution_count": 48,
- "id": "20e4a0e3-f9c0-497e-96ff-97262cc21ff3",
+ "execution_count": null,
+ "id": "6a006462-1cd0-4b9e-9209-e8658964adc7",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "73105f2d1cabc8170ab066d96863c5d5 17\n",
- "f5a749dd65924e025b1293c58f95f8d6 3\n",
- "Name: schedule_gtfs_dataset_key, dtype: int64"
- ]
- },
- "execution_count": 48,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "common_shape_geom.schedule_gtfs_dataset_key.value_counts()"
+ "ROUTE_TYPOLOGY = GTFS_DATA_DICT.schedule_tables.route_typologies"
]
},
{
"cell_type": "code",
- "execution_count": 49,
- "id": "ac28e91a-a805-4c29-a54e-14a4c27c46fd",
+ "execution_count": null,
+ "id": "9a330bd8-450f-4a84-8a74-3e07bbffdcf1",
"metadata": {},
"outputs": [],
"source": [
- "route_info = (\n",
- " helpers.import_scheduled_trips(\n",
- " analysis_date,\n",
- " columns=[\n",
- " \"gtfs_dataset_key\",\n",
- " \"route_id\",\n",
- " \"route_long_name\",\n",
- " \"route_short_name\",\n",
- " \"route_desc\",\n",
- " ],\n",
- " )\n",
- " .drop_duplicates()\n",
- " .pipe(portfolio_utils.add_route_name)\n",
- " .drop(columns=[\"route_long_name\", \"route_short_name\", \"route_desc\"])\n",
- ")"
+ "route_typology = pd.read_parquet(f\"{SCHED_GCS}{ROUTE_TYPOLOGY}_{analysis_date}.parquet\")"
]
},
{
"cell_type": "code",
- "execution_count": 50,
- "id": "97d0c027-a95a-44c9-969a-ffa3a4e6b1c6",
+ "execution_count": null,
+ "id": "2e334d8c-5377-4ec3-8f78-a28c112429c7",
"metadata": {},
"outputs": [],
"source": [
- "common_shape_geom2 = pd.merge(\n",
- " common_shape_geom,\n",
- " route_info.rename(columns={\"route_name_used\": \"route_name\"}),\n",
- " on=[\"schedule_gtfs_dataset_key\", \"route_id\"],\n",
- ")"
+ "from route_typologies import route_typologies"
]
},
{
"cell_type": "code",
- "execution_count": 51,
- "id": "83f42ec6-2235-47db-a687-dd80bd24ffcd",
+ "execution_count": null,
+ "id": "d89002ae-c699-46df-8e1f-cd3eb74c3158",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " schedule_gtfs_dataset_key | \n",
- " route_id | \n",
- " common_shape_id | \n",
- " route_name | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 7 | \n",
- " 715be44b-4dee-4c56-83f8-b1970d6133cf | \n",
- " Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd. | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 6 | \n",
- " de042d01-f50a-4b67-ba25-4628643021fa | \n",
- " Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 8 | \n",
- " 882010e7-d331-4518-b31f-3944c689ac17 | \n",
- " Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln. | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " Mall | \n",
- " df0838ab-9999-4118-a599-852164ed2471 | \n",
- " Mall Shuttle | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 12X | \n",
- " 14db961b-0cc1-4916-b366-ba0784592fb8 | \n",
- " 12X Broadway/Orcutt Express | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " f5a749dd65924e025b1293c58f95f8d6 | \n",
- " CC | \n",
- " 5cf6811a-2f53-4199-b315-4408eb816e82 | \n",
- " Daily train service between Auburn, Sacramento, Oakland and San Jose | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 13X | \n",
- " 077be56b-8745-4f65-acec-eda2e39cccf7 | \n",
- " 13X Transit Center/PVHS/N. Broadway | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 11 | \n",
- " fe7d3b5b-6aed-4f53-9f9c-b582942157db | \n",
- " R11. Transit Center to Gov't Center via S. Broadway | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 30 | \n",
- " 21e2ec94-9952-4f8e-8515-8332c94e8b55 | \n",
- " Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " f5a749dd65924e025b1293c58f95f8d6 | \n",
- " Shuttle | \n",
- " e70e22aa-8bb7-44eb-b12c-f98aeb8f61b7 | \n",
- " Shuttle to Auburn | \n",
- "
\n",
- " \n",
- " 10 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 8a7c42f9-51e4-4848-bf88-30c210f149ad | \n",
- " ab03b79f-f4a9-4a61-895c-f9e98311322f | \n",
- " Rt 11. Transit Center to Gov't Center via S. Broadway | \n",
- "
\n",
- " \n",
- " 11 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 2 | \n",
- " 33e31c53-87d1-4cae-930a-d0c26ed8d9e7 | \n",
- " Rt 2. Transit Center to PVH School via Western., Donovan Rd | \n",
- "
\n",
- " \n",
- " 12 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 3 | \n",
- " eb560457-7bcf-4989-a293-d134546cc289 | \n",
- " Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln. | \n",
- "
\n",
- " \n",
- " 13 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 1B | \n",
- " 6cdb20fb-9413-4ed1-abc3-4d6b3bbe2f02 | \n",
- " Rt 1. Transit Ctr to Preisker Park Via N. Broadway | \n",
- "
\n",
- " \n",
- " 14 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 20 | \n",
- " 0f836575-8fe7-4d67-8ee1-86a0d86c57b7 | \n",
- " Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB | \n",
- "
\n",
- " \n",
- " 15 | \n",
- " f5a749dd65924e025b1293c58f95f8d6 | \n",
- " SF | \n",
- " ejnn | \n",
- " Shuttle to San Francisco Transbay Terminal | \n",
- "
\n",
- " \n",
- " 16 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 5 | \n",
- " fd9d7de5-ae77-4fa8-8545-a1dc02117126 | \n",
- " Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way | \n",
- "
\n",
- " \n",
- " 17 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 4 | \n",
- " 709dca08-c50f-489b-9814-9a220627172f | \n",
- " Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd. | \n",
- "
\n",
- " \n",
- " 18 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 9 | \n",
- " d05481c2-ba1b-484f-a859-36fdaa827487 | \n",
- " Rt 9. Transit Center to PVH via Alvin Ave. | \n",
- "
\n",
- " \n",
- " 19 | \n",
- " 73105f2d1cabc8170ab066d96863c5d5 | \n",
- " 1 | \n",
- " 6341a660-d9c2-45d7-aee9-fcef64b4fa3b | \n",
- " Rt 1. Transit Ctr to Preisker Park Via N. Broadway | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " schedule_gtfs_dataset_key route_id \\\n",
- "0 73105f2d1cabc8170ab066d96863c5d5 7 \n",
- "1 73105f2d1cabc8170ab066d96863c5d5 6 \n",
- "2 73105f2d1cabc8170ab066d96863c5d5 8 \n",
- "3 73105f2d1cabc8170ab066d96863c5d5 Mall \n",
- "4 73105f2d1cabc8170ab066d96863c5d5 12X \n",
- "5 f5a749dd65924e025b1293c58f95f8d6 CC \n",
- "6 73105f2d1cabc8170ab066d96863c5d5 13X \n",
- "7 73105f2d1cabc8170ab066d96863c5d5 11 \n",
- "8 73105f2d1cabc8170ab066d96863c5d5 30 \n",
- "9 f5a749dd65924e025b1293c58f95f8d6 Shuttle \n",
- "10 73105f2d1cabc8170ab066d96863c5d5 8a7c42f9-51e4-4848-bf88-30c210f149ad \n",
- "11 73105f2d1cabc8170ab066d96863c5d5 2 \n",
- "12 73105f2d1cabc8170ab066d96863c5d5 3 \n",
- "13 73105f2d1cabc8170ab066d96863c5d5 1B \n",
- "14 73105f2d1cabc8170ab066d96863c5d5 20 \n",
- "15 f5a749dd65924e025b1293c58f95f8d6 SF \n",
- "16 73105f2d1cabc8170ab066d96863c5d5 5 \n",
- "17 73105f2d1cabc8170ab066d96863c5d5 4 \n",
- "18 73105f2d1cabc8170ab066d96863c5d5 9 \n",
- "19 73105f2d1cabc8170ab066d96863c5d5 1 \n",
- "\n",
- " common_shape_id \\\n",
- "0 715be44b-4dee-4c56-83f8-b1970d6133cf \n",
- "1 de042d01-f50a-4b67-ba25-4628643021fa \n",
- "2 882010e7-d331-4518-b31f-3944c689ac17 \n",
- "3 df0838ab-9999-4118-a599-852164ed2471 \n",
- "4 14db961b-0cc1-4916-b366-ba0784592fb8 \n",
- "5 5cf6811a-2f53-4199-b315-4408eb816e82 \n",
- "6 077be56b-8745-4f65-acec-eda2e39cccf7 \n",
- "7 fe7d3b5b-6aed-4f53-9f9c-b582942157db \n",
- "8 21e2ec94-9952-4f8e-8515-8332c94e8b55 \n",
- "9 e70e22aa-8bb7-44eb-b12c-f98aeb8f61b7 \n",
- "10 ab03b79f-f4a9-4a61-895c-f9e98311322f \n",
- "11 33e31c53-87d1-4cae-930a-d0c26ed8d9e7 \n",
- "12 eb560457-7bcf-4989-a293-d134546cc289 \n",
- "13 6cdb20fb-9413-4ed1-abc3-4d6b3bbe2f02 \n",
- "14 0f836575-8fe7-4d67-8ee1-86a0d86c57b7 \n",
- "15 ejnn \n",
- "16 fd9d7de5-ae77-4fa8-8545-a1dc02117126 \n",
- "17 709dca08-c50f-489b-9814-9a220627172f \n",
- "18 d05481c2-ba1b-484f-a859-36fdaa827487 \n",
- "19 6341a660-d9c2-45d7-aee9-fcef64b4fa3b \n",
- "\n",
- " route_name \n",
- "0 Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd. \n",
- "1 Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound \n",
- "2 Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln. \n",
- "3 Mall Shuttle \n",
- "4 12X Broadway/Orcutt Express \n",
- "5 Daily train service between Auburn, Sacramento, Oakland and San Jose \n",
- "6 13X Transit Center/PVHS/N. Broadway \n",
- "7 R11. Transit Center to Gov't Center via S. Broadway \n",
- "8 Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc \n",
- "9 Shuttle to Auburn \n",
- "10 Rt 11. Transit Center to Gov't Center via S. Broadway \n",
- "11 Rt 2. Transit Center to PVH School via Western., Donovan Rd \n",
- "12 Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln. \n",
- "13 Rt 1. Transit Ctr to Preisker Park Via N. Broadway \n",
- "14 Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB \n",
- "15 Shuttle to San Francisco Transbay Terminal \n",
- "16 Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way \n",
- "17 Rt 4. Transit Center to SMH school to VTC via Cook St., Thornburg., Betteravia Rd. \n",
- "18 Rt 9. Transit Center to PVH via Alvin Ave. \n",
- "19 Rt 1. Transit Ctr to Preisker Park Via N. Broadway "
- ]
- },
- "execution_count": 51,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "common_shape_geom2.drop(columns=[\"geometry\"])"
+ "route_typology_grouped = (\n",
+ " route_typology.groupby([\"schedule_gtfs_dataset_key\", \"route_id\"])\n",
+ " .agg({**{f\"is_{c}\": \"sum\" for c in route_typologies}})\n",
+ " .reset_index()\n",
+ ")"
]
},
{
"cell_type": "code",
- "execution_count": 53,
- "id": "3fb5cbf9-16fa-45b9-b870-1ceaea9801ba",
+ "execution_count": null,
+ "id": "82df1d3d-3afa-4fc9-bb16-578d6580a351",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "Make this Notebook Trusted to load map: File -> Trust Notebook
"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 53,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "common_shape_geom2.explore(\"route_id\")"
+ "route_typology_grouped2 = route_typology_grouped.loc[\n",
+ " route_typology_grouped.schedule_gtfs_dataset_key.isin(schd_keys)\n",
+ "]"
]
},
{
"cell_type": "markdown",
- "id": "55924475-3c80-4fb5-bca7-c6e1566b4af6",
+ "id": "a63499cd-d3dc-4425-b62c-31fe43341f38",
"metadata": {},
"source": [
- "### Don't look at most common shape, just load trips.\n",
- "* `f5a749dd65924e025b1293c58f95f8d6` is Amtrak\n",
- "* 73105f2d1cabc8170ab066d96863c5d5 is the City of Santa Maria"
+ "#### Routes are missing for Santa Maria and Capitol Corridor in `ROUTE_TYPOLOGY`"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "d66d0644-60d4-479b-b702-282fdbb31bd2",
+ "id": "de4c9c7f-0f3a-472b-b255-68518cbd6ddf",
"metadata": {},
"outputs": [],
"source": [
- "len(trips2)"
+ "route_typology_grouped2.T"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "2c1c03b1-04cd-4ee5-b446-f46c7c0b7eba",
+ "id": "b8f73717-1c5d-4b6a-8498-cd908e6302f3",
"metadata": {},
"outputs": [],
"source": [
- "trips2.head(2)"
+ "route_gdf = longest_shape_gdf2.merge(\n",
+ " route_typology_grouped2, on=[\"schedule_gtfs_dataset_key\", \"route_id\"], how=\"outer\"\n",
+ ")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "1ac9ce22-d308-4a80-b2fd-7eea7d1c4571",
+ "id": "fe956bad-5a37-4f3c-9e93-0e3fc86b1927",
"metadata": {},
"outputs": [],
"source": [
- "trips2.gtfs_dataset_key.value_counts()"
+ "route_gdf.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "fae140d9-aa52-43d7-a6dc-350e21db21a6",
+ "id": "72db2ec7-427f-4cab-b2eb-e75a2b23d856",
"metadata": {},
"outputs": [],
"source": [
- "test2 = pd.merge(shape_geom, trips2, on=\"shape_array_key\", how=\"inner\")"
+ "route_gdf.drop(columns=[\"geometry\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "1049f135-ff88-4ceb-a054-a412f01fe42d",
+ "id": "d5395dad-8d87-45aa-90fb-8d2da3a8c591",
"metadata": {},
"outputs": [],
"source": [
- "len(test2)"
+ "# route_gdf2.explore(\"schedule_gtfs_dataset_key\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1798ab2b-8847-4111-bcd1-9421bbfc2a4a",
+ "metadata": {},
+ "source": [
+ "#### Change merge from `inner` to `left`"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "cd777b6b-2566-4680-afeb-a5ad99ef94a2",
+ "id": "e4224b34-f08c-4cc5-8853-1a588f6c59ef",
"metadata": {},
"outputs": [],
"source": [
- "# test2.head(1)"
+ "f\"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet\""
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "687db4f0-df8c-4232-9c5d-dc4b6d8a5a6b",
+ "id": "5becb3cf-d295-4c85-9dc9-716c657ea19c",
"metadata": {},
"outputs": [],
"source": [
- "route_dir_cols"
+ "SCHED_GCS"
]
},
{
- "cell_type": "markdown",
- "id": "81adac11-8b42-4fcf-895c-f3fc61b08fc8",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6146956f-0745-4844-8a84-51ce772fb0e3",
"metadata": {},
+ "outputs": [],
"source": [
- "### City of Santa Maria has many rows without a `direction_id` value. That is why so few routes are appearing."
+ "GTFS_DATA_DICT.schedule_tables.operator_routes"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "fafcfddc-660e-4bbc-bbf3-849ea5ee07c4",
+ "id": "017c78bf-9bbf-4a99-a867-c606c6f55858",
"metadata": {},
"outputs": [],
"source": [
- "test2.info()"
+ "my_test_url = \"gs://calitp-analytics-data/data-analyses/gtfs_schedule/operator_profiles/operator_routes_2024-12-11_AH.parquet\""
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "45719a9b-6e6d-4f12-bab4-19ef9a5c9b88",
- "metadata": {
- "scrolled": true,
- "tags": []
- },
+ "id": "eea9fb55-53e1-4e83-8228-fdf9a5c2cfac",
+ "metadata": {},
"outputs": [],
"source": [
- "test2.groupby(['gtfs_dataset_key', 'route_id',\"shape_id\", \"shape_array_key\"],\n",
- ").agg({\"trip_instance_key\": \"count\"})"
+ "f\"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet\""
]
},
{
- "cell_type": "markdown",
- "id": "a4052089-30a5-4448-a45d-8eddb6fe41ff",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3ce8f60a-d25c-4004-9c34-c7c86bd56079",
"metadata": {},
+ "outputs": [],
"source": [
- "### Determine common shape "
+ "test_gdf = gpd.read_parquet(my_test_url)"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "68a8309b-44a0-4a83-8edf-89e52378c0ee",
+ "id": "890fd31a-987f-45de-85e7-e1535d7c1cf2",
"metadata": {},
"outputs": [],
"source": [
- "route_dir_cols.remove('direction_id')"
+ "test_gdf2 = test_gdf.loc[test_gdf.schedule_gtfs_dataset_key.isin(schd_keys)]"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "6e114cae-d748-47c8-9cae-f452ec76dc12",
+ "id": "47514189-ff4a-43f7-9de6-85c1c1cb79a0",
"metadata": {},
"outputs": [],
"source": [
- "sorting_order"
+ "test_gdf2.explore(\"route_id\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "648e80d6-619d-424d-8615-23d2c11f8e01",
+ "metadata": {},
+ "source": [
+ "#### Test with all the dates."
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "49979bf5-cf8b-4e36-99c7-438a7fef3b2b",
+ "id": "aeaa5d4c-f34d-4a56-895b-058940878dfa",
"metadata": {},
"outputs": [],
"source": [
- "test3 = (\n",
- " test2.groupby(\n",
- " route_dir_cols + [\"shape_id\", \"shape_array_key\"],\n",
- " observed=True,\n",
- " group_keys=False,\n",
- " )\n",
- " .agg({\"trip_instance_key\": \"count\"})\n",
- " .reset_index()\n",
- " .sort_values(\n",
- " route_dir_cols + [\"trip_instance_key\"], ascending=[True, True] + [False]\n",
- " )\n",
- " .drop_duplicates(subset=route_dir_cols)\n",
- " .reset_index(drop=True)[route_dir_cols + [\"shape_id\", \"shape_array_key\"]]\n",
- ").rename(\n",
- " columns={\n",
- " \"gtfs_dataset_key\": \"schedule_gtfs_dataset_key\",\n",
- " \"shape_id\": \"common_shape_id\",\n",
- " }\n",
- ")"
+ "GTFS_DATA_DICT.schedule_tables.operator_routes"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "44cb60d3-6c95-469c-8ba5-0f98ea0609f9",
+ "id": "a4d04121-1df4-4b78-8193-99d2843e6e89",
"metadata": {},
"outputs": [],
"source": [
- "test3"
+ "RT_SCHED_GCS"
]
},
{
- "cell_type": "markdown",
- "id": "ef31cc38-bcab-46bd-8992-92c5cb43dd18",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "59e431ef-d831-408f-8f94-93434438e3ad",
"metadata": {},
+ "outputs": [],
"source": [
- "### Drop duplicates based on route_id: lots of routes show up for Santa Maria now."
+ "f\"{OPERATOR_ROUTE}_AH_test\""
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "5b4ff0ec-dc39-4ff1-ac93-2d16db6dbe26",
+ "id": "7a562a65-a68c-4d3b-8b35-5039b4757e6f",
"metadata": {},
"outputs": [],
"source": [
- "test3 = test2.drop_duplicates(subset=[\"gtfs_dataset_key\", \"route_id\"]).reset_index(\n",
- " drop=True\n",
- ")"
+ "f\"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet\""
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "603e9d27-9d40-47a5-bfd1-6e061f219090",
+ "id": "a5a00166-be9e-46a1-bb35-ba233162ef5b",
"metadata": {},
"outputs": [],
"source": [
- "test3.explore(\"route_id\", style_kwds={\"weight\": 5})"
+ "test_df = gpd.read_parquet(\n",
+ " \"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_routes_AH_test.parquet\"\n",
+ ")"
]
},
{
- "cell_type": "markdown",
- "id": "5e6b2d85-81f8-463f-8ce8-2461ecf3197b",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "62dc4282-1b72-4768-969d-48d3e8250a8a",
"metadata": {},
+ "outputs": [],
"source": [
- "### Try dropping duplicates again"
+ "test_df.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "fca9ad78-4763-4410-9f09-50e806f19c07",
+ "id": "f83624e0-be98-44e7-9ede-895748bc0f96",
"metadata": {},
"outputs": [],
"source": [
- "route_dir_cols"
+ "op_routes_gdf = test_df.loc[test_df.organization_name.isin(org_name_lists)]"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "1f07937e-003e-40a4-929e-5140220f15e6",
+ "id": "928454c5-0cf1-4887-97b0-bd1931366178",
"metadata": {},
"outputs": [],
"source": [
- "trips2.groupby(\n",
- " route_dir_cols + [\"shape_id\", \"shape_array_key\"],\n",
- " observed=True,\n",
- " group_keys=False,\n",
- ").agg({\"trip_instance_key\": \"count\"})"
+ "# Find the most recent geography for each route.\n",
+ "op_routes_gdf = op_routes_gdf.sort_values(by=[\"service_date\"], ascending=False)\n",
+ "\n",
+ "# Keep only the most recent row.\n",
+ "op_routes_gdf = op_routes_gdf.drop_duplicates(\n",
+ " subset=[\"route_long_name\", \"route_short_name\", \"route_combined_name\"]\n",
+ ")\n",
+ "\n",
+ "# Drop service_dates\n",
+ "op_routes_gdf = op_routes_gdf.drop(columns=[\"service_date\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "72e89120-297c-4f67-bb34-e5c257df4794",
+ "id": "fdec7a7e-8de9-4817-969d-dc867ed1605e",
"metadata": {},
"outputs": [],
"source": [
- "duplicates2 = (\n",
- " trips2.groupby(\n",
- " route_dir_cols + [\"shape_id\", \"shape_array_key\"],\n",
- " observed=True,\n",
- " group_keys=False,\n",
- " )\n",
- " .agg({\"trip_instance_key\": \"count\"})\n",
- " .reset_index()\n",
- ")"
+ "op_routes_gdf.organization_name.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "dc9633e7-8c0b-4503-b046-e039d831fd11",
+ "id": "ec771a75-0095-4f0e-a2c0-0de5e886492d",
"metadata": {},
"outputs": [],
"source": [
- "duplicates2"
+ "op_routes_gdf.loc[op_routes_gdf.organization_name == \"City of Santa Maria\"].explore(\n",
+ " \"route_long_name\"\n",
+ ")"
]
}
],
diff --git a/gtfs_digest/44_debugging_dec2024.ipynb b/gtfs_digest/44_debugging_dec2024.ipynb
index c5e47c500..da9d9ec16 100644
--- a/gtfs_digest/44_debugging_dec2024.ipynb
+++ b/gtfs_digest/44_debugging_dec2024.ipynb
@@ -12,12 +12,14 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"id": "ac7ce931-86fe-418c-95d7-5d2f85000bee",
"metadata": {},
"outputs": [],
"source": [
+ "import _section2_utils as section2\n",
"import geopandas as gpd\n",
+ "import merge_operator_data\n",
"import merge_data\n",
"import numpy as np\n",
"import pandas as pd\n",
@@ -28,7 +30,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"id": "27d67993-3143-4a78-acbc-d36078569db8",
"metadata": {},
"outputs": [],
@@ -41,7 +43,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"id": "d91ded3e-4959-43d4-b90e-45df7ac60883",
"metadata": {},
"outputs": [],
@@ -51,70 +53,389 @@
},
{
"cell_type": "markdown",
- "id": "ea7bc262-afa1-4193-8580-831587a78c0b",
+ "id": "8154301a-e454-41b8-af51-6f61fb420843",
"metadata": {},
"source": [
- "### Op Profiles\n",
- "* The code for `gtfs_digest/merge_operator.py` stopped working because one of the column names changed. I went into `gtfs_funnel/crosswalk-gtfs_dataset_key` to fix that. \n",
- "* Operator Profiles: are from September 2024 when it's Dec 2024.\n",
- " * Fixed: was still referencing one of my old testing profiles."
+ "### Metrics for All Routes\n",
+ "* March 2023 has two values for some operators.\n",
+ "* Some operators have many repeated rows, causing their charts to go above 100."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ba30a005-66c9-4eb3-9e3e-8bfe3ce1c297",
+ "metadata": {},
+ "source": [
+ "#### Look at the metrics dataframes first.\n",
+ "* I think `op_rt_sched_metrics` is the reason why there are duplicate values.\n",
+ "* Temporary fix: in `_section2_utils.load_operator_metrics()`, drop duplicates based on `service_date`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4a79db03-a8f4-4fd9-bcb7-d4dbd8c8befa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "op_sched_metrics = merge_operator_data.concatenate_schedule_operator_metrics(analysis_date_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "619c554f-32d4-4d79-b0f6-e788370a85de",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "op_sched_metrics_dec = op_sched_metrics.loc[op_sched_metrics.service_date ==\n",
+ " '2024-12-11T00:00:00.000000000']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7968c4ab-b0a9-44ab-9332-f2cd59e6d733",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "op_sched_metrics_dec.schedule_gtfs_dataset_key.value_counts().head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4321e393-990c-42cc-af36-ba92de71c80e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "op_rt_sched_metrics = merge_operator_data.concatenate_rt_vs_schedule_operator_metrics(analysis_date_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5a2cea75-a1e5-422b-8cf0-669c88a42b60",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "op_rt_sched_metrics_dec = op_rt_sched_metrics.loc[op_rt_sched_metrics.service_date ==\n",
+ " '2024-12-11T00:00:00.000000000']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "208f3c21-0b46-4216-a2e6-95fc1c7e37cd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "op_rt_sched_metrics_dec.organization_name.value_counts().head(15)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "697a0b0c-20ee-417c-95bd-abd53d356295",
+ "metadata": {},
+ "source": [
+ "* There is the rail versus the bus schedule."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5f0f0033-1702-4677-9d3b-1df79fdeff24",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "op_rt_sched_metrics_dec.loc[\n",
+ " op_rt_sched_metrics_dec.organization_name\n",
+ " == \"Los Angeles County Metropolitan Transportation Authority\"\n",
+ "].T"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7ce9ee27-a434-4e94-ad3a-3503234291e1",
+ "metadata": {},
+ "source": [
+ "#### How do you know which one is correct?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "46a4a34c-9e56-4305-ae0c-685a799a3b64",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "op_rt_sched_metrics_dec.loc[\n",
+ " op_rt_sched_metrics_dec.organization_name\n",
+ " == \"Transit Joint Powers Authority for Merced County\"\n",
+ "].T"
]
},
{
"cell_type": "code",
- "execution_count": 4,
- "id": "d513ad73-b50c-4bd6-b00b-3df0ed166c4c",
+ "execution_count": null,
+ "id": "a866f8ee-0150-40ad-99d4-b114041dd9b5",
"metadata": {},
"outputs": [],
"source": [
- "import merge_operator_data"
+ "op_rt_sched_metrics_dec.loc[\n",
+ " op_rt_sched_metrics_dec.organization_name\n",
+ " == \"City of Santa Monica\"\n",
+ "].T"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "15db8be8-f949-4a6a-b298-62a2b162d1eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "op_rt_sched_metrics_dec.loc[\n",
+ " op_rt_sched_metrics_dec.organization_name\n",
+ " == \"Tahoe Transportation District\"\n",
+ "].T"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f0627b80-ac4c-4407-bad3-12f94a0dac50",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "op_rt_sched_metrics_dec.loc[\n",
+ " op_rt_sched_metrics_dec.organization_name\n",
+ " == \"City of Lawndale\"\n",
+ "].T"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3b45cc80-ac39-4d1c-ae8c-9132c6ec7619",
+ "metadata": {},
+ "source": [
+ "#### Dataframe from `merge_operator_data.concatenate_rt_vs_schedule_operator_metrics` is created [here at `gtfs_funnel/operator_scheduled_stats.py`](https://github.com/cal-itp/data-analyses/blob/1ba0f544a01f99966a6e210dd11666b4fe4a146e/gtfs_funnel/operator_scheduled_stats.py#L147)\n",
+ "* The data is grouped by `schedule_gtfs_dataset_key`, and one `organization_name` can map to multiple keys, which is why some organizations have multiple entries."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "704e7d04-1dd8-4ab2-8b59-588649ca9905",
+ "metadata": {},
+ "source": [
+ "#### Other attempts to look at Operator Profiles"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
+ "id": "fba829d7-7dad-4ba8-8f58-a55a290b71fb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "url = \"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_profiles.parquet\"\n",
+ "operator_profile_df = pd.read_parquet(url)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4ae95240-58a7-4e6e-957d-a30400216452",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "operator_profile_df.service_date.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3bee1206-44f4-4304-b6c2-d248a397ec86",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "march_2023 = operator_profile_df.loc[\n",
+ " operator_profile_df.service_date == \"2023-03-15T00:00:00.000000000\"\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4bc3a0ce-f864-48b2-8abe-22cfef2c77a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dec_2024 = operator_profile_df.loc[\n",
+ " operator_profile_df.service_date == \"2024-12-11T00:00:00.000000000\"\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "142a2233-a259-4a2e-8d18-5e14ecb1bf1d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "march_2023.organization_name.value_counts().head(12)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e533058a-e30b-469d-a4b4-dd9487c476c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dec_2024.organization_name.value_counts().head(12)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a4406245-e9a1-4a7b-9996-30aafca141ea",
+ "metadata": {},
+ "source": [
+ "#### How does Los Angeles County Metropolitan Transportation Authority have two different values?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0afa87ac-4a45-455e-8c4d-05514bf8f0b4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dec_2024.loc[\n",
+ " dec_2024.organization_name\n",
+ " == \"Basin Transit\"\n",
+ "].T"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "482c959c-1abe-4d24-ad75-ee2e26fe3a72",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dec_2024.loc[\n",
+ " dec_2024.organization_name\n",
+ " == \"Los Angeles County Metropolitan Transportation Authority\"\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "061d47a6-ffd8-4d39-8848-db4588b4004d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dec_2024.loc[\n",
+ " dec_2024.organization_name == \"Transit Joint Powers Authority for Merced County\"\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d60d6540-7290-4540-93a2-925b98fcf101",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dec_2024.loc[dec_2024.organization_name == \"City of Lawndale\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ceb37389-7ad9-4968-9f18-5f24183f07f6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dec_2024.loc[dec_2024.organization_name == \"Palo Verde Valley Transit Agency\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b4804ece-08c0-4e6b-8b6b-62216d6abffd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dec_2024.loc[dec_2024.organization_name == \"City of San Luis Obispo\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2949c9b9-154b-44b1-a4bd-88f41ea192b4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "crosswalk_df = merge_operator_data.concatenate_crosswalks(analysis_date_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "96da557f-a1d4-4c14-a67e-03b45466daa1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "crosswalk_df.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "367adf95-face-4c66-bec7-7cb8fd8eaaa9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "march_crosswalk_df = crosswalk_df.loc[]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ea7bc262-afa1-4193-8580-831587a78c0b",
+ "metadata": {},
+ "source": [
+ "### Op Profiles\n",
+ "* The code for `gtfs_digest/merge_operator_data.py` stopped working because one of the column names changed. I went into `crosswalk_gtfs_dataset_key_to_organization` to fix that.\n",
+ "* Operator Profiles are from September 2024 even though it is now December 2024.\n",
+ " * Fixed: was still referencing one of my old testing profiles."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"id": "cf9b7d10-625b-4c76-bca9-8116aa77c93a",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'gs://calitp-analytics-data/data-analyses/gtfs_schedule/'"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"SCHED_GCS"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"id": "0cdda265-423c-430e-8685-04dc7cb356cd",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'crosswalk/gtfs_key_organization'"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"f\"{GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk}\""
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"id": "3a880fdb-2730-4978-ad7d-e557698d8e70",
"metadata": {},
"outputs": [],
@@ -124,7 +445,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"id": "b843e240-07ee-4f1a-b29a-3f97e9be8b0e",
"metadata": {},
"outputs": [],
@@ -134,7 +455,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"id": "2809d41a-1911-45e4-a78a-4e72e73d1f9a",
"metadata": {},
"outputs": [],
@@ -144,7 +465,41 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
+ "id": "0cddd845-f206-41b5-97dd-fff179a211df",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dec_crosswalk_df.organization_name.value_counts().head(25)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bab1c6d1-2421-4f84-9eaa-42bf6b245611",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dec_crosswalk_df.loc[\n",
+ " dec_crosswalk_df.organization_name == \"City of South San Francisco\"\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e18e82dd-5a87-4f49-a374-4bf48c4527ef",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dec_crosswalk_df.loc[\n",
+ " dec_crosswalk_df.organization_name == \"City and County of San Francisco\"\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"id": "f08b5be9-d49d-44b9-8b6a-a74d10682aa7",
"metadata": {},
"outputs": [],
@@ -154,7 +509,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"id": "2bca5311-669c-4d3e-bc41-d3ebc7a69c3c",
"metadata": {},
"outputs": [],
@@ -166,7 +521,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
"id": "01e983f2-0929-42b8-9026-4509cf033aeb",
"metadata": {},
"outputs": [],
@@ -178,70 +533,37 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": null,
"id": "3450162d-11a9-47ac-8d55-a883f51b023a",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "set()"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"nov_cols - sept_cols"
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": null,
"id": "809724b7-aea1-4862-b828-5f4a9b5ec6f6",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "set()"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"sept_cols - dec_cols"
]
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": null,
"id": "5e59113f-4dc5-4f57-9ea5-26c09199d706",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "set()"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"dec_cols - sept_cols"
]
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": null,
"id": "57f2bca0-1f1b-4940-875b-958783bd941f",
"metadata": {},
"outputs": [],
@@ -253,62 +575,17 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": null,
"id": "5e651996-43f3-4c02-9631-0f8e44537961",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " primary_uza_code | \n",
- " primary_uza_name | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1 | \n",
- " None | \n",
- " Oxnard--San Buenaventura (Ventura), CA | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " primary_uza_code primary_uza_name\n",
- "1 None Oxnard--San Buenaventura (Ventura), CA"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"ventura_dec[[\"primary_uza_code\", \"primary_uza_name\"]].drop_duplicates()"
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": null,
"id": "9de016d3-9f85-48b4-a763-5a5b89dd3ad3",
"metadata": {},
"outputs": [],
@@ -320,62 +597,17 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": null,
"id": "4bfb30ba-cc1f-41d8-afe8-6fb4ccfc40bf",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " primary_uza_code | \n",
- " primary_uza_name | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1 | \n",
- " None | \n",
- " Oxnard--San Buenaventura (Ventura), CA | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " primary_uza_code primary_uza_name\n",
- "1 None Oxnard--San Buenaventura (Ventura), CA"
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "ventura_sept[['primary_uza_code', 'primary_uza_name']].drop_duplicates()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
+ "outputs": [],
+ "source": [
+ "ventura_sept[[\"primary_uza_code\", \"primary_uza_name\"]].drop_duplicates()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"id": "81d75c3c-07d4-4997-9774-b7c6e86d4d7a",
"metadata": {},
"outputs": [],
@@ -385,49 +617,27 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": null,
"id": "cb934b4e-63bc-4015-9363-84152be93c5e",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['2023-03-15T00:00:00.000000000', '2023-04-12T00:00:00.000000000',\n",
- " '2023-05-17T00:00:00.000000000', '2023-06-14T00:00:00.000000000',\n",
- " '2023-07-12T00:00:00.000000000', '2023-08-15T00:00:00.000000000',\n",
- " '2023-09-13T00:00:00.000000000', '2023-10-11T00:00:00.000000000',\n",
- " '2023-11-15T00:00:00.000000000', '2023-12-13T00:00:00.000000000',\n",
- " '2024-01-17T00:00:00.000000000', '2024-02-14T00:00:00.000000000',\n",
- " '2024-03-13T00:00:00.000000000', '2024-04-17T00:00:00.000000000',\n",
- " '2024-05-22T00:00:00.000000000', '2024-06-12T00:00:00.000000000',\n",
- " '2024-07-17T00:00:00.000000000', '2024-08-14T00:00:00.000000000',\n",
- " '2024-09-18T00:00:00.000000000', '2024-10-16T00:00:00.000000000',\n",
- " '2024-11-13T00:00:00.000000000', '2024-12-11T00:00:00.000000000'],\n",
- " dtype='datetime64[ns]')"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"crosswalk_df.service_date.unique()"
]
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": null,
"id": "813b89c6-9e26-439b-9bbd-841bd1b53e28",
"metadata": {},
"outputs": [],
"source": [
- "import _section1_utils "
+ "import _section1_utils"
]
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": null,
"id": "a34c2067-ec22-4576-8ae7-bf28d8b1f433",
"metadata": {},
"outputs": [],
@@ -437,7 +647,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": null,
"id": "2beb41b8-7ce5-4022-9aa3-8ab268ff3102",
"metadata": {},
"outputs": [],
@@ -447,134 +657,10 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": null,
"id": "5f2a10ad-32ad-4f59-968e-5d74003f2aea",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " schedule_gtfs_dataset_key | \n",
- " VP per Minute (All Routes) | \n",
- " Spatial Accuracy (All Routes) | \n",
- " Date | \n",
- " # Routes | \n",
- " # Trips | \n",
- " # Shapes | \n",
- " # Stops | \n",
- " # Arrivals | \n",
- " Operator Service Miles | \n",
- " Avg Arrivals per Stop | \n",
- " # Downtown Local Route Types | \n",
- " # Local Route Types | \n",
- " # Coverage Route Types | \n",
- " # Rapid Route Types | \n",
- " # Express Route Types | \n",
- " # Rail Route Types | \n",
- " Transit Operator | \n",
- " Organization ID | \n",
- " Organization | \n",
- " District | \n",
- " counties_served | \n",
- " service_area_sq_miles | \n",
- " hq_city | \n",
- " service_area_pop | \n",
- " organization_type | \n",
- " primary_uza_name | \n",
- " reporter_type | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 16 | \n",
- " 9809d3f8121513057bc5cb8de7b54ce2 | \n",
- " 1.94 | \n",
- " 89.90 | \n",
- " 2024-12-11 | \n",
- " 34.00 | \n",
- " 1036.00 | \n",
- " 70.00 | \n",
- " 919.00 | \n",
- " 23141.00 | \n",
- " 467.60 | \n",
- " 25.18 | \n",
- " 27.00 | \n",
- " 18.00 | \n",
- " 39.00 | \n",
- " 30.00 | \n",
- " 1.00 | \n",
- " 0.00 | \n",
- " Monterey Salinas Schedule | \n",
- " receZJ9sEnP9vy3g0 | \n",
- " Monterey-Salinas Transit | \n",
- " 05 - San Luis Obispo | \n",
- " Monterey | \n",
- " 159 | \n",
- " Monterey | \n",
- " 437325 | \n",
- " Independent Public Agency or Authority of Transit Service | \n",
- " Seaside--Monterey--Pacific Grove, CA | \n",
- " Full Reporter | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " schedule_gtfs_dataset_key VP per Minute (All Routes) \\\n",
- "16 9809d3f8121513057bc5cb8de7b54ce2 1.94 \n",
- "\n",
- " Spatial Accuracy (All Routes) Date # Routes # Trips # Shapes \\\n",
- "16 89.90 2024-12-11 34.00 1036.00 70.00 \n",
- "\n",
- " # Stops # Arrivals Operator Service Miles Avg Arrivals per Stop \\\n",
- "16 919.00 23141.00 467.60 25.18 \n",
- "\n",
- " # Downtown Local Route Types # Local Route Types # Coverage Route Types \\\n",
- "16 27.00 18.00 39.00 \n",
- "\n",
- " # Rapid Route Types # Express Route Types # Rail Route Types \\\n",
- "16 30.00 1.00 0.00 \n",
- "\n",
- " Transit Operator Organization ID Organization \\\n",
- "16 Monterey Salinas Schedule receZJ9sEnP9vy3g0 Monterey-Salinas Transit \n",
- "\n",
- " District counties_served service_area_sq_miles hq_city \\\n",
- "16 05 - San Luis Obispo Monterey 159 Monterey \n",
- "\n",
- " service_area_pop \\\n",
- "16 437325 \n",
- "\n",
- " organization_type \\\n",
- "16 Independent Public Agency or Authority of Transit Service \n",
- "\n",
- " primary_uza_name reporter_type \n",
- "16 Seaside--Monterey--Pacific Grove, CA Full Reporter "
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"ntd_profile"
]
@@ -589,7 +675,7 @@
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": null,
"id": "b91470ac-fa05-4083-8352-f5adf73712ed",
"metadata": {},
"outputs": [],
@@ -599,339 +685,88 @@
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": null,
"id": "7c847cb9-4d2e-483d-a8c2-af3be35af44c",
"metadata": {},
"outputs": [],
"source": [
"# Keep only rows that are found in both schedule and real time data\n",
- "schd_vp_df = (pd.read_parquet(schd_vp_url, \n",
- " filters=[[(\"organization_name\", \"==\", organization_name),\n",
- " (\"sched_rt_category\", \"==\", \"schedule_and_vp\")]])\n",
- " )"
+ "schd_vp_df = pd.read_parquet(\n",
+ " schd_vp_url,\n",
+ " filters=[\n",
+ " [\n",
+ " (\"organization_name\", \"==\", organization_name),\n",
+ " (\"sched_rt_category\", \"==\", \"schedule_and_vp\"),\n",
+ " ]\n",
+ " ],\n",
+ ")"
]
},
{
"cell_type": "code",
- "execution_count": 59,
+ "execution_count": null,
"id": "a5ff575d-9722-49c2-b27f-a7fa8488f9b9",
"metadata": {},
"outputs": [],
"source": [
- "schd_vp_df_gtfskeys = schd_vp_df[[\"schedule_gtfs_dataset_key\",\"service_date\"]].drop_duplicates()"
+ "schd_vp_df_gtfskeys = schd_vp_df[\n",
+ " [\"schedule_gtfs_dataset_key\", \"service_date\"]\n",
+ "].drop_duplicates()"
]
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": null,
"id": "d10abab2-4994-42b6-a745-d9bf792e8e8b",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " schedule_gtfs_dataset_key | \n",
- " direction_id | \n",
- " time_period | \n",
- " avg_scheduled_service_minutes | \n",
- " avg_stop_miles | \n",
- " n_scheduled_trips | \n",
- " frequency | \n",
- " is_express | \n",
- " is_rapid | \n",
- " is_rail | \n",
- " is_coverage | \n",
- " is_downtown_local | \n",
- " is_local | \n",
- " service_date | \n",
- " typology | \n",
- " minutes_atleast1_vp | \n",
- " minutes_atleast2_vp | \n",
- " total_rt_service_minutes | \n",
- " total_scheduled_service_minutes | \n",
- " total_vp | \n",
- " vp_in_shape | \n",
- " is_early | \n",
- " is_ontime | \n",
- " is_late | \n",
- " n_vp_trips | \n",
- " vp_per_minute | \n",
- " pct_in_shape | \n",
- " pct_rt_journey_atleast1_vp | \n",
- " pct_rt_journey_atleast2_vp | \n",
- " pct_sched_journey_atleast1_vp | \n",
- " pct_sched_journey_atleast2_vp | \n",
- " rt_sched_journey_ratio | \n",
- " avg_rt_service_minutes | \n",
- " schedule_source_record_id_x | \n",
- " sched_rt_category | \n",
- " speed_mph | \n",
- " name | \n",
- " route_long_name | \n",
- " route_short_name | \n",
- " route_combined_name | \n",
- " route_id | \n",
- " schedule_source_record_id_y | \n",
- " base64_url | \n",
- " organization_source_record_id | \n",
- " organization_name | \n",
- " caltrans_district | \n",
- " route_primary_direction | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 162562 | \n",
- " 88d9aa978e4ca97e5ba1dbbc20f3fc19 | \n",
- " 0.00 | \n",
- " all_day | \n",
- " 22.00 | \n",
- " 0.13 | \n",
- " 12 | \n",
- " 0.50 | \n",
- " 0.00 | \n",
- " 0.00 | \n",
- " 0.00 | \n",
- " 1.00 | \n",
- " 1.00 | \n",
- " 0.00 | \n",
- " 2023-03-15 | \n",
- " downtown_local | \n",
- " 241 | \n",
- " 210 | \n",
- " 259.08 | \n",
- " 264.00 | \n",
- " 464 | \n",
- " 427 | \n",
- " 5 | \n",
- " 4 | \n",
- " 3 | \n",
- " 12 | \n",
- " 1.79 | \n",
- " 0.92 | \n",
- " 0.93 | \n",
- " 0.81 | \n",
- " 0.91 | \n",
- " 0.80 | \n",
- " 0.98 | \n",
- " 21.59 | \n",
- " None | \n",
- " schedule_and_vp | \n",
- " 13.38 | \n",
- " Monterey Salinas Schedule | \n",
- " Monterey - PG via Asilomar | \n",
- " 1 | \n",
- " 1 Monterey - PG via Asilomar | \n",
- " 001 | \n",
- " recysP9m9kjCJwHZe | \n",
- " aHR0cHM6Ly93d3cubXN0Lm9yZy9nb29nbGUvZ29vZ2xlX3RyYW5zaXQuemlw | \n",
- " receZJ9sEnP9vy3g0 | \n",
- " Monterey-Salinas Transit | \n",
- " 05 - San Luis Obispo | \n",
- " Eastbound | \n",
- "
\n",
- " \n",
- " 162563 | \n",
- " 88d9aa978e4ca97e5ba1dbbc20f3fc19 | \n",
- " 0.00 | \n",
- " all_day | \n",
- " 22.00 | \n",
- " 0.13 | \n",
- " 12 | \n",
- " 0.50 | \n",
- " 0.00 | \n",
- " 0.00 | \n",
- " 0.00 | \n",
- " 1.00 | \n",
- " 1.00 | \n",
- " 0.00 | \n",
- " 2023-03-15 | \n",
- " downtown_local | \n",
- " 241 | \n",
- " 210 | \n",
- " 259.08 | \n",
- " 264.00 | \n",
- " 464 | \n",
- " 427 | \n",
- " 5 | \n",
- " 4 | \n",
- " 3 | \n",
- " 12 | \n",
- " 1.79 | \n",
- " 0.92 | \n",
- " 0.93 | \n",
- " 0.81 | \n",
- " 0.91 | \n",
- " 0.80 | \n",
- " 0.98 | \n",
- " 21.59 | \n",
- " None | \n",
- " schedule_and_vp | \n",
- " 13.38 | \n",
- " Monterey Salinas Schedule | \n",
- " Monterey - PG via Asilomar | \n",
- " 1 | \n",
- " 1 Monterey - PG via Asilomar | \n",
- " 001 | \n",
- " recysP9m9kjCJwHZe | \n",
- " aHR0cHM6Ly93d3cubXN0Lm9yZy9nb29nbGUvZ29vZ2xlX3RyYW5zaXQuemlw | \n",
- " receZJ9sEnP9vy3g0 | \n",
- " Monterey-Salinas Transit | \n",
- " 05 - San Luis Obispo | \n",
- " Eastbound | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " schedule_gtfs_dataset_key direction_id time_period \\\n",
- "162562 88d9aa978e4ca97e5ba1dbbc20f3fc19 0.00 all_day \n",
- "162563 88d9aa978e4ca97e5ba1dbbc20f3fc19 0.00 all_day \n",
- "\n",
- " avg_scheduled_service_minutes avg_stop_miles n_scheduled_trips \\\n",
- "162562 22.00 0.13 12 \n",
- "162563 22.00 0.13 12 \n",
- "\n",
- " frequency is_express is_rapid is_rail is_coverage \\\n",
- "162562 0.50 0.00 0.00 0.00 1.00 \n",
- "162563 0.50 0.00 0.00 0.00 1.00 \n",
- "\n",
- " is_downtown_local is_local service_date typology \\\n",
- "162562 1.00 0.00 2023-03-15 downtown_local \n",
- "162563 1.00 0.00 2023-03-15 downtown_local \n",
- "\n",
- " minutes_atleast1_vp minutes_atleast2_vp total_rt_service_minutes \\\n",
- "162562 241 210 259.08 \n",
- "162563 241 210 259.08 \n",
- "\n",
- " total_scheduled_service_minutes total_vp vp_in_shape is_early \\\n",
- "162562 264.00 464 427 5 \n",
- "162563 264.00 464 427 5 \n",
- "\n",
- " is_ontime is_late n_vp_trips vp_per_minute pct_in_shape \\\n",
- "162562 4 3 12 1.79 0.92 \n",
- "162563 4 3 12 1.79 0.92 \n",
- "\n",
- " pct_rt_journey_atleast1_vp pct_rt_journey_atleast2_vp \\\n",
- "162562 0.93 0.81 \n",
- "162563 0.93 0.81 \n",
- "\n",
- " pct_sched_journey_atleast1_vp pct_sched_journey_atleast2_vp \\\n",
- "162562 0.91 0.80 \n",
- "162563 0.91 0.80 \n",
- "\n",
- " rt_sched_journey_ratio avg_rt_service_minutes \\\n",
- "162562 0.98 21.59 \n",
- "162563 0.98 21.59 \n",
- "\n",
- " schedule_source_record_id_x sched_rt_category speed_mph \\\n",
- "162562 None schedule_and_vp 13.38 \n",
- "162563 None schedule_and_vp 13.38 \n",
- "\n",
- " name route_long_name \\\n",
- "162562 Monterey Salinas Schedule Monterey - PG via Asilomar \n",
- "162563 Monterey Salinas Schedule Monterey - PG via Asilomar \n",
- "\n",
- " route_short_name route_combined_name route_id \\\n",
- "162562 1 1 Monterey - PG via Asilomar 001 \n",
- "162563 1 1 Monterey - PG via Asilomar 001 \n",
- "\n",
- " schedule_source_record_id_y \\\n",
- "162562 recysP9m9kjCJwHZe \n",
- "162563 recysP9m9kjCJwHZe \n",
- "\n",
- " base64_url \\\n",
- "162562 aHR0cHM6Ly93d3cubXN0Lm9yZy9nb29nbGUvZ29vZ2xlX3RyYW5zaXQuemlw \n",
- "162563 aHR0cHM6Ly93d3cubXN0Lm9yZy9nb29nbGUvZ29vZ2xlX3RyYW5zaXQuemlw \n",
- "\n",
- " organization_source_record_id organization_name \\\n",
- "162562 receZJ9sEnP9vy3g0 Monterey-Salinas Transit \n",
- "162563 receZJ9sEnP9vy3g0 Monterey-Salinas Transit \n",
- "\n",
- " caltrans_district route_primary_direction \n",
- "162562 05 - San Luis Obispo Eastbound \n",
- "162563 05 - San Luis Obispo Eastbound "
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"schd_vp_df.head(2)"
]
},
{
"cell_type": "code",
- "execution_count": 61,
+ "execution_count": null,
"id": "13dfc48a-6f10-4f7e-9307-686f64c8fcfc",
"metadata": {},
"outputs": [],
"source": [
"schedule_by_route = merge_data.concatenate_schedule_by_route_direction(\n",
- " analysis_date_list\n",
- " )"
+ " analysis_date_list\n",
+ ")"
]
},
{
"cell_type": "code",
- "execution_count": 62,
+ "execution_count": null,
"id": "2cce00c2-a8e4-4c11-b157-f8e98b9018d3",
"metadata": {},
"outputs": [],
"source": [
- "schedule_by_route_gtfskeys = schedule_by_route[[\"schedule_gtfs_dataset_key\",\"service_date\"]].drop_duplicates()"
+ "schedule_by_route_gtfskeys = schedule_by_route[\n",
+ " [\"schedule_gtfs_dataset_key\", \"service_date\"]\n",
+ "].drop_duplicates()"
]
},
{
"cell_type": "code",
- "execution_count": 63,
+ "execution_count": null,
"id": "be70ebf2-b2f5-4070-a7ee-954952d9674a",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "_merge \n",
- "right_only 1675\n",
- "both 1593\n",
- "left_only 55\n",
- "dtype: int64"
- ]
- },
- "execution_count": 63,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "pd.merge(df_avg_speeds_gtfskeys, schedule_by_route_gtfskeys, on = [\"schedule_gtfs_dataset_key\",\"service_date\"],\n",
- " how = \"outer\", indicator = True)[[\"_merge\"]].value_counts()"
+ "pd.merge(\n",
+ " df_avg_speeds_gtfskeys,\n",
+ " schedule_by_route_gtfskeys,\n",
+ " on=[\"schedule_gtfs_dataset_key\", \"service_date\"],\n",
+ " how=\"outer\",\n",
+ " indicator=True,\n",
+ ")[[\"_merge\"]].value_counts()"
]
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": null,
"id": "dbd0f246-4616-4da5-9c51-b53abbcc8c9a",
"metadata": {},
"outputs": [],
@@ -941,7 +776,7 @@
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": null,
"id": "40700f2b-2ccd-46e2-982b-e4306d734654",
"metadata": {},
"outputs": [],
@@ -961,192 +796,60 @@
},
{
"cell_type": "code",
- "execution_count": 54,
+ "execution_count": null,
"id": "16d50718-c09a-4e4f-bab7-90c7b6ea3f16",
"metadata": {},
"outputs": [],
"source": [
- "df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(\n",
- " analysis_date_list\n",
- " )"
+ "df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)"
]
},
{
"cell_type": "code",
- "execution_count": 55,
+ "execution_count": null,
"id": "9cff0c4b-50bf-4e5f-8ad5-8eab93b6431a",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['2023-04-12T00:00:00.000000000', '2023-05-17T00:00:00.000000000',\n",
- " '2023-06-14T00:00:00.000000000', '2023-07-12T00:00:00.000000000',\n",
- " '2023-08-15T00:00:00.000000000', '2023-09-13T00:00:00.000000000',\n",
- " '2023-10-11T00:00:00.000000000', '2023-11-15T00:00:00.000000000',\n",
- " '2023-12-13T00:00:00.000000000', '2024-01-17T00:00:00.000000000',\n",
- " '2024-02-14T00:00:00.000000000', '2024-03-13T00:00:00.000000000',\n",
- " '2024-04-17T00:00:00.000000000', '2024-05-22T00:00:00.000000000',\n",
- " '2024-06-12T00:00:00.000000000', '2024-07-17T00:00:00.000000000',\n",
- " '2024-08-14T00:00:00.000000000', '2024-09-18T00:00:00.000000000',\n",
- " '2024-10-16T00:00:00.000000000', '2024-11-13T00:00:00.000000000',\n",
- " '2024-12-11T00:00:00.000000000', '2023-03-15T00:00:00.000000000'],\n",
- " dtype='datetime64[ns]')"
- ]
- },
- "execution_count": 55,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df_avg_speeds.service_date.unique()"
]
},
{
"cell_type": "code",
- "execution_count": 56,
+ "execution_count": null,
"id": "9268e7bd-5f99-46de-975b-327fe7e72c9b",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " schedule_gtfs_dataset_key | \n",
- " route_id | \n",
- " direction_id | \n",
- " time_period | \n",
- " speed_mph | \n",
- " service_date | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
- " 17 | \n",
- " 0.00 | \n",
- " all_day | \n",
- " 16.63 | \n",
- " 2023-04-12 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
- " 17 | \n",
- " 0.00 | \n",
- " all_day | \n",
- " 15.72 | \n",
- " 2023-05-17 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
- " 17 | \n",
- " 0.00 | \n",
- " all_day | \n",
- " 15.17 | \n",
- " 2023-06-14 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
- " 17 | \n",
- " 0.00 | \n",
- " all_day | \n",
- " 15.41 | \n",
- " 2023-07-12 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
- " 17 | \n",
- " 0.00 | \n",
- " all_day | \n",
- " 15.06 | \n",
- " 2023-08-15 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " schedule_gtfs_dataset_key route_id direction_id time_period \\\n",
- "0 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n",
- "1 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n",
- "2 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n",
- "3 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n",
- "4 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n",
- "\n",
- " speed_mph service_date \n",
- "0 16.63 2023-04-12 \n",
- "1 15.72 2023-05-17 \n",
- "2 15.17 2023-06-14 \n",
- "3 15.41 2023-07-12 \n",
- "4 15.06 2023-08-15 "
- ]
- },
- "execution_count": 56,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df_avg_speeds.head()"
]
},
{
"cell_type": "code",
- "execution_count": 58,
+ "execution_count": null,
"id": "670ec966-66d9-4405-9887-03cec3340e45",
"metadata": {},
"outputs": [],
"source": [
- "df_avg_speeds_gtfskeys = df_avg_speeds[[\"schedule_gtfs_dataset_key\",\"service_date\"]].drop_duplicates()"
+ "df_avg_speeds_gtfskeys = df_avg_speeds[\n",
+ " [\"schedule_gtfs_dataset_key\", \"service_date\"]\n",
+ "].drop_duplicates()"
]
},
{
"cell_type": "code",
- "execution_count": 60,
+ "execution_count": null,
"id": "6187a364-c90c-462c-b1cd-e3171a5651f4",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "_merge \n",
- "left_only 1626\n",
- "both 22\n",
- "right_only 0\n",
- "dtype: int64"
- ]
- },
- "execution_count": 60,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pd.merge(df_avg_speeds_gtfskeys, schd_vp_df_gtfskeys, on = [\"schedule_gtfs_dataset_key\",\"service_date\"],\n",
- " how = \"outer\", indicator = True)[[\"_merge\"]].value_counts()"
+ "outputs": [],
+ "source": [
+ "pd.merge(\n",
+ " df_avg_speeds_gtfskeys,\n",
+ " schd_vp_df_gtfskeys,\n",
+ " on=[\"schedule_gtfs_dataset_key\", \"service_date\"],\n",
+ " how=\"outer\",\n",
+ " indicator=True,\n",
+ ")[[\"_merge\"]].value_counts()"
]
}
],
diff --git a/gtfs_digest/45_missing_routes2.ipynb b/gtfs_digest/45_missing_routes2.ipynb
new file mode 100644
index 000000000..9488b17e9
--- /dev/null
+++ b/gtfs_digest/45_missing_routes2.ipynb
@@ -0,0 +1,1001 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "df9388c2-122f-470c-8b96-4f7cbffea26f",
+ "metadata": {},
+ "source": [
+ "## Finding Missing Routes\n",
+ "* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capitol Corridor doesn't have any rail routes. \n",
+ "* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)\n",
+ "* Rerun all the scripts that create the underlying dataframes for the November date (`df_sched`, `df_avg_speeds`, `df_rt_sched`) and merge them using `gtfs_digest/merge_data.merge_data_sources_by_route_direction()`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "d898cfa3-466e-4ca2-8484-e381b6fc4ce1",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "SyntaxError",
+ "evalue": "invalid syntax (_section2_utils.py, line 896)",
+ "output_type": "error",
+ "traceback": [
+ "Traceback \u001b[0;36m(most recent call last)\u001b[0m:\n",
+ "\u001b[0m File \u001b[1;32m/opt/conda/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3508\u001b[0m in \u001b[1;35mrun_code\u001b[0m\n exec(code_obj, self.user_global_ns, self.user_ns)\u001b[0m\n",
+ "\u001b[0;36m Cell \u001b[0;32mIn[1], line 1\u001b[0;36m\n\u001b[0;31m import _section2_utils\u001b[0;36m\n",
+ "\u001b[0;36m File \u001b[0;32m~/data-analyses/gtfs_digest/_section2_utils.py:896\u001b[0;36m\u001b[0m\n\u001b[0;31m y_col = \"Speed (MPH)\",\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
+ ]
+ }
+ ],
+ "source": [
+ "import _section2_utils\n",
+ "import geopandas as gpd\n",
+ "import merge_data\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from segment_speed_utils import gtfs_schedule_wrangling\n",
+ "from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7a5a0cb2-d314-47aa-886c-5ebdf143905b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.options.display.max_columns = 100\n",
+ "pd.options.display.float_format = \"{:.2f}\".format\n",
+ "pd.set_option(\"display.max_rows\", None)\n",
+ "pd.set_option(\"display.max_colwidth\", None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c4d1d951-101a-4bed-8774-d2c3ff1605e9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "org_name_lists = [\"Capitol Corridor Joint Powers Authority\", \"City of Santa Maria\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "efde4bc7-fd20-4c73-9ec4-6982d4643e39",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "analysis_date_list = [\"2024-11-13\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "93682bff-3d64-4d60-83a6-98234cc2bbdd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "one_analysis_date = \"2024-11-13\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "69a13fbc-7af0-408d-b4f1-1a78a35ffa86",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "schd_keys = [\n",
+ " \"5a8721fe96786fcd25fba1f8a0ee6358\",\n",
+ " \"73105f2d1cabc8170ab066d96863c5d5\",\n",
+ " \"f5a749dd65924e025b1293c58f95f8d6\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b3197201-0f2c-471e-bc84-fb518e9a2c93",
+ "metadata": {},
+ "source": [
+ "### Run the scripts that create the following dataframes for November.\n",
+ "* `df_sched`: `gtfs_funnel/schedule_stats_by_route_direction`\n",
+ "* `df_rt_sched`: `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`\n",
+ "* `df_avg_speeds`: `rt_segment_speeds/script/average_summary_speed`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "c1bfd907-5907-4f08-a841-27ff992b10fb",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'RT_SCHED_GCS' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[5], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# df_sched\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mRT_SCHED_GCS\u001b[49m\n",
+ "\u001b[0;31mNameError\u001b[0m: name 'RT_SCHED_GCS' is not defined"
+ ]
+ }
+ ],
+ "source": [
+ "# df_sched\n",
+ "RT_SCHED_GCS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d377f69c-b363-4b1d-889b-941a88eede10",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ae25396c-3eb6-4c2a-b036-ffda5c481b5a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ROUTE_DIR_EXPORT"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7e3a136e-f4a0-4943-b603-0435a759bfbe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_schedule = pd.read_parquet(\n",
+ " \"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-11-13.parquet\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "266efbce-6c7a-4cc4-84ee-f82de18cd0c6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_schedule.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c7dd1435-60ee-4894-bcd1-69b1ad8c1c41",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "filtered_df_schedule = df_schedule.loc[\n",
+ " df_schedule.schedule_gtfs_dataset_key.isin(schd_keys)\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "440e76d9-43f2-495b-9f97-bc0f77c44435",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "filtered_df_schedule.route_id.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c7b96e3b-d0ef-4c56-b163-292745a9e7e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# df_avg_speeds\n",
+ "segment_type = \"rt_stop_times\"\n",
+ "\n",
+ "dict_inputs = GTFS_DATA_DICT[segment_type]\n",
+ "ROUTE_DIR_FILE = dict_inputs[\"route_dir_single_summary\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c9b29c6e-6255-43ef-9409-94c152acfa93",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "SEGMENT_GCS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ca8c3fc6-a08f-4c49-b020-266c50b9a49e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ROUTE_DIR_FILE"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3e58057d-4851-4da5-b30b-873031266279",
+ "metadata": {},
+ "source": [
+ "#### The average speeds dataframe is missing a lot of data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b012e83-e2c1-4198-a35f-8147a20dc6c6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_avg_speeds = pd.read_parquet(\n",
+ " \"gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_route_dir_2024-11-13.parquet\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8741de0f-531d-4a47-9c7d-fae01fe91c1f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "filtered_df_avg_speeds = df_avg_speeds.loc[\n",
+ " df_avg_speeds.schedule_gtfs_dataset_key.isin(schd_keys)\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1fb0dfbe-8574-4427-87a5-b514aa77c753",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "filtered_df_avg_speeds.route_id.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cdd75c43-d2e7-49bd-972e-4a28c05feedb",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "df_avg_speeds.loc[\n",
+ " df_avg_speeds.organization_name == \"Marin County Transit District\"\n",
+ "].drop(columns=[\"geometry\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e2defe91-ac5c-44dd-8ab2-44ae06a22a61",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# filtered_df_avg_speeds[[ 'route_id', 'direction_id', 'time_period','speed_mph']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b31aa6db-06cc-4af2-b27e-ad18b58f45bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# df_rt_sched\n",
+ "RT_SCHED_GCS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b4aaed52-8c8d-4368-892c-a4a6ffbc2a3a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "26a12c09-82a5-4199-801f-e374bb20b361",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched = pd.read_parquet(\n",
+ " \"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/vp_route_dir/route_direction_metrics_2024-11-13.parquet\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2dbe65fb-f60d-431e-b8f4-143db6cfa5da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched.columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a8209979-ea65-44e6-92bc-94c1d43e4e57",
+ "metadata": {},
+ "source": [
+ "### Open up original file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a4df5906-9ef9-49e6-875b-95b36afa4063",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "schd_vp_url = f\"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "185e680c-6027-47fa-b488-cb0b20e27a71",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "schd_vp_url"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "534a48bf-de90-460c-9515-3d0e5519274d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "schd_vp_df = pd.read_parquet(schd_vp_url)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "20bf983a-bcc5-49ff-be63-0ef4207b801f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "schd_vp_df2 = schd_vp_df.loc[schd_vp_df.organization_name.isin(org_name_lists)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3becbc8b-4098-4b16-9bae-6aa50bd658f7",
+ "metadata": {},
+ "source": [
+ "### Merge all the files based on `gtfs_digest/merge_data`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3d6a52e5-1981-4eca-b166-76abe1420dfc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "service_date_datetime = pd.to_datetime(\"2024-11-13T00:00:00.000000000\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87556b98-3ac3-46ee-b9ef-eb0a10f29dab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_schedule[\"service_date\"] = service_date_datetime"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "34d44c3d-ee9e-4fc5-b16c-7e2448b81d40",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_rt_sched[\"service_date\"] = service_date_datetime"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a67a9a2a-9045-44de-84f5-7d5c1a678dfc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_avg_speeds[\"service_date\"] = service_date_datetime"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "305c73b1-41f3-4237-ac0c-156097237e42",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "314c4209-f8f9-4c14-ba02-5b77705721a8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_time_cols = [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"route_id\",\n",
+ " \"direction_id\",\n",
+ " \"time_period\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c4506f3c-3ed7-4648-a6bf-9faaa15cfcf2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "primary_typology = merge_data.set_primary_typology(df_schedule)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "09fc686a-6773-4bab-a0f3-c0342ab382db",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_schedule2 = pd.merge(df_schedule, primary_typology, on=route_time_cols, how=\"left\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "185d615b-cf0d-45ae-ab89-7e5ab1bab7c8",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d35b1214-e648-4d58-80aa-baca192bcbf4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.merge(\n",
+ " df_schedule2,\n",
+ " df_rt_sched,\n",
+ " on=route_time_cols + [\"service_date\"],\n",
+ " how=\"outer\",\n",
+ " indicator=\"sched_rt_category\",\n",
+ ").merge(\n",
+ " df_avg_speeds,\n",
+ " on=route_time_cols + [\"service_date\"],\n",
+ " how=\"outer\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0a31a83c-c7f0-4d9d-a30a-12672bd5de54",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = (\n",
+ " df.assign(\n",
+ " sched_rt_category=df.sched_rt_category.map(\n",
+ " gtfs_schedule_wrangling.sched_rt_category_dict\n",
+ " )\n",
+ " )\n",
+ " .pipe(\n",
+ " merge_data.merge_in_standardized_route_names,\n",
+ " )\n",
+ " .merge(\n",
+ " df_crosswalk,\n",
+ " on=[\"schedule_gtfs_dataset_key\", \"name\", \"service_date\"],\n",
+ " how=\"left\",\n",
+ " )\n",
+ " .pipe(\n",
+ " # Find the most common cardinal direction\n",
+ " gtfs_schedule_wrangling.top_cardinal_direction\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7a51500f-6155-4c50-b484-3f05767d47ca",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df.rename(columns={\"n_trips\": \"n_scheduled_trips\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "67050044-6890-4230-bad8-e7eead2e890c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "integrify = [\n",
+ " \"n_scheduled_trips\",\n",
+ " \"n_vp_trips\",\n",
+ " \"minutes_atleast1_vp\",\n",
+ " \"minutes_atleast2_vp\",\n",
+ " \"total_vp\",\n",
+ " \"vp_in_shape\",\n",
+ " \"is_early\",\n",
+ " \"is_ontime\",\n",
+ " \"is_late\",\n",
+ "]\n",
+ "\n",
+ "df[integrify] = df[integrify].fillna(0).astype(\"int\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "88bfc756-435d-48a9-8ac0-81c49ef96933",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "repeated_y_cols = list([col for col in df.columns if \"_y\" in col.lower()])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "47042af9-1976-4498-88d3-7211fd1fbd05",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df.drop(columns=repeated_y_cols)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "70781682-a47c-448d-b634-047fcb60abf9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "repeated_x_cols = list([col for col in df.columns if \"_x\" in col.lower()])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e3fd6d14-9f4e-4954-b32e-bb81382b7299",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df.drop(columns=repeated_x_cols)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4f1ab64c-1da2-4898-a4d5-6534f1580a0e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fbb26e1d-3325-4b90-865a-668d777aaa49",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)].route_id.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f461c76c-b5fa-453c-ae1a-c90ec5e6a437",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b4c1331-43e7-4a72-8bef-760114faf9a7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.sched_rt_category.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "36d41d75-3390-4a32-a8c6-6da23d675862",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "filtered_df = df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f16bbe7a-c1db-41ac-8d52-81627218da4c",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "filtered_df[\n",
+ " [\n",
+ " \"organization_name\",\n",
+ " \"route_combined_name\",\n",
+ " \"sched_rt_category\",\n",
+ " \"speed_mph\",\n",
+ " \"frequency\",\n",
+ " \"direction_id\",\n",
+ " ]\n",
+ "].drop_duplicates()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "df87ba57-ed93-40b0-9177-512d89d7995e",
+ "metadata": {},
+ "source": [
+ "### Save this temporarily "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d669b728-d3cd-4e74-941b-3f82e87d071c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.to_parquet(\n",
+ " \"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics_AH_TESTING.parquet\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2cd6bf6f-24f4-4b5e-aa83-2dc6f7266304",
+ "metadata": {},
+ "source": [
+ "### Check for speeds again"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f5606dd1-caf2-48ba-935d-6ffa24a76b1a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "organization_name = \"Marin County Transit District\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9808f1d8-c2f7-4276-a46b-c495dea1fcde",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_col = \"Speed (MPH)\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "270007af-fc3c-4bba-af03-a754f9f972ef",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "marin_county = _section2_utils.load_schedule_vp_metrics(organization_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "da4738aa-1dd7-42ba-b51e-2fcb94031d37",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "marin_county[\n",
+ " [\"GTFS Availability\", \"Route\", \"Route ID\", \"Direction\", \"Period\", \"Speed (MPH)\"]\n",
+ "].sort_values(by=\"Route ID\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "40813f67-aad6-4a17-b6fb-aac0c543457e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "marin_county_route_29 = marin_county.loc[\n",
+ " marin_county.Route == \"29 Downtown San Rafael - E. Corte Madera\"\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cbd8bccd-4815-43a5-a8ed-1b8ca1a89501",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import altair as alt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "229abd52-438d-4ceb-bb88-e07ca6eb00d2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "routes_list = marin_county[\"Route\"].unique().tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f1e259f9-1a20-41d4-9e9a-408f055495e3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "_section2_utils.base_facet_line(marin_county_route_29, y_col, \"Testing\", \"Testing\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "125dad9f-289d-4330-add4-6be8a9e48694",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "max_y = _section2_utils.set_y_axis(marin_county_route_29, y_col)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b989d1c1-36cc-4227-a51d-f6052f6da959",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "max_y"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7ef27ac7-4723-43ad-8024-5ea120cc72c6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "marin_county_route_29 = _section2_utils.clean_data_charts(marin_county_route_29, y_col)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0f7929cc-a6a4-47e9-b949-6c8473aca2a4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "marin_county_route_29[[\"dir_0_1\", \"Direction\", \"Period\", \"Speed (MPH)\", \"Date\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f267e527-56da-403e-a3ed-ba191ae62760",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import _report_utils"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "55256f17-5726-4ec6-ba80-955e79fa14be",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import yaml\n",
+ "\n",
+ "with open(\"color_palettes.yml\") as f:\n",
+ " color_dict = yaml.safe_load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0bd0dc9f-063b-4756-8106-5e3a9af90068",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(\"readable.yml\") as f:\n",
+ " readable_dict = yaml.safe_load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d05e4bda-c521-480c-ab93-a95a72df00e0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "readable_dict[\"frequency_graph\"][\"title\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6b9b7787-7dba-4004-a0d6-63966d00f7a8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "(readable_dict[\"frequency_graph\"][\"title\"] + \" Test\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "88e2a952-54ed-4c00-98e9-8cfa52a3b6bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "alt.Chart(\n",
+ " marin_county_route_29.loc[marin_county_route_29.dir_0_1 == 1]\n",
+ ").mark_bar(size=10).encode(\n",
+ " x=\"yearmonthdate(Date):O\",\n",
+ " y=\"Speed (MPH):Q\",\n",
+ " color=alt.Color(\n",
+ " \"Period:N\",\n",
+ " title=_report_utils.labeling(\"Period\"),\n",
+ " scale=alt.Scale(range=color_dict[\"tri_color\"]),\n",
+ " ),\n",
+ ").facet(column=alt.Column(\"Period:N\", title=_report_utils.labeling(\"Direction\")),\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3a4c41fe-2f46-4116-a956-3fc57cae4732",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "readable_dict[\"speed_graph\"][\"title\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2966000b-29fa-4f54-b8c9-4c024811671c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "_section2_utils.grouped_bar_chart(\n",
+ " df = marin_county_route_29.loc[marin_county_route_29.dir_0_1 == 1],\n",
+ " color_col = \"Period\",\n",
+ " y_col = \"Speed (MPH)\",\n",
+ " offset_col = \"Period\",\n",
+ " title=readable_dict[\"speed_graph\"][\"title\"],\n",
+ " subtitle= readable_dict[\"speed_graph\"][\"subtitle\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "32fa65ef-40f1-4289-a816-659f8b882a43",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "(\n",
+ " alt.Chart(marin_county_route_29.loc[marin_county_route_29.dir_0_1 == 0])\n",
+ " .mark_line(size=3)\n",
+ " .encode(\n",
+ " x=alt.X(\n",
+ " \"yearmonthdate(Date):O\",\n",
+ " title=\"Date\",\n",
+ " axis=alt.Axis(labelAngle=-45, format=\"%b %Y\"),\n",
+ " ),\n",
+ " y=alt.Y(\n",
+ " f\"{y_col}:Q\",\n",
+ " title=_report_utils.labeling(y_col),\n",
+ " scale=alt.Scale(domain=[0, max_y]),\n",
+ " ),\n",
+ " color=alt.Color(\n",
+ " \"Period:N\",\n",
+ " title=_report_utils.labeling(\"Period\"),\n",
+ " scale=alt.Scale(range=color_dict[\"tri_color\"]),\n",
+ " ),\n",
+ " )\n",
+ ").properties(width=200, height=250)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f2036267-3a9d-4688-bdcb-062d45a48eca",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/gtfs_digest/README.md b/gtfs_digest/README.md
index 274b6f533..e1dc97236 100644
--- a/gtfs_digest/README.md
+++ b/gtfs_digest/README.md
@@ -1,9 +1,11 @@
# General Transit Feed Specification (GTFS) Digest
-The goal of this website is to give you an overview of transit operators that produce GTFS schedule and/or real-time data. We use data from the [National Transit Database](https://www.transit.dot.gov/ntd), [National Association of City Transportation Officials's Transit Route Types](https://nacto.org/publication/transit-street-design-guide/introduction/service-context/transit-route-types/), and [GTFS feeds](https://gtfs.org/) to deliver key insights. You can find details such as the types of routes and the total scheduled hours of public transit service for which an operator runs.
+The goal of this website is to give you an overview of transit operators that produce GTFS schedule and/or real-time data at the individual operator, Caltrans district, or legislative district level.
-For operators who produce real-time data, we also calculate additional performance metrics for all of their routes. Examples include displaying the number of on-time, early, and late trips, the average speed, and the headway for a route.
+We use data from the [National Transit Database](https://www.transit.dot.gov/ntd), [National Association of City Transportation Officials’ Transit Route Types](https://nacto.org/publication/transit-street-design-guide/introduction/service-context/transit-route-types/), and [GTFS feeds](https://gtfs.org/) to deliver key insights. You can find details such as the types of routes and the total scheduled hours of public transit service that an operator runs.
-GTFS Digest will continue to evolve as we dive into our own data warehouse!
+For operators who produce real-time data, we also calculate additional performance metrics for all their routes. Examples include displaying the number of on-time, early, and late trips, the average speed, and the headway for a route.
+
+GTFS Digest will continue to evolve as we dive into our own data warehouse!
## Definitions and Methodology
To read about the methodology behind and the definitions of terms used throughout our work, please visit [here](https://github.com/cal-itp/data-analyses/blob/main/gtfs_digest/methodology.md).
diff --git a/gtfs_digest/_section2_utils.py b/gtfs_digest/_section2_utils.py
index 04aa433c4..6db3fa39a 100644
--- a/gtfs_digest/_section2_utils.py
+++ b/gtfs_digest/_section2_utils.py
@@ -18,7 +18,9 @@
# Data Dictionary
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
+
import yaml
+
with open("readable.yml") as f:
readable_dict = yaml.safe_load(f)
@@ -34,11 +36,11 @@ def load_schedule_vp_metrics(organization:str)->pd.DataFrame:
Load schedule versus realtime file.
"""
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"
-
+ # schd_vp_url = "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics_AH_TESTING.parquet"
# Keep only rows that are found in both schedule and real time data
df = (pd.read_parquet(schd_vp_url,
filters=[[("organization_name", "==", organization),
- ("sched_rt_category", "==", "schedule_and_vp")]])
+ ("sched_rt_category", "in", ["schedule_and_vp"])]])
)
# Delete duplicates
@@ -76,6 +78,8 @@ def load_operator_metrics(organization_name:str)->pd.DataFrame:
df = pd.read_parquet(url,
filters=[[(("organization_name", "==", organization_name))]])
+ df = df.drop_duplicates(subset = ["service_date"]).reset_index(drop = True)
+
# Rename dataframe
df = _report_utils.replace_column_names(df)
@@ -288,11 +292,11 @@ def grouped_bar_chart(
chart = (
alt.Chart(df)
- .mark_bar(size=8)
+ .mark_bar(size=5)
.encode(
x=alt.X(
"yearmonthdate(Date):O",
- title=["Grouped by Direction ID", "Date"],
+ title=["Date"],
axis=alt.Axis(labelAngle=-45, format="%b %Y"),
),
y=alt.Y(f"{y_col}:Q", title=_report_utils.labeling(y_col)),
@@ -359,15 +363,17 @@ def base_facet_line(
)
)
- chart = chart.properties(width=200, height=250)
- chart = chart.facet(
- column=alt.Column("Direction:N", title=_report_utils.labeling("Direction")),
- ).properties(
+ chart = chart.properties(width=200, height=250).properties(
title={
"text": [title],
"subtitle": [subtitle],
}
)
+ """
+ chart = chart.facet(
+ column=alt.Column("Direction:N", title=_report_utils.labeling("Direction")),
+ )
+ """
return chart
def base_facet_circle(
@@ -883,12 +889,26 @@ def filtered_route(
.transform_filter(xcol_param)
)
- speed_graph = (
- base_facet_line(
- df,
+ speed_graph_dir_0 = (
+ grouped_bar_chart(
+ df.loc[df.dir_0_1 == 0],
+ "Period",
"Speed (MPH)",
- readable_dict["speed_graph"]["title"],
- readable_dict["speed_graph"]["subtitle"],
+ "Period",
+ readable_dict["speed_graph_dir_0"]["title"],
+ readable_dict["speed_graph_dir_0"]["subtitle"],
+ )
+ .add_params(xcol_param)
+ .transform_filter(xcol_param)
+ )
+ speed_graph_dir_1 = (
+ grouped_bar_chart(
+ df.loc[df.dir_0_1 == 1],
+ "Period",
+ "Speed (MPH)",
+ "Period",
+ readable_dict["speed_graph_dir_1"]["title"],
+ readable_dict["speed_graph_dir_0"]["subtitle"],
)
.add_params(xcol_param)
.transform_filter(xcol_param)
@@ -964,7 +984,8 @@ def filtered_route(
timeliness_trips_dir_1,
frequency_graph_dir_0,
frequency_graph_dir_1,
- speed_graph,
+ speed_graph_dir_0,
+ speed_graph_dir_1,
data_quality,
vp_per_min_graph,
sched_vp_per_min,
diff --git a/gtfs_digest/merge_operator_data.py b/gtfs_digest/merge_operator_data.py
index 8516fc479..1eccb4ddd 100644
--- a/gtfs_digest/merge_operator_data.py
+++ b/gtfs_digest/merge_operator_data.py
@@ -32,7 +32,7 @@ def concatenate_operator_routes(
date_list: list
) -> gpd.GeoDataFrame:
FILE = GTFS_DATA_DICT.schedule_tables.operator_routes
-
+
df = time_series_utils.concatenate_datasets_across_dates(
SCHED_GCS,
FILE,
@@ -191,13 +191,22 @@ def operator_category_counts_by_date() -> pd.DataFrame:
)
# Drop duplicates created after merging
+ # Add more stringent drop-duplicate criteria
+
+ duplicate_cols = ["schedule_gtfs_dataset_key",
+ "vp_per_min_agency",
+ "spatial_accuracy_agency",
+ "service_date",
+ "organization_name",
+ "caltrans_district"]
+
op_profiles_df3 = (
op_profiles_df2
.pipe(
publish_utils.exclude_private_datasets,
col = "schedule_gtfs_dataset_key",
public_gtfs_dataset_keys = public_feeds
- ).drop_duplicates(subset = list(op_profiles_df2.columns))
+ ).drop_duplicates(subset = duplicate_cols)
.reset_index(drop = True))
op_profiles_df3.to_parquet(
diff --git a/gtfs_digest/readable.yml b/gtfs_digest/readable.yml
index 57e4f9569..b3d38e9c2 100644
--- a/gtfs_digest/readable.yml
+++ b/gtfs_digest/readable.yml
@@ -108,9 +108,11 @@ timeliness_trips_graph:
frequency_graph:
title: "Frequency of Trips in Minutes"
subtitle: "Understanding how often a trip comes. If the bar says 120 minutes, that means a trip will pass that particular direction once every 2 hours."
-speed_graph:
- title: "Average Speed (MPH)"
+speed_graph_dir_0:
+ title: "Average Speed (MPH) for Direction 0"
subtitle: "The average miles per hour the bus travels by direction and time of day."
+speed_graph_dir_1:
+ title: "Average Speed (MPH) for Direction 1"
vp_per_min_graph:
title: "Vehicle Positions per Minute"
subtitle: "Trips should have 2+ VPs per minute. This metric reflects the accuracy of the temporal data collected."
diff --git a/gtfs_funnel/crosswalk_gtfs_dataset_key_to_organization.py b/gtfs_funnel/crosswalk_gtfs_dataset_key_to_organization.py
index 662977aa6..f9ac6fed8 100644
--- a/gtfs_funnel/crosswalk_gtfs_dataset_key_to_organization.py
+++ b/gtfs_funnel/crosswalk_gtfs_dataset_key_to_organization.py
@@ -205,6 +205,7 @@ def merge_ntd_mobility(year:int)->pd.DataFrame:
# Drop ntd_id from ntd_df to avoid confusion
crosswalk_df = crosswalk_df.drop(columns = ["ntd_id_2022"])
+ # TODO: drop duplicates before exporting, since we're getting a lot of them.
crosswalk_df.to_parquet(
f"{SCHED_GCS}{EXPORT}_{analysis_date}.parquet"
)
diff --git a/gtfs_funnel/operator_scheduled_stats.py b/gtfs_funnel/operator_scheduled_stats.py
index 4363c5801..d3b5c7919 100644
--- a/gtfs_funnel/operator_scheduled_stats.py
+++ b/gtfs_funnel/operator_scheduled_stats.py
@@ -192,7 +192,7 @@ def operator_typology_breakdown(df: pd.DataFrame) -> pd.DataFrame:
).merge(
route_typology_grouped,
on = ["schedule_gtfs_dataset_key", "route_id"],
- how = "inner"
+ how = "left"
).merge(
crosswalk,
on = "schedule_gtfs_dataset_key",
diff --git a/gtfs_funnel/route_typologies.py b/gtfs_funnel/route_typologies.py
index b27aa864e..03153bce8 100644
--- a/gtfs_funnel/route_typologies.py
+++ b/gtfs_funnel/route_typologies.py
@@ -394,6 +394,7 @@ def reconcile_route_and_nacto_typologies(
df3.to_parquet(
f"{SCHED_GCS}{EXPORT}_{analysis_date}.parquet")
+
time1 = datetime.datetime.now()
print(f"route typologies {analysis_date}: {time1 - time0}")
diff --git a/gtfs_funnel/schedule_stats_by_route_direction.py b/gtfs_funnel/schedule_stats_by_route_direction.py
index 41bee317b..38e0b6bfb 100644
--- a/gtfs_funnel/schedule_stats_by_route_direction.py
+++ b/gtfs_funnel/schedule_stats_by_route_direction.py
@@ -23,6 +23,7 @@ def cardinal_direction_for_route_direction(analysis_date:str, dict_inputs:dict):
filters=[[("stop_primary_direction", "!=", "Unknown")]
])
+
trip_scheduled_col = [
"route_id",
"trip_instance_key",
@@ -49,6 +50,10 @@ def cardinal_direction_for_route_direction(analysis_date:str, dict_inputs:dict):
stop_times_with_trip = pd.merge(stop_times_df, trips_df, on = merge_cols)
+ # AH: temporarily fill NaN direction_id values with 0;
+ # this should be fixed upstream in the script that creates stop_times_df
+ stop_times_with_trip.direction_id = stop_times_with_trip.direction_id.fillna(0)
+
main_cols = [
"route_id",
"schedule_gtfs_dataset_key",
@@ -57,7 +62,8 @@ def cardinal_direction_for_route_direction(analysis_date:str, dict_inputs:dict):
agg1 = (
stop_times_with_trip.groupby(
- main_cols + ["stop_primary_direction"]
+ main_cols + ["stop_primary_direction"],
+ dropna=False
)
.agg({"stop_sequence": "count"})
.reset_index()
@@ -136,31 +142,37 @@ def schedule_metrics_by_route_direction(
group_merge_cols: list,
) -> pd.DataFrame:
"""
- Aggregate trip-level metrics to route-direction, and
+ Aggregate trip-level metrics to route-direction, and
attach shape geometry for common_shape_id.
"""
service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(
- df, group_merge_cols, long_or_wide = "long")
-
- metrics_df = (df.groupby(group_merge_cols,
- observed=True, group_keys=False)
- .agg({
- "median_stop_meters": "mean",
- # take mean of the median stop spacing for trip
- # does this make sense?
- # median is the single boiled down metric at the trip-level
- "scheduled_service_minutes": "mean",
- }).reset_index()
- .rename(columns = {
- "median_stop_meters": "avg_stop_meters",
- "scheduled_service_minutes": "avg_scheduled_service_minutes"
- })
- )
-
+ df, group_merge_cols, long_or_wide="long"
+ )
+
+ metrics_df = (
+ df.groupby(group_merge_cols, observed=True, group_keys=False, dropna=False)
+ .agg(
+ {
+ "median_stop_meters": "mean",
+ # take mean of the median stop spacing for trip
+ # does this make sense?
+ # median is the single boiled down metric at the trip-level
+ "scheduled_service_minutes": "mean",
+ }
+ )
+ .reset_index()
+ .rename(
+ columns={
+ "median_stop_meters": "avg_stop_meters",
+ "scheduled_service_minutes": "avg_scheduled_service_minutes",
+ }
+ )
+ )
+
metrics_df = metrics_df.assign(
- avg_stop_miles = metrics_df.avg_stop_meters.divide(METERS_PER_MILE).round(2)
- ).drop(columns = ["avg_stop_meters"])
-
+ avg_stop_miles=metrics_df.avg_stop_meters.divide(METERS_PER_MILE).round(2)
+ ).drop(columns=["avg_stop_meters"])
+
round_me = ["avg_stop_miles", "avg_scheduled_service_minutes"]
metrics_df[round_me] = metrics_df[round_me].round(2)
@@ -168,17 +180,11 @@ def schedule_metrics_by_route_direction(
analysis_date
).pipe(helpers.remove_shapes_outside_ca)
- df = pd.merge(
- common_shape,
- metrics_df,
- on = group_merge_cols,
- how = "inner"
- ).merge(
- service_freq_df,
- on = group_merge_cols,
- how = "inner"
+ df = pd.merge(common_shape, metrics_df, on=group_merge_cols, how="inner").merge(
+ service_freq_df, on=group_merge_cols, how="inner"
)
-
+
+ df.time_period = df.time_period.fillna(df.peak_offpeak)
return df
@@ -195,7 +201,9 @@ def schedule_metrics_by_route_direction(
# Find metrics on the trip grain
trip_metrics = assemble_scheduled_trip_metrics(date, GTFS_DATA_DICT)
-
+
+ trip_metrics.direction_id = trip_metrics.direction_id.fillna(0)
+
trip_metrics.to_parquet(
f"{RT_SCHED_GCS}{TRIP_EXPORT}_{date}.parquet")
@@ -235,4 +243,4 @@ def schedule_metrics_by_route_direction(
)
end = datetime.datetime.now()
- print(f"schedule stats for {date}: {end - start}")
+ print(f"schedule stats for {date}: {end - start}")
\ No newline at end of file
diff --git a/gtfs_funnel/update_vars.py b/gtfs_funnel/update_vars.py
index ae625f3cc..1d2d5aac3 100644
--- a/gtfs_funnel/update_vars.py
+++ b/gtfs_funnel/update_vars.py
@@ -12,7 +12,7 @@
)
-# analysis_date_list = [rt_dates.DATES["dec2024"]]
+# analysis_date_list = [rt_dates.DATES["dec2024"]] + [rt_dates.DATES['nov2024']]
analysis_date_list = all_dates
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
diff --git a/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log b/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log
index 5966dcaed..1352bee1b 100644
--- a/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log
+++ b/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log
@@ -80,3 +80,26 @@
2024-11-20 10:54:08.184 | INFO | __main__:route_metrics:85 - route aggregation 2024-11-13: 0:00:05.430277
2024-12-17 15:40:02.618 | INFO | __main__:route_metrics:85 - route aggregation 2024-12-11: 0:00:02.999985
2024-12-19 09:53:33.513 | INFO | __main__:route_metrics:85 - route aggregation 2024-12-11: 0:00:03.161056
+2025-01-15 15:11:15.095 | INFO | __main__:route_metrics:88 - route aggregation 2024-11-13: 0:00:02.958490
+2025-01-16 15:49:19.011 | INFO | __main__:route_metrics:88 - route aggregation 2024-01-17: 0:00:03.055709
+2025-01-16 15:49:21.746 | INFO | __main__:route_metrics:88 - route aggregation 2024-02-14: 0:00:02.700508
+2025-01-16 15:49:24.061 | INFO | __main__:route_metrics:88 - route aggregation 2024-03-13: 0:00:02.309990
+2025-01-16 15:49:26.168 | INFO | __main__:route_metrics:88 - route aggregation 2024-04-17: 0:00:02.101453
+2025-01-16 15:49:28.852 | INFO | __main__:route_metrics:88 - route aggregation 2024-05-22: 0:00:02.679262
+2025-01-16 15:49:31.068 | INFO | __main__:route_metrics:88 - route aggregation 2024-06-12: 0:00:02.211054
+2025-01-16 15:49:32.952 | INFO | __main__:route_metrics:88 - route aggregation 2024-07-17: 0:00:01.879407
+2025-01-16 15:49:34.738 | INFO | __main__:route_metrics:88 - route aggregation 2024-08-14: 0:00:01.781556
+2025-01-16 15:49:36.550 | INFO | __main__:route_metrics:88 - route aggregation 2024-09-18: 0:00:01.806138
+2025-01-16 15:49:38.451 | INFO | __main__:route_metrics:88 - route aggregation 2024-10-16: 0:00:01.896576
+2025-01-16 15:49:40.292 | INFO | __main__:route_metrics:88 - route aggregation 2024-11-13: 0:00:01.836643
+2025-01-16 15:49:41.992 | INFO | __main__:route_metrics:88 - route aggregation 2024-12-11: 0:00:01.695409
+2025-01-16 15:49:43.743 | INFO | __main__:route_metrics:88 - route aggregation 2023-03-15: 0:00:01.741539
+2025-01-16 15:49:45.597 | INFO | __main__:route_metrics:88 - route aggregation 2023-04-12: 0:00:01.849813
+2025-01-16 15:49:47.350 | INFO | __main__:route_metrics:88 - route aggregation 2023-05-17: 0:00:01.749005
+2025-01-16 15:49:49.083 | INFO | __main__:route_metrics:88 - route aggregation 2023-06-14: 0:00:01.725776
+2025-01-16 15:49:50.855 | INFO | __main__:route_metrics:88 - route aggregation 2023-07-12: 0:00:01.768527
+2025-01-16 15:49:52.712 | INFO | __main__:route_metrics:88 - route aggregation 2023-08-15: 0:00:01.851221
+2025-01-16 15:49:54.532 | INFO | __main__:route_metrics:88 - route aggregation 2023-09-13: 0:00:01.815201
+2025-01-16 15:49:56.361 | INFO | __main__:route_metrics:88 - route aggregation 2023-10-11: 0:00:01.825395
+2025-01-16 15:49:58.178 | INFO | __main__:route_metrics:88 - route aggregation 2023-11-15: 0:00:01.812722
+2025-01-16 15:50:00.055 | INFO | __main__:route_metrics:88 - route aggregation 2023-12-13: 0:00:01.873527
diff --git a/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py b/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py
index 176c0db68..621457dd2 100644
--- a/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py
+++ b/rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py
@@ -52,6 +52,9 @@ def route_metrics(
f"{RT_SCHED_GCS}{TRIP_EXPORT}_{analysis_date}.parquet"
)
+ # Temporarily fill NaN direction_id values in the trip export with 0 here
+ trip_df.direction_id = trip_df.direction_id.fillna(0)
+
crosswalk_cols = [
"schedule_gtfs_dataset_key",
"name",
diff --git a/rt_scheduled_v_ran/scripts/update_vars.py b/rt_scheduled_v_ran/scripts/update_vars.py
index 11f5f64ff..c000ab20a 100644
--- a/rt_scheduled_v_ran/scripts/update_vars.py
+++ b/rt_scheduled_v_ran/scripts/update_vars.py
@@ -7,8 +7,8 @@
oct2024_week = rt_dates.get_week("oct2024", exclude_wed=True)
-analysis_date_list = [rt_dates.DATES["dec2024"]]
-# analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates
+# analysis_date_list = [rt_dates.DATES["nov2024"]]
+analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
diff --git a/rt_segment_speeds/logs/avg_speeds.log b/rt_segment_speeds/logs/avg_speeds.log
index abffd9563..26dd87007 100644
--- a/rt_segment_speeds/logs/avg_speeds.log
+++ b/rt_segment_speeds/logs/avg_speeds.log
@@ -584,3 +584,97 @@
2024-12-18 15:06:31.961 | INFO | average_segment_speeds:segment_averages_detail:249 - speedmap_segments detailed segment averaging for ['2024-12-11'] execution time: 0:07:17.222047
2024-12-18 15:12:50.801 | INFO | average_segment_speeds:segment_averages:185 - speedmap_segments segment averaging for ['2024-12-11'] execution time: 0:06:18.665542
2024-12-18 15:17:58.646 | INFO | average_segment_speeds:segment_averages:185 - speedmap_segments segment averaging for ['2024-12-11'] execution time: 0:05:07.636255
+2025-01-15 14:36:17.067 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:18.845972
+2025-01-15 14:36:31.001 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-11-13'] execution time: 0:00:32.780041
+2025-01-16 16:04:20.713 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:16.934088
+2025-01-16 16:04:32.821 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-01-17'] execution time: 0:00:29.042487
+2025-01-16 16:04:46.545 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:13.629139
+2025-01-16 16:04:57.111 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-02-14'] execution time: 0:00:24.194859
+2025-01-16 16:05:11.281 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:14.052910
+2025-01-16 16:05:20.619 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-03-13'] execution time: 0:00:23.391635
+2025-01-16 16:05:34.034 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:13.328255
+2025-01-16 16:05:43.981 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-04-17'] execution time: 0:00:23.274450
+2025-01-16 16:05:56.572 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.484715
+2025-01-16 16:06:06.317 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-05-22'] execution time: 0:00:22.229829
+2025-01-16 16:06:17.425 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.002452
+2025-01-16 16:06:25.857 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-06-12'] execution time: 0:00:19.433568
+2025-01-16 16:06:38.063 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.129066
+2025-01-16 16:06:46.950 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-07-17'] execution time: 0:00:21.016595
+2025-01-16 16:06:59.293 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.254732
+2025-01-16 16:07:08.575 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-08-14'] execution time: 0:00:21.535979
+2025-01-16 16:07:21.188 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.529217
+2025-01-16 16:07:30.842 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-09-18'] execution time: 0:00:22.183636
+2025-01-16 16:07:43.077 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.145614
+2025-01-16 16:07:52.741 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-16'] execution time: 0:00:21.809547
+2025-01-16 16:08:06.064 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:13.208644
+2025-01-16 16:08:16.431 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-11-13'] execution time: 0:00:23.576042
+2025-01-16 16:08:29.763 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:13.232394
+2025-01-16 16:08:40.304 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-12-11'] execution time: 0:00:23.773742
+2025-01-16 16:08:52.689 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.282962
+2025-01-16 16:09:03.966 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-03-15'] execution time: 0:00:23.560709
+2025-01-16 16:09:16.346 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.293681
+2025-01-16 16:09:27.403 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-12'] execution time: 0:00:23.350808
+2025-01-16 16:09:39.044 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.520841
+2025-01-16 16:09:49.899 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-05-17'] execution time: 0:00:22.376414
+2025-01-16 16:10:01.160 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.178880
+2025-01-16 16:10:11.350 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-06-14'] execution time: 0:00:21.369181
+2025-01-16 16:10:23.073 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.634097
+2025-01-16 16:10:33.139 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-07-12'] execution time: 0:00:21.699396
+2025-01-16 16:10:45.143 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.916611
+2025-01-16 16:10:56.045 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-08-15'] execution time: 0:00:22.818622
+2025-01-16 16:11:07.561 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.420412
+2025-01-16 16:11:18.107 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-09-13'] execution time: 0:00:21.966925
+2025-01-16 16:11:30.453 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.253862
+2025-01-16 16:11:41.081 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-11'] execution time: 0:00:22.881764
+2025-01-16 16:11:53.690 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.516304
+2025-01-16 16:12:04.782 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-11-15'] execution time: 0:00:23.608095
+2025-01-16 16:12:17.336 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.461608
+2025-01-16 16:12:27.033 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-12-13'] execution time: 0:00:22.158792
+2025-01-16 16:12:38.972 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.817331
+2025-01-16 16:12:48.760 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-14'] execution time: 0:00:21.605632
+2025-01-16 16:13:00.707 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.855366
+2025-01-16 16:13:10.306 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-15'] execution time: 0:00:21.454589
+2025-01-16 16:13:22.517 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.118255
+2025-01-16 16:13:32.147 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-17'] execution time: 0:00:21.748474
+2025-01-16 16:13:44.222 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.973277
+2025-01-16 16:13:54.036 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-18'] execution time: 0:00:21.787478
+2025-01-16 16:14:03.231 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:09.096869
+2025-01-16 16:14:10.890 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-19'] execution time: 0:00:16.755838
+2025-01-16 16:14:19.475 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:08.520944
+2025-01-16 16:14:27.037 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-10-20'] execution time: 0:00:16.082530
+2025-01-16 16:22:23.209 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:15.830816
+2025-01-16 16:22:35.360 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-04-15'] execution time: 0:00:27.982591
+2025-01-16 16:22:49.048 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:13.587776
+2025-01-16 16:23:00.716 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-04-16'] execution time: 0:00:25.255428
+2025-01-16 16:23:14.773 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:13.934353
+2025-01-16 16:23:24.388 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-04-18'] execution time: 0:00:23.549288
+2025-01-16 16:23:36.612 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.119526
+2025-01-16 16:23:46.011 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-04-19'] execution time: 0:00:21.518818
+2025-01-16 16:23:54.762 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:08.666832
+2025-01-16 16:24:02.242 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-04-20'] execution time: 0:00:16.145971
+2025-01-16 16:24:11.687 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:09.381356
+2025-01-16 16:24:19.204 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2024-04-21'] execution time: 0:00:16.898038
+2025-01-16 16:24:31.946 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.679766
+2025-01-16 16:24:43.239 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-09'] execution time: 0:00:23.972403
+2025-01-16 16:24:57.759 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:14.395392
+2025-01-16 16:25:10.122 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-10'] execution time: 0:00:26.758428
+2025-01-16 16:25:22.503 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.292461
+2025-01-16 16:25:33.968 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-12'] execution time: 0:00:23.756984
+2025-01-16 16:25:46.305 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.247253
+2025-01-16 16:25:57.723 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-13'] execution time: 0:00:23.665107
+2025-01-16 16:26:08.382 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:10.573562
+2025-01-16 16:26:19.198 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-14'] execution time: 0:00:21.388992
+2025-01-16 16:26:27.808 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:08.541491
+2025-01-16 16:26:36.529 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-10-15'] execution time: 0:00:17.262397
+2025-01-16 16:26:48.305 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:11.712182
+2025-01-16 16:26:59.835 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-10'] execution time: 0:00:23.242696
+2025-01-16 16:27:12.344 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.418244
+2025-01-16 16:27:24.277 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-11'] execution time: 0:00:24.351473
+2025-01-16 16:27:36.863 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:12.501083
+2025-01-16 16:27:48.867 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-13'] execution time: 0:00:24.505543
+2025-01-16 16:28:02.931 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:13.978393
+2025-01-16 16:28:14.047 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-14'] execution time: 0:00:25.093847
+2025-01-16 16:28:22.438 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:08.302166
+2025-01-16 16:28:31.739 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-15'] execution time: 0:00:17.603240
+2025-01-16 16:28:40.309 | INFO | __main__:summary_average_speeds:120 - trip avg 0:00:08.508140
+2025-01-16 16:28:49.151 | INFO | __main__:summary_average_speeds:154 - rt_stop_times summary speed averaging for ['2023-04-16'] execution time: 0:00:17.350031
diff --git a/rt_segment_speeds/scripts/average_segment_speeds.py b/rt_segment_speeds/scripts/average_segment_speeds.py
index 0a9fea9e0..c0f89426d 100644
--- a/rt_segment_speeds/scripts/average_segment_speeds.py
+++ b/rt_segment_speeds/scripts/average_segment_speeds.py
@@ -72,12 +72,20 @@ def concatenate_trip_segment_speeds(
).pipe(
gtfs_schedule_wrangling.add_peak_offpeak_column
)
+ """
+ Amanda: There's already a `service_date` column.
df = df.rename(columns={'arrival_time':'service_date'}
).pipe(
gtfs_schedule_wrangling.add_weekday_weekend_column
) # drop service_date?
print("concatenated files")
+ """
+ df = df.pipe(
+ gtfs_schedule_wrangling.add_weekday_weekend_column
+ ) # drop service_date?
+ df.direction_id = df.direction_id.fillna(0)
+ print("concatenated files")
return df
@@ -96,6 +104,9 @@ def merge_in_segment_geometry(
f"{SEGMENT_GCS}{SEGMENT_FILE}_{analysis_date}.parquet",
).to_crs(WGS84)
+ # Amanda: go back to the script that creates segment_geom to fill in nans
+ segment_geom.direction_id = segment_geom.direction_id.fillna(0)
+
col_order = [c for c in speeds_by_segment.columns]
# The merge columns list should be all the columns that are in common
@@ -134,6 +145,9 @@ def segment_averages(
get_pandas = False
)
+ # Amanda, temporarily filling in direction id here
+ df.direction_id = df.direction_id.fillna(0)
+
if weighted_averages:
avg_speeds = delayed(metrics.concatenate_peak_offpeak_allday_averages)(
df,
diff --git a/rt_segment_speeds/scripts/average_summary_speeds.py b/rt_segment_speeds/scripts/average_summary_speeds.py
index d3c3ad25e..a7d389b82 100644
--- a/rt_segment_speeds/scripts/average_summary_speeds.py
+++ b/rt_segment_speeds/scripts/average_summary_speeds.py
@@ -149,7 +149,6 @@ def summary_average_speeds(
f"{export_file}_{time_span_str}"
)
-
end = datetime.datetime.now()
logger.info(
@@ -177,7 +176,6 @@ def summary_average_speeds(
ROUTE_DIR_COLS = [*dict_inputs["route_dir_cols"]]
ROUTE_DIR_FILE = dict_inputs["route_dir_single_summary"]
-
for analysis_date in analysis_date_list:
summary_average_speeds(
@@ -186,7 +184,6 @@ def summary_average_speeds(
group_cols = OPERATOR_COLS + ROUTE_DIR_COLS,
export_file = ROUTE_DIR_FILE
)
-
'''
from segment_speed_utils.project_vars import weeks_available
diff --git a/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py b/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py
index 2e1316a5e..0d61fa7aa 100644
--- a/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py
+++ b/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py
@@ -127,20 +127,21 @@ def add_weekday_weekend_column(df: pd.DataFrame, category_dict: dict = time_help
)
return df
-
+
def count_trips_by_group(df: pd.DataFrame, group_cols: list):
"""
- Given a df with trip_instance_key and an arbitrary list of
+ Given a df with trip_instance_key and an arbitrary list of
group_cols, return trip counts by group.
"""
assert "trip_instance_key" in df.columns
- df = (df.groupby(group_cols)
- .agg({"trip_instance_key": "count"})
- .reset_index()
- )
- df = df.rename(columns = {"trip_instance_key": "n_trips"})
+ df = (
+ df.groupby(group_cols, dropna=False)
+ .agg({"trip_instance_key": "count"})
+ .reset_index()
+ )
+ df = df.rename(columns={"trip_instance_key": "n_trips"})
return df
-
+
def aggregate_time_of_day_to_peak_offpeak(
df: pd.DataFrame,
group_cols: list,
@@ -388,7 +389,7 @@ def most_common_shape_by_route_direction(analysis_date: str) -> gpd.GeoDataFrame
most_common_shape = (
trips.groupby(route_dir_cols + ["shape_id", "shape_array_key"],
- observed=True, group_keys = False)
+ observed=True, group_keys = False, dropna= False)
.agg({"trip_instance_key": "count"})
.reset_index()
.sort_values(route_dir_cols + ["trip_instance_key"],
@@ -429,6 +430,8 @@ def most_common_shape_by_route_direction(analysis_date: str) -> gpd.GeoDataFrame
on = ["schedule_gtfs_dataset_key", "route_id"]
)
+ # Amanda: test
+ common_shape_geom2.direction_id = common_shape_geom2.direction_id.fillna(0)
return common_shape_geom2
diff --git a/rt_segment_speeds/segment_speed_utils/metrics.py b/rt_segment_speeds/segment_speed_utils/metrics.py
index 1b3ac554c..8abe8d100 100644
--- a/rt_segment_speeds/segment_speed_utils/metrics.py
+++ b/rt_segment_speeds/segment_speed_utils/metrics.py
@@ -9,9 +9,8 @@
from segment_speed_utils import segment_calcs
def weighted_average_speeds_across_segments(
- df: pd.DataFrame,
- group_cols: list
-) -> pd.DataFrame:
+ df: pd.DataFrame, group_cols: list
+) -> pd.DataFrame:
"""
We can use our segments and the deltas within a trip
to calculate the trip-level average speed, or
@@ -19,15 +18,16 @@ def weighted_average_speeds_across_segments(
But, we want a weighted average, using the raw deltas
instead of mean(speed_mph), since segments can be varying lengths.
"""
- avg_speeds = (df.groupby(group_cols,
- observed=True, group_keys=False)
- .agg({
- "meters_elapsed": "sum",
- "sec_elapsed": "sum",
- }).reset_index()
- ).pipe(
- segment_calcs.speed_from_meters_elapsed_sec_elapsed
- )
+ avg_speeds = (
+ df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
+ .agg(
+ {
+ "meters_elapsed": "sum",
+ "sec_elapsed": "sum",
+ }
+ )
+ .reset_index()
+ ).pipe(segment_calcs.speed_from_meters_elapsed_sec_elapsed)
return avg_speeds
@@ -112,10 +112,10 @@ def derive_trip_comparison_metrics(
def calculate_weighted_average_vp_schedule_metrics(
- df: pd.DataFrame,
+ df: pd.DataFrame,
group_cols: list,
) -> pd.DataFrame:
-
+
sum_cols = [
"minutes_atleast1_vp",
"minutes_atleast2_vp",
@@ -123,21 +123,20 @@ def calculate_weighted_average_vp_schedule_metrics(
"scheduled_service_minutes",
"total_vp",
"vp_in_shape",
- "is_early", "is_ontime", "is_late"
+ "is_early",
+ "is_ontime",
+ "is_late",
]
count_cols = ["trip_instance_key"]
-
+
df2 = (
- df.groupby(group_cols,
- observed=True, group_keys=False)
- .agg({
- **{e: "sum" for e in sum_cols},
- **{e: "count" for e in count_cols}}
- ).reset_index()
- .rename(columns = {"trip_instance_key": "n_vp_trips"})
+ df.groupby(group_cols, observed=True, group_keys=False, dropna=False)
+ .agg({**{e: "sum" for e in sum_cols}, **{e: "count" for e in count_cols}})
+ .reset_index()
+ .rename(columns={"trip_instance_key": "n_vp_trips"})
)
-
+
return df2
diff --git a/rt_segment_speeds/segment_speed_utils/project_vars.py b/rt_segment_speeds/segment_speed_utils/project_vars.py
index 6d28427eb..cae8f5759 100644
--- a/rt_segment_speeds/segment_speed_utils/project_vars.py
+++ b/rt_segment_speeds/segment_speed_utils/project_vars.py
@@ -11,13 +11,14 @@
SHARED_GCS = GTFS_DATA_DICT.gcs_paths.SHARED_GCS
PUBLIC_GCS = GTFS_DATA_DICT.gcs_paths.PUBLIC_GCS
-analysis_date = rt_dates.DATES["dec2024"]
+# analysis_date = rt_dates.DATES["nov2024"]
oct2023_week = rt_dates.get_week("oct2023", exclude_wed=True)
apr2023_week = rt_dates.get_week("apr2023", exclude_wed=True)
apr2024_week = rt_dates.get_week("apr2024", exclude_wed=True)
oct2024_week = rt_dates.get_week("oct2024", exclude_wed=True)
+# One file wasn't found for October 21 2024
all_dates = (
rt_dates.y2024_dates + rt_dates.y2023_dates +
oct2024_week + apr2024_week + oct2023_week + apr2023_week
@@ -28,8 +29,7 @@
rt_dates.oct2023_week, rt_dates.apr2023_week,
]
-
-analysis_date_list = [analysis_date]
+analysis_date_list = apr2024_week + oct2023_week + apr2023_week
PROJECT_CRS = "EPSG:3310"
diff --git a/rt_segment_speeds/segment_speed_utils/segment_calcs.py b/rt_segment_speeds/segment_speed_utils/segment_calcs.py
index 4fb21c5b8..bd15cbd5c 100644
--- a/rt_segment_speeds/segment_speed_utils/segment_calcs.py
+++ b/rt_segment_speeds/segment_speed_utils/segment_calcs.py
@@ -68,11 +68,10 @@ def calculate_avg_speeds(
# pd.groupby and pd.quantile is so slow
# create our own list of speeds and use np
df2 = (df.groupby(group_cols,
- observed=True, group_keys=False)
+ observed=True, group_keys=False, dropna=False)
.agg({"speed_mph": lambda x: sorted(list(x))})
.reset_index()
- .rename(columns = {"speed_mph": "speed_mph_list"})
- )
+ .rename(columns = {"speed_mph": "speed_mph_list"}))
df2 = df2.assign(
p50_mph = df2.apply(lambda x: np.percentile(x.speed_mph_list, q=50), axis=1),