diff --git a/rt_segment_speeds/18_speed_distribution.ipynb b/rt_segment_speeds/18_speed_distribution.ipynb
new file mode 100644
index 000000000..61a84e8d5
--- /dev/null
+++ b/rt_segment_speeds/18_speed_distribution.ipynb
@@ -0,0 +1,1003 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "1266be23-2c40-438d-90c0-eef6dcf3c621",
+ "metadata": {},
+ "source": [
+ "# Distribution of speeds\n",
+ "\n",
+ "Spot-checking Big Blue Bus speeds of existing speed maps and these `p20_mph` speeds.\n",
+ "\n",
+ "The average-speed calculation only throws away speeds that are too high (above 70 mph); it does not exclude anything that is too low.\n",
+ "\n",
+ "It looks like we're keeping way too many observations going into the averages, and we need to be more aggressive in excluding unstable speed calculations (speeds that are derived over too short a distance or too short a time). Too short a time means our denominator is approaching zero, and the calculations could be wildly unstable as we approach the asymptote. The same is probably happening for distances that are too short. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "776b8208-ad41-4c32-a064-99ab87fb371a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.environ['USE_PYGEOS']='0'\n",
+ "\n",
+ "import altair as alt\n",
+ "import dask.dataframe as dd\n",
+ "import geopandas as gpd\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "from shared_utils import rt_dates, rt_utils\n",
+ "from segment_speed_utils.project_vars import SEGMENT_GCS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "e764cc9e-7113-4672-8943-dc4827b80ea2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'2023-07-12'"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "months = [\"mar\", \"apr\", \"may\", \"jun\", \"jul\"]\n",
+ "\n",
+ "dates = [\n",
+ " rt_dates.DATES[f\"{m}2023\"] for m in months\n",
+ "]\n",
+ "analysis_date = dates[-1]\n",
+ "analysis_date"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "43f5ec34-f301-43cc-898a-ed2e31a1778d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_operator = \"Big Blue Bus VehiclePositions\"\n",
+ "test_org = \"City of Santa Monica\"\n",
+ "\n",
+ "pub_df = pd.read_parquet(\n",
+ " f\"{SEGMENT_GCS}export/avg_speeds_stop_segments_{analysis_date}_tabular.parquet\", \n",
+ " filters = [[(\"agency\", \"==\", test_org)]]\n",
+ ")\n",
+ "\n",
+ "test_operator_key = pub_df.gtfs_dataset_key.iloc[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "a728d1c4-1777-4706-8df6-81670730a2b7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def import_avg_speeds(date: str, **kwargs) -> gpd.GeoDataFrame:\n",
+ "    \"\"\"\n",
+ "    Read the average speeds by stop segment GeoDataFrame for one date.\n",
+ "\n",
+ "    Extra keyword arguments (e.g. filters, columns) are forwarded\n",
+ "    to gpd.read_parquet.\n",
+ "    \"\"\"\n",
+ "    path = f\"{SEGMENT_GCS}avg_speeds_stop_segments_{date}.parquet\"\n",
+ "\n",
+ "    return gpd.read_parquet(path, **kwargs)\n",
+ "\n",
+ "\n",
+ "def import_trip_speeds(date: str, **kwargs) -> pd.DataFrame:\n",
+ "    \"\"\"\n",
+ "    Read trip-level speeds by stop segment for one date.\n",
+ "\n",
+ "    The file is read with dask and then computed, so the result is\n",
+ "    held in memory. Extra keyword arguments (e.g. filters) are\n",
+ "    forwarded to dd.read_parquet.\n",
+ "    \"\"\"\n",
+ "    # Build the path from the `date` argument rather than the\n",
+ "    # notebook-level `analysis_date`, so the caller gets the date requested.\n",
+ "    trips = dd.read_parquet(\n",
+ "        f\"{SEGMENT_GCS}speeds_stop_segments_{date}\",\n",
+ "        **kwargs\n",
+ "    ).compute()\n",
+ "\n",
+ "    return trips"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "bfcd3915-8f86-483f-a4ee-7b17b09c8f9b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['shape_array_key', 'stop_sequence', 'gtfs_dataset_key', 'stop_id',\n",
+ " 'loop_or_inlining', 'district', 'district_name', 'p50_mph', 'n_trips',\n",
+ " 'p20_mph', 'p80_mph', 'time_of_day', 'shape_id', 'base64_url', 'uri',\n",
+ " 'org_id', 'agency'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pub_df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "89f1bbad-a050-4ed9-a96e-45180a8aced8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['de70089f186a809de6685c056377f892'], dtype=object)"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pub_df[pub_df.shape_id==\"26347\"].shape_array_key.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "bc23981e-6ca9-4339-8064-f01544b873e9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['080f585295228f8c8f52cb373b1685cc'], dtype=object)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pub_df[pub_df.shape_id==\"26348\"].shape_array_key.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "728161ad-de9b-43ec-a2ef-9d6fa981976d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['5d34851ee46adb62216152f8a16fe7d0'], dtype=object)"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pub_df[pub_df.shape_id==\"26342\"].shape_array_key.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "9aa0f04d-24a0-4786-892d-ec6723838bfc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_shape1 = \"de70089f186a809de6685c056377f892\"\n",
+ "test_shape2 = \"080f585295228f8c8f52cb373b1685cc\"\n",
+ "test_shape3 = \"5d34851ee46adb62216152f8a16fe7d0\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "5eb43efb-53ac-41cc-a7cd-21cdcd6f11d5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "avg_speeds = import_avg_speeds(\n",
+ " analysis_date, \n",
+ " filters = [[(\"gtfs_dataset_key\", \"==\", test_operator_key)]],\n",
+ " columns = [\"shape_array_key\", \"stop_sequence\",\n",
+ " \"p20_mph\", \"p50_mph\", \"p80_mph\", \n",
+ " \"n_trips\", \"time_of_day\",\n",
+ " \"geometry\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "7a6feb4c-c273-43f5-9d3d-361d83745f7c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trip_speeds = import_trip_speeds(\n",
+ " analysis_date,\n",
+ " filters = [[(\"gtfs_dataset_key\", \"==\", test_operator_key)]],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "8f3dd9fb-93bd-4d8b-bc22-5cfb6ccd53b8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " shape_array_key | \n",
+ " stop_sequence | \n",
+ " p20_mph | \n",
+ " p50_mph | \n",
+ " p80_mph | \n",
+ " n_trips | \n",
+ " time_of_day | \n",
+ " geometry | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 201857 | \n",
+ " de70089f186a809de6685c056377f892 | \n",
+ " 29 | \n",
+ " 1.68 | \n",
+ " 3.77 | \n",
+ " 9.89 | \n",
+ " 3 | \n",
+ " all_day | \n",
+ " LINESTRING (-118.48871 34.02165, -118.48951 34... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " shape_array_key stop_sequence p20_mph p50_mph \\\n",
+ "201857 de70089f186a809de6685c056377f892 29 1.68 3.77 \n",
+ "\n",
+ " p80_mph n_trips time_of_day \\\n",
+ "201857 9.89 3 all_day \n",
+ "\n",
+ " geometry \n",
+ "201857 LINESTRING (-118.48871 34.02165, -118.48951 34... "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "avg_speeds[(avg_speeds.shape_array_key==test_shape1) & \n",
+ " (avg_speeds.stop_sequence==29)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "78826c8e-865e-4e8c-9e23-ee9c599d3867",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([ nan, 3.76987431, 13.96246222, 0.29230316])"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trip_speeds[(trip_speeds.shape_array_key==test_shape1) & \n",
+ " (trip_speeds.stop_sequence==29)].speed_mph.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "b7afc829-0833-426b-9b63-c47f3de70ef8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Make this Notebook Trusted to load map: File -> Trust Notebook
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "avg_speeds[avg_speeds.shape_array_key==test_shape1].explore(\n",
+ " \"p50_mph\", \n",
+ " tiles = \"CartoDB Positron\",\n",
+ " cmap = rt_utils.ZERO_THIRTY_COLORSCALE\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "560c3590-c919-4f79-8bbe-8bc87a5b3103",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Make this Notebook Trusted to load map: File -> Trust Notebook
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "avg_speeds[avg_speeds.shape_array_key==test_shape3].explore(\n",
+ " \"p50_mph\", \n",
+ " tiles = \"CartoDB Positron\",\n",
+ " cmap = rt_utils.ZERO_THIRTY_COLORSCALE\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "2609f100-8eee-400b-a63d-5de271355905",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([ nan, 11.85504701, 8.30953561, 8.11102264, 3.30187334,\n",
+ " 13.55831359, 3.12661394, 0. , 12.39887132, 9.04728661,\n",
+ " 12.95855883, 11.64274665, 13.80783397, 8.7487871 , 8.11962361,\n",
+ " 1.13056783, 8.76647662, 1.91638667, 12.15852066, 9.74048341,\n",
+ " 6.32444451, 10.52354078, 12.24562967, 6.69858223, 4.280623 ,\n",
+ " 13.21839885])"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trip_speeds[(trip_speeds.shape_array_key==test_shape3) & \n",
+ " (trip_speeds.stop_sequence==6)].speed_mph.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "f8a5fe95-979b-4ba7-9cbe-d1c22ccdedf0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([ 0. , 1.13056783, 1.91638667, 3.12661394, 3.30187334,\n",
+ " 4.280623 , 6.32444451, 6.69858223, 8.11102264, 8.11962361,\n",
+ " 8.30953561, 8.7487871 , 8.76647662, 9.04728661, 9.74048341,\n",
+ " 10.52354078, 11.64274665, 11.85504701, 12.15852066, 12.24562967,\n",
+ " 12.39887132, 12.95855883, 13.21839885, 13.55831359, 13.80783397,\n",
+ " nan])"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "one_segment = trip_speeds[\n",
+ " (trip_speeds.shape_array_key==test_shape3) & \n",
+ " (trip_speeds.stop_sequence==6)]\n",
+ "\n",
+ "np.sort(one_segment.speed_mph.unique())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "0897e604-344c-4700-8edf-4221953f1c12",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([ 1.13056783, 1.91638667, 3.12661394, 3.30187334, 4.280623 ,\n",
+ " 6.32444451, 6.69858223, 8.11102264, 8.11962361, 8.30953561,\n",
+ " 8.7487871 , 8.76647662, 9.04728661, 9.74048341, 10.52354078,\n",
+ " 11.64274665, 11.85504701, 12.15852066, 12.24562967, 12.39887132,\n",
+ " 12.95855883, 13.21839885, 13.55831359, 13.80783397])"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "one_segment_filtered = one_segment[(one_segment.sec_elapsed > 0) & \n",
+ " (one_segment.meters_elapsed > 0) & \n",
+ " (one_segment.speed_mph.notna())\n",
+ " ]\n",
+ "\n",
+ "np.sort(one_segment_filtered.speed_mph.unique())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "6989271c-05b7-47bf-bf94-ef262c7ddc4f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 53.000000\n",
+ "mean 102.879312\n",
+ "std 121.899773\n",
+ "min 0.000000\n",
+ "25% 0.000000\n",
+ "50% 0.000000\n",
+ "75% 254.725516\n",
+ "max 271.589045\n",
+ "Name: meters_elapsed, dtype: float64"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "one_segment.meters_elapsed.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "601277ea-f897-42e4-824b-15aed1a74cc2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_1462/2851869297.py:4: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ "/tmp/ipykernel_1462/2851869297.py:7: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
+ ]
+ }
+ ],
+ "source": [
+ "bins = [0, 5, 10, 15, 20, 25, 30, 35, 40]\n",
+ "meter_bins = [0, 50, 100, 150, 200, 250, 300]\n",
+ "\n",
+ "# Label each bin by its left edge. include_lowest=True keeps values of\n",
+ "# exactly 0 in the first bin instead of turning them into NaN.\n",
+ "# assign() returns a new frame, which avoids the SettingWithCopyWarning\n",
+ "# raised when setting columns on one_segment (a filtered slice of trip_speeds).\n",
+ "one_segment = one_segment.assign(\n",
+ "    speed_binned = pd.cut(\n",
+ "        one_segment.speed_mph, bins,\n",
+ "        labels = bins[:-1], include_lowest = True),\n",
+ "    meters_binned = pd.cut(\n",
+ "        one_segment.meters_elapsed, meter_bins,\n",
+ "        labels = meter_bins[:-1], include_lowest = True),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "01238681-f672-41d2-bd4b-ec161cf74bd8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(alt.Chart(one_segment)\n",
+ " .mark_tick()\n",
+ " .encode(\n",
+ " x=\"meters_binned:O\",\n",
+ " y=\"speed_mph:Q\"\n",
+ " ).interactive()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "ca2a05aa-f398-4ec9-af86-29e3acd4777d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trip_id | \n",
+ " meters_elapsed | \n",
+ " sec_elapsed | \n",
+ " speed_mph | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 5953 | \n",
+ " 903007 | \n",
+ " 22.237365 | \n",
+ " 44.0 | \n",
+ " 1.130568 | \n",
+ "
\n",
+ " \n",
+ " 5955 | \n",
+ " 902944 | \n",
+ " 268.139932 | \n",
+ " 313.0 | \n",
+ " 1.916387 | \n",
+ "
\n",
+ " \n",
+ " 5936 | \n",
+ " 902985 | \n",
+ " 271.150248 | \n",
+ " 194.0 | \n",
+ " 3.126614 | \n",
+ "
\n",
+ " \n",
+ " 5932 | \n",
+ " 902987 | \n",
+ " 271.589045 | \n",
+ " 184.0 | \n",
+ " 3.301873 | \n",
+ "
\n",
+ " \n",
+ " 5971 | \n",
+ " 903002 | \n",
+ " 86.109984 | \n",
+ " 45.0 | \n",
+ " 4.280623 | \n",
+ "
\n",
+ " \n",
+ " 5963 | \n",
+ " 902978 | \n",
+ " 271.411119 | \n",
+ " 96.0 | \n",
+ " 6.324445 | \n",
+ "
\n",
+ " \n",
+ " 5968 | \n",
+ " 902943 | \n",
+ " 266.505954 | \n",
+ " 89.0 | \n",
+ " 6.698582 | \n",
+ "
\n",
+ " \n",
+ " 5930 | \n",
+ " 902971 | \n",
+ " 177.666566 | \n",
+ " 49.0 | \n",
+ " 8.111023 | \n",
+ "
\n",
+ " \n",
+ " 5952 | \n",
+ " 902997 | \n",
+ " 192.373738 | \n",
+ " 53.0 | \n",
+ " 8.119624 | \n",
+ "
\n",
+ " \n",
+ " 5928 | \n",
+ " 902960 | \n",
+ " 167.156505 | \n",
+ " 45.0 | \n",
+ " 8.309536 | \n",
+ "
\n",
+ " \n",
+ " 5947 | \n",
+ " 902974 | \n",
+ " 195.547320 | \n",
+ " 50.0 | \n",
+ " 8.748787 | \n",
+ "
\n",
+ " \n",
+ " 5954 | \n",
+ " 903005 | \n",
+ " 254.725516 | \n",
+ " 65.0 | \n",
+ " 8.766477 | \n",
+ "
\n",
+ " \n",
+ " 5940 | \n",
+ " 902966 | \n",
+ " 181.997272 | \n",
+ " 45.0 | \n",
+ " 9.047287 | \n",
+ "
\n",
+ " \n",
+ " 5959 | \n",
+ " 902968 | \n",
+ " 235.130131 | \n",
+ " 54.0 | \n",
+ " 9.740483 | \n",
+ "
\n",
+ " \n",
+ " 5965 | \n",
+ " 902956 | \n",
+ " 216.398246 | \n",
+ " 46.0 | \n",
+ " 10.523541 | \n",
+ "
\n",
+ " \n",
+ " 5945 | \n",
+ " 902959 | \n",
+ " 260.231262 | \n",
+ " 50.0 | \n",
+ " 11.642747 | \n",
+ "
\n",
+ " \n",
+ " 5927 | \n",
+ " 902941 | \n",
+ " 264.976464 | \n",
+ " 50.0 | \n",
+ " 11.855047 | \n",
+ "
\n",
+ " \n",
+ " 5956 | \n",
+ " 902980 | \n",
+ " 244.583563 | \n",
+ " 45.0 | \n",
+ " 12.158521 | \n",
+ "
\n",
+ " \n",
+ " 5967 | \n",
+ " 902961 | \n",
+ " 268.232389 | \n",
+ " 49.0 | \n",
+ " 12.245630 | \n",
+ "
\n",
+ " \n",
+ " 5938 | \n",
+ " 902995 | \n",
+ " 271.589045 | \n",
+ " 49.0 | \n",
+ " 12.398871 | \n",
+ "
\n",
+ " \n",
+ " 5942 | \n",
+ " 902950 | \n",
+ " 260.677312 | \n",
+ " 45.0 | \n",
+ " 12.958559 | \n",
+ "
\n",
+ " \n",
+ " 5972 | \n",
+ " 902965 | \n",
+ " 265.904313 | \n",
+ " 45.0 | \n",
+ " 13.218399 | \n",
+ "
\n",
+ " \n",
+ " 5935 | \n",
+ " 902979 | \n",
+ " 266.681179 | \n",
+ " 44.0 | \n",
+ " 13.558314 | \n",
+ "
\n",
+ " \n",
+ " 5946 | \n",
+ " 902972 | \n",
+ " 271.589045 | \n",
+ " 44.0 | \n",
+ " 13.807834 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_id meters_elapsed sec_elapsed speed_mph\n",
+ "5953 903007 22.237365 44.0 1.130568\n",
+ "5955 902944 268.139932 313.0 1.916387\n",
+ "5936 902985 271.150248 194.0 3.126614\n",
+ "5932 902987 271.589045 184.0 3.301873\n",
+ "5971 903002 86.109984 45.0 4.280623\n",
+ "5963 902978 271.411119 96.0 6.324445\n",
+ "5968 902943 266.505954 89.0 6.698582\n",
+ "5930 902971 177.666566 49.0 8.111023\n",
+ "5952 902997 192.373738 53.0 8.119624\n",
+ "5928 902960 167.156505 45.0 8.309536\n",
+ "5947 902974 195.547320 50.0 8.748787\n",
+ "5954 903005 254.725516 65.0 8.766477\n",
+ "5940 902966 181.997272 45.0 9.047287\n",
+ "5959 902968 235.130131 54.0 9.740483\n",
+ "5965 902956 216.398246 46.0 10.523541\n",
+ "5945 902959 260.231262 50.0 11.642747\n",
+ "5927 902941 264.976464 50.0 11.855047\n",
+ "5956 902980 244.583563 45.0 12.158521\n",
+ "5967 902961 268.232389 49.0 12.245630\n",
+ "5938 902995 271.589045 49.0 12.398871\n",
+ "5942 902950 260.677312 45.0 12.958559\n",
+ "5972 902965 265.904313 45.0 13.218399\n",
+ "5935 902979 266.681179 44.0 13.558314\n",
+ "5946 902972 271.589045 44.0 13.807834"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "one_segment_filtered.sort_values(\"speed_mph\")[\n",
+ " [\"trip_id\", \"meters_elapsed\", \"sec_elapsed\", \"speed_mph\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "502c330e-52bc-409c-8a92-9f179f3d6852",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep every observation (not just the unique values) so repeated speeds\n",
+ "# carry their full weight in the quantiles computed from this array.\n",
+ "speed_distribution = np.sort(one_segment_filtered.speed_mph.to_numpy())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "1274bfc2-0c42-4ef3-939f-d5f93afaed13",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.906881612224176"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.quantile(speed_distribution, 0.5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "3b031a5d-2813-44dc-80d6-11548142d61c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "5.506915906639923"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.quantile(speed_distribution, 0.2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "4ff51893-12cd-450b-810e-a34c3a16ecda",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "operator_segments = gpd.read_parquet(\n",
+ " f\"{SEGMENT_GCS}stop_segments_{analysis_date}.parquet\",\n",
+ " filters = [[(\"gtfs_dataset_key\", \"==\", test_operator_key)]]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "d981a2fc-53f3-4989-a8f0-cc3e4729609b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "operator_segments = operator_segments.assign(\n",
+ " segment_meters = operator_segments.geometry.length\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "54dac8b1-c5ac-47e6-9910-2c864d3e46f9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 2211.000000\n",
+ "mean 431.703997\n",
+ "std 635.401191\n",
+ "min 0.000000\n",
+ "25% 265.627183\n",
+ "50% 343.305085\n",
+ "75% 442.045878\n",
+ "max 17762.088740\n",
+ "Name: segment_meters, dtype: float64"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "operator_segments.segment_meters.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a6c8c131-31c5-4f17-b82a-2978a941f9d2",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/rt_segment_speeds/ca_segment_speeds.ipynb b/rt_segment_speeds/ca_segment_speeds.ipynb
deleted file mode 100644
index b2256516c..000000000
--- a/rt_segment_speeds/ca_segment_speeds.ipynb
+++ /dev/null
@@ -1,141 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "9f04613d-8fa7-44d1-8827-3c55e2351ab0",
- "metadata": {},
- "outputs": [],
- "source": [
- "%%capture\n",
- "import warnings\n",
- "warnings.filterwarnings(\"ignore\")\n",
- "\n",
- "import branca\n",
- "import calitp.magics\n",
- "import geopandas as gpd\n",
- "import pandas as pd\n",
- "\n",
- "from shared_utils import portfolio_utils\n",
- "from update_vars import SEGMENT_GCS, analysis_date\n",
- "\n",
- "ZERO_FIFTY_COLORSCALE = branca.colormap.step.RdYlGn_10.scale(\n",
- " vmin=0, \n",
- " vmax=50\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "4e37258d-b752-467f-948c-6a2d5391e8b0",
- "metadata": {},
- "outputs": [
- {
- "ename": "NameError",
- "evalue": "name 'district' is not defined",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[2], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Read in data\u001b[39;00m\n\u001b[1;32m 2\u001b[0m gdf \u001b[38;5;241m=\u001b[39m gpd\u001b[38;5;241m.\u001b[39mread_parquet(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mSEGMENT_GCS\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124mavg_speeds_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00manalysis_date\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.parquet\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m gdf \u001b[38;5;241m=\u001b[39m gdf[gdf\u001b[38;5;241m.\u001b[39mdistrict\u001b[38;5;241m==\u001b[39m\u001b[43mdistrict\u001b[49m]\u001b[38;5;241m.\u001b[39mreset_index(drop\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# Use dict to key into district name because D9 is missing data\u001b[39;00m\n\u001b[1;32m 6\u001b[0m district_name \u001b[38;5;241m=\u001b[39m portfolio_utils\u001b[38;5;241m.\u001b[39mdistrict_name_dict[district]\n",
- "\u001b[0;31mNameError\u001b[0m: name 'district' is not defined"
- ]
- }
- ],
- "source": [
- "# Read in data\n",
- "gdf = gpd.read_parquet(f\"{SEGMENT_GCS}avg_speeds_{analysis_date}.parquet\")\n",
- "gdf = gdf[gdf.district==district].reset_index(drop=True)\n",
- "\n",
- "# Use dict to key into district name because D9 is missing data\n",
- "district_name = portfolio_utils.district_name_dict[district]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7a222bbd-4fb0-43f4-b189-63edc35ce1f8",
- "metadata": {},
- "outputs": [],
- "source": [
- "%%capture_parameters\n",
- "district, district_name"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0dda7c36-29f0-457f-9932-f2277d2c8ea1",
- "metadata": {},
- "source": [
- "# {district_name}\n",
- "\n",
- "* Goal: statewide map for daily segment speeds\n",
- "* v2 warehouse (01/18/23)\n",
- "* calculate time-of-day averages"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0ae01433-865d-4820-94fe-a62a6c4c6376",
- "metadata": {},
- "outputs": [],
- "source": [
- "#for s in range(30, 75, 5):\n",
- "# print(f\"# rows with over {s} mph: {len(gdf[gdf.speed_mph > s])}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "113b8300-6992-497f-977e-5bdac9650d15",
- "metadata": {},
- "outputs": [],
- "source": [
- "def make_map(gdf: gpd.GeoDataFrame, district: int):\n",
- " if len(gdf) > 0:\n",
- " m = gdf.explore(\n",
- " \"speed_mph\",\n",
- " tiles = \"CartoDB Positron\",\n",
- " cmap = ZERO_FIFTY_COLORSCALE\n",
- " )\n",
- "\n",
- " display(m)\n",
- " else:\n",
- " print(f\"No RT trip info available in district {district}.\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e92ff10b-ff10-4900-b503-e64848ee8f54",
- "metadata": {},
- "outputs": [],
- "source": [
- "make_map(gdf, district)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/rt_segment_speeds/logs/cut_stop_segments.log b/rt_segment_speeds/logs/cut_stop_segments.log
index df7e8ea74..2e3075997 100644
--- a/rt_segment_speeds/logs/cut_stop_segments.log
+++ b/rt_segment_speeds/logs/cut_stop_segments.log
@@ -37,3 +37,27 @@
2023-07-25 15:31:25.425 | INFO | __main__::313 - Cut special stop segments: 0:28:46.570691
2023-07-25 15:31:26.518 | INFO | __main__::332 - export results: 0:00:01.093220
2023-07-25 15:31:26.520 | INFO | __main__::333 - execution time: 0:28:47.663911
+2023-07-25 16:07:13.609 | INFO | __main__::198 - Analysis date: 2023-07-12
+2023-07-25 16:08:07.369 | INFO | __main__::240 - Cut normal stop segments: 0:00:53.759088
+2023-07-25 16:19:44.770 | INFO | __main__::252 - Export results: 0:11:37.401262
+2023-07-25 16:19:44.770 | INFO | __main__::255 - execution time: 0:12:31.160955
+2023-07-25 16:20:26.164 | INFO | __main__::293 - Analysis date: 2023-07-12
+2023-07-25 16:48:16.067 | INFO | __main__::313 - Cut special stop segments: 0:27:49.863811
+2023-07-25 16:48:17.367 | INFO | __main__::332 - export results: 0:00:01.299823
+2023-07-25 16:48:17.369 | INFO | __main__::333 - execution time: 0:27:51.163634
+2023-07-26 12:20:36.330 | INFO | __main__::198 - Analysis date: 2023-04-12
+2023-07-26 12:21:45.345 | INFO | __main__::240 - Cut normal stop segments: 0:01:08.999538
+2023-07-26 12:38:38.885 | INFO | __main__::252 - Export results: 0:16:53.539426
+2023-07-26 12:38:38.887 | INFO | __main__::255 - execution time: 0:18:02.541602
+2023-07-26 12:39:04.242 | INFO | __main__::293 - Analysis date: 2023-04-12
+2023-07-26 13:24:15.235 | INFO | __main__::313 - Cut special stop segments: 0:45:10.983686
+2023-07-26 13:24:16.470 | INFO | __main__::332 - export results: 0:00:01.234829
+2023-07-26 13:24:16.474 | INFO | __main__::333 - execution time: 0:45:12.218515
+2023-07-26 13:47:14.875 | INFO | __main__::198 - Analysis date: 2023-03-15
+2023-07-26 13:48:21.112 | INFO | __main__::240 - Cut normal stop segments: 0:01:06.234095
+2023-07-26 14:04:28.944 | INFO | __main__::252 - Export results: 0:16:07.832295
+2023-07-26 14:04:28.946 | INFO | __main__::255 - execution time: 0:17:14.068003
+2023-07-26 14:04:55.833 | INFO | __main__::293 - Analysis date: 2023-03-15
+2023-07-26 14:45:50.780 | INFO | __main__::313 - Cut special stop segments: 0:40:54.937581
+2023-07-26 14:45:52.168 | INFO | __main__::332 - export results: 0:00:01.388009
+2023-07-26 14:45:52.170 | INFO | __main__::333 - execution time: 0:40:56.325590
diff --git a/rt_segment_speeds/logs/prep_stop_segments.log b/rt_segment_speeds/logs/prep_stop_segments.log
index 45c4d8cee..aaf61620b 100644
--- a/rt_segment_speeds/logs/prep_stop_segments.log
+++ b/rt_segment_speeds/logs/prep_stop_segments.log
@@ -18,3 +18,10 @@
2023-07-25 14:39:19.717 | INFO | __main__::290 - Analysis date: 2023-06-14
2023-07-25 14:49:25.861 | INFO | __main__::297 - Prep stop segment df: 0:10:06.141854
2023-07-25 14:50:12.947 | INFO | __main__::307 - execution time: 0:10:53.228185
+2023-07-25 15:55:48.582 | INFO | __main__::290 - Analysis date: 2023-07-12
+2023-07-26 12:01:47.353 | INFO | __main__::290 - Analysis date: 2023-04-12
+2023-07-26 12:19:11.448 | INFO | __main__::297 - Prep stop segment df: 0:17:24.080713
+2023-07-26 12:20:13.208 | INFO | __main__::307 - execution time: 0:18:25.839838
+2023-07-26 13:31:19.892 | INFO | __main__::290 - Analysis date: 2023-03-15
+2023-07-26 13:45:57.782 | INFO | __main__::297 - Prep stop segment df: 0:14:37.886934
+2023-07-26 13:46:51.502 | INFO | __main__::307 - execution time: 0:15:31.607489
diff --git a/rt_segment_speeds/logs/sjoin_vp_segments.log b/rt_segment_speeds/logs/sjoin_vp_segments.log
index 16eb2d1ac..b87c6e79a 100644
--- a/rt_segment_speeds/logs/sjoin_vp_segments.log
+++ b/rt_segment_speeds/logs/sjoin_vp_segments.log
@@ -75,19 +75,27 @@
2023-06-30 13:41:14.364 | INFO | __main__::298 - attach vp to stop-to-stop segments: 0:46:03.991197
2023-06-30 13:42:25.427 | INFO | __main__::308 - compiled parquets: 0:01:11.062989
2023-06-30 13:42:25.430 | INFO | __main__::309 - execution time: 0:47:15.054186
-2023-07-21 15:14:18.352 | INFO | __main__::286 - Analysis date: 2023-07-12
-2023-07-21 16:00:08.511 | INFO | __main__::298 - attach vp to stop-to-stop segments: 0:45:50.129886
-2023-07-21 16:01:13.055 | INFO | __main__::308 - compiled parquets: 0:01:04.543964
-2023-07-21 16:01:13.056 | INFO | __main__::309 - execution time: 0:46:54.673850
-2023-07-24 15:17:47.583 | INFO | __main__::286 - Analysis date: 2023-05-17
-2023-07-24 15:59:43.505 | INFO | __main__::298 - attach vp to stop-to-stop segments: 0:41:55.910214
-2023-07-24 16:00:43.071 | INFO | __main__::308 - compiled parquets: 0:00:59.566345
-2023-07-24 16:00:43.072 | INFO | __main__::309 - execution time: 0:42:55.476559
-2023-07-24 20:46:21.403 | INFO | __main__::286 - Analysis date: 2023-06-14
-2023-07-24 21:26:37.726 | INFO | __main__::298 - attach vp to stop-to-stop segments: 0:40:16.322554
-2023-07-24 21:27:37.057 | INFO | __main__::308 - compiled parquets: 0:00:59.331107
-2023-07-24 21:27:37.058 | INFO | __main__::309 - execution time: 0:41:15.653661
-2023-07-25 10:03:02.972 | INFO | __main__::286 - Analysis date: 2023-07-12
-2023-07-25 10:47:55.030 | INFO | __main__::298 - attach vp to stop-to-stop segments: 0:44:52.057560
-2023-07-25 10:48:51.630 | INFO | __main__::308 - compiled parquets: 0:00:56.599756
-2023-07-25 10:48:51.630 | INFO | __main__::309 - execution time: 0:45:48.657316
+2023-07-25 16:52:12.138 | INFO | __main__::286 - Analysis date: 2023-05-17
+2023-07-25 17:34:09.962 | INFO | __main__::298 - attach vp to stop-to-stop segments: 0:41:57.812287
+2023-07-25 17:35:06.440 | INFO | __main__::308 - compiled parquets: 0:00:56.477889
+2023-07-25 17:35:06.441 | INFO | __main__::309 - execution time: 0:42:54.290176
+2023-07-25 19:29:22.771 | INFO | __main__::286 - Analysis date: 2023-06-14
+2023-07-25 20:10:03.035 | INFO | __main__::298 - attach vp to stop-to-stop segments: 0:40:40.263436
+2023-07-25 20:11:00.673 | INFO | __main__::308 - compiled parquets: 0:00:57.637347
+2023-07-25 20:11:00.675 | INFO | __main__::309 - execution time: 0:41:37.900783
+2023-07-25 21:27:52.866 | INFO | __main__::286 - Analysis date: 2023-07-12
+2023-07-25 22:11:55.296 | INFO | __main__::298 - attach vp to stop-to-stop segments: 0:44:02.429077
+2023-07-25 22:13:11.709 | INFO | __main__::308 - compiled parquets: 0:01:16.412710
+2023-07-25 22:13:11.713 | INFO | __main__::309 - execution time: 0:45:18.841787
+2023-07-26 15:09:54.854 | INFO | __main__::286 - Analysis date: 2023-03-15
+2023-07-26 16:14:01.117 | INFO | __main__::298 - attach vp to stop-to-stop segments: 1:04:06.237100
+2023-07-26 16:15:04.057 | INFO | __main__::308 - compiled parquets: 0:01:02.939853
+2023-07-26 16:15:04.058 | INFO | __main__::309 - execution time: 1:05:09.176953
+2023-07-26 18:23:50.016 | INFO | __main__::286 - Analysis date: 2023-03-15
+2023-07-26 19:31:45.629 | INFO | __main__::298 - attach vp to stop-to-stop segments: 1:07:55.611782
+2023-07-26 19:32:43.933 | INFO | __main__::308 - compiled parquets: 0:00:58.304490
+2023-07-26 19:32:43.944 | INFO | __main__::309 - execution time: 1:08:53.916272
+2023-07-26 21:21:05.389 | INFO | __main__::286 - Analysis date: 2023-04-12
+2023-07-26 22:22:19.205 | INFO | __main__::298 - attach vp to stop-to-stop segments: 1:01:13.776816
+2023-07-26 22:23:14.796 | INFO | __main__::308 - compiled parquets: 0:00:55.591754
+2023-07-26 22:23:14.798 | INFO | __main__::309 - execution time: 1:02:09.368570
diff --git a/rt_segment_speeds/logs/speeds_by_segment_trip.log b/rt_segment_speeds/logs/speeds_by_segment_trip.log
index dacbb21f9..e72c8681c 100644
--- a/rt_segment_speeds/logs/speeds_by_segment_trip.log
+++ b/rt_segment_speeds/logs/speeds_by_segment_trip.log
@@ -31,18 +31,28 @@
2023-05-20 10:56:10.880 | INFO | __main__:linear_referencing_and_speed_by_segment:78 - calculate speeds: 0:00:00.000678
2023-05-20 11:50:39.921 | INFO | __main__::115 - speeds for stop segments: 0:54:29.046501
2023-05-20 11:50:39.924 | INFO | __main__::116 - execution time: 0:54:29.049154
-2023-07-24 16:22:19.702 | INFO | __main__::116 - Analysis date: 2023-05-17
-2023-07-24 16:52:00.420 | INFO | __main__:linear_referencing_and_speed_by_segment:84 - linear referencing: 0:29:40.696934
-2023-07-24 16:52:00.426 | INFO | __main__:linear_referencing_and_speed_by_segment:96 - calculate speeds: 0:00:00.006454
-2023-07-24 17:19:08.400 | INFO | __main__::127 - speeds for stop segments: 0:56:48.681858
-2023-07-24 17:19:08.401 | INFO | __main__::128 - execution time: 0:56:48.682781
-2023-07-24 21:48:17.827 | INFO | __main__::116 - Analysis date: 2023-06-14
-2023-07-24 22:17:10.701 | INFO | __main__:linear_referencing_and_speed_by_segment:84 - linear referencing: 0:28:52.868856
-2023-07-24 22:17:10.707 | INFO | __main__:linear_referencing_and_speed_by_segment:96 - calculate speeds: 0:00:00.005970
-2023-07-24 22:43:17.406 | INFO | __main__::127 - speeds for stop segments: 0:54:59.578336
-2023-07-24 22:43:17.407 | INFO | __main__::128 - execution time: 0:54:59.579301
-2023-07-25 11:12:28.456 | INFO | __main__::116 - Analysis date: 2023-07-12
-2023-07-25 11:49:17.338 | INFO | __main__:linear_referencing_and_speed_by_segment:84 - linear referencing: 0:36:48.876648
-2023-07-25 11:49:17.344 | INFO | __main__:linear_referencing_and_speed_by_segment:96 - calculate speeds: 0:00:00.006385
-2023-07-25 12:16:28.698 | INFO | __main__::127 - speeds for stop segments: 1:04:00.240843
-2023-07-25 12:16:28.699 | INFO | __main__::128 - execution time: 1:04:00.241659
+2023-07-25 17:55:40.998 | INFO | __main__::116 - Analysis date: 2023-05-17
+2023-07-25 18:25:17.498 | INFO | __main__:linear_referencing_and_speed_by_segment:84 - linear referencing: 0:29:36.467955
+2023-07-25 18:25:17.508 | INFO | __main__:linear_referencing_and_speed_by_segment:96 - calculate speeds: 0:00:00.009634
+2023-07-25 18:51:59.936 | INFO | __main__::127 - speeds for stop segments: 0:56:18.910904
+2023-07-25 18:51:59.937 | INFO | __main__::128 - execution time: 0:56:18.911558
+2023-07-25 20:31:00.338 | INFO | __main__::116 - Analysis date: 2023-06-14
+2023-07-25 20:59:50.907 | INFO | __main__:linear_referencing_and_speed_by_segment:84 - linear referencing: 0:28:50.563683
+2023-07-25 20:59:50.913 | INFO | __main__:linear_referencing_and_speed_by_segment:96 - calculate speeds: 0:00:00.006050
+2023-07-25 21:26:12.883 | INFO | __main__::127 - speeds for stop segments: 0:55:12.544803
+2023-07-25 21:26:12.884 | INFO | __main__::128 - execution time: 0:55:12.545554
+2023-07-25 22:48:46.313 | INFO | __main__::116 - Analysis date: 2023-07-12
+2023-07-25 23:31:55.464 | INFO | __main__:linear_referencing_and_speed_by_segment:84 - linear referencing: 0:43:09.123043
+2023-07-25 23:31:55.493 | INFO | __main__:linear_referencing_and_speed_by_segment:96 - calculate speeds: 0:00:00.028630
+2023-07-26 00:00:04.530 | INFO | __main__::127 - speeds for stop segments: 1:11:18.196214
+2023-07-26 00:00:04.531 | INFO | __main__::128 - execution time: 1:11:18.197170
+2023-07-26 20:02:52.083 | INFO | __main__::116 - Analysis date: 2023-03-15
+2023-07-26 20:34:36.201 | INFO | __main__:linear_referencing_and_speed_by_segment:84 - linear referencing: 0:31:44.113095
+2023-07-26 20:34:36.209 | INFO | __main__:linear_referencing_and_speed_by_segment:96 - calculate speeds: 0:00:00.007907
+2023-07-26 21:12:55.949 | INFO | __main__::127 - speeds for stop segments: 1:10:03.865319
+2023-07-26 21:12:55.950 | INFO | __main__::128 - execution time: 1:10:03.866321
+2023-07-26 22:48:56.041 | INFO | __main__::116 - Analysis date: 2023-04-12
+2023-07-26 23:21:16.317 | INFO | __main__:linear_referencing_and_speed_by_segment:84 - linear referencing: 0:32:20.261839
+2023-07-26 23:21:16.327 | INFO | __main__:linear_referencing_and_speed_by_segment:96 - calculate speeds: 0:00:00.009498
+2023-07-26 23:50:59.023 | INFO | __main__::127 - speeds for stop segments: 1:02:02.972239
+2023-07-26 23:50:59.024 | INFO | __main__::128 - execution time: 1:02:02.973670
diff --git a/rt_segment_speeds/logs/usable_rt_vp.log b/rt_segment_speeds/logs/usable_rt_vp.log
index 7fc9bbad3..70907d754 100644
--- a/rt_segment_speeds/logs/usable_rt_vp.log
+++ b/rt_segment_speeds/logs/usable_rt_vp.log
@@ -16,3 +16,9 @@
2023-07-14 13:40:47.502 | INFO | __main__::151 - Analysis date: 2023-07-12
2023-07-14 13:42:34.741 | INFO | __main__::164 - pare down vp
2023-07-14 13:42:34.742 | INFO | __main__::167 - execution time: 0:01:47.239089
+2023-07-26 18:18:52.727 | INFO | __main__::151 - Analysis date: 2023-03-15
+2023-07-26 18:23:25.887 | INFO | __main__::164 - pare down vp
+2023-07-26 18:23:25.901 | INFO | __main__::167 - execution time: 0:04:33.151084
+2023-07-26 21:16:48.752 | INFO | __main__::151 - Analysis date: 2023-04-12
+2023-07-26 21:20:41.866 | INFO | __main__::164 - pare down vp
+2023-07-26 21:20:41.895 | INFO | __main__::167 - execution time: 0:03:53.140859
diff --git a/rt_segment_speeds/logs/valid_vehicle_positions.log b/rt_segment_speeds/logs/valid_vehicle_positions.log
index 93b10b576..7abaeca64 100644
--- a/rt_segment_speeds/logs/valid_vehicle_positions.log
+++ b/rt_segment_speeds/logs/valid_vehicle_positions.log
@@ -159,3 +159,57 @@
2023-07-25 10:57:31.262 | INFO | __main__::344 - Analysis date: 2023-07-12
2023-07-25 11:10:59.353 | INFO | __main__::358 - pare down vp by stop segments special cases 0:13:28.085738
2023-07-25 11:10:59.355 | INFO | __main__::361 - execution time: 0:13:28.091919
+2023-07-25 17:35:24.578 | INFO | __main__::157 - Analysis date: 2023-05-17
+2023-07-25 17:38:17.011 | INFO | __main__:pare_down_vp_by_segment:130 - merge usable vp with sjoin results: 0:02:52.414746
+2023-07-25 17:38:17.141 | INFO | __main__:pare_down_vp_by_segment:139 - keep enter/exit points: 0:00:00.130051
+2023-07-25 17:42:44.665 | INFO | __main__:pare_down_vp_by_segment:145 - exported: 0:04:27.523972
+2023-07-25 17:42:44.667 | INFO | __main__::171 - pare down vp by stop segments normal cases 0:07:20.071086
+2023-07-25 17:42:44.667 | INFO | __main__::174 - execution time: 0:07:20.075709
+2023-07-25 17:43:02.687 | INFO | __main__::344 - Analysis date: 2023-05-17
+2023-07-25 17:54:28.505 | INFO | __main__::358 - pare down vp by stop segments special cases 0:11:25.812402
+2023-07-25 17:54:28.506 | INFO | __main__::361 - execution time: 0:11:25.817451
+2023-07-25 20:11:18.015 | INFO | __main__::157 - Analysis date: 2023-06-14
+2023-07-25 20:13:59.584 | INFO | __main__:pare_down_vp_by_segment:130 - merge usable vp with sjoin results: 0:02:41.562999
+2023-07-25 20:13:59.739 | INFO | __main__:pare_down_vp_by_segment:139 - keep enter/exit points: 0:00:00.154597
+2023-07-25 20:18:02.221 | INFO | __main__:pare_down_vp_by_segment:145 - exported: 0:04:02.482467
+2023-07-25 20:18:02.223 | INFO | __main__::171 - pare down vp by stop segments normal cases 0:06:44.202068
+2023-07-25 20:18:02.224 | INFO | __main__::174 - execution time: 0:06:44.207623
+2023-07-25 20:18:18.862 | INFO | __main__::344 - Analysis date: 2023-06-14
+2023-07-25 20:29:41.529 | INFO | __main__::358 - pare down vp by stop segments special cases 0:11:22.662479
+2023-07-25 20:29:41.530 | INFO | __main__::361 - execution time: 0:11:22.667589
+2023-07-25 22:13:36.796 | INFO | __main__::157 - Analysis date: 2023-07-12
+2023-07-25 22:16:54.225 | INFO | __main__:pare_down_vp_by_segment:130 - merge usable vp with sjoin results: 0:03:17.415267
+2023-07-25 22:16:54.345 | INFO | __main__:pare_down_vp_by_segment:139 - keep enter/exit points: 0:00:00.119414
+2023-07-25 22:22:49.255 | INFO | __main__:pare_down_vp_by_segment:145 - exported: 0:05:54.910487
+2023-07-25 22:22:49.260 | INFO | __main__::171 - pare down vp by stop segments normal cases 0:09:12.450290
+2023-07-25 22:22:49.263 | INFO | __main__::174 - execution time: 0:09:12.463640
+2023-07-25 22:23:28.707 | INFO | __main__::344 - Analysis date: 2023-07-12
+2023-07-25 22:46:48.934 | INFO | __main__::358 - pare down vp by stop segments special cases 0:23:20.216783
+2023-07-25 22:46:48.962 | INFO | __main__::361 - execution time: 0:23:20.251985
+2023-07-26 16:15:26.883 | INFO | __main__::157 - Analysis date: 2023-03-15
+2023-07-26 16:19:15.751 | INFO | __main__:pare_down_vp_by_segment:130 - merge usable vp with sjoin results: 0:03:48.836153
+2023-07-26 16:19:15.994 | INFO | __main__:pare_down_vp_by_segment:139 - keep enter/exit points: 0:00:00.242565
+2023-07-26 16:25:56.532 | INFO | __main__:pare_down_vp_by_segment:145 - exported: 0:06:40.538840
+2023-07-26 16:25:56.535 | INFO | __main__::171 - pare down vp by stop segments normal cases 0:10:29.620559
+2023-07-26 16:25:56.536 | INFO | __main__::174 - execution time: 0:10:29.629418
+2023-07-26 16:26:18.870 | INFO | __main__::344 - Analysis date: 2023-03-15
+2023-07-26 16:44:55.208 | INFO | __main__::358 - pare down vp by stop segments special cases 0:18:36.327270
+2023-07-26 16:44:55.211 | INFO | __main__::361 - execution time: 0:18:36.338508
+2023-07-26 19:33:08.404 | INFO | __main__::157 - Analysis date: 2023-03-15
+2023-07-26 19:36:57.092 | INFO | __main__:pare_down_vp_by_segment:130 - merge usable vp with sjoin results: 0:03:48.678126
+2023-07-26 19:36:57.276 | INFO | __main__:pare_down_vp_by_segment:139 - keep enter/exit points: 0:00:00.183628
+2023-07-26 19:43:45.644 | INFO | __main__:pare_down_vp_by_segment:145 - exported: 0:06:48.367899
+2023-07-26 19:43:45.651 | INFO | __main__::171 - pare down vp by stop segments normal cases 0:10:37.237054
+2023-07-26 19:43:45.654 | INFO | __main__::174 - execution time: 0:10:37.246745
+2023-07-26 19:44:10.257 | INFO | __main__::344 - Analysis date: 2023-03-15
+2023-07-26 20:01:26.984 | INFO | __main__::358 - pare down vp by stop segments special cases 0:17:16.715529
+2023-07-26 20:01:26.986 | INFO | __main__::361 - execution time: 0:17:16.724971
+2023-07-26 22:23:33.140 | INFO | __main__::157 - Analysis date: 2023-04-12
+2023-07-26 22:26:50.268 | INFO | __main__:pare_down_vp_by_segment:130 - merge usable vp with sjoin results: 0:03:17.096219
+2023-07-26 22:26:50.417 | INFO | __main__:pare_down_vp_by_segment:139 - keep enter/exit points: 0:00:00.148960
+2023-07-26 22:32:23.388 | INFO | __main__:pare_down_vp_by_segment:145 - exported: 0:05:32.971007
+2023-07-26 22:32:23.391 | INFO | __main__::171 - pare down vp by stop segments normal cases 0:08:50.218961
+2023-07-26 22:32:23.392 | INFO | __main__::174 - execution time: 0:08:50.225194
+2023-07-26 22:32:42.683 | INFO | __main__::344 - Analysis date: 2023-04-12
+2023-07-26 22:47:30.729 | INFO | __main__::358 - pare down vp by stop segments special cases 0:14:48.039668
+2023-07-26 22:47:30.730 | INFO | __main__::361 - execution time: 0:14:48.045353
diff --git a/rt_segment_speeds/scripts/calculate-trip-avg.ipynb b/rt_segment_speeds/scripts/calculate-trip-avg.ipynb
deleted file mode 100644
index fecd4ac20..000000000
--- a/rt_segment_speeds/scripts/calculate-trip-avg.ipynb
+++ /dev/null
@@ -1,917 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "8f0f779b-50a4-4fa6-9fe9-9632c161316a",
- "metadata": {},
- "source": [
- "# Average speeds across entire trip"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "id": "52c8d239-4897-42e0-80e0-2352d4b0a79b",
- "metadata": {},
- "outputs": [],
- "source": [
- "#import os\n",
- "#os.environ['USE_PYGEOS'] = '0'\n",
- "# turning this off makes to_crs really slow\n",
- "\n",
- "import dask.dataframe as dd\n",
- "import dask_geopandas as dg\n",
- "import folium\n",
- "import geopandas as gpd\n",
- "import numpy as np\n",
- "import pandas as pd\n",
- "import shapely\n",
- "\n",
- "from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes\n",
- "from segment_speed_utils.project_vars import (SEGMENT_GCS,\n",
- " CONFIG_PATH, PROJECT_CRS\n",
- " )\n",
- "analysis_date = \"2023-05-17\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "id": "5e5d3ceb-f859-4583-b361-e1b51c668a68",
- "metadata": {},
- "outputs": [],
- "source": [
- "df = pd.read_parquet(f\"{SEGMENT_GCS}trip_summary/trip_speed_{analysis_date}.parquet\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "id": "07fb6ffa-6fdd-4a14-be21-c31d6afe653f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(31, 14)"
- ]
- },
- "execution_count": 43,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df.speed_mph >= 60].shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "id": "85dca4cc-9b3a-4b3a-806e-51a8d1539833",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(4686, 14)"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df.speed_mph <= 3].shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "fe0a82c9-214f-4c8f-9928-7da9b9a8adf3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(68556, 14)"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "7e3f17f1-1883-41c0-aee2-6f560812442a",
- "metadata": {},
- "outputs": [],
- "source": [
- "# in case there are fewer shapes to grab\n",
- "shapes_list = df.shape_array_key.unique().tolist()\n",
- "\n",
- "shapes = helpers.import_scheduled_shapes(\n",
- " analysis_date,\n",
- " columns = [\"shape_array_key\",\"geometry\"],\n",
- " filters = [[(\"shape_array_key\", \"in\", shapes_list)]],\n",
- " get_pandas = True,\n",
- " crs = PROJECT_CRS\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "0122c274-e71c-407e-84c3-743899e9b525",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/opt/conda/lib/python3.9/site-packages/pygeos/linear.py:87: RuntimeWarning: invalid value encountered in line_locate_point\n"
- ]
- }
- ],
- "source": [
- "linear_ref = wrangle_shapes.linear_reference_vp_against_segment(\n",
- " df,\n",
- " shapes,\n",
- " segment_identifier_cols = [\"shape_array_key\"]\n",
- ").compute()\n",
- "\n",
- "linear_ref.to_parquet(\"test.parquet\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0c66c64a-a4b9-454a-9e7e-f6253085c160",
- "metadata": {},
- "outputs": [],
- "source": [
- "linear_ref = pd.read_parquet(\"test.parquet\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "f4b557a8-019a-4ea5-b544-2a810515c5ab",
- "metadata": {},
- "outputs": [],
- "source": [
- "def distance_and_seconds_elapsed(\n",
- " df: pd.DataFrame, \n",
- " group_cols: list\n",
- ") -> pd.DataFrame:\n",
- " \"\"\"\n",
- " If every trip has 3 vp, we want the change in time and distance\n",
- " between 1st and 2nd, 2nd and 3rd.\n",
- " Then, sum up the change in time and change by trip.\n",
- " \"\"\"\n",
- " dist_col = \"shape_meters\"\n",
- " time_col = \"location_timestamp_local\"\n",
- " sort_cols = group_cols + [\"vp_idx\"]\n",
- " \n",
- "\n",
- " df = df.assign(\n",
- " prior_dist = (df.sort_values(sort_cols)\n",
- " .groupby(group_cols, \n",
- " observed=True, group_keys=False)\n",
- " [dist_col]\n",
- " .apply(lambda x: x.shift(1))\n",
- " ),\n",
- " prior_time = (df.sort_values(sort_cols)\n",
- " .groupby(group_cols, \n",
- " observed=True, group_keys=False)\n",
- " [time_col]\n",
- " .apply(lambda x: x.shift(1))\n",
- " ) \n",
- " )\n",
- " \n",
- " df = df.assign(\n",
- " change_meters = df[dist_col] - df.prior_dist,\n",
- " change_sec = (df[time_col] - df.prior_time).divide(\n",
- " np.timedelta64(1, 's'))\n",
- " )\n",
- " \n",
- " df2 = (df.groupby(group_cols, \n",
- " observed=True, group_keys=False)\n",
- " .agg({\"change_meters\": \"sum\", \n",
- " \"change_sec\": \"sum\"})\n",
- " .reset_index()\n",
- " )\n",
- " \n",
- " df2 = df2.assign(\n",
- " speed_mph = (df2.change_meters.divide(df2.change_sec) * \n",
- " rt_utils.MPH_PER_MPS)\n",
- " )\n",
- " \n",
- " return df2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "192d6fd1-c8c3-461f-a3fe-1c350fb6096c",
- "metadata": {},
- "outputs": [],
- "source": [
- "from shared_utils import rt_utils\n",
- "\n",
- "speed = distance_and_seconds_elapsed(\n",
- " linear_ref, \n",
- " group_cols = [\"gtfs_dataset_key\", \"trip_id\"]\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "9cdf3484-20d8-499a-a3e4-41f7afae23a3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(68556, 5)"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "speed.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "f28f722f-b481-4543-ae5c-f678e69b30a2",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(20, 5)"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "speed[speed.speed_mph>=70].shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "1346a085-dc35-4792-b487-71dd0b559d80",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(4378, 5)"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "speed[speed.speed_mph<=2].shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4420a0f0-c63d-4d24-9e80-919a7fe32744",
- "metadata": {},
- "outputs": [],
- "source": [
- "def aggregate_by_operator_route_time_of_day():"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6ca60770-45c1-4aab-b0e9-8b1830d9250e",
- "metadata": {},
- "outputs": [],
- "source": [
- "#test_key = \"00accf770009aafd5dc103ff2eeddb37\"\n",
- "#test_trip = \"t_1995375_b_33395_tn_0\"\n",
- "test_shape = \"70f010e0dba18191937ed4b5bea42e8a\""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "dd00a9e9-f60a-4cff-9870-b8d93b763a7d",
- "metadata": {},
- "source": [
- "This trip has a lot of vp that end up not being joined to any segment.\n",
- "Including those vp far away from the shape mean that the interpolation results show the same thing, because essentially, all those points fall closest to the one end of the shape, and when taking the difference in `shape_meters`, the difference is zero.\n",
- "\n",
- "This is a compelling reason to add the % of segments touched in the sjoin results. Before, we used time cutoff, because it's easier to implement. '\n",
- "\n",
- "At least for calculating trip average speeds, we do need to touch at least 50% of the segments, or even 70% of segments as recommended in notebook, to only calculate entire trip averages on trips that have enough vp.\n",
- "\n",
- "The con of using % of segments is that it becomes even more crucial that segments are cut correctly. If we miss a segment (which we might, currently), there are vp that are not being joined, and we may throw out too many trips because it fails the % segments threshold.\n",
- "\n",
- "For now, let's take the sjoin results and use a couple points to triangulate the distance. Make an array, and pick points either every 10 min or at least 3 points to calculate distance."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "25017d03-f86a-4cc3-a5f6-584e6b647952",
- "metadata": {},
- "outputs": [],
- "source": [
- "ddf = A2.merge_usable_vp_with_sjoin_vpidx(\n",
- " [test_shape],\n",
- " USABLE_FILE,\n",
- " SJOIN_FILE,\n",
- " SEGMENT_IDENTIFIER_COLS,\n",
- " GROUPING_COL\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c9ecb54b-6a8c-4b43-abcd-40d652ac92cd",
- "metadata": {},
- "outputs": [],
- "source": [
- "ddf = ddf.compute()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f5bc03d7-5ebf-4100-bdc0-250876f1c04e",
- "metadata": {},
- "outputs": [],
- "source": [
- "from shared_utils import geography_utils\n",
- "\n",
- "ddf = geography_utils.create_point_geometry(ddf, \"x\", \"y\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f940224a-57e1-4d67-a0d7-8706733aa9fd",
- "metadata": {},
- "outputs": [],
- "source": [
- "crosswalk = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(\n",
- " analysis_date, \n",
- " [\"feed_key\", \"trip_id\", GROUPING_COL, \"shape_id\"] \n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "50cc7daf-b044-4add-9f4e-85424e7b514c",
- "metadata": {},
- "outputs": [],
- "source": [
- "shapes = helpers.import_scheduled_shapes(\n",
- " analysis_date,\n",
- " columns = [\"shape_array_key\", \"geometry\"],\n",
- " filters = [[(\"shape_array_key\", \"in\", [test_shape])]],\n",
- " get_pandas = True,\n",
- " crs = PROJECT_CRS\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c9ac9777-0fd1-4cb7-90bf-1718a7e75e93",
- "metadata": {},
- "outputs": [],
- "source": [
- "shapes2 = pd.merge(\n",
- " shapes,\n",
- " crosswalk,\n",
- " on = \"shape_array_key\",\n",
- " how = \"inner\"\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "785628db-70b1-4152-a507-d35cb610f29c",
- "metadata": {},
- "outputs": [],
- "source": [
- "ddf2 = ddf.to_crs(PROJECT_CRS).drop(\n",
- " columns = [\"location_timestamp\", \"location_timestamp_local\", \n",
- " \"activity_date\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "956f499e-475f-4427-9338-c9c6062d97af",
- "metadata": {},
- "outputs": [],
- "source": [
- "m = ddf2.explore(\"trip_id\", tiles = \"CartoDB Positron\")\n",
- "m = shapes2.explore(m=m, color=\"yellow\", name=\"shape\")\n",
- "folium.LayerControl().add_to(m)\n",
- "m"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1450a48c-47f8-429a-b92b-b4b73f6893a9",
- "metadata": {},
- "source": [
- "## Triangulate vp based on sjoin results"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2a377e7e-e6c6-489f-b035-f4937622b3a8",
- "metadata": {},
- "outputs": [],
- "source": [
- "def list_of_vp_by_trip(\n",
- " df: pd.DataFrame, \n",
- " group_cols: list = [\"gtfs_dataset_key\", \"trip_id\"]\n",
- ") -> pd.DataFrame:\n",
- "\n",
- " df2 = (df.groupby(trip_cols, observed=True)\n",
- " .agg({\"vp_idx\": list})\n",
- " .reset_index()\n",
- " )\n",
- " \n",
- " return df2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2f294723-e7c9-46aa-bea1-7ea8397b781b",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "20d94c32-2b9d-448b-bfa4-d4834f923dd4",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4f475c8d-caae-4a13-a3ef-a1d7a8943752",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e2e2ab41-148b-44e4-976f-1a5c836405ec",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2cd529e8-4dca-446b-a540-236e99d6f78d",
- "metadata": {},
- "outputs": [],
- "source": [
- "by_trip_ddfs = [list_of_vp_by_trip(df, trip_cols) for df in subset_vp_ddfs]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6c59093a-5f29-408b-a460-fcea81d44c0c",
- "metadata": {},
- "outputs": [],
- "source": [
- "one = by_trip_ddfs[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c70056d1-7a0b-4b59-a5c3-d84bc328b4e6",
- "metadata": {},
- "outputs": [],
- "source": [
- "trip_df = compute(one)[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c4df9dfb-5e12-4e16-8078-ed8e8be1c95d",
- "metadata": {},
- "outputs": [],
- "source": [
- "def count_vp_and_get_every_10_min(my_list: list):\n",
- " vp_idx_arr = np.asarray(my_list)\n",
- " subset_arr = vp_idx_arr[::30]\n",
- " \n",
- " if len(subset_arr) < 3:\n",
- " subset_arr = vp_idx_arr[:15]\n",
- " \n",
- " return list(subset_arr)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d674a955-b843-4045-9f85-47293aadecaa",
- "metadata": {},
- "outputs": [],
- "source": [
- "trip_df = trip_df.assign(\n",
- " vp_idx2 = trip_df.apply(\n",
- " lambda x: \n",
- " count_vp_and_get_every_10_min(x.vp_idx), \n",
- " axis=1, meta=('vp_idx2', 'object'))\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6beb7014-d275-4a7e-92ff-ad94787602c3",
- "metadata": {},
- "outputs": [],
- "source": [
- "keep_subset_vp = trip_df.vp_idx2.explode()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8bf189ec-f3c5-4e99-8879-531fecca531d",
- "metadata": {},
- "outputs": [],
- "source": [
- "ddf_subset = ddf[ddf.vp_idx.isin(keep_subset_vp)][\n",
- " [\"gtfs_dataset_key\", \"trip_id\",\n",
- " \"location_timestamp_local\",\n",
- " \"x\", \"y\", \"vp_idx\"]]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "747033ce-549a-4dc7-9877-b3c758a6d692",
- "metadata": {},
- "outputs": [],
- "source": [
- "crosswalk = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(\n",
- " analysis_date, \n",
- " [\"feed_key\", \"trip_id\", GROUPING_COL]\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e4ca3a07-1246-41a9-baa9-666e01f6c8dd",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset_vp_shape = delayed(dd.merge)(\n",
- " ddf_subset,\n",
- " crosswalk,\n",
- " on = [\"gtfs_dataset_key\", \"trip_id\"],\n",
- " how = \"inner\"\n",
- ").drop_duplicates()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "95b33d2c-7991-44af-b087-7720ad90762a",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset_shapes = subset_vp_shape.shape_array_key.unique().persist()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3034219b-99d2-4f64-bcb4-f6082e32760e",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset_shapes"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2a5214d5-6523-4eeb-800f-dc4088aa7a13",
- "metadata": {},
- "outputs": [],
- "source": [
- "shapes = helpers.import_scheduled_shapes(\n",
- " analysis_date,\n",
- " columns = [\"shape_array_key\", \"geometry\"],\n",
- " filters = [[(\"shape_array_key\", \"in\", subset_shapes)]],\n",
- " get_pandas = True,\n",
- " crs = PROJECT_CRS\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f0c78935-f7ab-4d78-9261-fdacf96e8abe",
- "metadata": {},
- "outputs": [],
- "source": [
- "RT_OPERATORS = subset_vp_shape.gtfs_dataset_key.unique().compute()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9155289c-35bc-4bdc-bfe6-a67b670309dc",
- "metadata": {},
- "outputs": [],
- "source": [
- "test_operator = RT_OPERATORS[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "18b2632f-c9da-4161-9efc-dc7d773cb3c1",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset_vp_operator = subset_vp_shape[\n",
- " subset_vp_shape.gtfs_dataset_key==test_operator]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d2c209eb-b7d5-40ac-b208-6be674e59308",
- "metadata": {},
- "outputs": [],
- "source": [
- "linear_ref_operator = delayed(\n",
- " wrangle_shapes.linear_reference_vp_against_segment)(\n",
- " subset_vp_operator,\n",
- " shapes,\n",
- " segment_identifier_cols = [GROUPING_COL]\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "28593d93-f10e-4b55-8113-2a4b96a0664a",
- "metadata": {},
- "outputs": [],
- "source": [
- "linear_ref = delayed(wrangle_shapes.linear_reference_vp_against_segment)(\n",
- " subset_vp_shape,\n",
- " shapes,\n",
- " segment_identifier_cols = [GROUPING_COL]\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "71530c79-c2cf-4d54-8b14-fdd9d0810c26",
- "metadata": {},
- "outputs": [],
- "source": [
- "linear_ref"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2b1a5e65-92ea-4687-af86-306efc54cd27",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9135590c-d7e2-4fc7-876b-88b9e99cf5a5",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e3defe7e-2f37-4b9e-b149-200433924255",
- "metadata": {},
- "outputs": [],
- "source": [
- "operators = dd.read_parquet(\n",
- " f\"{SEGMENT_GCS}{INPUT_FILE}\", \n",
- " columns = [\"gtfs_dataset_key\"]\n",
- ").gtfs_dataset_key.unique().compute().tolist()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "010db624-33bc-4fe7-8f9f-f957ff183f8a",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "87843b73-9963-4ad6-8239-af991b2fdb47",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset_operators = operators[:2]\n",
- "subset_operators"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "02d6ccf0-5f72-4502-a0a0-43da58b088e1",
- "metadata": {},
- "outputs": [],
- "source": [
- "ddf = dd.read_parquet(\n",
- " f\"{SEGMENT_GCS}{INPUT_FILE}\", \n",
- " filters = [[(\"gtfs_dataset_key\", \"in\", subset_operators)]],\n",
- " columns = [\"vp_idx\"]\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "bc95f22a-549e-4176-9ff5-4d2f0dbaac0d",
- "metadata": {},
- "outputs": [],
- "source": [
- "trip_cols = [\"gtfs_dataset_key\", \"trip_id\"]\n",
- "hour_min_cols = [\"hour\", \"minute\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5cb85263-ca3c-49ac-acc8-2b03bb173a9b",
- "metadata": {},
- "source": [
- "## Pings per minute for service hours"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "22f194bd-5713-478e-b7ab-5634a6c86a53",
- "metadata": {},
- "outputs": [],
- "source": [
- "ddf = ddf.repartition(npartitions=5)\n",
- "\n",
- "ddf = ddf.assign(\n",
- " minute = ddf.location_timestamp_local.dt.minute\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b26e222c-8c20-43f3-b706-4bf311a7fda8",
- "metadata": {},
- "outputs": [],
- "source": [
- "ddf.dtypes"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "de3ab88f-97b3-45d3-95ef-28a907a25d1f",
- "metadata": {},
- "outputs": [],
- "source": [
- "num_vp_pings = (ddf.groupby(trip_cols + hour_min_cols, observed=True)\n",
- " [\"location_timestamp_local\"]\n",
- " .count()\n",
- " .dropna()\n",
- " .reset_index()\n",
- " .rename(columns = {\"location_timestamp_local\": \"num_pings\"})\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4938392d-a2ee-473e-91b5-9805b0aceb14",
- "metadata": {},
- "outputs": [],
- "source": [
- "num_vp_pings = num_vp_pings.assign(\n",
- " atleast2 = num_vp_pings.apply(\n",
- " lambda x: 1 if x.num_pings >= 2\n",
- " else 0, axis=1, meta=('atleast2', 'int8'))\n",
- " ) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0fcb4ff1-04f6-4870-9e67-4391be2508a3",
- "metadata": {},
- "outputs": [],
- "source": [
- "vp_pings = (num_vp_pings.groupby(trip_cols)\n",
- " .agg({\n",
- " \"hour\": \"size\",\n",
- " \"atleast2\": \"sum\"})\n",
- " .dropna()\n",
- " .reset_index()\n",
- " ).rename(columns = {\n",
- " \"hour\": \"trip_min_elapsed\"})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "813ec2d3-8cf8-4e44-96bd-405ad65c19a9",
- "metadata": {},
- "outputs": [],
- "source": [
- "vp_pings = vp_pings.persist()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b427d9db-0c0e-43b9-9b82-464dd923d3e0",
- "metadata": {},
- "outputs": [],
- "source": [
- "vp_pings.compute()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}