diff --git a/_shared_utils/shared_utils/rt_utils.py b/_shared_utils/shared_utils/rt_utils.py
index d7f906da9..87ceba3fc 100644
--- a/_shared_utils/shared_utils/rt_utils.py
+++ b/_shared_utils/shared_utils/rt_utils.py
@@ -601,10 +601,10 @@ def arrowize_segment(line_geometry, buffer_distance: int = 20):
end = shapely.ops.substring(end_segment, end_segment.length, end_segment.length) # correct
r_shift = end_segment.parallel_offset(shift_distance, "right")
r_pt = shapely.ops.substring(r_shift, 0, 0)
- r_pt2 = shapely.ops.substring(r_shift, r_shift.length - arrow_distance, r_shift.length - arrow_distance)
+ r_pt2 = shapely.ops.substring(r_shift, r_shift.length, r_shift.length)
l_shift = end_segment.parallel_offset(shift_distance, "left")
l_pt = shapely.ops.substring(l_shift, 0, 0)
- l_pt2 = shapely.ops.substring(l_shift, arrow_distance, arrow_distance)
+ l_pt2 = shapely.ops.substring(l_shift, l_shift.length, l_shift.length)
t1 = shapely.geometry.Polygon((l_pt2, end, l_pt)) # triangles to cut top of arrow
t2 = shapely.geometry.Polygon((r_pt2, end, r_pt))
diff --git a/rt_segment_speeds/30_interpolated_segments.ipynb b/rt_segment_speeds/30_interpolated_segments.ipynb
new file mode 100644
index 000000000..cc056296c
--- /dev/null
+++ b/rt_segment_speeds/30_interpolated_segments.ipynb
@@ -0,0 +1,1974 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "b61fc969-d940-4f3d-97d3-dd666c02bf2e",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.environ[\"CALITP_BQ_MAX_BYTES\"] = str(800_000_000_000)\n",
+ "\n",
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "\n",
+ "import dask.dataframe as dd\n",
+ "\n",
+ "from segment_speed_utils import helpers\n",
+ "from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date\n",
+ "from shared_utils.rt_utils import arrowize_segment\n",
+ "from shared_utils import gtfs_utils_v2\n",
+ "from calitp_data_analysis import geography_utils, utils\n",
+ "from calitp_data_analysis.tables import tbls\n",
+ "\n",
+ "\n",
+ "import numpy as np\n",
+ "import shapely\n",
+ "from siuba import *"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bd52224f-d256-47da-b7db-a9436e87ee0d",
+ "metadata": {},
+ "source": [
+ "# Add interpolated segments to pipeline\n",
+ "\n",
+ "* Additional detail between widely spaced stops is more useful for speed analysis"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "caba4d72-7e58-4abc-88fa-83425df20a14",
+ "metadata": {},
+ "source": [
+ "## Check existing segments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "b0a70df9-f929-46c3-8082-7a4e9030700f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# single GCS prefix shared by every parquet read/written in this notebook\n",
+ "GCS_PATH = 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/segment_options/'\n",
+ "segs_path = GCS_PATH + 'stop_segments_2024-03-13.parquet'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "46158cda-fccb-40e7-b322-46ef22de46b9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read via geopandas so that geometry stays intact\n",
+ "segs = gpd.read_parquet(segs_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "439cc654-8e02-4804-9692-d7b9395104ea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# segs >> head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fe3ec77c-3455-416e-bba9-14c303f750a8",
+ "metadata": {},
+ "source": [
+ "## Less than 6% of segments need to be interpolated...\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "e75fe998-7cb1-40c2-a89e-6a22e893e42c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "997.6817468833709"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "segs.length.quantile(.94)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f6b1e941-11d5-474a-9250-b5b47fb6c49d",
+ "metadata": {},
+ "source": [
+ "# Additional Columns Required\n",
+ "\n",
+ "Should probably happen upstream in `cut_stop_segments.py`, related scripts...\n",
+ "\n",
+ "* `length`: float, `geometry.length`\n",
+ "* `next_stop_sequence`: lead of `stop_sequence`, should include final stop seq (final stop seq unavailable here since shifting from existing df...)\n",
+ " * alternatively, rename `stop_sequence` -> `stop_sequence1` and add `stop_sequence2` (consistent with existing `stop_id1` and `stop_id2`)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d624a026-968f-49b2-b6e6-ad38811fa924",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Adding these after the fact for testing..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "8d783355-2bef-47ce-8251-aaef211400cf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# segs_ddf = dd.from_pandas(segs, npartitions=50)\n",
+ "\n",
+ "# segs_ddf = segs_ddf.assign(length=lambda x: x.geometry.length)\n",
+ "\n",
+ "# next_sequence_ddf = segs_ddf[['trip_instance_key', 'stop_sequence']].groupby('trip_instance_key').shift(-1)\n",
+ "# next_sequence_ddf = next_sequence_ddf.rename(columns={'stop_sequence': 'next_stop_sequence'})\n",
+ "\n",
+ "# # note this relies on the index\n",
+ "# segs_ddf = segs_ddf.join(next_sequence_ddf)\n",
+ "\n",
+ "# !mkdir test_segs\n",
+ "\n",
+ "# segs_ddf[['next_stop_sequence', 'length']].to_parquet('test_segs/')\n",
+ "\n",
+ "# # new_cols = segs_ddf.compute()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "52d148c7-ed25-49fd-b684-0097ef95ee73",
+ "metadata": {},
+ "source": [
+ "### restart kernel/separate script"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "2120da8e-a98f-48a2-97ad-87e11b82a432",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# segs_ddf = dd.read_parquet('test_segs/')\n",
+ "\n",
+ "# segs_ddf\n",
+ "\n",
+ "# segs = segs.join(segs_ddf.compute())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "id": "1d757efc-be7f-463b-9ace-e5bd9c1e6c9b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "intermediate_file = 'test_interpolated_intermediate_2024-03-13.parquet'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "id": "7f9ed931-1782-4887-8ce3-f3804cf4b6d6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "utils.geoparquet_gcs_export(segs, GCS_PATH, intermediate_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "id": "f42cd425-4512-4f4d-b9a3-fdeb8493d009",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "segs = gpd.read_parquet(GCS_PATH+intermediate_file)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8c9ba88b-27a4-4288-88ce-f368d1507ba3",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Find BBBR10"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "b892f6ea-67d0-4295-b014-10532d2408b9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bbb = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name('2024-03-13') >> filter(_.name.str.contains('Big Blue'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "1fc3d7d1-8b0d-4d2b-82d6-cbb741ae4ff1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " key | \n",
+ " date | \n",
+ " feed_key | \n",
+ " feed_timezone | \n",
+ " base64_url | \n",
+ " gtfs_dataset_key | \n",
+ " gtfs_dataset_name | \n",
+ " name | \n",
+ " type | \n",
+ " regional_feed_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 219 | \n",
+ " c35ae2a7cb1f9ad2cf0a5bce84ccf298 | \n",
+ " 2024-03-13 | \n",
+ " 4f9888472a8dad0f66bdbbd002312789 | \n",
+ " America/Los_Angeles | \n",
+ " aHR0cDovL2d0ZnMuYmlnYmx1ZWJ1cy5jb20vY3VycmVudC... | \n",
+ " efbbd5293be71f7a5de0cf82b59febe1 | \n",
+ " Big Blue Bus Schedule | \n",
+ " Big Blue Bus Schedule | \n",
+ " schedule | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " key date \\\n",
+ "219 c35ae2a7cb1f9ad2cf0a5bce84ccf298 2024-03-13 \n",
+ "\n",
+ " feed_key feed_timezone \\\n",
+ "219 4f9888472a8dad0f66bdbbd002312789 America/Los_Angeles \n",
+ "\n",
+ " base64_url \\\n",
+ "219 aHR0cDovL2d0ZnMuYmlnYmx1ZWJ1cy5jb20vY3VycmVudC... \n",
+ "\n",
+ " gtfs_dataset_key gtfs_dataset_name \\\n",
+ "219 efbbd5293be71f7a5de0cf82b59febe1 Big Blue Bus Schedule \n",
+ "\n",
+ " name type regional_feed_type \n",
+ "219 Big Blue Bus Schedule schedule None "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bbb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "9b9963d4-9a87-46de-9c62-cf70d97c7a49",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/conda/lib/python3.9/site-packages/sqlalchemy_bigquery/_types.py:101: SAWarning: Did not recognize type 'INTERVAL' of column 'arrival_time_interval'\n",
+ " sqlalchemy.util.warn(\n",
+ "/opt/conda/lib/python3.9/site-packages/sqlalchemy_bigquery/_types.py:101: SAWarning: Did not recognize type 'INTERVAL' of column 'departure_time_interval'\n",
+ " sqlalchemy.util.warn(\n",
+ "/opt/conda/lib/python3.9/site-packages/sqlalchemy_bigquery/_types.py:101: SAWarning: Did not recognize type 'INTERVAL' of column 'start_pickup_drop_off_window_interval'\n",
+ " sqlalchemy.util.warn(\n",
+ "/opt/conda/lib/python3.9/site-packages/sqlalchemy_bigquery/_types.py:101: SAWarning: Did not recognize type 'INTERVAL' of column 'end_pickup_drop_off_window_interval'\n",
+ " sqlalchemy.util.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "str10 = tbls.mart_gtfs.dim_stop_times() >> filter(_.feed_key == '4f9888472a8dad0f66bdbbd002312789', _.trip_id == '919600') >> collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "fd382312-3069-44a5-96de-3e25a29d40fd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# segs >> filter(_.trip_instance_key == 'd98e5cc1fb62e6e5ed0030934ef8a396') >> arrange(_.stop_sequence)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "8282707d-d5f3-465a-8c04-0cf161fb04cb",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trip_id | \n",
+ " stop_id | \n",
+ " stop_sequence | \n",
+ " arrival_time | \n",
+ " departure_time | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 919600 | \n",
+ " 1708 | \n",
+ " 1 | \n",
+ " 06:00:00 | \n",
+ " 06:00:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 919600 | \n",
+ " 227 | \n",
+ " 2 | \n",
+ " 06:01:00 | \n",
+ " 06:01:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 919600 | \n",
+ " 5 | \n",
+ " 3 | \n",
+ " 06:01:29 | \n",
+ " 06:01:29 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_id stop_id stop_sequence arrival_time departure_time\n",
+ "0 919600 1708 1 06:00:00 06:00:00\n",
+ "1 919600 227 2 06:01:00 06:01:00\n",
+ "2 919600 5 3 06:01:29 06:01:29"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "str10 >> select(_.trip_id, _.stop_id, _.stop_sequence, _.arrival_time, _.departure_time) >> arrange(_.stop_sequence) >> head (3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "579807e7-0602-41a1-a1dc-e7d4d64c3c0b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bbbr10 = (tbls.mart_gtfs.fct_scheduled_trips()\n",
+ " >> filter(_.gtfs_dataset_key.isin(bbb.gtfs_dataset_key),\n",
+ " _.route_short_name.str.contains('R10'),\n",
+ " _.service_date == '2024-03-13')\n",
+ ") >> collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "55ce4e1a-205d-4f5a-baf0-d6adac265942",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# bbbr10"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "46cc6303-2103-46d8-970a-d0a08b4f5bdd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# bbbr10.trip_id"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91721d4c-7ff8-4606-bcee-2b421694f48a",
+ "metadata": {},
+ "source": [
+ "## Work out function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "c97b6aaa-8e28-4e06-b209-4051aabf834f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test = segs >> filter(_.trip_instance_key == '3505bf6a20e8d29e83e545784a421bc7')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "ed74402c-0d2b-42bb-9fb8-7e926e6f15c5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "long = test >> filter(_.stop_sequence == 18)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "bacc740d-3192-4283-a39d-c0fa140df9ec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "geom = long.geometry.iloc[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "1a3481ae-9b85-4c95-94a7-a8215796259e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "17696.81706739369"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "geom.length"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "3e86bf11-82b7-44c2-aa58-32031c7cbbe7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def split_distance(geom, dist=1000):\n",
+ "    '''\n",
+ "    Cut a line into consecutive pieces no longer than dist.\n",
+ "\n",
+ "    geom: shapely.LineString\n",
+ "    dist: piece length, in the units of geom's coordinates\n",
+ "        (meters for the projected segments used in this notebook)\n",
+ "\n",
+ "    returns a shapely.MultiLineString whose parts follow geom from start\n",
+ "    to end: full pieces of length dist, then any leftover shorter piece.\n",
+ "    A zero-length geom yields an empty MultiLineString.\n",
+ "\n",
+ "    raises ValueError if dist is not positive\n",
+ "    '''\n",
+ "    if dist <= 0:\n",
+ "        raise ValueError('dist must be positive')\n",
+ "\n",
+ "    n_full = int(geom.length // dist)\n",
+ "    substrings = [\n",
+ "        shapely.ops.substring(geom, i * dist, (i + 1) * dist)\n",
+ "        for i in range(n_full)\n",
+ "    ]\n",
+ "\n",
+ "    # substring() with equal start/end returns a Point, which is not a valid\n",
+ "    # MultiLineString part, so skip the leftover piece when geom.length is an\n",
+ "    # exact multiple of dist\n",
+ "    remainder_start = n_full * dist\n",
+ "    if remainder_start < geom.length:\n",
+ "        substrings.append(shapely.ops.substring(geom, remainder_start, geom.length))\n",
+ "\n",
+ "    return shapely.MultiLineString(substrings)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "886c6d18-db65-405b-8eb0-db45a4b80945",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "new_geom = split_distance(geom)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "6fdc3e6c-58f4-4725-9c9d-c229db6f7dfd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "geom"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "f6a3b041-6ea1-4c1c-8d01-d1a20bc2b1a2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "new_geom"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "2fa3c835-5fb3-48bd-a099-e063943a1068",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# geom.wkt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "fda14a05-d0d4-4768-9063-9e115a52a367",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# new_geom.wkt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "fd136876-14d5-4e85-a448-4ed1101889c0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/conda/lib/python3.9/site-packages/geopandas/geodataframe.py:1543: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " super().__setitem__(key, value)\n"
+ ]
+ }
+ ],
+ "source": [
+ "long.geometry = [new_geom]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "c85e3ca0-f32a-43ff-b3d4-7c09c09a456a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_445/1845589243.py:1: FutureWarning: Currently, index_parts defaults to True, but in the future, it will default to False to be consistent with Pandas. Use `index_parts=True` to keep the current behavior and True/False to silence the warning.\n",
+ " exploded = long.explode()\n"
+ ]
+ }
+ ],
+ "source": [
+ "exploded = long.explode()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "a33dd638-dccc-4b36-b09b-b81de3c31084",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " trip_instance_key | \n",
+ " shape_array_key | \n",
+ " stop_id1 | \n",
+ " stop_sequence | \n",
+ " stop_id2 | \n",
+ " segment_id | \n",
+ " stop_pair | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " next_stop_sequence | \n",
+ " length | \n",
+ " geometry | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2875405 | \n",
+ " 0 | \n",
+ " 3505bf6a20e8d29e83e545784a421bc7 | \n",
+ " 37468c4ffbbbab83f270b8fcecb6de61 | \n",
+ " 228 | \n",
+ " 18 | \n",
+ " 106 | \n",
+ " 228-106-1 | \n",
+ " 228__106 | \n",
+ " efbbd5293be71f7a5de0cf82b59febe1 | \n",
+ " 3714 | \n",
+ " 0.0 | \n",
+ " 19.0 | \n",
+ " 17696.817067 | \n",
+ " LINESTRING (143107.958 -441727.430, 143118.407... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3505bf6a20e8d29e83e545784a421bc7 | \n",
+ " 37468c4ffbbbab83f270b8fcecb6de61 | \n",
+ " 228 | \n",
+ " 18 | \n",
+ " 106 | \n",
+ " 228-106-1 | \n",
+ " 228__106 | \n",
+ " efbbd5293be71f7a5de0cf82b59febe1 | \n",
+ " 3714 | \n",
+ " 0.0 | \n",
+ " 19.0 | \n",
+ " 17696.817067 | \n",
+ " LINESTRING (143967.823 -441640.058, 144006.108... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_instance_key shape_array_key \\\n",
+ "2875405 0 3505bf6a20e8d29e83e545784a421bc7 37468c4ffbbbab83f270b8fcecb6de61 \n",
+ " 1 3505bf6a20e8d29e83e545784a421bc7 37468c4ffbbbab83f270b8fcecb6de61 \n",
+ "\n",
+ " stop_id1 stop_sequence stop_id2 segment_id stop_pair \\\n",
+ "2875405 0 228 18 106 228-106-1 228__106 \n",
+ " 1 228 18 106 228-106-1 228__106 \n",
+ "\n",
+ " schedule_gtfs_dataset_key route_id direction_id \\\n",
+ "2875405 0 efbbd5293be71f7a5de0cf82b59febe1 3714 0.0 \n",
+ " 1 efbbd5293be71f7a5de0cf82b59febe1 3714 0.0 \n",
+ "\n",
+ " next_stop_sequence length \\\n",
+ "2875405 0 19.0 17696.817067 \n",
+ " 1 19.0 17696.817067 \n",
+ "\n",
+ " geometry \n",
+ "2875405 0 LINESTRING (143107.958 -441727.430, 143118.407... \n",
+ " 1 LINESTRING (143967.823 -441640.058, 144006.108... "
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# note order remains from list order passed to MultiLineString constructor\n",
+ "exploded.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "173449e3-48cd-489c-b9da-dd2777457d7a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def process_exploded(gdf):\n",
+ "    '''\n",
+ "    update required cols in exploded gdf (one original segment split into pieces)\n",
+ "\n",
+ "    gdf: GeoDataFrame rows for a single trip_instance_key and segment_id,\n",
+ "        in the order produced by explode()\n",
+ "\n",
+ "    stop_sequence: first piece keeps the original stop_sequence; each later piece\n",
+ "        is incremented proportional to the distance covered by the pieces before it,\n",
+ "        scaled so the full segment spans stop_sequence -> next_stop_sequence\n",
+ "    segment_id: postfix _(int) per piece to maintain uniqueness\n",
+ "\n",
+ "    gdf is modified in place and also returned\n",
+ "    '''\n",
+ "    assert len(gdf.trip_instance_key.unique()) == 1 and len(gdf.segment_id.unique()) == 1, 'must group by trip_instance_key, segment_id'\n",
+ "\n",
+ "    # keep both endpoints untruncated so they agree with stop_seq_chg\n",
+ "    # (identical results for whole-number sequences)\n",
+ "    prev_stop = gdf.stop_sequence.min()\n",
+ "    next_stop = gdf.next_stop_sequence.max()\n",
+ "    stop_seq_chg = next_stop - prev_stop\n",
+ "\n",
+ "    # use the geometry length of each piece explicitly; the `length` column\n",
+ "    # still holds the length of the original, unsplit segment\n",
+ "    piece_lengths = gdf.geometry.length\n",
+ "\n",
+ "    # increment stop sequence proportional to distance traveled\n",
+ "    seq_per_unit_length = stop_seq_chg / piece_lengths.sum()\n",
+ "    seq_changes = piece_lengths * seq_per_unit_length\n",
+ "    # count back from next_stop so each piece gets the sequence at its start\n",
+ "    stop_sequences_scaled = np.flip(next_stop - np.flip(seq_changes).cumsum())\n",
+ "\n",
+ "    gdf['stop_sequence'] = stop_sequences_scaled\n",
+ "\n",
+ "    # postfix to segment_id so that it remains unique\n",
+ "    postfixes = np.char.add('_', np.arange(0, gdf.shape[0]).astype(str))\n",
+ "    gdf['segment_id'] = gdf.segment_id + postfixes\n",
+ "\n",
+ "    return gdf"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "60a09e07-9882-44f5-990d-e35c3d7b9b6b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "processed = process_exploded(exploded)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "10ea37f3-d285-4988-a663-9f333f815fc9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\u001b[0;31mSignature:\u001b[0m \u001b[0marrowize_segment\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline_geometry\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer_distance\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m20\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mDocstring:\u001b[0m\n",
+ "Given a linestring segment from a gtfs shape,\n",
+ "buffer and clip to show direction of progression\n",
+ "\u001b[0;31mFile:\u001b[0m ~/data-analyses/_shared_utils/shared_utils/rt_utils.py\n",
+ "\u001b[0;31mType:\u001b[0m function"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "arrowize_segment?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "a78f13ca-3f22-4e60-b3ec-9bc1c0a73c04",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "processed.geometry = processed.geometry.apply(lambda x: arrowize_segment(x))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "1003a058-4b1e-4aa8-aeba-d4a52fb12731",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " trip_instance_key | \n",
+ " shape_array_key | \n",
+ " stop_id1 | \n",
+ " stop_sequence | \n",
+ " stop_id2 | \n",
+ " segment_id | \n",
+ " stop_pair | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " next_stop_sequence | \n",
+ " length | \n",
+ " geometry | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2875405 | \n",
+ " 0 | \n",
+ " 3505bf6a20e8d29e83e545784a421bc7 | \n",
+ " 37468c4ffbbbab83f270b8fcecb6de61 | \n",
+ " 228 | \n",
+ " 18.000000 | \n",
+ " 106 | \n",
+ " 228-106-1_0 | \n",
+ " 228__106 | \n",
+ " efbbd5293be71f7a5de0cf82b59febe1 | \n",
+ " 3714 | \n",
+ " 0.0 | \n",
+ " 19.0 | \n",
+ " 17696.817067 | \n",
+ " POLYGON ((143475.239 -441774.715, 143514.754 -... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3505bf6a20e8d29e83e545784a421bc7 | \n",
+ " 37468c4ffbbbab83f270b8fcecb6de61 | \n",
+ " 228 | \n",
+ " 18.056507 | \n",
+ " 106 | \n",
+ " 228-106-1_1 | \n",
+ " 228__106 | \n",
+ " efbbd5293be71f7a5de0cf82b59febe1 | \n",
+ " 3714 | \n",
+ " 0.0 | \n",
+ " 19.0 | \n",
+ " 17696.817067 | \n",
+ " POLYGON ((144774.726 -441469.401, 144927.328 -... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3505bf6a20e8d29e83e545784a421bc7 | \n",
+ " 37468c4ffbbbab83f270b8fcecb6de61 | \n",
+ " 228 | \n",
+ " 18.113015 | \n",
+ " 106 | \n",
+ " 228-106-1_2 | \n",
+ " 228__106 | \n",
+ " efbbd5293be71f7a5de0cf82b59febe1 | \n",
+ " 3714 | \n",
+ " 0.0 | \n",
+ " 19.0 | \n",
+ " 17696.817067 | \n",
+ " POLYGON ((145112.933 -441431.350, 145928.926 -... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_instance_key shape_array_key \\\n",
+ "2875405 0 3505bf6a20e8d29e83e545784a421bc7 37468c4ffbbbab83f270b8fcecb6de61 \n",
+ " 1 3505bf6a20e8d29e83e545784a421bc7 37468c4ffbbbab83f270b8fcecb6de61 \n",
+ " 2 3505bf6a20e8d29e83e545784a421bc7 37468c4ffbbbab83f270b8fcecb6de61 \n",
+ "\n",
+ " stop_id1 stop_sequence stop_id2 segment_id stop_pair \\\n",
+ "2875405 0 228 18.000000 106 228-106-1_0 228__106 \n",
+ " 1 228 18.056507 106 228-106-1_1 228__106 \n",
+ " 2 228 18.113015 106 228-106-1_2 228__106 \n",
+ "\n",
+ " schedule_gtfs_dataset_key route_id direction_id \\\n",
+ "2875405 0 efbbd5293be71f7a5de0cf82b59febe1 3714 0.0 \n",
+ " 1 efbbd5293be71f7a5de0cf82b59febe1 3714 0.0 \n",
+ " 2 efbbd5293be71f7a5de0cf82b59febe1 3714 0.0 \n",
+ "\n",
+ " next_stop_sequence length \\\n",
+ "2875405 0 19.0 17696.817067 \n",
+ " 1 19.0 17696.817067 \n",
+ " 2 19.0 17696.817067 \n",
+ "\n",
+ " geometry \n",
+ "2875405 0 POLYGON ((143475.239 -441774.715, 143514.754 -... \n",
+ " 1 POLYGON ((144774.726 -441469.401, 144927.328 -... \n",
+ " 2 POLYGON ((145112.933 -441431.350, 145928.926 -... "
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "processed.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "c13b133d-27ef-43e8-ad3a-ee803c0788c4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# processed.explore()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9cb8d0de-12fd-471d-95c3-bb258a334add",
+ "metadata": {},
+ "source": [
+ "# Abstracting (to move to new script?)\n",
+ "\n",
+ "* separate short and long segments\n",
+ "* split long segments and update columns to preserve unique segment_id, meaningful stop_sequence that is sortable and proportional to distance travelled (when between actual stops, no proportionality requirement in original feed)\n",
+ "* recombine"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "ba047670-b2fc-4a41-a342-5ed9323ad94b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# <= so a segment exactly 1000 long is kept here (longs takes only > 1000)\n",
+ "shorts = (segs[segs['length'] <= 1000]).copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "bcb9dfe0-c8b9-474f-ad1f-3c5c14923452",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "longs = (segs[segs['length'] > 1000]).copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "8f6d0468-d15e-4663-965a-095d8dcad12c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "longs = longs >> filter(-_.next_stop_sequence.isna()) # fix upstream (include final next seq...)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "112b7f03-1399-41a5-85e8-501717e31647",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(163577, 13)"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "longs.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "d5f71050-2201-4d95-b0cc-b539999e2039",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_longs = longs.iloc[:1000,:]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "425245ff-4c54-4992-be17-0f31520908e1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trip_instance_key | \n",
+ " shape_array_key | \n",
+ " stop_id1 | \n",
+ " stop_sequence | \n",
+ " geometry | \n",
+ " stop_id2 | \n",
+ " segment_id | \n",
+ " stop_pair | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " next_stop_sequence | \n",
+ " length | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 22 | \n",
+ " 0000dbb743b258f707687f2dc14ad90f | \n",
+ " 37bb198ccd3af93e3b7f10bb7602faff | \n",
+ " 819 | \n",
+ " 1672 | \n",
+ " LINESTRING (195533.932 -435807.899, 195532.071... | \n",
+ " 2304 | \n",
+ " 819-2304-1 | \n",
+ " 819__2304 | \n",
+ " f74424acf8c41e4c1e9fd42838c4875c | \n",
+ " 488 | \n",
+ " 1.0 | \n",
+ " 1959.0 | \n",
+ " 1734.469963 | \n",
+ "
\n",
+ " \n",
+ " 78 | \n",
+ " 09e32488392ad4c1684b0108f3bba8b3 | \n",
+ " 37bb198ccd3af93e3b7f10bb7602faff | \n",
+ " 819 | \n",
+ " 1532 | \n",
+ " LINESTRING (195533.932 -435807.899, 195532.071... | \n",
+ " 2304 | \n",
+ " 819-2304-1 | \n",
+ " 819__2304 | \n",
+ " f74424acf8c41e4c1e9fd42838c4875c | \n",
+ " 488 | \n",
+ " 1.0 | \n",
+ " 1767.0 | \n",
+ " 1734.469963 | \n",
+ "
\n",
+ " \n",
+ " 134 | \n",
+ " 0aaeb33101f4ac9ebb7851388c355825 | \n",
+ " 37bb198ccd3af93e3b7f10bb7602faff | \n",
+ " 819 | \n",
+ " 1672 | \n",
+ " LINESTRING (195533.932 -435807.899, 195532.071... | \n",
+ " 2304 | \n",
+ " 819-2304-1 | \n",
+ " 819__2304 | \n",
+ " f74424acf8c41e4c1e9fd42838c4875c | \n",
+ " 488 | \n",
+ " 1.0 | \n",
+ " 1959.0 | \n",
+ " 1734.469963 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_instance_key shape_array_key \\\n",
+ "22 0000dbb743b258f707687f2dc14ad90f 37bb198ccd3af93e3b7f10bb7602faff \n",
+ "78 09e32488392ad4c1684b0108f3bba8b3 37bb198ccd3af93e3b7f10bb7602faff \n",
+ "134 0aaeb33101f4ac9ebb7851388c355825 37bb198ccd3af93e3b7f10bb7602faff \n",
+ "\n",
+ " stop_id1 stop_sequence \\\n",
+ "22 819 1672 \n",
+ "78 819 1532 \n",
+ "134 819 1672 \n",
+ "\n",
+ " geometry stop_id2 segment_id \\\n",
+ "22 LINESTRING (195533.932 -435807.899, 195532.071... 2304 819-2304-1 \n",
+ "78 LINESTRING (195533.932 -435807.899, 195532.071... 2304 819-2304-1 \n",
+ "134 LINESTRING (195533.932 -435807.899, 195532.071... 2304 819-2304-1 \n",
+ "\n",
+ " stop_pair schedule_gtfs_dataset_key route_id direction_id \\\n",
+ "22 819__2304 f74424acf8c41e4c1e9fd42838c4875c 488 1.0 \n",
+ "78 819__2304 f74424acf8c41e4c1e9fd42838c4875c 488 1.0 \n",
+ "134 819__2304 f74424acf8c41e4c1e9fd42838c4875c 488 1.0 \n",
+ "\n",
+ " next_stop_sequence length \n",
+ "22 1959.0 1734.469963 \n",
+ "78 1767.0 1734.469963 \n",
+ "134 1959.0 1734.469963 "
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_longs.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "90f76913-57fb-4af3-a330-95215b55eec2",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true,
+ "tags": []
+ },
+ "source": [
+ "## Rowwise apply and accumulate? (works but too slow, ~45min)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "1f3bf3e2-9d83-4db0-bfb8-191f9991581e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# def interpolate_segments(row):\n",
+ "# '''\n",
+ "# wrapper function -- rowwise apply to a gdf of \"long\" (>1000m) segments\n",
+ "# '''\n",
+ "# global interpolated_longs\n",
+ "# new_geom = split_distance(row.geometry)\n",
+ "\n",
+ "# row.geometry = new_geom\n",
+ "# # back to gdf to use .explode()\n",
+ "# row = (gpd.GeoDataFrame(row)\n",
+ "# .transpose()\n",
+ "# .set_geometry('geometry')\n",
+ "# .set_crs(geography_utils.CA_NAD83Albers)\n",
+ "# )\n",
+ "# exploded = gpd.GeoDataFrame.explode(row, column='geometry', index_parts=False)\n",
+ "# # return exploded\n",
+ "# processed = process_exploded(exploded)\n",
+ " \n",
+ "# interpolated_longs += [processed]\n",
+ "# return"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "8007f5dd-2e83-4210-bebe-06faa5bcc5e0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# test1 = test_longs.head(1)\n",
+ "\n",
+ "# x = test1.apply(interpolate_segments, axis=1)\n",
+ "\n",
+ "# gdf = pd.concat(interpolated_longs)\n",
+ "\n",
+ "# # can't split at endpoints (no next stop seq, must calculate upstream instead of shift!)\n",
+ "# # drop for now to test\n",
+ "# test_longs = test_longs >> filter(-_.next_stop_sequence.isna())\n",
+ "\n",
+ "# %%timeit\n",
+ "\n",
+ "# interpolated_longs = []\n",
+ "\n",
+ "# _ = test_longs.apply(interpolate_segments, axis=1)\n",
+ "\n",
+ "# (14 * 177) / 60 # 41min to interpolate all -- not entirely ideal\n",
+ "\n",
+ "# (interpolated >> distinct(_.segment_id, _keep_all=True)).shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "11817c00-4d3b-43b4-bb08-eda917712f7c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# interpolated = pd.concat(interpolated_longs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f899c508-b508-472d-89f4-fe22a89fb561",
+ "metadata": {},
+ "source": [
+ "## Compute new geometries only once\n",
+ "\n",
+ "* compute/join on `shape_array_key, segment_id`\n",
+ "* accumulate geoms only in dict based on unique df; lookup/replace geoms in full df, then use `gdf.explode()`, `process_exploded`\n",
+ "* ~12min total for entire state, could maybe speed up `process_exploded` since it's a groupby/apply but this implementation depends on the gdf staying in order for each trip/segment after `explode`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "98206243-676b-4233-8b87-26d845918469",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trip_instance_key | \n",
+ " shape_array_key | \n",
+ " stop_id1 | \n",
+ " stop_sequence | \n",
+ " geometry | \n",
+ " stop_id2 | \n",
+ " segment_id | \n",
+ " stop_pair | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " next_stop_sequence | \n",
+ " length | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 22 | \n",
+ " 0000dbb743b258f707687f2dc14ad90f | \n",
+ " 37bb198ccd3af93e3b7f10bb7602faff | \n",
+ " 819 | \n",
+ " 1672 | \n",
+ " LINESTRING (195533.932 -435807.899, 195532.071... | \n",
+ " 2304 | \n",
+ " 819-2304-1 | \n",
+ " 819__2304 | \n",
+ " f74424acf8c41e4c1e9fd42838c4875c | \n",
+ " 488 | \n",
+ " 1.0 | \n",
+ " 1959.0 | \n",
+ " 1734.469963 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_instance_key shape_array_key \\\n",
+ "22 0000dbb743b258f707687f2dc14ad90f 37bb198ccd3af93e3b7f10bb7602faff \n",
+ "\n",
+ " stop_id1 stop_sequence geometry \\\n",
+ "22 819 1672 LINESTRING (195533.932 -435807.899, 195532.071... \n",
+ "\n",
+ " stop_id2 segment_id stop_pair schedule_gtfs_dataset_key route_id \\\n",
+ "22 2304 819-2304-1 819__2304 f74424acf8c41e4c1e9fd42838c4875c 488 \n",
+ "\n",
+ " direction_id next_stop_sequence length \n",
+ "22 1.0 1959.0 1734.469963 "
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "longs.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "095240b9-1137-480c-82ab-1b3b5f00ee8d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# segments missing from interpolation because we're missing the last stop sequence in testing\n",
+ "# should be fixed in prod!\n",
+ "\n",
+ "# (longs >> distinct(_.shape_array_key, _.segment_id, _keep_all=True)\n",
+ "# >> filter(_.next_stop_sequence.isna())).explore()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "26dba8e0-bc65-424f-9c04-65b84fd84571",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def store_new_geoms(row):\n",
+ " '''\n",
+ " wrapper function -- rowwise apply to a gdf of \"long\" (>1000m) segments\n",
+ " \n",
+ " accumulate results in a dict: segment_geoms (init empty dict outside function)\n",
+ " '''\n",
+ " global segment_geoms\n",
+ " new_geom = split_distance(row.geometry)\n",
+ " # row.geometry = new_geom\n",
+ " \n",
+ " geom_key = (row.shape_array_key, row.segment_id)\n",
+ " segment_geoms[geom_key] = new_geom\n",
+ " # TODO store key:geom in dict/something fast...\n",
+ " \n",
+ " return"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "8603a84a-cc7b-4f58-b229-6417d8014ed1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def lookup_geom(row, segment_geoms: dict):\n",
+ " '''\n",
+ " after running store_new_geoms on unique segments, apply this to \n",
+ " a gdf of all segments to lookup new geom by shape_array_key, segment_id\n",
+ " '''\n",
+ " row.geometry = segment_geoms[(row.shape_array_key, row.segment_id)]\n",
+ " return row"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "bb3b2fd0-16eb-45db-891e-a5b85e48701e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "to_interpolate = (longs >> distinct(_.shape_array_key, _.segment_id, _keep_all=True)\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "c0b8d61c-3c27-44a5-888c-33277c2c8650",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trip_instance_key | \n",
+ " shape_array_key | \n",
+ " stop_id1 | \n",
+ " stop_sequence | \n",
+ " geometry | \n",
+ " stop_id2 | \n",
+ " segment_id | \n",
+ " stop_pair | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " next_stop_sequence | \n",
+ " length | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0000dbb743b258f707687f2dc14ad90f | \n",
+ " 37bb198ccd3af93e3b7f10bb7602faff | \n",
+ " 819 | \n",
+ " 1672 | \n",
+ " LINESTRING (195533.932 -435807.899, 195532.071... | \n",
+ " 2304 | \n",
+ " 819-2304-1 | \n",
+ " 819__2304 | \n",
+ " f74424acf8c41e4c1e9fd42838c4875c | \n",
+ " 488 | \n",
+ " 1.0 | \n",
+ " 1959.0 | \n",
+ " 1734.469963 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_instance_key shape_array_key \\\n",
+ "0 0000dbb743b258f707687f2dc14ad90f 37bb198ccd3af93e3b7f10bb7602faff \n",
+ "\n",
+ " stop_id1 stop_sequence geometry \\\n",
+ "0 819 1672 LINESTRING (195533.932 -435807.899, 195532.071... \n",
+ "\n",
+ " stop_id2 segment_id stop_pair schedule_gtfs_dataset_key route_id \\\n",
+ "0 2304 819-2304-1 819__2304 f74424acf8c41e4c1e9fd42838c4875c 488 \n",
+ "\n",
+ " direction_id next_stop_sequence length \n",
+ "0 1.0 1959.0 1734.469963 "
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "to_interpolate >> head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "577f76c8-2910-4fab-8edb-c6f8dfb0c64e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 25.4 s, sys: 88 ms, total: 25.5 s\n",
+ "Wall time: 26.9 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "segment_geoms = {}\n",
+ "\n",
+ "_ = to_interpolate.apply(store_new_geoms, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "cb53c1dd-a736-4113-a41d-24398fd95f28",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 22.9 s, sys: 536 ms, total: 23.4 s\n",
+ "Wall time: 25 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "# now, add geometries to long list...\n",
+ "interpolated = longs.apply(lookup_geom, axis = 1, args = ([segment_geoms]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "58a26367-7ee0-41ba-a860-ec768e09163d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "interpolated = interpolated.explode(index_parts=False).reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "id": "5ecc3beb-0296-4730-8123-5937104c4cb7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "interpolated['length'] = interpolated.geometry.apply(lambda x: x.length)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "186654d3-5132-42b2-8c72-7c2f7252fef5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(514674, 13)"
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "interpolated.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e217636d-9a3a-4d08-bea6-6c5c0953e4d9",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true,
+ "tags": []
+ },
+ "source": [
+ "### Parallel (runs, but is slower than single-threaded for this task, and still has some ordering issues)\n",
+ "\n",
+ "https://docs.dask.org/en/stable/generated/dask.dataframe.groupby.DataFrameGroupBy.apply.html\n",
+ "* \"If the grouper does not align with the index then this causes a full shuffle. The order of rows within each group may not be preserved.\"\n",
+ "* attempted to set the index equal to the grouper (a hash of `trip_instance_key` + `segment_id`, since dask doesn't support MultiIndex), but some segments still seem to end up out of order..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "77dcbc44-abab-496c-9890-dfe80ce1d8bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# interpolated = interpolated.set_index(['trip_instance_key', 'segment_id'], drop=False)\n",
+ "# # dask doesn't support MultiIndex...\n",
+ "\n",
+ "# interpolated = interpolated.set_index(pd.util.hash_pandas_object(interpolated.index))\n",
+ "\n",
+ "# interpolated.index.name = 'trip_seg_hash'\n",
+ "\n",
+ "# int_meta = dd.utils.make_meta(interpolated)\n",
+ "\n",
+ "# # interpolated.info()\n",
+ "\n",
+ "# int_dd = dd.from_pandas(interpolated, npartitions=20)\n",
+ "\n",
+ "# int_group = int_dd.groupby(by='trip_seg_hash')\n",
+ "\n",
+ "# int_group = int_group.apply(process_exploded, meta = int_meta)\n",
+ "\n",
+ "# %%time\n",
+ "# # 13min -- actually slower than Pandas in this case\n",
+ "# result = int_group.compute()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ec919d5-90da-4cd1-acbe-1c9852123fae",
+ "metadata": {},
+ "source": [
+ "### Single-thread (works, about 10min)\n",
+ "\n",
+ "* depends on the df remaining in the order produced by `.explode()`, since `process_exploded` is applied per `(trip_instance_key, segment_id)` group"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "f8607ce5-7407-4bf7-a38c-ef223e44159b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# %%time\n",
+ "\n",
+ "# processed = interpolated.groupby(['trip_instance_key', 'segment_id'], group_keys=False).apply(process_exploded)\n",
+ "\n",
+ "# recombined = pd.concat([shorts, processed]).reset_index(drop=True)\n",
+ "\n",
+ "# utils.geoparquet_gcs_export(recombined, gcs_file_path=GCS_PATH, file_name=filename)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "ec6deec3-5b19-4909-b40d-e8efec7ded00",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "filename = 'test_interpolated_2024-03-13.parquet'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "544ab666-5574-450a-822f-e2b0c95bda30",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "recombined = gpd.read_parquet(GCS_PATH+filename)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "faf92e49-1f74-4d7a-8018-4adbbbad99fe",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(514674, 13)"
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "interpolated.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "6a7e843b-86f4-40f2-a3d7-6ced53cd5a3f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3318446"
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "interpolated.shape[0] + shorts.shape[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "id": "d6970871-2e68-42d9-99e1-509ccbd26b78",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(3318446, 13)"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "recombined.shape # looks OK"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7548823a-e980-4c61-93df-9b9ccfc22db1",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Taking a look: Big Blue Bus R10\n",
+ "\n",
+ "* splitting the long segments and merging them back with the untouched (short) segments looks good!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "id": "6c9799d2-b433-4dfe-9f59-44159aa30e7e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from siuba import * # re-import to fix bug? TODO report..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "f88049ae-4614-496c-a0bb-c618dff460ab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test = recombined[recombined.trip_instance_key == '3505bf6a20e8d29e83e545784a421bc7']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "id": "0e1dfbe2-7c40-4fad-af6b-bf3d121e7335",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/conda/lib/python3.9/site-packages/geopandas/geodataframe.py:1543: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " super().__setitem__(key, value)\n"
+ ]
+ }
+ ],
+ "source": [
+ "test.geometry = test.geometry.apply(lambda x: arrowize_segment(x))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "id": "236a583f-fd3d-475a-9d67-c15fe579264d",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# test.explore()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "853f3517-af32-468e-ad29-5c719d6feada",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Further Testing: Salinas Valley, Woodland/Davis/Sacramento, Bay Bridge, I-110 (Harbor Fwy), etc. all look good!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "8dbce55b-7c34-400d-ad82-c144af43eb61",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bigtest = (recombined >> distinct(_.segment_id, _.shape_array_key, _keep_all=True))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "id": "a04cf6dc-1958-490b-9fea-1c0bc42f1b6d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(203106, 13)"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bigtest.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "id": "223524e7-f638-4419-8083-0cf670cde6f7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bigtest = bigtest >> filter(_.shape_array_key.isin(_.shape_array_key.unique()[:250]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "id": "530cb1cc-dcba-4085-b241-53997dfaf164",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bigtest.geometry = bigtest.geometry.apply(lambda x: arrowize_segment(x))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "id": "525c12e0-2119-47c5-9346-baa1ca11b4ec",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(11034, 13)"
+ ]
+ },
+ "execution_count": 71,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bigtest.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "id": "d585036e-888c-421a-a942-a829d2408e3f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# bigtest.explore()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ },
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "state": {},
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}