diff --git a/_shared_utils/shared_utils/rt_utils.py b/_shared_utils/shared_utils/rt_utils.py index d7f906da9..87ceba3fc 100644 --- a/_shared_utils/shared_utils/rt_utils.py +++ b/_shared_utils/shared_utils/rt_utils.py @@ -601,10 +601,10 @@ def arrowize_segment(line_geometry, buffer_distance: int = 20): end = shapely.ops.substring(end_segment, end_segment.length, end_segment.length) # correct r_shift = end_segment.parallel_offset(shift_distance, "right") r_pt = shapely.ops.substring(r_shift, 0, 0) - r_pt2 = shapely.ops.substring(r_shift, r_shift.length - arrow_distance, r_shift.length - arrow_distance) + r_pt2 = shapely.ops.substring(r_shift, r_shift.length, r_shift.length) l_shift = end_segment.parallel_offset(shift_distance, "left") l_pt = shapely.ops.substring(l_shift, 0, 0) - l_pt2 = shapely.ops.substring(l_shift, arrow_distance, arrow_distance) + l_pt2 = shapely.ops.substring(l_shift, l_shift.length, l_shift.length) t1 = shapely.geometry.Polygon((l_pt2, end, l_pt)) # triangles to cut top of arrow t2 = shapely.geometry.Polygon((r_pt2, end, r_pt)) diff --git a/rt_segment_speeds/30_interpolated_segments.ipynb b/rt_segment_speeds/30_interpolated_segments.ipynb new file mode 100644 index 000000000..cc056296c --- /dev/null +++ b/rt_segment_speeds/30_interpolated_segments.ipynb @@ -0,0 +1,1974 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b61fc969-d940-4f3d-97d3-dd666c02bf2e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"CALITP_BQ_MAX_BYTES\"] = str(800_000_000_000)\n", + "\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "\n", + "import dask.dataframe as dd\n", + "\n", + "from segment_speed_utils import helpers\n", + "from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date\n", + "from shared_utils.rt_utils import arrowize_segment\n", + "from shared_utils import gtfs_utils_v2\n", + "from calitp_data_analysis import geography_utils, utils\n", + "from 
calitp_data_analysis.tables import tbls\n", + "\n", + "\n", + "import numpy as np\n", + "import shapely\n", + "from siuba import *" + ] + }, + { + "cell_type": "markdown", + "id": "bd52224f-d256-47da-b7db-a9436e87ee0d", + "metadata": {}, + "source": [ + "# Add interpolated segments to pipeline\n", + "\n", + "* Additional detail between widely spaced stops is more useful for speed analysis" + ] + }, + { + "cell_type": "markdown", + "id": "caba4d72-7e58-4abc-88fa-83425df20a14", + "metadata": {}, + "source": [ + "## Check existing segments" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b0a70df9-f929-46c3-8082-7a4e9030700f", + "metadata": {}, + "outputs": [], + "source": [ + "segs_path = 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/segment_options/stop_segments_2024-03-13.parquet'\n", + "GCS_PATH = 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/segment_options/'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "46158cda-fccb-40e7-b322-46ef22de46b9", + "metadata": {}, + "outputs": [], + "source": [ + "# read via geopandas so that geometry stays intact\n", + "segs = gpd.read_parquet(segs_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "439cc654-8e02-4804-9692-d7b9395104ea", + "metadata": {}, + "outputs": [], + "source": [ + "# segs >> head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "fe3ec77c-3455-416e-bba9-14c303f750a8", + "metadata": {}, + "source": [ + "## Less than 6% of segments need to be interpolated...\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e75fe998-7cb1-40c2-a89e-6a22e893e42c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "997.6817468833709" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "segs.length.quantile(.94)" + ] + }, + { + "cell_type": "markdown", + "id": "f6b1e941-11d5-474a-9250-b5b47fb6c49d", + "metadata": {}, + "source": [ + "# 
Additional Columns Required\n", + "\n", + "Should probably happen upstream in `cut_stop_segments.py`, related scripts...\n", + "\n", + "* `length`: float, `geometry.length`\n", + "* `next_stop_sequence`: lead of `stop_sequence`, should include final stop seq (final stop seq unavailable here since shifting from existing df...)\n", + " * alternatively, rename `stop_sequence` -> `stop_sequence1` and add `stop_sequence2` (consistent with existing `stop_id1` and `stop_id2`)" + ] + }, + { + "cell_type": "markdown", + "id": "d624a026-968f-49b2-b6e6-ad38811fa924", + "metadata": { + "tags": [] + }, + "source": [ + "## Adding these after the fact for testing..." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8d783355-2bef-47ce-8251-aaef211400cf", + "metadata": {}, + "outputs": [], + "source": [ + "# segs_ddf = dd.from_pandas(segs, npartitions=50)\n", + "\n", + "# segs_ddf = segs_ddf.assign(length=lambda x: x.geometry.length)\n", + "\n", + "# next_sequence_ddf = segs_ddf[['trip_instance_key', 'stop_sequence']].groupby('trip_instance_key').shift(-1)\n", + "# next_sequence_ddf = next_sequence_ddf.rename(columns={'stop_sequence': 'next_stop_sequence'})\n", + "\n", + "# # note this relies on the index\n", + "# segs_ddf = segs_ddf.join(next_sequence_ddf)\n", + "\n", + "# !mkdir test_segs\n", + "\n", + "# segs_ddf[['next_stop_sequence', 'length']].to_parquet('test_segs/')\n", + "\n", + "# # new_cols = segs_ddf.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "52d148c7-ed25-49fd-b684-0097ef95ee73", + "metadata": {}, + "source": [ + "### restart kernel/separate script" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2120da8e-a98f-48a2-97ad-87e11b82a432", + "metadata": {}, + "outputs": [], + "source": [ + "# segs_ddf = dd.read_parquet('test_segs/')\n", + "\n", + "# segs_ddf\n", + "\n", + "# segs = segs.join(segs_ddf.compute())" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": 
"1d757efc-be7f-463b-9ace-e5bd9c1e6c9b", + "metadata": {}, + "outputs": [], + "source": [ + "intermediate_file = 'test_interpolated_intermediate_2024-03-13.parquet'" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "7f9ed931-1782-4887-8ce3-f3804cf4b6d6", + "metadata": {}, + "outputs": [], + "source": [ + "utils.geoparquet_gcs_export(segs, GCS_PATH, intermediate_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "f42cd425-4512-4f4d-b9a3-fdeb8493d009", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "segs = gpd.read_parquet(GCS_PATH+intermediate_file)" + ] + }, + { + "cell_type": "markdown", + "id": "8c9ba88b-27a4-4288-88ce-f368d1507ba3", + "metadata": { + "tags": [] + }, + "source": [ + "## Find BBBR10" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b892f6ea-67d0-4295-b014-10532d2408b9", + "metadata": {}, + "outputs": [], + "source": [ + "bbb = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name('2024-03-13') >> filter(_.name.str.contains('Big Blue'))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1fc3d7d1-8b0d-4d2b-82d6-cbb741ae4ff1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
keydatefeed_keyfeed_timezonebase64_urlgtfs_dataset_keygtfs_dataset_namenametyperegional_feed_type
219c35ae2a7cb1f9ad2cf0a5bce84ccf2982024-03-134f9888472a8dad0f66bdbbd002312789America/Los_AngelesaHR0cDovL2d0ZnMuYmlnYmx1ZWJ1cy5jb20vY3VycmVudC...efbbd5293be71f7a5de0cf82b59febe1Big Blue Bus ScheduleBig Blue Bus SchedulescheduleNone
\n", + "
" + ], + "text/plain": [ + " key date \\\n", + "219 c35ae2a7cb1f9ad2cf0a5bce84ccf298 2024-03-13 \n", + "\n", + " feed_key feed_timezone \\\n", + "219 4f9888472a8dad0f66bdbbd002312789 America/Los_Angeles \n", + "\n", + " base64_url \\\n", + "219 aHR0cDovL2d0ZnMuYmlnYmx1ZWJ1cy5jb20vY3VycmVudC... \n", + "\n", + " gtfs_dataset_key gtfs_dataset_name \\\n", + "219 efbbd5293be71f7a5de0cf82b59febe1 Big Blue Bus Schedule \n", + "\n", + " name type regional_feed_type \n", + "219 Big Blue Bus Schedule schedule None " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bbb" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9b9963d4-9a87-46de-9c62-cf70d97c7a49", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.9/site-packages/sqlalchemy_bigquery/_types.py:101: SAWarning: Did not recognize type 'INTERVAL' of column 'arrival_time_interval'\n", + " sqlalchemy.util.warn(\n", + "/opt/conda/lib/python3.9/site-packages/sqlalchemy_bigquery/_types.py:101: SAWarning: Did not recognize type 'INTERVAL' of column 'departure_time_interval'\n", + " sqlalchemy.util.warn(\n", + "/opt/conda/lib/python3.9/site-packages/sqlalchemy_bigquery/_types.py:101: SAWarning: Did not recognize type 'INTERVAL' of column 'start_pickup_drop_off_window_interval'\n", + " sqlalchemy.util.warn(\n", + "/opt/conda/lib/python3.9/site-packages/sqlalchemy_bigquery/_types.py:101: SAWarning: Did not recognize type 'INTERVAL' of column 'end_pickup_drop_off_window_interval'\n", + " sqlalchemy.util.warn(\n" + ] + } + ], + "source": [ + "str10 = tbls.mart_gtfs.dim_stop_times() >> filter(_.feed_key == '4f9888472a8dad0f66bdbbd002312789', _.trip_id == '919600') >> collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fd382312-3069-44a5-96de-3e25a29d40fd", + "metadata": {}, + "outputs": [], + "source": [ + "# segs >> 
filter(_.trip_instance_key == 'd98e5cc1fb62e6e5ed0030934ef8a396') >> arrange(_.stop_sequence)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "8282707d-d5f3-465a-8c04-0cf161fb04cb", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_idstop_idstop_sequencearrival_timedeparture_time
09196001708106:00:0006:00:00
1919600227206:01:0006:01:00
29196005306:01:2906:01:29
\n", + "
" + ], + "text/plain": [ + " trip_id stop_id stop_sequence arrival_time departure_time\n", + "0 919600 1708 1 06:00:00 06:00:00\n", + "1 919600 227 2 06:01:00 06:01:00\n", + "2 919600 5 3 06:01:29 06:01:29" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str10 >> select(_.trip_id, _.stop_id, _.stop_sequence, _.arrival_time, _.departure_time) >> arrange(_.stop_sequence) >> head (3)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "579807e7-0602-41a1-a1dc-e7d4d64c3c0b", + "metadata": {}, + "outputs": [], + "source": [ + "bbbr10 = (tbls.mart_gtfs.fct_scheduled_trips()\n", + " >> filter(_.gtfs_dataset_key.isin(bbb.gtfs_dataset_key),\n", + " _.route_short_name.str.contains('R10'),\n", + " _.service_date == '2024-03-13')\n", + ") >> collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "55ce4e1a-205d-4f5a-baf0-d6adac265942", + "metadata": {}, + "outputs": [], + "source": [ + "# bbbr10" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "46cc6303-2103-46d8-970a-d0a08b4f5bdd", + "metadata": {}, + "outputs": [], + "source": [ + "# bbbr10.trip_id" + ] + }, + { + "cell_type": "markdown", + "id": "91721d4c-7ff8-4606-bcee-2b421694f48a", + "metadata": {}, + "source": [ + "## Work out function" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c97b6aaa-8e28-4e06-b209-4051aabf834f", + "metadata": {}, + "outputs": [], + "source": [ + "test = segs >> filter(_.trip_instance_key == '3505bf6a20e8d29e83e545784a421bc7')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "ed74402c-0d2b-42bb-9fb8-7e926e6f15c5", + "metadata": {}, + "outputs": [], + "source": [ + "long = test >> filter(_.stop_sequence == 18)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "bacc740d-3192-4283-a39d-c0fa140df9ec", + "metadata": {}, + "outputs": [], + "source": [ + "geom = long.geometry.iloc[0]" + ] + }, + { + "cell_type": 
"code", + "execution_count": 20, + "id": "1a3481ae-9b85-4c95-94a7-a8215796259e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "17696.81706739369" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geom.length" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "3e86bf11-82b7-44c2-aa58-32031c7cbbe7", + "metadata": {}, + "outputs": [], + "source": [ + "def split_distance(geom, dist=1000):\n", + " '''\n", + " geom: shapely.LineString\n", + " \n", + " returns a shapely.MultiLineString split every dist (meters)\n", + " '''\n", + " \n", + " split_segs = geom.length // dist\n", + " substrings = []\n", + " for i in range(0, int(split_segs)):\n", + " substrings += [shapely.ops.substring(geom, i * dist, (i+1) * dist)]\n", + " substrings += [shapely.ops.substring(geom, split_segs * dist, geom.length)]\n", + " new_geom = shapely.MultiLineString(substrings)\n", + " \n", + " return new_geom" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "886c6d18-db65-405b-8eb0-db45a4b80945", + "metadata": {}, + "outputs": [], + "source": [ + "new_geom = split_distance(geom)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "6fdc3e6c-58f4-4725-9c9d-c229db6f7dfd", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geom" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "f6a3b041-6ea1-4c1c-8d01-d1a20bc2b1a2", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_geom" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "2fa3c835-5fb3-48bd-a099-e063943a1068", + "metadata": {}, + "outputs": [], + 
"source": [ + "# geom.wkt" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "fda14a05-d0d4-4768-9063-9e115a52a367", + "metadata": {}, + "outputs": [], + "source": [ + "# new_geom.wkt" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "fd136876-14d5-4e85-a448-4ed1101889c0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.9/site-packages/geopandas/geodataframe.py:1543: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " super().__setitem__(key, value)\n" + ] + } + ], + "source": [ + "long.geometry = [new_geom]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "c85e3ca0-f32a-43ff-b3d4-7c09c09a456a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_445/1845589243.py:1: FutureWarning: Currently, index_parts defaults to True, but in the future, it will default to False to be consistent with Pandas. Use `index_parts=True` to keep the current behavior and True/False to silence the warning.\n", + " exploded = long.explode()\n" + ] + } + ], + "source": [ + "exploded = long.explode()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "a33dd638-dccc-4b36-b09b-b81de3c31084", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_instance_keyshape_array_keystop_id1stop_sequencestop_id2segment_idstop_pairschedule_gtfs_dataset_keyroute_iddirection_idnext_stop_sequencelengthgeometry
287540503505bf6a20e8d29e83e545784a421bc737468c4ffbbbab83f270b8fcecb6de6122818106228-106-1228__106efbbd5293be71f7a5de0cf82b59febe137140.019.017696.817067LINESTRING (143107.958 -441727.430, 143118.407...
13505bf6a20e8d29e83e545784a421bc737468c4ffbbbab83f270b8fcecb6de6122818106228-106-1228__106efbbd5293be71f7a5de0cf82b59febe137140.019.017696.817067LINESTRING (143967.823 -441640.058, 144006.108...
\n", + "
" + ], + "text/plain": [ + " trip_instance_key shape_array_key \\\n", + "2875405 0 3505bf6a20e8d29e83e545784a421bc7 37468c4ffbbbab83f270b8fcecb6de61 \n", + " 1 3505bf6a20e8d29e83e545784a421bc7 37468c4ffbbbab83f270b8fcecb6de61 \n", + "\n", + " stop_id1 stop_sequence stop_id2 segment_id stop_pair \\\n", + "2875405 0 228 18 106 228-106-1 228__106 \n", + " 1 228 18 106 228-106-1 228__106 \n", + "\n", + " schedule_gtfs_dataset_key route_id direction_id \\\n", + "2875405 0 efbbd5293be71f7a5de0cf82b59febe1 3714 0.0 \n", + " 1 efbbd5293be71f7a5de0cf82b59febe1 3714 0.0 \n", + "\n", + " next_stop_sequence length \\\n", + "2875405 0 19.0 17696.817067 \n", + " 1 19.0 17696.817067 \n", + "\n", + " geometry \n", + "2875405 0 LINESTRING (143107.958 -441727.430, 143118.407... \n", + " 1 LINESTRING (143967.823 -441640.058, 144006.108... " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# note order remains from list order passed to MultiLineString constructor\n", + "exploded.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "173449e3-48cd-489c-b9da-dd2777457d7a", + "metadata": {}, + "outputs": [], + "source": [ + "def process_exploded(gdf):\n", + " '''\n", + " update required cols in exploded gdf\n", + " \n", + " stop_sequence: increment proportional to segment distance within arbitrary stop sequence increment\n", + " segment_id: postfix _(int) per segment to maintain uniqueness\n", + " '''\n", + " assert len(gdf.trip_instance_key.unique()) == 1 and len(gdf.segment_id.unique()) == 1, 'must group by trip_instance_key, segment_id'\n", + " \n", + " prev_stop = int(gdf.stop_sequence.min())\n", + " next_stop = int(gdf.next_stop_sequence.max())\n", + " stop_seq_chg = gdf.next_stop_sequence.max() - prev_stop\n", + " \n", + " # increment stop sequence proportional to distance traveled \n", + " seq_per_km = stop_seq_chg / gdf.length.sum() \n", + " seq_changes = gdf.length * seq_per_km\n", + " 
stop_sequences_scaled = np.flip(next_stop - np.flip(seq_changes).cumsum())\n", + " \n", + " gdf['stop_sequence'] = stop_sequences_scaled\n", + " \n", + " # postfix to segment_id so that it remains unique\n", + " postfixes = np.arange(0, gdf.shape[0]).astype(str)\n", + " underscores = np.full(gdf.shape[0], '_')\n", + " postfixes = np.char.add(underscores, postfixes)\n", + " gdf['segment_id'] = gdf.segment_id + postfixes\n", + " \n", + " return gdf" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "60a09e07-9882-44f5-990d-e35c3d7b9b6b", + "metadata": {}, + "outputs": [], + "source": [ + "processed = process_exploded(exploded)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "10ea37f3-d285-4988-a663-9f333f815fc9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m \u001b[0marrowize_segment\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline_geometry\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer_distance\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m20\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m\n", + "Given a linestring segment from a gtfs shape,\n", + "buffer and clip to show direction of progression\n", + "\u001b[0;31mFile:\u001b[0m ~/data-analyses/_shared_utils/shared_utils/rt_utils.py\n", + "\u001b[0;31mType:\u001b[0m function" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "arrowize_segment?" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "a78f13ca-3f22-4e60-b3ec-9bc1c0a73c04", + "metadata": {}, + "outputs": [], + "source": [ + "processed.geometry = processed.geometry.apply(lambda x: arrowize_segment(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "1003a058-4b1e-4aa8-aeba-d4a52fb12731", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_instance_keyshape_array_keystop_id1stop_sequencestop_id2segment_idstop_pairschedule_gtfs_dataset_keyroute_iddirection_idnext_stop_sequencelengthgeometry
287540503505bf6a20e8d29e83e545784a421bc737468c4ffbbbab83f270b8fcecb6de6122818.000000106228-106-1_0228__106efbbd5293be71f7a5de0cf82b59febe137140.019.017696.817067POLYGON ((143475.239 -441774.715, 143514.754 -...
13505bf6a20e8d29e83e545784a421bc737468c4ffbbbab83f270b8fcecb6de6122818.056507106228-106-1_1228__106efbbd5293be71f7a5de0cf82b59febe137140.019.017696.817067POLYGON ((144774.726 -441469.401, 144927.328 -...
23505bf6a20e8d29e83e545784a421bc737468c4ffbbbab83f270b8fcecb6de6122818.113015106228-106-1_2228__106efbbd5293be71f7a5de0cf82b59febe137140.019.017696.817067POLYGON ((145112.933 -441431.350, 145928.926 -...
\n", + "
" + ], + "text/plain": [ + " trip_instance_key shape_array_key \\\n", + "2875405 0 3505bf6a20e8d29e83e545784a421bc7 37468c4ffbbbab83f270b8fcecb6de61 \n", + " 1 3505bf6a20e8d29e83e545784a421bc7 37468c4ffbbbab83f270b8fcecb6de61 \n", + " 2 3505bf6a20e8d29e83e545784a421bc7 37468c4ffbbbab83f270b8fcecb6de61 \n", + "\n", + " stop_id1 stop_sequence stop_id2 segment_id stop_pair \\\n", + "2875405 0 228 18.000000 106 228-106-1_0 228__106 \n", + " 1 228 18.056507 106 228-106-1_1 228__106 \n", + " 2 228 18.113015 106 228-106-1_2 228__106 \n", + "\n", + " schedule_gtfs_dataset_key route_id direction_id \\\n", + "2875405 0 efbbd5293be71f7a5de0cf82b59febe1 3714 0.0 \n", + " 1 efbbd5293be71f7a5de0cf82b59febe1 3714 0.0 \n", + " 2 efbbd5293be71f7a5de0cf82b59febe1 3714 0.0 \n", + "\n", + " next_stop_sequence length \\\n", + "2875405 0 19.0 17696.817067 \n", + " 1 19.0 17696.817067 \n", + " 2 19.0 17696.817067 \n", + "\n", + " geometry \n", + "2875405 0 POLYGON ((143475.239 -441774.715, 143514.754 -... \n", + " 1 POLYGON ((144774.726 -441469.401, 144927.328 -... \n", + " 2 POLYGON ((145112.933 -441431.350, 145928.926 -... 
" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "processed.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "c13b133d-27ef-43e8-ad3a-ee803c0788c4", + "metadata": {}, + "outputs": [], + "source": [ + "# processed.explore()" + ] + }, + { + "cell_type": "markdown", + "id": "9cb8d0de-12fd-471d-95c3-bb258a334add", + "metadata": {}, + "source": [ + "# Abstracting (to move to new script?)\n", + "\n", + "* seperate short and long segments\n", + "* split long segments and update columns to preserve unique segment_id, meaningful stop_sequence that is sortable and proportional to distance travelled (when between actual stops, no proportionality requirement in original feed)\n", + "* recombine" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "ba047670-b2fc-4a41-a342-5ed9323ad94b", + "metadata": {}, + "outputs": [], + "source": [ + "shorts = (segs[segs['length'] < 1000]).copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "bcb9dfe0-c8b9-474f-ad1f-3c5c14923452", + "metadata": {}, + "outputs": [], + "source": [ + "longs = (segs[segs['length'] > 1000]).copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "8f6d0468-d15e-4663-965a-095d8dcad12c", + "metadata": {}, + "outputs": [], + "source": [ + "longs = longs >> filter(-_.next_stop_sequence.isna()) # fix upstream (include final next seq...)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "112b7f03-1399-41a5-85e8-501717e31647", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(163577, 13)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "longs.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "d5f71050-2201-4d95-b0cc-b539999e2039", + "metadata": {}, + "outputs": [], + "source": [ + "test_longs = longs.iloc[:1000,:]" + ] + }, + { + 
"cell_type": "code", + "execution_count": 41, + "id": "425245ff-4c54-4992-be17-0f31520908e1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_instance_keyshape_array_keystop_id1stop_sequencegeometrystop_id2segment_idstop_pairschedule_gtfs_dataset_keyroute_iddirection_idnext_stop_sequencelength
220000dbb743b258f707687f2dc14ad90f37bb198ccd3af93e3b7f10bb7602faff8191672LINESTRING (195533.932 -435807.899, 195532.071...2304819-2304-1819__2304f74424acf8c41e4c1e9fd42838c4875c4881.01959.01734.469963
7809e32488392ad4c1684b0108f3bba8b337bb198ccd3af93e3b7f10bb7602faff8191532LINESTRING (195533.932 -435807.899, 195532.071...2304819-2304-1819__2304f74424acf8c41e4c1e9fd42838c4875c4881.01767.01734.469963
1340aaeb33101f4ac9ebb7851388c35582537bb198ccd3af93e3b7f10bb7602faff8191672LINESTRING (195533.932 -435807.899, 195532.071...2304819-2304-1819__2304f74424acf8c41e4c1e9fd42838c4875c4881.01959.01734.469963
\n", + "
" + ], + "text/plain": [ + " trip_instance_key shape_array_key \\\n", + "22 0000dbb743b258f707687f2dc14ad90f 37bb198ccd3af93e3b7f10bb7602faff \n", + "78 09e32488392ad4c1684b0108f3bba8b3 37bb198ccd3af93e3b7f10bb7602faff \n", + "134 0aaeb33101f4ac9ebb7851388c355825 37bb198ccd3af93e3b7f10bb7602faff \n", + "\n", + " stop_id1 stop_sequence \\\n", + "22 819 1672 \n", + "78 819 1532 \n", + "134 819 1672 \n", + "\n", + " geometry stop_id2 segment_id \\\n", + "22 LINESTRING (195533.932 -435807.899, 195532.071... 2304 819-2304-1 \n", + "78 LINESTRING (195533.932 -435807.899, 195532.071... 2304 819-2304-1 \n", + "134 LINESTRING (195533.932 -435807.899, 195532.071... 2304 819-2304-1 \n", + "\n", + " stop_pair schedule_gtfs_dataset_key route_id direction_id \\\n", + "22 819__2304 f74424acf8c41e4c1e9fd42838c4875c 488 1.0 \n", + "78 819__2304 f74424acf8c41e4c1e9fd42838c4875c 488 1.0 \n", + "134 819__2304 f74424acf8c41e4c1e9fd42838c4875c 488 1.0 \n", + "\n", + " next_stop_sequence length \n", + "22 1959.0 1734.469963 \n", + "78 1767.0 1734.469963 \n", + "134 1959.0 1734.469963 " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_longs.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "90f76913-57fb-4af3-a330-95215b55eec2", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "## Rowwise apply and accumulate? 
(works but too slow, ~45min)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "1f3bf3e2-9d83-4db0-bfb8-191f9991581e", + "metadata": {}, + "outputs": [], + "source": [ + "# def interpolate_segments(row):\n", + "# '''\n", + "# wrapper function -- rowwise apply to a gdf of \"long\" (>1000m) segments\n", + "# '''\n", + "# global interpolated_longs\n", + "# new_geom = split_distance(row.geometry)\n", + "\n", + "# row.geometry = new_geom\n", + "# # back to gdf to use .explode()\n", + "# row = (gpd.GeoDataFrame(row)\n", + "# .transpose()\n", + "# .set_geometry('geometry')\n", + "# .set_crs(geography_utils.CA_NAD83Albers)\n", + "# )\n", + "# exploded = gpd.GeoDataFrame.explode(row, column='geometry', index_parts=False)\n", + "# # return exploded\n", + "# processed = process_exploded(exploded)\n", + " \n", + "# interpolated_longs += [processed]\n", + "# return" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "8007f5dd-2e83-4210-bebe-06faa5bcc5e0", + "metadata": {}, + "outputs": [], + "source": [ + "# test1 = test_longs.head(1)\n", + "\n", + "# x = test1.apply(interpolate_segments, axis=1)\n", + "\n", + "# gdf = pd.concat(interpolated_longs)\n", + "\n", + "# # can't split at endpoints (no next stop seq, must calculate upstream instead of shift!)\n", + "# # drop for now to test\n", + "# test_longs = test_longs >> filter(-_.next_stop_sequence.isna())\n", + "\n", + "# %%timeit\n", + "\n", + "# interpolated_longs = []\n", + "\n", + "# _ = test_longs.apply(interpolate_segments, axis=1)\n", + "\n", + "# (14 * 177) / 60 # 41min to interpolate all -- not entirely ideal\n", + "\n", + "# (interpolated >> distinct(_.segment_id, _keep_all=True)).shape" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "11817c00-4d3b-43b4-bb08-eda917712f7c", + "metadata": {}, + "outputs": [], + "source": [ + "# interpolated = pd.concat(interpolated_longs)" + ] + }, + { + "cell_type": "markdown", + "id": "f899c508-b508-472d-89f4-fe22a89fb561", + 
"metadata": {}, + "source": [ + "## Compute new geometries only once\n", + "\n", + "* compute/join on `shape_array_key, segment_id`\n", + "* accumulate geoms only in dict based on unique df; lookup/replace geoms in full df, then use `gdf.explode()`, `process_exploded`\n", + "* ~12min total for entire state, could maybe speed up `process_exploded` since it's a groupby/apply but this implementation depends on the gdf staying in order for each trip/segment after `explode`" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "98206243-676b-4233-8b87-26d845918469", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_instance_keyshape_array_keystop_id1stop_sequencegeometrystop_id2segment_idstop_pairschedule_gtfs_dataset_keyroute_iddirection_idnext_stop_sequencelength
220000dbb743b258f707687f2dc14ad90f37bb198ccd3af93e3b7f10bb7602faff8191672LINESTRING (195533.932 -435807.899, 195532.071...2304819-2304-1819__2304f74424acf8c41e4c1e9fd42838c4875c4881.01959.01734.469963
\n", + "
" + ], + "text/plain": [ + " trip_instance_key shape_array_key \\\n", + "22 0000dbb743b258f707687f2dc14ad90f 37bb198ccd3af93e3b7f10bb7602faff \n", + "\n", + " stop_id1 stop_sequence geometry \\\n", + "22 819 1672 LINESTRING (195533.932 -435807.899, 195532.071... \n", + "\n", + " stop_id2 segment_id stop_pair schedule_gtfs_dataset_key route_id \\\n", + "22 2304 819-2304-1 819__2304 f74424acf8c41e4c1e9fd42838c4875c 488 \n", + "\n", + " direction_id next_stop_sequence length \n", + "22 1.0 1959.0 1734.469963 " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "longs.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "095240b9-1137-480c-82ab-1b3b5f00ee8d", + "metadata": {}, + "outputs": [], + "source": [ + "# segments missing from interpolation because we're missing the last stop sequence in testing\n", + "# should be fixed in prod!\n", + "\n", + "# (longs >> distinct(_.shape_array_key, _.segment_id, _keep_all=True)\n", + "# >> filter(_.next_stop_sequence.isna())).explore()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "26dba8e0-bc65-424f-9c04-65b84fd84571", + "metadata": {}, + "outputs": [], + "source": [ + "def store_new_geoms(row):\n", + " '''\n", + " wrapper function -- rowwise apply to a gdf of \"long\" (>1000m) segments\n", + " \n", + " accumulate results in a dict: segment_geoms (init empty dict outside function)\n", + " '''\n", + " global segment_geoms\n", + " new_geom = split_distance(row.geometry)\n", + " # row.geometry = new_geom\n", + " \n", + " geom_key = (row.shape_array_key, row.segment_id)\n", + " segment_geoms[geom_key] = new_geom\n", + " # TODO store key:geom in dict/something fast...\n", + " \n", + " return" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "8603a84a-cc7b-4f58-b229-6417d8014ed1", + "metadata": {}, + "outputs": [], + "source": [ + "def lookup_geom(row, segment_geoms: dict):\n", + " '''\n", + " after 
running store_new_geoms on unique segments, apply this to \n", + " a gdf of all segments to lookup new geom by shape_array_key, segment_id\n", + " '''\n", + " row.geometry = segment_geoms[(row.shape_array_key, row.segment_id)]\n", + " return row" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "bb3b2fd0-16eb-45db-891e-a5b85e48701e", + "metadata": {}, + "outputs": [], + "source": [ + "to_interpolate = (longs >> distinct(_.shape_array_key, _.segment_id, _keep_all=True)\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "c0b8d61c-3c27-44a5-888c-33277c2c8650", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_instance_keyshape_array_keystop_id1stop_sequencegeometrystop_id2segment_idstop_pairschedule_gtfs_dataset_keyroute_iddirection_idnext_stop_sequencelength
00000dbb743b258f707687f2dc14ad90f37bb198ccd3af93e3b7f10bb7602faff8191672LINESTRING (195533.932 -435807.899, 195532.071...2304819-2304-1819__2304f74424acf8c41e4c1e9fd42838c4875c4881.01959.01734.469963
\n", + "
" + ], + "text/plain": [ + " trip_instance_key shape_array_key \\\n", + "0 0000dbb743b258f707687f2dc14ad90f 37bb198ccd3af93e3b7f10bb7602faff \n", + "\n", + " stop_id1 stop_sequence geometry \\\n", + "0 819 1672 LINESTRING (195533.932 -435807.899, 195532.071... \n", + "\n", + " stop_id2 segment_id stop_pair schedule_gtfs_dataset_key route_id \\\n", + "0 2304 819-2304-1 819__2304 f74424acf8c41e4c1e9fd42838c4875c 488 \n", + "\n", + " direction_id next_stop_sequence length \n", + "0 1.0 1959.0 1734.469963 " + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "to_interpolate >> head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "577f76c8-2910-4fab-8edb-c6f8dfb0c64e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 25.4 s, sys: 88 ms, total: 25.5 s\n", + "Wall time: 26.9 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "segment_geoms = {}\n", + "\n", + "_ = to_interpolate.apply(store_new_geoms, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "cb53c1dd-a736-4113-a41d-24398fd95f28", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 22.9 s, sys: 536 ms, total: 23.4 s\n", + "Wall time: 25 s\n" + ] + } + ], + "source": [ + "%%time\n", + "# now, add geometries to long list...\n", + "interpolated = longs.apply(lookup_geom, axis = 1, args = ([segment_geoms]))" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "58a26367-7ee0-41ba-a860-ec768e09163d", + "metadata": {}, + "outputs": [], + "source": [ + "interpolated = interpolated.explode(index_parts=False).reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "5ecc3beb-0296-4730-8123-5937104c4cb7", + "metadata": {}, + "outputs": [], + "source": [ + "interpolated['length'] = interpolated.geometry.apply(lambda 
x: x.length)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "186654d3-5132-42b2-8c72-7c2f7252fef5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(514674, 13)" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "interpolated.shape" + ] + }, + { + "cell_type": "markdown", + "id": "e217636d-9a3a-4d08-bea6-6c5c0953e4d9", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "### Parallel (runs, but slower than single-thread for this task anyway. Also, still have some ordering issues.)\n", + "\n", + "https://docs.dask.org/en/stable/generated/dask.dataframe.groupby.DataFrameGroupBy.apply.html\n", + "* \"If the grouper does not align with the index then this causes a full shuffle. The order of rows within each group may not be preserved.\"\n", + "* attempted to set index equal to grouper, but some segments still seem to end up out of order..." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "77dcbc44-abab-496c-9890-dfe80ce1d8bc", + "metadata": {}, + "outputs": [], + "source": [ + "# interpolated = interpolated.set_index(['trip_instance_key', 'segment_id'], drop=False)\n", + "# # dask doesn't support MultiIndex...\n", + "\n", + "# interpolated = interpolated.set_index(pd.util.hash_pandas_object(interpolated.index))\n", + "\n", + "# interpolated.index.name = 'trip_seg_hash'\n", + "\n", + "# int_meta = dd.utils.make_meta(interpolated)\n", + "\n", + "# # interpolated.info()\n", + "\n", + "# int_dd = dd.from_pandas(interpolated, npartitions=20)\n", + "\n", + "# int_group = int_dd.groupby(by='trip_seg_hash')\n", + "\n", + "# int_group = int_group.apply(process_exploded, meta = int_meta)\n", + "\n", + "# %%time\n", + "# # 13min -- actually slower than Pandas in this case\n", + "# result = int_group.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "1ec919d5-90da-4cd1-acbe-1c9852123fae", + "metadata": {}, + 
"source": [ + "### Single-thread (works, about 10min)\n", + "\n", + "* depends on rows staying in `.explode()` order within each (`trip_instance_key`, `segment_id`) group" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "f8607ce5-7407-4bf7-a38c-ef223e44159b", + "metadata": {}, + "outputs": [], + "source": [ + "# %%time\n", + "\n", + "# processed = interpolated.groupby(['trip_instance_key', 'segment_id'], group_keys=False).apply(process_exploded)\n", + "\n", + "# recombined = pd.concat([shorts, processed]).reset_index(drop=True)\n", + "\n", + "# utils.geoparquet_gcs_export(recombined, gcs_file_path=GCS_PATH, file_name=filename)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "ec6deec3-5b19-4909-b40d-e8efec7ded00", + "metadata": {}, + "outputs": [], + "source": [ + "filename = 'test_interpolated_2024-03-13.parquet'" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "544ab666-5574-450a-822f-e2b0c95bda30", + "metadata": {}, + "outputs": [], + "source": [ + "recombined = gpd.read_parquet(GCS_PATH+filename)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "faf92e49-1f74-4d7a-8018-4adbbbad99fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(514674, 13)" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "interpolated.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "6a7e843b-86f4-40f2-a3d7-6ced53cd5a3f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3318446" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "interpolated.shape[0] + shorts.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "d6970871-2e68-42d9-99e1-509ccbd26b78", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3318446, 13)" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ +
"recombined.shape # looks OK" + ] + }, + { + "cell_type": "markdown", + "id": "7548823a-e980-4c61-93df-9b9ccfc22db1", + "metadata": { + "tags": [] + }, + "source": [ + "# Taking a look: Big Blue Bus R10\n", + "\n", + "* split and merge with untouched segments looks good!" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "6c9799d2-b433-4dfe-9f59-44159aa30e7e", + "metadata": {}, + "outputs": [], + "source": [ + "from siuba import * # re-import to fix bug? TODO report..." + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "f88049ae-4614-496c-a0bb-c618dff460ab", + "metadata": {}, + "outputs": [], + "source": [ + "test = recombined[recombined.trip_instance_key == '3505bf6a20e8d29e83e545784a421bc7']" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "0e1dfbe2-7c40-4fad-af6b-bf3d121e7335", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.9/site-packages/geopandas/geodataframe.py:1543: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " super().__setitem__(key, value)\n" + ] + } + ], + "source": [ + "test.geometry = test.geometry.apply(lambda x: arrowize_segment(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "236a583f-fd3d-475a-9d67-c15fe579264d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# test.explore()" + ] + }, + { + "cell_type": "markdown", + "id": "853f3517-af32-468e-ad29-5c719d6feada", + "metadata": { + "tags": [] + }, + "source": [ + "# Further Testing: Salinas Valley, Woodland/Davis/Sac, Bay Br, I110 (Harbor Fwy), etc. all look good!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "8dbce55b-7c34-400d-ad82-c144af43eb61", + "metadata": {}, + "outputs": [], + "source": [ + "bigtest = (recombined >> distinct(_.segment_id, _.shape_array_key, _keep_all=True))" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "a04cf6dc-1958-490b-9fea-1c0bc42f1b6d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(203106, 13)" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bigtest.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "223524e7-f638-4419-8083-0cf670cde6f7", + "metadata": {}, + "outputs": [], + "source": [ + "bigtest = bigtest >> filter(_.shape_array_key.isin(_.shape_array_key.unique()[:250]))" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "530cb1cc-dcba-4085-b241-53997dfaf164", + "metadata": {}, + "outputs": [], + "source": [ + "bigtest.geometry = bigtest.geometry.apply(lambda x: arrowize_segment(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "525c12e0-2119-47c5-9346-baa1ca11b4ec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(11034, 13)" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bigtest.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "d585036e-888c-421a-a942-a829d2408e3f", + "metadata": {}, + "outputs": [], + "source": [ + "# bigtest.explore()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "widgets": { + 
"application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}