Skip to content

Commit

Permalink
Merge pull request #932 from cal-itp/segment-refactoring
Browse files Browse the repository at this point in the history
Segment refactoring
  • Loading branch information
tiffanychu90 authored Oct 18, 2023
2 parents d00cf65 + 89fd1d1 commit 566f455
Show file tree
Hide file tree
Showing 25 changed files with 983 additions and 398 deletions.
3 changes: 2 additions & 1 deletion _shared_utils/shared_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from . import ( # calitp_color_palette,; geography_utils,; styleguide,; utils,
dask_utils,
geog_utils_to_add,
gtfs_utils,
gtfs_utils_v2,
portfolio_utils,
Expand All @@ -11,7 +12,7 @@
__all__ = [
# "calitp_color_palette",
"dask_utils",
# "geography_utils",
"geog_utils_to_add",
"gtfs_utils",
"gtfs_utils_v2",
"portfolio_utils",
Expand Down
36 changes: 36 additions & 0 deletions _shared_utils/shared_utils/geog_utils_to_add.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import geopandas as gpd


def explode_segments(
gdf: gpd.GeoDataFrame, group_cols: list, segment_col: str = "segment_geometry"
) -> gpd.GeoDataFrame:
"""
Explode the column that is used to store segments, which is a list.
Take the list and create a row for each element in the list.
We'll do a rough rank so we can order the segments.
"""
gdf_exploded = gdf.explode(segment_col).reset_index(drop=True)

gdf_exploded["temp_index"] = gdf_exploded.index

gdf_exploded = gdf_exploded.assign(
segment_sequence=(
gdf_exploded.groupby(group_cols, observed=True, group_keys=False).temp_index.transform("rank")
- 1
# there are NaNs, but since they're a single segment, just use 0
)
.fillna(0)
.astype("int16")
)

# Drop the original line geometry, use the segment geometry only
gdf_exploded2 = (
gdf_exploded.drop(columns=["geometry", "temp_index"])
.rename(columns={segment_col: "geometry"})
.set_geometry("geometry")
.set_crs(gdf_exploded.crs)
.sort_values(group_cols + ["segment_sequence"])
.reset_index(drop=True)
)

return gdf_exploded2
18 changes: 13 additions & 5 deletions gtfs_funnel/stop_times_with_direction.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ def prep_scheduled_stop_times(analysis_date: str) -> dg.GeoDataFrame:
trips = helpers.import_scheduled_trips(
analysis_date,
columns = ["gtfs_dataset_key", "feed_key",
"trip_id", "trip_instance_key"],
"trip_id", "trip_instance_key",
"shape_array_key"
],
get_pandas = True
)

Expand Down Expand Up @@ -77,7 +79,7 @@ def find_prior_stop(
.groupby("trip_instance_key")
.stop_sequence
.shift(1)
).astype("Int64")
)
)

prior_stop_geom = stop_times[
Expand All @@ -99,7 +101,7 @@ def find_prior_stop(
prior_stop_geom,
on = ["trip_instance_key", "prior_stop_sequence"],
how = "left"
)
).astype({"prior_stop_sequence": "Int64"})

return stop_times_with_prior_geom

Expand All @@ -116,7 +118,9 @@ def assemble_stop_times_with_direction(analysis_date: str):

scheduled_stop_times = prep_scheduled_stop_times(analysis_date).persist()

trip_stop_cols = ["trip_instance_key", "stop_id", "stop_sequence"]
trip_stop_cols = ["trip_instance_key", "shape_array_key",
"stop_id", "stop_sequence"]

scheduled_stop_times2 = find_prior_stop(scheduled_stop_times, trip_stop_cols)

other_stops = scheduled_stop_times2[
Expand Down Expand Up @@ -148,7 +152,9 @@ def assemble_stop_times_with_direction(analysis_date: str):
axis=0
)

df = scheduled_stop_times_with_direction.sort_index()
df = scheduled_stop_times_with_direction.sort_values([
"trip_instance_key", "stop_sequence"]
).reset_index(drop=True)

time1 = datetime.datetime.now()
print(f"get scheduled stop times with direction: {time1 - start}")
Expand All @@ -159,6 +165,8 @@ def assemble_stop_times_with_direction(analysis_date: str):
f"stop_times_direction_{analysis_date}"
)



end = datetime.datetime.now()
print(f"execution time: {end - start}")

Expand Down
4 changes: 3 additions & 1 deletion gtfs_funnel/update_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
from pathlib import Path
from shared_utils import rt_dates

months = ["sep", "oct"]

analysis_date_list = [
rt_dates.DATES["oct2023"],
rt_dates.DATES[f"{m}2023"] for m in months
]

CONFIG_PATH = Path("config.yml")
Expand Down
99 changes: 27 additions & 72 deletions high_quality_transit_areas/B1_create_hqta_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

import operators_for_hqta
from calitp_data_analysis import geography_utils, utils
from shared_utils import rt_utils
from shared_utils import rt_utils, geog_utils_to_add
from segment_speed_utils import helpers, gtfs_schedule_wrangling
from utilities import GCS_FILE_PATH
from update_vars import analysis_date, COMPILED_CACHED_VIEWS
Expand All @@ -52,42 +52,27 @@ def pare_down_trips_by_route_direction(
"""
route_dir_cols = ["feed_key", "route_key", "route_id", "direction_id"]

trips_with_geom = (
trips_with_geom
.assign(
route_length = trips_with_geom.geometry.length
).sort_values(route_dir_cols + ["route_length"],
ascending = [True for i in route_dir_cols] + [False])
.drop_duplicates(subset = route_dir_cols)
.reset_index(drop=True)
)

# If direction_id is missing, then later code will break, because
# we need to find the longest route_length
# Don't really care what direction is, since we will replace it with north-south
# Just need a value to stand-in, treat it as the same direction

trips_with_geom2 = trips_with_geom.assign(
direction_id = (trips_with_geom.direction_id.fillna(0)
.astype(int).astype(str))
)

trips_with_geom2 = trips_with_geom2.assign(
route_dir_identifier = trips_with_geom2.apply(
lambda x: zlib.crc32(
(x.route_key + x.direction_id
).encode("utf-8")),
axis=1,
)
)

# Keep the longest shape_id for each direction
# with missing direction_id filled in
longest_shapes = (trips_with_geom2.sort_values("shape_array_key")
.drop_duplicates("route_dir_identifier")
.drop(columns = "route_dir_identifier")
.reset_index(drop=True)
)
trips_with_geom = trips_with_geom.assign(
route_length = trips_with_geom.geometry.length,
direction_id = trips_with_geom.direction_id.fillna(0).astype(int).astype(str)
).sort_values(route_dir_cols + ["route_length"],
ascending = [True for i in route_dir_cols] + [False]
)

longest_shapes = trips_with_geom.assign(
max_route_length = (trips_with_geom
.groupby(route_dir_cols,observed=True, group_keys=False)
.route_length
.transform("max")
)
).query(
'max_route_length == route_length'
).drop_duplicates(subset = route_dir_cols)
# if there are duplicates remaining, drop and keep first obs based on prior sorting

# A route is uniquely identified by route_key (feed_key + route_id)
# Once we keep just 1 shape for each route direction, go back to route_key
Expand Down Expand Up @@ -202,15 +187,16 @@ def select_shapes_and_segment(
axis=0)[["route_key", "geometry"]]

# Cut segments
segmented = geography_utils.cut_segments(
ready_for_segmenting,
group_cols = ["route_key"],
segment_distance = segment_length
ready_for_segmenting["segment_geometry"] = ready_for_segmenting.apply(
lambda x:
geography_utils.create_segments(x.geometry, int(segment_length)),
axis=1,
)

segmented = segmented.assign(
segment_sequence = (segmented.groupby("route_key")
["segment_sequence"].cumcount())
segmented = geog_utils_to_add.explode_segments(
ready_for_segmenting,
group_cols = ["route_key"],
segment_col = "segment_geometry"
)

route_cols = ["feed_key", "route_id", "route_key"]
Expand Down Expand Up @@ -287,24 +273,6 @@ def find_primary_direction_across_hqta_segments(
).drop(columns = drop_cols)

return routes_with_primary_direction


def dissolved_to_longest_shape(hqta_segments: gpd.GeoDataFrame):
"""
Since HQTA segments were cut right after the overlay difference
was taken, do a dissolve so that each route is just 1 line geom.
Keep this version to plot the route.
"""
route_cols = ["feed_key", "route_id",
"route_key", "route_direction"]

dissolved = (hqta_segments[route_cols + ["geometry"]]
.dissolve(by=route_cols)
.reset_index()
)

return dissolved


if __name__=="__main__":
Expand All @@ -317,8 +285,6 @@ def dissolved_to_longest_shape(hqta_segments: gpd.GeoDataFrame):
logger.info(f"Analysis date: {analysis_date}")

start = dt.datetime.now()

#https://stackoverflow.com/questions/69884348/use-dask-to-chunkwise-work-with-smaller-pandas-df-breaks-memory-limits

# (1) Merge routelines with trips, find the longest shape in
# each direction, and after overlay difference, cut HQTA segments
Expand Down Expand Up @@ -348,22 +314,11 @@ def dissolved_to_longest_shape(hqta_segments: gpd.GeoDataFrame):
utils.geoparquet_gcs_export(
hqta_segments_with_dir,
GCS_FILE_PATH,
"hqta_segments"
"hqta_segments_test"
)

time2 = dt.datetime.now()
logger.info(f"cut segments: {time2 - time1}")

# In addition to segments, let's keep a version where line geom is
# at route-level
# Dissolve across directions so that each route is 1 row, 1 line
longest_shape = dissolved_to_longest_shape(hqta_segments_with_dir)

utils.geoparquet_gcs_export(longest_shape,
GCS_FILE_PATH,
"longest_shape_with_dir"
)

end = dt.datetime.now()
logger.info(f"dissolve: {end - time2}")
logger.info(f"total execution time: {end - start}")
8 changes: 1 addition & 7 deletions high_quality_transit_areas/catalog.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,7 @@ sources:
driver: geoparquet
description: Cut HQTA segments across all operators. Created in B1_create_hqta_segments.py.
args:
urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/hqta_segments.parquet
# Source: B1_create_hqta_segments.py
longest_shape:
driver: geoparquet
description: Dissolve hqta segments into 1 line geom per route. Created in B1_create_hqta_segments.py.
args:
urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/longest_shape_with_dir.parquet
urlpath: gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/hqta_segments.parquet
# Source: B2_sjoin_stops_to_segments.py
all_bus:
driver: geoparquet
Expand Down
6 changes: 6 additions & 0 deletions high_quality_transit_areas/logs/B1_create_hqta_segments.log
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,9 @@
2023-10-12 10:20:40.784 | INFO | __main__:<module>:355 - cut segments: 0:16:46.996748
2023-10-12 10:20:56.013 | INFO | __main__:<module>:368 - dissolve: 0:00:15.229048
2023-10-12 10:20:56.015 | INFO | __main__:<module>:369 - total execution time: 0:17:13.929747
2023-10-18 13:34:46.941 | INFO | __main__:<module>:285 - Analysis date: 2023-10-11
2023-10-18 13:34:54.061 | INFO | __main__:<module>:302 - merge routes to trips: 0:00:07.120037
2023-10-18 13:40:54.360 | INFO | __main__:<module>:285 - Analysis date: 2023-10-11
2023-10-18 13:41:01.280 | INFO | __main__:<module>:302 - merge routes to trips: 0:00:06.918956
2023-10-18 13:44:48.351 | INFO | __main__:<module>:321 - cut segments: 0:03:47.071447
2023-10-18 13:44:48.352 | INFO | __main__:<module>:324 - total execution time: 0:03:53.991597
Loading

0 comments on commit 566f455

Please sign in to comment.