Skip to content

Commit

Permalink
Merge pull request #1339 from cal-itp/add-shn-info
Browse files Browse the repository at this point in the history
Add SHN info to open data `ca_transit_stops`
  • Loading branch information
tiffanychu90 authored Jan 7, 2025
2 parents 1ba0f54 + 049f7c6 commit a16daf7
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 7 deletions.
38 changes: 35 additions & 3 deletions open_data/create_stops_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,21 @@
"""
import datetime
import geopandas as gpd
import intake
import pandas as pd
import yaml

import open_data_utils
from calitp_data_analysis import utils
from calitp_data_analysis import geography_utils, utils
from shared_utils import publish_utils
from update_vars import (analysis_date,
GTFS_DATA_DICT,
TRAFFIC_OPS_GCS,
RT_SCHED_GCS, SCHED_GCS
)

# Shared data catalog opened once at import time;
# add_distance_to_state_highway reads its `state_highway_network` entry.
# NOTE(review): the path is relative, so this presumably only resolves when
# the script is run from its own directory -- confirm.
catalog = intake.open_catalog("../_shared_utils/shared_utils/shared_data_catalog.yml")

def create_stops_file_for_export(
date: str,
) -> gpd.GeoDataFrame:
Expand All @@ -40,6 +43,31 @@ def create_stops_file_for_export(
return stops2


def add_distance_to_state_highway(
    stops: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Bring in State Highway Network (SHN) gdf and add a column,
    `meters_to_shn`, that tells us the distance (in meters) between
    each stop and the nearest part of the SHN, rounded to 1 decimal.
    For stops outside of CA, this will not be that meaningful.

    The stops are returned in the same CRS they were passed in with;
    the reprojection is only used for measuring.

    See discussion in:
    https://github.com/cal-itp/data-analyses/issues/1182
    https://github.com/cal-itp/data-analyses/issues/1321
    """
    orig_crs = stops.crs

    # Dissolve every SHN feature into a single geometry before measuring.
    # Taking `.iloc[0]` without dissolving would measure against only the
    # first row of the network, not the network as a whole. If the catalog
    # entry is already a single row, dissolving leaves distances unchanged.
    shn = (
        catalog.state_highway_network.read()[["geometry"]]
        .to_crs(geography_utils.CA_NAD83Albers)
        .dissolve()
        .geometry.iloc[0]
    )

    # Measure in the same projected CRS as the SHN so that the distance
    # is in that CRS's units (meters, per the column name).
    stops = stops.to_crs(geography_utils.CA_NAD83Albers)

    stops = stops.assign(
        meters_to_shn = stops.geometry.distance(shn).round(1)
    )

    return stops.to_crs(orig_crs)


def patch_previous_dates(
current_stops: gpd.GeoDataFrame,
current_date: str,
Expand Down Expand Up @@ -80,7 +108,8 @@ def patch_previous_dates(
published_stops = pd.concat(
[current_stops, patch_stops],
axis=0, ignore_index=True
)
).pipe(add_distance_to_state_highway)


return published_stops

Expand All @@ -98,8 +127,10 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
# add GTFS stop-related metrics
'n_routes', 'route_ids_served', 'route_types_served',
'n_arrivals', 'n_hours_in_service',
# this is a derived column
'meters_to_shn'
]
agency_ids = ['base64_url']
agency_ids = ['base64_url', 'caltrans_district']

col_order = route_cols + stop_cols + agency_ids + ['geometry']

Expand All @@ -123,6 +154,7 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
analysis_date,
).pipe(finalize_export_df)


utils.geoparquet_gcs_export(
published_stops,
TRAFFIC_OPS_GCS,
Expand Down
13 changes: 9 additions & 4 deletions open_data/open_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,23 @@ def standardize_operator_info_for_exports(
f"{SCHED_GCS}{CROSSWALK_FILE}_{date}.parquet",
columns = [
"schedule_gtfs_dataset_key", "name", "base64_url",
"organization_source_record_id", "organization_name"
"organization_source_record_id", "organization_name",
"caltrans_district",
],
filters = [[("schedule_gtfs_dataset_key", "in", public_feeds)]]
)

# We checked whether a left merge is needed to keep stops outside of CA
# that might not have caltrans_district.
# An inner merge is fine: all operators are assigned a caltrans_district,
# so Amtrak / FlixBus stops have values populated.
df2 = pd.merge(
df,
crosswalk,
on = "schedule_gtfs_dataset_key",
how = "inner"
)

return df2


Expand Down Expand Up @@ -124,8 +129,8 @@ def clip_to_usa(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
"route_name_used": "route_name",
"route_types_served": "routetypes",
"n_hours_in": "n_hours_in_service",
"route_ids_": "route_ids_served"

"route_ids_": "route_ids_served",
"meters_to_shn": "meters_to_ca_state_highway"
}


Expand Down

0 comments on commit a16daf7

Please sign in to comment.