Commit d2f900d

Merge pull request #1248 from cal-itp/schedule-stop-metrics
Schedule stop metrics
2 parents b5e6658 + 4c52d0f commit d2f900d

28 files changed: +554, -347 lines changed

_shared_utils/setup.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 setup(
     name="shared_utils",
     packages=find_packages(),
-    version="2.6",
+    version="2.7",
     description="Shared utility functions for data analyses",
     author="Cal-ITP",
     license="Apache",

_shared_utils/shared_utils/catalog_utils.py

Lines changed: 0 additions & 18 deletions
@@ -5,7 +5,6 @@
 from typing import Literal

 import intake
-import yaml
 from omegaconf import OmegaConf  # this is yaml parser

 repo_name = "data-analyses/"
@@ -22,20 +21,3 @@ def get_catalog(catalog_name: Literal["shared_data_catalog", "gtfs_analytics_dat

     else:
         return intake.open_catalog(catalog_path)
-
-
-def get_parameters(config_file: str, key: str) -> dict:
-    """
-    Parse the config.yml file to get the parameters needed
-    for working with route or stop segments.
-    These parameters will be passed through the scripts when working
-    with vehicle position data.
-
-    Returns a dictionary of parameters.
-    """
-    # https://aaltoscicomp.github.io/python-for-scicomp/scripts/
-    with open(config_file) as f:
-        my_dict = yaml.safe_load(f)
-        params_dict = my_dict[key]
-
-    return params_dict

_shared_utils/shared_utils/gtfs_analytics_data.yml

Lines changed: 2 additions & 0 deletions
@@ -53,6 +53,8 @@ rt_vs_schedule_tables:
   vp_trip_metrics: "vp_trip/trip_metrics"
   vp_route_direction_metrics: "vp_route_dir/route_direction_metrics"
   vp_operator_metrics: "vp_operator/operator_metrics"
+  sched_stop_metrics: "schedule_stop/schedule_stop_metrics"
+  #vp_stop_metrics: "vp_stop/vp_stop_metrics" # WIP: transit bunching
   schedule_rt_stop_times: "schedule_rt_stop_times"
   early_trip_minutes: -5
   late_trip_minutes: 5
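
For reference, a minimal sketch of how the new catalog key is read. This mirrors what this PR's own code does (publish_utils.py and schedule_stats_by_stop.py both resolve the key this way):

import shared_utils.catalog_utils as catalog_utils

GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

# resolves to the GCS filename stem that schedule_stats_by_stop.py exports to
print(GTFS_DATA_DICT.rt_vs_schedule_tables.sched_stop_metrics)
# schedule_stop/schedule_stop_metrics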

_shared_utils/shared_utils/publish_utils.py

Lines changed: 31 additions & 1 deletion
@@ -1,12 +1,16 @@
 import os
 from pathlib import Path
-from typing import Union
+from typing import Literal, Union

 import gcsfs
+import geopandas as gpd
 import pandas as pd
+from shared_utils import catalog_utils

 fs = gcsfs.GCSFileSystem()
+SCHED_GCS = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/"
 PUBLIC_BUCKET = "gs://calitp-publish-data-analysis/"
+GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")


 def write_to_public_gcs(
@@ -59,3 +63,29 @@ def exclude_private_datasets(
     Filter out private datasets.
     """
     return df[df[col].isin(public_gtfs_dataset_keys)].reset_index(drop=True)
+
+
+def subset_table_from_previous_date(
+    gcs_bucket: str,
+    filename: Union[str, Path],
+    operator_and_dates_dict: dict,
+    date: str,
+    crosswalk_col: str = "schedule_gtfs_dataset_key",
+    data_type: Literal["df", "gdf"] = "df",
+) -> pd.DataFrame:
+    CROSSWALK_FILE = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk
+
+    crosswalk = pd.read_parquet(f"{SCHED_GCS}{CROSSWALK_FILE}_{date}.parquet", columns=["name", crosswalk_col])
+
+    subset_keys = crosswalk[crosswalk.name.isin(operator_and_dates_dict[date])][crosswalk_col].unique()
+
+    if data_type == "df":
+        past_df = pd.read_parquet(
+            f"{gcs_bucket}{filename}_{date}.parquet", filters=[[(crosswalk_col, "in", subset_keys)]]
+        )
+    else:
+        past_df = gpd.read_parquet(
+            f"{gcs_bucket}{filename}_{date}.parquet", filters=[[(crosswalk_col, "in", subset_keys)]]
+        )
+
+    return past_df
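
A usage sketch for the new helper. The bucket prefix and dict contents below are hypothetical; the dict mirrors the date -> [operator names] layout of published_operators.yml, and the lookup assumes those names appear in the crosswalk's name column for that date:

from shared_utils import publish_utils

# hypothetical mapping, shaped like published_operators.yml
operator_and_dates_dict = {
    "2023-11-15": ["Amtrak Schedule", "Mission Bay Schedule"],
}

past_df = publish_utils.subset_table_from_previous_date(
    gcs_bucket="gs://calitp-analytics-data/data-analyses/example/",  # hypothetical bucket
    filename="schedule_stop/schedule_stop_metrics",
    operator_and_dates_dict=operator_and_dates_dict,
    date="2023-11-15",
    data_type="df",
)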

gtfs_funnel/Makefile

Lines changed: 2 additions & 1 deletion
@@ -21,7 +21,8 @@ preprocess_vp:
 preprocess_schedule_only:
	make route_typologies_data
	python operator_scheduled_stats.py
-
+	python schedule_stats_by_stop.py
+
 route_typologies_data:
	python route_typologies.py
	python schedule_stats_by_route_direction.py

gtfs_funnel/published_operators.yml

Lines changed: 0 additions & 19 deletions
@@ -2,7 +2,6 @@
 - Alhambra Schedule
 - Amador Schedule
 - Anaheim Resort Schedule
-- Anaheim Resort Schedule v2
 - Antelope Valley Transit Authority Schedule
 - Arcadia Schedule
 - Arvin Schedule
@@ -51,7 +50,6 @@
 - Bell Gardens Schedule
 - Bellflower Bus Schedule
 - Big Blue Bus Schedule
-- Big Blue Bus Swiftly Schedule
 - BruinBus Schedule
 - Burbank Schedule
 - Calabasas Schedule
@@ -193,7 +191,6 @@
 - Santa Cruz Schedule
 2024-06-12:
 - Anteater Express Schedule
-- Lassen Flex
 - Lynwood Schedule
 - Manteca Schedule
 2024-05-22:
@@ -207,29 +204,13 @@
 - Rosemead Schedule
 2023-12-13:
 - DowneyLINK Schedule
-- Humboldt Flex
-- Laguna Beach Flex
-- Manteca Flex
-- Placer Flex
-- San Joaquin Flex
 - Spirit Bus Schedule
-- StanRTA Flex
-- TART Flex
-- Thousand Oaks Flex
-- Tracy Flex
-- Turlock Flex
-- Union City Flex
-- VCTC Flex
-- WestCAT Flex
 2023-11-15:
 - Amtrak Schedule
 - Mission Bay Schedule
 2023-08-15:
 - Blossom Express Schedule
-- Eastern Sierra Flex
 2023-06-14:
 - Tuolumne Schedule
-2023-04-12:
-- Guadalupe Flex
 2023-03-15:
 - TIME GMV Schedule

gtfs_funnel/schedule_stats_by_stop.py

Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
+"""
+Add some GTFS schedule-derived metrics
+by stop (arrivals, number of trips/routes served,
+service hours).
+
+This is the stop-grain version of schedule_stats_by_route_direction.
+Grain: schedule_gtfs_dataset_key-stop_id
+"""
+import datetime
+import geopandas as gpd
+import pandas as pd
+
+from calitp_data_analysis.geography_utils import WGS84
+from calitp_data_analysis import utils
+from segment_speed_utils import helpers
+
+def stats_for_stop(
+    df: pd.DataFrame,
+    group_cols: list
+) -> pd.DataFrame:
+    """
+    List the stats we'd like to calculate for each stop.
+    """
+    df2 = (
+        df
+        .groupby(group_cols, group_keys=False)
+        .agg({
+            "route_id": lambda x: list(sorted(set(x))),
+            "route_type": lambda x: list(sorted(set(x))),
+            "departure_sec": "count",
+            "departure_hour": "nunique"
+        }).reset_index()
+        .rename(columns = {
+            "departure_sec": "n_arrivals",
+            "departure_hour": "n_hours_in_service",
+            "route_id": "route_ids_served",
+            "route_type": "route_types_served"
+        })
+    )
+
+    df2 = df2.assign(
+        n_routes = df2.apply(lambda x: len(x.route_ids_served), axis=1)
+    )
+
+    # Instead of producing a list, show values like 0, 3 instead of [0, 3]
+    # so portal users can see combinations more quickly
+    # and access particular rows using str.contains
+    df2 = df2.assign(
+        route_types_served = df2.route_types_served.str.join(", "),
+        route_ids_served = df2.route_ids_served.str.join(", "),
+    )
+
+    return df2
+
+
+def schedule_stats_by_stop(
+    analysis_date: str
+) -> gpd.GeoDataFrame:
+    """
+    Import stop_times, trips, and stops.
+    Merge and aggregate for stop-level schedule stats.
+
+    Calculate some extra stats from other schedule tables,
+    such as how many route_ids and route_types the
+    stop serves.
+    """
+    # departure_hour nunique values tell us the stop's span of service
+    stop_times = helpers.import_scheduled_stop_times(
+        analysis_date,
+        columns = ["feed_key", "stop_id", "trip_id",
+                   "departure_sec", "departure_hour"],
+        with_direction = False,
+        get_pandas = True
+    )
+
+    # include route info so we know how many trips, routes, and
+    # route_types the stop serves
+    # (a stop can serve 1 light rail + 5 bus routes vs 6 bus routes)
+    trips = helpers.import_scheduled_trips(
+        analysis_date,
+        columns = ["gtfs_dataset_key", "feed_key",
+                   "trip_id",
+                   "route_id", "route_type"],
+        get_pandas = True,
+    )
+
+    stops = helpers.import_scheduled_stops(
+        analysis_date,
+        columns = ["feed_key", "stop_id", "stop_name", "geometry"],
+        get_pandas = True,
+        crs = WGS84
+    )
+
+    stop_df = pd.merge(
+        stop_times,
+        trips,
+        on = ["feed_key", "trip_id"],
+        how = "inner"
+    ).pipe(
+        stats_for_stop,
+        group_cols = ["schedule_gtfs_dataset_key", "feed_key", "stop_id"]
+    )
+
+
+    stop_gdf = pd.merge(
+        stops,
+        stop_df,
+        on = ["feed_key", "stop_id"],
+        how = "inner"
+    ).drop(columns = "feed_key")
+
+    # Fix order of columns
+    col_order = [
+        c for c in stop_gdf.columns
+        if c not in ["schedule_gtfs_dataset_key", "geometry"]
+    ]
+
+    stop_gdf = stop_gdf.reindex(
+        columns = ["schedule_gtfs_dataset_key", *col_order, "geometry"]
+    )
+
+    return stop_gdf
+
+
+if __name__ == "__main__":
+
+    from update_vars import analysis_date_list, RT_SCHED_GCS, GTFS_DATA_DICT
+
+    EXPORT_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_stop_metrics
+
+    for analysis_date in analysis_date_list:
+        start = datetime.datetime.now()
+
+        gdf = schedule_stats_by_stop(analysis_date)
+
+        utils.geoparquet_gcs_export(
+            gdf,
+            RT_SCHED_GCS,
+            f"{EXPORT_FILE}_{analysis_date}"
+        )
+
+        end = datetime.datetime.now()
+        print(f"schedule stop stats for {analysis_date}: {end - start}")

gtfs_funnel/track_publish_dates.py

Lines changed: 14 additions & 3 deletions
@@ -11,7 +11,7 @@
 from pathlib import Path
 from typing import Union

-from shared_utils import rt_dates
+from shared_utils import gtfs_utils_v2, rt_dates
 from segment_speed_utils import time_series_utils

 def filter_to_recent_date(df: pd.DataFrame) -> pd.DataFrame:
@@ -29,6 +29,7 @@ def filter_to_recent_date(df: pd.DataFrame) -> pd.DataFrame:
     )
     return df2

+
 def export_results_yml(
     df: pd.DataFrame,
     export_yaml: Union[str, Path]
@@ -41,18 +42,25 @@ def export_results_yml(
     # operator names that have more recent names that we are keeping,
     # so we can remove these from our yaml
     exclude_me = [
-        "TIME GMV"
+        "Flex",
     ]
+
+    df2 = df.copy()
+
+    for exclude_word in exclude_me:

-    df2 = df[~df.name.isin(exclude_me)]
+        df2 = df2[~df2.name.str.contains(exclude_word)]

+    # yaml export can have date as string,
+    # but yaml safe_load will automatically parse it as datetime again
     my_dict = {
         **{
             date_key: df2[df2.service_date==date_key].name.tolist()
             for date_key in df2.service_date.unique()
         }
     }

+
     # sort_keys=False to prevent alphabetical sort (earliest date first)
     # because we want to maintain our results and yaml with most recent date first
     output = pyaml.dump(my_dict, sort_keys=False)
@@ -73,12 +81,15 @@

     TABLE = GTFS_DATA_DICT.schedule_downloads.trips

+    public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys()
+
     operators = time_series_utils.concatenate_datasets_across_dates(
         COMPILED_CACHED_VIEWS,
         TABLE,
         rt_dates.y2024_dates + rt_dates.y2023_dates,
         data_type = "df",
         get_pandas = True,
+        filters = [[("gtfs_dataset_key", "in", public_feeds)]],
         columns = ["name"]
     ).drop_duplicates().pipe(filter_to_recent_date)

open_data/Makefile

Lines changed: 1 addition & 1 deletion
@@ -11,5 +11,5 @@ compile_open_data_portal:
	#python arcgis_script_pro.py #(in ESRI!)
	python update_data_dict.py # check if columns are missing in data_dictionary yml
	python update_fields_fgdc.py # populate fields with data dictionary yml values, run if update_data_dict had changes to incorporate
-	python open_data.py # go back into ESRI and update xml
+	python metadata_update_pro.py # go back into ESRI and update xml
	python cleanup.py # run after ESRI work done
