Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Digest ymls #1074

Merged
merged 5 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions gtfs_digest/deploy_portfolio_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from segment_speed_utils.project_vars import RT_SCHED_GCS

PORTFOLIO_SITE_YAML = Path("../portfolio/sites/gtfs_digest.yml")
PORTFOLIO_SITE_YAML = Path("../portfolio/sites/digest_typologies.yml")

def overwrite_yaml(portfolio_site_yaml: Path) -> list:
"""
Expand All @@ -29,7 +29,19 @@ def overwrite_yaml(portfolio_site_yaml: Path) -> list:

districts = sorted(list(df.caltrans_district.unique()))

operators = df.organization_name.tolist()
operators = df.organization_name.tolist()

keep_me = ["City of Santa Monica", "City of Culver City",
"Long Beach Transit", "Southern California Regional Rail Authority",
"Foothill Transit",
"Alameda-Contra Costa Transit District",
"City and County of San Francisco",
"Sonoma-Marin Area Rail Transit District",
"Marin County Transit District",
]
operators = [i for i in operators if i in keep_me]
districts = sorted(df[df.organization_name.isin(keep_me)].caltrans_district.unique())

# Eric's example
# https://github.com/cal-itp/data-analyses/blob/main/rt_delay/04_generate_all.ipynb

Expand Down
28 changes: 0 additions & 28 deletions gtfs_digest/extra_cleaning.py

This file was deleted.

45 changes: 14 additions & 31 deletions gtfs_digest/merge_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,54 +136,37 @@ def concatenate_crosswalk_organization(
def merge_in_standardized_route_names(
df: pd.DataFrame,
) -> pd.DataFrame:

keep_cols = [
"schedule_gtfs_dataset_key", "name",
"route_id", "service_date",
"recent_route_id2", "recent_combined_name"]
]

CLEAN_ROUTES = GTFS_DATA_DICT.schedule_tables.route_identification

operators_need_cleaning = pd.read_parquet(
f"{SCHED_GCS}{CLEAN_ROUTES}.parquet",
filters = [[("name", "in", operators_only_route_long_name)]]
)
operators_ok = pd.read_parquet(
f"{SCHED_GCS}{CLEAN_ROUTES}.parquet",
filters = [[("name", "not in", operators_only_route_long_name)]]
)
route_names_df = pd.read_parquet(f"{SCHED_GCS}{CLEAN_ROUTES}.parquet")

operators_need_cleaning = operators_need_cleaning.assign(
recent_combined_name = operators_need_cleaning.route_long_name
)
route_names_df = time_series_utils.clean_standardized_route_names(
route_names_df).drop_duplicates()

standardized_route_names = pd.concat([
operators_need_cleaning,
operators_ok
], axis=0, ignore_index=True)[keep_cols]

if "name" in df.columns:
df = df.drop(columns = "name")

df = pd.merge(
# Use `route_id` to merge to standardized_route_names
df2 = pd.merge(
df,
standardized_route_names,
route_names_df,
on = ["schedule_gtfs_dataset_key",
"route_id", "service_date"],
how = "left",
)

df = df.assign(
recent_combined_name = df.recent_combined_name.str.replace("__", " ")
).drop(
columns = ["route_id"]
).rename(
columns = {
"recent_route_id2": "route_id",
"recent_combined_name": "route_combined_name"
}
)
# After merging, we can replace route_id with recent_route_id2
drop_cols = ["route_desc", "combined_name", "route_id2"]
df3 = time_series_utils.parse_route_combined_name(df2).drop(
columns = drop_cols)

return df
return df3


if __name__ == "__main__":
Expand Down Expand Up @@ -237,7 +220,7 @@ def merge_in_standardized_route_names(
"is_early", "is_ontime", "is_late"
]

df[integrify] = df[integrify].astype("Int64")
df[integrify] = df[integrify].fillna(0).astype("int")

df.to_parquet(
f"{RT_SCHED_GCS}{DIGEST_RT_SCHED}.parquet"
Expand Down
66 changes: 66 additions & 0 deletions gtfs_digest/readable.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# GTFS schedule
direction_id:
readable: Direction
caption: Something
avg_scheduled_service_minutes: "Average Scheduled Service (trip minutes)"
avg_stop_miles: Average Stop Distance (miles)
ttl_service_hours: total service (hours)
n_scheduled_trips: "# scheduled trips"
frequency: Trips per Hour
total_scheduled_service_minutes: "Aggregate Scheduled Service Minutes (all trips)"
route_id: Route
route_combined_name: Route

# GTFS vehicle positions
minutes_atleast1_vp: "# minutes with 1+ vp per minute"
minutes_atleast2_vp: "# minutes with 2+ vp per minute"
total_rt_service_minutes: "Aggregate Actual Service Minutes (all trips)"
total_vp: "# vp"
vp_in_shape: "# vp within scheduled shape"
is_early: "# early arrival trips"
is_ontime: "# on-time trips"
is_late: "# late trips"
n_vp_trips: "# trips with vp"
vp_per_minute: "Average vp per minute"
pct_in_shape: "% vp within scheduled shape"
pct_rt_journey_atleast1_vp: "% actual trip minutes with 1+ vp per minute"
pct_rt_journey_atleast2_vp: "% actual trip minutes with 2+ vp per minute"
pct_sched_journey_atleast1_vp: "% scheduled trip minutes with 1+ vp per minute"
pct_sched_journey_atleast2_vp: "% scheduled trip minutes with 2+ vp per minute"
rt_sched_journey_ratio: "Actual / scheduled service ratio"
avg_rt_service_minutes: "Average Actual Service (trip minutes)"
speed_mph: "Speed (mph)"
sched_rt_category: "GTFS Availability"

# Operators
organization_source_record_id: "Organization ID"
organization_name: Organization
name: Transit Operator
caltrans_district: District
base64_url: "base64 encoded feed URL"
operator_n_routes: "# routes"
operator_n_trips: "# trips"
operator_n_shapes: "# shapes"
operator_n_stops: "# stops"
operator_n_arrivals: "# arrivals"
operator_route_length_miles: "service miles"
operator_arrivals_per_stop: "avg arrivals per stop"
n_coverage_routes: "# coverage route types"
n_downtown_local_routes: "# downtown local route types"
n_local_routes: "# local route types"
n_rapid_routes: "# rapid route types"


# Dates / time
time_period: Period
service_date: Date
time_of_day: "time of day"
day_name: "Day of Week"
year: Year
month: Month
year_month: "Month - Year"

# Roads
road_freq_category: "Road Frequency Category"
road_typology: "Road Typology Category"
pct_typology: "% route miles with typology"
31 changes: 27 additions & 4 deletions gtfs_funnel/concatenate_monthly_scheduled_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@

MONTHLY_SERVICE = GTFS_DATA_DICT.schedule_tables.monthly_scheduled_service
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

ROUTES = GTFS_DATA_DICT.schedule_tables.route_identification

year_list = [2023, 2024]
analysis_date_list = (rt_dates.y2024_dates +
rt_dates.y2023_dates +
Expand Down Expand Up @@ -47,18 +48,40 @@
"schedule_source_record_id",
"schedule_gtfs_dataset_key",
"organization_source_record_id", "organization_name",
],
]
).drop(
columns = "service_date"
).drop_duplicates().reset_index(drop=True)
).drop_duplicates().reset_index(drop=True)

# Get standardized route names and clean up more
standardized_routes = pd.read_parquet(f"{SCHED_GCS}{ROUTES}.parquet")

route_names_df = time_series_utils.clean_standardized_route_names(
standardized_routes).pipe(
time_series_utils.parse_route_combined_name
)[
["schedule_gtfs_dataset_key",
"route_long_name", "route_short_name",
"route_id", "route_combined_name"]
].drop_duplicates()

# Merge monthly service with crosswalk to get schedule_gtfs_dataset_key
df2 = pd.merge(
df,
crosswalk,
on = "schedule_source_record_id",
how = "inner",
)

df2.to_parquet(
# Merge in route_names so we use the standardized/cleaned up route names
df3 = pd.merge(
df2,
route_names_df,
on = ["schedule_gtfs_dataset_key",
"route_long_name", "route_short_name"],
how = "left",
)

df3.to_parquet(
f"{SCHED_GCS}{MONTHLY_SERVICE}.parquet"
)
5 changes: 5 additions & 0 deletions portfolio/digest_typologies/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# GTFS Digest

Performance metrics from GTFS schedule and vehicle positions time-series data for all transit operators by route.

[![digest_mermaid](https://mermaid.ink/img/pako:eNqVk1Fv2jAQx7-K5QnlJUGEhAKeNKkU2NOeWu1hZIpMfAGrjh3Zl7WA-O5zknXQba20PDin-_98Z9-dT7QwAiijURRlGiUqYOTzw_qeLOUOHGa6EwaDU6YJkVoiI51JSIB7qCBgJNhyB0F47f3KreRbBS74jXuptrLi9nBnlLHtvg-ryXqyvn3ZeiEe4Bkv1Gg0-htZGCvAvgUpqeEtzUFhtHh9jvV6ulpcMQgW5SukLMugl8_tzy_nwSDTmS6VeSr23CJ5WPSAa7Y7y-s9saZBiIS0UKA0ulf7tVDcuSWURJSklEqxX7X4Q3WFlTW-EN0pr6O03-3GFXsQjQJSAVpZuO-MMR82ij4R0TXx44VebFxTtfUjrgYQ_2CvQt9tLOY_XP4_CZabHZYuf4RDjiY3dse1PPL2-qSwxrknrh7f27_aOORte4Q8gsi7EuaaV_Bu1t4CLS4uGtIKbMWl8NPdzWBGu9nMKPOmgJI3CjPqG-lR3qC5P-iCMrQNhLSpBUdYSu47WVFWcuW8F4REY7_0L6Z7OCGtuabsRJ8pi2bjYTJPxvM0mc6S-ThOQ3qgLLkZxulklt6kaTwZzdL4HNKjMT5qPBzHfjZH8TRJPJ1O0y7ct07sUp5_AvaOGGY?type=png)](https://mermaid.live/edit#pako:eNqVk1Fv2jAQx7-K5QnlJUGEhAKeNKkU2NOeWu1hZIpMfAGrjh3Zl7WA-O5zknXQba20PDin-_98Z9-dT7QwAiijURRlGiUqYOTzw_qeLOUOHGa6EwaDU6YJkVoiI51JSIB7qCBgJNhyB0F47f3KreRbBS74jXuptrLi9nBnlLHtvg-ryXqyvn3ZeiEe4Bkv1Gg0-htZGCvAvgUpqeEtzUFhtHh9jvV6ulpcMQgW5SukLMugl8_tzy_nwSDTmS6VeSr23CJ5WPSAa7Y7y-s9saZBiIS0UKA0ulf7tVDcuSWURJSklEqxX7X4Q3WFlTW-EN0pr6O03-3GFXsQjQJSAVpZuO-MMR82ij4R0TXx44VebFxTtfUjrgYQ_2CvQt9tLOY_XP4_CZabHZYuf4RDjiY3dse1PPL2-qSwxrknrh7f27_aOORte4Q8gsi7EuaaV_Bu1t4CLS4uGtIKbMWl8NPdzWBGu9nMKPOmgJI3CjPqG-lR3qC5P-iCMrQNhLSpBUdYSu47WVFWcuW8F4REY7_0L6Z7OCGtuabsRJ8pi2bjYTJPxvM0mc6S-ThOQ3qgLLkZxulklt6kaTwZzdL4HNKjMT5qPBzHfjZH8TRJPJ1O0y7ct07sUp5_AvaOGGY)
43 changes: 43 additions & 0 deletions portfolio/digest_typologies/_config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Book settings
# Learn more at https://jupyterbook.org/customize/config.html

title: Transit Operators and Route Typologies
author: Cal-ITP
copyright: "2024"
#logo: calitp_logo_MAIN.png

# Force re-execution of notebooks on each build.
# See https://jupyterbook.org/content/execute.html
execute:
  execute_notebooks: 'off'
  allow_errors: false
  timeout: -1

# Define the name of the latex output file for PDF builds
latex:
  latex_documents:
    targetname: book.tex

launch_buttons:
  binderhub_url: "https://mybinder.org"
  jupyterhub_url: "https://hubtest.k8s.calitp.jarv.us"
  thebe: true

repository:
  url: https://github.com/cal-itp/data-analyses/  # Online location of your book
  # path_to_book: docs  # Optional path to your book, relative to the repository root
  path_to_book: gtfs_digest
  branch: main  # Which branch of the repository should be used when creating links (optional)

# Add GitHub buttons to your book
# See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository
html:
  use_issues_button: true
  use_repository_button: true
  use_edit_page_button: true
  google_analytics_id: 'G-JCX3Z8JZJC'

sphinx:
  config:
    html_js_files:
    - https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js
11 changes: 11 additions & 0 deletions portfolio/digest_typologies/_toc.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
format: jb-book
parts:
- caption: null
  chapters:
  - file: district_04-oakland.md
    sections:
    - glob: district_04-oakland/*
  - file: district_07-los-angeles.md
    sections:
    - glob: district_07-los-angeles/*
root: README
1 change: 1 addition & 0 deletions portfolio/digest_typologies/district_04-oakland.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# District 04 - Oakland
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
1 change: 1 addition & 0 deletions portfolio/digest_typologies/district_07-los-angeles.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# District 07 - Los Angeles
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Loading
Loading