Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GTFS Digest Portfolio #1149

Merged
merged 3 commits into from
Jun 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 0 additions & 219 deletions gtfs_digest/_section2_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,225 +36,6 @@
Schedule_vp_metrics
Functions
"""
def load_most_current_date() -> str:
    """
    Return the value of the last entry in rt_dates.DATES.

    "Last" means last in the dictionary's insertion order, so this is
    only the most current date if DATES is maintained chronologically.
    Raises IndexError if DATES is empty.
    """
    all_dates = list(rt_dates.DATES.values())
    return all_dates[-1]

def load_scheduled_stop_times(date: str, gtfs_schedule_key: list) -> pd.DataFrame:
    """
    Import scheduled stop times for a single date, restricted to the
    given schedule_gtfs_dataset_key values.

    Only the columns listed below are requested, and a `service_date`
    column holding `date` is added to the result.
    """
    wanted_cols = [
        "feed_key",
        "stop_id",
        "stop_sequence",
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
        "shape_array_key",
        "stop_name",
        "prior_stop_sequence",
        "subseq_stop_sequence",
        "stop_pair",
        "stop_pair_name",
        "stop_primary_direction",
        "stop_meters",
    ]
    operator_filter = [[("schedule_gtfs_dataset_key", "in", gtfs_schedule_key)]]

    stop_times = helpers.import_scheduled_stop_times(
        date,
        filters=operator_filter,
        columns=wanted_cols,
        get_pandas=True,
        with_direction=True,
    )

    # Tag every row with the date it was loaded for so that several
    # dates can be stacked later.
    stop_times["service_date"] = date
    return stop_times

def load_scheduled_trips(date: str, gtfs_schedule_key: list) -> pd.DataFrame:
    """
    Import scheduled trips for a single date, restricted to the given
    gtfs_dataset_key values.

    Only the columns listed below are requested, and a `service_date`
    column holding `date` is added to the result.

    NOTE(review): find_most_common_dir() merges this output on
    `schedule_gtfs_dataset_key`, which is not requested here --
    presumably helpers.import_scheduled_trips renames the key column;
    confirm against the helper.
    """
    wanted_cols = [
        "route_id",
        "trip_instance_key",
        "gtfs_dataset_key",
        "shape_array_key",
        "direction_id",
        "route_long_name",
        "route_short_name",
        "route_desc",
        "name"
    ]
    operator_filter = [[("gtfs_dataset_key", "in", gtfs_schedule_key)]]

    trips = helpers.import_scheduled_trips(
        date,
        filters=operator_filter,
        columns=wanted_cols,
    )

    # Tag every row with the date it was loaded for so that several
    # dates can be stacked later.
    trips["service_date"] = date
    return trips


def find_most_common_dir(
    scheduled_trips_df: pd.DataFrame,
    scheduled_stop_times_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    For every route_id / schedule_gtfs_dataset_key / direction_id /
    service_date, keep the stop_primary_direction with the most stop rows.

    Takes the outputs of load_scheduled_trips() and
    load_scheduled_stop_times().

    NOTE(review): the merge goes through `delayed` (presumably
    dask.delayed), so the object returned here is lazy rather than a
    concrete DataFrame -- callers are expected to .compute() it.
    """
    # Keys that identify one direction of one route on one date.
    route_dir_cols = [
        "route_id",
        "schedule_gtfs_dataset_key",
        "direction_id",
        "service_date",
    ]

    trips_with_stops = delayed(pd.merge)(
        scheduled_trips_df,
        scheduled_stop_times_df,
        on=[
            "trip_instance_key",
            "schedule_gtfs_dataset_key",
            "shape_array_key",
            "service_date",
        ],
        how="inner",
    )

    # Count stop rows for each cardinal direction within a route-direction.
    stop_counts = (
        trips_with_stops.groupby(
            [
                "route_id",
                "schedule_gtfs_dataset_key",
                "direction_id",
                "stop_primary_direction",
                "service_date",
            ]
        )
        .agg({"stop_sequence": "count"})
        .reset_index()
        .rename(columns={"stop_sequence": "total_stops"})
    )

    # Sorting total_stops descending inside each group puts the most
    # common stop_primary_direction first, so drop_duplicates keeps it.
    most_common = (
        stop_counts.sort_values(
            by=route_dir_cols + ["total_stops"],
            ascending=[True] * len(route_dir_cols) + [False],
        )
        .drop_duplicates(subset=route_dir_cols)
        .reset_index(drop=True)
    )

    return most_common.drop(columns=["total_stops"])


def most_recent_route_info(
    df: pd.DataFrame,
    group_cols: list,
    route_col: str
) -> pd.DataFrame:
    """
    Find the most recent value across a grouping.
    Ex: if we group by route_id, we can find the most recent
    value for route_long_name.

    Needs a `service_date` column to work.

    Args:
        df: rows to search; must contain `group_cols`, `service_date`
            and `route_col`.
        group_cols: columns that define one group.
        route_col: column whose most recent value is wanted.

    Returns:
        One row per unique combination of `group_cols` -- the row with
        the latest `service_date` -- with `route_col` renamed to
        `recent_{route_col}`. Every other column is carried along
        unchanged from that row. This is the de-duplicated frame, not
        `df` with an extra column merged on.
    """
    # Groups ascending, dates descending: the newest row leads each
    # group and is the one drop_duplicates keeps.
    sort_flags = [True] * len(group_cols) + [False]

    most_recent = (
        df.sort_values(group_cols + ["service_date"], ascending=sort_flags)
        .drop_duplicates(subset=group_cols)
        .rename(columns={route_col: f"recent_{route_col}"})
    )

    # A left merge of `df` against `most_recent` used to be built here
    # and then discarded; it never affected the return value, so the
    # dead step is gone.
    return most_recent

def find_most_recent_route_id(df):
    """
    Map each route_id to the most recent standardized route id.

    Steps:
    1. Fill missing route_id / route_short_name / route_long_name with "".
    2. Build `combined_name` as "<route_short_name>__<route_long_name>".
    3. Build `route_id2` by running
       gtfs_schedule_wrangling.standardize_route_id over every row.
    4. Take the most recent combined_name per route_id2, then the most
       recent route_id2 per combined name, via most_recent_route_info().

    Returns only schedule_gtfs_dataset_key, route_id, service_date and
    recent_route_id2.
    """
    filled = df.assign(
        **{
            col: df[col].fillna("")
            for col in ("route_id", "route_short_name", "route_long_name")
        }
    )
    named = filled.assign(
        combined_name=filled["route_short_name"] + "__" + filled["route_long_name"]
    )

    def _standardized_id(row):
        return gtfs_schedule_wrangling.standardize_route_id(row, "name", "route_id")

    with_std_id = named.assign(route_id2=named.apply(_standardized_id, axis=1))

    # First pass: latest combined name for each standardized route id.
    latest_name = most_recent_route_info(
        with_std_id,
        group_cols=["schedule_gtfs_dataset_key", "name", "route_id2"],
        route_col="combined_name",
    )
    # Second pass: latest standardized route id for each of those names.
    latest_id = most_recent_route_info(
        latest_name,
        group_cols=["schedule_gtfs_dataset_key", "name", "recent_combined_name"],
        route_col="route_id2",
    )

    return latest_id[
        ["schedule_gtfs_dataset_key", "route_id", "service_date", "recent_route_id2"]
    ]

def find_cardinal_direction(date:str, gtfs_schedule_keys: list) -> pd.DataFrame:
    """
    Build, for one service date, the most common stop_primary_direction
    per route direction, keyed by the most recent standardized route id.

    Args:
        date: service date to load.
        gtfs_schedule_keys: dataset keys used to filter both the
            scheduled trips and the scheduled stop times.

    Returns:
        A lazy (delayed) result, despite the annotation: nothing is
        loaded or merged until the caller invokes .compute() on it, as
        all_dates_cardinal_dir() does. `route_id` is dropped from the
        output; `recent_route_id2` identifies the route instead.
    """
    # Wrap the loader *functions*, not their results, so that loading is
    # deferred along with the rest of the graph instead of running here.
    scheduled_trips_dd = delayed(load_scheduled_trips)(date, gtfs_schedule_keys)
    scheduled_stops_dd = delayed(load_scheduled_stop_times)(date, gtfs_schedule_keys)

    # Find the most common direction for this Route ID
    common_stops_dd = find_most_common_dir(scheduled_trips_dd, scheduled_stops_dd)

    # Find the most recent Route ID to connect back to sched_vp_df
    recent_ids_dd = find_most_recent_route_id(scheduled_trips_dd)

    merged_dd = delayed(pd.merge)(
        common_stops_dd,
        recent_ids_dd,
        on=["schedule_gtfs_dataset_key", "route_id", "service_date"],
        how="inner",
    )

    return merged_dd.drop(columns=["route_id"])

def all_dates_cardinal_dir(dates:list, gtfs_schedule_keys:list)->pd.DataFrame:
    """
    Stack the find_cardinal_direction() results for several service dates.

    Args:
        dates: service dates to process, one find_cardinal_direction()
            call (and .compute()) per date.
        gtfs_schedule_keys: dataset keys passed through to each call.

    Returns:
        All dates concatenated row-wise, limited to
        schedule_gtfs_dataset_key, direction_id, recent_route_id2,
        stop_primary_direction and service_date. If `dates` is empty,
        an empty DataFrame with those columns is returned.
    """
    to_keep = ["schedule_gtfs_dataset_key",
               "direction_id",
               "recent_route_id2",
               "stop_primary_direction",
               "service_date"]

    per_date = [
        find_cardinal_direction(date, gtfs_schedule_keys).compute()
        for date in dates
    ]

    # No dates means nothing to concatenate; return the expected columns
    # rather than failing on the column selection below.
    if not per_date:
        return pd.DataFrame(columns=to_keep)

    # A single concat at the end avoids re-copying the accumulated rows
    # once per date.
    return pd.concat(per_date, axis=0)[to_keep]

def load_schedule_vp_metrics(organization:str)->pd.DataFrame:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

Expand Down
25 changes: 0 additions & 25 deletions portfolio/gtfs_digest_testing/README.md

This file was deleted.

2 changes: 1 addition & 1 deletion portfolio/gtfs_digest_testing/_config.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Book settings
# Learn more at https://jupyterbook.org/customize/config.html

title: GTFS Digest
title: GTFS Digest TEST
author: Cal-ITP
copyright: "2024"
#logo: calitp_logo_MAIN.png
Expand Down
1 change: 0 additions & 1 deletion portfolio/gtfs_digest_testing/district_01-eureka.md

This file was deleted.

1 change: 0 additions & 1 deletion portfolio/gtfs_digest_testing/district_02-redding.md

This file was deleted.

1 change: 0 additions & 1 deletion portfolio/gtfs_digest_testing/district_03-marysville.md

This file was deleted.

1 change: 0 additions & 1 deletion portfolio/gtfs_digest_testing/district_04-oakland.md

This file was deleted.

Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown

This file was deleted.

Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown

This file was deleted.

This file was deleted.

1 change: 0 additions & 1 deletion portfolio/gtfs_digest_testing/district_07-los-angeles.md

This file was deleted.

19 changes: 16 additions & 3 deletions portfolio/sites/gtfs_digest_testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,22 @@ parts:
params:
district: 04 - Oakland
sections:
- organization_name: Alameda-Contra Costa Transit District
- organization_name: Capitol Corridor Joint Powers Authority
- organization_name: Central Contra Costa Transit Authority
- organization_name: City and County of San Francisco
- organization_name: City of Fairfield
- organization_name: City of Menlo Park
- organization_name: City of Petaluma
- organization_name: City of Rio Vista
- organization_name: City of Santa Rosa
- organization_name: City of South San Francisco
- organization_name: City of Union City
- organization_name: City of Vacaville
- organization_name: Marin County Transit District
- organization_name: Napa Valley Transportation Authority
- organization_name: Peninsula Corridor Joint Powers Board
- organization_name: Presidio Trust
- organization_name: San Francisco Bay Area Rapid Transit District
- organization_name: Alameda-Contra Costa Transit District

readme: ./gtfs_digest/README.md
title: GTFS Digest
title: GTFS Digest TEST
Loading