From ecc7159ef2f55865032b2eabf4fba2aecd249de3 Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Mon, 30 Sep 2024 22:09:32 +0000 Subject: [PATCH 01/10] schedule stop metrics, backfill all dates --- .../shared_utils/gtfs_analytics_data.yml | 2 + gtfs_funnel/Makefile | 3 +- gtfs_funnel/schedule_stats_by_stop.py | 140 ++++++++++++++++++ 3 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 gtfs_funnel/schedule_stats_by_stop.py diff --git a/_shared_utils/shared_utils/gtfs_analytics_data.yml b/_shared_utils/shared_utils/gtfs_analytics_data.yml index e8ab2cc9a..ea7561ded 100644 --- a/_shared_utils/shared_utils/gtfs_analytics_data.yml +++ b/_shared_utils/shared_utils/gtfs_analytics_data.yml @@ -53,6 +53,8 @@ rt_vs_schedule_tables: vp_trip_metrics: "vp_trip/trip_metrics" vp_route_direction_metrics: "vp_route_dir/route_direction_metrics" vp_operator_metrics: "vp_operator/operator_metrics" + sched_stop_metrics: "schedule_stop/schedule_stop_metrics" + #vp_stop_metrics: "vp_stop/vp_stop_metrics" # WIP: transit bunching schedule_rt_stop_times: "schedule_rt_stop_times" early_trip_minutes: -5 late_trip_minutes: 5 diff --git a/gtfs_funnel/Makefile b/gtfs_funnel/Makefile index 6fb981b36..7e07bdfd5 100644 --- a/gtfs_funnel/Makefile +++ b/gtfs_funnel/Makefile @@ -21,7 +21,8 @@ preprocess_vp: preprocess_schedule_only: make route_typologies_data python operator_scheduled_stats.py - + python schedule_stats_by_stop.py + route_typologies_data: python route_typologies.py python schedule_stats_by_route_direction.py diff --git a/gtfs_funnel/schedule_stats_by_stop.py b/gtfs_funnel/schedule_stats_by_stop.py new file mode 100644 index 000000000..55d0f3d11 --- /dev/null +++ b/gtfs_funnel/schedule_stats_by_stop.py @@ -0,0 +1,140 @@ +""" +Add some GTFS schedule derived metrics +by stop (arrivals, number of trips/routes served, +service hours). + +This is stop grain version of schedule_stats_by_route_direction. 
+Grain: schedule_gtfs_dataset_key-stop_id +""" +import datetime +import geopandas as gpd +import pandas as pd + +from calitp_data_analysis.geography_utils import WGS84 +from calitp_data_analysis import utils +from segment_speed_utils import helpers + +def stats_for_stop( + df: pd.DataFrame, + group_cols: list +) -> pd.DataFrame: + """ + List the stats we'd like to calculate for each stop. + """ + df2 = ( + df + .groupby(group_cols, group_keys=False) + .agg({ + "trip_id": "nunique", + "route_id": "nunique", + "route_type": lambda x: list(sorted(set(x))), + "departure_sec": "count", + "departure_hour": "nunique" + }).reset_index() + .rename(columns = { + "departure_sec": "n_arrivals", + "departure_hour": "n_hours_in_service", + "trip_id": "n_trips", + "route_id": "n_routes", + "route_type": "route_types_served" + }) + ) + + # Instead of producing list, we want to show values like 0, 3 instead of [0, 3] + # portal users can see combinations more quickly + # and access particular rows using str.contains + df2 = df2.assign( + route_types_served = df2.route_types_served.str.join(", ") + ) + + return df2 + + +def schedule_stats_by_stop( + analysis_date: str +) -> gpd.GeoDataFrame: + """ + Import stop_times, trips, and stops. + Merge and aggregate for stop-level schedule stats. + + Calculate some extra stats from other schedule tables, + such as how many route_ids and route_types the + stop shares. 
+ """ + # departure hour nunique values can let us know span of service + stop_times = helpers.import_scheduled_stop_times( + analysis_date, + columns = ["feed_key", "stop_id", "trip_id", + "departure_sec", "departure_hour"], + with_direction = False, + get_pandas = True + ) + + # include route info so we know how many trips, routes, + # route_types that the stop serves + # stop can serve 1 light rail + 5 bus routes vs 6 bus routes + trips = helpers.import_scheduled_trips( + analysis_date, + columns = ["gtfs_dataset_key", "feed_key", + "trip_id", + "route_id", "route_type"], + get_pandas = True, + ) + + stops = helpers.import_scheduled_stops( + analysis_date, + columns = ["feed_key", "stop_id", "stop_name", "geometry"], + get_pandas = True, + crs = WGS84 + ) + + stop_df = pd.merge( + stop_times, + trips, + on = ["feed_key", "trip_id"], + how = "inner" + ).pipe( + stats_for_stop, + group_cols = ["schedule_gtfs_dataset_key", "feed_key", "stop_id"] + ) + + + stop_gdf = pd.merge( + stops, + stop_df, + on = ["feed_key", "stop_id"], + how = "inner" + ).drop(columns = "feed_key") + + # Fix order of columns + col_order = [ + c for c in stop_gdf.columns + if c not in ["schedule_gtfs_dataset_key", "geometry"] + ] + + stop_gdf = stop_gdf.reindex( + columns = ["schedule_gtfs_dataset_key", *col_order, "geometry"] + ) + + return stop_gdf + + +if __name__ == "__main__": + + from update_vars import analysis_date_list, RT_SCHED_GCS, GTFS_DATA_DICT + + EXPORT_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_stop_metrics + + for analysis_date in analysis_date_list: + start = datetime.datetime.now() + + gdf = schedule_stats_by_stop(analysis_date) + + utils.geoparquet_gcs_export( + gdf, + RT_SCHED_GCS, + f"{EXPORT_FILE}_{analysis_date}" + ) + + end = datetime.datetime.now() + print(f"schedule stop stats for {analysis_date}: {end - start}") \ No newline at end of file From b71ead7ecfa75ef3c23dac41cecea952351a407d Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Mon, 30 Sep 2024 
22:51:28 +0000 Subject: [PATCH 02/10] remove flex and private datasets from published_operators.yml' --- gtfs_funnel/published_operators.yml | 19 ------------------- gtfs_funnel/track_publish_dates.py | 17 ++++++++++++++--- 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/gtfs_funnel/published_operators.yml b/gtfs_funnel/published_operators.yml index a29fa245e..7a8ce9f2e 100644 --- a/gtfs_funnel/published_operators.yml +++ b/gtfs_funnel/published_operators.yml @@ -2,7 +2,6 @@ - Alhambra Schedule - Amador Schedule - Anaheim Resort Schedule - - Anaheim Resort Schedule v2 - Antelope Valley Transit Authority Schedule - Arcadia Schedule - Arvin Schedule @@ -51,7 +50,6 @@ - Bell Gardens Schedule - Bellflower Bus Schedule - Big Blue Bus Schedule - - Big Blue Bus Swiftly Schedule - BruinBus Schedule - Burbank Schedule - Calabasas Schedule @@ -193,7 +191,6 @@ - Santa Cruz Schedule 2024-06-12: - Anteater Express Schedule - - Lassen Flex - Lynwood Schedule - Manteca Schedule 2024-05-22: @@ -207,29 +204,13 @@ - Rosemead Schedule 2023-12-13: - DowneyLINK Schedule - - Humboldt Flex - - Laguna Beach Flex - - Manteca Flex - - Placer Flex - - San Joaquin Flex - Spirit Bus Schedule - - StanRTA Flex - - TART Flex - - Thousand Oaks Flex - - Tracy Flex - - Turlock Flex - - Union City Flex - - VCTC Flex - - WestCAT Flex 2023-11-15: - Amtrak Schedule - Mission Bay Schedule 2023-08-15: - Blossom Express Schedule - - Eastern Sierra Flex 2023-06-14: - Tuolumne Schedule -2023-04-12: - - Guadalupe Flex 2023-03-15: - TIME GMV Schedule diff --git a/gtfs_funnel/track_publish_dates.py b/gtfs_funnel/track_publish_dates.py index 4cf6d90e1..49d7fb962 100644 --- a/gtfs_funnel/track_publish_dates.py +++ b/gtfs_funnel/track_publish_dates.py @@ -11,7 +11,7 @@ from pathlib import Path from typing import Union -from shared_utils import rt_dates +from shared_utils import gtfs_utils_v2, rt_dates from segment_speed_utils import time_series_utils def filter_to_recent_date(df: pd.DataFrame) -> 
pd.DataFrame: @@ -29,6 +29,7 @@ def filter_to_recent_date(df: pd.DataFrame) -> pd.DataFrame: ) return df2 + def export_results_yml( df: pd.DataFrame, export_yaml: Union[str, Path] @@ -41,11 +42,17 @@ def export_results_yml( # operator names that have more recent names that we are keeping, # so we can remove these from our yaml exclude_me = [ - "TIME GMV" + "Flex", ] + + df2 = df.copy() + + for exclude_word in exclude_me: - df2 = df[~df.name.isin(exclude_me)] + df2 = df2[~df2.name.str.contains(exclude_word)] + # yaml export can have date as string + # but yaml safe_load will automatically parse as datetime again my_dict = { **{ date_key: df2[df2.service_date==date_key].name.tolist() @@ -53,6 +60,7 @@ def export_results_yml( } } + # sort_keys=False to prevent alphabetical sort (earliest date first) # because we want to main our results and yaml with most recent date first output = pyaml.dump(my_dict, sort_keys=False) @@ -73,12 +81,15 @@ def export_results_yml( TABLE = GTFS_DATA_DICT.schedule_downloads.trips + public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys() + operators = time_series_utils.concatenate_datasets_across_dates( COMPILED_CACHED_VIEWS, TABLE, rt_dates.y2024_dates + rt_dates.y2023_dates, data_type = "df", get_pandas = True, + filters = [[("gtfs_dataset_key", "in", public_feeds)]], columns = ["name"] ).drop_duplicates().pipe(filter_to_recent_date) From b5a3e34c3ad3596e04391f9a594ea071a2c2b052 Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Mon, 30 Sep 2024 22:52:44 +0000 Subject: [PATCH 03/10] deprecate old config.yml function --- _shared_utils/shared_utils/catalog_utils.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/_shared_utils/shared_utils/catalog_utils.py b/_shared_utils/shared_utils/catalog_utils.py index bb8284f06..df24ac7ec 100644 --- a/_shared_utils/shared_utils/catalog_utils.py +++ b/_shared_utils/shared_utils/catalog_utils.py @@ -5,7 +5,6 @@ from typing import Literal import intake -import yaml 
from omegaconf import OmegaConf # this is yaml parser repo_name = "data-analyses/" @@ -22,20 +21,3 @@ def get_catalog(catalog_name: Literal["shared_data_catalog", "gtfs_analytics_dat else: return intake.open_catalog(catalog_path) - - -def get_parameters(config_file: str, key: str) -> dict: - """ - Parse the config.yml file to get the parameters needed - for working with route or stop segments. - These parameters will be passed through the scripts when working - with vehicle position data. - - Returns a dictionary of parameters. - """ - # https://aaltoscicomp.github.io/python-for-scicomp/scripts/ - with open(config_file) as f: - my_dict = yaml.safe_load(f) - params_dict = my_dict[key] - - return params_dict From 8ab1c751ea8f08f5ecf04e95829e3ace062bcb1f Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Thu, 3 Oct 2024 18:45:25 +0000 Subject: [PATCH 04/10] add publish_utils for patching in previous dates and test on stops file --- _shared_utils/setup.py | 2 +- _shared_utils/shared_utils/publish_utils.py | 32 +++- open_data/create_stops_data.py | 178 ++++++++++---------- open_data/prep_traffic_ops.py | 61 +++---- open_data/update_vars.py | 15 +- 5 files changed, 152 insertions(+), 136 deletions(-) diff --git a/_shared_utils/setup.py b/_shared_utils/setup.py index 5173807d8..efb6c97c2 100644 --- a/_shared_utils/setup.py +++ b/_shared_utils/setup.py @@ -4,7 +4,7 @@ setup( name="shared_utils", packages=find_packages(), - version="2.6", + version="2.7", description="Shared utility functions for data analyses", author="Cal-ITP", license="Apache", diff --git a/_shared_utils/shared_utils/publish_utils.py b/_shared_utils/shared_utils/publish_utils.py index deb9e0697..eecc70679 100644 --- a/_shared_utils/shared_utils/publish_utils.py +++ b/_shared_utils/shared_utils/publish_utils.py @@ -1,12 +1,16 @@ import os from pathlib import Path -from typing import Union +from typing import Literal, Union import gcsfs +import geopandas as gpd import pandas as pd +from shared_utils import 
catalog_utils fs = gcsfs.GCSFileSystem() +SCHED_GCS = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/" PUBLIC_BUCKET = "gs://calitp-publish-data-analysis/" +GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") def write_to_public_gcs( @@ -59,3 +63,29 @@ def exclude_private_datasets( Filter out private datasets. """ return df[df[col].isin(public_gtfs_dataset_keys)].reset_index(drop=True) + + +def subset_table_from_previous_date( + gcs_bucket: str, + filename: Union[str, Path], + operator_and_dates_dict: dict, + date: str, + crosswalk_col: str = "schedule_gtfs_dataset_key", + data_type: Literal["df", "gdf"] = "df", +) -> pd.DataFrame: + CROSSWALK_FILE = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk + + crosswalk = pd.read_parquet(f"{SCHED_GCS}{CROSSWALK_FILE}_{date}.parquet", columns=["name", crosswalk_col]) + + subset_keys = crosswalk[crosswalk.name.isin(operator_and_dates_dict[date])][crosswalk_col].unique() + + if data_type == "df": + past_df = pd.read_parquet( + f"{gcs_bucket}{filename}_{date}.parquet", filters=[[(crosswalk_col, "in", subset_keys)]] + ) + else: + past_df = gpd.read_parquet( + f"{gcs_bucket}{filename}_{date}.parquet", filters=[[(crosswalk_col, "in", subset_keys)]] + ) + + return past_df diff --git a/open_data/create_stops_data.py b/open_data/create_stops_data.py index aa43e3442..fb3d102a8 100644 --- a/open_data/create_stops_data.py +++ b/open_data/create_stops_data.py @@ -2,81 +2,107 @@ Create stops file with identifiers including route_id, route_name, agency_id, agency_name. 
""" +import datetime import geopandas as gpd import pandas as pd - -from datetime import datetime +import yaml import prep_traffic_ops -from calitp_data_analysis import utils, geography_utils -from shared_utils import gtfs_utils_v2, schedule_rt_utils -from segment_speed_utils import helpers -from update_vars import analysis_date, TRAFFIC_OPS_GCS +from calitp_data_analysis import utils +from shared_utils import publish_utils +from update_vars import (analysis_date, + GTFS_DATA_DICT, + TRAFFIC_OPS_GCS, + RT_SCHED_GCS, SCHED_GCS + ) - -def attach_route_info_to_stops( - stops: gpd.GeoDataFrame, - trips: pd.DataFrame, - stop_times: pd.DataFrame +def create_stops_file_for_export( + date: str, ) -> gpd.GeoDataFrame: """ - Attach all the various route information (route_id, route_type) - to the stops file. - """ - # In stop_times table, find the trip ids that are present that day - # then find unique stop_ids present that day - trip_cols = ["feed_key", "name", - "trip_id", - "route_id", "route_type", - ] - - stops_with_route_info = ( - pd.merge( - stop_times, - trips[trip_cols], - on = ["feed_key", "trip_id"] - ).drop_duplicates(subset=["feed_key", "stop_id", - "route_id", "route_type"]) - .drop(columns = "trip_id") - .reset_index(drop=True) + Read in scheduled stop metrics table and attach crosswalk + info related to organization for Geoportal. 
+ """ + time0 = datetime.datetime.now() + + # Read in parquets + STOP_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_stop_metrics + + stops = gpd.read_parquet( + f"{RT_SCHED_GCS}{STOP_FILE}_{date}.parquet" ) - stops_with_geom = pd.merge( - stops, - stops_with_route_info, - on = ["feed_key", "stop_id"], - how = "inner" - ) - - stops_assembled = (stops_with_geom - .sort_values(["name", "route_id", "stop_id"]) - .reset_index(drop=True) - ) + stops2 = prep_traffic_ops.standardize_operator_info_for_exports(stops, date) + + time1 = datetime.datetime.now() + print(f"get stops for date: {time1 - time0}") - stops_assembled2 = prep_traffic_ops.clip_to_usa(stops_assembled) + return stops2 + + +def patch_previous_dates( + current_stops: gpd.GeoDataFrame, + current_date: str, + published_operators_yaml: str = "../gtfs_funnel/published_operators.yml" +) -> gpd.GeoDataFrame: + """ + Compare to the yaml for what operators we want, and + patch in previous dates for the 10 or so operators + that do not have data for this current date. 
+ """ + with open(published_operators_yaml) as f: + published_operators_dict = yaml.safe_load(f) - stops_assembled3 = prep_traffic_ops.standardize_operator_info_for_exports( - stops_assembled2, analysis_date) + patch_operators_dict = { + str(date): operator_list for + date, operator_list in published_operators_dict.items() + if str(date) != current_date + } - return stops_assembled3 + partial_dfs = [] + + STOP_FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_stop_metrics + for one_date, operator_list in patch_operators_dict.items(): + df_to_add = publish_utils.subset_table_from_previous_date( + gcs_bucket = RT_SCHED_GCS, + filename = STOP_FILE, + operator_and_dates_dict = patch_operators_dict, + date = one_date, + crosswalk_col = "schedule_gtfs_dataset_key", + data_type = "gdf" + ).pipe(prep_traffic_ops.standardize_operator_info_for_exports, one_date) + + partial_dfs.append(df_to_add) + + patch_stops = pd.concat(partial_dfs, axis=0, ignore_index=True) + + published_stops = pd.concat( + [current_stops, patch_stops], + axis=0, ignore_index=True + ) + + return published_stops + def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: """ Suppress certain columns used in our internal modeling for export. 
""" - public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys() - # Change column order route_cols = [ 'organization_source_record_id', 'organization_name', - 'route_id', 'route_type'] - stop_cols = ['stop_id', 'stop_name'] + ] + stop_cols = [ + 'stop_id', 'stop_name', + # add GTFS stop-related metrics + 'n_trips', 'n_routes', 'route_types_served', 'n_arrivals', 'n_hours_in_service', + ] agency_ids = ['base64_url'] col_order = route_cols + stop_cols + agency_ids + ['geometry'] - df2 = (df[df.schedule_gtfs_dataset_key.isin(public_feeds)][col_order] + df2 = (df[col_order] .reindex(columns = col_order) .rename(columns = prep_traffic_ops.RENAME_COLS) .reset_index(drop=True) @@ -85,52 +111,26 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: return df2 -def create_stops_file_for_export(date: str) -> gpd.GeoDataFrame: - time0 = datetime.now() - - # Read in parquets - stops = helpers.import_scheduled_stops( - date, - columns = prep_traffic_ops.keep_stop_cols, - get_pandas = True, - crs = geography_utils.WGS84 - ) - - trips = helpers.import_scheduled_trips( - date, - columns = prep_traffic_ops.keep_trip_cols, - get_pandas = True - ) - - stop_times = helpers.import_scheduled_stop_times( - date, - columns = prep_traffic_ops.keep_stop_time_cols, - get_pandas = True - ) - - stops_assembled = attach_route_info_to_stops(stops, trips, stop_times) - - time1 = datetime.now() - print(f"Attach route and operator info to stops: {time1-time0}") - - return stops_assembled - - if __name__ == "__main__": - time0 = datetime.now() + + time0 = datetime.datetime.now() stops = create_stops_file_for_export(analysis_date) - stops2 = finalize_export_df(stops) + prep_traffic_ops.export_to_subfolder( + "ca_transit_stops", analysis_date + ) + + published_stops = patch_previous_dates( + stops, + analysis_date, + ).pipe(finalize_export_df) utils.geoparquet_gcs_export( - stops2, + published_stops, TRAFFIC_OPS_GCS, "ca_transit_stops" ) - 
prep_traffic_ops.export_to_subfolder( - "ca_transit_stops", analysis_date) - - time1 = datetime.now() - print(f"Execution time for stops script: {time1-time0}") + time1 = datetime.datetime.now() + print(f"Execution time for stops script: {time1 - time0}") diff --git a/open_data/prep_traffic_ops.py b/open_data/prep_traffic_ops.py index b8be6faa7..1eb8301e0 100644 --- a/open_data/prep_traffic_ops.py +++ b/open_data/prep_traffic_ops.py @@ -7,58 +7,40 @@ import pandas as pd from calitp_data_analysis import utils, geography_utils -from shared_utils import schedule_rt_utils -from update_vars import TRAFFIC_OPS_GCS, analysis_date +from shared_utils import gtfs_utils_v2, schedule_rt_utils +from update_vars import TRAFFIC_OPS_GCS, analysis_date, GTFS_DATA_DICT, SCHED_GCS catalog = intake.open_catalog( "../_shared_utils/shared_utils/shared_data_catalog.yml") - -keep_trip_cols = [ - "feed_key", "name", - "trip_id", - "route_id", "route_type", - "shape_id", "shape_array_key", - "route_long_name", "route_short_name", "route_desc" -] - -keep_shape_cols = [ - "shape_array_key", - "n_trips", "geometry" -] - -keep_stop_cols = [ - "feed_key", - "stop_id", "stop_name", - "geometry" -] - -keep_stop_time_cols = [ - "feed_key", "trip_id", "stop_id" -] - + def standardize_operator_info_for_exports( df: pd.DataFrame, date: str ) -> pd.DataFrame: - - crosswalk = schedule_rt_utils.sample_schedule_feed_key_to_organization_crosswalk( - df, - date, - quartet_data = "schedule", - dim_gtfs_dataset_cols = [ - "key", "regional_feed_type", - "base64_url"], - dim_organization_cols = [ - "source_record_id", "name", "caltrans_district"] + """ + Use our crosswalk file created in gtfs_funnel + and add in the organization columns we want to + publish on. 
+ """ + CROSSWALK_FILE = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk + + public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys() + + crosswalk = pd.read_parquet( + f"{SCHED_GCS}{CROSSWALK_FILE}_{date}.parquet", + columns = [ + "schedule_gtfs_dataset_key", "name", "base64_url", + "organization_source_record_id", "organization_name" + ], + filters = [[("schedule_gtfs_dataset_key", "in", public_feeds)]] ) df2 = pd.merge( df, crosswalk, - on = "feed_key", - how = "inner", - validate = "m:1" + on = "schedule_gtfs_dataset_key", + how = "inner" ) return df2 @@ -157,4 +139,5 @@ def export_to_subfolder(file_name: str, date: str): "organization_name": "agency", "organization_source_record_id": "org_id", "route_name_used": "route_name", + "route_types_served": "routetypes" } \ No newline at end of file diff --git a/open_data/update_vars.py b/open_data/update_vars.py index b8364cd71..15ed718ec 100644 --- a/open_data/update_vars.py +++ b/open_data/update_vars.py @@ -1,13 +1,16 @@ from pathlib import Path -from shared_utils import rt_dates +from shared_utils import catalog_utils, rt_dates analysis_date = rt_dates.DATES["sep2024"] -GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/" -COMPILED_CACHED_VIEWS = f"{GCS_FILE_PATH}rt_delay/compiled_cached_views/" -TRAFFIC_OPS_GCS = f"{GCS_FILE_PATH}traffic_ops/" -HQTA_GCS = f"{GCS_FILE_PATH}high_quality_transit_areas/" -SEGMENT_GCS = f"{GCS_FILE_PATH}rt_segment_speeds/" +GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") + +COMPILED_CACHED_VIEWS = GTFS_DATA_DICT.gcs_paths.COMPILED_CACHED_VIEWS +SEGMENT_GCS = GTFS_DATA_DICT.gcs_paths.SEGMENT_GCS +RT_SCHED_GCS = GTFS_DATA_DICT.gcs_paths.RT_SCHED_GCS +SCHED_GCS = GTFS_DATA_DICT.gcs_paths.SCHED_GCS +TRAFFIC_OPS_GCS = f"{GTFS_DATA_DICT.gcs_paths.GCS}traffic_ops/" +HQTA_GCS = f"{GTFS_DATA_DICT.gcs_paths.GCS}high_quality_transit_areas/" ESRI_BASE_URL = "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/" XML_FOLDER = Path("xml") From 
fbcd7385957135808150ec8c14416a692288ab5d Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Thu, 3 Oct 2024 22:03:45 +0000 Subject: [PATCH 05/10] combine publish_utils and prep_traffic_ops and update data dict --- open_data/create_stops_data.py | 10 +-- open_data/data_dictionary.yml | 29 +++++++- open_data/metadata.json | 2 +- open_data/metadata.yml | 2 +- open_data/metadata_update_pro.py | 15 +++++ ...prep_traffic_ops.py => open_data_utils.py} | 67 ++++++++++++++++++- open_data/supplement_meta.py | 4 +- open_data/update_data_dict.py | 8 +-- 8 files changed, 121 insertions(+), 16 deletions(-) rename open_data/{prep_traffic_ops.py => open_data_utils.py} (71%) diff --git a/open_data/create_stops_data.py b/open_data/create_stops_data.py index fb3d102a8..c02300569 100644 --- a/open_data/create_stops_data.py +++ b/open_data/create_stops_data.py @@ -7,7 +7,7 @@ import pandas as pd import yaml -import prep_traffic_ops +import open_data_utils from calitp_data_analysis import utils from shared_utils import publish_utils from update_vars import (analysis_date, @@ -32,7 +32,7 @@ def create_stops_file_for_export( f"{RT_SCHED_GCS}{STOP_FILE}_{date}.parquet" ) - stops2 = prep_traffic_ops.standardize_operator_info_for_exports(stops, date) + stops2 = open_data_utils.standardize_operator_info_for_exports(stops, date) time1 = datetime.datetime.now() print(f"get stops for date: {time1 - time0}") @@ -71,7 +71,7 @@ def patch_previous_dates( date = one_date, crosswalk_col = "schedule_gtfs_dataset_key", data_type = "gdf" - ).pipe(prep_traffic_ops.standardize_operator_info_for_exports, one_date) + ).pipe(open_data_utils.standardize_operator_info_for_exports, one_date) partial_dfs.append(df_to_add) @@ -104,7 +104,7 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: df2 = (df[col_order] .reindex(columns = col_order) - .rename(columns = prep_traffic_ops.RENAME_COLS) + .rename(columns = open_data_utils.RENAME_COLS) .reset_index(drop=True) ) @@ -117,7 +117,7 @@ def 
finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: stops = create_stops_file_for_export(analysis_date) - prep_traffic_ops.export_to_subfolder( + open_data_utils.export_to_subfolder( "ca_transit_stops", analysis_date ) diff --git a/open_data/data_dictionary.yml b/open_data/data_dictionary.yml index 0281c1827..a8dd93792 100644 --- a/open_data/data_dictionary.yml +++ b/open_data/data_dictionary.yml @@ -123,12 +123,35 @@ tables: - dataset_name: ca_transit_stops agency: *agency org_id: *org_id - route_id: *route_id - route_type: *route_type stop_id: *stop_id stop_name: *stop_name base64_url: *base64_url - + n_trips: + definition: |- + "# of daily trips (unique trip_ids) this stop serves" + definition_source: |- + "https://gtfs.org/schedule/reference/#stop_timestxt" + n_routes: + definition: |- + "# of unique route_ids this stop serves" + definition_source: |- + "https://gtfs.org/schedule/reference/#tripstxt" + routetypes: + definition: |- + "# of unique route types served" + definition_source: |- + "https://gtfs.org/schedule/reference/#routestxt" + n_arrivals: + definition: |- + "# daily arrivals (across routes) this stop serves" + definition_source: |- + "https://gtfs.org/schedule/reference/#stop_timestxt" + n_hours_in_service: + definition: |- + "# hours this stop has scheduled service (unique departure hours)" + definition_source: |- + "https://gtfs.org/schedule/reference/#stop_timestxt" + - dataset_name: speeds_by_stop_segments agency: *agency org_id: *org_id diff --git a/open_data/metadata.json b/open_data/metadata.json index de7f22c39..05fa8b1a1 100644 --- a/open_data/metadata.json +++ b/open_data/metadata.json @@ -1 +1 @@ -{"ca_hq_transit_areas": {"dataset_name": "ca_hq_transit_areas", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated High Quality Transit Areas as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and 
stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. 
The `hqta_details` columns defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Areas/FeatureServer", "revision_date": "2024-08-14", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_hq_transit_stops": {"dataset_name": "ca_hq_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated stops along High Quality Transit Corridors, plus major transit stops for bus rapid transit, ferry, rail modes as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": 
"The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. 
Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` columns defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Stops/FeatureServer", "revision_date": "2024-08-14", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_transit_routes": {"dataset_name": "ca_transit_routes", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. 
Transit stops associates route information to stops.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Routes/FeatureServer", "revision_date": "2024-08-14"}, "ca_transit_stops": {"dataset_name": "ca_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. 
Transit stops associates route information to stops.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. 
https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Stops/FeatureServer", "revision_date": "2024-08-14"}, "speeds_by_stop_segments": {"dataset_name": "speeds_by_stop_segments", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average all-day, peak, and offpeak transit speeds by segments for all CA operators that provide GTFS real-time vehicle positions data.", "description": "All day and peak transit 20th, 50th, and 80th percentile speeds on stop segments estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku / Eric Dasmalchi", "contact_email": "tiffany.ku@dot.ca.gov / eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. 
Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions to GTFS scheduled trips, shapes, stops, and stop times tables. GTFS shapes provides the route alignment path. Multiple trips may share the same shape, with a route typically associated with multiple shapes. Shapes are cut into segments at stop positions (stop_id-stop_sequence combination). A `stop segment` refers to the portion of shapes between the prior stop and the current stop. Vehicle positions are spatially joined to 35 meter buffered segments. Within each segment-trip, the first and last vehicle position observed are used to calculate the speed. Since multiple trips may occur over a segment each day, the multiple trip speeds provide a distribution. From this distribution, the 20th percentile, 50th percentile (median), and 80th percentile speeds are calculated. For all day speed metrics, all trips are used. For peak speed metrics, only trips with start times between 7 - 9:59 AM and 4 - 7:59 PM are used to find the 20th, 50th, and 80th percentile metrics. Data processing notes: (a) GTFS RT trips whose vehicle position timestamps span 10 minutes or less are dropped. 
Incomplete data would lead to unreliable estimates of speed at the granularity we need. (b) Segment-trip speeds of over 70 mph are excluded. These are erroneously calculated as transit does not typically reach those speeds. (c) Other missing or erroneous calculations, either arising from only one vehicle position found in a segment (change in time or change in distance cannot be calculated).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Stop_Segments/FeatureServer", "revision_date": "2024-08-14", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "stop_pair_": "stop_pair_name"}}, "speeds_by_route_time_of_day": {"dataset_name": "speeds_by_route_time_of_day", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average transit speeds by route-direction estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "description": "Provide average transit speeds, number of trips by route-direction.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", 
"horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions with GTFS scheduled trips and shapes. GTFS real-time (RT) vehicle positions are spatially joined to GTFS scheduled shapes, so only vehicle positions traveling along the route alignment path are kept. A sample of five vehicle positions are selected (min, 25th percentile, 50th percentile, 75th percentile, max). The trip speed is calculated using these five vehicle positions. Each trip is categorized into a time-of-day. The average speed for a route-direction-time_of_day is calculated. Additional metrics are stored, such as the number of trips observed, the average scheduled service minutes, and the average RT observed service minutes. 
For convenience, we also provide a singular shape (common_shape_id) to associate with a route-direction. This is the shape that had the most number of trips for a given route-direction. Time-of-day is determined by the GTFS scheduled trip start time. The trip start hour (military time) is categorized based on the following: Owl (0-3), Early AM (4-6), AM Peak (7-9), Midday (10-14), PM Peak (15-19), and Evening (20-23). The start and end hours are inclusive (e.g., 4-6 refers to 4am, 5am, and 6am).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Route_Time_of_Day/FeatureServer", "revision_date": "2024-08-14", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "stop_pair_": "stop_pair_name"}}} \ No newline at end of file +{"ca_hq_transit_areas": {"dataset_name": "ca_hq_transit_areas", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated High Quality Transit Areas as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset 
and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. 
Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` column defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. 
If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Areas/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_hq_transit_stops": {"dataset_name": "ca_hq_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated stops along High Quality Transit Corridors, plus major transit stops for bus rapid transit, ferry, rail modes as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. 
While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. 
We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` column defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Stops/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_transit_routes": {"dataset_name": "ca_transit_routes", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. 
Transit stops associates aggregated stop times and route information with each stop.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Routes/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"n_hours_in": "n_hours_in_service"}}, "ca_transit_stops": {"dataset_name": "ca_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. 
Transit stops associates aggregated stop times and route information with each stop.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. 
https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Stops/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"n_hours_in": "n_hours_in_service"}}, "speeds_by_stop_segments": {"dataset_name": "speeds_by_stop_segments", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average all-day, peak, and offpeak transit speeds by segments for all CA operators that provide GTFS real-time vehicle positions data.", "description": "All day and peak transit 20th, 50th, and 80th percentile speeds on stop segments estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku / Eric Dasmalchi", "contact_email": "tiffany.ku@dot.ca.gov / eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. 
Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions to GTFS scheduled trips, shapes, stops, and stop times tables. GTFS shapes provides the route alignment path. Multiple trips may share the same shape, with a route typically associated with multiple shapes. Shapes are cut into segments at stop positions (stop_id-stop_sequence combination). A `stop segment` refers to the portion of shapes between the prior stop and the current stop. Vehicle positions are spatially joined to 35 meter buffered segments. Within each segment-trip, the first and last vehicle position observed are used to calculate the speed. Since multiple trips may occur over a segment each day, the multiple trip speeds provide a distribution. From this distribution, the 20th percentile, 50th percentile (median), and 80th percentile speeds are calculated. For all day speed metrics, all trips are used. For peak speed metrics, only trips with start times between 7 - 9:59 AM and 4 - 7:59 PM are used to find the 20th, 50th, and 80th percentile metrics. Data processing notes: (a) GTFS RT trips whose vehicle position timestamps span 10 minutes or less are dropped. 
Incomplete data would lead to unreliable estimates of speed at the granularity we need. (b) Segment-trip speeds of over 70 mph are excluded. These are erroneously calculated as transit does not typically reach those speeds. (c) Other missing or erroneous calculations, either arising from only one vehicle position found in a segment (change in time or change in distance cannot be calculated).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Stop_Segments/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "stop_pair_": "stop_pair_name"}}, "speeds_by_route_time_of_day": {"dataset_name": "speeds_by_route_time_of_day", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average transit speeds by route-direction estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "description": "Provide average transit speeds, number of trips by route-direction.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", 
"horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions with GTFS scheduled trips and shapes. GTFS real-time (RT) vehicle positions are spatially joined to GTFS scheduled shapes, so only vehicle positions traveling along the route alignment path are kept. A sample of five vehicle positions are selected (min, 25th percentile, 50th percentile, 75th percentile, max). The trip speed is calculated using these five vehicle positions. Each trip is categorized into a time-of-day. The average speed for a route-direction-time_of_day is calculated. Additional metrics are stored, such as the number of trips observed, the average scheduled service minutes, and the average RT observed service minutes. 
For convenience, we also provide a singular shape (common_shape_id) to associate with a route-direction. This is the shape that had the most number of trips for a given route-direction. Time-of-day is determined by the GTFS scheduled trip start time. The trip start hour (military time) is categorized based on the following: Owl (0-3), Early AM (4-6), AM Peak (7-9), Midday (10-14), PM Peak (15-19), and Evening (20-23). The start and end hours are inclusive (e.g., 4-6 refers to 4am, 5am, and 6am).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Route_Time_of_Day/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "stop_pair_": "stop_pair_name"}}} \ No newline at end of file diff --git a/open_data/metadata.yml b/open_data/metadata.yml index cc48bc42f..f6e3fa4bc 100644 --- a/open_data/metadata.yml +++ b/open_data/metadata.yml @@ -28,7 +28,7 @@ common-fields: - &traffic_ops_purpose |- Provide all CA transit stops and routes (geospatial) from all transit operators. - &traffic_ops_description |- - Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. Transit stops associates route information to stops. + Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. Transit stops associates aggregated stop times and route information aggregated for each stops. 
- &speeds_readme |- https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md - &speeds_keywords |- diff --git a/open_data/metadata_update_pro.py b/open_data/metadata_update_pro.py index 89ab62cd3..709c0aa01 100644 --- a/open_data/metadata_update_pro.py +++ b/open_data/metadata_update_pro.py @@ -14,6 +14,7 @@ from update_vars import DEFAULT_XML_TEMPLATE, XML_FOLDER, META_JSON + # This prefix keeps coming up, but xmltodict has trouble processing or replacing it x = "ns0:" main = f"{x}MD_Metadata" @@ -296,3 +297,17 @@ def update_dataset_metadata_xml( with open(OUTPUT_FOLDER.joinpath(f"{dataset_name}.xml"), 'w') as f: f.write(new_xml) print("Save over existing XML") + + +if __name__=="__main__": + + from update_vars import RUN_ME + assert str(Path.cwd()).endswith("open_data"), "this script must be run from open_data directory!" + + for i in RUN_ME: + print(i) + print("-------------------------------------------") + update_dataset_metadata_xml( + i, + metadata_path = META_JSON, + ) \ No newline at end of file diff --git a/open_data/prep_traffic_ops.py b/open_data/open_data_utils.py similarity index 71% rename from open_data/prep_traffic_ops.py rename to open_data/open_data_utils.py index 1eb8301e0..978f46bba 100644 --- a/open_data/prep_traffic_ops.py +++ b/open_data/open_data_utils.py @@ -133,11 +133,76 @@ def export_to_subfolder(file_name: str, date: str): f"{file_name_sanitized}_{date}" ) - + +STANDARDIZED_COLUMNS_DICT = { + "caltrans_district": "district_name", + "organization_source_record_id": "org_id", + "organization_name": "agency", + "agency_name_primary": "agency_primary", + "agency_name_secondary": "agency_secondary" +} + + +def standardize_column_names(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: + """ + Standardize how agency is referred to. + """ + return df.rename(columns = STANDARDIZED_COLUMNS_DICT) + + +def remove_internal_keys(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: + """ + Remove columns used in our internal data modeling. 
+ Leave only natural identifiers (route_id, shape_id). + Remove shape_array_key, gtfs_dataset_key, etc. + """ + exclude_list = [ + "sec_elapsed", "meters_elapsed", + "name", "schedule_gtfs_dataset_key" + ] + cols = [c for c in df.columns] + + internal_cols = [c for c in cols if "_key" in c or c in exclude_list] + + print(f"drop: {internal_cols}") + + return df.drop(columns = internal_cols) + + # Define column names, must fit ESRI 10 character limits RENAME_COLS = { "organization_name": "agency", "organization_source_record_id": "org_id", "route_name_used": "route_name", "route_types_served": "routetypes" +} + +# Rename columns when shapefile truncates +RENAME_HQTA = { + "agency_pri": "agency_primary", + "agency_sec": "agency_secondary", + "hqta_detai": "hqta_details", + "base64_url": "base64_url_primary", + "base64_u_1": "base64_url_secondary", + "org_id_pri": "org_id_primary", + "org_id_sec": "org_id_secondary", +} + +RENAME_SPEED = { + "stop_seque": "stop_sequence", + "time_of_da": "time_of_day", + "time_perio": "time_period", + "district_n": "district_name", + "direction_": "direction_id", + "common_sha": "common_shape_id", + "avg_sched_": "avg_sched_trip_min", + "avg_rt_tri": "avg_rt_trip_min", + "caltrans_d": "district_name", + "organization_source_record_id": "org_id", + "organization_name": "agency", + "stop_pair_": "stop_pair_name" +} + +RENAME_GTFS = { + "n_hours_in": "n_hours_in_service" } \ No newline at end of file diff --git a/open_data/supplement_meta.py b/open_data/supplement_meta.py index edda0d4c2..c8948dc23 100644 --- a/open_data/supplement_meta.py +++ b/open_data/supplement_meta.py @@ -9,7 +9,7 @@ from calitp_data_analysis import utils from update_vars import analysis_date, ESRI_BASE_URL -from publish_utils import RENAME_HQTA, RENAME_SPEED +from open_data_utils import RENAME_HQTA, RENAME_SPEED, RENAME_GTFS def get_esri_url(name: str)-> str: return f"{ESRI_BASE_URL}{name}/FeatureServer" @@ -46,11 +46,13 @@ def get_esri_url(name: str)-> str: 
"methodology": TRAFFIC_OPS_METHODOLOGY, "data_dict_url": get_esri_url("CA_Transit_Routes"), "revision_date": analysis_date, + "rename_cols": RENAME_GTFS }, "ca_transit_stops": { "methodology": TRAFFIC_OPS_METHODOLOGY, "data_dict_url": get_esri_url("CA_Transit_Stops"), "revision_date": analysis_date, + "rename_cols": RENAME_GTFS }, "speeds_by_stop_segments": { "methodology": SEGMENT_METHODOLOGY, diff --git a/open_data/update_data_dict.py b/open_data/update_data_dict.py index def688f64..8b199d3ba 100644 --- a/open_data/update_data_dict.py +++ b/open_data/update_data_dict.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Union -import publish_utils +import open_data_utils from update_vars import analysis_date catalog = intake.open_catalog("catalog.yml") @@ -55,12 +55,12 @@ def new_columns_for_data_dict( # Columns in our dataset FILE = catalog[t].urlpath gdf = gpd.read_parquet(FILE).pipe( - publish_utils.standardize_column_names + open_data_utils.standardize_column_names ).pipe( - publish_utils.remove_internal_keys) + open_data_utils.remove_internal_keys) if "hq_" in t: - gdf = gdf.rename(columns = publish_utils.RENAME_HQTA) + gdf = gdf.rename(columns = open_data_utils.RENAME_HQTA) elif "speed" in t: gdf = gdf.rename(columns = publish_utils.RENAME_SPEED) From 32fd046768c1ee856945c0f872ce3d4fe50d6e80 Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Thu, 3 Oct 2024 22:04:14 +0000 Subject: [PATCH 06/10] (remove): publish_utils, combined into open_data_utils --- open_data/publish_utils.py | 62 -------------------------------------- 1 file changed, 62 deletions(-) delete mode 100644 open_data/publish_utils.py diff --git a/open_data/publish_utils.py b/open_data/publish_utils.py deleted file mode 100644 index 7865d7206..000000000 --- a/open_data/publish_utils.py +++ /dev/null @@ -1,62 +0,0 @@ -import geopandas as gpd -import pandas as pd - -STANDARDIZED_COLUMNS_DICT = { - "caltrans_district": "district_name", - "organization_source_record_id": "org_id", - 
"organization_name": "agency", - "agency_name_primary": "agency_primary", - "agency_name_secondary": "agency_secondary" -} - - -# Rename columns when shapefile truncates -RENAME_HQTA = { - "agency_pri": "agency_primary", - "agency_sec": "agency_secondary", - "hqta_detai": "hqta_details", - "base64_url": "base64_url_primary", - "base64_u_1": "base64_url_secondary", - "org_id_pri": "org_id_primary", - "org_id_sec": "org_id_secondary", -} - -RENAME_SPEED = { - "stop_seque": "stop_sequence", - "time_of_da": "time_of_day", - "time_perio": "time_period", - "district_n": "district_name", - "direction_": "direction_id", - "common_sha": "common_shape_id", - "avg_sched_": "avg_sched_trip_min", - "avg_rt_tri": "avg_rt_trip_min", - "caltrans_d": "district_name", - "organization_source_record_id": "org_id", - "organization_name": "agency", - "stop_pair_": "stop_pair_name" -} - -def standardize_column_names(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: - """ - Standardize how agency is referred to. - """ - return df.rename(columns = STANDARDIZED_COLUMNS_DICT) - - -def remove_internal_keys(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: - """ - Remove columns used in our internal data modeling. - Leave only natural identifiers (route_id, shape_id). - Remove shape_array_key, gtfs_dataset_key, etc. 
- """ - exclude_list = [ - "sec_elapsed", "meters_elapsed", - "name", "schedule_gtfs_dataset_key" - ] - cols = [c for c in df.columns] - - internal_cols = [c for c in cols if "_key" in c or c in exclude_list] - - print(f"drop: {internal_cols}") - - return df.drop(columns = internal_cols) \ No newline at end of file From e1d3aeaab6897a003aaec2d305ff7bd805375a8a Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Fri, 4 Oct 2024 00:32:29 +0000 Subject: [PATCH 07/10] refactor create routes and add patching --- open_data/create_routes_data.py | 112 ++++++++++++++++++++++++-------- open_data/create_stops_data.py | 6 +- open_data/open_data_utils.py | 21 +----- 3 files changed, 91 insertions(+), 48 deletions(-) diff --git a/open_data/create_routes_data.py b/open_data/create_routes_data.py index 9b4da38b0..9293808b9 100644 --- a/open_data/create_routes_data.py +++ b/open_data/create_routes_data.py @@ -2,30 +2,40 @@ Create routes file with identifiers including route_id, route_name, operator name. """ +import datetime import geopandas as gpd import pandas as pd +import yaml -from datetime import datetime - -import prep_traffic_ops +import open_data_utils from calitp_data_analysis import utils, geography_utils -from shared_utils import gtfs_utils_v2, portfolio_utils +from shared_utils import gtfs_utils_v2, portfolio_utils, publish_utils from segment_speed_utils import helpers from update_vars import analysis_date, TRAFFIC_OPS_GCS def create_routes_file_for_export(date: str) -> gpd.GeoDataFrame: - + """ + Create a shapes (with associated route info) file for export. + This allows users to plot the various shapes, + transit path options, and select between variations for + a given route. 
+ """ # Read in local parquets trips = helpers.import_scheduled_trips( date, - columns = prep_traffic_ops.keep_trip_cols, + columns = [ + "gtfs_dataset_key", + "route_id", "route_type", + "shape_id", "shape_array_key", + "route_long_name", "route_short_name", "route_desc" + ], get_pandas = True ).dropna(subset="shape_array_key") shapes = helpers.import_scheduled_shapes( date, - columns = prep_traffic_ops.keep_shape_cols, + columns = ["shape_array_key", "n_trips", "geometry"], get_pandas = True, crs = geography_utils.WGS84 ).dropna(subset="shape_array_key") @@ -35,21 +45,21 @@ def create_routes_file_for_export(date: str) -> gpd.GeoDataFrame: trips, on = "shape_array_key", how = "inner" - ).drop(columns = "trip_id").drop_duplicates(subset="shape_array_key") - - df2 = remove_erroneous_shapes(df) - + ).drop_duplicates(subset="shape_array_key").drop(columns = "shape_array_key") + drop_cols = ["route_short_name", "route_long_name", "route_desc"] + route_shape_cols = ["schedule_gtfs_dataset_key", "route_id", "shape_id"] - routes_assembled = (portfolio_utils.add_route_name(df2) + routes_assembled = (portfolio_utils.add_route_name(df) .drop(columns = drop_cols) - .sort_values(["name", "route_id", "shape_id"]) - .drop_duplicates(subset=[ - "name", "route_id", "shape_id"]) + .sort_values(route_shape_cols) + .drop_duplicates(subset=route_shape_cols) .reset_index(drop=True) ) - routes_assembled2 = prep_traffic_ops.standardize_operator_info_for_exports( - routes_assembled, date) + routes_assembled2 = open_data_utils.standardize_operator_info_for_exports( + routes_assembled, + date + ).pipe(remove_erroneous_shapes) return routes_assembled2 @@ -83,6 +93,51 @@ def remove_erroneous_shapes( return ok_shapes +def patch_previous_dates( + current_routes: gpd.GeoDataFrame, + current_date: str, + published_operators_yaml: str = "../gtfs_funnel/published_operators.yml" +) -> gpd.GeoDataFrame: + """ + Compare to the yaml for what operators we want, and + patch in previous dates for the 10 
or so operators + that do not have data for this current date. + """ + with open(published_operators_yaml) as f: + published_operators_dict = yaml.safe_load(f) + + patch_operators_dict = { + str(date): operator_list for + date, operator_list in published_operators_dict.items() + if str(date) != current_date + } + + partial_dfs = [] + + + for one_date, operator_list in patch_operators_dict.items(): + df_to_add = publish_utils.subset_table_from_previous_date( + gcs_bucket = TRAFFIC_OPS_GCS, + filename = f"export/ca_transit_routes", + operator_and_dates_dict = patch_operators_dict, + date = one_date, + crosswalk_col = "schedule_gtfs_dataset_key", + data_type = "gdf" + ).pipe(open_data_utils.standardize_operator_info_for_exports, one_date) + + partial_dfs.append(df_to_add) + + patch_routes = pd.concat(partial_dfs, axis=0, ignore_index=True) + + published_routes = pd.concat( + [current_routes, patch_routes], + axis=0, ignore_index=True + ) + + return published_routes + + + def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: """ Suppress certain columns used in our internal modeling for export. 
@@ -99,7 +154,7 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: col_order = route_cols + shape_cols + agency_ids + ['geometry'] df2 = (df[df.schedule_gtfs_dataset_key.isin(public_feeds)][col_order] .reindex(columns = col_order) - .rename(columns = prep_traffic_ops.RENAME_COLS) + .rename(columns = open_data_utils.RENAME_COLS) .reset_index(drop=True) ) @@ -107,22 +162,27 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: if __name__ == "__main__": - time0 = datetime.now() + time0 = datetime.datetime.now() # Make an operator-feed level file (this is published) - # This is feed-level already, but we already keep only 1 feed per operator routes = create_routes_file_for_export(analysis_date) - routes2 = finalize_export_df(routes) + utils.geoparquet_gcs_export( + routes, + TRAFFIC_OPS_GCS, + f"export/ca_transit_routes_{analysis_date}" + ) + published_routes = patch_previous_dates( + routes, + analysis_date, + ).pipe(finalize_export_df) + utils.geoparquet_gcs_export( - routes2, + published_routes, TRAFFIC_OPS_GCS, "ca_transit_routes" ) - prep_traffic_ops.export_to_subfolder( - "ca_transit_routes", analysis_date) - - time1 = datetime.now() + time1 = datetime.datetime.now() print(f"Execution time for routes script: {time1-time0}") diff --git a/open_data/create_stops_data.py b/open_data/create_stops_data.py index c02300569..4435f206a 100644 --- a/open_data/create_stops_data.py +++ b/open_data/create_stops_data.py @@ -117,8 +117,10 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: stops = create_stops_file_for_export(analysis_date) - open_data_utils.export_to_subfolder( - "ca_transit_stops", analysis_date + utils.geoparquet_gcs_export( + stops, + TRAFFIC_OPS_GCS, + f"export/ca_transit_stops_{analysis_date}" ) published_stops = patch_previous_dates( diff --git a/open_data/open_data_utils.py b/open_data/open_data_utils.py index 978f46bba..983aba033 100644 --- a/open_data/open_data_utils.py +++ 
b/open_data/open_data_utils.py @@ -6,7 +6,7 @@ import intake import pandas as pd -from calitp_data_analysis import utils, geography_utils +from calitp_data_analysis import geography_utils from shared_utils import gtfs_utils_v2, schedule_rt_utils from update_vars import TRAFFIC_OPS_GCS, analysis_date, GTFS_DATA_DICT, SCHED_GCS @@ -114,25 +114,6 @@ def clip_to_usa(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: return gdf2 - -def export_to_subfolder(file_name: str, date: str): - """ - We always overwrite the same geoparquets each month, and point our - shared_utils/shared_data_catalog.yml to the latest file. - - But, save historical exports just in case. - """ - file_name_sanitized = utils.sanitize_file_path(file_name) - - gdf = gpd.read_parquet( - f"{TRAFFIC_OPS_GCS}{file_name_sanitized}.parquet") - - utils.geoparquet_gcs_export( - gdf, - f"{TRAFFIC_OPS_GCS}export/", - f"{file_name_sanitized}_{date}" - ) - STANDARDIZED_COLUMNS_DICT = { "caltrans_district": "district_name", From 6888f50b4ce410112de708c25fadac1a872c31ee Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Fri, 4 Oct 2024 00:33:00 +0000 Subject: [PATCH 08/10] (remove): open_data script, work it into metadata_update_pro script --- open_data/Makefile | 2 +- open_data/open_data.py | 19 ------------------- 2 files changed, 1 insertion(+), 20 deletions(-) delete mode 100644 open_data/open_data.py diff --git a/open_data/Makefile b/open_data/Makefile index 8f7a42637..c8376836b 100644 --- a/open_data/Makefile +++ b/open_data/Makefile @@ -11,5 +11,5 @@ compile_open_data_portal: #python arcgis_script_pro.py #(in ESRI!) 
python update_data_dict.py # check if columns are missing in data_dictionary yml python update_fields_fgdc.py # populate fields with data dictionary yml values, run if update_data_dict had changes to incorporate - python open_data.py # go back into ESRI and update xml + python metadata_update_pro.py # go back into ESRI and update xml python cleanup.py # run after ESRI work done \ No newline at end of file diff --git a/open_data/open_data.py b/open_data/open_data.py deleted file mode 100644 index 421f94a47..000000000 --- a/open_data/open_data.py +++ /dev/null @@ -1,19 +0,0 @@ -""" -Track the metadata updates for all open data portal datasets. -""" -from pathlib import Path - -import metadata_update_pro -from update_vars import META_JSON, RUN_ME - - -if __name__=="__main__": - assert str(Path.cwd()).endswith("open_data"), "this script must be run from open_data directory!" - - for i in RUN_ME: - print(i) - print("-------------------------------------------") - metadata_update_pro.update_dataset_metadata_xml( - i, - metadata_path = META_JSON, - ) \ No newline at end of file From 5fe995975b55f1f2b5612922c8a60117d91340cc Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Fri, 4 Oct 2024 21:04:14 +0000 Subject: [PATCH 09/10] add list of route_ids to scheduled stops, refactor geoportal routes layer --- gtfs_funnel/schedule_stats_by_stop.py | 13 +++++---- open_data/create_routes_data.py | 38 ++++++++++++++++----------- open_data/create_stops_data.py | 19 +++++++------- open_data/data_dictionary.yml | 12 ++++----- open_data/open_data_utils.py | 5 ++-- 5 files changed, 49 insertions(+), 38 deletions(-) diff --git a/gtfs_funnel/schedule_stats_by_stop.py b/gtfs_funnel/schedule_stats_by_stop.py index 55d0f3d11..898fe0538 100644 --- a/gtfs_funnel/schedule_stats_by_stop.py +++ b/gtfs_funnel/schedule_stats_by_stop.py @@ -25,8 +25,7 @@ def stats_for_stop( df .groupby(group_cols, group_keys=False) .agg({ - "trip_id": "nunique", - "route_id": "nunique", + "route_id": lambda x: 
list(sorted(set(x))), "route_type": lambda x: list(sorted(set(x))), "departure_sec": "count", "departure_hour": "nunique" @@ -34,17 +33,21 @@ def stats_for_stop( .rename(columns = { "departure_sec": "n_arrivals", "departure_hour": "n_hours_in_service", - "trip_id": "n_trips", - "route_id": "n_routes", + "route_id": "route_ids_served", "route_type": "route_types_served" }) ) + df2 = df2.assign( + n_routes = df2.apply(lambda x: len(x.route_ids_served), axis=1) + ) + # Instead of producing list, we want to show values like 0, 3 instead of [0, 3] # portal users can see combinations more quickly # and access particular rows using str.contains df2 = df2.assign( - route_types_served = df2.route_types_served.str.join(", ") + route_types_served = df2.route_types_served.str.join(", "), + route_ids_served = df2.route_ids_served.str.join(", "), ) return df2 diff --git a/open_data/create_routes_data.py b/open_data/create_routes_data.py index 9293808b9..a90c44297 100644 --- a/open_data/create_routes_data.py +++ b/open_data/create_routes_data.py @@ -8,8 +8,9 @@ import yaml import open_data_utils -from calitp_data_analysis import utils, geography_utils -from shared_utils import gtfs_utils_v2, portfolio_utils, publish_utils +from calitp_data_analysis.geography_utils import WGS84 +from calitp_data_analysis import utils +from shared_utils import portfolio_utils, publish_utils from segment_speed_utils import helpers from update_vars import analysis_date, TRAFFIC_OPS_GCS @@ -37,7 +38,7 @@ def create_routes_file_for_export(date: str) -> gpd.GeoDataFrame: date, columns = ["shape_array_key", "n_trips", "geometry"], get_pandas = True, - crs = geography_utils.WGS84 + crs = WGS84 ).dropna(subset="shape_array_key") df = pd.merge( @@ -50,12 +51,14 @@ def create_routes_file_for_export(date: str) -> gpd.GeoDataFrame: drop_cols = ["route_short_name", "route_long_name", "route_desc"] route_shape_cols = ["schedule_gtfs_dataset_key", "route_id", "shape_id"] - routes_assembled = 
(portfolio_utils.add_route_name(df) - .drop(columns = drop_cols) - .sort_values(route_shape_cols) - .drop_duplicates(subset=route_shape_cols) - .reset_index(drop=True) - ) + routes_assembled = ( + portfolio_utils.add_route_name(df) + .drop(columns = drop_cols) + .sort_values(route_shape_cols) + .drop_duplicates(subset=route_shape_cols) + .reset_index(drop=True) + ) + routes_assembled2 = open_data_utils.standardize_operator_info_for_exports( routes_assembled, date @@ -114,11 +117,10 @@ def patch_previous_dates( partial_dfs = [] - for one_date, operator_list in patch_operators_dict.items(): df_to_add = publish_utils.subset_table_from_previous_date( gcs_bucket = TRAFFIC_OPS_GCS, - filename = f"export/ca_transit_routes", + filename = f"ca_transit_routes", operator_and_dates_dict = patch_operators_dict, date = one_date, crosswalk_col = "schedule_gtfs_dataset_key", @@ -137,13 +139,10 @@ def patch_previous_dates( return published_routes - def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: """ Suppress certain columns used in our internal modeling for export. 
""" - public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys() - # Change column order route_cols = [ 'organization_source_record_id', 'organization_name', @@ -152,7 +151,7 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: agency_ids = ['base64_url'] col_order = route_cols + shape_cols + agency_ids + ['geometry'] - df2 = (df[df.schedule_gtfs_dataset_key.isin(public_feeds)][col_order] + df2 = (df[col_order] .reindex(columns = col_order) .rename(columns = open_data_utils.RENAME_COLS) .reset_index(drop=True) @@ -162,17 +161,24 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: if __name__ == "__main__": + time0 = datetime.datetime.now() # Make an operator-feed level file (this is published) routes = create_routes_file_for_export(analysis_date) + # Export into GCS (outside export/) + # create_routes is different than create_stops, which already has + # a table created in gtfs_funnel that we can use to patch in previous dates + # here, we have to create those for each date, then save a copy + # the export/ folder contains the patched versions of the routes utils.geoparquet_gcs_export( routes, TRAFFIC_OPS_GCS, - f"export/ca_transit_routes_{analysis_date}" + f"ca_transit_routes_{analysis_date}" ) + published_routes = patch_previous_dates( routes, analysis_date, diff --git a/open_data/create_stops_data.py b/open_data/create_stops_data.py index 4435f206a..c8a82d689 100644 --- a/open_data/create_stops_data.py +++ b/open_data/create_stops_data.py @@ -96,7 +96,8 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: stop_cols = [ 'stop_id', 'stop_name', # add GTFS stop-related metrics - 'n_trips', 'n_routes', 'route_types_served', 'n_arrivals', 'n_hours_in_service', + 'n_routes', 'route_ids_served', 'route_types_served', + 'n_arrivals', 'n_hours_in_service', ] agency_ids = ['base64_url'] @@ -112,22 +113,22 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: if __name__ == "__main__": - + 
time0 = datetime.datetime.now() stops = create_stops_file_for_export(analysis_date) - + + published_stops = patch_previous_dates( + stops, + analysis_date, + ).pipe(finalize_export_df) + utils.geoparquet_gcs_export( - stops, + published_stops, TRAFFIC_OPS_GCS, f"export/ca_transit_stops_{analysis_date}" ) - published_stops = patch_previous_dates( - stops, - analysis_date, - ).pipe(finalize_export_df) - utils.geoparquet_gcs_export( published_stops, TRAFFIC_OPS_GCS, diff --git a/open_data/data_dictionary.yml b/open_data/data_dictionary.yml index a8dd93792..b8ff9e42b 100644 --- a/open_data/data_dictionary.yml +++ b/open_data/data_dictionary.yml @@ -126,11 +126,6 @@ tables: stop_id: *stop_id stop_name: *stop_name base64_url: *base64_url - n_trips: - definition: |- - "# of daily trips (unique trip_ids) this stop serves" - definition_source: |- - "https://gtfs.org/schedule/reference/#stop_timestxt" n_routes: definition: |- "# of unique route_ids this stop serves" @@ -138,7 +133,12 @@ tables: "https://gtfs.org/schedule/reference/#tripstxt" routetypes: definition: |- - "# of unique route types served" + "List of unique route types served at the stop. A value of 3 indicates a route_type==3." + definition_source: |- + "https://gtfs.org/schedule/reference/#routestxt" + route_ids_served: + definition: |- + "List of unique route_ids served at the stop. A value of 1 indicates route_id==1." 
definition_source: |- "https://gtfs.org/schedule/reference/#routestxt" n_arrivals: diff --git a/open_data/open_data_utils.py b/open_data/open_data_utils.py index 983aba033..f0402ca1d 100644 --- a/open_data/open_data_utils.py +++ b/open_data/open_data_utils.py @@ -155,7 +155,7 @@ def remove_internal_keys(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: "organization_name": "agency", "organization_source_record_id": "org_id", "route_name_used": "route_name", - "route_types_served": "routetypes" + "route_types_served": "routetypes", } # Rename columns when shapefile truncates @@ -185,5 +185,6 @@ def remove_internal_keys(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: } RENAME_GTFS = { - "n_hours_in": "n_hours_in_service" + "n_hours_in": "n_hours_in_service", + "route_ids_": "route_ids_served" } \ No newline at end of file From 4c52d0f8850e2ed88ff8ca401e75fdef0715e0ed Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Tue, 8 Oct 2024 17:20:49 +0000 Subject: [PATCH 10/10] update metadata with new columns for stops added --- open_data/create_routes_data.py | 2 +- open_data/create_stops_data.py | 2 +- open_data/gcs_to_esri.py | 14 ++++++++----- open_data/metadata.json | 2 +- open_data/open_data_utils.py | 20 ++++++------------- open_data/supplement_meta.py | 6 +++--- open_data/update_data_dict.py | 4 ++-- open_data/xml/ca_hq_transit_areas.xml | 4 ++-- open_data/xml/ca_hq_transit_stops.xml | 4 ++-- open_data/xml/ca_transit_routes.xml | 6 +++--- open_data/xml/ca_transit_stops.xml | 6 +++--- open_data/xml/speeds_by_route_time_of_day.xml | 4 ++-- open_data/xml/speeds_by_stop_segments.xml | 4 ++-- 13 files changed, 37 insertions(+), 41 deletions(-) diff --git a/open_data/create_routes_data.py b/open_data/create_routes_data.py index a90c44297..aa32cd7be 100644 --- a/open_data/create_routes_data.py +++ b/open_data/create_routes_data.py @@ -153,7 +153,7 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: col_order = route_cols + shape_cols + agency_ids + ['geometry'] df2 = 
(df[col_order] .reindex(columns = col_order) - .rename(columns = open_data_utils.RENAME_COLS) + .rename(columns = open_data_utils.STANDARDIZED_COLUMNS_DICT) .reset_index(drop=True) ) diff --git a/open_data/create_stops_data.py b/open_data/create_stops_data.py index c8a82d689..343751018 100644 --- a/open_data/create_stops_data.py +++ b/open_data/create_stops_data.py @@ -105,7 +105,7 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: df2 = (df[col_order] .reindex(columns = col_order) - .rename(columns = open_data_utils.RENAME_COLS) + .rename(columns = open_data_utils.STANDARDIZED_COLUMNS_DICT) .reset_index(drop=True) ) diff --git a/open_data/gcs_to_esri.py b/open_data/gcs_to_esri.py index c9857fa68..fd3296d71 100644 --- a/open_data/gcs_to_esri.py +++ b/open_data/gcs_to_esri.py @@ -12,8 +12,9 @@ from loguru import logger -import publish_utils -from calitp_data_analysis import utils, geography_utils +import open_data_utils +from calitp_data_analysis.geography_utils import WGS84 +from calitp_data_analysis import utils from update_vars import analysis_date, RUN_ME catalog = intake.open_catalog("./catalog.yml") @@ -52,9 +53,12 @@ def remove_zipped_shapefiles(): level="INFO") for d in RUN_ME : - gdf = catalog[d].read().to_crs(geography_utils.WGS84) - gdf = publish_utils.standardize_column_names(gdf).pipe( - publish_utils.remove_internal_keys) + gdf = catalog[d].read().to_crs(WGS84).pipe( + open_data_utils.standardize_column_names + ).pipe( + open_data_utils.remove_internal_keys + ) + logger.info(f"********* {d} *************") print_info(gdf) diff --git a/open_data/metadata.json b/open_data/metadata.json index 05fa8b1a1..6a455e678 100644 --- a/open_data/metadata.json +++ b/open_data/metadata.json @@ -1 +1 @@ -{"ca_hq_transit_areas": {"dataset_name": "ca_hq_transit_areas", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated High Quality Transit Areas as described in Public Resources Code 21155, 
21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. 
The `hqta_details` columns defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Areas/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_hq_transit_stops": {"dataset_name": "ca_hq_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated stops along High Quality Transit Corridors, plus major transit stops for bus rapid transit, ferry, rail modes as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": 
"The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. 
Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` columns defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Stops/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_transit_routes": {"dataset_name": "ca_transit_routes", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. 
Transit stops associates aggregated stop times and route information aggregated for each stops.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Routes/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"n_hours_in": "n_hours_in_service"}}, "ca_transit_stops": {"dataset_name": "ca_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. 
Transit stops associates aggregated stop times and route information aggregated for each stops.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. 
https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Stops/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"n_hours_in": "n_hours_in_service"}}, "speeds_by_stop_segments": {"dataset_name": "speeds_by_stop_segments", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average all-day, peak, and offpeak transit speeds by segments for all CA operators that provide GTFS real-time vehicle positions data.", "description": "All day and peak transit 20th, 50th, and 80th percentile speeds on stop segments estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku / Eric Dasmalchi", "contact_email": "tiffany.ku@dot.ca.gov / eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. 
Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions to GTFS scheduled trips, shapes, stops, and stop times tables. GTFS shapes provides the route alignment path. Multiple trips may share the same shape, with a route typically associated with multiple shapes. Shapes are cut into segments at stop positions (stop_id-stop_sequence combination). A `stop segment` refers to the portion of shapes between the prior stop and the current stop. Vehicle positions are spatially joined to 35 meter buffered segments. Within each segment-trip, the first and last vehicle position observed are used to calculate the speed. Since multiple trips may occur over a segment each day, the multiple trip speeds provide a distribution. From this distribution, the 20th percentile, 50th percentile (median), and 80th percentile speeds are calculated. For all day speed metrics, all trips are used. For peak speed metrics, only trips with start times between 7 - 9:59 AM and 4 - 7:59 PM are used to find the 20th, 50th, and 80th percentile metrics. Data processing notes: (a) GTFS RT trips whose vehicle position timestamps span 10 minutes or less are dropped. 
Incomplete data would lead to unreliable estimates of speed at the granularity we need. (b) Segment-trip speeds of over 70 mph are excluded. These are erroneously calculated as transit does not typically reach those speeds. (c) Other missing or erroneous calculations, either arising from only one vehicle position found in a segment (change in time or change in distance cannot be calculated).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Stop_Segments/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "stop_pair_": "stop_pair_name"}}, "speeds_by_route_time_of_day": {"dataset_name": "speeds_by_route_time_of_day", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average transit speeds by route-direction estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "description": "Provide average transit speeds, number of trips by route-direction.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", 
"horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions with GTFS scheduled trips and shapes. GTFS real-time (RT) vehicle positions are spatially joined to GTFS scheduled shapes, so only vehicle positions traveling along the route alignment path are kept. A sample of five vehicle positions are selected (min, 25th percentile, 50th percentile, 75th percentile, max). The trip speed is calculated using these five vehicle positions. Each trip is categorized into a time-of-day. The average speed for a route-direction-time_of_day is calculated. Additional metrics are stored, such as the number of trips observed, the average scheduled service minutes, and the average RT observed service minutes. 
For convenience, we also provide a singular shape (common_shape_id) to associate with a route-direction. This is the shape that had the most number of trips for a given route-direction. Time-of-day is determined by the GTFS scheduled trip start time. The trip start hour (military time) is categorized based on the following: Owl (0-3), Early AM (4-6), AM Peak (7-9), Midday (10-14), PM Peak (15-19), and Evening (20-23). The start and end hours are inclusive (e.g., 4-6 refers to 4am, 5am, and 6am).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Route_Time_of_Day/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "stop_pair_": "stop_pair_name"}}} \ No newline at end of file +{"ca_hq_transit_areas": {"dataset_name": "ca_hq_transit_areas", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated High Quality Transit Areas as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset 
and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. 
Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources; these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` column defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. 
If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Areas/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_hq_transit_stops": {"dataset_name": "ca_hq_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated stops along High Quality Transit Corridors, plus major transit stops for bus rapid transit, ferry, rail modes as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. 
While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. 
We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` columns defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Stops/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_transit_routes": {"dataset_name": "ca_transit_routes", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. 
Transit stops associates stop times and route information, aggregated for each stop.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. 
https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Routes/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"caltrans_district": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "agency_name_primary": "agency_primary", "agency_name_secondary": "agency_secondary", "route_name_used": "route_name", "route_types_served": "routetypes", "n_hours_in": "n_hours_in_service", "route_ids_": "route_ids_served"}}, "ca_transit_stops": {"dataset_name": "ca_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. Transit stops associates aggregated stop times and route information aggregated for each stops.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. 
While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. 
https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Stops/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"caltrans_district": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "agency_name_primary": "agency_primary", "agency_name_secondary": "agency_secondary", "route_name_used": "route_name", "route_types_served": "routetypes", "n_hours_in": "n_hours_in_service", "route_ids_": "route_ids_served"}}, "speeds_by_stop_segments": {"dataset_name": "speeds_by_stop_segments", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average all-day, peak, and offpeak transit speeds by segments for all CA operators that provide GTFS real-time vehicle positions data.", "description": "All day and peak transit 20th, 50th, and 80th percentile speeds on stop segments estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku / Eric Dasmalchi", "contact_email": "tiffany.ku@dot.ca.gov / eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. 
Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions to GTFS scheduled trips, shapes, stops, and stop times tables. GTFS shapes provides the route alignment path. Multiple trips may share the same shape, with a route typically associated with multiple shapes. Shapes are cut into segments at stop positions (stop_id-stop_sequence combination). A `stop segment` refers to the portion of shapes between the prior stop and the current stop. Vehicle positions are spatially joined to 35 meter buffered segments. Within each segment-trip, the first and last vehicle position observed are used to calculate the speed. Since multiple trips may occur over a segment each day, the multiple trip speeds provide a distribution. From this distribution, the 20th percentile, 50th percentile (median), and 80th percentile speeds are calculated. 
For all day speed metrics, all trips are used. For peak speed metrics, only trips with start times between 7 - 9:59 AM and 4 - 7:59 PM are used to find the 20th, 50th, and 80th percentile metrics. Data processing notes: (a) GTFS RT trips whose vehicle position timestamps span 10 minutes or less are dropped. Incomplete data would lead to unreliable estimates of speed at the granularity we need. (b) Segment-trip speeds of over 70 mph are excluded. These are erroneously calculated as transit does not typically reach those speeds. (c) Other missing or erroneous calculations, either arising from only one vehicle position found in a segment (change in time or change in distance cannot be calculated).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Stop_Segments/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "stop_pair_": "stop_pair_name"}}, "speeds_by_route_time_of_day": {"dataset_name": "speeds_by_route_time_of_day", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average transit speeds by route-direction estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "description": "Provide average transit speeds, number of trips by route-direction.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": 
"https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions with GTFS scheduled trips and shapes. GTFS real-time (RT) vehicle positions are spatially joined to GTFS scheduled shapes, so only vehicle positions traveling along the route alignment path are kept. A sample of five vehicle positions are selected (min, 25th percentile, 50th percentile, 75th percentile, max). 
The trip speed is calculated using these five vehicle positions. Each trip is categorized into a time-of-day. The average speed for a route-direction-time_of_day is calculated. Additional metrics are stored, such as the number of trips observed, the average scheduled service minutes, and the average RT observed service minutes. For convenience, we also provide a singular shape (common_shape_id) to associate with a route-direction. This is the shape that had the most number of trips for a given route-direction. Time-of-day is determined by the GTFS scheduled trip start time. The trip start hour (military time) is categorized based on the following: Owl (0-3), Early AM (4-6), AM Peak (7-9), Midday (10-14), PM Peak (15-19), and Evening (20-23). The start and end hours are inclusive (e.g., 4-6 refers to 4am, 5am, and 6am).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Route_Time_of_Day/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "stop_pair_": "stop_pair_name"}}} \ No newline at end of file diff --git a/open_data/open_data_utils.py b/open_data/open_data_utils.py index f0402ca1d..d7bfad002 100644 --- a/open_data/open_data_utils.py +++ b/open_data/open_data_utils.py @@ -120,7 +120,12 @@ def clip_to_usa(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: "organization_source_record_id": "org_id", "organization_name": "agency", "agency_name_primary": "agency_primary", - "agency_name_secondary": "agency_secondary" + "agency_name_secondary": "agency_secondary", + "route_name_used": "route_name", + "route_types_served": "routetypes", + "n_hours_in": "n_hours_in_service", + 
"route_ids_": "route_ids_served" + } @@ -150,14 +155,6 @@ def remove_internal_keys(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: return df.drop(columns = internal_cols) -# Define column names, must fit ESRI 10 character limits -RENAME_COLS = { - "organization_name": "agency", - "organization_source_record_id": "org_id", - "route_name_used": "route_name", - "route_types_served": "routetypes", -} - # Rename columns when shapefile truncates RENAME_HQTA = { "agency_pri": "agency_primary", @@ -182,9 +179,4 @@ def remove_internal_keys(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: "organization_source_record_id": "org_id", "organization_name": "agency", "stop_pair_": "stop_pair_name" -} - -RENAME_GTFS = { - "n_hours_in": "n_hours_in_service", - "route_ids_": "route_ids_served" } \ No newline at end of file diff --git a/open_data/supplement_meta.py b/open_data/supplement_meta.py index c8948dc23..d0f6bad56 100644 --- a/open_data/supplement_meta.py +++ b/open_data/supplement_meta.py @@ -9,7 +9,7 @@ from calitp_data_analysis import utils from update_vars import analysis_date, ESRI_BASE_URL -from open_data_utils import RENAME_HQTA, RENAME_SPEED, RENAME_GTFS +from open_data_utils import RENAME_HQTA, RENAME_SPEED, STANDARDIZED_COLUMNS_DICT def get_esri_url(name: str)-> str: return f"{ESRI_BASE_URL}{name}/FeatureServer" @@ -46,13 +46,13 @@ def get_esri_url(name: str)-> str: "methodology": TRAFFIC_OPS_METHODOLOGY, "data_dict_url": get_esri_url("CA_Transit_Routes"), "revision_date": analysis_date, - "rename_cols": RENAME_GTFS + "rename_cols": STANDARDIZED_COLUMNS_DICT }, "ca_transit_stops": { "methodology": TRAFFIC_OPS_METHODOLOGY, "data_dict_url": get_esri_url("CA_Transit_Stops"), "revision_date": analysis_date, - "rename_cols": RENAME_GTFS + "rename_cols": STANDARDIZED_COLUMNS_DICT }, "speeds_by_stop_segments": { "methodology": SEGMENT_METHODOLOGY, diff --git a/open_data/update_data_dict.py b/open_data/update_data_dict.py index 8b199d3ba..f5b30702c 100644 --- 
a/open_data/update_data_dict.py +++ b/open_data/update_data_dict.py @@ -62,8 +62,8 @@ def new_columns_for_data_dict( if "hq_" in t: gdf = gdf.rename(columns = open_data_utils.RENAME_HQTA) elif "speed" in t: - gdf = gdf.rename(columns = publish_utils.RENAME_SPEED) - + gdf = gdf.rename(columns = open_data_utils.RENAME_SPEED) + col_list = gdf.columns.tolist() # Columns included in data dictionary diff --git a/open_data/xml/ca_hq_transit_areas.xml b/open_data/xml/ca_hq_transit_areas.xml index 5b114abb5..d46a0f366 100644 --- a/open_data/xml/ca_hq_transit_areas.xml +++ b/open_data/xml/ca_hq_transit_areas.xml @@ -20,7 +20,7 @@ - 2024-08-15 + 2024-10-08 ISO 19139 Geographic Information - Metadata - Implementation Specification @@ -85,7 +85,7 @@ - 2024-08-14 + 2024-09-18 diff --git a/open_data/xml/ca_hq_transit_stops.xml b/open_data/xml/ca_hq_transit_stops.xml index f7158a8ef..c753d5fd1 100644 --- a/open_data/xml/ca_hq_transit_stops.xml +++ b/open_data/xml/ca_hq_transit_stops.xml @@ -20,7 +20,7 @@ - 2024-08-15 + 2024-10-08 ISO 19139 Geographic Information - Metadata - Implementation Specification @@ -85,7 +85,7 @@ - 2024-08-14 + 2024-09-18 diff --git a/open_data/xml/ca_transit_routes.xml b/open_data/xml/ca_transit_routes.xml index 2249baebf..6c83f0622 100644 --- a/open_data/xml/ca_transit_routes.xml +++ b/open_data/xml/ca_transit_routes.xml @@ -20,7 +20,7 @@ - 2024-08-15 + 2024-10-08 ISO 19139 Geographic Information - Metadata - Implementation Specification @@ -85,7 +85,7 @@ - 2024-08-14 + 2024-09-18 @@ -143,7 +143,7 @@ - Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. Transit stops associates route information to stops. + Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. Transit stops associates aggregated stop times and route information aggregated for each stops. Provide all CA transit stops and routes (geospatial) from all transit operators. 
diff --git a/open_data/xml/ca_transit_stops.xml b/open_data/xml/ca_transit_stops.xml index 1de469e44..b383f144d 100644 --- a/open_data/xml/ca_transit_stops.xml +++ b/open_data/xml/ca_transit_stops.xml @@ -20,7 +20,7 @@ - 2024-08-15 + 2024-10-08 ISO 19139 Geographic Information - Metadata - Implementation Specification @@ -85,7 +85,7 @@ - 2024-08-14 + 2024-09-18 @@ -143,7 +143,7 @@ - Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. Transit stops associates route information to stops. + Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. Transit stops associates aggregated stop times and route information aggregated for each stops. Provide all CA transit stops and routes (geospatial) from all transit operators. diff --git a/open_data/xml/speeds_by_route_time_of_day.xml b/open_data/xml/speeds_by_route_time_of_day.xml index c8292b920..d0ea6d1f1 100644 --- a/open_data/xml/speeds_by_route_time_of_day.xml +++ b/open_data/xml/speeds_by_route_time_of_day.xml @@ -20,7 +20,7 @@ - 2024-08-15 + 2024-10-08 ISO 19139 Geographic Information - Metadata - Implementation Specification @@ -85,7 +85,7 @@ - 2024-08-14 + 2024-09-18 diff --git a/open_data/xml/speeds_by_stop_segments.xml b/open_data/xml/speeds_by_stop_segments.xml index f32789c93..2b1b8daa2 100644 --- a/open_data/xml/speeds_by_stop_segments.xml +++ b/open_data/xml/speeds_by_stop_segments.xml @@ -20,7 +20,7 @@ - 2024-08-15 + 2024-10-08 ISO 19139 Geographic Information - Metadata - Implementation Specification @@ -85,7 +85,7 @@ - 2024-08-14 + 2024-09-18