Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Digest ymls #1074

Merged
merged 5 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions gtfs_digest/deploy_portfolio_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from segment_speed_utils.project_vars import RT_SCHED_GCS

PORTFOLIO_SITE_YAML = Path("../portfolio/sites/gtfs_digest.yml")
PORTFOLIO_SITE_YAML = Path("../portfolio/sites/digest_typologies.yml")

def overwrite_yaml(portfolio_site_yaml: Path) -> list:
"""
Expand All @@ -29,7 +29,19 @@ def overwrite_yaml(portfolio_site_yaml: Path) -> list:

districts = sorted(list(df.caltrans_district.unique()))

operators = df.organization_name.tolist()
operators = df.organization_name.tolist()

keep_me = ["City of Santa Monica", "City of Culver City",
"Long Beach Transit", "Southern California Regional Rail Authority",
"Foothill Transit",
"Alameda-Contra Costa Transit District",
"City and County of San Francisco",
"Sonoma-Marin Area Rail Transit District",
"Marin County Transit District",
]
operators = [i for i in operators if i in keep_me]
districts = sorted(df[df.organization_name.isin(keep_me)].caltrans_district.unique())

# Eric's example
# https://github.com/cal-itp/data-analyses/blob/main/rt_delay/04_generate_all.ipynb

Expand Down
28 changes: 0 additions & 28 deletions gtfs_digest/extra_cleaning.py

This file was deleted.

45 changes: 14 additions & 31 deletions gtfs_digest/merge_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,54 +136,37 @@ def concatenate_crosswalk_organization(
def merge_in_standardized_route_names(
df: pd.DataFrame,
) -> pd.DataFrame:

keep_cols = [
"schedule_gtfs_dataset_key", "name",
"route_id", "service_date",
"recent_route_id2", "recent_combined_name"]
]

CLEAN_ROUTES = GTFS_DATA_DICT.schedule_tables.route_identification

operators_need_cleaning = pd.read_parquet(
f"{SCHED_GCS}{CLEAN_ROUTES}.parquet",
filters = [[("name", "in", operators_only_route_long_name)]]
)
operators_ok = pd.read_parquet(
f"{SCHED_GCS}{CLEAN_ROUTES}.parquet",
filters = [[("name", "not in", operators_only_route_long_name)]]
)
route_names_df = pd.read_parquet(f"{SCHED_GCS}{CLEAN_ROUTES}.parquet")

operators_need_cleaning = operators_need_cleaning.assign(
recent_combined_name = operators_need_cleaning.route_long_name
)
route_names_df = time_series_utils.clean_standardized_route_names(
route_names_df).drop_duplicates()

standardized_route_names = pd.concat([
operators_need_cleaning,
operators_ok
], axis=0, ignore_index=True)[keep_cols]

if "name" in df.columns:
df = df.drop(columns = "name")

df = pd.merge(
# Use `route_id` to merge to standardized_route_names
df2 = pd.merge(
df,
standardized_route_names,
route_names_df,
on = ["schedule_gtfs_dataset_key",
"route_id", "service_date"],
how = "left",
)

df = df.assign(
recent_combined_name = df.recent_combined_name.str.replace("__", " ")
).drop(
columns = ["route_id"]
).rename(
columns = {
"recent_route_id2": "route_id",
"recent_combined_name": "route_combined_name"
}
)
# After merging, we can replace route_id with recent_route_id2
drop_cols = ["route_desc", "combined_name", "route_id2"]
df3 = time_series_utils.parse_route_combined_name(df2).drop(
columns = drop_cols)

return df
return df3


if __name__ == "__main__":
Expand Down Expand Up @@ -237,7 +220,7 @@ def merge_in_standardized_route_names(
"is_early", "is_ontime", "is_late"
]

df[integrify] = df[integrify].astype("Int64")
df[integrify] = df[integrify].fillna(0).astype("int")

df.to_parquet(
f"{RT_SCHED_GCS}{DIGEST_RT_SCHED}.parquet"
Expand Down
66 changes: 66 additions & 0 deletions gtfs_digest/readable.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# GTFS schedule
direction_id:
readable: Direction
caption: Something
avg_scheduled_service_minutes: "Average Scheduled Service (trip minutes)"
avg_stop_miles: Average Stop Distance (miles)
ttl_service_hours: total service (hours)
n_scheduled_trips: "# scheduled trips"
frequency: Trips per Hour
total_scheduled_service_minutes: "Aggregate Scheduled Service Minutes (all trips)"
route_id: Route
route_combined_name: Route

# GTFS vehicle positions
minutes_atleast1_vp: "# minutes with 1+ vp per minute"
minutes_atleast2_vp: "# minutes with 2+ vp per minute"
total_rt_service_minutes: "Aggregate Actual Service Minutes (all trips)"
total_vp: "# vp"
vp_in_shape: "# vp within scheduled shape"
is_early: "# early arrival trips"
is_ontime: "# on-time trips"
is_late: "# late trips"
n_vp_trips: "# trips with vp"
vp_per_minute: "Average vp per minute"
pct_in_shape: "% vp within scheduled shape"
pct_rt_journey_atleast1_vp: "% actual trip minutes with 1+ vp per minute"
pct_rt_journey_atleast2_vp: "% actual trip minutes with 2+ vp per minute"
pct_sched_journey_atleast1_vp: "% scheduled trip minutes with 1+ vp per minute"
pct_sched_journey_atleast2_vp: "% scheduled trip minutes with 2+ vp per minute"
rt_sched_journey_ratio: "Actual / scheduled service ratio"
avg_rt_service_minutes: "Average Actual Service (trip minutes)"
speed_mph: "Speed (mph)"
sched_rt_category: "GTFS Availability"

# Operators
organization_source_record_id: "Organization ID"
organization_name: Organization
name: Transit Operator
caltrans_district: District
base64_url: "base64 encoded feed URL"
operator_n_routes: "# routes"
operator_n_trips: "# trips"
operator_n_shapes: "# shapes"
operator_n_stops: "# stops"
operator_n_arrivals: "# arrivals"
operator_route_length_miles: "service miles"
operator_arrivals_per_stop: "avg arrivals per stop"
n_coverage_routes: "# coverage route types"
n_downtown_local_routes: "# downtown local route types"
n_local_routes: "# local route types"
n_rapid_routes: "# rapid route types"


# Dates / time
time_period: Period
service_date: Date
time_of_day: "time of day"
day_name: "Day of Week"
year: Year
month: Month
year_month: "Month - Year"

# Roads
road_freq_category: "Road Frequency Category"
road_typology: "Road Typology Category"
pct_typology: "% route miles with typology"
31 changes: 27 additions & 4 deletions gtfs_funnel/concatenate_monthly_scheduled_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@

MONTHLY_SERVICE = GTFS_DATA_DICT.schedule_tables.monthly_scheduled_service
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

ROUTES = GTFS_DATA_DICT.schedule_tables.route_identification

year_list = [2023, 2024]
analysis_date_list = (rt_dates.y2024_dates +
rt_dates.y2023_dates +
Expand Down Expand Up @@ -47,18 +48,40 @@
"schedule_source_record_id",
"schedule_gtfs_dataset_key",
"organization_source_record_id", "organization_name",
],
]
).drop(
columns = "service_date"
).drop_duplicates().reset_index(drop=True)
).drop_duplicates().reset_index(drop=True)

# Get standardized route names and clean up more
standardized_routes = pd.read_parquet(f"{SCHED_GCS}{ROUTES}.parquet")

route_names_df = time_series_utils.clean_standardized_route_names(
standardized_routes).pipe(
time_series_utils.parse_route_combined_name
)[
["schedule_gtfs_dataset_key",
"route_long_name", "route_short_name",
"route_id", "route_combined_name"]
].drop_duplicates()

# Merge monthly service with crosswalk to get schedule_gtfs_dataset_key
df2 = pd.merge(
df,
crosswalk,
on = "schedule_source_record_id",
how = "inner",
)

df2.to_parquet(
# Merge in route_names so we use the standardized/cleaned up route names
df3 = pd.merge(
df2,
route_names_df,
on = ["schedule_gtfs_dataset_key",
"route_long_name", "route_short_name"],
how = "left",
)

df3.to_parquet(
f"{SCHED_GCS}{MONTHLY_SERVICE}.parquet"
)
5 changes: 5 additions & 0 deletions portfolio/digest_typologies/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# GTFS Digest

Performance metrics from GTFS schedule and vehicle positions time-series data for all transit operators by route.

[![digest_mermaid](https://mermaid.ink/img/pako:eNqVk1Fv2jAQx7-K5QnlJUGEhAKeNKkU2NOeWu1hZIpMfAGrjh3Zl7WA-O5zknXQba20PDin-_98Z9-dT7QwAiijURRlGiUqYOTzw_qeLOUOHGa6EwaDU6YJkVoiI51JSIB7qCBgJNhyB0F47f3KreRbBS74jXuptrLi9nBnlLHtvg-ryXqyvn3ZeiEe4Bkv1Gg0-htZGCvAvgUpqeEtzUFhtHh9jvV6ulpcMQgW5SukLMugl8_tzy_nwSDTmS6VeSr23CJ5WPSAa7Y7y-s9saZBiIS0UKA0ulf7tVDcuSWURJSklEqxX7X4Q3WFlTW-EN0pr6O03-3GFXsQjQJSAVpZuO-MMR82ij4R0TXx44VebFxTtfUjrgYQ_2CvQt9tLOY_XP4_CZabHZYuf4RDjiY3dse1PPL2-qSwxrknrh7f27_aOORte4Q8gsi7EuaaV_Bu1t4CLS4uGtIKbMWl8NPdzWBGu9nMKPOmgJI3CjPqG-lR3qC5P-iCMrQNhLSpBUdYSu47WVFWcuW8F4REY7_0L6Z7OCGtuabsRJ8pi2bjYTJPxvM0mc6S-ThOQ3qgLLkZxulklt6kaTwZzdL4HNKjMT5qPBzHfjZH8TRJPJ1O0y7ct07sUp5_AvaOGGY?type=png)](https://mermaid.live/edit#pako:eNqVk1Fv2jAQx7-K5QnlJUGEhAKeNKkU2NOeWu1hZIpMfAGrjh3Zl7WA-O5zknXQba20PDin-_98Z9-dT7QwAiijURRlGiUqYOTzw_qeLOUOHGa6EwaDU6YJkVoiI51JSIB7qCBgJNhyB0F47f3KreRbBS74jXuptrLi9nBnlLHtvg-ryXqyvn3ZeiEe4Bkv1Gg0-htZGCvAvgUpqeEtzUFhtHh9jvV6ulpcMQgW5SukLMugl8_tzy_nwSDTmS6VeSr23CJ5WPSAa7Y7y-s9saZBiIS0UKA0ulf7tVDcuSWURJSklEqxX7X4Q3WFlTW-EN0pr6O03-3GFXsQjQJSAVpZuO-MMR82ij4R0TXx44VebFxTtfUjrgYQ_2CvQt9tLOY_XP4_CZabHZYuf4RDjiY3dse1PPL2-qSwxrknrh7f27_aOORte4Q8gsi7EuaaV_Bu1t4CLS4uGtIKbMWl8NPdzWBGu9nMKPOmgJI3CjPqG-lR3qC5P-iCMrQNhLSpBUdYSu47WVFWcuW8F4REY7_0L6Z7OCGtuabsRJ8pi2bjYTJPxvM0mc6S-ThOQ3qgLLkZxulklt6kaTwZzdL4HNKjMT5qPBzHfjZH8TRJPJ1O0y7ct07sUp5_AvaOGGY)
43 changes: 43 additions & 0 deletions portfolio/digest_typologies/_config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Book settings
# Learn more at https://jupyterbook.org/customize/config.html

title: Transit Operators and Route Typologies
author: Cal-ITP
copyright: "2024"
#logo: calitp_logo_MAIN.png

# Force re-execution of notebooks on each build.
# See https://jupyterbook.org/content/execute.html
execute:
  execute_notebooks: 'off'
  allow_errors: false
  timeout: -1

# Define the name of the latex output file for PDF builds
latex:
  latex_documents:
    targetname: book.tex

launch_buttons:
  binderhub_url: "https://mybinder.org"
  jupyterhub_url: "https://hubtest.k8s.calitp.jarv.us"
  thebe: true

repository:
  url: https://github.com/cal-itp/data-analyses/  # Online location of your book
  # path_to_book: docs  # Optional path to your book, relative to the repository root
  path_to_book: gtfs_digest
  branch: main  # Which branch of the repository should be used when creating links (optional)

# Add GitHub buttons to your book
# See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository
html:
  use_issues_button: true
  use_repository_button: true
  use_edit_page_button: true
  google_analytics_id: 'G-JCX3Z8JZJC'

sphinx:
  config:
    html_js_files:
    - https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js
11 changes: 11 additions & 0 deletions portfolio/digest_typologies/_toc.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
format: jb-book
parts:
- caption: null
  chapters:
  - file: district_04-oakland.md
    sections:
    - glob: district_04-oakland/*
  - file: district_07-los-angeles.md
    sections:
    - glob: district_07-los-angeles/*
root: README
1 change: 1 addition & 0 deletions portfolio/digest_typologies/district_04-oakland.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# District 04 - Oakland
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
1 change: 1 addition & 0 deletions portfolio/digest_typologies/district_07-los-angeles.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# District 07 - Los Angeles
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Loading
Loading