Add caltrans_district to CA stops

tiffanychu90 · tiffanychu90 · commit f5a263bb6a81 · 2025-01-07T21:44:47.000Z
diff --git a/open_data/create_stops_data.py b/open_data/create_stops_data.py
@@ -99,7 +99,7 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
         'n_routes', 'route_ids_served', 'route_types_served', 
         'n_arrivals', 'n_hours_in_service',
     ]
-    agency_ids = ['base64_url']
+    agency_ids = ['base64_url', 'caltrans_district']
     
     col_order = route_cols + stop_cols + agency_ids + ['geometry']
     
diff --git a/open_data/open_data_utils.py b/open_data/open_data_utils.py
@@ -31,18 +31,23 @@ def standardize_operator_info_for_exports(
         f"{SCHED_GCS}{CROSSWALK_FILE}_{date}.parquet",
         columns = [
             "schedule_gtfs_dataset_key", "name", "base64_url", 
-            "organization_source_record_id", "organization_name"
+            "organization_source_record_id", "organization_name",
+            "caltrans_district",
         ],
         filters = [[("schedule_gtfs_dataset_key", "in", public_feeds)]]
     )
     
+    # An inner merge is sufficient here. We checked whether a left merge
+    # was needed to keep stops outside of CA that might lack a
+    # caltrans_district, and it is not: every operator is assigned a
+    # caltrans_district, so Amtrak / FlixBus stops have values populated.
     df2 = pd.merge(
         df,
         crosswalk,
         on = "schedule_gtfs_dataset_key",
         how = "inner"
     )
-        
+    
     return df2
     
     

Original file line number	Diff line number	Diff line change
`@@ -99,7 +99,7 @@ def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame:`
`99`	`99`	`'n_routes', 'route_ids_served', 'route_types_served',`
`100`	`100`	`'n_arrivals', 'n_hours_in_service',`
`101`	`101`	`]`
`102`		`- agency_ids = ['base64_url']`
	`102`	`+ agency_ids = ['base64_url', 'caltrans_district']`
`103`	`103`
`104`	`104`	`col_order = route_cols + stop_cols + agency_ids + ['geometry']`
`105`	`105`