Skip to content

Commit 2a489fe

Browse files
authored
Merge pull request #972 from cal-itp/open-data-gcs
Open data gcs
2 parents 901f31a + a2335f3 commit 2a489fe

File tree

7 files changed

+120
-10
lines changed

7 files changed

+120
-10
lines changed

high_quality_transit_areas/B1_create_hqta_segments.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
from shared_utils import rt_utils, geog_utils_to_add
3232
from segment_speed_utils import helpers, gtfs_schedule_wrangling
3333
from utilities import GCS_FILE_PATH
34-
from update_vars import analysis_date, COMPILED_CACHED_VIEWS
34+
from update_vars import analysis_date
3535

3636
HQTA_SEGMENT_LENGTH = 1_250 # meters
3737

high_quality_transit_areas/B2_sjoin_stops_to_segments.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from calitp_data_analysis import utils
1818
from segment_speed_utils import helpers, gtfs_schedule_wrangling
1919
from utilities import GCS_FILE_PATH
20-
from update_vars import analysis_date, COMPILED_CACHED_VIEWS, PROJECT_CRS
20+
from update_vars import analysis_date, PROJECT_CRS
2121

2222
def max_trips_by_group(df: pd.DataFrame,
2323
group_cols: list,

high_quality_transit_areas/C3_create_bus_hqta_types.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import C1_prep_pairwise_intersections as prep_clip
2020
from calitp_data_analysis import utils
2121
from utilities import catalog_filepath, GCS_FILE_PATH
22-
from update_vars import analysis_date, COMPILED_CACHED_VIEWS, PROJECT_CRS
22+
from update_vars import analysis_date, PROJECT_CRS
2323
from segment_speed_utils import helpers
2424

2525

high_quality_transit_areas/D1_assemble_hqta_points.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
import utilities
2222
from calitp_data_analysis import geography_utils, utils
2323
from shared_utils import schedule_rt_utils
24-
from update_vars import analysis_date, TEMP_GCS, COMPILED_CACHED_VIEWS, PROJECT_CRS
24+
from update_vars import analysis_date, TEMP_GCS, PROJECT_CRS
2525
from segment_speed_utils import helpers
2626

2727
EXPORT_PATH = f"{utilities.GCS_FILE_PATH}export/{analysis_date}/"

open_data/intake_justification.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,6 @@ Document links, justification, for why datasets need to be made public. Submit t
1717

1818
High Quality Transit Areas, as described in Public Resources Code 21155, 21064.3, 21060.2, relies on the intersection of frequent transit service and is based on the General Transit Feed Specification (GTFS) data. This HQTA dataset provides four categories of high quality transit: rail, ferry, BRT, and the intersection of frequent bus corridors.
1919

20-
### Transit Routes / Stops
20+
### Transit Routes / Stops / Speeds
2121

22-
The General Transit Feed Specification (GTFS) provides transit schedules, including transit route and stop information, in text files. The California Integrated Travel Project within Caltrans ingests GTFS data daily for all operators in the state, standardizes, and processes this data for storage in its data warehouse. This dataset compiles all the route and stop information for all CA transit operators and provides it in a geospatial format.
22+
The General Transit Feed Specification (GTFS) provides transit schedules, including transit route and stop information, in text files. The California Integrated Travel Project within Caltrans ingests GTFS data daily for all operators in the state, standardizes, and processes this data for storage in its data warehouse. This dataset compiles all the route and stop information for all CA transit operators and provides it in a geospatial format. It also compiles all the GTFS real-time vehicle positions data and processes it into usable speeds data.

open_data/publish_public_gcs.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
"""
2+
Write some open data outputs into public GCS
3+
as zipped shapefiles
4+
"""
5+
import pandas as pd
6+
import gcsfs
7+
import geopandas as gpd
8+
9+
from pathlib import Path
10+
11+
from calitp_data_analysis import utils
12+
from segment_speed_utils.project_vars import PUBLIC_GCS
13+
from update_vars import HQTA_GCS, TRAFFIC_OPS_GCS, SEGMENT_GCS
14+
from gcs_to_esri import remove_zipped_shapefiles
15+
16+
fs = gcsfs.GCSFileSystem()
17+
18+
HQTA_DATA = ["ca_hq_transit_stops", "ca_hq_transit_areas"]
19+
GTFS_DATA = ["ca_transit_routes", "ca_transit_stops"]
20+
SPEED_DATA = ["speeds_by_stop_segments", "speeds_by_route_timeofday"]
21+
22+
def construct_data_path(
    filename: str,
    date: str
) -> str:
    """
    Create gdf's filepath of the open data from its path in GCS bucket.

    Parameters
    ----------
    filename : str
        Dataset name; expected to be one of the entries in
        HQTA_DATA, GTFS_DATA, or SPEED_DATA.
    date : str
        Analysis date used in the GCS directory / filename.

    Returns
    -------
    str
        Full GCS path to the dataset's parquet file.

    Raises
    ------
    ValueError
        If filename is not a recognized dataset name. (Previously an
        unknown name fell through every branch, leaving `path` unbound
        and raising a confusing UnboundLocalError.)
    """
    if filename in HQTA_DATA:
        # HQTA exports live in a per-date sub-directory.
        path = f"{HQTA_GCS}export/{date}/{filename}"

    elif filename in GTFS_DATA:
        # Traffic-ops exports are not date-partitioned in the path.
        path = f"{TRAFFIC_OPS_GCS}export/{filename}"

    elif filename == "speeds_by_stop_segments":
        path = f"{SEGMENT_GCS}export/avg_speeds_stop_segments_{date}"

    elif filename == "speeds_by_route_timeofday":
        path = f"{SEGMENT_GCS}trip_summary/route_speeds_{date}"

    else:
        raise ValueError(f"Unknown dataset name: {filename}")

    return f"{path}.parquet"
44+
45+
46+
def write_to_public_gcs(
    filename: str,
    date: str
) -> None:
    """
    Import geoparquet, write out zipped shapefile to
    local Hub, and upload to the public GCS bucket.

    Parameters
    ----------
    filename : str
        Dataset name (see HQTA_DATA / GTFS_DATA / SPEED_DATA);
        passed through to construct_data_path.
    date : str
        Analysis date; appended to the public zip's filename.

    Notes
    -----
    Returns None (the original annotation said ``-> str`` but nothing
    was ever returned). The local zip is left on disk; callers clean
    up afterwards with remove_zipped_shapefiles().
    """
    original_data_path = construct_data_path(filename, date)

    # Get a path obj so we can parse for stem and suffix.
    # Don't save it because when we import GCS, we need str as path.
    original_path_obj = Path(original_data_path)
    public_filename = (
        f"{original_path_obj.stem}_"
        f"{date}.zip"
    )
    public_path = f"{PUBLIC_GCS}open_data/{public_filename}"

    gdf = gpd.read_parquet(original_data_path)

    # Don't write directly to GCS bucket, because our path is not
    # exactly the same. In public bucket, we want to write it to a sub-directory.
    utils.make_zipped_shapefile(
        gdf,
        local_path = public_filename,
        gcs_folder = None
    )

    # Upload the locally written zip to the public bucket.
    fs.put(
        public_filename,
        public_path
    )

    print(f"Uploaded {public_path}")

    return
84+
85+
86+
if __name__ == "__main__":

    from shared_utils import rt_dates

    # Dates to publish; add more rt_dates keys here as needed.
    analysis_dates = [rt_dates.DATES["aug2023"]]

    for analysis_date in analysis_dates:
        write_to_public_gcs("ca_hq_transit_stops", analysis_date)

    # Clean up the zipped shapefiles left on the local Hub.
    remove_zipped_shapefiles()
98+
99+

rt_segment_speeds/scripts/publish_public_gcs.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
import pandas as pd
77

88
from dask import delayed, compute
9-
from segment_speed_utils.project_vars import SEGMENT_GCS, PUBLIC_GCS
9+
from segment_speed_utils import helpers
10+
from segment_speed_utils.project_vars import SEGMENT_GCS, PUBLIC_GCS, CONFIG_PATH
1011

1112
fs = gcsfs.GCSFileSystem()
1213

@@ -20,28 +21,38 @@ def concatenate_datasets_across_months(dataset_name: str) -> pd.DataFrame:
2021
) for d in list_of_files
2122
]
2223

24+
if "shape" in dataset_name:
25+
sort_cols = ["shape_id", "stop_sequence"]
26+
elif "route" in dataset_name:
27+
sort_cols = ["route_id", "direction_id", "stop_pair"]
28+
2329
df = (pd.concat(dfs,
2430
axis=0, ignore_index=True)
2531
.sort_values(["organization_name",
2632
"year", "month", "peak_offpeak", "weekday_weekend",
27-
"shape_id", "stop_sequence"])
33+
] + sort_cols)
2834
.reset_index(drop=True)
2935
)
3036

3137
return df
3238

3339
if __name__ == "__main__":
3440

41+
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")
42+
43+
3544
DATASETS = [
36-
"speeds_by_peak_daytype"
45+
STOP_SEG_DICT["shape_rollup"],
46+
STOP_SEG_DICT["route_direction_rollup"]
3747
]
3848

3949
for d in DATASETS:
50+
4051
start = datetime.datetime.now()
4152

4253
df = delayed(concatenate_datasets_across_months)(d)
4354
df = compute(df)[0]
4455
df.to_parquet(f"{PUBLIC_GCS}speeds/{d}.parquet")
4556

4657
end = datetime.datetime.now()
47-
print(f"save {d} to public GCS: {end - start}")
58+
print(f"save {d} to public GCS: {end - start}")

0 commit comments

Comments
 (0)