Skip to content

Commit 2a489fe

Browse files
authored
Merge pull request #972 from cal-itp/open-data-gcs
Open data gcs
2 parents 901f31a + a2335f3 commit 2a489fe

File tree

7 files changed

+120
-10
lines changed

7 files changed

+120
-10
lines changed

high_quality_transit_areas/B1_create_hqta_segments.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
from shared_utils import rt_utils, geog_utils_to_add
3232
from segment_speed_utils import helpers, gtfs_schedule_wrangling
3333
from utilities import GCS_FILE_PATH
34-
from update_vars import analysis_date, COMPILED_CACHED_VIEWS
34+
from update_vars import analysis_date
3535

3636
HQTA_SEGMENT_LENGTH = 1_250 # meters
3737

high_quality_transit_areas/B2_sjoin_stops_to_segments.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from calitp_data_analysis import utils
1818
from segment_speed_utils import helpers, gtfs_schedule_wrangling
1919
from utilities import GCS_FILE_PATH
20-
from update_vars import analysis_date, COMPILED_CACHED_VIEWS, PROJECT_CRS
20+
from update_vars import analysis_date, PROJECT_CRS
2121

2222
def max_trips_by_group(df: pd.DataFrame,
2323
group_cols: list,

high_quality_transit_areas/C3_create_bus_hqta_types.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import C1_prep_pairwise_intersections as prep_clip
2020
from calitp_data_analysis import utils
2121
from utilities import catalog_filepath, GCS_FILE_PATH
22-
from update_vars import analysis_date, COMPILED_CACHED_VIEWS, PROJECT_CRS
22+
from update_vars import analysis_date, PROJECT_CRS
2323
from segment_speed_utils import helpers
2424

2525

high_quality_transit_areas/D1_assemble_hqta_points.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
import utilities
2222
from calitp_data_analysis import geography_utils, utils
2323
from shared_utils import schedule_rt_utils
24-
from update_vars import analysis_date, TEMP_GCS, COMPILED_CACHED_VIEWS, PROJECT_CRS
24+
from update_vars import analysis_date, TEMP_GCS, PROJECT_CRS
2525
from segment_speed_utils import helpers
2626

2727
EXPORT_PATH = f"{utilities.GCS_FILE_PATH}export/{analysis_date}/"

open_data/intake_justification.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,6 @@ Document links, justification, for why datasets need to be made public. Submit t
1717

1818
High Quality Transit Areas, as described in Public Resources Code 21155, 21064.3, 21060.2, relies on the intersection of frequent transit service and is based on the General Transit Feed Specification (GTFS) data. This HQTA dataset provides four categories of high quality transit: rail, ferry, BRT, and the intersection of frequent bus corridors.
1919

20-
### Transit Routes / Stops
20+
### Transit Routes / Stops / Speeds
2121

22-
The General Transit Feed Specification (GTFS) provides transit schedules, including transit route and stop information, in text files. The California Integrated Travel Project within Caltrans ingests GTFS data daily for all operators in the state, standardizes, and processes this data for storage in its data warehouse. This dataset compiles all the route and stop information for all CA transit operators and provides it in a geospatial format.
22+
The General Transit Feed Specification (GTFS) provides transit schedules, including transit route and stop information, in text files. The California Integrated Travel Project within Caltrans ingests GTFS data daily for all operators in the state, standardizes, and processes this data for storage in its data warehouse. This dataset compiles all the route and stop information for all CA transit operators and provides it in a geospatial format. It also compiles all the GTFS real-time vehicle positions data and processes it into usable speeds data.

open_data/publish_public_gcs.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
"""
2+
Write some open data outputs into public GCS
3+
as zipped shapefiles
4+
"""
5+
import pandas as pd
6+
import gcsfs
7+
import geopandas as gpd
8+
9+
from pathlib import Path
10+
11+
from calitp_data_analysis import utils
12+
from segment_speed_utils.project_vars import PUBLIC_GCS
13+
from update_vars import HQTA_GCS, TRAFFIC_OPS_GCS, SEGMENT_GCS
14+
from gcs_to_esri import remove_zipped_shapefiles
15+
16+
fs = gcsfs.GCSFileSystem()
17+
18+
HQTA_DATA = ["ca_hq_transit_stops", "ca_hq_transit_areas"]
19+
GTFS_DATA = ["ca_transit_routes", "ca_transit_stops"]
20+
SPEED_DATA = ["speeds_by_stop_segments", "speeds_by_route_timeofday"]
21+
22+
def construct_data_path(
    filename: str,
    date: str
) -> str:
    """
    Create gdf's filepath of the open data from its path in GCS bucket.

    Parameters
    ----------
    filename : str
        Dataset name; expected to be one of the entries in
        HQTA_DATA, GTFS_DATA, or SPEED_DATA.
    date : str
        Analysis date used in the GCS directory / filename.

    Returns
    -------
    str
        Full GCS path to the dataset's parquet file.

    Raises
    ------
    ValueError
        If filename is not a recognized dataset name. (Previously an
        unknown name fell through every branch, leaving `path` unbound
        and raising a confusing UnboundLocalError.)
    """
    if filename in HQTA_DATA:
        # HQTA exports live in a per-date sub-directory.
        path = f"{HQTA_GCS}export/{date}/{filename}"

    elif filename in GTFS_DATA:
        # Traffic-ops exports are not date-partitioned in the path.
        path = f"{TRAFFIC_OPS_GCS}export/{filename}"

    elif filename == "speeds_by_stop_segments":
        path = f"{SEGMENT_GCS}export/avg_speeds_stop_segments_{date}"

    elif filename == "speeds_by_route_timeofday":
        path = f"{SEGMENT_GCS}trip_summary/route_speeds_{date}"

    else:
        raise ValueError(f"Unknown dataset name: {filename}")

    return f"{path}.parquet"
44+
45+
46+
def write_to_public_gcs(
    filename: str,
    date: str
) -> None:
    """
    Import geoparquet, write out zipped shapefile to
    local Hub, and upload to the public GCS bucket.

    Parameters
    ----------
    filename : str
        Dataset name (see HQTA_DATA / GTFS_DATA / SPEED_DATA);
        passed through to construct_data_path.
    date : str
        Analysis date; appended to the public zip's filename.

    Notes
    -----
    Returns None (the original annotation said ``-> str`` but nothing
    was ever returned). The local zip is left on disk; callers clean
    up afterwards with remove_zipped_shapefiles().
    """
    original_data_path = construct_data_path(filename, date)

    # Get a path obj so we can parse for stem and suffix.
    # Don't save it because when we import GCS, we need str as path.
    original_path_obj = Path(original_data_path)
    public_filename = (
        f"{original_path_obj.stem}_"
        f"{date}.zip"
    )
    public_path = f"{PUBLIC_GCS}open_data/{public_filename}"

    gdf = gpd.read_parquet(original_data_path)

    # Don't write directly to GCS bucket, because our path is not
    # exactly the same. In public bucket, we want to write it to a sub-directory.
    utils.make_zipped_shapefile(
        gdf,
        local_path = public_filename,
        gcs_folder = None
    )

    # Upload the locally written zip to the public bucket.
    fs.put(
        public_filename,
        public_path
    )

    print(f"Uploaded {public_path}")

    return
84+
85+
86+
if __name__ == "__main__":

    from shared_utils import rt_dates

    # Dates to publish; add more rt_dates keys here as needed.
    analysis_dates = [rt_dates.DATES["aug2023"]]

    for analysis_date in analysis_dates:
        write_to_public_gcs("ca_hq_transit_stops", analysis_date)

    # Clean up the zipped shapefiles left on the local Hub.
    remove_zipped_shapefiles()
98+
99+

rt_segment_speeds/scripts/publish_public_gcs.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
import pandas as pd
77

88
from dask import delayed, compute
9-
from segment_speed_utils.project_vars import SEGMENT_GCS, PUBLIC_GCS
9+
from segment_speed_utils import helpers
10+
from segment_speed_utils.project_vars import SEGMENT_GCS, PUBLIC_GCS, CONFIG_PATH
1011

1112
fs = gcsfs.GCSFileSystem()
1213

@@ -20,28 +21,38 @@ def concatenate_datasets_across_months(dataset_name: str) -> pd.DataFrame:
2021
) for d in list_of_files
2122
]
2223

24+
if "shape" in dataset_name:
25+
sort_cols = ["shape_id", "stop_sequence"]
26+
elif "route" in dataset_name:
27+
sort_cols = ["route_id", "direction_id", "stop_pair"]
28+
2329
df = (pd.concat(dfs,
2430
axis=0, ignore_index=True)
2531
.sort_values(["organization_name",
2632
"year", "month", "peak_offpeak", "weekday_weekend",
27-
"shape_id", "stop_sequence"])
33+
] + sort_cols)
2834
.reset_index(drop=True)
2935
)
3036

3137
return df
3238

3339
if __name__ == "__main__":
3440

41+
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")
42+
43+
3544
DATASETS = [
36-
"speeds_by_peak_daytype"
45+
STOP_SEG_DICT["shape_rollup"],
46+
STOP_SEG_DICT["route_direction_rollup"]
3747
]
3848

3949
for d in DATASETS:
50+
4051
start = datetime.datetime.now()
4152

4253
df = delayed(concatenate_datasets_across_months)(d)
4354
df = compute(df)[0]
4455
df.to_parquet(f"{PUBLIC_GCS}speeds/{d}.parquet")
4556

4657
end = datetime.datetime.now()
47-
print(f"save {d} to public GCS: {end - start}")
58+
print(f"save {d} to public GCS: {end - start}")

0 commit comments

Comments
 (0)