create batch of csvs per rpta for ntd ridership #979

Merged (5 commits) on Dec 22, 2023
2 changes: 2 additions & 0 deletions .gitattributes
@@ -1,4 +1,6 @@
portfolio/**/*.ipynb filter=lfs diff=lfs merge=lfs -text
portfolio/route_speeds/**/*.ipynb
portfolio/ntd_monthly_ridership/**/*.ipynb
portfolio/dla/district_10__district_title_district-10-stockton/0__dla_district_report__district_10__district_title_district-10-stockton.ipynb filter=lfs diff=lfs merge=lfs -text
portfolio/dla/district_11__district_title_district-11-san-diego/0__dla_district_report__district_11__district_title_district-11-san-diego.ipynb filter=lfs diff=lfs merge=lfs -text
portfolio/dla/district_12__district_title_district-12-irvine/0__dla_district_report__district_12__district_title_district-12-irvine.ipynb filter=lfs diff=lfs merge=lfs -text
3 changes: 3 additions & 0 deletions Makefile
@@ -44,6 +44,9 @@ install_env:
#cd rt_delay/ && make setup_rt_analysis && cd ..
cd rt_segment_speeds && pip install -r requirements.txt && cd ..

production_portfolio:
python portfolio/portfolio.py index --deploy --prod

# Create .egg to upload to dask cloud cluster
egg_modules:
cd ~/data-analyses/rt_segment_speeds && python setup.py bdist_egg && cd ..
15 changes: 15 additions & 0 deletions _shared_utils/shared_utils/rt_dates.py
@@ -61,3 +61,18 @@
"Q3_2023": "2023-07-12",
"Q4_2023": "2023-10-11",
}

MONTH_DICT = {
1: "January",
2: "February",
3: "March",
4: "April",
5: "May",
6: "June",
7: "July",
8: "August",
9: "September",
10: "October",
11: "November",
12: "December",
}
10 changes: 10 additions & 0 deletions ntd/README.md
@@ -0,0 +1,10 @@
# NTD Monthly Ridership by RTPA

Provide CalSTA with NTD Monthly Ridership for each regional transportation planning authority (RTPA).

This report shows general ridership trends by transit agency, mode, and type of service. It includes reported unlinked passenger trips (UPT) as well as the change from the same month in the prior year. For example, July 2023's change compares July 2023's reported values against July 2022's reported values.

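For context, the change columns in the processed data are computed along these lines (a minimal sketch mirroring `monthly_ridership_by_rtpa.py`; the `7/2023`-style column names follow the script's `month/year` pattern and the values here are made up):

```python
import pandas as pd

# Toy ridership table: unlinked passenger trips for July 2023 and July 2022
df = pd.DataFrame({"7/2023": [120_000, 80_000], "7/2022": [100_000, 90_000]})

# Raw year-over-year change
df["change_1yr_7/2023"] = df["7/2023"] - df["7/2022"]

# Percent change, taken relative to the current month's value (as in the script)
df["pct_change_1yr_7/2023"] = (
    (df["7/2023"] - df["7/2022"]).divide(df["7/2023"]).round(4)
)

print(df)
```
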
## Datasets
1. [NTD monthly data](https://www.transit.dot.gov/ntd/data-product/monthly-module-adjusted-data-release)
2. [RTPA list](https://gis.data.ca.gov/datasets/CAEnergy::regional-transportation-planning-agencies/explore?appid=cf412a17daaa47bca93c6d6b7e77aff0&edit=true)
3. Download our processed full data [here](https://storage.googleapis.com/calitp-publish-data-analysis/ntd_monthly_ridership/).
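
A minimal sketch of pulling one month's zipped CSVs from that public bucket (the `2023_October.zip` filename is a guess that follows the script's `{year}_{month}.zip` naming; a given month is only published when the public upload step is run):

```python
import io
import zipfile

import pandas as pd
import requests

# Hypothetical month; adjust to a zip that actually exists in the bucket
URL = (
    "https://storage.googleapis.com/calitp-publish-data-analysis/"
    "ntd_monthly_ridership/2023_October.zip"
)

resp = requests.get(URL)
resp.raise_for_status()

with zipfile.ZipFile(io.BytesIO(resp.content)) as z:
    print(z.namelist())  # one csv per RTPA
    with z.open(z.namelist()[0]) as f:
        df = pd.read_csv(f)
```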
185 changes: 185 additions & 0 deletions ntd/monthly_ridership_by_rtpa.py
@@ -0,0 +1,185 @@
"""
NTD Monthly Ridership by RTPA

1. Transit operators (`ntd_id`) in CA should be associated with RTPAs (use crosswalk uploaded in GCS)
2. For each RTPA, grab the latest month's ridership column, sort transit operators alphabetically, and write out spreadsheets.
3. Spreadsheets stored in folder to send to CalSTA.
"""
import gcsfs
import geopandas as gpd
import os
import pandas as pd
import shutil

from calitp_data_analysis.sql import to_snakecase
from segment_speed_utils.project_vars import PUBLIC_GCS
from shared_utils.rt_dates import MONTH_DICT
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS

fs = gcsfs.GCSFileSystem()

RTPA_URL = ("https://services3.arcgis.com/bWPjFyq029ChCGur/arcgis/rest/services/"
"RTPAs/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"
)

# One-off check: list the distinct RTPA names in the feature service
# gpd.read_file(RTPA_URL).RTPA.drop_duplicates().to_csv("rtpa.csv")

def add_change_columns(
df: pd.DataFrame,
year: int,
month: int
) -> pd.DataFrame:
"""
"""
ntd_month_col = f"{month}/{year}"
prior_year_col = f"{month}/{int(year)-1}"

df[f"change_1yr_{ntd_month_col}"] = df[ntd_month_col] - df[prior_year_col]
df = get_percent_change(df, ntd_month_col, prior_year_col)

return df


def get_percent_change(
df: pd.DataFrame,
current_col: str,
prior_col: str
) -> pd.DataFrame:
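    """
    Add a percent change column for current vs prior year,
    expressed as a share of the current month's value.
    """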

df[f"pct_change_1yr_{current_col}"] = (
(df[current_col] - df[prior_col])
.divide(df[current_col])
.round(4)
)

return df

def save_rtpa_outputs(
df: pd.DataFrame, year: int, month: str,
upload_to_public: bool = False
):
"""
Export a csv for each RTPA into a folder.
Zip that folder.
Upload zipped file to GCS.
"""
for i in df.RTPA.unique():
# Filename should be snakecase
rtpa_snakecase = i.replace(' ', '_').lower()

(df[df.RTPA == i]
.sort_values("NTD ID")
.drop(columns = "_merge")
.to_csv(
f"./{year}_{month}/{rtpa_snakecase}.csv",
index = False)
)

# Zip this folder, and save zipped output to GCS
shutil.make_archive(f"./{year}_{month}", "zip", f"{year}_{month}")
print("Zipped folder")

fs.upload(
f"./{year}_{month}.zip",
f"{GCS_FILE_PATH}{year}_{month}.zip"
)

if upload_to_public:
fs.upload(
f"./{year}_{month}.zip",
f"{PUBLIC_GCS}ntd_monthly_ridership/{year}_{month}.zip"
)

print("Uploaded to GCS")

return


def produce_ntd_monthly_ridership_by_rtpa(
upt_url: str,
year: int,
month: str
) -> pd.DataFrame:
"""
Import NTD data from url, filter to CA,
merge in crosswalk, and save individual csvs.
"""
# Import data, make sure NTD ID is string
full_upt = pd.read_excel(
upt_url, sheet_name = "UPT",
dtype = {"NTD ID": "str"}
)

full_upt = full_upt[full_upt.Agency.notna()].reset_index(drop=True)
full_upt.to_parquet(
f"{GCS_FILE_PATH}ntd_monthly_ridership_{year}_{month}.parquet"
)

# Filter to CA
ca = full_upt[(full_upt["UZA Name"].str.contains(", CA")) &
(full_upt.Agency.notna())].reset_index(drop=True)

crosswalk = pd.read_csv(
f"{GCS_FILE_PATH}ntd_id_rtpa_crosswalk.csv",
dtype = {"NTD ID": "str"}
)

df = pd.merge(
ca,
# Merging on too many columns can create problems
# because csvs and dtypes aren't stable / consistent
# for NTD ID, Legacy NTD ID, and UZA
crosswalk[["NTD ID", "RTPA"]],
on = "NTD ID",
how = "left",
indicator = True
)

print(df._merge.value_counts())

    # Check the merge: every CA row should match the crosswalk
if len(df[df._merge=="left_only"]) > 0:
raise ValueError("There are unmerged rows to crosswalk")

# Add new columns
reversed_months = {v:k for k, v in MONTH_DICT.items()}

for m in range(1, reversed_months[month] + 1):
df = add_change_columns(df, year, m)

df = df.assign(
Mode_full = df.Mode.map(NTD_MODES),
TOS_full = df.TOS.map(NTD_TOS)
)

return df


def remove_local_outputs(year: int, month: str):
shutil.rmtree(f"{year}_{month}/")
os.remove(f"{year}_{month}.zip")


if __name__ == "__main__":

# Define variables we'll probably change later
from update_vars import YEAR, MONTH, MONTH_CREATED

FULL_URL = (
"https://www.transit.dot.gov/sites/fta.dot.gov/files/"
f"{MONTH_CREATED}/{MONTH}%20{YEAR}%20"
"Complete%20Monthly%20Ridership%20%28with%20"
"adjustments%20and%20estimates%29.xlsx"
)

df = produce_ntd_monthly_ridership_by_rtpa(FULL_URL, YEAR, MONTH)
print(df.columns)
df.to_parquet(f"{GCS_FILE_PATH}ca_monthly_ridership_{YEAR}_{MONTH}.parquet")

# For each RTPA, we'll produce a single csv and save it to a local folder
os.makedirs(f"./{YEAR}_{MONTH}/")

df = pd.read_parquet(
f"{GCS_FILE_PATH}ca_monthly_ridership_{YEAR}_{MONTH}.parquet"
)
save_rtpa_outputs(df, YEAR, MONTH, upload_to_public = False)
remove_local_outputs(YEAR, MONTH)