From 1fef9b0cf6561e9ef9d5198fbc429278f78913a6 Mon Sep 17 00:00:00 2001 From: Eric Dasmalchi Date: Mon, 6 May 2024 22:12:00 +0000 Subject: [PATCH] slight tidy, use gcs --- .../read_process_data.ipynb | 719 +++++++++++++----- 1 file changed, 535 insertions(+), 184 deletions(-) diff --git a/sb125_analyses/vmt_transit_sketch/read_process_data.ipynb b/sb125_analyses/vmt_transit_sketch/read_process_data.ipynb index 634f5c544..2cc71e981 100644 --- a/sb125_analyses/vmt_transit_sketch/read_process_data.ipynb +++ b/sb125_analyses/vmt_transit_sketch/read_process_data.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "9e8158c2-a7f9-4b3c-a518-037132adf0c3", "metadata": {}, "outputs": [], @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "81cddca7-bea4-494d-b0cb-02508d52b380", "metadata": {}, "outputs": [], @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "37b679d4-f8bd-4450-bf9f-50b68e8570b4", "metadata": {}, "outputs": [], @@ -35,7 +35,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, + "id": "80324d88-ea3a-45a6-9362-933a2395ed31", + "metadata": {}, + "outputs": [], + "source": [ + "fs = get_fs()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3dcaf650-43fe-4532-9060-442b067ef173", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# ! pip install pygris" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "682268e8-78fc-4609-afc5-294f3c650b5e", "metadata": {}, "outputs": [], @@ -45,6 +67,16 @@ "# importlib.reload(_utils)" ] }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5aeca438-6285-4cca-b375-ab8aa3849e42", + "metadata": {}, + "outputs": [], + "source": [ + "GCS_PATH = 'gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/'" + ] + }, { "cell_type": "markdown", "id": "b076a21f-5a53-4b75-b140-0e4947099e42", @@ -57,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "85a89737-f90d-488f-9310-ca83557e476c", "metadata": { "tags": [] @@ -68,11 +100,22 @@ " '''\n", " zip_path: path to zip file containing a Replica trips export csv \n", " '''\n", - " with zipfile.ZipFile(zip_path) as z:\n", + " def parse_csv(zipfile):\n", " csvs = [f for f in z.namelist() if f[-3:] == 'csv']\n", " assert len(csvs) == 1\n", " with z.open(csvs[0]) as f:\n", " df = pd.read_csv(f)\n", + " return df\n", + " \n", + " if zip_path[:3] == 'gs:':\n", + " with fs.open(zip_path) as f:\n", + " with zipfile.ZipFile(f) as z:\n", + " df = parse_csv(z)\n", + " else:\n", + " with zipfile.ZipFile(f) as z:\n", + " df = parse_csv(z)\n", + " \n", + "\n", " df = (df >> filter(_.primary_mode.isin(['private_auto', 'auto_passenger', 'on_demand_auto', 'public_transit']))\n", " >> select(-_.origin_trct_2020, -_.activity_id)\n", " )\n", @@ -91,115 +134,159 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c506f859-1ffc-4e36-ba03-1c4393ba4d9e", + "execution_count": 9, + "id": "f0df73e2-7ebe-431f-b533-6139cc9b79c0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/replica_raw/corridors/replica-fresno-trips_dataset.zip'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "fresno_raw = read_group_replica(f'replica_raw/corridors/replica-fresno-trips_dataset.zip')" + "f'{GCS_PATH}replica_raw/corridors/replica-fresno-trips_dataset.zip'" ] }, { "cell_type": "code", - "execution_count": null, - "id": "51c4c2e4-91d8-47ef-b2a2-ca30c6e2b84f", - "metadata": {}, - "outputs": [], - "source": [ - "fresno_raw.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7122b15b-574a-4cf1-84ab-0af08f703e57", - "metadata": {}, - "outputs": [], - "source": [ - "(wilshire_raw >> filter(_.is_auto)).trip_distance_miles.hist()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9423782c-0b5a-474a-bf5a-99ae2e6812b3", - "metadata": {}, - "outputs": [], - "source": [ - "(wilshire_raw >> filter(_.is_auto, _.trip_distance_miles < 4)).trip_distance_miles.hist()" - ] - }, - { - "cell_type": "markdown", - "id": "a48da885-0c7c-41a6-be13-8364e7fdc48c", - "metadata": { - "jp-MarkdownHeadingCollapsed": true, - "tags": [] - }, - "source": [ - "## quick vmt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ec3470ac-5f0e-49a0-9000-f371f952bc74", - "metadata": {}, - "outputs": [], - "source": [ - "zip_path = f'replica_raw/replica-la_north-trips_dataset.zip'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "868a0ae4-b076-4775-beba-fdc9ba764b27", - "metadata": {}, - "outputs": [], - "source": [ - "replica_filename = 'replica-mode_split_test-02_01_24-trips_dataset.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4c08637-9bbc-4727-af6d-14dc1c66b4a1", + "execution_count": 10, + "id": "c506f859-1ffc-4e36-ba03-1c4393ba4d9e", "metadata": {}, "outputs": [], "source": [ - "with zipfile.ZipFile(zip_path) as z:\n", - " with z.open(replica_filename) as f:\n", - " df = pd.read_csv(f)" + "fresno_raw = read_group_replica(f'{GCS_PATH}replica_raw/corridors/replica-fresno-trips_dataset.zip')" ] }, { "cell_type": "code", - "execution_count": null, - "id": "5875f7f5-2c1f-40f6-95fa-0a8d106b1e7a", + "execution_count": 11, + "id": "51c4c2e4-91d8-47ef-b2a2-ca30c6e2b84f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_duration_minutestrip_distance_milesorigin_trct_fips_2020
count156196.000000156196.0000001.561960e+05
mean8.0273312.4894946.019003e+09
std6.0411162.3055811.731910e+03
min0.0000000.1000006.019000e+09
25%3.0000000.8000006.019002e+09
50%7.0000001.6000006.019004e+09
75%11.0000003.5000006.019005e+09
max93.00000017.8000006.019005e+09
\n", + "
" + ], + "text/plain": [ + " trip_duration_minutes trip_distance_miles origin_trct_fips_2020\n", + "count 156196.000000 156196.000000 1.561960e+05\n", + "mean 8.027331 2.489494 6.019003e+09\n", + "std 6.041116 2.305581 1.731910e+03\n", + "min 0.000000 0.100000 6.019000e+09\n", + "25% 3.000000 0.800000 6.019002e+09\n", + "50% 7.000000 1.600000 6.019004e+09\n", + "75% 11.000000 3.500000 6.019005e+09\n", + "max 93.000000 17.800000 6.019005e+09" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "miles_all = df.trip_distance_miles.sum()" + "fresno_raw.describe()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "01a823a1-de52-4d08-8b80-a024c1397f95", + "execution_count": 12, + "id": "7122b15b-574a-4cf1-84ab-0af08f703e57", "metadata": {}, "outputs": [], "source": [ - "shorter = (df >> filter(_.trip_distance_miles < _.trip_distance_miles.quantile(.95))).trip_distance_miles.sum()" + "# (wilshire_raw >> filter(_.is_auto)).trip_distance_miles.hist()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "132b4a03-9f73-4978-8a35-ae5c130f7f73", + "execution_count": 13, + "id": "9423782c-0b5a-474a-bf5a-99ae2e6812b3", "metadata": {}, "outputs": [], "source": [ - "shorter / miles_all" + "# (wilshire_raw >> filter(_.is_auto, _.trip_distance_miles < 4)).trip_distance_miles.hist()" ] }, { @@ -212,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "dae2c9bb-5121-4966-85ed-111bd525c924", "metadata": {}, "outputs": [], @@ -223,32 +310,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "15544ecf-1140-426f-be93-5ce53e2b5f7e", "metadata": {}, "outputs": [], "source": [ - "grouped = pd.DataFrame()\n", - "for region in ['eureka']:\n", - " print(region)\n", - " # note replica filename includes date of download...\n", - " df = read_group_replica(f'replica_raw/corridors/replica-{region}-trips_dataset.zip')\n", - " grouped = pd.concat([grouped, df])" + "# grouped = pd.DataFrame()\n", + "# for region in ['eureka']:\n", + "# print(region)\n", + "# # note replica filename includes date of download...\n", + "# df = read_group_replica(f'replica_raw/corridors/replica-{region}-trips_dataset.zip')\n", + "# grouped = pd.concat([grouped, df])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "1878b238-eec9-450c-bf3e-359016485495", "metadata": {}, "outputs": [], "source": [ - "grouped.to_parquet('intermediate/eureka_grouped.parquet')" + "# grouped.to_parquet('intermediate/eureka_grouped.parquet')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "9fb929ae-0c66-4dfb-9a0e-604bfedef078", "metadata": { "tags": [] @@ -277,39 +364,79 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "240ddd8f-6a36-44e1-a1c0-32e6f50c3cac", "metadata": {}, "outputs": [], "source": [ "# grouped = pd.DataFrame()\n", "# for region in all_regions:\n", - "# grouped = pd.concat([grouped, pd.read_parquet(f'intermediate/replica-{region}-trips_dataset.parquet')])" + "# grouped = pd.concat([grouped, pd.read_parquet(f'{GCS_PATH}intermediate/replica-{region}-trips_dataset.parquet')])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, + "id": "0de2a7b3-f1f1-43df-8e7e-2cf4b416378c", + "metadata": {}, + "outputs": [], + "source": [ + "# grouped" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "f1867025-ad72-4ff5-931c-30982a65f0af", + "metadata": {}, + "outputs": [], + "source": [ + "from calitp_data_analysis import utils" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "79dafb24-bebd-4ae6-91d4-8d8e80983c76", + "metadata": {}, + "outputs": [], + "source": [ + "# utils.geoparquet_gcs_export?" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "56e5f48c-649f-4e8f-8ef2-49ce8521cf2c", + "metadata": {}, + "outputs": [], + "source": [ + "# utils.geoparquet_gcs_export(grouped, f'{GCS_PATH}intermediate/', 'replica_grouped')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "id": "7596b001-2c1c-488f-b089-c375c0ddff4f", "metadata": {}, "outputs": [], "source": [ - "# replica_grouped = pd.read_parquet('intermediate/replica_grouped.parquet')" + "replica_grouped = pd.read_parquet(f'{GCS_PATH}intermediate/replica_grouped.parquet')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "23b9ae5b-bd02-4978-b7d1-3e4b49de53b5", "metadata": {}, "outputs": [], "source": [ - "tracts_feeds = gpd.read_parquet('intermediate/feeds_tract_geo.parquet')" + "tracts_feeds = gpd.read_parquet(f'{GCS_PATH}intermediate/feeds_tract_geo.parquet')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "73088513-9710-4c9a-a205-d531c3807345", "metadata": {}, "outputs": [], @@ -319,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "61ce4f74-45e9-4c85-ad34-d2ac9c5cfc32", "metadata": {}, "outputs": [], @@ -329,37 +456,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "3f9512da-00e9-4768-a79c-8570c4a0bec2", "metadata": {}, "outputs": [], "source": [ - "replica_grouped = pd.read_parquet(f'intermediate/wilshire_grouped.parquet')" + "# replica_grouped = pd.read_parquet(f'intermediate/wilshire_grouped.parquet')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "240b6b96-ea68-49f2-8509-883af8f0c36b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "11288.75" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "18062 / 1.6" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "fa0e75af-11d9-4051-8b0f-3a8d95c166be", "metadata": {}, "outputs": [], "source": [ - "(replica_grouped >> filter(_.is_auto))." + "# (replica_grouped >> filter(_.is_auto))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "06f007da-ad8d-4801-8595-85c5d44fe921", "metadata": {}, "outputs": [], @@ -414,42 +552,198 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "f4b313d7-62e5-46b4-ab93-6b25d176bdeb", - "metadata": {}, - "outputs": [], - "source": [ - "# replica_grouped = pd.read_parquet('intermediate/wilshire_grouped.parquet')" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "d661657b-cac7-4e9a-806d-48579c0516f9", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.2778337212327877\n" + ] + } + ], "source": [ - "# processed_df = process_grouped_data(replica_grouped, tracts_feeds)" + "processed_df = process_grouped_data(replica_grouped, tracts_feeds)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "50f04f86-6f13-4654-952e-58dc07a23faa", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
origin_trct_fips_2020p50_mi_transitp50_mi_autototal_mi_transittotal_mi_autototal_trips_autop50_transit_longerno_transit_replicano_transittotal_minew_transit_miprojected_new_transit_trips
060014001004.26.95185.3165410.853636FalseFalseFalse170596.145956.69809610942.0
160014002004.13.43638.158763.428600TrueFalseFalse62401.516326.4540943982.0
260014003004.73.76130.7122736.557740TrueFalseFalse128867.234100.3385267255.0
\n", + "
" + ], + "text/plain": [ + " origin_trct_fips_2020 p50_mi_transit p50_mi_auto total_mi_transit \\\n", + "0 6001400100 4.2 6.9 5185.3 \n", + "1 6001400200 4.1 3.4 3638.1 \n", + "2 6001400300 4.7 3.7 6130.7 \n", + "\n", + " total_mi_auto total_trips_auto p50_transit_longer no_transit_replica \\\n", + "0 165410.8 53636 False False \n", + "1 58763.4 28600 True False \n", + "2 122736.5 57740 True False \n", + "\n", + " no_transit total_mi new_transit_mi projected_new_transit_trips \n", + "0 False 170596.1 45956.698096 10942.0 \n", + "1 False 62401.5 16326.454094 3982.0 \n", + "2 False 128867.2 34100.338526 7255.0 " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "processed_df >> head(3)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "8169ca83-e540-48f4-af93-076a535f00c6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
no_transittotal_mi_auto
0False1.162508e+09
1True1.294277e+08
\n", + "
" + ], + "text/plain": [ + " no_transit total_mi_auto\n", + "0 False 1.162508e+09\n", + "1 True 1.294277e+08" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "no_transit = processed_df >> group_by(_.no_transit) >> summarize(total_mi_auto = _.total_mi_auto.sum())\n", "no_transit" @@ -457,7 +751,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "c7175031-a66c-4f90-8dca-193198b9d932", "metadata": {}, "outputs": [], @@ -468,32 +762,94 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "6476da44-36ac-4604-89d5-76966eb9411d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True 6571\n", + "False 2534\n", + "Name: p50_transit_longer, dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "processed_df.p50_transit_longer.value_counts()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, + "id": "70bdaf9e-397b-46d4-bc0a-a73722cf8591", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using FIPS code '06' for input 'CA'\n" + ] + } + ], + "source": [ + "tract_geo = _utils.get_tract_geoms()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "c10b8a4f-b3de-4494-81b7-942c2b230c30", + "metadata": {}, + "outputs": [], + "source": [ + "ca_uzas = gpd.read_parquet(f'{GCS_PATH}intermediate/ca_uza.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "8d2f3dec-892d-4ffc-85fb-8e8866a51f42", + "metadata": {}, + "outputs": [], + "source": [ + "# census_pop = gpd.read_file(f'./census_ntd/DECENNIALPL2020.P1_2024-02-01T163251.zip')\n", + "\n", + "# census_cleaned = census_pop.iloc[2:,:][['GEO_ID', 'P1_001N']]\n", + "\n", + "# census_cleaned.to_parquet(f'{GCS_PATH}census_ntd/DECENNIALPL2020.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "d7b17b29-29b2-40f3-9038-c3b0b75f39f8", + "metadata": {}, + "outputs": [], + "source": [ + "census_cleaned = pd.read_parquet(f'{GCS_PATH}census_ntd/DECENNIALPL2020.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 61, "id": "8995132b-5eba-48e8-bfdb-585f1ff1e9d8", "metadata": {}, "outputs": [], "source": [ - "def attach_tracts_pop(processed_df):\n", + "def attach_tracts_pop(processed_df, tract_geo, ca_uzas, census_cleaned):\n", " \n", - " tract_geo = _utils.get_tract_geoms()\n", " tract_geo.GEOID = tract_geo.GEOID.astype('int64')\n", " gdf = (tract_geo >> inner_join(_, processed_df, on = {'GEOID': 'origin_trct_fips_2020'})\n", " >> select(-_.origin_trct_fips_2020))\n", - " ca_uzas = gpd.read_parquet('intermediate/ca_uza.parquet')\n", " uza_joined = gpd.sjoin(gdf, ca_uzas, how = 'left')\n", " \n", - " census_pop = gpd.read_file('census_ntd/DECENNIALPL2020.P1_2024-02-01T163251.zip')\n", - " tract_pop = census_pop[['GEO_ID', 'P1_001N']].iloc[2:,:]\n", + " tract_pop = census_cleaned\n", " tract_pop.GEO_ID = tract_pop.GEO_ID.map(lambda x: x.split('US')[1])\n", " tract_pop.GEO_ID = tract_pop.GEO_ID.astype('int64')\n", " tract_pop = tract_pop >> select(_.total_pop == _.P1_001N, _.GEOID == _.GEO_ID)\n", @@ -506,12 +862,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "id": "81f381a4-bb80-4fb7-97fa-853111914712", "metadata": {}, "outputs": [], "source": [ - "uza_joined = attach_tracts_pop(processed_df)" + "uza_joined = attach_tracts_pop(processed_df, tract_geo, ca_uzas, census_cleaned)" ] }, { @@ -523,22 +879,12 @@ }, "outputs": [], "source": [ - "for region in ['sanpablo', 'eureka', 'fresno', 'wilshire']:\n", - " replica_grouped = pd.read_parquet(f'intermediate/{region}_grouped.parquet')\n", - " processed_df = process_grouped_data(replica_grouped, tracts_feeds)\n", - " uza_joined = attach_tracts_pop(processed_df)\n", - " display(uza_joined >> head(3))\n", - " uza_joined.to_parquet(f'outputs/{region}_trips_with_uza.parquet')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "addb6ac5-5f4d-41dc-9a7a-0d6644bb699c", - "metadata": {}, - "outputs": [], - "source": [ - "# uza_joined.to_parquet('outputs/new_trips_with_uza.parquet')" + "# for region in ['sanpablo', 'eureka', 'fresno', 'wilshire']:\n", + "# replica_grouped = pd.read_parquet(f'intermediate/{region}_grouped.parquet')\n", + "# processed_df = process_grouped_data(replica_grouped, tracts_feeds)\n", + "# uza_joined = attach_tracts_pop(processed_df)\n", + "# display(uza_joined >> head(3))\n", + "# uza_joined.to_parquet(f'outputs/{region}_trips_with_uza.parquet')" ] }, { @@ -552,70 +898,75 @@ ] }, { - "cell_type": "markdown", - "id": "63e712e1-900d-430e-962e-72ed8dd30bbf", + "cell_type": "code", + "execution_count": 68, + "id": "6ed72b61-2c14-4eb2-a188-00384f52082a", "metadata": {}, + "outputs": [], "source": [ - "## Quick GCS Upload" + "uza_joined = (uza_joined\n", + " >> select(-_.index_right)\n", + " )" ] }, { "cell_type": "code", - "execution_count": null, - "id": "b0572b17-a9c2-4128-ab67-fa650c87fda0", + "execution_count": 71, + "id": "029334e4-be41-4f93-b95d-67687bd2d70e", "metadata": {}, "outputs": [], "source": [ - "fs = get_fs()" + "uza_joined['vmt_quantile'] = pd.qcut(uza_joined.total_mi_auto, 4, labels = ['p25', 'p50', 'p75', 'p100'])" ] }, { "cell_type": "code", - "execution_count": null, - "id": "76539b31-f757-4703-9f7a-2eea60834d06", + "execution_count": 72, + "id": "8f1b132b-8f81-4923-9efe-92f3162ee64c", "metadata": {}, "outputs": [], "source": [ - "_utils.GCS_PATH" + "!mkdir export" ] }, { "cell_type": "code", - "execution_count": null, - "id": "1518eea5-d2f1-4807-b392-4a4f2624439d", + "execution_count": 75, + "id": "ec2cc1b4-002b-498a-b1f0-055f5252fb70", "metadata": {}, "outputs": [], "source": [ - "lpath = 'replica_raw/'" + "uza_joined.vmt_quantile = uza_joined.vmt_quantile.astype(str)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "e0048f15-f124-431d-9fae-35aa7ef3dd72", + "execution_count": 79, + "id": "354a6bea-fbc5-49d7-b531-2597f240510e", "metadata": {}, "outputs": [], "source": [ - "fs.put(lpath, _utils.GCS_PATH + lpath, recursive=True)" + "utils.geoparquet_gcs_export(uza_joined, f'{GCS_PATH}outputs/', 'new_trips_with_uza')" ] }, { - "cell_type": "markdown", - "id": "b27a8e5c-9d03-435c-bd5c-8a5de83a6d88", + "cell_type": "code", + "execution_count": 77, + "id": "30983d03-5c50-4480-85db-ed6666e34bd8", "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_532/1657757500.py:1: UserWarning: Column names longer than 10 characters will be truncated when saved to ESRI Shapefile.\n", + " uza_joined.to_file('./export/vmt_with_quantiles.shp')\n" + ] + } + ], "source": [ - "## Pulling _corridor_ level data\n", - "\n", - "* First, get corridor geoms" + "uza_joined.to_file('./export/vmt_with_quantiles.shp')" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1a5d9b48-6f90-4e85-87e8-aa4c94d01aa3", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {