From 1fef9b0cf6561e9ef9d5198fbc429278f78913a6 Mon Sep 17 00:00:00 2001
From: Eric Dasmalchi <edasmalchi@gmail.com>
Date: Mon, 6 May 2024 22:12:00 +0000
Subject: [PATCH] slight tidy, use gcs

---
 .../read_process_data.ipynb                   | 719 +++++++++++++-----
 1 file changed, 535 insertions(+), 184 deletions(-)

diff --git a/sb125_analyses/vmt_transit_sketch/read_process_data.ipynb b/sb125_analyses/vmt_transit_sketch/read_process_data.ipynb
index 634f5c544..2cc71e981 100644
--- a/sb125_analyses/vmt_transit_sketch/read_process_data.ipynb
+++ b/sb125_analyses/vmt_transit_sketch/read_process_data.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "9e8158c2-a7f9-4b3c-a518-037132adf0c3",
    "metadata": {},
    "outputs": [],
@@ -15,7 +15,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "81cddca7-bea4-494d-b0cb-02508d52b380",
    "metadata": {},
    "outputs": [],
@@ -25,7 +25,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "37b679d4-f8bd-4450-bf9f-50b68e8570b4",
    "metadata": {},
    "outputs": [],
@@ -35,7 +35,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
+   "id": "80324d88-ea3a-45a6-9362-933a2395ed31",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fs = get_fs()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "3dcaf650-43fe-4532-9060-442b067ef173",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# ! pip install pygris"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
    "id": "682268e8-78fc-4609-afc5-294f3c650b5e",
    "metadata": {},
    "outputs": [],
@@ -45,6 +67,16 @@
     "# importlib.reload(_utils)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "5aeca438-6285-4cca-b375-ab8aa3849e42",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GCS_PATH = 'gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/'"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "b076a21f-5a53-4b75-b140-0e4947099e42",
@@ -57,7 +89,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "id": "85a89737-f90d-488f-9310-ca83557e476c",
    "metadata": {
     "tags": []
@@ -68,11 +100,22 @@
     "    '''\n",
     "    zip_path: path to zip file containing a Replica trips export csv    \n",
     "    '''\n",
-    "    with zipfile.ZipFile(zip_path) as z:\n",
+    "    def parse_csv(z):  # z: an open zipfile.ZipFile; use the argument rather than a closure variable\n",
     "        csvs = [f for f in z.namelist() if f[-3:] == 'csv']\n",
     "        assert len(csvs) == 1\n",
     "        with z.open(csvs[0]) as f:\n",
     "            df = pd.read_csv(f)\n",
+    "        return df\n",
+    "    \n",
+    "    if zip_path[:3] == 'gs:':  # GCS object: stream it through the fs handle\n",
+    "        with fs.open(zip_path) as f:\n",
+    "            with zipfile.ZipFile(f) as z:\n",
+    "                df = parse_csv(z)\n",
+    "    else:  # anything else: let ZipFile open zip_path itself (f is unbound on this branch)\n",
+    "        with zipfile.ZipFile(zip_path) as z:\n",
+    "            df = parse_csv(z)\n",
+    "            \n",
+    "\n",
     "    df = (df >> filter(_.primary_mode.isin(['private_auto', 'auto_passenger', 'on_demand_auto', 'public_transit']))\n",
     "     >> select(-_.origin_trct_2020, -_.activity_id)\n",
     "         )\n",
@@ -91,115 +134,159 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "c506f859-1ffc-4e36-ba03-1c4393ba4d9e",
+   "execution_count": 9,
+   "id": "f0df73e2-7ebe-431f-b533-6139cc9b79c0",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/replica_raw/corridors/replica-fresno-trips_dataset.zip'"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "fresno_raw = read_group_replica(f'replica_raw/corridors/replica-fresno-trips_dataset.zip')"
+    "f'{GCS_PATH}replica_raw/corridors/replica-fresno-trips_dataset.zip'"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "51c4c2e4-91d8-47ef-b2a2-ca30c6e2b84f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fresno_raw.describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7122b15b-574a-4cf1-84ab-0af08f703e57",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "(wilshire_raw >> filter(_.is_auto)).trip_distance_miles.hist()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9423782c-0b5a-474a-bf5a-99ae2e6812b3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "(wilshire_raw >> filter(_.is_auto, _.trip_distance_miles < 4)).trip_distance_miles.hist()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a48da885-0c7c-41a6-be13-8364e7fdc48c",
-   "metadata": {
-    "jp-MarkdownHeadingCollapsed": true,
-    "tags": []
-   },
-   "source": [
-    "## quick vmt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ec3470ac-5f0e-49a0-9000-f371f952bc74",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "zip_path = f'replica_raw/replica-la_north-trips_dataset.zip'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "868a0ae4-b076-4775-beba-fdc9ba764b27",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "replica_filename = 'replica-mode_split_test-02_01_24-trips_dataset.csv'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b4c08637-9bbc-4727-af6d-14dc1c66b4a1",
+   "execution_count": 10,
+   "id": "c506f859-1ffc-4e36-ba03-1c4393ba4d9e",
    "metadata": {},
    "outputs": [],
    "source": [
-    "with zipfile.ZipFile(zip_path) as z:\n",
-    "    with z.open(replica_filename) as f:\n",
-    "        df = pd.read_csv(f)"
+    "fresno_raw = read_group_replica(f'{GCS_PATH}replica_raw/corridors/replica-fresno-trips_dataset.zip')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "5875f7f5-2c1f-40f6-95fa-0a8d106b1e7a",
+   "execution_count": 11,
+   "id": "51c4c2e4-91d8-47ef-b2a2-ca30c6e2b84f",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>trip_duration_minutes</th>\n",
+       "      <th>trip_distance_miles</th>\n",
+       "      <th>origin_trct_fips_2020</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>156196.000000</td>\n",
+       "      <td>156196.000000</td>\n",
+       "      <td>1.561960e+05</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>8.027331</td>\n",
+       "      <td>2.489494</td>\n",
+       "      <td>6.019003e+09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>6.041116</td>\n",
+       "      <td>2.305581</td>\n",
+       "      <td>1.731910e+03</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.100000</td>\n",
+       "      <td>6.019000e+09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>3.000000</td>\n",
+       "      <td>0.800000</td>\n",
+       "      <td>6.019002e+09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>7.000000</td>\n",
+       "      <td>1.600000</td>\n",
+       "      <td>6.019004e+09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>11.000000</td>\n",
+       "      <td>3.500000</td>\n",
+       "      <td>6.019005e+09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>93.000000</td>\n",
+       "      <td>17.800000</td>\n",
+       "      <td>6.019005e+09</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       trip_duration_minutes  trip_distance_miles  origin_trct_fips_2020\n",
+       "count          156196.000000        156196.000000           1.561960e+05\n",
+       "mean                8.027331             2.489494           6.019003e+09\n",
+       "std                 6.041116             2.305581           1.731910e+03\n",
+       "min                 0.000000             0.100000           6.019000e+09\n",
+       "25%                 3.000000             0.800000           6.019002e+09\n",
+       "50%                 7.000000             1.600000           6.019004e+09\n",
+       "75%                11.000000             3.500000           6.019005e+09\n",
+       "max                93.000000            17.800000           6.019005e+09"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "miles_all = df.trip_distance_miles.sum()"
+    "fresno_raw.describe()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "01a823a1-de52-4d08-8b80-a024c1397f95",
+   "execution_count": 12,
+   "id": "7122b15b-574a-4cf1-84ab-0af08f703e57",
    "metadata": {},
    "outputs": [],
    "source": [
-    "shorter = (df >> filter(_.trip_distance_miles < _.trip_distance_miles.quantile(.95))).trip_distance_miles.sum()"
+    "# (wilshire_raw >> filter(_.is_auto)).trip_distance_miles.hist()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "132b4a03-9f73-4978-8a35-ae5c130f7f73",
+   "execution_count": 13,
+   "id": "9423782c-0b5a-474a-bf5a-99ae2e6812b3",
    "metadata": {},
    "outputs": [],
    "source": [
-    "shorter / miles_all"
+    "# (wilshire_raw >> filter(_.is_auto, _.trip_distance_miles < 4)).trip_distance_miles.hist()"
    ]
   },
   {
@@ -212,7 +299,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "id": "dae2c9bb-5121-4966-85ed-111bd525c924",
    "metadata": {},
    "outputs": [],
@@ -223,32 +310,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "id": "15544ecf-1140-426f-be93-5ce53e2b5f7e",
    "metadata": {},
    "outputs": [],
    "source": [
-    "grouped = pd.DataFrame()\n",
-    "for region in ['eureka']:\n",
-    "    print(region)\n",
-    "    #  note replica filename includes date of download...\n",
-    "    df = read_group_replica(f'replica_raw/corridors/replica-{region}-trips_dataset.zip')\n",
-    "    grouped = pd.concat([grouped, df])"
+    "# grouped = pd.DataFrame()\n",
+    "# for region in ['eureka']:\n",
+    "#     print(region)\n",
+    "#     #  note replica filename includes date of download...\n",
+    "#     df = read_group_replica(f'replica_raw/corridors/replica-{region}-trips_dataset.zip')\n",
+    "#     grouped = pd.concat([grouped, df])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
    "id": "1878b238-eec9-450c-bf3e-359016485495",
    "metadata": {},
    "outputs": [],
    "source": [
-    "grouped.to_parquet('intermediate/eureka_grouped.parquet')"
+    "# grouped.to_parquet('intermediate/eureka_grouped.parquet')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
    "id": "9fb929ae-0c66-4dfb-9a0e-604bfedef078",
    "metadata": {
     "tags": []
@@ -277,39 +364,79 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "id": "240ddd8f-6a36-44e1-a1c0-32e6f50c3cac",
    "metadata": {},
    "outputs": [],
    "source": [
     "# grouped = pd.DataFrame()\n",
     "# for region in all_regions:\n",
-    "#     grouped = pd.concat([grouped, pd.read_parquet(f'intermediate/replica-{region}-trips_dataset.parquet')])"
+    "#     grouped = pd.concat([grouped, pd.read_parquet(f'{GCS_PATH}intermediate/replica-{region}-trips_dataset.parquet')])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
+   "id": "0de2a7b3-f1f1-43df-8e7e-2cf4b416378c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# grouped"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "f1867025-ad72-4ff5-931c-30982a65f0af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from calitp_data_analysis import utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "79dafb24-bebd-4ae6-91d4-8d8e80983c76",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# utils.geoparquet_gcs_export?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "56e5f48c-649f-4e8f-8ef2-49ce8521cf2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# utils.geoparquet_gcs_export(grouped, f'{GCS_PATH}intermediate/', 'replica_grouped')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
    "id": "7596b001-2c1c-488f-b089-c375c0ddff4f",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# replica_grouped = pd.read_parquet('intermediate/replica_grouped.parquet')"
+    "replica_grouped = pd.read_parquet(f'{GCS_PATH}intermediate/replica_grouped.parquet')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 24,
    "id": "23b9ae5b-bd02-4978-b7d1-3e4b49de53b5",
    "metadata": {},
    "outputs": [],
    "source": [
-    "tracts_feeds = gpd.read_parquet('intermediate/feeds_tract_geo.parquet')"
+    "tracts_feeds = gpd.read_parquet(f'{GCS_PATH}intermediate/feeds_tract_geo.parquet')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 25,
    "id": "73088513-9710-4c9a-a205-d531c3807345",
    "metadata": {},
    "outputs": [],
@@ -319,7 +446,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 26,
    "id": "61ce4f74-45e9-4c85-ad34-d2ac9c5cfc32",
    "metadata": {},
    "outputs": [],
@@ -329,37 +456,48 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 27,
    "id": "3f9512da-00e9-4768-a79c-8570c4a0bec2",
    "metadata": {},
    "outputs": [],
    "source": [
-    "replica_grouped = pd.read_parquet(f'intermediate/wilshire_grouped.parquet')"
+    "# replica_grouped = pd.read_parquet(f'intermediate/wilshire_grouped.parquet')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 28,
    "id": "240b6b96-ea68-49f2-8509-883af8f0c36b",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "11288.75"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "18062 / 1.6"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 29,
    "id": "fa0e75af-11d9-4051-8b0f-3a8d95c166be",
    "metadata": {},
    "outputs": [],
    "source": [
-    "(replica_grouped >> filter(_.is_auto))."
+    "# (replica_grouped >> filter(_.is_auto))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 30,
    "id": "06f007da-ad8d-4801-8595-85c5d44fe921",
    "metadata": {},
    "outputs": [],
@@ -414,42 +552,198 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "f4b313d7-62e5-46b4-ab93-6b25d176bdeb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# replica_grouped = pd.read_parquet('intermediate/wilshire_grouped.parquet')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 31,
    "id": "d661657b-cac7-4e9a-806d-48579c0516f9",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.2778337212327877\n"
+     ]
+    }
+   ],
    "source": [
-    "# processed_df = process_grouped_data(replica_grouped, tracts_feeds)"
+    "processed_df = process_grouped_data(replica_grouped, tracts_feeds)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 32,
    "id": "50f04f86-6f13-4654-952e-58dc07a23faa",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>origin_trct_fips_2020</th>\n",
+       "      <th>p50_mi_transit</th>\n",
+       "      <th>p50_mi_auto</th>\n",
+       "      <th>total_mi_transit</th>\n",
+       "      <th>total_mi_auto</th>\n",
+       "      <th>total_trips_auto</th>\n",
+       "      <th>p50_transit_longer</th>\n",
+       "      <th>no_transit_replica</th>\n",
+       "      <th>no_transit</th>\n",
+       "      <th>total_mi</th>\n",
+       "      <th>new_transit_mi</th>\n",
+       "      <th>projected_new_transit_trips</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>6001400100</td>\n",
+       "      <td>4.2</td>\n",
+       "      <td>6.9</td>\n",
+       "      <td>5185.3</td>\n",
+       "      <td>165410.8</td>\n",
+       "      <td>53636</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>170596.1</td>\n",
+       "      <td>45956.698096</td>\n",
+       "      <td>10942.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>6001400200</td>\n",
+       "      <td>4.1</td>\n",
+       "      <td>3.4</td>\n",
+       "      <td>3638.1</td>\n",
+       "      <td>58763.4</td>\n",
+       "      <td>28600</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>62401.5</td>\n",
+       "      <td>16326.454094</td>\n",
+       "      <td>3982.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>6001400300</td>\n",
+       "      <td>4.7</td>\n",
+       "      <td>3.7</td>\n",
+       "      <td>6130.7</td>\n",
+       "      <td>122736.5</td>\n",
+       "      <td>57740</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>128867.2</td>\n",
+       "      <td>34100.338526</td>\n",
+       "      <td>7255.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   origin_trct_fips_2020  p50_mi_transit  p50_mi_auto  total_mi_transit  \\\n",
+       "0             6001400100             4.2          6.9            5185.3   \n",
+       "1             6001400200             4.1          3.4            3638.1   \n",
+       "2             6001400300             4.7          3.7            6130.7   \n",
+       "\n",
+       "   total_mi_auto  total_trips_auto  p50_transit_longer  no_transit_replica  \\\n",
+       "0       165410.8             53636               False               False   \n",
+       "1        58763.4             28600                True               False   \n",
+       "2       122736.5             57740                True               False   \n",
+       "\n",
+       "   no_transit  total_mi  new_transit_mi  projected_new_transit_trips  \n",
+       "0       False  170596.1    45956.698096                      10942.0  \n",
+       "1       False   62401.5    16326.454094                       3982.0  \n",
+       "2       False  128867.2    34100.338526                       7255.0  "
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "processed_df >> head(3)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 33,
    "id": "8169ca83-e540-48f4-af93-076a535f00c6",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>no_transit</th>\n",
+       "      <th>total_mi_auto</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>False</td>\n",
+       "      <td>1.162508e+09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>True</td>\n",
+       "      <td>1.294277e+08</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   no_transit  total_mi_auto\n",
+       "0       False   1.162508e+09\n",
+       "1        True   1.294277e+08"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "no_transit = processed_df >> group_by(_.no_transit) >> summarize(total_mi_auto = _.total_mi_auto.sum())\n",
     "no_transit"
@@ -457,7 +751,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 34,
    "id": "c7175031-a66c-4f90-8dca-193198b9d932",
    "metadata": {},
    "outputs": [],
@@ -468,32 +762,94 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 35,
    "id": "6476da44-36ac-4604-89d5-76966eb9411d",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True     6571\n",
+       "False    2534\n",
+       "Name: p50_transit_longer, dtype: int64"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "processed_df.p50_transit_longer.value_counts()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 36,
+   "id": "70bdaf9e-397b-46d4-bc0a-a73722cf8591",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using FIPS code '06' for input 'CA'\n"
+     ]
+    }
+   ],
+   "source": [
+    "tract_geo = _utils.get_tract_geoms()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "c10b8a4f-b3de-4494-81b7-942c2b230c30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ca_uzas = gpd.read_parquet(f'{GCS_PATH}intermediate/ca_uza.parquet')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "8d2f3dec-892d-4ffc-85fb-8e8866a51f42",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# census_pop = gpd.read_file(f'./census_ntd/DECENNIALPL2020.P1_2024-02-01T163251.zip')\n",
+    "\n",
+    "# census_cleaned = census_pop.iloc[2:,:][['GEO_ID', 'P1_001N']]\n",
+    "\n",
+    "# census_cleaned.to_parquet(f'{GCS_PATH}census_ntd/DECENNIALPL2020.parquet')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "d7b17b29-29b2-40f3-9038-c3b0b75f39f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "census_cleaned = pd.read_parquet(f'{GCS_PATH}census_ntd/DECENNIALPL2020.parquet')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
    "id": "8995132b-5eba-48e8-bfdb-585f1ff1e9d8",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def attach_tracts_pop(processed_df):\n",
+    "def attach_tracts_pop(processed_df, tract_geo, ca_uzas, census_cleaned):\n",
     "    \n",
-    "    tract_geo = _utils.get_tract_geoms()\n",
     "    tract_geo.GEOID = tract_geo.GEOID.astype('int64')\n",
     "    gdf = (tract_geo >> inner_join(_, processed_df, on = {'GEOID': 'origin_trct_fips_2020'})\n",
     "                     >> select(-_.origin_trct_fips_2020))\n",
-    "    ca_uzas = gpd.read_parquet('intermediate/ca_uza.parquet')\n",
     "    uza_joined = gpd.sjoin(gdf, ca_uzas, how = 'left')\n",
     "    \n",
-    "    census_pop = gpd.read_file('census_ntd/DECENNIALPL2020.P1_2024-02-01T163251.zip')\n",
-    "    tract_pop = census_pop[['GEO_ID', 'P1_001N']].iloc[2:,:]\n",
+    "    tract_pop = census_cleaned.copy()  # copy: GEO_ID is rewritten below; don't mutate the caller's frame\n",
     "    tract_pop.GEO_ID = tract_pop.GEO_ID.map(lambda x: x.split('US')[1])\n",
     "    tract_pop.GEO_ID = tract_pop.GEO_ID.astype('int64')\n",
     "    tract_pop = tract_pop >> select(_.total_pop == _.P1_001N, _.GEOID == _.GEO_ID)\n",
@@ -506,12 +862,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 63,
    "id": "81f381a4-bb80-4fb7-97fa-853111914712",
    "metadata": {},
    "outputs": [],
    "source": [
-    "uza_joined = attach_tracts_pop(processed_df)"
+    "uza_joined = attach_tracts_pop(processed_df, tract_geo, ca_uzas, census_cleaned)"
    ]
   },
   {
@@ -523,22 +879,12 @@
    },
    "outputs": [],
    "source": [
-    "for region in ['sanpablo', 'eureka', 'fresno', 'wilshire']:\n",
-    "    replica_grouped = pd.read_parquet(f'intermediate/{region}_grouped.parquet')\n",
-    "    processed_df = process_grouped_data(replica_grouped, tracts_feeds)\n",
-    "    uza_joined = attach_tracts_pop(processed_df)\n",
-    "    display(uza_joined >> head(3))\n",
-    "    uza_joined.to_parquet(f'outputs/{region}_trips_with_uza.parquet')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "addb6ac5-5f4d-41dc-9a7a-0d6644bb699c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# uza_joined.to_parquet('outputs/new_trips_with_uza.parquet')"
+    "# for region in ['sanpablo', 'eureka', 'fresno', 'wilshire']:\n",
+    "#     replica_grouped = pd.read_parquet(f'intermediate/{region}_grouped.parquet')\n",
+    "#     processed_df = process_grouped_data(replica_grouped, tracts_feeds)\n",
+    "#     uza_joined = attach_tracts_pop(processed_df)\n",
+    "#     display(uza_joined >> head(3))\n",
+    "#     uza_joined.to_parquet(f'outputs/{region}_trips_with_uza.parquet')"
    ]
   },
   {
@@ -552,70 +898,75 @@
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "63e712e1-900d-430e-962e-72ed8dd30bbf",
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "6ed72b61-2c14-4eb2-a188-00384f52082a",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "## Quick GCS Upload"
+    "uza_joined = (uza_joined\n",
+    "              >> select(-_.index_right)\n",
+    "              )"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "b0572b17-a9c2-4128-ab67-fa650c87fda0",
+   "execution_count": 71,
+   "id": "029334e4-be41-4f93-b95d-67687bd2d70e",
    "metadata": {},
    "outputs": [],
    "source": [
-    "fs = get_fs()"
+    "uza_joined['vmt_quantile'] = pd.qcut(uza_joined.total_mi_auto, 4, labels = ['p25', 'p50', 'p75', 'p100'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "76539b31-f757-4703-9f7a-2eea60834d06",
+   "execution_count": 72,
+   "id": "8f1b132b-8f81-4923-9efe-92f3162ee64c",
    "metadata": {},
    "outputs": [],
    "source": [
-    "_utils.GCS_PATH"
+    "!mkdir -p export"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "1518eea5-d2f1-4807-b392-4a4f2624439d",
+   "execution_count": 75,
+   "id": "ec2cc1b4-002b-498a-b1f0-055f5252fb70",
    "metadata": {},
    "outputs": [],
    "source": [
-    "lpath = 'replica_raw/'"
+    "uza_joined.vmt_quantile = uza_joined.vmt_quantile.astype(str)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "e0048f15-f124-431d-9fae-35aa7ef3dd72",
+   "execution_count": 79,
+   "id": "354a6bea-fbc5-49d7-b531-2597f240510e",
    "metadata": {},
    "outputs": [],
    "source": [
-    "fs.put(lpath, _utils.GCS_PATH + lpath, recursive=True)"
+    "utils.geoparquet_gcs_export(uza_joined, f'{GCS_PATH}outputs/', 'new_trips_with_uza')"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "b27a8e5c-9d03-435c-bd5c-8a5de83a6d88",
+   "cell_type": "code",
+   "execution_count": 77,
+   "id": "30983d03-5c50-4480-85db-ed6666e34bd8",
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_532/1657757500.py:1: UserWarning: Column names longer than 10 characters will be truncated when saved to ESRI Shapefile.\n",
+      "  uza_joined.to_file('./export/vmt_with_quantiles.shp')\n"
+     ]
+    }
+   ],
    "source": [
-    "## Pulling _corridor_ level data\n",
-    "\n",
-    "* First, get corridor geoms"
+    "uza_joined.to_file('./export/vmt_with_quantiles.shp')"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1a5d9b48-6f90-4e85-87e8-aa4c94d01aa3",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {