diff --git a/_shared_utils/shared_utils/publish_utils.py b/_shared_utils/shared_utils/publish_utils.py index 30f6ef8b7..098d29238 100644 --- a/_shared_utils/shared_utils/publish_utils.py +++ b/_shared_utils/shared_utils/publish_utils.py @@ -32,3 +32,18 @@ def write_to_public_gcs( os.remove(local_filename) return + + +def if_exists_then_delete(filepath: str): + """ + Check if file exists in GCS and delete. + For partitioned parquets, which are saved as folders, we need + to use recursive=True. + """ + if fs.exists(filepath): + if fs.isdir(filepath): + fs.rm(filepath, recursive=True) + else: + fs.rm(filepath) + + return diff --git a/traffic_ops/01_observed_flow_speed.ipynb b/traffic_ops/01_observed_flow_speed.ipynb new file mode 100644 index 000000000..8c51ec136 --- /dev/null +++ b/traffic_ops/01_observed_flow_speed.ipynb @@ -0,0 +1,1487 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f207df7a-cc83-43d4-80cd-c1617c495b8f", + "metadata": {}, + "source": [ + "# Flow vs obs flow & speed vs obs speed\n", + "\n", + "Side-by-side comparison of columns that appear to be related.\n", + "\n", + "## What we learned\n", + "* `flow` is always >= `obs_flow`\n", + "* Most of the time, `flow == obs_flow`\n", + "* Why would the imputed value be more than what's observed? Is this only true when `obs_flow == 0`? Yes, it seems imputation mostly happens when `obs_flow == 0`, although there is a small percentage of cases where this isn't true.\n", + "* We'll just use `flow` for now, and use imputed values always?\n", + "* Most cases are `speed < obs_speed` (so the imputed value tends to be less than what the detector says).\n", + "* Looking at descriptives, these are occurring where the observed speed is really high, 100-200 mph!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9426f4a7-c670-4202-87a0-5772797a341f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from utils import PROCESSED_GCS" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9384cf2d-2a89-4eef-88ee-a0c95b88cca4", + "metadata": {}, + "outputs": [], + "source": [ + "def metric_vs_observed(metric: str, filtering: tuple) -> pd.DataFrame:\n", + " PREFIX = \"station_weekday_hour\"\n", + "\n", + " metric_df = pd.read_parquet(\n", + " f\"{PROCESSED_GCS}{PREFIX}_{metric}.parquet/\",\n", + " filters = filtering\n", + " )\n", + "\n", + " obs_metric_df = pd.read_parquet(\n", + " f\"{PROCESSED_GCS}{PREFIX}_obs_{metric}.parquet/\",\n", + " filters = filtering\n", + " )\n", + " \n", + " merge_cols = [\"station_uuid\", \"year\", \"month\", \"weekday\", \"hour\"]\n", + "\n", + " df = pd.merge(\n", + " metric_df,\n", + " obs_metric_df,\n", + " on = merge_cols,\n", + " how = \"outer\"\n", + " )\n", + " \n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4ada8106-a779-4908-9234-218370911c5c", + "metadata": {}, + "outputs": [], + "source": [ + "def lane_comparisons(df: pd.DataFrame, lane_number: int, metric: str):\n", + " print(f\"\\nMetric: {metric}\")\n", + " print(f\"Month-Year: {df.month.iloc[0]}-{df.year.iloc[0]}\")\n", + " print(f\"Weekday: {df.weekday.iloc[0]} Hour: {df.hour.iloc[0]}\")\n", + " print(f\"******* lane number: {lane_number} ********\")\n", + "\n", + " N_ROWS = len(df)\n", + " \n", + " def rounded(numerator, denominator):\n", + " return round(numerator / denominator, 3)\n", + " \n", + " col = f\"lane_{lane_number}_{metric}\"\n", + " obs_col = f\"lane_{lane_number}_obs_{metric}\"\n", + " \n", + " N_EQUAL = df[df[col] == df[obs_col]].shape[0]\n", + " N_MORE = df[df[col] > df[obs_col]].shape[0]\n", + " N_LESS = df[df[col] < df[obs_col]].shape[0]\n", + " \n", + " \n", + " print(f\"# rows: {N_ROWS}\")\n", + " 
print(f\"equal: {N_EQUAL}, imputed > obs: {N_MORE}, imputed < obs: {N_LESS}\")\n", + " print(f\"% equal {rounded(N_EQUAL, N_ROWS)}\")\n", + " print(f\"greater: {rounded(N_MORE, N_ROWS)}, less: {rounded(N_LESS, N_ROWS)}\")\n", + " \n", + " if metric == \"speed\":\n", + " print(\"**** values when imputed < obs ****\")\n", + "\n", + " less_df = df.loc[df[col] < df[obs_col]]\n", + "\n", + " print(less_df[col].describe())\n", + " print(less_df[obs_col].describe())\n", + " \n", + " print(\"****values when imputed > obs *****\")\n", + " \n", + " more_df = df.loc[df[col] > df[obs_col]]\n", + " \n", + " more_df = more_df.assign(\n", + " obs_col_zero = more_df.apply(\n", + " lambda x: True if x[obs_col]==0 \n", + " else False, axis=1)\n", + " )\n", + " \n", + " print(more_df.obs_col_zero.value_counts())\n" + ] + }, + { + "cell_type": "markdown", + "id": "22b2740e-32a9-4600-9248-84a2efb32508", + "metadata": {}, + "source": [ + "## Flow vs observed flow diagnostics" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "433cf0e1-9e6d-44e3-9fd6-3959971d3b67", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 1 ********\n", + "# rows: 7026\n", + "equal: 6873, imputed > obs: 153, imputed < obs: 0\n", + "% equal 0.978\n", + "greater: 0.022, less: 0.0\n", + "****values when imputed > obs *****\n", + "True 108\n", + "False 45\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 2 ********\n", + "# rows: 7026\n", + "equal: 6906, imputed > obs: 120, imputed < obs: 0\n", + "% equal 0.983\n", + "greater: 0.017, less: 0.0\n", + "****values when imputed > obs *****\n", + "True 84\n", + "False 36\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* 
lane number: 3 ********\n", + "# rows: 7026\n", + "equal: 6927, imputed > obs: 99, imputed < obs: 0\n", + "% equal 0.986\n", + "greater: 0.014, less: 0.0\n", + "****values when imputed > obs *****\n", + "True 66\n", + "False 33\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 4 ********\n", + "# rows: 7026\n", + "equal: 6978, imputed > obs: 48, imputed < obs: 0\n", + "% equal 0.993\n", + "greater: 0.007, less: 0.0\n", + "****values when imputed > obs *****\n", + "True 42\n", + "False 6\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 5 ********\n", + "# rows: 7026\n", + "equal: 7002, imputed > obs: 24, imputed < obs: 0\n", + "% equal 0.997\n", + "greater: 0.003, less: 0.0\n", + "****values when imputed > obs *****\n", + "True 24\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 6 ********\n", + "# rows: 7026\n", + "equal: 7023, imputed > obs: 3, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "True 3\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 7 ********\n", + "# rows: 7026\n", + "equal: 7026, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 8 ********\n", + "# rows: 7026\n", + "equal: 7026, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: 
int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2024), (\"month\", \"==\", 6),\n", + " (\"weekday\", \"==\", 2), (\"hour\", \"==\", 18)\n", + "]]\n", + "\n", + "METRIC = \"flow\"\n", + "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, i, METRIC)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "38ec11bd-389a-4e5c-9f62-ede7578f0850", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 1 ********\n", + "# rows: 7884\n", + "equal: 7413, imputed > obs: 471, imputed < obs: 0\n", + "% equal 0.94\n", + "greater: 0.06, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 375\n", + "True 96\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 2 ********\n", + "# rows: 7884\n", + "equal: 7590, imputed > obs: 294, imputed < obs: 0\n", + "% equal 0.963\n", + "greater: 0.037, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 210\n", + "True 84\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 3 ********\n", + "# rows: 7884\n", + "equal: 7620, imputed > obs: 264, imputed < obs: 0\n", + "% equal 0.967\n", + "greater: 0.033, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 174\n", + "True 90\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 4 ********\n", + "# rows: 7884\n", + "equal: 7662, imputed > obs: 222, imputed < obs: 0\n", + "% equal 0.972\n", + "greater: 0.028, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 180\n", + "True 42\n", + "Name: obs_col_zero, dtype: 
int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 5 ********\n", + "# rows: 7884\n", + "equal: 7827, imputed > obs: 57, imputed < obs: 0\n", + "% equal 0.993\n", + "greater: 0.007, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 48\n", + "True 9\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 6 ********\n", + "# rows: 7884\n", + "equal: 7878, imputed > obs: 6, imputed < obs: 0\n", + "% equal 0.999\n", + "greater: 0.001, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 6\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 7 ********\n", + "# rows: 7884\n", + "equal: 7884, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 8 ********\n", + "# rows: 7884\n", + "equal: 7884, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2024), (\"month\", \"==\", 3),\n", + " (\"weekday\", \"==\", 4), (\"hour\", \"==\", 10)\n", + "]]\n", + "\n", + "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, i, METRIC)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9a183a7c-6c92-49fd-83ac-f8027a6253d4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + 
"******* lane number: 1 ********\n", + "# rows: 7056\n", + "equal: 6552, imputed > obs: 504, imputed < obs: 0\n", + "% equal 0.929\n", + "greater: 0.071, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 402\n", + "True 102\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 2 ********\n", + "# rows: 7056\n", + "equal: 6723, imputed > obs: 333, imputed < obs: 0\n", + "% equal 0.953\n", + "greater: 0.047, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 258\n", + "True 75\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 3 ********\n", + "# rows: 7056\n", + "equal: 6759, imputed > obs: 297, imputed < obs: 0\n", + "% equal 0.958\n", + "greater: 0.042, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 204\n", + "True 93\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 4 ********\n", + "# rows: 7056\n", + "equal: 6858, imputed > obs: 198, imputed < obs: 0\n", + "% equal 0.972\n", + "greater: 0.028, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 153\n", + "True 45\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 5 ********\n", + "# rows: 7056\n", + "equal: 6999, imputed > obs: 57, imputed < obs: 0\n", + "% equal 0.992\n", + "greater: 0.008, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 51\n", + "True 6\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 6 ********\n", + "# rows: 7056\n", + "equal: 7044, imputed > obs: 12, imputed < obs: 0\n", + "% equal 0.998\n", + "greater: 0.002, less: 0.0\n", + 
"****values when imputed > obs *****\n", + "False 12\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 7 ********\n", + "# rows: 7056\n", + "equal: 7056, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 8 ********\n", + "# rows: 7056\n", + "equal: 7056, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2023), (\"month\", \"==\", 7),\n", + " (\"weekday\", \"==\", 2), (\"hour\", \"==\", 10)\n", + "]]\n", + "\n", + "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, i, METRIC)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b9c1243b-d340-42a0-9f6c-52dd3cf2595b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 1 ********\n", + "# rows: 7761\n", + "equal: 6894, imputed > obs: 867, imputed < obs: 0\n", + "% equal 0.888\n", + "greater: 0.112, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 762\n", + "True 105\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 2 ********\n", + "# rows: 7761\n", + "equal: 7245, imputed > obs: 516, imputed < obs: 0\n", + "% equal 0.934\n", + "greater: 0.066, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 423\n", + "True 93\n", + "Name: 
obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 3 ********\n", + "# rows: 7761\n", + "equal: 7305, imputed > obs: 456, imputed < obs: 0\n", + "% equal 0.941\n", + "greater: 0.059, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 381\n", + "True 75\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 4 ********\n", + "# rows: 7761\n", + "equal: 7425, imputed > obs: 336, imputed < obs: 0\n", + "% equal 0.957\n", + "greater: 0.043, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 288\n", + "True 48\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 5 ********\n", + "# rows: 7761\n", + "equal: 7635, imputed > obs: 126, imputed < obs: 0\n", + "% equal 0.984\n", + "greater: 0.016, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 102\n", + "True 24\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 6 ********\n", + "# rows: 7761\n", + "equal: 7728, imputed > obs: 33, imputed < obs: 0\n", + "% equal 0.996\n", + "greater: 0.004, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 30\n", + "True 3\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 7 ********\n", + "# rows: 7761\n", + "equal: 7761, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 8 ********\n", + "# rows: 7761\n", + "equal: 7761, imputed > 
obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2023), (\"month\", \"==\", 9),\n", + " (\"weekday\", \"==\", 1), (\"hour\", \"==\", 14)\n", + "]]\n", + "\n", + "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, i, METRIC)" + ] + }, + { + "cell_type": "markdown", + "id": "b1fb9c85-6a4a-4b60-af3f-17f465087111", + "metadata": {}, + "source": [ + "## Speed vs observed speed diagnostics" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9c3b7997-8906-4388-bd7a-1d68b884998a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 1 ********\n", + "# rows: 2606\n", + "equal: 70, imputed > obs: 43, imputed < obs: 2479\n", + "% equal 0.027\n", + "greater: 0.017, less: 0.951\n", + "**** values when imputed < obs ****\n", + "count 2479.0\n", + "mean 57.44674\n", + "std 14.862704\n", + "min 5.42\n", + "25% 50.2975\n", + "50% 62.78\n", + "75% 67.87\n", + "max 79.48\n", + "Name: lane_1_speed, dtype: Float64\n", + "count 2479.0\n", + "mean 266.002098\n", + "std 84.267447\n", + "min 22.6\n", + "25% 210.65\n", + "50% 288.6\n", + "75% 326.1\n", + "max 397.4\n", + "Name: lane_1_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 33\n", + "False 10\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 2 ********\n", + "# rows: 2606\n", + "equal: 55, imputed > obs: 35, imputed < obs: 1631\n", + "% equal 0.021\n", + "greater: 0.013, less: 0.626\n", + "**** values when imputed < obs ****\n", + "count 1631.0\n", + "mean 54.435811\n", + "std 15.345102\n", 
+ "min 6.06\n", + "25% 46.32\n", + "50% 60.075\n", + "75% 65.6675\n", + "max 73.966667\n", + "Name: lane_2_speed, dtype: Float64\n", + "count 1631.0\n", + "mean 251.42992\n", + "std 84.908873\n", + "min 21.2\n", + "25% 194.1\n", + "50% 275.3\n", + "75% 322.5\n", + "max 369.1\n", + "Name: lane_2_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 28\n", + "False 7\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 3 ********\n", + "# rows: 2606\n", + "equal: 51, imputed > obs: 35, imputed < obs: 1471\n", + "% equal 0.02\n", + "greater: 0.013, less: 0.564\n", + "**** values when imputed < obs ****\n", + "count 1471.0\n", + "mean 49.851535\n", + "std 14.697226\n", + "min 5.38\n", + "25% 42.41\n", + "50% 53.3\n", + "75% 61.42\n", + "max 77.825\n", + "Name: lane_3_speed, dtype: Float64\n", + "count 1471.0\n", + "mean 230.203739\n", + "std 79.750198\n", + "min 11.1\n", + "25% 178.3\n", + "50% 247.0\n", + "75% 295.75\n", + "max 360.3\n", + "Name: lane_3_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 28\n", + "False 7\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 4 ********\n", + "# rows: 2606\n", + "equal: 46, imputed > obs: 20, imputed < obs: 1131\n", + "% equal 0.018\n", + "greater: 0.008, less: 0.434\n", + "**** values when imputed < obs ****\n", + "count 1131.0\n", + "mean 47.919651\n", + "std 14.27912\n", + "min 4.54\n", + "25% 40.35\n", + "50% 51.42\n", + "75% 58.65\n", + "max 76.54\n", + "Name: lane_4_speed, dtype: Float64\n", + "count 1131.0\n", + "mean 218.736163\n", + "std 78.207275\n", + "min 11.3\n", + "25% 164.25\n", + "50% 234.4\n", + "75% 280.25\n", + "max 382.7\n", + "Name: lane_4_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 14\n", + "False 6\n", + "Name: 
obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 5 ********\n", + "# rows: 2606\n", + "equal: 11, imputed > obs: 9, imputed < obs: 373\n", + "% equal 0.004\n", + "greater: 0.003, less: 0.143\n", + "**** values when imputed < obs ****\n", + "count 373.0\n", + "mean 50.034495\n", + "std 13.540193\n", + "min 4.94\n", + "25% 43.45\n", + "50% 54.24\n", + "75% 60.333333\n", + "max 65.3\n", + "Name: lane_5_speed, dtype: Float64\n", + "count 373.0\n", + "mean 226.784718\n", + "std 77.696666\n", + "min 9.6\n", + "25% 172.8\n", + "50% 248.1\n", + "75% 289.1\n", + "max 324.8\n", + "Name: lane_5_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 6\n", + "False 3\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 6 ********\n", + "# rows: 2606\n", + "equal: 2, imputed > obs: 1, imputed < obs: 70\n", + "% equal 0.001\n", + "greater: 0.0, less: 0.027\n", + "**** values when imputed < obs ****\n", + "count 70.0\n", + "mean 49.241571\n", + "std 16.127912\n", + "min 5.86\n", + "25% 40.16\n", + "50% 55.28\n", + "75% 62.39\n", + "max 65.04\n", + "Name: lane_6_speed, dtype: Float64\n", + "count 70.0\n", + "mean 233.05\n", + "std 85.957105\n", + "min 17.7\n", + "25% 160.45\n", + "50% 265.65\n", + "75% 310.8\n", + "max 325.2\n", + "Name: lane_6_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 1\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 7 ********\n", + "# rows: 2606\n", + "equal: 1, imputed > obs: 0, imputed < obs: 1\n", + "% equal 0.0\n", + "greater: 0.0, less: 0.0\n", + "**** values when imputed < obs ****\n", + "count 1.0\n", + "mean 10.26\n", + "std \n", + "min 10.26\n", + "25% 10.26\n", + "50% 10.26\n", + "75% 10.26\n", + "max 
10.26\n", + "Name: lane_7_speed, dtype: Float64\n", + "count 1.0\n", + "mean 38.6\n", + "std \n", + "min 38.6\n", + "25% 38.6\n", + "50% 38.6\n", + "75% 38.6\n", + "max 38.6\n", + "Name: lane_7_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 8 ********\n", + "# rows: 2606\n", + "equal: 0, imputed > obs: 0, imputed < obs: 0\n", + "% equal 0.0\n", + "greater: 0.0, less: 0.0\n", + "**** values when imputed < obs ****\n", + "count 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max \n", + "Name: lane_8_speed, dtype: Float64\n", + "count 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max \n", + "Name: lane_8_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2024), (\"month\", \"==\", 5),\n", + " (\"weekday\", \"==\", 2), (\"hour\", \"==\", 8)\n", + "]]\n", + "\n", + "METRIC = \"speed\"\n", + "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, i, METRIC)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2c7a4061-b791-422a-97c1-18eae73784b8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 1 ********\n", + "# rows: 2311\n", + "equal: 98, imputed > obs: 31, imputed < obs: 2169\n", + "% equal 0.042\n", + "greater: 0.013, less: 0.939\n", + "**** values when imputed < obs ****\n", + "count 2169.0\n", + "mean 69.474558\n", + "std 5.819333\n", + "min 29.675\n", + "25% 64.85\n", + "50% 71.4\n", + "75% 74.325\n", + "max 83.475\n", + "Name: lane_1_speed, dtype: 
Float64\n", + "count 2169.0\n", + "mean 260.652006\n", + "std 47.678904\n", + "min 62.9\n", + "25% 255.3\n", + "50% 278.3\n", + "75% 296.5\n", + "max 333.9\n", + "Name: lane_1_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 26\n", + "False 5\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 2 ********\n", + "# rows: 2311\n", + "equal: 60, imputed > obs: 27, imputed < obs: 1459\n", + "% equal 0.026\n", + "greater: 0.012, less: 0.631\n", + "**** values when imputed < obs ****\n", + "count 1459.0\n", + "mean 68.689011\n", + "std 4.427647\n", + "min 23.45\n", + "25% 67.6375\n", + "50% 69.55\n", + "75% 70.75\n", + "max 76.95\n", + "Name: lane_2_speed, dtype: Float64\n", + "count 1459.0\n", + "mean 258.858259\n", + "std 44.162084\n", + "min 53.7\n", + "25% 259.1\n", + "50% 276.6\n", + "75% 282.7\n", + "max 307.8\n", + "Name: lane_2_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 24\n", + "False 3\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 3 ********\n", + "# rows: 2311\n", + "equal: 51, imputed > obs: 31, imputed < obs: 1356\n", + "% equal 0.022\n", + "greater: 0.013, less: 0.587\n", + "**** values when imputed < obs ****\n", + "count 1356.0\n", + "mean 63.441734\n", + "std 6.004107\n", + "min 40.7\n", + "25% 60.825\n", + "50% 64.783333\n", + "75% 67.175\n", + "max 81.2\n", + "Name: lane_3_speed, dtype: Float64\n", + "count 1356.0\n", + "mean 238.662021\n", + "std 44.415839\n", + "min 56.8\n", + "25% 220.425\n", + "50% 251.0\n", + "75% 267.5\n", + "max 321.7\n", + "Name: lane_3_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 28\n", + "False 3\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + 
"******* lane number: 4 ********\n", + "# rows: 2311\n", + "equal: 46, imputed > obs: 13, imputed < obs: 1047\n", + "% equal 0.02\n", + "greater: 0.006, less: 0.453\n", + "**** values when imputed < obs ****\n", + "count 1047.0\n", + "mean 61.107649\n", + "std 5.62594\n", + "min 34.925\n", + "25% 59.4\n", + "50% 61.825\n", + "75% 63.8625\n", + "max 72.05\n", + "Name: lane_4_speed, dtype: Float64\n", + "count 1047.0\n", + "mean 229.072015\n", + "std 42.75337\n", + "min 60.0\n", + "25% 211.4\n", + "50% 245.4\n", + "75% 251.1\n", + "max 288.2\n", + "Name: lane_4_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 10\n", + "False 3\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 5 ********\n", + "# rows: 2311\n", + "equal: 17, imputed > obs: 2, imputed < obs: 323\n", + "% equal 0.007\n", + "greater: 0.001, less: 0.14\n", + "**** values when imputed < obs ****\n", + "count 323.0\n", + "mean 62.318989\n", + "std 3.415836\n", + "min 42.175\n", + "25% 61.9375\n", + "50% 63.475\n", + "75% 64.2125\n", + "max 71.125\n", + "Name: lane_5_speed, dtype: Float64\n", + "count 323.0\n", + "mean 236.016099\n", + "std 38.740027\n", + "min 63.2\n", + "25% 239.3\n", + "50% 253.0\n", + "75% 256.7\n", + "max 284.5\n", + "Name: lane_5_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 2\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 6 ********\n", + "# rows: 2311\n", + "equal: 6, imputed > obs: 0, imputed < obs: 58\n", + "% equal 0.003\n", + "greater: 0.0, less: 0.025\n", + "**** values when imputed < obs ****\n", + "count 58.0\n", + "mean 63.066667\n", + "std 1.775147\n", + "min 55.35\n", + "25% 62.8625\n", + "50% 63.65\n", + "75% 64.19375\n", + "max 64.7\n", + "Name: lane_6_speed, dtype: Float64\n", + "count 58.0\n", + "mean 
231.208621\n", + "std 50.683345\n", + "min 63.1\n", + "25% 242.7\n", + "50% 253.85\n", + "75% 256.675\n", + "max 258.8\n", + "Name: lane_6_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 7 ********\n", + "# rows: 2311\n", + "equal: 0, imputed > obs: 0, imputed < obs: 2\n", + "% equal 0.0\n", + "greater: 0.0, less: 0.001\n", + "**** values when imputed < obs ****\n", + "count 2.0\n", + "mean 62.366667\n", + "std 1.791337\n", + "min 61.1\n", + "25% 61.733333\n", + "50% 62.366667\n", + "75% 63.0\n", + "max 63.633333\n", + "Name: lane_7_speed, dtype: Float64\n", + "count 2.0\n", + "mean 217.65\n", + "std 37.830213\n", + "min 190.9\n", + "25% 204.275\n", + "50% 217.65\n", + "75% 231.025\n", + "max 244.4\n", + "Name: lane_7_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 8 ********\n", + "# rows: 2311\n", + "equal: 0, imputed > obs: 0, imputed < obs: 0\n", + "% equal 0.0\n", + "greater: 0.0, less: 0.0\n", + "**** values when imputed < obs ****\n", + "count 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max \n", + "Name: lane_8_speed, dtype: Float64\n", + "count 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max \n", + "Name: lane_8_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2023), (\"month\", \"==\", 8),\n", + " (\"weekday\", \"==\", 5), (\"hour\", \"==\", 7)\n", + "]]\n", + "\n", + "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, 
i, METRIC)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8473d5a9-11ed-4e43-b553-1464c7a4aa24", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 1 ********\n", + "# rows: 2604\n", + "equal: 89, imputed > obs: 67, imputed < obs: 2430\n", + "% equal 0.034\n", + "greater: 0.026, less: 0.933\n", + "**** values when imputed < obs ****\n", + "count 2430.0\n", + "mean 64.361485\n", + "std 9.077721\n", + "min 17.875\n", + "25% 60.73125\n", + "50% 65.0\n", + "75% 71.075\n", + "max 78.9\n", + "Name: lane_1_speed, dtype: Float64\n", + "count 2430.0\n", + "mean 238.871605\n", + "std 55.517956\n", + "min 51.3\n", + "25% 212.4\n", + "50% 258.0\n", + "75% 280.4\n", + "max 315.6\n", + "Name: lane_1_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 49\n", + "False 18\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 2 ********\n", + "# rows: 2604\n", + "equal: 67, imputed > obs: 40, imputed < obs: 1622\n", + "% equal 0.026\n", + "greater: 0.015, less: 0.623\n", + "**** values when imputed < obs ****\n", + "count 1622.0\n", + "mean 61.670176\n", + "std 8.343252\n", + "min 17.9\n", + "25% 58.089583\n", + "50% 63.875\n", + "75% 67.25\n", + "max 74.375\n", + "Name: lane_2_speed, dtype: Float64\n", + "count 1622.0\n", + "mean 228.833847\n", + "std 51.732604\n", + "min 54.2\n", + "25% 204.05\n", + "50% 247.35\n", + "75% 266.3\n", + "max 297.5\n", + "Name: lane_2_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 30\n", + "False 10\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 3 ********\n", + "# rows: 2604\n", + "equal: 64, imputed > obs: 41, imputed 
< obs: 1483\n", + "% equal 0.025\n", + "greater: 0.016, less: 0.57\n", + "**** values when imputed < obs ****\n", + "count 1483.0\n", + "mean 55.931468\n", + "std 8.478252\n", + "min 15.35\n", + "25% 51.7\n", + "50% 57.0\n", + "75% 61.9\n", + "max 77.175\n", + "Name: lane_3_speed, dtype: Float64\n", + "count 1483.0\n", + "mean 207.672218\n", + "std 48.982583\n", + "min 45.9\n", + "25% 186.7\n", + "50% 218.9\n", + "75% 243.25\n", + "max 308.7\n", + "Name: lane_3_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 26\n", + "False 15\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 4 ********\n", + "# rows: 2604\n", + "equal: 58, imputed > obs: 27, imputed < obs: 1137\n", + "% equal 0.022\n", + "greater: 0.01, less: 0.437\n", + "**** values when imputed < obs ****\n", + "count 1137.0\n", + "mean 54.160195\n", + "std 7.923261\n", + "min 17.425\n", + "25% 50.575\n", + "50% 55.0\n", + "75% 59.475\n", + "max 76.75\n", + "Name: lane_4_speed, dtype: Float64\n", + "count 1137.0\n", + "mean 200.304222\n", + "std 46.375729\n", + "min 53.3\n", + "25% 180.8\n", + "50% 212.2\n", + "75% 231.1\n", + "max 307.0\n", + "Name: lane_4_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 17\n", + "False 10\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 5 ********\n", + "# rows: 2604\n", + "equal: 17, imputed > obs: 7, imputed < obs: 378\n", + "% equal 0.007\n", + "greater: 0.003, less: 0.145\n", + "**** values when imputed < obs ****\n", + "count 378.0\n", + "mean 56.131768\n", + "std 7.750725\n", + "min 19.7\n", + "25% 52.09375\n", + "50% 57.0375\n", + "75% 63.16875\n", + "max 64.875\n", + "Name: lane_5_speed, dtype: Float64\n", + "count 378.0\n", + "mean 205.230159\n", + "std 50.357375\n", + "min 41.7\n", + "25% 185.575\n", + 
"50% 220.0\n", + "75% 246.5\n", + "max 259.5\n", + "Name: lane_5_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 6\n", + "False 1\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 6 ********\n", + "# rows: 2604\n", + "equal: 4, imputed > obs: 2, imputed < obs: 74\n", + "% equal 0.002\n", + "greater: 0.001, less: 0.028\n", + "**** values when imputed < obs ****\n", + "count 74.0\n", + "mean 58.568018\n", + "std 6.357801\n", + "min 38.35\n", + "25% 55.93125\n", + "50% 60.8875\n", + "75% 63.41875\n", + "max 66.25\n", + "Name: lane_6_speed, dtype: Float64\n", + "count 74.0\n", + "mean 207.535135\n", + "std 54.252586\n", + "min 64.5\n", + "25% 186.675\n", + "50% 225.9\n", + "75% 253.175\n", + "max 257.8\n", + "Name: lane_6_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 2\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 7 ********\n", + "# rows: 2604\n", + "equal: 1, imputed > obs: 0, imputed < obs: 1\n", + "% equal 0.0\n", + "greater: 0.0, less: 0.0\n", + "**** values when imputed < obs ****\n", + "count 1.0\n", + "mean 63.5\n", + "std \n", + "min 63.5\n", + "25% 63.5\n", + "50% 63.5\n", + "75% 63.5\n", + "max 63.5\n", + "Name: lane_7_speed, dtype: Float64\n", + "count 1.0\n", + "mean 190.5\n", + "std \n", + "min 190.5\n", + "25% 190.5\n", + "50% 190.5\n", + "75% 190.5\n", + "max 190.5\n", + "Name: lane_7_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 8 ********\n", + "# rows: 2604\n", + "equal: 0, imputed > obs: 0, imputed < obs: 0\n", + "% equal 0.0\n", + "greater: 0.0, less: 0.0\n", + "**** values when imputed < obs 
****\n", + "count 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max \n", + "Name: lane_8_speed, dtype: Float64\n", + "count 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max \n", + "Name: lane_8_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2023), (\"month\", \"==\", 10),\n", + " (\"weekday\", \"==\", 3), (\"hour\", \"==\", 13)\n", + "]]\n", + "\n", + "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, i, METRIC)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8841dd2a-2d63-476d-8723-115f62dc526e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/traffic_ops/aggregate.py b/traffic_ops/aggregate.py index 06b862755..bdcdac00d 100644 --- a/traffic_ops/aggregate.py +++ b/traffic_ops/aggregate.py @@ -14,6 +14,7 @@ import utils from utils import RAW_GCS, PROCESSED_GCS from crosswalks import station_id_cols +from shared_utils import publish_utils fs = gcsfs.GCSFileSystem() @@ -51,38 +52,25 @@ def aggregate_metric( group_cols: list, metric_name: Literal["flow", "truck_flow", "occ", "obs", "speed"] ) -> pd.DataFrame: - + """ + Aggregate metric (mean preferred for now) + against a list of grouping columns. 
def aggregate_metric(
    df: pd.DataFrame,
    group_cols: list,
    metric_name: Literal["flow", "truck_flow", "occ", "obs", "speed"]
) -> pd.DataFrame:
    """
    Aggregate metric (mean preferred for now)
    against a list of grouping columns.

    Every column whose name contains `metric_name` is averaged within
    each combination of `group_cols`. The grouping columns come back as
    regular columns (not the index), and the averaged columns are cast
    to nullable Float64.
    """
    metric_cols = [c for c in df.columns if metric_name in c]

    df2 = (
        df
        .groupby(group_cols, group_keys=False)
        .agg({c: "mean" for c in metric_cols})
        .reset_index()
        # since everything is mean, use floats, but allow NaNs.
        # astype needs a flat {column: dtype} mapping; wrapping the
        # comprehension in a second pair of braces builds a set that
        # holds a dict and raises TypeError (unhashable type: 'dict').
        .astype({c: "Float64" for c in metric_cols})
    )

    return df2
def process_one_metric(
    metric_name: Literal["flow", "truck_flow", "occ", "obs", "speed"],
    group_cols: list,
    export_filename: str
):
    """
    Run one metric through prep, aggregation, and export at one grain.

    Steps, in order: prep the partitions for `metric_name`, wrap the
    aggregation of each partition against `group_cols` in `delayed`,
    remove any earlier export at
    `{PROCESSED_GCS}{export_filename}_{metric_name}.parquet`,
    then hand the delayed aggregations to `compute_and_export`.
    Prints the elapsed time when done.
    """
    started = datetime.datetime.now()

    prepped_parts = metric_prep(metric_name)

    delayed_aggregations = [
        delayed(aggregate_metric)(part, group_cols, metric_name)
        for part in prepped_parts
    ]

    # Clear out the previous export first
    # (handles both single files and partitioned folders)
    previous_export = f"{PROCESSED_GCS}{export_filename}_{metric_name}.parquet"
    publish_utils.if_exists_then_delete(previous_export)

    compute_and_export(metric_name, delayed_aggregations, export_filename)

    finished = datetime.datetime.now()
    print(f"{metric_name} exported {export_filename}: {finished - started}")
"station_weekday_hour", - partition_cols = ["weekday", "hour"] - ) + GRAINS = { + "station_weekday_hour": station_cols + ["year", "month", "weekday", "hour"], + "station_weekday_peak": station_cols + ["year", "month", "weekday", "peak_offpeak"], + "station_daytype_hour": station_cols + ["hour", "daytype"] + } - time1 = datetime.datetime.now() - print(f"{metric} hourly aggregation: {time1 - time0}") - - peak_dfs = [ - delayed(aggregate_metric)(i, station_cols + weekday_peak_cols, metric) - for i in time_dfs - ] + for metric in metric_list: - compute_and_export( - metric, - peak_dfs, - "station_weekday_peak", - partition_cols = ["weekday", "peak_offpeak"] - ) + for export_filename, grain_cols in GRAINS.items(): + + process_one_metric(metric, grain_cols, export_filename) - time2 = datetime.datetime.now() - print(f"{metric} peak/offpeak aggregation: {time2 - time1}") - print(f"{metric} aggregation: {time2 - time0}") - detector_df = import_detector_status() - detector_station_hour = aggregate_detector_samples( - detector_df, - station_cols + weekday_hour_cols - ) - - detector_station_hour.to_parquet( - f"{PROCESSED_GCS}station_weekday_hour_detectors.parquet" - ) - - detector_station_weekday = aggregate_detector_samples( - detector_df, - station_cols + weekday_peak_cols - ) - - detector_station_weekday.to_parquet( - f"{PROCESSED_GCS}station_weekday_peak_detectors.parquet" - ) + for export_filename, grain_cols in GRAINS.items(): + + agg_df = aggregate_detector_samples( + detector_df, + grain_cols + ) + + agg_df.to_parquet( + f"{PROCESSED_GCS}{export_filename}_detectors.parquet" + ) end = datetime.datetime.now() print(f"execution time: {end - start}") \ No newline at end of file diff --git a/traffic_ops/utils.py b/traffic_ops/utils.py index 6dee8e87c..7c6f1e451 100644 --- a/traffic_ops/utils.py +++ b/traffic_ops/utils.py @@ -13,6 +13,10 @@ def parse_for_time_components( df: pd.DataFrame, time_col: str = "time_id" ) -> pd.DataFrame: + """ + Parse the time_id column into 
def add_weekday_weekend_column(
    df: pd.DataFrame,
    weekday_col: str = "weekday"
) -> pd.DataFrame:
    """
    Add a `daytype` column labeling each row as weekday or weekend.

    Values 0-4 in `weekday_col` become "weekday" and 5-6 become
    "weekend"; any other value has no entry in the lookup and maps
    to NaN. Returns a new dataframe and leaves the input untouched.
    """
    FIRST_WEEKEND_DAY = 5

    daytype_lookup = {
        day: "weekend" if day >= FIRST_WEEKEND_DAY else "weekday"
        for day in range(7)
    }

    daytype_values = df[weekday_col].map(daytype_lookup)

    return df.assign(daytype=daytype_values)