From a7d93ead0773b33e6dc3e047ae0673a9443e1f92 Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Wed, 14 Aug 2024 17:17:36 +0000 Subject: [PATCH 1/2] add helpers.if_file_exists to publish_utils --- _shared_utils/shared_utils/publish_utils.py | 15 + traffic_ops/01_observed_flow_speed.ipynb | 1487 +++++++++++++++++++ traffic_ops/aggregate.py | 161 +- traffic_ops/utils.py | 27 +- 4 files changed, 1610 insertions(+), 80 deletions(-) create mode 100644 traffic_ops/01_observed_flow_speed.ipynb diff --git a/_shared_utils/shared_utils/publish_utils.py b/_shared_utils/shared_utils/publish_utils.py index 30f6ef8b76..098d29238b 100644 --- a/_shared_utils/shared_utils/publish_utils.py +++ b/_shared_utils/shared_utils/publish_utils.py @@ -32,3 +32,18 @@ def write_to_public_gcs( os.remove(local_filename) return + + +def if_exists_then_delete(filepath: str): + """ + Check if file exists in GCS and delete. + For partitioned parquets, which are saved as folders, we need + to use recursive=True. + """ + if fs.exists(filepath): + if fs.isdir(filepath): + fs.rm(filepath, recursive=True) + else: + fs.rm(filepath) + + return diff --git a/traffic_ops/01_observed_flow_speed.ipynb b/traffic_ops/01_observed_flow_speed.ipynb new file mode 100644 index 0000000000..8c51ec1361 --- /dev/null +++ b/traffic_ops/01_observed_flow_speed.ipynb @@ -0,0 +1,1487 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f207df7a-cc83-43d4-80cd-c1617c495b8f", + "metadata": {}, + "source": [ + "# Flow vs obs flow & speed vs obs speed\n", + "\n", + "Side-by-side comparison of what seems to be related columns.\n", + "\n", + "## What we learned\n", + "* `flow` is always >= `obs_flow`\n", + "* Most of the time, `flow = obs_flow`\n", + "* Why would imputation be more than what's observed? Is this only true when `flow == 0`? 
Yes, seems to be imputation happens when it's majority `obs_flow==0`, although there are a small percentage of cases where this isn't true.\n", + "* We'll just use `flow` for now, and use imputed values always?\n", + "* Most cases are `speed < obs_speed` (so imputed tends to be less than what detector says).\n", + "* Looking at descriptives, these are occurring where observed speed is really high, 100-200 mph!" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9426f4a7-c670-4202-87a0-5772797a341f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from utils import PROCESSED_GCS" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9384cf2d-2a89-4eef-88ee-a0c95b88cca4", + "metadata": {}, + "outputs": [], + "source": [ + "def metric_vs_observed(metric: str, filtering: tuple) -> pd.DataFrame:\n", + " PREFIX = \"station_weekday_hour\"\n", + "\n", + " metric_df = pd.read_parquet(\n", + " f\"{PROCESSED_GCS}{PREFIX}_{metric}.parquet/\",\n", + " filters = filtering\n", + " )\n", + "\n", + " obs_metric_df = pd.read_parquet(\n", + " f\"{PROCESSED_GCS}{PREFIX}_obs_{metric}.parquet/\",\n", + " filters = filtering\n", + " )\n", + " \n", + " merge_cols = [\"station_uuid\", \"year\", \"month\", \"weekday\", \"hour\"]\n", + "\n", + " df = pd.merge(\n", + " metric_df,\n", + " obs_metric_df,\n", + " on = merge_cols,\n", + " how = \"outer\"\n", + " )\n", + " \n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4ada8106-a779-4908-9234-218370911c5c", + "metadata": {}, + "outputs": [], + "source": [ + "def lane_comparisons(df: pd.DataFrame, lane_number: int, metric: str):\n", + " print(f\"\\nMetric: {metric}\")\n", + " print(f\"Month-Year: {df.month.iloc[0]}-{df.year.iloc[0]}\")\n", + " print(f\"Weekday: {df.weekday.iloc[0]} Hour: {df.hour.iloc[0]}\")\n", + " print(f\"******* lane number: {lane_number} ********\")\n", + "\n", + " N_ROWS = len(df)\n", + " \n", + " def 
rounded(numerator, denominator):\n", + " return round(numerator / denominator, 3)\n", + " \n", + " col = f\"lane_{lane_number}_{metric}\"\n", + " obs_col = f\"lane_{lane_number}_obs_{metric}\"\n", + " \n", + " N_EQUAL = df[df[col] == df[obs_col]].shape[0]\n", + " N_MORE = df[df[col] > df[obs_col]].shape[0]\n", + " N_LESS = df[df[col] < df[obs_col]].shape[0]\n", + " \n", + " \n", + " print(f\"# rows: {N_ROWS}\")\n", + " print(f\"equal: {N_EQUAL}, imputed > obs: {N_MORE}, imputed < obs: {N_LESS}\")\n", + " print(f\"% equal {rounded(N_EQUAL, N_ROWS)}\")\n", + " print(f\"greater: {rounded(N_MORE, N_ROWS)}, less: {rounded(N_LESS, N_ROWS)}\")\n", + " \n", + " if metric == \"speed\":\n", + " print(\"**** values when imputed < obs ****\")\n", + "\n", + " less_df = df.loc[df[col] < df[obs_col]]\n", + "\n", + " print(less_df[col].describe())\n", + " print(less_df[obs_col].describe())\n", + " \n", + " print(\"****values when imputed > obs *****\")\n", + " \n", + " more_df = df.loc[df[col] > df[obs_col]]\n", + " \n", + " more_df = more_df.assign(\n", + " obs_col_zero = more_df.apply(\n", + " lambda x: True if x[obs_col]==0 \n", + " else False, axis=1)\n", + " )\n", + " \n", + " print(more_df.obs_col_zero.value_counts())\n" + ] + }, + { + "cell_type": "markdown", + "id": "22b2740e-32a9-4600-9248-84a2efb32508", + "metadata": {}, + "source": [ + "## Flow vs observed flow diagnostics" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "433cf0e1-9e6d-44e3-9fd6-3959971d3b67", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 1 ********\n", + "# rows: 7026\n", + "equal: 6873, imputed > obs: 153, imputed < obs: 0\n", + "% equal 0.978\n", + "greater: 0.022, less: 0.0\n", + "****values when imputed > obs *****\n", + "True 108\n", + "False 45\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + 
"Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 2 ********\n", + "# rows: 7026\n", + "equal: 6906, imputed > obs: 120, imputed < obs: 0\n", + "% equal 0.983\n", + "greater: 0.017, less: 0.0\n", + "****values when imputed > obs *****\n", + "True 84\n", + "False 36\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 3 ********\n", + "# rows: 7026\n", + "equal: 6927, imputed > obs: 99, imputed < obs: 0\n", + "% equal 0.986\n", + "greater: 0.014, less: 0.0\n", + "****values when imputed > obs *****\n", + "True 66\n", + "False 33\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 4 ********\n", + "# rows: 7026\n", + "equal: 6978, imputed > obs: 48, imputed < obs: 0\n", + "% equal 0.993\n", + "greater: 0.007, less: 0.0\n", + "****values when imputed > obs *****\n", + "True 42\n", + "False 6\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 5 ********\n", + "# rows: 7026\n", + "equal: 7002, imputed > obs: 24, imputed < obs: 0\n", + "% equal 0.997\n", + "greater: 0.003, less: 0.0\n", + "****values when imputed > obs *****\n", + "True 24\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 6 ********\n", + "# rows: 7026\n", + "equal: 7023, imputed > obs: 3, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "True 3\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 7 ********\n", + "# rows: 7026\n", + "equal: 7026, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + 
"****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: flow\n", + "Month-Year: 6-2024\n", + "Weekday: 2 Hour: 18\n", + "******* lane number: 8 ********\n", + "# rows: 7026\n", + "equal: 7026, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2024), (\"month\", \"==\", 6),\n", + " (\"weekday\", \"==\", 2), (\"hour\", \"==\", 18)\n", + "]]\n", + "\n", + "METRIC = \"flow\"\n", + "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, i, METRIC)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "38ec11bd-389a-4e5c-9f62-ede7578f0850", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 1 ********\n", + "# rows: 7884\n", + "equal: 7413, imputed > obs: 471, imputed < obs: 0\n", + "% equal 0.94\n", + "greater: 0.06, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 375\n", + "True 96\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 2 ********\n", + "# rows: 7884\n", + "equal: 7590, imputed > obs: 294, imputed < obs: 0\n", + "% equal 0.963\n", + "greater: 0.037, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 210\n", + "True 84\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 3 ********\n", + "# rows: 7884\n", + "equal: 7620, imputed > obs: 264, imputed < obs: 0\n", + "% equal 0.967\n", + "greater: 0.033, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 
174\n", + "True 90\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 4 ********\n", + "# rows: 7884\n", + "equal: 7662, imputed > obs: 222, imputed < obs: 0\n", + "% equal 0.972\n", + "greater: 0.028, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 180\n", + "True 42\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 5 ********\n", + "# rows: 7884\n", + "equal: 7827, imputed > obs: 57, imputed < obs: 0\n", + "% equal 0.993\n", + "greater: 0.007, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 48\n", + "True 9\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 6 ********\n", + "# rows: 7884\n", + "equal: 7878, imputed > obs: 6, imputed < obs: 0\n", + "% equal 0.999\n", + "greater: 0.001, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 6\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 7 ********\n", + "# rows: 7884\n", + "equal: 7884, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: flow\n", + "Month-Year: 3-2024\n", + "Weekday: 4 Hour: 10\n", + "******* lane number: 8 ********\n", + "# rows: 7884\n", + "equal: 7884, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2024), (\"month\", \"==\", 3),\n", + " (\"weekday\", \"==\", 4), (\"hour\", \"==\", 10)\n", + "]]\n", + "\n", 
+ "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, i, METRIC)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9a183a7c-6c92-49fd-83ac-f8027a6253d4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 1 ********\n", + "# rows: 7056\n", + "equal: 6552, imputed > obs: 504, imputed < obs: 0\n", + "% equal 0.929\n", + "greater: 0.071, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 402\n", + "True 102\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 2 ********\n", + "# rows: 7056\n", + "equal: 6723, imputed > obs: 333, imputed < obs: 0\n", + "% equal 0.953\n", + "greater: 0.047, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 258\n", + "True 75\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 3 ********\n", + "# rows: 7056\n", + "equal: 6759, imputed > obs: 297, imputed < obs: 0\n", + "% equal 0.958\n", + "greater: 0.042, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 204\n", + "True 93\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 4 ********\n", + "# rows: 7056\n", + "equal: 6858, imputed > obs: 198, imputed < obs: 0\n", + "% equal 0.972\n", + "greater: 0.028, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 153\n", + "True 45\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 5 ********\n", + "# rows: 7056\n", + "equal: 6999, imputed > obs: 57, imputed < obs: 0\n", + "% equal 
0.992\n", + "greater: 0.008, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 51\n", + "True 6\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 6 ********\n", + "# rows: 7056\n", + "equal: 7044, imputed > obs: 12, imputed < obs: 0\n", + "% equal 0.998\n", + "greater: 0.002, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 12\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 7 ********\n", + "# rows: 7056\n", + "equal: 7056, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: flow\n", + "Month-Year: 7-2023\n", + "Weekday: 2 Hour: 10\n", + "******* lane number: 8 ********\n", + "# rows: 7056\n", + "equal: 7056, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2023), (\"month\", \"==\", 7),\n", + " (\"weekday\", \"==\", 2), (\"hour\", \"==\", 10)\n", + "]]\n", + "\n", + "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, i, METRIC)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b9c1243b-d340-42a0-9f6c-52dd3cf2595b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 1 ********\n", + "# rows: 7761\n", + "equal: 6894, imputed > obs: 867, imputed < obs: 0\n", + "% equal 0.888\n", + "greater: 0.112, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 
762\n", + "True 105\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 2 ********\n", + "# rows: 7761\n", + "equal: 7245, imputed > obs: 516, imputed < obs: 0\n", + "% equal 0.934\n", + "greater: 0.066, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 423\n", + "True 93\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 3 ********\n", + "# rows: 7761\n", + "equal: 7305, imputed > obs: 456, imputed < obs: 0\n", + "% equal 0.941\n", + "greater: 0.059, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 381\n", + "True 75\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 4 ********\n", + "# rows: 7761\n", + "equal: 7425, imputed > obs: 336, imputed < obs: 0\n", + "% equal 0.957\n", + "greater: 0.043, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 288\n", + "True 48\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 5 ********\n", + "# rows: 7761\n", + "equal: 7635, imputed > obs: 126, imputed < obs: 0\n", + "% equal 0.984\n", + "greater: 0.016, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 102\n", + "True 24\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 6 ********\n", + "# rows: 7761\n", + "equal: 7728, imputed > obs: 33, imputed < obs: 0\n", + "% equal 0.996\n", + "greater: 0.004, less: 0.0\n", + "****values when imputed > obs *****\n", + "False 30\n", + "True 3\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 7 
********\n", + "# rows: 7761\n", + "equal: 7761, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: flow\n", + "Month-Year: 9-2023\n", + "Weekday: 1 Hour: 14\n", + "******* lane number: 8 ********\n", + "# rows: 7761\n", + "equal: 7761, imputed > obs: 0, imputed < obs: 0\n", + "% equal 1.0\n", + "greater: 0.0, less: 0.0\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2023), (\"month\", \"==\", 9),\n", + " (\"weekday\", \"==\", 1), (\"hour\", \"==\", 14)\n", + "]]\n", + "\n", + "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, i, METRIC)" + ] + }, + { + "cell_type": "markdown", + "id": "b1fb9c85-6a4a-4b60-af3f-17f465087111", + "metadata": {}, + "source": [ + "## Speed vs observed speed diagnostics" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9c3b7997-8906-4388-bd7a-1d68b884998a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 1 ********\n", + "# rows: 2606\n", + "equal: 70, imputed > obs: 43, imputed < obs: 2479\n", + "% equal 0.027\n", + "greater: 0.017, less: 0.951\n", + "**** values when imputed < obs ****\n", + "count 2479.0\n", + "mean 57.44674\n", + "std 14.862704\n", + "min 5.42\n", + "25% 50.2975\n", + "50% 62.78\n", + "75% 67.87\n", + "max 79.48\n", + "Name: lane_1_speed, dtype: Float64\n", + "count 2479.0\n", + "mean 266.002098\n", + "std 84.267447\n", + "min 22.6\n", + "25% 210.65\n", + "50% 288.6\n", + "75% 326.1\n", + "max 397.4\n", + "Name: lane_1_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 33\n", + "False 10\n", 
+ "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 2 ********\n", + "# rows: 2606\n", + "equal: 55, imputed > obs: 35, imputed < obs: 1631\n", + "% equal 0.021\n", + "greater: 0.013, less: 0.626\n", + "**** values when imputed < obs ****\n", + "count 1631.0\n", + "mean 54.435811\n", + "std 15.345102\n", + "min 6.06\n", + "25% 46.32\n", + "50% 60.075\n", + "75% 65.6675\n", + "max 73.966667\n", + "Name: lane_2_speed, dtype: Float64\n", + "count 1631.0\n", + "mean 251.42992\n", + "std 84.908873\n", + "min 21.2\n", + "25% 194.1\n", + "50% 275.3\n", + "75% 322.5\n", + "max 369.1\n", + "Name: lane_2_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 28\n", + "False 7\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 3 ********\n", + "# rows: 2606\n", + "equal: 51, imputed > obs: 35, imputed < obs: 1471\n", + "% equal 0.02\n", + "greater: 0.013, less: 0.564\n", + "**** values when imputed < obs ****\n", + "count 1471.0\n", + "mean 49.851535\n", + "std 14.697226\n", + "min 5.38\n", + "25% 42.41\n", + "50% 53.3\n", + "75% 61.42\n", + "max 77.825\n", + "Name: lane_3_speed, dtype: Float64\n", + "count 1471.0\n", + "mean 230.203739\n", + "std 79.750198\n", + "min 11.1\n", + "25% 178.3\n", + "50% 247.0\n", + "75% 295.75\n", + "max 360.3\n", + "Name: lane_3_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 28\n", + "False 7\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 4 ********\n", + "# rows: 2606\n", + "equal: 46, imputed > obs: 20, imputed < obs: 1131\n", + "% equal 0.018\n", + "greater: 0.008, less: 0.434\n", + "**** values when imputed < obs ****\n", + "count 1131.0\n", + "mean 47.919651\n", + "std 14.27912\n", + "min 
4.54\n", + "25% 40.35\n", + "50% 51.42\n", + "75% 58.65\n", + "max 76.54\n", + "Name: lane_4_speed, dtype: Float64\n", + "count 1131.0\n", + "mean 218.736163\n", + "std 78.207275\n", + "min 11.3\n", + "25% 164.25\n", + "50% 234.4\n", + "75% 280.25\n", + "max 382.7\n", + "Name: lane_4_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 14\n", + "False 6\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 5 ********\n", + "# rows: 2606\n", + "equal: 11, imputed > obs: 9, imputed < obs: 373\n", + "% equal 0.004\n", + "greater: 0.003, less: 0.143\n", + "**** values when imputed < obs ****\n", + "count 373.0\n", + "mean 50.034495\n", + "std 13.540193\n", + "min 4.94\n", + "25% 43.45\n", + "50% 54.24\n", + "75% 60.333333\n", + "max 65.3\n", + "Name: lane_5_speed, dtype: Float64\n", + "count 373.0\n", + "mean 226.784718\n", + "std 77.696666\n", + "min 9.6\n", + "25% 172.8\n", + "50% 248.1\n", + "75% 289.1\n", + "max 324.8\n", + "Name: lane_5_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 6\n", + "False 3\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 6 ********\n", + "# rows: 2606\n", + "equal: 2, imputed > obs: 1, imputed < obs: 70\n", + "% equal 0.001\n", + "greater: 0.0, less: 0.027\n", + "**** values when imputed < obs ****\n", + "count 70.0\n", + "mean 49.241571\n", + "std 16.127912\n", + "min 5.86\n", + "25% 40.16\n", + "50% 55.28\n", + "75% 62.39\n", + "max 65.04\n", + "Name: lane_6_speed, dtype: Float64\n", + "count 70.0\n", + "mean 233.05\n", + "std 85.957105\n", + "min 17.7\n", + "25% 160.45\n", + "50% 265.65\n", + "75% 310.8\n", + "max 325.2\n", + "Name: lane_6_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 1\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: 
speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 7 ********\n", + "# rows: 2606\n", + "equal: 1, imputed > obs: 0, imputed < obs: 1\n", + "% equal 0.0\n", + "greater: 0.0, less: 0.0\n", + "**** values when imputed < obs ****\n", + "count 1.0\n", + "mean 10.26\n", + "std \n", + "min 10.26\n", + "25% 10.26\n", + "50% 10.26\n", + "75% 10.26\n", + "max 10.26\n", + "Name: lane_7_speed, dtype: Float64\n", + "count 1.0\n", + "mean 38.6\n", + "std \n", + "min 38.6\n", + "25% 38.6\n", + "50% 38.6\n", + "75% 38.6\n", + "max 38.6\n", + "Name: lane_7_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: speed\n", + "Month-Year: 5-2024\n", + "Weekday: 2 Hour: 8\n", + "******* lane number: 8 ********\n", + "# rows: 2606\n", + "equal: 0, imputed > obs: 0, imputed < obs: 0\n", + "% equal 0.0\n", + "greater: 0.0, less: 0.0\n", + "**** values when imputed < obs ****\n", + "count 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max \n", + "Name: lane_8_speed, dtype: Float64\n", + "count 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max \n", + "Name: lane_8_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2024), (\"month\", \"==\", 5),\n", + " (\"weekday\", \"==\", 2), (\"hour\", \"==\", 8)\n", + "]]\n", + "\n", + "METRIC = \"speed\"\n", + "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, i, METRIC)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2c7a4061-b791-422a-97c1-18eae73784b8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + 
"******* lane number: 1 ********\n", + "# rows: 2311\n", + "equal: 98, imputed > obs: 31, imputed < obs: 2169\n", + "% equal 0.042\n", + "greater: 0.013, less: 0.939\n", + "**** values when imputed < obs ****\n", + "count 2169.0\n", + "mean 69.474558\n", + "std 5.819333\n", + "min 29.675\n", + "25% 64.85\n", + "50% 71.4\n", + "75% 74.325\n", + "max 83.475\n", + "Name: lane_1_speed, dtype: Float64\n", + "count 2169.0\n", + "mean 260.652006\n", + "std 47.678904\n", + "min 62.9\n", + "25% 255.3\n", + "50% 278.3\n", + "75% 296.5\n", + "max 333.9\n", + "Name: lane_1_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 26\n", + "False 5\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 2 ********\n", + "# rows: 2311\n", + "equal: 60, imputed > obs: 27, imputed < obs: 1459\n", + "% equal 0.026\n", + "greater: 0.012, less: 0.631\n", + "**** values when imputed < obs ****\n", + "count 1459.0\n", + "mean 68.689011\n", + "std 4.427647\n", + "min 23.45\n", + "25% 67.6375\n", + "50% 69.55\n", + "75% 70.75\n", + "max 76.95\n", + "Name: lane_2_speed, dtype: Float64\n", + "count 1459.0\n", + "mean 258.858259\n", + "std 44.162084\n", + "min 53.7\n", + "25% 259.1\n", + "50% 276.6\n", + "75% 282.7\n", + "max 307.8\n", + "Name: lane_2_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 24\n", + "False 3\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 3 ********\n", + "# rows: 2311\n", + "equal: 51, imputed > obs: 31, imputed < obs: 1356\n", + "% equal 0.022\n", + "greater: 0.013, less: 0.587\n", + "**** values when imputed < obs ****\n", + "count 1356.0\n", + "mean 63.441734\n", + "std 6.004107\n", + "min 40.7\n", + "25% 60.825\n", + "50% 64.783333\n", + "75% 67.175\n", + "max 81.2\n", + "Name: lane_3_speed, dtype: Float64\n", + 
"count 1356.0\n", + "mean 238.662021\n", + "std 44.415839\n", + "min 56.8\n", + "25% 220.425\n", + "50% 251.0\n", + "75% 267.5\n", + "max 321.7\n", + "Name: lane_3_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 28\n", + "False 3\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 4 ********\n", + "# rows: 2311\n", + "equal: 46, imputed > obs: 13, imputed < obs: 1047\n", + "% equal 0.02\n", + "greater: 0.006, less: 0.453\n", + "**** values when imputed < obs ****\n", + "count 1047.0\n", + "mean 61.107649\n", + "std 5.62594\n", + "min 34.925\n", + "25% 59.4\n", + "50% 61.825\n", + "75% 63.8625\n", + "max 72.05\n", + "Name: lane_4_speed, dtype: Float64\n", + "count 1047.0\n", + "mean 229.072015\n", + "std 42.75337\n", + "min 60.0\n", + "25% 211.4\n", + "50% 245.4\n", + "75% 251.1\n", + "max 288.2\n", + "Name: lane_4_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 10\n", + "False 3\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 5 ********\n", + "# rows: 2311\n", + "equal: 17, imputed > obs: 2, imputed < obs: 323\n", + "% equal 0.007\n", + "greater: 0.001, less: 0.14\n", + "**** values when imputed < obs ****\n", + "count 323.0\n", + "mean 62.318989\n", + "std 3.415836\n", + "min 42.175\n", + "25% 61.9375\n", + "50% 63.475\n", + "75% 64.2125\n", + "max 71.125\n", + "Name: lane_5_speed, dtype: Float64\n", + "count 323.0\n", + "mean 236.016099\n", + "std 38.740027\n", + "min 63.2\n", + "25% 239.3\n", + "50% 253.0\n", + "75% 256.7\n", + "max 284.5\n", + "Name: lane_5_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 2\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 6 ********\n", + 
"# rows: 2311\n", + "equal: 6, imputed > obs: 0, imputed < obs: 58\n", + "% equal 0.003\n", + "greater: 0.0, less: 0.025\n", + "**** values when imputed < obs ****\n", + "count 58.0\n", + "mean 63.066667\n", + "std 1.775147\n", + "min 55.35\n", + "25% 62.8625\n", + "50% 63.65\n", + "75% 64.19375\n", + "max 64.7\n", + "Name: lane_6_speed, dtype: Float64\n", + "count 58.0\n", + "mean 231.208621\n", + "std 50.683345\n", + "min 63.1\n", + "25% 242.7\n", + "50% 253.85\n", + "75% 256.675\n", + "max 258.8\n", + "Name: lane_6_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 7 ********\n", + "# rows: 2311\n", + "equal: 0, imputed > obs: 0, imputed < obs: 2\n", + "% equal 0.0\n", + "greater: 0.0, less: 0.001\n", + "**** values when imputed < obs ****\n", + "count 2.0\n", + "mean 62.366667\n", + "std 1.791337\n", + "min 61.1\n", + "25% 61.733333\n", + "50% 62.366667\n", + "75% 63.0\n", + "max 63.633333\n", + "Name: lane_7_speed, dtype: Float64\n", + "count 2.0\n", + "mean 217.65\n", + "std 37.830213\n", + "min 190.9\n", + "25% 204.275\n", + "50% 217.65\n", + "75% 231.025\n", + "max 244.4\n", + "Name: lane_7_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: speed\n", + "Month-Year: 8-2023\n", + "Weekday: 5 Hour: 7\n", + "******* lane number: 8 ********\n", + "# rows: 2311\n", + "equal: 0, imputed > obs: 0, imputed < obs: 0\n", + "% equal 0.0\n", + "greater: 0.0, less: 0.0\n", + "**** values when imputed < obs ****\n", + "count 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max \n", + "Name: lane_8_speed, dtype: Float64\n", + "count 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max \n", + "Name: lane_8_obs_speed, dtype: Float64\n", + 
"****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2023), (\"month\", \"==\", 8),\n", + " (\"weekday\", \"==\", 5), (\"hour\", \"==\", 7)\n", + "]]\n", + "\n", + "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, i, METRIC)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8473d5a9-11ed-4e43-b553-1464c7a4aa24", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 1 ********\n", + "# rows: 2604\n", + "equal: 89, imputed > obs: 67, imputed < obs: 2430\n", + "% equal 0.034\n", + "greater: 0.026, less: 0.933\n", + "**** values when imputed < obs ****\n", + "count 2430.0\n", + "mean 64.361485\n", + "std 9.077721\n", + "min 17.875\n", + "25% 60.73125\n", + "50% 65.0\n", + "75% 71.075\n", + "max 78.9\n", + "Name: lane_1_speed, dtype: Float64\n", + "count 2430.0\n", + "mean 238.871605\n", + "std 55.517956\n", + "min 51.3\n", + "25% 212.4\n", + "50% 258.0\n", + "75% 280.4\n", + "max 315.6\n", + "Name: lane_1_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 49\n", + "False 18\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 2 ********\n", + "# rows: 2604\n", + "equal: 67, imputed > obs: 40, imputed < obs: 1622\n", + "% equal 0.026\n", + "greater: 0.015, less: 0.623\n", + "**** values when imputed < obs ****\n", + "count 1622.0\n", + "mean 61.670176\n", + "std 8.343252\n", + "min 17.9\n", + "25% 58.089583\n", + "50% 63.875\n", + "75% 67.25\n", + "max 74.375\n", + "Name: lane_2_speed, dtype: Float64\n", + "count 1622.0\n", + "mean 228.833847\n", + "std 51.732604\n", + "min 54.2\n", + "25% 204.05\n", + "50% 
247.35\n", + "75% 266.3\n", + "max 297.5\n", + "Name: lane_2_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 30\n", + "False 10\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 3 ********\n", + "# rows: 2604\n", + "equal: 64, imputed > obs: 41, imputed < obs: 1483\n", + "% equal 0.025\n", + "greater: 0.016, less: 0.57\n", + "**** values when imputed < obs ****\n", + "count 1483.0\n", + "mean 55.931468\n", + "std 8.478252\n", + "min 15.35\n", + "25% 51.7\n", + "50% 57.0\n", + "75% 61.9\n", + "max 77.175\n", + "Name: lane_3_speed, dtype: Float64\n", + "count 1483.0\n", + "mean 207.672218\n", + "std 48.982583\n", + "min 45.9\n", + "25% 186.7\n", + "50% 218.9\n", + "75% 243.25\n", + "max 308.7\n", + "Name: lane_3_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 26\n", + "False 15\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 4 ********\n", + "# rows: 2604\n", + "equal: 58, imputed > obs: 27, imputed < obs: 1137\n", + "% equal 0.022\n", + "greater: 0.01, less: 0.437\n", + "**** values when imputed < obs ****\n", + "count 1137.0\n", + "mean 54.160195\n", + "std 7.923261\n", + "min 17.425\n", + "25% 50.575\n", + "50% 55.0\n", + "75% 59.475\n", + "max 76.75\n", + "Name: lane_4_speed, dtype: Float64\n", + "count 1137.0\n", + "mean 200.304222\n", + "std 46.375729\n", + "min 53.3\n", + "25% 180.8\n", + "50% 212.2\n", + "75% 231.1\n", + "max 307.0\n", + "Name: lane_4_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 17\n", + "False 10\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 5 ********\n", + "# rows: 2604\n", + "equal: 17, imputed > obs: 7, imputed < obs: 378\n", + "% 
equal 0.007\n", + "greater: 0.003, less: 0.145\n", + "**** values when imputed < obs ****\n", + "count 378.0\n", + "mean 56.131768\n", + "std 7.750725\n", + "min 19.7\n", + "25% 52.09375\n", + "50% 57.0375\n", + "75% 63.16875\n", + "max 64.875\n", + "Name: lane_5_speed, dtype: Float64\n", + "count 378.0\n", + "mean 205.230159\n", + "std 50.357375\n", + "min 41.7\n", + "25% 185.575\n", + "50% 220.0\n", + "75% 246.5\n", + "max 259.5\n", + "Name: lane_5_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 6\n", + "False 1\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 6 ********\n", + "# rows: 2604\n", + "equal: 4, imputed > obs: 2, imputed < obs: 74\n", + "% equal 0.002\n", + "greater: 0.001, less: 0.028\n", + "**** values when imputed < obs ****\n", + "count 74.0\n", + "mean 58.568018\n", + "std 6.357801\n", + "min 38.35\n", + "25% 55.93125\n", + "50% 60.8875\n", + "75% 63.41875\n", + "max 66.25\n", + "Name: lane_6_speed, dtype: Float64\n", + "count 74.0\n", + "mean 207.535135\n", + "std 54.252586\n", + "min 64.5\n", + "25% 186.675\n", + "50% 225.9\n", + "75% 253.175\n", + "max 257.8\n", + "Name: lane_6_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "True 2\n", + "Name: obs_col_zero, dtype: int64\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 7 ********\n", + "# rows: 2604\n", + "equal: 1, imputed > obs: 0, imputed < obs: 1\n", + "% equal 0.0\n", + "greater: 0.0, less: 0.0\n", + "**** values when imputed < obs ****\n", + "count 1.0\n", + "mean 63.5\n", + "std \n", + "min 63.5\n", + "25% 63.5\n", + "50% 63.5\n", + "75% 63.5\n", + "max 63.5\n", + "Name: lane_7_speed, dtype: Float64\n", + "count 1.0\n", + "mean 190.5\n", + "std \n", + "min 190.5\n", + "25% 190.5\n", + "50% 190.5\n", + "75% 190.5\n", + "max 190.5\n", + "Name: lane_7_obs_speed, dtype: 
Float64\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n", + "\n", + "Metric: speed\n", + "Month-Year: 10-2023\n", + "Weekday: 3 Hour: 13\n", + "******* lane number: 8 ********\n", + "# rows: 2604\n", + "equal: 0, imputed > obs: 0, imputed < obs: 0\n", + "% equal 0.0\n", + "greater: 0.0, less: 0.0\n", + "**** values when imputed < obs ****\n", + "count 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max \n", + "Name: lane_8_speed, dtype: Float64\n", + "count 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max \n", + "Name: lane_8_obs_speed, dtype: Float64\n", + "****values when imputed > obs *****\n", + "Series([], Name: obs_col_zero, dtype: int64)\n" + ] + } + ], + "source": [ + "filtering = [[\n", + " (\"year\", \"==\", 2023), (\"month\", \"==\", 10),\n", + " (\"weekday\", \"==\", 3), (\"hour\", \"==\", 13)\n", + "]]\n", + "\n", + "df = metric_vs_observed(METRIC, filtering)\n", + "\n", + "for i in range(1, 9):\n", + " lane_comparisons(df, i, METRIC)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8841dd2a-2d63-476d-8723-115f62dc526e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/traffic_ops/aggregate.py b/traffic_ops/aggregate.py index 06b862755f..bdcdac00d8 100644 --- a/traffic_ops/aggregate.py +++ b/traffic_ops/aggregate.py @@ -14,6 +14,7 @@ import utils from utils import RAW_GCS, PROCESSED_GCS from crosswalks import station_id_cols +from shared_utils import publish_utils 
fs = gcsfs.GCSFileSystem() @@ -51,38 +52,25 @@ def aggregate_metric( group_cols: list, metric_name: Literal["flow", "truck_flow", "occ", "obs", "speed"] ) -> pd.DataFrame: - + """ + Aggregate metric (mean preferred for now) + against a list of grouping columns. + """ metric_cols = [c for c in df.columns if metric_name in c] - - if metric_name == "speed": - metric_agg = "mean" - else: - metric_agg = "sum" - - if metric_name in ["occ", "speed", "obs_speed"]: - metric_dtypes = {c: "Float64" for c in metric_cols} - - else: - metric_dtypes = {c: "Int64" for c in metric_cols} - + df2 = ( df .groupby(group_cols, group_keys=False) .agg( - {**{c: metric_agg for c in metric_cols}} + {**{c: "mean" for c in metric_cols}} ).reset_index() .astype({ - **metric_dtypes, - "year": "int16", - "month": "int8", - "weekday": "int8", + # since everything is mean, use floats, but allow NaNs + **{c: "Float64" for c in metric_cols} }) ) - if "hour" in df2.columns: - df2 = df2.astype({"hour": "int8"}) - return df2 @@ -102,6 +90,9 @@ def metric_prep(metric: str) -> list: engine="pyarrow" ).columns.tolist() + # When we are interested in 1 particular metric, + # we should look for columns that contain a keyword + # but remove confounding ones (metric = flow; remove obs_flow, truck_flow) exclude_dict = { "flow": ["obs", "truck"], "occ": ["avg_occ"], @@ -117,6 +108,9 @@ def metric_prep(metric: str) -> list: any(word in c for word in exclude_dict[metric]) ] + # Create list of delayed dfs where we read in 1 partition + # and subset the columns + # Note: dd.read_parquet() has dtype errors import_dfs = [ delayed(read_filepart_merge_crosswalk)( filename, @@ -125,11 +119,18 @@ def metric_prep(metric: str) -> list: ) for part_i in list_of_files ] + # Add additional time columns we want time_dfs = [ delayed(utils.parse_for_time_components)(i) for i in import_dfs ] - time_dfs = [delayed(utils.add_peak_offpeak_column)(i, "hour") for i in time_dfs] + time_dfs = [delayed(utils.add_peak_offpeak_column)(i, 
"hour") + for i in time_dfs] + + time_dfs = [ + delayed(utils.add_weekday_weekend_column)(i, "weekday") + for i in time_dfs + ] return time_dfs @@ -146,6 +147,12 @@ def compute_and_export( """ metric_dfs = [compute(i)[0] for i in metric_dfs] results = pd.concat(metric_dfs, axis=0, ignore_index=True) + + for c in ["hour", "month", "weekday"]: + if c in results.columns: + results = results.astype({c: "int8"}) + if "year" in results.columns: + results = results.astype({"year": "int16"}) results.to_parquet( f"{PROCESSED_GCS}{export_filename}_{metric}.parquet", @@ -154,6 +161,38 @@ def compute_and_export( return +def process_one_metric( + metric_name: Literal["flow", "truck_flow", "occ", "obs", "speed"], + group_cols: list, + export_filename: str +): + """ + Prep and aggregate one metric and save out at particular grain. + """ + time0 = datetime.datetime.now() + + time_dfs = metric_prep(metric_name) + + aggregated_dfs = [ + delayed(aggregate_metric)(i, group_cols, metric_name) + for i in time_dfs + ] + + publish_utils.if_exists_then_delete( + f"{PROCESSED_GCS}{export_filename}_{metric_name}.parquet" + ) + + compute_and_export( + metric_name, + aggregated_dfs, + export_filename, + ) + + time1 = datetime.datetime.now() + print(f"{metric_name} exported {export_filename}: {time1 - time0}") + + return + def import_detector_status( filename: str = "hov_portion_detector_status_time_window", @@ -169,6 +208,8 @@ def import_detector_status( utils.parse_for_time_components ).pipe( utils.add_peak_offpeak_column, "hour" + ).pipe( + utils.add_weekday_weekend_column, "weekday" ) # Merge in station_uuid @@ -215,73 +256,37 @@ def aggregate_detector_samples( metric_list = [ "flow", "truck_flow", "obs_flow", - "occ", - "speed", "obs_speed", # mean - "pts_obs", + "occ", "speed", "obs_speed", "pts_obs", ] station_cols = ["station_uuid"] - weekday_hour_cols = ["year", "month", "weekday", "hour"] - weekday_peak_cols = ["year", "month", "weekday", "peak_offpeak"] - for metric in metric_list: - 
- time0 = datetime.datetime.now() - - time_dfs = metric_prep(metric) - - hour_dfs = [ - delayed(aggregate_metric)(i, station_cols + weekday_hour_cols, metric) - for i in time_dfs - ] - - compute_and_export( - metric, - hour_dfs, - "station_weekday_hour", - partition_cols = ["weekday", "hour"] - ) + GRAINS = { + "station_weekday_hour": station_cols + ["year", "month", "weekday", "hour"], + "station_weekday_peak": station_cols + ["year", "month", "weekday", "peak_offpeak"], + "station_daytype_hour": station_cols + ["hour", "daytype"] + } - time1 = datetime.datetime.now() - print(f"{metric} hourly aggregation: {time1 - time0}") - - peak_dfs = [ - delayed(aggregate_metric)(i, station_cols + weekday_peak_cols, metric) - for i in time_dfs - ] + for metric in metric_list: - compute_and_export( - metric, - peak_dfs, - "station_weekday_peak", - partition_cols = ["weekday", "peak_offpeak"] - ) + for export_filename, grain_cols in GRAINS.items(): + + process_one_metric(metric, grain_cols, export_filename) - time2 = datetime.datetime.now() - print(f"{metric} peak/offpeak aggregation: {time2 - time1}") - print(f"{metric} aggregation: {time2 - time0}") - detector_df = import_detector_status() - detector_station_hour = aggregate_detector_samples( - detector_df, - station_cols + weekday_hour_cols - ) - - detector_station_hour.to_parquet( - f"{PROCESSED_GCS}station_weekday_hour_detectors.parquet" - ) - - detector_station_weekday = aggregate_detector_samples( - detector_df, - station_cols + weekday_peak_cols - ) - - detector_station_weekday.to_parquet( - f"{PROCESSED_GCS}station_weekday_peak_detectors.parquet" - ) + for export_filename, grain_cols in GRAINS.items(): + + agg_df = aggregate_detector_samples( + detector_df, + grain_cols + ) + + agg_df.to_parquet( + f"{PROCESSED_GCS}{export_filename}_detectors.parquet" + ) end = datetime.datetime.now() print(f"execution time: {end - start}") \ No newline at end of file diff --git a/traffic_ops/utils.py b/traffic_ops/utils.py index 
6dee8e87c4..7c6f1e4513 100644 --- a/traffic_ops/utils.py +++ b/traffic_ops/utils.py @@ -13,6 +13,10 @@ def parse_for_time_components( df: pd.DataFrame, time_col: str = "time_id" ) -> pd.DataFrame: + """ + Parse the time_id column into several components: + year, month, weekday (as integer), and hour. + """ df2 = df.assign( year = pd.to_datetime(df[time_col]).dt.year, @@ -29,7 +33,9 @@ def add_peak_offpeak_column( df: pd.DataFrame, hour_col: str = "hour" ) -> pd.DataFrame: - + """ + Categorize hour into peak / offpeak. + """ hours_in_day = range(0, 24) peak_offpeak_dict = { @@ -44,4 +50,21 @@ def add_peak_offpeak_column( return df - \ No newline at end of file +def add_weekday_weekend_column( + df: pd.DataFrame, + weekday_col: str = "weekday" +) -> pd.DataFrame: + """ + Categorize day of week into daytype (weekday or weekend). + """ + + weekday_weekend_dict = { + **{k: "weekday" for k in [0, 1, 2, 3, 4]}, + **{k: "weekend" for k in [5, 6]}, + } + + df = df.assign( + daytype = df[weekday_col].map(weekday_weekend_dict) + ) + + return df \ No newline at end of file From ab93ad40344037b0b7d3d22cc650db38e9e53c30 Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Wed, 14 Aug 2024 17:25:03 +0000 Subject: [PATCH 2/2] add exploratory notebook for flow/obs_flow, speed/obs_speed --- traffic_ops/01_observed_flow_speed.ipynb | 3521 ++++++++++++++-------- 1 file changed, 2270 insertions(+), 1251 deletions(-) diff --git a/traffic_ops/01_observed_flow_speed.ipynb b/traffic_ops/01_observed_flow_speed.ipynb index 8c51ec1361..53594fce44 100644 --- a/traffic_ops/01_observed_flow_speed.ipynb +++ b/traffic_ops/01_observed_flow_speed.ipynb @@ -10,12 +10,14 @@ "Side-by-side comparison of what seems to be related columns.\n", "\n", "## What we learned\n", + "* **use `flow`, `speed` columns** and ignore `obs_flow`, `obs_speed`\n", "* `flow` is always >= `obs_flow`\n", "* Most of the time, `flow = obs_flow`\n", - "* Why would imputation be more than what's observed? 
Is this only true when `flow == 0`? Yes, seems to be imputation happens when it's majority `obs_flow==0`, although there are a small percentage of cases where this isn't true.\n", + "* Why would imputation be more than what's observed? Is this only true when `obs_flow == 0`?\n", + " * It does appear that when `obs_flow` has mean = 0, `flow` will be higher and hold non-zero values.\n", "* We'll just use `flow` for now, and use imputed values always?\n", "* Most cases are `speed < obs_speed` (so imputed tends to be less than what detector says).\n", - "* Looking at descriptives, these are occurring where observed speed is really high, 100-200 mph!" + "* Looking at descriptives, this occurs where the observed speed is perceived to be too high and is adjusted down, though means don't differ by too much for the subset where `speed < obs_speed`." ] }, { @@ -41,12 +43,12 @@ " PREFIX = \"station_weekday_hour\"\n", "\n", " metric_df = pd.read_parquet(\n", - " f\"{PROCESSED_GCS}{PREFIX}_{metric}.parquet/\",\n", + " f\"{PROCESSED_GCS}{PREFIX}_{metric}.parquet\",\n", " filters = filtering\n", " )\n", "\n", " obs_metric_df = pd.read_parquet(\n", - " f\"{PROCESSED_GCS}{PREFIX}_obs_{metric}.parquet/\",\n", + " f\"{PROCESSED_GCS}{PREFIX}_obs_{metric}.parquet\",\n", " filters = filtering\n", " )\n", " \n", @@ -65,15 +67,22 @@ { "cell_type": "code", "execution_count": 3, + "id": "0eeb85d1-1b33-4216-aafa-dd901f3c325d", + "metadata": {}, + "outputs": [], + "source": [ + "flow_df = metric_vs_observed(\"flow\", filtering = None)\n", + "speed_df = metric_vs_observed(\"speed\", filtering = None)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "id": "4ada8106-a779-4908-9234-218370911c5c", "metadata": {}, "outputs": [], "source": [ "def lane_comparisons(df: pd.DataFrame, lane_number: int, metric: str):\n", - " print(f\"\\nMetric: {metric}\")\n", - " print(f\"Month-Year: {df.month.iloc[0]}-{df.year.iloc[0]}\")\n", - " print(f\"Weekday: {df.weekday.iloc[0]} Hour: 
{df.hour.iloc[0]}\")\n", - " print(f\"******* lane number: {lane_number} ********\")\n", "\n", " N_ROWS = len(df)\n", " \n", @@ -83,6 +92,8 @@ " col = f\"lane_{lane_number}_{metric}\"\n", " obs_col = f\"lane_{lane_number}_obs_{metric}\"\n", " \n", + " display(df[[col, obs_col]].describe())\n", + " \n", " N_EQUAL = df[df[col] == df[obs_col]].shape[0]\n", " N_MORE = df[df[col] > df[obs_col]].shape[0]\n", " N_LESS = df[df[col] < df[obs_col]].shape[0]\n", @@ -98,8 +109,7 @@ "\n", " less_df = df.loc[df[col] < df[obs_col]]\n", "\n", - " print(less_df[col].describe())\n", - " print(less_df[obs_col].describe())\n", + " display(less_df[[col, obs_col]].describe())\n", " \n", " print(\"****values when imputed > obs *****\")\n", " \n", @@ -124,483 +134,795 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "433cf0e1-9e6d-44e3-9fd6-3959971d3b67", "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_1_flowlane_1_obs_flow
count5490744.05524535.0
mean761.029763722.35622
std564.397175562.738313
min0.00.0
25%246.4207.4
50%699.0645.75
75%1210.01164.333333
max5198.05198.0
\n", + "
" + ], + "text/plain": [ + " lane_1_flow lane_1_obs_flow\n", + "count 5490744.0 5524535.0\n", + "mean 761.029763 722.35622\n", + "std 564.397175 562.738313\n", + "min 0.0 0.0\n", + "25% 246.4 207.4\n", + "50% 699.0 645.75\n", + "75% 1210.0 1164.333333\n", + "max 5198.0 5198.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Metric: flow\n", - "Month-Year: 6-2024\n", - "Weekday: 2 Hour: 18\n", - "******* lane number: 1 ********\n", - "# rows: 7026\n", - "equal: 6873, imputed > obs: 153, imputed < obs: 0\n", - "% equal 0.978\n", - "greater: 0.022, less: 0.0\n", - "****values when imputed > obs *****\n", - "True 108\n", - "False 45\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 6-2024\n", - "Weekday: 2 Hour: 18\n", - "******* lane number: 2 ********\n", - "# rows: 7026\n", - "equal: 6906, imputed > obs: 120, imputed < obs: 0\n", - "% equal 0.983\n", - "greater: 0.017, less: 0.0\n", - "****values when imputed > obs *****\n", - "True 84\n", - "False 36\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 6-2024\n", - "Weekday: 2 Hour: 18\n", - "******* lane number: 3 ********\n", - "# rows: 7026\n", - "equal: 6927, imputed > obs: 99, imputed < obs: 0\n", - "% equal 0.986\n", - "greater: 0.014, less: 0.0\n", - "****values when imputed > obs *****\n", - "True 66\n", - "False 33\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 6-2024\n", - "Weekday: 2 Hour: 18\n", - "******* lane number: 4 ********\n", - "# rows: 7026\n", - "equal: 6978, imputed > obs: 48, imputed < obs: 0\n", - "% equal 0.993\n", - "greater: 0.007, less: 0.0\n", - "****values when imputed > obs *****\n", - "True 42\n", - "False 6\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 6-2024\n", - "Weekday: 2 Hour: 18\n", - "******* lane number: 5 ********\n", - "# rows: 7026\n", - 
"equal: 7002, imputed > obs: 24, imputed < obs: 0\n", - "% equal 0.997\n", - "greater: 0.003, less: 0.0\n", + "# rows: 5524535\n", + "equal: 4826068, imputed > obs: 664676, imputed < obs: 0\n", + "% equal 0.874\n", + "greater: 0.12, less: 0.0\n", "****values when imputed > obs *****\n", - "True 24\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 6-2024\n", - "Weekday: 2 Hour: 18\n", - "******* lane number: 6 ********\n", - "# rows: 7026\n", - "equal: 7023, imputed > obs: 3, imputed < obs: 0\n", - "% equal 1.0\n", - "greater: 0.0, less: 0.0\n", - "****values when imputed > obs *****\n", - "True 3\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 6-2024\n", - "Weekday: 2 Hour: 18\n", - "******* lane number: 7 ********\n", - "# rows: 7026\n", - "equal: 7026, imputed > obs: 0, imputed < obs: 0\n", - "% equal 1.0\n", - "greater: 0.0, less: 0.0\n", - "****values when imputed > obs *****\n", - "Series([], Name: obs_col_zero, dtype: int64)\n", - "\n", - "Metric: flow\n", - "Month-Year: 6-2024\n", - "Weekday: 2 Hour: 18\n", - "******* lane number: 8 ********\n", - "# rows: 7026\n", - "equal: 7026, imputed > obs: 0, imputed < obs: 0\n", - "% equal 1.0\n", - "greater: 0.0, less: 0.0\n", - "****values when imputed > obs *****\n", - "Series([], Name: obs_col_zero, dtype: int64)\n" + "False 566253\n", + "True 98423\n", + "Name: obs_col_zero, dtype: int64\n" ] - } - ], - "source": [ - "filtering = [[\n", - " (\"year\", \"==\", 2024), (\"month\", \"==\", 6),\n", - " (\"weekday\", \"==\", 2), (\"hour\", \"==\", 18)\n", - "]]\n", - "\n", - "METRIC = \"flow\"\n", - "df = metric_vs_observed(METRIC, filtering)\n", - "\n", - "for i in range(1, 9):\n", - " lane_comparisons(df, i, METRIC)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "38ec11bd-389a-4e5c-9f62-ede7578f0850", - "metadata": {}, - "outputs": [ + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_2_flowlane_2_obs_flow
count3668484.05524535.0
mean912.07882577.289599
std544.438406606.562733
min0.00.0
25%400.40.0
50%965.0347.0
75%1354.51131.333333
max4737.04737.0
\n", + "
" + ], + "text/plain": [ + " lane_2_flow lane_2_obs_flow\n", + "count 3668484.0 5524535.0\n", + "mean 912.07882 577.289599\n", + "std 544.438406 606.562733\n", + "min 0.0 0.0\n", + "25% 400.4 0.0\n", + "50% 965.0 347.0\n", + "75% 1354.5 1131.333333\n", + "max 4737.0 4737.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Metric: flow\n", - "Month-Year: 3-2024\n", - "Weekday: 4 Hour: 10\n", - "******* lane number: 1 ********\n", - "# rows: 7884\n", - "equal: 7413, imputed > obs: 471, imputed < obs: 0\n", - "% equal 0.94\n", - "greater: 0.06, less: 0.0\n", - "****values when imputed > obs *****\n", - "False 375\n", - "True 96\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 3-2024\n", - "Weekday: 4 Hour: 10\n", - "******* lane number: 2 ********\n", - "# rows: 7884\n", - "equal: 7590, imputed > obs: 294, imputed < obs: 0\n", - "% equal 0.963\n", - "greater: 0.037, less: 0.0\n", - "****values when imputed > obs *****\n", - "False 210\n", - "True 84\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 3-2024\n", - "Weekday: 4 Hour: 10\n", - "******* lane number: 3 ********\n", - "# rows: 7884\n", - "equal: 7620, imputed > obs: 264, imputed < obs: 0\n", - "% equal 0.967\n", - "greater: 0.033, less: 0.0\n", - "****values when imputed > obs *****\n", - "False 174\n", - "True 90\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 3-2024\n", - "Weekday: 4 Hour: 10\n", - "******* lane number: 4 ********\n", - "# rows: 7884\n", - "equal: 7662, imputed > obs: 222, imputed < obs: 0\n", - "% equal 0.972\n", - "greater: 0.028, less: 0.0\n", + "# rows: 5524535\n", + "equal: 3219391, imputed > obs: 449093, imputed < obs: 0\n", + "% equal 0.583\n", + "greater: 0.081, less: 0.0\n", "****values when imputed > obs *****\n", - "False 180\n", - "True 42\n", - "Name: obs_col_zero, dtype: int64\n", - 
"\n", - "Metric: flow\n", - "Month-Year: 3-2024\n", - "Weekday: 4 Hour: 10\n", - "******* lane number: 5 ********\n", - "# rows: 7884\n", - "equal: 7827, imputed > obs: 57, imputed < obs: 0\n", - "% equal 0.993\n", - "greater: 0.007, less: 0.0\n", - "****values when imputed > obs *****\n", - "False 48\n", - "True 9\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 3-2024\n", - "Weekday: 4 Hour: 10\n", - "******* lane number: 6 ********\n", - "# rows: 7884\n", - "equal: 7878, imputed > obs: 6, imputed < obs: 0\n", - "% equal 0.999\n", - "greater: 0.001, less: 0.0\n", - "****values when imputed > obs *****\n", - "False 6\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 3-2024\n", - "Weekday: 4 Hour: 10\n", - "******* lane number: 7 ********\n", - "# rows: 7884\n", - "equal: 7884, imputed > obs: 0, imputed < obs: 0\n", - "% equal 1.0\n", - "greater: 0.0, less: 0.0\n", - "****values when imputed > obs *****\n", - "Series([], Name: obs_col_zero, dtype: int64)\n", - "\n", - "Metric: flow\n", - "Month-Year: 3-2024\n", - "Weekday: 4 Hour: 10\n", - "******* lane number: 8 ********\n", - "# rows: 7884\n", - "equal: 7884, imputed > obs: 0, imputed < obs: 0\n", - "% equal 1.0\n", - "greater: 0.0, less: 0.0\n", - "****values when imputed > obs *****\n", - "Series([], Name: obs_col_zero, dtype: int64)\n" + "False 372805\n", + "True 76288\n", + "Name: obs_col_zero, dtype: int64\n" ] - } - ], - "source": [ - "filtering = [[\n", - " (\"year\", \"==\", 2024), (\"month\", \"==\", 3),\n", - " (\"weekday\", \"==\", 4), (\"hour\", \"==\", 10)\n", - "]]\n", - "\n", - "df = metric_vs_observed(METRIC, filtering)\n", - "\n", - "for i in range(1, 9):\n", - " lane_comparisons(df, i, METRIC)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "9a183a7c-6c92-49fd-83ac-f8027a6253d4", - "metadata": {}, - "outputs": [ + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_3_flowlane_3_obs_flow
count3346786.05524535.0
mean829.965098478.92817
std480.593977540.801227
min0.00.0
25%386.750.0
50%861.25233.0
75%1217.0952.75
max4496.04496.0
\n", + "
" + ], + "text/plain": [ + " lane_3_flow lane_3_obs_flow\n", + "count 3346786.0 5524535.0\n", + "mean 829.965098 478.92817\n", + "std 480.593977 540.801227\n", + "min 0.0 0.0\n", + "25% 386.75 0.0\n", + "50% 861.25 233.0\n", + "75% 1217.0 952.75\n", + "max 4496.0 4496.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Metric: flow\n", - "Month-Year: 7-2023\n", - "Weekday: 2 Hour: 10\n", - "******* lane number: 1 ********\n", - "# rows: 7056\n", - "equal: 6552, imputed > obs: 504, imputed < obs: 0\n", - "% equal 0.929\n", - "greater: 0.071, less: 0.0\n", - "****values when imputed > obs *****\n", - "False 402\n", - "True 102\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 7-2023\n", - "Weekday: 2 Hour: 10\n", - "******* lane number: 2 ********\n", - "# rows: 7056\n", - "equal: 6723, imputed > obs: 333, imputed < obs: 0\n", - "% equal 0.953\n", - "greater: 0.047, less: 0.0\n", - "****values when imputed > obs *****\n", - "False 258\n", - "True 75\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 7-2023\n", - "Weekday: 2 Hour: 10\n", - "******* lane number: 3 ********\n", - "# rows: 7056\n", - "equal: 6759, imputed > obs: 297, imputed < obs: 0\n", - "% equal 0.958\n", - "greater: 0.042, less: 0.0\n", - "****values when imputed > obs *****\n", - "False 204\n", - "True 93\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 7-2023\n", - "Weekday: 2 Hour: 10\n", - "******* lane number: 4 ********\n", - "# rows: 7056\n", - "equal: 6858, imputed > obs: 198, imputed < obs: 0\n", - "% equal 0.972\n", - "greater: 0.028, less: 0.0\n", + "# rows: 5524535\n", + "equal: 2929831, imputed > obs: 416955, imputed < obs: 0\n", + "% equal 0.53\n", + "greater: 0.075, less: 0.0\n", "****values when imputed > obs *****\n", - "False 153\n", - "True 45\n", - "Name: obs_col_zero, dtype: int64\n", - 
"\n", - "Metric: flow\n", - "Month-Year: 7-2023\n", - "Weekday: 2 Hour: 10\n", - "******* lane number: 5 ********\n", - "# rows: 7056\n", - "equal: 6999, imputed > obs: 57, imputed < obs: 0\n", - "% equal 0.992\n", - "greater: 0.008, less: 0.0\n", - "****values when imputed > obs *****\n", - "False 51\n", - "True 6\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 7-2023\n", - "Weekday: 2 Hour: 10\n", - "******* lane number: 6 ********\n", - "# rows: 7056\n", - "equal: 7044, imputed > obs: 12, imputed < obs: 0\n", - "% equal 0.998\n", - "greater: 0.002, less: 0.0\n", - "****values when imputed > obs *****\n", - "False 12\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 7-2023\n", - "Weekday: 2 Hour: 10\n", - "******* lane number: 7 ********\n", - "# rows: 7056\n", - "equal: 7056, imputed > obs: 0, imputed < obs: 0\n", - "% equal 1.0\n", - "greater: 0.0, less: 0.0\n", - "****values when imputed > obs *****\n", - "Series([], Name: obs_col_zero, dtype: int64)\n", - "\n", - "Metric: flow\n", - "Month-Year: 7-2023\n", - "Weekday: 2 Hour: 10\n", - "******* lane number: 8 ********\n", - "# rows: 7056\n", - "equal: 7056, imputed > obs: 0, imputed < obs: 0\n", - "% equal 1.0\n", - "greater: 0.0, less: 0.0\n", - "****values when imputed > obs *****\n", - "Series([], Name: obs_col_zero, dtype: int64)\n" + "False 342808\n", + "True 74147\n", + "Name: obs_col_zero, dtype: int64\n" ] - } - ], - "source": [ - "filtering = [[\n", - " (\"year\", \"==\", 2023), (\"month\", \"==\", 7),\n", - " (\"weekday\", \"==\", 2), (\"hour\", \"==\", 10)\n", - "]]\n", - "\n", - "df = metric_vs_observed(METRIC, filtering)\n", - "\n", - "for i in range(1, 9):\n", - " lane_comparisons(df, i, METRIC)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "b9c1243b-d340-42a0-9f6c-52dd3cf2595b", - "metadata": {}, - "outputs": [ + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_4_flowlane_4_obs_flow
count2577647.05524535.0
mean768.204926342.366352
std474.421297490.061659
min0.00.0
25%331.750.0
50%770.250.0
75%1139.333333663.25
max4235.04235.0
\n", + "
" + ], + "text/plain": [ + " lane_4_flow lane_4_obs_flow\n", + "count 2577647.0 5524535.0\n", + "mean 768.204926 342.366352\n", + "std 474.421297 490.061659\n", + "min 0.0 0.0\n", + "25% 331.75 0.0\n", + "50% 770.25 0.0\n", + "75% 1139.333333 663.25\n", + "max 4235.0 4235.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Metric: flow\n", - "Month-Year: 9-2023\n", - "Weekday: 1 Hour: 14\n", - "******* lane number: 1 ********\n", - "# rows: 7761\n", - "equal: 6894, imputed > obs: 867, imputed < obs: 0\n", - "% equal 0.888\n", - "greater: 0.112, less: 0.0\n", + "# rows: 5524535\n", + "equal: 2259657, imputed > obs: 317990, imputed < obs: 0\n", + "% equal 0.409\n", + "greater: 0.058, less: 0.0\n", "****values when imputed > obs *****\n", - "False 762\n", - "True 105\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 9-2023\n", - "Weekday: 1 Hour: 14\n", - "******* lane number: 2 ********\n", - "# rows: 7761\n", - "equal: 7245, imputed > obs: 516, imputed < obs: 0\n", - "% equal 0.934\n", - "greater: 0.066, less: 0.0\n", - "****values when imputed > obs *****\n", - "False 423\n", - "True 93\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 9-2023\n", - "Weekday: 1 Hour: 14\n", - "******* lane number: 3 ********\n", - "# rows: 7761\n", - "equal: 7305, imputed > obs: 456, imputed < obs: 0\n", - "% equal 0.941\n", - "greater: 0.059, less: 0.0\n", - "****values when imputed > obs *****\n", - "False 381\n", - "True 75\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 9-2023\n", - "Weekday: 1 Hour: 14\n", - "******* lane number: 4 ********\n", - "# rows: 7761\n", - "equal: 7425, imputed > obs: 336, imputed < obs: 0\n", - "% equal 0.957\n", - "greater: 0.043, less: 0.0\n", - "****values when imputed > obs *****\n", - "False 288\n", - "True 48\n", - "Name: obs_col_zero, dtype: int64\n", - 
"\n", - "Metric: flow\n", - "Month-Year: 9-2023\n", - "Weekday: 1 Hour: 14\n", - "******* lane number: 5 ********\n", - "# rows: 7761\n", - "equal: 7635, imputed > obs: 126, imputed < obs: 0\n", - "% equal 0.984\n", - "greater: 0.016, less: 0.0\n", + "False 272609\n", + "True 45381\n", + "Name: obs_col_zero, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_5_flowlane_5_obs_flow
count834680.05524535.0
mean676.55977897.664623
std483.344518297.848786
min0.00.0
25%246.00.0
50%614.00.0
75%1036.00.0
max2755.52755.5
\n", + "
" + ], + "text/plain": [ + " lane_5_flow lane_5_obs_flow\n", + "count 834680.0 5524535.0\n", + "mean 676.559778 97.664623\n", + "std 483.344518 297.848786\n", + "min 0.0 0.0\n", + "25% 246.0 0.0\n", + "50% 614.0 0.0\n", + "75% 1036.0 0.0\n", + "max 2755.5 2755.5" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# rows: 5524535\n", + "equal: 728520, imputed > obs: 106160, imputed < obs: 0\n", + "% equal 0.132\n", + "greater: 0.019, less: 0.0\n", "****values when imputed > obs *****\n", - "False 102\n", - "True 24\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 9-2023\n", - "Weekday: 1 Hour: 14\n", - "******* lane number: 6 ********\n", - "# rows: 7761\n", - "equal: 7728, imputed > obs: 33, imputed < obs: 0\n", - "% equal 0.996\n", + "False 92664\n", + "True 13496\n", + "Name: obs_col_zero, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_6_flowlane_6_obs_flow
count157727.05524535.0
mean549.26493715.030314
std435.67858114.333933
min0.00.0
25%191.250.0
50%458.3333330.0
75%813.00.0
max2692.02692.0
\n", + "
" + ], + "text/plain": [ + " lane_6_flow lane_6_obs_flow\n", + "count 157727.0 5524535.0\n", + "mean 549.264937 15.030314\n", + "std 435.67858 114.333933\n", + "min 0.0 0.0\n", + "25% 191.25 0.0\n", + "50% 458.333333 0.0\n", + "75% 813.0 0.0\n", + "max 2692.0 2692.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# rows: 5524535\n", + "equal: 137721, imputed > obs: 20006, imputed < obs: 0\n", + "% equal 0.025\n", "greater: 0.004, less: 0.0\n", "****values when imputed > obs *****\n", - "False 30\n", - "True 3\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: flow\n", - "Month-Year: 9-2023\n", - "Weekday: 1 Hour: 14\n", - "******* lane number: 7 ********\n", - "# rows: 7761\n", - "equal: 7761, imputed > obs: 0, imputed < obs: 0\n", - "% equal 1.0\n", + "False 17530\n", + "True 2476\n", + "Name: obs_col_zero, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_7_flowlane_7_obs_flow
count4202.05524535.0
mean481.0565920.350383
std215.57248414.063007
min18.00.0
25%320.68750.0
50%549.450.0
75%643.250.0
max1051.6666671027.5
\n", + "
" + ], + "text/plain": [ + " lane_7_flow lane_7_obs_flow\n", + "count 4202.0 5524535.0\n", + "mean 481.056592 0.350383\n", + "std 215.572484 14.063007\n", + "min 18.0 0.0\n", + "25% 320.6875 0.0\n", + "50% 549.45 0.0\n", + "75% 643.25 0.0\n", + "max 1051.666667 1027.5" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# rows: 5524535\n", + "equal: 3624, imputed > obs: 578, imputed < obs: 0\n", + "% equal 0.001\n", "greater: 0.0, less: 0.0\n", "****values when imputed > obs *****\n", - "Series([], Name: obs_col_zero, dtype: int64)\n", - "\n", - "Metric: flow\n", - "Month-Year: 9-2023\n", - "Weekday: 1 Hour: 14\n", - "******* lane number: 8 ********\n", - "# rows: 7761\n", - "equal: 7761, imputed > obs: 0, imputed < obs: 0\n", - "% equal 1.0\n", + "False 543\n", + "True 35\n", + "Name: obs_col_zero, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_8_flowlane_8_obs_flow
count0.05524535.0
mean<NA>0.0
std<NA>0.0
min<NA>0.0
25%<NA>0.0
50%<NA>0.0
75%<NA>0.0
max<NA>0.0
\n", + "
" + ], + "text/plain": [ + " lane_8_flow lane_8_obs_flow\n", + "count 0.0 5524535.0\n", + "mean 0.0\n", + "std 0.0\n", + "min 0.0\n", + "25% 0.0\n", + "50% 0.0\n", + "75% 0.0\n", + "max 0.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# rows: 5524535\n", + "equal: 0, imputed > obs: 0, imputed < obs: 0\n", + "% equal 0.0\n", "greater: 0.0, less: 0.0\n", "****values when imputed > obs *****\n", "Series([], Name: obs_col_zero, dtype: int64)\n" @@ -608,15 +930,8 @@ } ], "source": [ - "filtering = [[\n", - " (\"year\", \"==\", 2023), (\"month\", \"==\", 9),\n", - " (\"weekday\", \"==\", 1), (\"hour\", \"==\", 14)\n", - "]]\n", - "\n", - "df = metric_vs_observed(METRIC, filtering)\n", - "\n", "for i in range(1, 9):\n", - " lane_comparisons(df, i, METRIC)" + " lane_comparisons(flow_df, i, \"flow\")" ] }, { @@ -629,829 +944,1533 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "id": "9c3b7997-8906-4388-bd7a-1d68b884998a", "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_1_speedlane_1_obs_speed
count5490744.05410636.0
mean65.60516465.664043
std10.38054410.444326
min3.03.0
25%63.62563.675
50%66.07566.233333
75%73.173.3
max90.590.5
\n", + "
" + ], + "text/plain": [ + " lane_1_speed lane_1_obs_speed\n", + "count 5490744.0 5410636.0\n", + "mean 65.605164 65.664043\n", + "std 10.380544 10.444326\n", + "min 3.0 3.0\n", + "25% 63.625 63.675\n", + "50% 66.075 66.233333\n", + "75% 73.1 73.3\n", + "max 90.5 90.5" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Metric: speed\n", - "Month-Year: 5-2024\n", - "Weekday: 2 Hour: 8\n", - "******* lane number: 1 ********\n", - "# rows: 2606\n", - "equal: 70, imputed > obs: 43, imputed < obs: 2479\n", - "% equal 0.027\n", - "greater: 0.017, less: 0.951\n", - "**** values when imputed < obs ****\n", - "count 2479.0\n", - "mean 57.44674\n", - "std 14.862704\n", - "min 5.42\n", - "25% 50.2975\n", - "50% 62.78\n", - "75% 67.87\n", - "max 79.48\n", - "Name: lane_1_speed, dtype: Float64\n", - "count 2479.0\n", - "mean 266.002098\n", - "std 84.267447\n", - "min 22.6\n", - "25% 210.65\n", - "50% 288.6\n", - "75% 326.1\n", - "max 397.4\n", - "Name: lane_1_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 33\n", - "False 10\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 5-2024\n", - "Weekday: 2 Hour: 8\n", - "******* lane number: 2 ********\n", - "# rows: 2606\n", - "equal: 55, imputed > obs: 35, imputed < obs: 1631\n", - "% equal 0.021\n", - "greater: 0.013, less: 0.626\n", - "**** values when imputed < obs ****\n", - "count 1631.0\n", - "mean 54.435811\n", - "std 15.345102\n", - "min 6.06\n", - "25% 46.32\n", - "50% 60.075\n", - "75% 65.6675\n", - "max 73.966667\n", - "Name: lane_2_speed, dtype: Float64\n", - "count 1631.0\n", - "mean 251.42992\n", - "std 84.908873\n", - "min 21.2\n", - "25% 194.1\n", - "50% 275.3\n", - "75% 322.5\n", - "max 369.1\n", - "Name: lane_2_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 28\n", - "False 7\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - 
"Metric: speed\n", - "Month-Year: 5-2024\n", - "Weekday: 2 Hour: 8\n", - "******* lane number: 3 ********\n", - "# rows: 2606\n", - "equal: 51, imputed > obs: 35, imputed < obs: 1471\n", - "% equal 0.02\n", - "greater: 0.013, less: 0.564\n", - "**** values when imputed < obs ****\n", - "count 1471.0\n", - "mean 49.851535\n", - "std 14.697226\n", - "min 5.38\n", - "25% 42.41\n", - "50% 53.3\n", - "75% 61.42\n", - "max 77.825\n", - "Name: lane_3_speed, dtype: Float64\n", - "count 1471.0\n", - "mean 230.203739\n", - "std 79.750198\n", - "min 11.1\n", - "25% 178.3\n", - "50% 247.0\n", - "75% 295.75\n", - "max 360.3\n", - "Name: lane_3_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 28\n", - "False 7\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 5-2024\n", - "Weekday: 2 Hour: 8\n", - "******* lane number: 4 ********\n", - "# rows: 2606\n", - "equal: 46, imputed > obs: 20, imputed < obs: 1131\n", - "% equal 0.018\n", - "greater: 0.008, less: 0.434\n", - "**** values when imputed < obs ****\n", - "count 1131.0\n", - "mean 47.919651\n", - "std 14.27912\n", - "min 4.54\n", - "25% 40.35\n", - "50% 51.42\n", - "75% 58.65\n", - "max 76.54\n", - "Name: lane_4_speed, dtype: Float64\n", - "count 1131.0\n", - "mean 218.736163\n", - "std 78.207275\n", - "min 11.3\n", - "25% 164.25\n", - "50% 234.4\n", - "75% 280.25\n", - "max 382.7\n", - "Name: lane_4_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 14\n", - "False 6\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 5-2024\n", - "Weekday: 2 Hour: 8\n", - "******* lane number: 5 ********\n", - "# rows: 2606\n", - "equal: 11, imputed > obs: 9, imputed < obs: 373\n", - "% equal 0.004\n", - "greater: 0.003, less: 0.143\n", - "**** values when imputed < obs ****\n", - "count 373.0\n", - "mean 50.034495\n", - "std 13.540193\n", - "min 4.94\n", - "25% 43.45\n", - "50% 54.24\n", - "75% 
60.333333\n", - "max 65.3\n", - "Name: lane_5_speed, dtype: Float64\n", - "count 373.0\n", - "mean 226.784718\n", - "std 77.696666\n", - "min 9.6\n", - "25% 172.8\n", - "50% 248.1\n", - "75% 289.1\n", - "max 324.8\n", - "Name: lane_5_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 6\n", - "False 3\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 5-2024\n", - "Weekday: 2 Hour: 8\n", - "******* lane number: 6 ********\n", - "# rows: 2606\n", - "equal: 2, imputed > obs: 1, imputed < obs: 70\n", - "% equal 0.001\n", - "greater: 0.0, less: 0.027\n", - "**** values when imputed < obs ****\n", - "count 70.0\n", - "mean 49.241571\n", - "std 16.127912\n", - "min 5.86\n", - "25% 40.16\n", - "50% 55.28\n", - "75% 62.39\n", - "max 65.04\n", - "Name: lane_6_speed, dtype: Float64\n", - "count 70.0\n", - "mean 233.05\n", - "std 85.957105\n", - "min 17.7\n", - "25% 160.45\n", - "50% 265.65\n", - "75% 310.8\n", - "max 325.2\n", - "Name: lane_6_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 1\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 5-2024\n", - "Weekday: 2 Hour: 8\n", - "******* lane number: 7 ********\n", - "# rows: 2606\n", - "equal: 1, imputed > obs: 0, imputed < obs: 1\n", - "% equal 0.0\n", - "greater: 0.0, less: 0.0\n", - "**** values when imputed < obs ****\n", - "count 1.0\n", - "mean 10.26\n", - "std \n", - "min 10.26\n", - "25% 10.26\n", - "50% 10.26\n", - "75% 10.26\n", - "max 10.26\n", - "Name: lane_7_speed, dtype: Float64\n", - "count 1.0\n", - "mean 38.6\n", - "std \n", - "min 38.6\n", - "25% 38.6\n", - "50% 38.6\n", - "75% 38.6\n", - "max 38.6\n", - "Name: lane_7_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "Series([], Name: obs_col_zero, dtype: int64)\n", - "\n", - "Metric: speed\n", - "Month-Year: 5-2024\n", - "Weekday: 2 Hour: 8\n", - "******* lane number: 8 ********\n", - "# rows: 
2606\n", - "equal: 0, imputed > obs: 0, imputed < obs: 0\n", - "% equal 0.0\n", - "greater: 0.0, less: 0.0\n", - "**** values when imputed < obs ****\n", - "count 0.0\n", - "mean \n", - "std \n", - "min \n", - "25% \n", - "50% \n", - "75% \n", - "max \n", - "Name: lane_8_speed, dtype: Float64\n", - "count 0.0\n", - "mean \n", - "std \n", - "min \n", - "25% \n", - "50% \n", - "75% \n", - "max \n", - "Name: lane_8_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "Series([], Name: obs_col_zero, dtype: int64)\n" + "# rows: 5524535\n", + "equal: 4874919, imputed > obs: 262289, imputed < obs: 273428\n", + "% equal 0.882\n", + "greater: 0.047, less: 0.049\n", + "**** values when imputed < obs ****\n" ] - } - ], - "source": [ - "filtering = [[\n", - " (\"year\", \"==\", 2024), (\"month\", \"==\", 5),\n", - " (\"weekday\", \"==\", 2), (\"hour\", \"==\", 8)\n", - "]]\n", - "\n", - "METRIC = \"speed\"\n", - "df = metric_vs_observed(METRIC, filtering)\n", - "\n", - "for i in range(1, 9):\n", - " lane_comparisons(df, i, METRIC)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "2c7a4061-b791-422a-97c1-18eae73784b8", - "metadata": {}, - "outputs": [ + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_1_speedlane_1_obs_speed
count273428.0273428.0
mean66.61813367.894924
std9.4468619.228313
min3.454.1
25%64.464.9
50%68.7570.9
75%73.13333374.525
max84.72588.4
\n", + "
" + ], + "text/plain": [ + " lane_1_speed lane_1_obs_speed\n", + "count 273428.0 273428.0\n", + "mean 66.618133 67.894924\n", + "std 9.446861 9.228313\n", + "min 3.45 4.1\n", + "25% 64.4 64.9\n", + "50% 68.75 70.9\n", + "75% 73.133333 74.525\n", + "max 84.725 88.4" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Metric: speed\n", - "Month-Year: 8-2023\n", - "Weekday: 5 Hour: 7\n", - "******* lane number: 1 ********\n", - "# rows: 2311\n", - "equal: 98, imputed > obs: 31, imputed < obs: 2169\n", - "% equal 0.042\n", - "greater: 0.013, less: 0.939\n", - "**** values when imputed < obs ****\n", - "count 2169.0\n", - "mean 69.474558\n", - "std 5.819333\n", - "min 29.675\n", - "25% 64.85\n", - "50% 71.4\n", - "75% 74.325\n", - "max 83.475\n", - "Name: lane_1_speed, dtype: Float64\n", - "count 2169.0\n", - "mean 260.652006\n", - "std 47.678904\n", - "min 62.9\n", - "25% 255.3\n", - "50% 278.3\n", - "75% 296.5\n", - "max 333.9\n", - "Name: lane_1_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 26\n", - "False 5\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 8-2023\n", - "Weekday: 5 Hour: 7\n", - "******* lane number: 2 ********\n", - "# rows: 2311\n", - "equal: 60, imputed > obs: 27, imputed < obs: 1459\n", - "% equal 0.026\n", - "greater: 0.012, less: 0.631\n", - "**** values when imputed < obs ****\n", - "count 1459.0\n", - "mean 68.689011\n", - "std 4.427647\n", - "min 23.45\n", - "25% 67.6375\n", - "50% 69.55\n", - "75% 70.75\n", - "max 76.95\n", - "Name: lane_2_speed, dtype: Float64\n", - "count 1459.0\n", - "mean 258.858259\n", - "std 44.162084\n", - "min 53.7\n", - "25% 259.1\n", - "50% 276.6\n", - "75% 282.7\n", - "max 307.8\n", - "Name: lane_2_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 24\n", - "False 3\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: 
speed\n", - "Month-Year: 8-2023\n", - "Weekday: 5 Hour: 7\n", - "******* lane number: 3 ********\n", - "# rows: 2311\n", - "equal: 51, imputed > obs: 31, imputed < obs: 1356\n", - "% equal 0.022\n", - "greater: 0.013, less: 0.587\n", - "**** values when imputed < obs ****\n", - "count 1356.0\n", - "mean 63.441734\n", - "std 6.004107\n", - "min 40.7\n", - "25% 60.825\n", - "50% 64.783333\n", - "75% 67.175\n", - "max 81.2\n", - "Name: lane_3_speed, dtype: Float64\n", - "count 1356.0\n", - "mean 238.662021\n", - "std 44.415839\n", - "min 56.8\n", - "25% 220.425\n", - "50% 251.0\n", - "75% 267.5\n", - "max 321.7\n", - "Name: lane_3_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 28\n", - "False 3\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 8-2023\n", - "Weekday: 5 Hour: 7\n", - "******* lane number: 4 ********\n", - "# rows: 2311\n", - "equal: 46, imputed > obs: 13, imputed < obs: 1047\n", - "% equal 0.02\n", - "greater: 0.006, less: 0.453\n", - "**** values when imputed < obs ****\n", - "count 1047.0\n", - "mean 61.107649\n", - "std 5.62594\n", - "min 34.925\n", - "25% 59.4\n", - "50% 61.825\n", - "75% 63.8625\n", - "max 72.05\n", - "Name: lane_4_speed, dtype: Float64\n", - "count 1047.0\n", - "mean 229.072015\n", - "std 42.75337\n", - "min 60.0\n", - "25% 211.4\n", - "50% 245.4\n", - "75% 251.1\n", - "max 288.2\n", - "Name: lane_4_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 10\n", - "False 3\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 8-2023\n", - "Weekday: 5 Hour: 7\n", - "******* lane number: 5 ********\n", - "# rows: 2311\n", - "equal: 17, imputed > obs: 2, imputed < obs: 323\n", - "% equal 0.007\n", - "greater: 0.001, less: 0.14\n", - "**** values when imputed < obs ****\n", - "count 323.0\n", - "mean 62.318989\n", - "std 3.415836\n", - "min 42.175\n", - "25% 61.9375\n", - "50% 63.475\n", - "75% 64.2125\n", 
- "max 71.125\n", - "Name: lane_5_speed, dtype: Float64\n", - "count 323.0\n", - "mean 236.016099\n", - "std 38.740027\n", - "min 63.2\n", - "25% 239.3\n", - "50% 253.0\n", - "75% 256.7\n", - "max 284.5\n", - "Name: lane_5_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 2\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 8-2023\n", - "Weekday: 5 Hour: 7\n", - "******* lane number: 6 ********\n", - "# rows: 2311\n", - "equal: 6, imputed > obs: 0, imputed < obs: 58\n", - "% equal 0.003\n", - "greater: 0.0, less: 0.025\n", - "**** values when imputed < obs ****\n", - "count 58.0\n", - "mean 63.066667\n", - "std 1.775147\n", - "min 55.35\n", - "25% 62.8625\n", - "50% 63.65\n", - "75% 64.19375\n", - "max 64.7\n", - "Name: lane_6_speed, dtype: Float64\n", - "count 58.0\n", - "mean 231.208621\n", - "std 50.683345\n", - "min 63.1\n", - "25% 242.7\n", - "50% 253.85\n", - "75% 256.675\n", - "max 258.8\n", - "Name: lane_6_obs_speed, dtype: Float64\n", "****values when imputed > obs *****\n", - "Series([], Name: obs_col_zero, dtype: int64)\n", - "\n", - "Metric: speed\n", - "Month-Year: 8-2023\n", - "Weekday: 5 Hour: 7\n", - "******* lane number: 7 ********\n", - "# rows: 2311\n", - "equal: 0, imputed > obs: 0, imputed < obs: 2\n", - "% equal 0.0\n", - "greater: 0.0, less: 0.001\n", - "**** values when imputed < obs ****\n", - "count 2.0\n", - "mean 62.366667\n", - "std 1.791337\n", - "min 61.1\n", - "25% 61.733333\n", - "50% 62.366667\n", - "75% 63.0\n", - "max 63.633333\n", - "Name: lane_7_speed, dtype: Float64\n", - "count 2.0\n", - "mean 217.65\n", - "std 37.830213\n", - "min 190.9\n", - "25% 204.275\n", - "50% 217.65\n", - "75% 231.025\n", - "max 244.4\n", - "Name: lane_7_obs_speed, dtype: Float64\n", + "False 262289\n", + "Name: obs_col_zero, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_2_speedlane_2_obs_speed
count3668484.03606628.0
mean64.51299364.604668
std9.8646579.928083
min3.03.0
25%62.8563.0
50%67.4567.65
75%70.4570.525
max89.67589.675
\n", + "
" + ], + "text/plain": [ + " lane_2_speed lane_2_obs_speed\n", + "count 3668484.0 3606628.0\n", + "mean 64.512993 64.604668\n", + "std 9.864657 9.928083\n", + "min 3.0 3.0\n", + "25% 62.85 63.0\n", + "50% 67.45 67.65\n", + "75% 70.45 70.525\n", + "max 89.675 89.675" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# rows: 5524535\n", + "equal: 3241065, imputed > obs: 167026, imputed < obs: 198537\n", + "% equal 0.587\n", + "greater: 0.03, less: 0.036\n", + "**** values when imputed < obs ****\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_2_speedlane_2_obs_speed
count198537.0198537.0
mean65.29103366.774167
std8.5073668.210417
min3.94.3
25%63.77565.0
50%67.47569.3
75%70.170.966667
max85.12589.3
\n", + "
" + ], + "text/plain": [ + " lane_2_speed lane_2_obs_speed\n", + "count 198537.0 198537.0\n", + "mean 65.291033 66.774167\n", + "std 8.507366 8.210417\n", + "min 3.9 4.3\n", + "25% 63.775 65.0\n", + "50% 67.475 69.3\n", + "75% 70.1 70.966667\n", + "max 85.125 89.3" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "****values when imputed > obs *****\n", - "Series([], Name: obs_col_zero, dtype: int64)\n", - "\n", - "Metric: speed\n", - "Month-Year: 8-2023\n", - "Weekday: 5 Hour: 7\n", - "******* lane number: 8 ********\n", - "# rows: 2311\n", - "equal: 0, imputed > obs: 0, imputed < obs: 0\n", - "% equal 0.0\n", - "greater: 0.0, less: 0.0\n", - "**** values when imputed < obs ****\n", - "count 0.0\n", - "mean \n", - "std \n", - "min \n", - "25% \n", - "50% \n", - "75% \n", - "max \n", - "Name: lane_8_speed, dtype: Float64\n", - "count 0.0\n", - "mean \n", - "std \n", - "min \n", - "25% \n", - "50% \n", - "75% \n", - "max \n", - "Name: lane_8_obs_speed, dtype: Float64\n", + "False 167026\n", + "Name: obs_col_zero, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_3_speedlane_3_obs_speed
count3346786.03289997.0
mean60.46040560.582831
std10.02204210.065959
min3.03.0
25%57.47557.65
50%62.6262.675
75%66.8566.925
max90.790.7
\n", + "
" + ], + "text/plain": [ + " lane_3_speed lane_3_obs_speed\n", + "count 3346786.0 3289997.0\n", + "mean 60.460405 60.582831\n", + "std 10.022042 10.065959\n", + "min 3.0 3.0\n", + "25% 57.475 57.65\n", + "50% 62.62 62.675\n", + "75% 66.85 66.925\n", + "max 90.7 90.7" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# rows: 5524535\n", + "equal: 2944499, imputed > obs: 156529, imputed < obs: 188969\n", + "% equal 0.533\n", + "greater: 0.028, less: 0.034\n", + "**** values when imputed < obs ****\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_3_speedlane_3_obs_speed
count188969.0188969.0
mean60.94324962.748345
std8.9477318.648396
min3.1333333.25
25%58.460.866667
50%62.62564.3
75%66.5267.4
max85.67589.05
\n", + "
" + ], + "text/plain": [ + " lane_3_speed lane_3_obs_speed\n", + "count 188969.0 188969.0\n", + "mean 60.943249 62.748345\n", + "std 8.947731 8.648396\n", + "min 3.133333 3.25\n", + "25% 58.4 60.866667\n", + "50% 62.625 64.3\n", + "75% 66.52 67.4\n", + "max 85.675 89.05" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "****values when imputed > obs *****\n", - "Series([], Name: obs_col_zero, dtype: int64)\n" + "False 156529\n", + "Name: obs_col_zero, dtype: int64\n" ] - } - ], - "source": [ - "filtering = [[\n", - " (\"year\", \"==\", 2023), (\"month\", \"==\", 8),\n", - " (\"weekday\", \"==\", 5), (\"hour\", \"==\", 7)\n", - "]]\n", - "\n", - "df = metric_vs_observed(METRIC, filtering)\n", - "\n", - "for i in range(1, 9):\n", - " lane_comparisons(df, i, METRIC)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "8473d5a9-11ed-4e43-b553-1464c7a4aa24", - "metadata": {}, - "outputs": [ + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_4_speedlane_4_obs_speed
count2577647.02540089.0
mean58.18435858.260981
std9.7040249.78274
min3.03.0
25%55.255.275
50%61.1261.24
75%62.862.875
max88.0588.05
\n", + "
" + ], + "text/plain": [ + " lane_4_speed lane_4_obs_speed\n", + "count 2577647.0 2540089.0\n", + "mean 58.184358 58.260981\n", + "std 9.704024 9.78274\n", + "min 3.0 3.0\n", + "25% 55.2 55.275\n", + "50% 61.12 61.24\n", + "75% 62.8 62.875\n", + "max 88.05 88.05" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# rows: 5524535\n", + "equal: 2270198, imputed > obs: 126301, imputed < obs: 143590\n", + "% equal 0.411\n", + "greater: 0.023, less: 0.026\n", + "**** values when imputed < obs ****\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_4_speedlane_4_obs_speed
count143590.0143590.0
mean58.94418160.458667
std8.5655168.504824
min3.023.025
25%56.72558.375
50%61.03333362.35
75%63.065.033333
max80.62580.975
\n", + "
" + ], + "text/plain": [ + " lane_4_speed lane_4_obs_speed\n", + "count 143590.0 143590.0\n", + "mean 58.944181 60.458667\n", + "std 8.565516 8.504824\n", + "min 3.02 3.025\n", + "25% 56.725 58.375\n", + "50% 61.033333 62.35\n", + "75% 63.0 65.033333\n", + "max 80.625 80.975" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "Metric: speed\n", - "Month-Year: 10-2023\n", - "Weekday: 3 Hour: 13\n", - "******* lane number: 1 ********\n", - "# rows: 2604\n", - "equal: 89, imputed > obs: 67, imputed < obs: 2430\n", - "% equal 0.034\n", - "greater: 0.026, less: 0.933\n", - "**** values when imputed < obs ****\n", - "count 2430.0\n", - "mean 64.361485\n", - "std 9.077721\n", - "min 17.875\n", - "25% 60.73125\n", - "50% 65.0\n", - "75% 71.075\n", - "max 78.9\n", - "Name: lane_1_speed, dtype: Float64\n", - "count 2430.0\n", - "mean 238.871605\n", - "std 55.517956\n", - "min 51.3\n", - "25% 212.4\n", - "50% 258.0\n", - "75% 280.4\n", - "max 315.6\n", - "Name: lane_1_obs_speed, dtype: Float64\n", "****values when imputed > obs *****\n", - "True 49\n", - "False 18\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 10-2023\n", - "Weekday: 3 Hour: 13\n", - "******* lane number: 2 ********\n", - "# rows: 2604\n", - "equal: 67, imputed > obs: 40, imputed < obs: 1622\n", - "% equal 0.026\n", - "greater: 0.015, less: 0.623\n", - "**** values when imputed < obs ****\n", - "count 1622.0\n", - "mean 61.670176\n", - "std 8.343252\n", - "min 17.9\n", - "25% 58.089583\n", - "50% 63.875\n", - "75% 67.25\n", - "max 74.375\n", - "Name: lane_2_speed, dtype: Float64\n", - "count 1622.0\n", - "mean 228.833847\n", - "std 51.732604\n", - "min 54.2\n", - "25% 204.05\n", - "50% 247.35\n", - "75% 266.3\n", - "max 297.5\n", - "Name: lane_2_obs_speed, dtype: Float64\n", + "False 126301\n", + "Name: obs_col_zero, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_5_speedlane_5_obs_speed
count834680.0824582.0
mean58.8222158.882487
std8.9076458.989742
min3.03.0
25%56.92557.02
50%62.762.866667
75%64.264.225
max88.088.0
\n", + "
" + ], + "text/plain": [ + " lane_5_speed lane_5_obs_speed\n", + "count 834680.0 824582.0\n", + "mean 58.82221 58.882487\n", + "std 8.907645 8.989742\n", + "min 3.0 3.0\n", + "25% 56.925 57.02\n", + "50% 62.7 62.866667\n", + "75% 64.2 64.225\n", + "max 88.0 88.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# rows: 5524535\n", + "equal: 733822, imputed > obs: 42517, imputed < obs: 48243\n", + "% equal 0.133\n", + "greater: 0.008, less: 0.009\n", + "**** values when imputed < obs ****\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_5_speedlane_5_obs_speed
count48243.048243.0
mean59.32059860.755841
std7.833197.553725
min3.1333333.2
25%58.060.2
50%62.363.75
75%63.7564.5
max74.1584.0
\n", + "
" + ], + "text/plain": [ + " lane_5_speed lane_5_obs_speed\n", + "count 48243.0 48243.0\n", + "mean 59.320598 60.755841\n", + "std 7.83319 7.553725\n", + "min 3.133333 3.2\n", + "25% 58.0 60.2\n", + "50% 62.3 63.75\n", + "75% 63.75 64.5\n", + "max 74.15 84.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "****values when imputed > obs *****\n", - "True 30\n", - "False 10\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 10-2023\n", - "Weekday: 3 Hour: 13\n", - "******* lane number: 3 ********\n", - "# rows: 2604\n", - "equal: 64, imputed > obs: 41, imputed < obs: 1483\n", + "False 42517\n", + "Name: obs_col_zero, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_6_speedlane_6_obs_speed
count157727.0155293.0
mean59.83393959.845897
std8.404688.450482
min3.03.0
25%59.259.2
50%63.463.466667
75%64.364.3
max77.177.1
\n", + "
" + ], + "text/plain": [ + " lane_6_speed lane_6_obs_speed\n", + "count 157727.0 155293.0\n", + "mean 59.833939 59.845897\n", + "std 8.40468 8.450482\n", + "min 3.0 3.0\n", + "25% 59.2 59.2\n", + "50% 63.4 63.466667\n", + "75% 64.3 64.3\n", + "max 77.1 77.1" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# rows: 5524535\n", + "equal: 139096, imputed > obs: 7865, imputed < obs: 8332\n", "% equal 0.025\n", - "greater: 0.016, less: 0.57\n", - "**** values when imputed < obs ****\n", - "count 1483.0\n", - "mean 55.931468\n", - "std 8.478252\n", - "min 15.35\n", - "25% 51.7\n", - "50% 57.0\n", - "75% 61.9\n", - "max 77.175\n", - "Name: lane_3_speed, dtype: Float64\n", - "count 1483.0\n", - "mean 207.672218\n", - "std 48.982583\n", - "min 45.9\n", - "25% 186.7\n", - "50% 218.9\n", - "75% 243.25\n", - "max 308.7\n", - "Name: lane_3_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 26\n", - "False 15\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 10-2023\n", - "Weekday: 3 Hour: 13\n", - "******* lane number: 4 ********\n", - "# rows: 2604\n", - "equal: 58, imputed > obs: 27, imputed < obs: 1137\n", - "% equal 0.022\n", - "greater: 0.01, less: 0.437\n", - "**** values when imputed < obs ****\n", - "count 1137.0\n", - "mean 54.160195\n", - "std 7.923261\n", - "min 17.425\n", - "25% 50.575\n", - "50% 55.0\n", - "75% 59.475\n", - "max 76.75\n", - "Name: lane_4_speed, dtype: Float64\n", - "count 1137.0\n", - "mean 200.304222\n", - "std 46.375729\n", - "min 53.3\n", - "25% 180.8\n", - "50% 212.2\n", - "75% 231.1\n", - "max 307.0\n", - "Name: lane_4_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 17\n", - "False 10\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 10-2023\n", - "Weekday: 3 Hour: 13\n", - "******* lane number: 5 ********\n", - "# rows: 
2604\n", - "equal: 17, imputed > obs: 7, imputed < obs: 378\n", - "% equal 0.007\n", - "greater: 0.003, less: 0.145\n", - "**** values when imputed < obs ****\n", - "count 378.0\n", - "mean 56.131768\n", - "std 7.750725\n", - "min 19.7\n", - "25% 52.09375\n", - "50% 57.0375\n", - "75% 63.16875\n", - "max 64.875\n", - "Name: lane_5_speed, dtype: Float64\n", - "count 378.0\n", - "mean 205.230159\n", - "std 50.357375\n", - "min 41.7\n", - "25% 185.575\n", - "50% 220.0\n", - "75% 246.5\n", - "max 259.5\n", - "Name: lane_5_obs_speed, dtype: Float64\n", - "****values when imputed > obs *****\n", - "True 6\n", - "False 1\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 10-2023\n", - "Weekday: 3 Hour: 13\n", - "******* lane number: 6 ********\n", - "# rows: 2604\n", - "equal: 4, imputed > obs: 2, imputed < obs: 74\n", - "% equal 0.002\n", - "greater: 0.001, less: 0.028\n", - "**** values when imputed < obs ****\n", - "count 74.0\n", - "mean 58.568018\n", - "std 6.357801\n", - "min 38.35\n", - "25% 55.93125\n", - "50% 60.8875\n", - "75% 63.41875\n", - "max 66.25\n", - "Name: lane_6_speed, dtype: Float64\n", - "count 74.0\n", - "mean 207.535135\n", - "std 54.252586\n", - "min 64.5\n", - "25% 186.675\n", - "50% 225.9\n", - "75% 253.175\n", - "max 257.8\n", - "Name: lane_6_obs_speed, dtype: Float64\n", + "greater: 0.001, less: 0.002\n", + "**** values when imputed < obs ****\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_6_speedlane_6_obs_speed
count8332.08332.0
mean60.68595961.573892
std7.1579286.932292
min3.1333333.15
25%60.77562.2
50%63.17564.0
75%64.0564.46
max68.7573.9
\n", + "
" + ], + "text/plain": [ + " lane_6_speed lane_6_obs_speed\n", + "count 8332.0 8332.0\n", + "mean 60.685959 61.573892\n", + "std 7.157928 6.932292\n", + "min 3.133333 3.15\n", + "25% 60.775 62.2\n", + "50% 63.175 64.0\n", + "75% 64.05 64.46\n", + "max 68.75 73.9" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "****values when imputed > obs *****\n", - "True 2\n", - "Name: obs_col_zero, dtype: int64\n", - "\n", - "Metric: speed\n", - "Month-Year: 10-2023\n", - "Weekday: 3 Hour: 13\n", - "******* lane number: 7 ********\n", - "# rows: 2604\n", - "equal: 1, imputed > obs: 0, imputed < obs: 1\n", - "% equal 0.0\n", + "False 7865\n", + "Name: obs_col_zero, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_7_speedlane_7_obs_speed
count4202.04167.0
mean57.52335657.620076
std12.66394612.560834
min3.3333333.333333
25%58.8558.958333
50%63.063.05
75%63.963.933333
max73.373.3
\n", + "
" + ], + "text/plain": [ + " lane_7_speed lane_7_obs_speed\n", + "count 4202.0 4167.0\n", + "mean 57.523356 57.620076\n", + "std 12.663946 12.560834\n", + "min 3.333333 3.333333\n", + "25% 58.85 58.958333\n", + "50% 63.0 63.05\n", + "75% 63.9 63.933333\n", + "max 73.3 73.3" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# rows: 5524535\n", + "equal: 3636, imputed > obs: 208, imputed < obs: 323\n", + "% equal 0.001\n", "greater: 0.0, less: 0.0\n", - "**** values when imputed < obs ****\n", - "count 1.0\n", - "mean 63.5\n", - "std \n", - "min 63.5\n", - "25% 63.5\n", - "50% 63.5\n", - "75% 63.5\n", - "max 63.5\n", - "Name: lane_7_speed, dtype: Float64\n", - "count 1.0\n", - "mean 190.5\n", - "std \n", - "min 190.5\n", - "25% 190.5\n", - "50% 190.5\n", - "75% 190.5\n", - "max 190.5\n", - "Name: lane_7_obs_speed, dtype: Float64\n", + "**** values when imputed < obs ****\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_7_speedlane_7_obs_speed
count323.0323.0
mean58.02472159.346352
std12.73936812.055794
min5.86.05
25%61.762.473333
50%63.163.666667
75%63.75564.229167
max65.972.6
\n", + "
" + ], + "text/plain": [ + " lane_7_speed lane_7_obs_speed\n", + "count 323.0 323.0\n", + "mean 58.024721 59.346352\n", + "std 12.739368 12.055794\n", + "min 5.8 6.05\n", + "25% 61.7 62.473333\n", + "50% 63.1 63.666667\n", + "75% 63.755 64.229167\n", + "max 65.9 72.6" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "****values when imputed > obs *****\n", - "Series([], Name: obs_col_zero, dtype: int64)\n", - "\n", - "Metric: speed\n", - "Month-Year: 10-2023\n", - "Weekday: 3 Hour: 13\n", - "******* lane number: 8 ********\n", - "# rows: 2604\n", + "False 208\n", + "Name: obs_col_zero, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_8_speedlane_8_obs_speed
count0.00.0
mean<NA><NA>
std<NA><NA>
min<NA><NA>
25%<NA><NA>
50%<NA><NA>
75%<NA><NA>
max<NA><NA>
\n", + "
" + ], + "text/plain": [ + " lane_8_speed lane_8_obs_speed\n", + "count 0.0 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# rows: 5524535\n", "equal: 0, imputed > obs: 0, imputed < obs: 0\n", "% equal 0.0\n", "greater: 0.0, less: 0.0\n", - "**** values when imputed < obs ****\n", - "count 0.0\n", - "mean \n", - "std \n", - "min \n", - "25% \n", - "50% \n", - "75% \n", - "max \n", - "Name: lane_8_speed, dtype: Float64\n", - "count 0.0\n", - "mean \n", - "std \n", - "min \n", - "25% \n", - "50% \n", - "75% \n", - "max \n", - "Name: lane_8_obs_speed, dtype: Float64\n", + "**** values when imputed < obs ****\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lane_8_speedlane_8_obs_speed
count0.00.0
mean<NA><NA>
std<NA><NA>
min<NA><NA>
25%<NA><NA>
50%<NA><NA>
75%<NA><NA>
max<NA><NA>
\n", + "
" + ], + "text/plain": [ + " lane_8_speed lane_8_obs_speed\n", + "count 0.0 0.0\n", + "mean \n", + "std \n", + "min \n", + "25% \n", + "50% \n", + "75% \n", + "max " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "****values when imputed > obs *****\n", "Series([], Name: obs_col_zero, dtype: int64)\n" ] } ], "source": [ - "filtering = [[\n", - " (\"year\", \"==\", 2023), (\"month\", \"==\", 10),\n", - " (\"weekday\", \"==\", 3), (\"hour\", \"==\", 13)\n", - "]]\n", - "\n", - "df = metric_vs_observed(METRIC, filtering)\n", - "\n", "for i in range(1, 9):\n", - " lane_comparisons(df, i, METRIC)" + " lane_comparisons(speed_df, i, \"speed\")" ] }, {