From 204f0bec253938af7e144ab249dfeae45e238a06 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 25 Nov 2024 20:56:13 +0000 Subject: [PATCH] ci: auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ...1.0-mb-data-preprocessing-mem-reduce.ipynb | 22 +- ....0a-mb-data-preprocessing-supervised.ipynb | 62 +- ...b-mb-data-preprocessing-unsupervised.ipynb | 73 +- .../3.0a-mb-explanatory-data-analysis.ipynb | 313 +++-- ....0b-mb-explanatory-matched-unmatched.ipynb | 51 +- notebooks/3.0c-feature-engineering.ipynb | 147 ++- notebooks/3.0d-mb-adv_val.ipynb | 26 +- notebooks/4.0a-mb-logistic-regression.ipynb | 65 +- notebooks/4.0b-mb-fttransformer.ipynb | 57 +- notebooks/4.0c-mb-feature-importances.ipynb | 479 ++++--- .../4.0e-mb-fttransformer-pretraining.ipynb | 124 +- notebooks/5.0a-mb-batch-size-finder.ipynb | 9 +- notebooks/6.0a-mb-results-fttransformer.ipynb | 51 +- .../6.0b-mb-results-classical-rules.ipynb | 40 +- notebooks/6.0c-mb-results-universal.ipynb | 233 ++-- .../6.0d-mb-results-gradient-boosting.ipynb | 17 +- notebooks/6.0e-mb-viz-universal.ipynb | 1158 ++++++++++++----- notebooks/6.0f-mb-viz-gradient-boosting.ipynb | 127 +- notebooks/6.0g-mb-viz-fttransformer.ipynb | 41 +- notebooks/6.0h-mb-viz-embeddings.ipynb | 79 +- notebooks/6.0i-mb-discussion.ipynb | 163 ++- src/otc/models/fttransformer.py | 6 +- 22 files changed, 2016 insertions(+), 1327 deletions(-) diff --git a/notebooks/1.0-mb-data-preprocessing-mem-reduce.ipynb b/notebooks/1.0-mb-data-preprocessing-mem-reduce.ipynb index 9291c322..385295a0 100644 --- a/notebooks/1.0-mb-data-preprocessing-mem-reduce.ipynb +++ b/notebooks/1.0-mb-data-preprocessing-mem-reduce.ipynb @@ -31,7 +31,7 @@ "ProgressBar.enable()\n", "\n", "import wandb\n", - "from tqdm.auto import tqdm\n" + "from tqdm.auto import tqdm" ] }, { @@ -47,7 +47,7 @@ "FILE_PATH_INPUT = (\n", " \"gs://thesis-bucket-option-trade-classification/data/raw/matched_cboe_quotes.csv\"\n", ")\n", - "FILE_PATH_OUTPUT = \"gs://thesis-bucket-option-trade-classification/data/preprocessed/\"\n" + "FILE_PATH_OUTPUT = \"gs://thesis-bucket-option-trade-classification/data/preprocessed/\"" ] }, { @@ -58,7 +58,7 @@ "source": [ "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n", "credentials, _ = google.auth.default()\n", - "fs = gcsfs.GCSFileSystem(project=\"thesis\", token=credentials)\n" + "fs = gcsfs.GCSFileSystem(project=\"thesis\", token=credentials)" ] }, { @@ -76,7 +76,7 @@ "source": [ "# connect to weights and biases\n", "run = wandb.init(project=\"thesis\", job_type=\"dataset-creation\", entity=\"fbv\")\n", - "dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_csv\", type=\"raw_data\")\n" + "dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_csv\", type=\"raw_data\")" ] }, { @@ -88,8 +88,7 @@ "outputs": [], "source": [ "def import_data(input_file: str) -> pd.DataFrame:\n", - " \"\"\"\n", - " create a dataframe and optimize its memory usage.\n", + " \"\"\"Create a dataframe and optimize its memory usage.\n", "\n", " I.e., apply some optimizations i.e, manual inference of dtypes, pre-selection\n", " of unique columns and chunking to enable import.\n", @@ -189,7 +188,7 @@ "\n", " format = \"%d%b%y:%H:%M:%S\"\n", " df[\"QUOTE_DATETIME\"] = pd.to_datetime(df[\"QUOTE_DATETIME\"], format=format)\n", - " return df\n" + " return df" ] }, { @@ -203,8 +202,7 @@ "def df_to_parquet(\n", " x: pd.DataFrame, target_dir: str, chunk_size: int = 1000000, **parquet_wargs\n", ") -> 
None:\n", - " \"\"\"\n", - " Write pd.DataFrame to parquet format.\n", + " \"\"\"Write pd.DataFrame to parquet format.\n", "\n", " Args:\n", " x (pd.DataFrame): input dataframe.\n", @@ -222,7 +220,7 @@ " slc.to_parquet(output_path, **parquet_wargs)\n", "\n", " # log in w & b\n", - " dataset.add_reference(output_path, name=f\"raw_parquet_{chunk:04d}\")\n" + " dataset.add_reference(output_path, name=f\"raw_parquet_{chunk:04d}\")" ] }, { @@ -805,7 +803,7 @@ "client = Client()\n", "\n", "df = import_data(FILE_PATH_INPUT)\n", - "df_to_parquet(df, FILE_PATH_OUTPUT)\n" + "df_to_parquet(df, FILE_PATH_OUTPUT)" ] }, { @@ -833,7 +831,7 @@ "source": [ "# Log the artifact to save it as an output of this run\n", "run.log_artifact(dataset)\n", - "wandb.finish()\n" + "wandb.finish()" ] } ], diff --git a/notebooks/2.0a-mb-data-preprocessing-supervised.ipynb b/notebooks/2.0a-mb-data-preprocessing-supervised.ipynb index 0716ca5d..5232b639 100644 --- a/notebooks/2.0a-mb-data-preprocessing-supervised.ipynb +++ b/notebooks/2.0a-mb-data-preprocessing-supervised.ipynb @@ -21,7 +21,7 @@ "from pandas._testing.asserters import assert_almost_equal\n", "from tqdm.auto import tqdm\n", "\n", - "sys.path.append(\"..\")\n" + "sys.path.append(\"..\")" ] }, { @@ -34,7 +34,7 @@ "source": [ "EXCHANGE = \"cboe\" # \"ise\"\n", "STRATEGY = \"transfer\" # \"supervised\"\n", - "max_i = 50 if EXCHANGE == \"ise\" else 38 # number of partial files\n" + "max_i = 50 if EXCHANGE == \"ise\" else 38 # number of partial files" ] }, { @@ -53,7 +53,7 @@ "source": [ "# connect to weights and biases\n", "run = wandb.init(project=\"thesis\", job_type=\"dataset-creation\", entity=\"fbv\")\n", - "dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_raw\", type=\"preprocessed_data\")\n" + "dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_raw\", type=\"preprocessed_data\")" ] }, { @@ -67,7 +67,7 @@ "source": [ "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n", "\n", - "fs = gcsfs.GCSFileSystem(project=\"thesis\")\n" + "fs = gcsfs.GCSFileSystem(project=\"thesis\")" ] }, { @@ -99,7 +99,7 @@ "source": [ "files = [\n", " f\"gs://thesis-bucket-option-trade-classification/data/preprocessed/{'unmatched' if STRATEGY == 'unsupervised' else 'matched'}_{EXCHANGE}_quotes_min_mem_usage_extended_part_{i:04d}.parquet\"\n", - " for i in range(0, max_i)\n", + " for i in range(max_i)\n", "]\n", "\n", "columns = [\n", @@ -130,7 +130,7 @@ "dfs = [pd.read_parquet(gc_file, columns=columns) for gc_file in tqdm(files)]\n", "df = pd.concat(dfs)\n", "\n", - "del dfs\n" + "del dfs" ] }, { @@ -146,7 +146,7 @@ }, "outputs": [], "source": [ - "df.memory_usage(deep=True).sum()\n" + "df.memory_usage(deep=True).sum()" ] }, { @@ -157,7 +157,7 @@ }, "outputs": [], "source": [ - "len(df)\n" + "len(df)" ] }, { @@ -207,7 +207,7 @@ " assert_almost_equal(\n", " stats_trade_size.values.tolist(), [18.14, 5.0, 223.24], atol=0.1\n", " )\n", - " assert_almost_equal(stats_buy_trades, 0.4500, atol=0.01)\n" + " assert_almost_equal(stats_buy_trades, 0.4500, atol=0.01)" ] }, { @@ -227,7 +227,7 @@ }, "outputs": [], "source": [ - "df.sort_values(by=\"QUOTE_DATETIME\", inplace=True)\n" + "df.sort_values(by=\"QUOTE_DATETIME\", inplace=True)" ] }, { @@ -263,7 +263,7 @@ "\n", "if EXCHANGE == \"cboe\" and STRATEGY == \"transfer\":\n", " # use everything after *ISE* validation set for transfer learning\n", - " test_range = df.QUOTE_DATETIME.between(\"2015-11-06 00:00:01\", \"2017-10-31 23:59:00\")\n" + " test_range = df.QUOTE_DATETIME.between(\"2015-11-06 00:00:01\", \"2017-10-31 
23:59:00\")" ] }, { @@ -280,7 +280,6 @@ "outputs": [], "source": [ "if STRATEGY == \"supervised\":\n", - "\n", " train = df[train_range]\n", "\n", " len_train = len(train)\n", @@ -317,7 +316,7 @@ "\n", " output_path = f\"gs://thesis-bucket-option-trade-classification/data/preprocessed/{EXCHANGE}_{STRATEGY}_test.parquet\"\n", " test.to_parquet(output_path)\n", - " dataset.add_reference(output_path, name=\"test_set\")\n" + " dataset.add_reference(output_path, name=\"test_set\")" ] }, { @@ -347,7 +346,7 @@ "# Log the artifact to save it as an output of this run\n", "run.log_artifact(dataset)\n", "\n", - "wandb.finish()\n" + "wandb.finish()" ] }, { @@ -370,7 +369,7 @@ "val = pd.read_parquet(\n", " \"gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set.parquet\",\n", " engine=\"fastparquet\",\n", - ")\n" + ")" ] }, { @@ -382,7 +381,7 @@ "val = pd.read_parquet(\n", " \"gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set_20.parquet\",\n", " engine=\"fastparquet\",\n", - ")\n" + ")" ] }, { @@ -392,7 +391,7 @@ "outputs": [], "source": [ "y_train = train[\"buy_sell\"]\n", - "X_train = train.drop(columns=[\"buy_sell\"])\n" + "X_train = train.drop(columns=[\"buy_sell\"])" ] }, { @@ -402,7 +401,7 @@ "outputs": [], "source": [ "y_val = val[\"buy_sell\"]\n", - "X_val = val.drop(columns=[\"buy_sell\"])\n" + "X_val = val.drop(columns=[\"buy_sell\"])" ] }, { @@ -411,7 +410,7 @@ "metadata": {}, "outputs": [], "source": [ - "X_train.head()\n" + "X_train.head()" ] }, { @@ -431,7 +430,6 @@ " timestamp = np.linspace(0, 1, length)\n", " # keep weight fixed\n", " for strategy in [\"uniform\", \"exponential\"]:\n", - "\n", " if strategy == \"uniform\":\n", " weight = np.ones(length)\n", " else:\n", @@ -474,7 +472,7 @@ " \"strategy\": strategy,\n", " }\n", " print(res)\n", - " results_p.append(res)\n" + " results_p.append(res)" ] }, { @@ -483,7 +481,7 @@ "metadata": {}, "outputs": [], "source": [ - "results_df = pd.DataFrame(results_p)\n" + "results_df = pd.DataFrame(results_p)" ] }, { @@ -492,7 +490,7 @@ "metadata": {}, "outputs": [], "source": [ - "results_df\n" + "results_df" ] }, { @@ -501,7 +499,7 @@ "metadata": {}, "outputs": [], "source": [ - "results_df.to_csv(\"learning_curves_gbm_default_params.csv\")\n" + "results_df.to_csv(\"learning_curves_gbm_default_params.csv\")" ] }, { @@ -533,7 +531,7 @@ "data = pd.read_parquet(\n", " \"gs://thesis-bucket-option-trade-classification/data/classical_size_features_log_normalized/train_set_extended_60.parquet\",\n", " engine=\"fastparquet\",\n", - ")\n" + ")" ] }, { @@ -546,7 +544,7 @@ "source": [ "# try to predict last 10 % in training set using first 10 % of features. 
Accuracy should be above 50 %.\n", "label = data[\"buy_sell\"]\n", - "data.drop(columns=[\"buy_sell\"], inplace=True)\n" + "data.drop(columns=[\"buy_sell\"], inplace=True)" ] }, { @@ -563,7 +561,7 @@ "X_train = data.iloc[0 : len(data) // 10, :]\n", "X_test = data.iloc[-len(data) // 10 :, :]\n", "\n", - "del label, data\n" + "del label, data" ] }, { @@ -578,7 +576,7 @@ }, "outputs": [], "source": [ - "y_train.shape\n" + "y_train.shape" ] }, { @@ -599,7 +597,7 @@ " \"eval_metric\": \"Accuracy\",\n", " \"iterations\": 1000,\n", " \"early_stopping_rounds\": 100,\n", - "}\n" + "}" ] }, { @@ -610,7 +608,7 @@ }, "outputs": [], "source": [ - "columns = X_train.columns\n" + "columns = X_train.columns" ] }, { @@ -644,7 +642,7 @@ " model = CatBoostClassifier(**params)\n", " model.fit(X_train[[col]], y_train, eval_set=(X_test[[col]], y_test))\n", " acc = model.score(X_test[[col]], y_test)\n", - " results.append([col, acc])\n" + " results.append([col, acc])" ] }, { @@ -661,7 +659,7 @@ "outputs": [], "source": [ "results_df = pd.DataFrame(results, columns=[\"feature\", \"accuracy\"])\n", - "results_df.sort_values(by=\"accuracy\")\n" + "results_df.sort_values(by=\"accuracy\")" ] }, { diff --git a/notebooks/2.0b-mb-data-preprocessing-unsupervised.ipynb b/notebooks/2.0b-mb-data-preprocessing-unsupervised.ipynb index 3fa34286..653545e7 100644 --- a/notebooks/2.0b-mb-data-preprocessing-unsupervised.ipynb +++ b/notebooks/2.0b-mb-data-preprocessing-unsupervised.ipynb @@ -14,8 +14,7 @@ "import gcsfs\n", "import pandas as pd\n", "import wandb\n", - "\n", - "from tqdm.auto import tqdm\n" + "from tqdm.auto import tqdm" ] }, { @@ -28,7 +27,7 @@ "source": [ "exchange = \"ise\"\n", "strategy = \"unsupervised\"\n", - "max_i = 30 # number of partial files\n" + "max_i = 30 # number of partial files" ] }, { @@ -47,7 +46,7 @@ "source": [ "# connect to weights and biases\n", "run = wandb.init(project=\"thesis\", job_type=\"dataset-creation\", entity=\"fbv\")\n", - "dataset = wandb.Artifact(name=f\"{exchange}_{strategy}_raw\", type=\"preprocessed_data\")\n" + "dataset = wandb.Artifact(name=f\"{exchange}_{strategy}_raw\", type=\"preprocessed_data\")" ] }, { @@ -61,7 +60,7 @@ "source": [ "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n", "\n", - "fs = gcsfs.GCSFileSystem(project=\"thesis\")\n" + "fs = gcsfs.GCSFileSystem(project=\"thesis\")" ] }, { @@ -93,7 +92,7 @@ "source": [ "files = [\n", " f\"gs://thesis-bucket-option-trade-classification/data/preprocessed/{'matched' if strategy == 'supervised' else 'unmatched'}_{exchange}_quotes_min_mem_usage_extended_part_{i:04d}.parquet\"\n", - " for i in range(0, max_i)\n", + " for i in range(max_i)\n", "]\n", "\n", "columns = [\n", @@ -122,7 +121,7 @@ "dfs = [pd.read_parquet(gc_file, columns=columns) for gc_file in tqdm(files)]\n", "df = pd.concat(dfs)\n", "\n", - "del dfs\n" + "del dfs" ] }, { @@ -138,7 +137,7 @@ }, "outputs": [], "source": [ - "df.memory_usage(deep=True).sum()\n" + "df.memory_usage(deep=True).sum()" ] }, { @@ -149,7 +148,7 @@ }, "outputs": [], "source": [ - "df.sort_values(by=\"QUOTE_DATETIME\", inplace=True)\n" + "df.sort_values(by=\"QUOTE_DATETIME\", inplace=True)" ] }, { @@ -160,7 +159,7 @@ }, "outputs": [], "source": [ - "df.head()\n" + "df.head()" ] }, { @@ -175,7 +174,7 @@ "labelled_df = pd.read_parquet(\n", " f\"gs://thesis-bucket-option-trade-classification/data/preprocessed/{exchange}_supervised_train.parquet\",\n", " columns=columns,\n", - ")\n" + ")" ] }, { @@ -186,7 +185,7 @@ }, "outputs": [], "source": [ - "labelled_df.head()\n" + 
"labelled_df.head()" ] }, { @@ -201,7 +200,7 @@ "date_range = labelled_df.QUOTE_DATETIME.between(\n", " df.QUOTE_DATETIME.min(), df.QUOTE_DATETIME.max()\n", ")\n", - "labelled_df = labelled_df[date_range]\n" + "labelled_df = labelled_df[date_range]" ] }, { @@ -232,7 +231,7 @@ " # 'price_ex_lead', 'price_ex_lag',\n", "]\n", "\n", - "labelled_df[\"duplicated\"] = labelled_df.duplicated(subset=subset)\n" + "labelled_df[\"duplicated\"] = labelled_df.duplicated(subset=subset)" ] }, { @@ -243,7 +242,7 @@ }, "outputs": [], "source": [ - "labelled_df[\"duplicated\"].value_counts()\n" + "labelled_df[\"duplicated\"].value_counts()" ] }, { @@ -254,7 +253,7 @@ }, "outputs": [], "source": [ - "labelled_df[labelled_df[\"optionid\"] == 83414152.0].head(20).T\n" + "labelled_df[labelled_df[\"optionid\"] == 83414152.0].head(20).T" ] }, { @@ -265,7 +264,7 @@ }, "outputs": [], "source": [ - "labelled_df[\"index_labelled\"] = labelled_df.index\n" + "labelled_df[\"index_labelled\"] = labelled_df.index" ] }, { @@ -277,7 +276,7 @@ "outputs": [], "source": [ "len_labelled_df = len(labelled_df)\n", - "len_df = len(df)\n" + "len_df = len(df)" ] }, { @@ -314,7 +313,7 @@ " how=\"left\",\n", " indicator=\"exists\",\n", " suffixes=(\"_unlabelled\", \"_labelled\"),\n", - ")\n" + ")" ] }, { @@ -325,7 +324,7 @@ }, "outputs": [], "source": [ - "df_w_indicator.head(50)\n" + "df_w_indicator.head(50)" ] }, { @@ -338,7 +337,7 @@ "source": [ "# interpolate missing indices. index increases 1 -> 2. So filling with float seems ok. will be inserted between int of labelled df.\n", "df_w_indicator[\"index_labelled\"].interpolate(\"linear\", inplace=True)\n", - "df_w_indicator.set_index(keys=\"index_labelled\", drop=True, inplace=True)\n" + "df_w_indicator.set_index(keys=\"index_labelled\", drop=True, inplace=True)" ] }, { @@ -349,7 +348,7 @@ }, "outputs": [], "source": [ - "df_w_indicator.head()\n" + "df_w_indicator.head()" ] }, { @@ -360,7 +359,7 @@ }, "outputs": [], "source": [ - "len(df_w_indicator)\n" + "len(df_w_indicator)" ] }, { @@ -371,7 +370,7 @@ }, "outputs": [], "source": [ - "len(df)\n" + "len(df)" ] }, { @@ -382,7 +381,7 @@ }, "outputs": [], "source": [ - "len(labelled_df)\n" + "len(labelled_df)" ] }, { @@ -394,7 +393,7 @@ "outputs": [], "source": [ "# sort columns lexigraphically\n", - "df_w_indicator.sort_index(axis=1, inplace=True)\n" + "df_w_indicator.sort_index(axis=1, inplace=True)" ] }, { @@ -405,7 +404,7 @@ }, "outputs": [], "source": [ - "df_w_indicator[df_w_indicator[\"exists\"] == \"both\"].head(20).T\n" + "df_w_indicator[df_w_indicator[\"exists\"] == \"both\"].head(20).T" ] }, { @@ -426,7 +425,7 @@ " \"2013-04-24 00:00:00\", \"2013-10-24 16:14:48\"\n", ")\n", "\n", - "df_w_indicator = df_w_indicator[date_range]\n" + "df_w_indicator = df_w_indicator[date_range]" ] }, { @@ -437,7 +436,7 @@ }, "outputs": [], "source": [ - "df_w_indicator.head(5).T\n" + "df_w_indicator.head(5).T" ] }, { @@ -449,7 +448,7 @@ "outputs": [], "source": [ "# add fields\n", - "df_w_indicator[\"buy_sell\"] = 0\n" + "df_w_indicator[\"buy_sell\"] = 0" ] }, { @@ -469,7 +468,7 @@ " df_w_indicator[\"STRK_PRC\"],\n", " df_w_indicator[\"EXPIRATION\"],\n", " ]\n", - ")[\"TRADE_SIZE\"].transform(\"sum\")\n" + ")[\"TRADE_SIZE\"].transform(\"sum\")" ] }, { @@ -480,7 +479,7 @@ }, "outputs": [], "source": [ - "df_w_indicator\n" + "df_w_indicator" ] }, { @@ -504,7 +503,7 @@ " \"duplicated\",\n", " ]\n", ")\n", - "train.columns = train.columns.str.replace(r\"_unlabelled$\", \"\", regex=True)\n" + "train.columns = 
train.columns.str.replace(r\"_unlabelled$\", \"\", regex=True)" ] }, { @@ -515,7 +514,7 @@ }, "outputs": [], "source": [ - "train.head().T\n" + "train.head().T" ] }, { @@ -526,7 +525,7 @@ }, "outputs": [], "source": [ - "train.describe()\n" + "train.describe()" ] }, { @@ -544,7 +543,7 @@ "source": [ "output_path = f\"gs://thesis-bucket-option-trade-classification/data/preprocessed/{exchange}_{strategy}_train.parquet\"\n", "train.to_parquet(output_path)\n", - "dataset.add_reference(output_path, name=\"train_set\")\n" + "dataset.add_reference(output_path, name=\"train_set\")" ] }, { @@ -573,7 +572,7 @@ "source": [ "# Log the artifact to save it as an output of this run\n", "run.log_artifact(dataset)\n", - "wandb.finish()\n" + "wandb.finish()" ] } ], diff --git a/notebooks/3.0a-mb-explanatory-data-analysis.ipynb b/notebooks/3.0a-mb-explanatory-data-analysis.ipynb index a0e2d981..23caadf9 100644 --- a/notebooks/3.0a-mb-explanatory-data-analysis.ipynb +++ b/notebooks/3.0a-mb-explanatory-data-analysis.ipynb @@ -46,7 +46,7 @@ "plt.style.use(\"seaborn-notebook\")\n", "\n", "# set ratio of figure\n", - "ratio = (16, 9)\n" + "ratio = (16, 9)" ] }, { @@ -59,9 +59,7 @@ "source": [ "# set fixed seed\n", "def seed_everything(seed) -> None:\n", - " \"\"\"\n", - " Seeds basic parameters for reproducibility of results.\n", - " \"\"\"\n", + " \"\"\"Seeds basic parameters for reproducibility of results.\"\"\"\n", " os.environ[\"PYTHONHASHSEED\"] = str(seed)\n", " random.seed(seed)\n", " # pandas and numpy as discussed here: https://stackoverflow.com/a/52375474/5755604\n", @@ -69,7 +67,7 @@ "\n", "\n", "seed = 42\n", - "seed_everything(seed)\n" + "seed_everything(seed)" ] }, { @@ -91,7 +89,7 @@ "source": [ "data = pd.read_parquet(\n", " \"gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_extended_60.parquet\"\n", - ").sample(frac=0.1, axis=0, random_state=seed)\n" + ").sample(frac=0.1, axis=0, random_state=seed)" ] }, { @@ -207,7 +205,7 @@ }, "outputs": [], "source": [ - "data.head()\n" + "data.head()" ] }, { @@ -223,7 +221,7 @@ }, "outputs": [], "source": [ - "data.describe()\n" + "data.describe()" ] }, { @@ -238,7 +236,7 @@ }, "outputs": [], "source": [ - "data.info()\n" + "data.info()" ] }, { @@ -253,7 +251,7 @@ }, "outputs": [], "source": [ - "print(data.shape)\n" + "print(data.shape)" ] }, { @@ -271,7 +269,7 @@ "print(data.shape)\n", "# drop identical rows, if present\n", "data.drop_duplicates(inplace=True)\n", - "print(data.shape)\n" + "print(data.shape)" ] }, { @@ -297,7 +295,7 @@ }, "outputs": [], "source": [ - "data.nunique()\n" + "data.nunique()" ] }, { @@ -313,7 +311,7 @@ }, "outputs": [], "source": [ - "data.head().T\n" + "data.head().T" ] }, { @@ -357,7 +355,7 @@ "outputs": [], "source": [ "corr: pd.DataFrame = data.corr()\n", - "sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values) \n" + "sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)" ] }, { @@ -401,7 +399,7 @@ " \"price_all_lead\",\n", " \"day_vol\",\n", " ],\n", - ")\n" + ")" ] }, { @@ -428,7 +426,7 @@ "source": [ "sort_criteria = corr[\"buy_sell\"].abs().sort_values(ascending=False)\n", "corr_target = corr.sort_values(\"buy_sell\", ascending=False)[\"buy_sell\"]\n", - "corr_target.loc[sort_criteria.index].to_frame()\n" + "corr_target.loc[sort_criteria.index].to_frame()" ] }, { @@ -452,7 +450,7 @@ "outputs": [], "source": [ "# remove some columns, which will NOT be used in model\n", - "data.drop(columns=[\"optionid\"], inplace=True)\n" + 
"data.drop(columns=[\"optionid\"], inplace=True)" ] }, { @@ -484,7 +482,7 @@ "# Find index of feature columns with correlation greater than 0.975\n", "to_drop = [column for column in upper.columns if any(abs(upper[column]) > threshold)]\n", "\n", - "print(to_drop)\n" + "print(to_drop)" ] }, { @@ -514,7 +512,7 @@ "\n", "# For each column, record the variables that are above the threshold\n", "for col in corr:\n", - " above_threshold_vars[col] = list(corr.index[corr[col] > threshold])\n" + " above_threshold_vars[col] = list(corr.index[corr[col] > threshold])" ] }, { @@ -529,7 +527,7 @@ }, "outputs": [], "source": [ - "pd.Series(above_threshold_vars)\n" + "pd.Series(above_threshold_vars)" ] }, { @@ -570,7 +568,7 @@ }, "outputs": [], "source": [ - "data.head()\n" + "data.head()" ] }, { @@ -581,7 +579,7 @@ }, "outputs": [], "source": [ - "sample = data.select_dtypes(include=np.number).fillna(0).drop(columns=[\"buy_sell\"])\n" + "sample = data.select_dtypes(include=np.number).fillna(0).drop(columns=[\"buy_sell\"])" ] }, { @@ -600,7 +598,7 @@ " learning_rate=\"auto\",\n", " n_iter=300,\n", ")\n", - "Y = tsne.fit_transform(sample)\n" + "Y = tsne.fit_transform(sample)" ] }, { @@ -612,7 +610,7 @@ "outputs": [], "source": [ "dims = pd.DataFrame(Y, columns=[\"x\", \"y\"], index=data.index)\n", - "dims[\"class\"] = data[\"buy_sell\"]\n" + "dims[\"class\"] = data[\"buy_sell\"]" ] }, { @@ -623,7 +621,7 @@ }, "outputs": [], "source": [ - "dims[\"class\"] = data.buy_sell\n" + "dims[\"class\"] = data.buy_sell" ] }, { @@ -643,7 +641,7 @@ "scatter = plt.scatter(dims[\"x\"], dims[\"y\"], c=dims[\"class\"], cmap=plt.cm.rainbow)\n", "plt.setp(ax, xticks=[], yticks=[])\n", "plt.title(\"t-SNE of dataset\")\n", - "plt.legend(handles=scatter.legend_elements()[0], labels=[\"-1\", \"1\"])\n" + "plt.legend(handles=scatter.legend_elements()[0], labels=[\"-1\", \"1\"])" ] }, { @@ -655,7 +653,7 @@ "outputs": [], "source": [ "del sample\n", - "del dims\n" + "del dims" ] }, { @@ -728,7 +726,7 @@ "data[\"day\"] = data[\"QUOTE_DATETIME\"].dt.day\n", "data[\"month\"] = data[\"QUOTE_DATETIME\"].dt.month\n", "data[\"year\"] = data[\"QUOTE_DATETIME\"].dt.year\n", - "data[\"date\"] = data[\"QUOTE_DATETIME\"].dt.date\n" + "data[\"date\"] = data[\"QUOTE_DATETIME\"].dt.date" ] }, { @@ -786,7 +784,7 @@ " \"ttm (6-12] month\",\n", " \"ttm > 12 month\",\n", "]\n", - "data[\"ttm_binned\"] = pd.cut(data[\"ttm\"], bins_ttm, labels=ttm_labels)\n" + "data[\"ttm_binned\"] = pd.cut(data[\"ttm\"], bins_ttm, labels=ttm_labels)" ] }, { @@ -849,7 +847,7 @@ "data[\"abs_mid_BEST\"] = data[\"TRADE_PRICE\"] - mid_best\n", "\n", "data[\"spread_ex\"] = spread_ex\n", - "data[\"spread_best\"] = spread_best\n" + "data[\"spread_best\"] = spread_best" ] }, { @@ -869,7 +867,7 @@ }, "outputs": [], "source": [ - "data[\"symbol_is_index\"] = data[\"ROOT\"].str.startswith(\"^\").astype(int)\n" + "data[\"symbol_is_index\"] = data[\"ROOT\"].str.startswith(\"^\").astype(int)" ] }, { @@ -885,7 +883,7 @@ }, "outputs": [], "source": [ - "data.head()\n" + "data.head()" ] }, { @@ -915,8 +913,7 @@ "outputs": [], "source": [ "def plot_kde_target(var_name: str, clip: List[float] | None = None):\n", - " \"\"\"\n", - " Plot kde plots for buys (+1) and sells (-1) with regard to the feature 'var_name'.\n", + " \"\"\"Plot kde plots for buys (+1) and sells (-1) with regard to the feature 'var_name'.\n", "\n", " Args:\n", " var_name (str): name of feature\n", @@ -945,7 +942,7 @@ " f\"The correlation between '{var_name}' and the 'buy_sell' is {corr_var: 0.4f}\"\n", " )\n", " 
print(f\"Median value of sells = {median_sell: 0.4f}\")\n", - " print(f\"Median value of buys = {median_buy: 0.4f}\")\n" + " print(f\"Median value of buys = {median_buy: 0.4f}\")" ] }, { @@ -961,8 +958,7 @@ " clip: float | None = None,\n", " years: List[int] = [2006, 2010, 2013],\n", ") -> None:\n", - " \"\"\"\n", - " Plot several kde plots side by side for the feature.\n", + " \"\"\"Plot several kde plots side by side for the feature.\n", "\n", " Args:\n", " var_name (str): name of the feature\n", @@ -988,7 +984,7 @@ " )\n", " ax[y].xaxis.label.set_text(str(year))\n", "\n", - " fig.legend()\n" + " fig.legend()" ] }, { @@ -1003,9 +999,7 @@ "\n", "\n", "def plot_recessions() -> None:\n", - " \"\"\"\n", - " Add recession indicator to plot and entry to legend.\n", - " \"\"\"\n", + " \"\"\"Add recession indicator to plot and entry to legend.\"\"\"\n", " l = 0\n", " month = relativedelta.relativedelta(months=+1)\n", " for date, val in us_rec[\"USREC\"].items():\n", @@ -1019,7 +1013,7 @@ " alpha=0.25,\n", " label=\"_\" * l + \"recession\",\n", " )\n", - " l += 1\n" + " l += 1" ] }, { @@ -1033,8 +1027,7 @@ "def plot_time_series(\n", " feature: str | List[str], aggregation: str | List[Any] = \"count\"\n", ") -> pd.DataFrame:\n", - " \"\"\"\n", - " Plot feature over time. Aggregate using 'aggregation'.\n", + " \"\"\"Plot feature over time. Aggregate using 'aggregation'.\n", "\n", " Args:\n", " feature (str | List[str]): features to plot.\n", @@ -1058,7 +1051,7 @@ " ax.legend()\n", " plt.show()\n", "\n", - " return time_series\n" + " return time_series" ] }, { @@ -1081,7 +1074,7 @@ "cat_columns_bin = [\"bin_\" + x for x in cat_columns]\n", "\n", "# binarize categorical similar to Borisov et al.\n", - "data[cat_columns_bin] = data[cat_columns].apply(lambda x: pd.factorize(x)[0]) \n" + "data[cat_columns_bin] = data[cat_columns].apply(lambda x: pd.factorize(x)[0])" ] }, { @@ -1124,7 +1117,7 @@ }, "outputs": [], "source": [ - "trades_per_day = plot_time_series(\"TRADE_PRICE\", \"count\")\n" + "trades_per_day = plot_time_series(\"TRADE_PRICE\", \"count\")" ] }, { @@ -1139,7 +1132,7 @@ }, "outputs": [], "source": [ - "trades_per_day.iloc[:, 0].nlargest(N)\n" + "trades_per_day.iloc[:, 0].nlargest(N)" ] }, { @@ -1154,7 +1147,7 @@ }, "outputs": [], "source": [ - "trades_per_day.iloc[:, 0].nsmallest(N)\n" + "trades_per_day.iloc[:, 0].nsmallest(N)" ] }, { @@ -1190,8 +1183,8 @@ }, "outputs": [], "source": [ - "ax = sns.histplot(data, x=\"TRADE_SIZE\", bins=50) \n", - "ax.title.set_text(\"Histogram of trade size\")\n" + "ax = sns.histplot(data, x=\"TRADE_SIZE\", bins=50)\n", + "ax.title.set_text(\"Histogram of trade size\")" ] }, { @@ -1218,7 +1211,7 @@ }, "outputs": [], "source": [ - "trades_over_time = plot_time_series(\"TRADE_SIZE\", [\"mean\", \"median\"])\n" + "trades_over_time = plot_time_series(\"TRADE_SIZE\", [\"mean\", \"median\"])" ] }, { @@ -1236,7 +1229,7 @@ "source": [ "trade_ask_bid_size = plot_time_series(\n", " [\"TRADE_SIZE\", \"ask_size_ex\", \"bid_size_ex\"], \"mean\"\n", - ")\n" + ")" ] }, { @@ -1263,7 +1256,7 @@ }, "outputs": [], "source": [ - "data[\"TRADE_SIZE\"].describe()\n" + "data[\"TRADE_SIZE\"].describe()" ] }, { @@ -1279,7 +1272,7 @@ }, "outputs": [], "source": [ - "data[data[\"TRADE_SIZE\"].max() == data[\"TRADE_SIZE\"]]\n" + "data[data[\"TRADE_SIZE\"].max() == data[\"TRADE_SIZE\"]]" ] }, { @@ -1295,7 +1288,7 @@ }, "outputs": [], "source": [ - "data.nlargest(N, \"TRADE_SIZE\", keep=\"first\").T\n" + "data.nlargest(N, \"TRADE_SIZE\", keep=\"first\").T" ] }, { @@ -1312,8 +1305,8 @@ 
"outputs": [], "source": [ "data[\"log_trade_size\"] = np.log1p(data[\"TRADE_SIZE\"])\n", - "ax = sns.histplot(data, x=\"log_trade_size\", bins=50) \n", - "ax.title.set_text(f\"Histogram of trade size (log1p)\")\n" + "ax = sns.histplot(data, x=\"log_trade_size\", bins=50)\n", + "ax.title.set_text(\"Histogram of trade size (log1p)\")" ] }, { @@ -1329,7 +1322,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"log_trade_size\", clip=[0, 6])\n" + "plot_kde_target(\"log_trade_size\", clip=[0, 6])" ] }, { @@ -1364,8 +1357,8 @@ }, "outputs": [], "source": [ - "ax = sns.histplot(data, x=\"TRADE_PRICE\", bins=50) \n", - "ax.title.set_text(\"Histogram of trade price\")\n" + "ax = sns.histplot(data, x=\"TRADE_PRICE\", bins=50)\n", + "ax.title.set_text(\"Histogram of trade price\")" ] }, { @@ -1382,7 +1375,7 @@ "outputs": [], "source": [ "ax = sns.boxplot(data=data, x=\"buy_sell\", y=\"TRADE_PRICE\")\n", - "ax.title.set_text(\"Box plot of 'TRADE_PRICE' for buys (1) and sells (-1)\")\n" + "ax.title.set_text(\"Box plot of 'TRADE_PRICE' for buys (1) and sells (-1)\")" ] }, { @@ -1405,7 +1398,7 @@ }, "outputs": [], "source": [ - "data[\"log_trade_price\"] = np.log1p(data[\"TRADE_PRICE\"])\n" + "data[\"log_trade_price\"] = np.log1p(data[\"TRADE_PRICE\"])" ] }, { @@ -1423,7 +1416,7 @@ "source": [ "fig, ax = plt.subplots()\n", "\n", - "sns.histplot(data, x=\"log_trade_price\", bins=50, stat=\"density\", label=\"log price\") \n", + "sns.histplot(data, x=\"log_trade_price\", bins=50, stat=\"density\", label=\"log price\")\n", "\n", "# extract the limits for the x-axis and fit normal distributon\n", "x0, x1 = ax.get_xlim()\n", @@ -1435,7 +1428,7 @@ "\n", "\n", "ax.title.set_text(\"Distribution of log prices\")\n", - "ax.legend()\n" + "ax.legend()" ] }, { @@ -1452,7 +1445,7 @@ "outputs": [], "source": [ "ax = sns.boxplot(data=data, x=\"buy_sell\", y=\"log_trade_price\")\n", - "ax.title.set_text(\"Box plot of log prices for buys (1) and sells (-1)\")\n" + "ax.title.set_text(\"Box plot of log prices for buys (1) and sells (-1)\")" ] }, { @@ -1468,7 +1461,7 @@ }, "outputs": [], "source": [ - "data.nlargest(N, \"TRADE_PRICE\", keep=\"first\").T\n" + "data.nlargest(N, \"TRADE_PRICE\", keep=\"first\").T" ] }, { @@ -1484,7 +1477,7 @@ }, "outputs": [], "source": [ - "trade_price_over_time = plot_time_series(\"TRADE_PRICE\", [\"mean\", \"median\"])\n" + "trade_price_over_time = plot_time_series(\"TRADE_PRICE\", [\"mean\", \"median\"])" ] }, { @@ -1502,7 +1495,7 @@ "source": [ "trade_price_over_time = plot_time_series(\n", " [\"TRADE_PRICE\", \"price_ex_lead\", \"price_ex_lag\"], \"mean\"\n", - ")\n" + ")" ] }, { @@ -1520,7 +1513,7 @@ "source": [ "trade_price_over_time = plot_time_series(\n", " [\"TRADE_PRICE\", \"price_ex_lead\", \"price_ex_lag\"], \"median\"\n", - ")\n" + ")" ] }, { @@ -1556,7 +1549,7 @@ }, "outputs": [], "source": [ - "ttm_over_time = plot_time_series(\"ttm\", \"mean\")\n" + "ttm_over_time = plot_time_series(\"ttm\", \"mean\")" ] }, { @@ -1572,8 +1565,8 @@ }, "outputs": [], "source": [ - "ax = sns.histplot(data=data[data[\"bid_ex\"] == 0.0], x=\"ttm\", bins=50) \n", - "ax.title.set_text(\"Count of transactions with regard to time to maturity (months)\")\n" + "ax = sns.histplot(data=data[data[\"bid_ex\"] == 0.0], x=\"ttm\", bins=50)\n", + "ax.title.set_text(\"Count of transactions with regard to time to maturity (months)\")" ] }, { @@ -1601,7 +1594,7 @@ "outputs": [], "source": [ "# TODO: ask of zero plausible?\n", - "sns.histplot(data=data[data[\"ask_ex\"] == 0.0], x=\"ttm\", bins=50) \n" + 
"sns.histplot(data=data[data[\"ask_ex\"] == 0.0], x=\"ttm\", bins=50)" ] }, { @@ -1626,8 +1619,8 @@ }, "outputs": [], "source": [ - "ax = sns.histplot(data, x=\"STRK_PRC\", bins=50) \n", - "ax.title.set_text(\"Histogram of strike price\")\n" + "ax = sns.histplot(data, x=\"STRK_PRC\", bins=50)\n", + "ax.title.set_text(\"Histogram of strike price\")" ] }, { @@ -1644,7 +1637,7 @@ "outputs": [], "source": [ "ax = sns.boxplot(data=data, x=\"buy_sell\", y=\"STRK_PRC\")\n", - "ax.title.set_text(\"Box plot of strike prices for buys (1) and sells (-1)\")\n" + "ax.title.set_text(\"Box plot of strike prices for buys (1) and sells (-1)\")" ] }, { @@ -1660,7 +1653,7 @@ }, "outputs": [], "source": [ - "strike_over_time = plot_time_series(\"STRK_PRC\", \"mean\")\n" + "strike_over_time = plot_time_series(\"STRK_PRC\", \"mean\")" ] }, { @@ -1683,7 +1676,7 @@ }, "outputs": [], "source": [ - "data[\"log_strk_prc\"] = np.log1p(data[\"STRK_PRC\"])\n" + "data[\"log_strk_prc\"] = np.log1p(data[\"STRK_PRC\"])" ] }, { @@ -1699,8 +1692,8 @@ }, "outputs": [], "source": [ - "ax = sns.histplot(data, x=\"log_strk_prc\", bins=50) \n", - "ax.title.set_text(\"Histogram of strike price (log1p)\")\n" + "ax = sns.histplot(data, x=\"log_strk_prc\", bins=50)\n", + "ax.title.set_text(\"Histogram of strike price (log1p)\")" ] }, { @@ -1717,7 +1710,7 @@ "outputs": [], "source": [ "ax = sns.boxplot(data=data, x=\"buy_sell\", y=\"log_strk_prc\")\n", - "ax.title.set_text(\"Box plot of strike prices for buys (1) and sells (-1)\")\n" + "ax.title.set_text(\"Box plot of strike prices for buys (1) and sells (-1)\")" ] }, { @@ -1742,7 +1735,7 @@ "outputs": [], "source": [ "ratio_buy_sell = data[\"buy_sell\"].value_counts() / data[\"buy_sell\"].count()\n", - "ratio_buy_sell.head()\n" + "ratio_buy_sell.head()" ] }, { @@ -1782,7 +1775,7 @@ "source": [ "ax = sns.countplot(data=data, x=\"OPTION_TYPE\", hue=\"buy_sell\")\n", "ax.title.set_text(\"Distribution of Buy / Sell indicator with regard to option type\")\n", - "sns.move_legend(ax, \"lower center\", bbox_to_anchor=(0.5, -0.3))\n" + "sns.move_legend(ax, \"lower center\", bbox_to_anchor=(0.5, -0.3))" ] }, { @@ -1811,7 +1804,7 @@ "ax.title.set_text(\"Distribution of Buy / Sell indicator with regard to year (binned)\")\n", "ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha=\"center\")\n", "plt.tight_layout()\n", - "plt.show()\n" + "plt.show()" ] }, { @@ -1842,7 +1835,7 @@ ")\n", "ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha=\"center\")\n", "plt.tight_layout()\n", - "plt.show()\n" + "plt.show()" ] }, { @@ -1859,7 +1852,7 @@ "outputs": [], "source": [ "ax = sns.scatterplot(data=sample, x=\"ttm\", y=\"bid_ex\", hue=\"OPTION_TYPE\")\n", - "ax.title.set_text(\"Scatter plot of time to maturity (months) and bid (ex)\")\n" + "ax.title.set_text(\"Scatter plot of time to maturity (months) and bid (ex)\")" ] }, { @@ -1875,8 +1868,8 @@ }, "outputs": [], "source": [ - "ax = sns.histplot(data=data[data[\"bid_ex\"] == 0.0], x=\"ttm\", bins=50) \n", - "ax.title.set_text(\"Count of transactions with regard to time to maturity (months)\")\n" + "ax = sns.histplot(data=data[data[\"bid_ex\"] == 0.0], x=\"ttm\", bins=50)\n", + "ax.title.set_text(\"Count of transactions with regard to time to maturity (months)\")" ] }, { @@ -1893,7 +1886,7 @@ "outputs": [], "source": [ "# TODO: ask of zero plausible?\n", - "sns.histplot(data=data[data[\"ask_ex\"] == 0.0], x=\"ttm\", bins=50) \n" + "sns.histplot(data=data[data[\"ask_ex\"] == 0.0], x=\"ttm\", bins=50)" ] }, { @@ -1930,7 +1923,7 @@ ")\n", 
"plot_recessions()\n", "ax.legend()\n", - "plt.show()\n" + "plt.show()" ] }, { @@ -1967,7 +1960,7 @@ "frequency_symbols = data[\"ROOT\"].value_counts().reset_index(name=\"Count\")\n", "frequency_symbols.rename(columns={\"index\": \"Symbol\"}, inplace=True)\n", "frequency_symbols.sort_values(\"Count\", ascending=True)\n", - "sns.histplot(data=frequency_symbols, x=\"Count\", bins=200)\n" + "sns.histplot(data=frequency_symbols, x=\"Count\", bins=200)" ] }, { @@ -1982,7 +1975,7 @@ }, "outputs": [], "source": [ - "frequency_symbols[frequency_symbols[\"Count\"] <= 5].count()\n" + "frequency_symbols[frequency_symbols[\"Count\"] <= 5].count()" ] }, { @@ -1998,7 +1991,7 @@ }, "outputs": [], "source": [ - "frequency_symbols\n" + "frequency_symbols" ] }, { @@ -2020,7 +2013,7 @@ "ax = sns.barplot(data=most_frequent_symbols, x=\"Symbol\", y=\"Count\")\n", "ax.title.set_text(f\"{N} most frequently traded symbols\")\n", "\n", - "most_frequent_symbols.head(N)\n" + "most_frequent_symbols.head(N)" ] }, { @@ -2031,7 +2024,7 @@ }, "outputs": [], "source": [ - "list_freq_symbols = most_frequent_symbols.Symbol.tolist()\n" + "list_freq_symbols = most_frequent_symbols.Symbol.tolist()" ] }, { @@ -2042,7 +2035,7 @@ }, "outputs": [], "source": [ - "frequent_symbols_over_time = data[data[\"ROOT\"].isin(list_freq_symbols)]\n" + "frequent_symbols_over_time = data[data[\"ROOT\"].isin(list_freq_symbols)]" ] }, { @@ -2060,7 +2053,7 @@ " .count()\n", " .reset_index()\n", " .rename(columns={\"TRADE_SIZE\": \"count\", \"QUOTE_DATETIME\": \"date\", \"ROOT\": \"Symbol\"})\n", - ")\n" + ")" ] }, { @@ -2075,7 +2068,7 @@ " frequent_symbols_trades_per_day.groupby([\"date\", \"Symbol\"])[\"count\"]\n", " .first()\n", " .unstack()\n", - ")\n" + ")" ] }, { @@ -2093,7 +2086,7 @@ "source": [ "frequent_symbols_over_time.plot(\n", " kind=\"line\", title=f\"{N} most frequently traded underlyings over time\"\n", - ")\n" + ")" ] }, { @@ -2117,7 +2110,7 @@ " columns=[\"QUOTE_DATETIME\", \"ROOT\"],\n", ")\n", "\n", - "roots_over_time = pd.concat([root_time_train, root_time_val, root_time_test])\n" + "roots_over_time = pd.concat([root_time_train, root_time_val, root_time_test])" ] }, { @@ -2135,7 +2128,7 @@ " .reset_index()\n", " .sample(N)\n", " .T\n", - ")\n" + ")" ] }, { @@ -2158,7 +2151,7 @@ " ),\n", " i / N,\n", " np.nan,\n", - " )\n" + " )" ] }, { @@ -2180,7 +2173,7 @@ "# beginning of validation and test set\n", "ax.axvline(\"2013-10-25\", color=\"gray\")\n", "ax.axvline(\"2015-11-06\", color=\"gray\")\n", - "ax.set_title(\"roots over time (min / max appearance)\")\n" + "ax.set_title(\"roots over time (min / max appearance)\")" ] }, { @@ -2201,7 +2194,7 @@ "ax.title.set_text(\n", " \"Distribution of Buy / Sell indicator with regard to whether underlying is an index\"\n", ")\n", - "sns.move_legend(ax, \"lower center\", bbox_to_anchor=(0.5, -0.3))\n" + "sns.move_legend(ax, \"lower center\", bbox_to_anchor=(0.5, -0.3))" ] }, { @@ -2220,7 +2213,7 @@ " data.groupby([\"symbol_is_index\", \"buy_sell\"])[\"buy_sell\"].count()\n", " / data.groupby([\"symbol_is_index\"])[\"buy_sell\"].count()\n", ")\n", - "ratios_is_index.head()\n" + "ratios_is_index.head()" ] }, { @@ -2246,7 +2239,7 @@ }, "outputs": [], "source": [ - "data[\"issue_type\"].value_counts(dropna=False)\n" + "data[\"issue_type\"].value_counts(dropna=False)" ] }, { @@ -2264,7 +2257,7 @@ "source": [ "ax = sns.countplot(data=data, x=\"issue_type\")\n", "ax.title.set_text(\"No. 
of transactions by issue type\")\n", - "ax.xaxis.label.set_text(\"issue type\")\n" + "ax.xaxis.label.set_text(\"issue type\")" ] }, { @@ -2291,7 +2284,7 @@ "source": [ "bid_ask_over_time = plot_time_series(\n", " [\"bid_ex\", \"ask_ex\", \"BEST_ASK\", \"BEST_BID\"], \"mean\"\n", - ")\n" + ")" ] }, { @@ -2316,8 +2309,8 @@ }, "outputs": [], "source": [ - "ax = sns.histplot(data, x=\"ask_ex\", bins=50) \n", - "ax.title.set_text(\"Histogram of ask (exchange)\")\n" + "ax = sns.histplot(data, x=\"ask_ex\", bins=50)\n", + "ax.title.set_text(\"Histogram of ask (exchange)\")" ] }, { @@ -2344,8 +2337,8 @@ "outputs": [], "source": [ "data[\"log_ask_ex\"] = np.log1p(data[\"ask_ex\"])\n", - "ax = sns.histplot(data, x=\"log_ask_ex\", bins=50) \n", - "ax.title.set_text(f\"Histogram of ask exchange (log1p)\")\n" + "ax = sns.histplot(data, x=\"log_ask_ex\", bins=50)\n", + "ax.title.set_text(\"Histogram of ask exchange (log1p)\")" ] }, { @@ -2361,7 +2354,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"log_ask_ex\", clip=[0, 5])\n" + "plot_kde_target(\"log_ask_ex\", clip=[0, 5])" ] }, { @@ -2396,8 +2389,8 @@ }, "outputs": [], "source": [ - "ax = sns.histplot(data, x=\"bid_ex\", bins=50) \n", - "ax.title.set_text(\"Histogram of bid (exchange)\")\n" + "ax = sns.histplot(data, x=\"bid_ex\", bins=50)\n", + "ax.title.set_text(\"Histogram of bid (exchange)\")" ] }, { @@ -2414,8 +2407,8 @@ "outputs": [], "source": [ "data[\"log_bid_ex\"] = np.log1p(data[\"bid_ex\"])\n", - "ax = sns.histplot(data, x=\"log_bid_ex\", bins=50) \n", - "ax.title.set_text(f\"Histogram of bid exchange (log1p)\")\n" + "ax = sns.histplot(data, x=\"log_bid_ex\", bins=50)\n", + "ax.title.set_text(\"Histogram of bid exchange (log1p)\")" ] }, { @@ -2431,7 +2424,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"log_bid_ex\", clip=[0, 5])\n" + "plot_kde_target(\"log_bid_ex\", clip=[0, 5])" ] }, { @@ -2448,8 +2441,8 @@ "outputs": [], "source": [ "data[\"log_bid_ex\"] = np.log1p(data[\"bid_ex\"])\n", - "ax = sns.histplot(data, x=\"log_bid_ex\", bins=50) \n", - "ax.title.set_text(\"Histogram of bid exchange (log1p)\")\n" + "ax = sns.histplot(data, x=\"log_bid_ex\", bins=50)\n", + "ax.title.set_text(\"Histogram of bid exchange (log1p)\")" ] }, { @@ -2465,7 +2458,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"log_bid_ex\", clip=[-5, 6])\n" + "plot_kde_target(\"log_bid_ex\", clip=[-5, 6])" ] }, { @@ -2507,15 +2500,13 @@ "outputs": [], "source": [ "def visualize_nan():\n", - " \"\"\"\n", - " Visualize NaN values in a heatmap to learn about patterns.\n", - " \"\"\"\n", + " \"\"\"Visualize NaN values in a heatmap to learn about patterns.\"\"\"\n", " plt.subplots()\n", " sns.heatmap(data.head(50).isnull(), cbar=False)\n", " plt.xlabel(\"feature\")\n", " plt.ylabel(\"row\")\n", " plt.title(\"Missing values (colored in bright beige)\")\n", - " plt.show()\n" + " plt.show()" ] }, { @@ -2531,7 +2522,7 @@ }, "outputs": [], "source": [ - "visualize_nan()\n" + "visualize_nan()" ] }, { @@ -2557,7 +2548,7 @@ " xlabel=\"No. of missing values\",\n", " ylabel=\"feature\",\n", " title=\"Missing values\",\n", - ")\n" + ")" ] }, { @@ -2584,7 +2575,7 @@ " title=\"Missing values over time\",\n", " xlabel=\"Timestamp\",\n", " ylabel=\"No. 
of missing values\",\n", - ")\n" + ")" ] }, { @@ -2612,7 +2603,7 @@ "\n", "fig, ax = plt.subplots(figsize=(9, 9))\n", "ax = sns.heatmap(corr_mat, mask=mask, annot=False, annot_kws={\"size\": 10}, ax=ax)\n", - "ax.title.set_text(\"Correlation between missing features\")\n" + "ax.title.set_text(\"Correlation between missing features\")" ] }, { @@ -2652,7 +2643,7 @@ "ax0.set_aspect(\"auto\")\n", "ax0.set_title(\"Dendrogram of missing values\")\n", "\n", - "ax0\n" + "ax0" ] }, { @@ -2699,7 +2690,7 @@ "source": [ "corr = data.corr()\n", "\n", - "sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values) \n" + "sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)" ] }, { @@ -2726,7 +2717,7 @@ "source": [ "sort_criteria = corr[\"buy_sell\"].abs().sort_values(ascending=False)\n", "corr_target = corr.sort_values(\"buy_sell\", ascending=False)[\"buy_sell\"]\n", - "corr_target.loc[sort_criteria.index].to_frame()\n" + "corr_target.loc[sort_criteria.index].to_frame()" ] }, { @@ -2749,7 +2740,7 @@ "# Find index of feature columns with correlation greater than 0.95\n", "to_drop = [column for column in upper.columns if any(abs(upper[column]) > 0.975)]\n", "\n", - "print(to_drop)\n" + "print(to_drop)" ] }, { @@ -2794,7 +2785,7 @@ "for col in corr:\n", " above_threshold_vars[col] = list(corr.index[corr[col] > threshold])\n", "\n", - "pd.Series(above_threshold_vars)\n" + "pd.Series(above_threshold_vars)" ] }, { @@ -2819,7 +2810,7 @@ }, "outputs": [], "source": [ - "corr_target.loc[sort_criteria.index].to_frame().T\n" + "corr_target.loc[sort_criteria.index].to_frame().T" ] }, { @@ -2855,7 +2846,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"prox_ex\", clip=[-2, 2])\n" + "plot_kde_target(\"prox_ex\", clip=[-2, 2])" ] }, { @@ -2871,7 +2862,7 @@ }, "outputs": [], "source": [ - "plot_kde_target_comparsion(\"prox_ex\", years=[2006, 2010, 2013], clip=[-2, 2])\n" + "plot_kde_target_comparsion(\"prox_ex\", years=[2006, 2010, 2013], clip=[-2, 2])" ] }, { @@ -2898,7 +2889,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"bid_ask_size_ratio_ex\", clip=[0, 100])\n" + "plot_kde_target(\"bid_ask_size_ratio_ex\", clip=[0, 100])" ] }, { @@ -2916,7 +2907,7 @@ "source": [ "plot_kde_target_comparsion(\n", " \"bid_ask_size_ratio_ex\", years=[2006, 2010, 2013], clip=[0, 100]\n", - ")\n" + ")" ] }, { @@ -2941,7 +2932,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"log_bid_ex\")\n" + "plot_kde_target(\"log_bid_ex\")" ] }, { @@ -2957,7 +2948,7 @@ }, "outputs": [], "source": [ - "plot_kde_target_comparsion(\"log_bid_ex\", years=[2006, 2010, 2013])\n" + "plot_kde_target_comparsion(\"log_bid_ex\", years=[2006, 2010, 2013])" ] }, { @@ -2992,7 +2983,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"log_trade_price\")\n" + "plot_kde_target(\"log_trade_price\")" ] }, { @@ -3008,7 +2999,7 @@ }, "outputs": [], "source": [ - "plot_kde_target_comparsion(\"log_trade_price\", years=[2006, 2010, 2013])\n" + "plot_kde_target_comparsion(\"log_trade_price\", years=[2006, 2010, 2013])" ] }, { @@ -3043,7 +3034,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"ask_size_ex\", clip=[0, 2000])\n" + "plot_kde_target(\"ask_size_ex\", clip=[0, 2000])" ] }, { @@ -3059,7 +3050,7 @@ }, "outputs": [], "source": [ - "plot_kde_target_comparsion(\"ask_size_ex\", years=[2006, 2010, 2013], clip=[0, 2000])\n" + "plot_kde_target_comparsion(\"ask_size_ex\", years=[2006, 2010, 2013], clip=[0, 2000])" ] }, { @@ -3084,7 +3075,7 @@ }, "outputs": [], "source": [ - 
"plot_kde_target(\"bid_size_ex\", clip=[0, 1000])\n" + "plot_kde_target(\"bid_size_ex\", clip=[0, 1000])" ] }, { @@ -3100,7 +3091,7 @@ }, "outputs": [], "source": [ - "plot_kde_target_comparsion(\"bid_size_ex\", years=[2006, 2010, 2013], clip=[0, 1000])\n" + "plot_kde_target_comparsion(\"bid_size_ex\", years=[2006, 2010, 2013], clip=[0, 1000])" ] }, { @@ -3125,7 +3116,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"abs_mid_ex\", clip=[-0.5, 0.5])\n" + "plot_kde_target(\"abs_mid_ex\", clip=[-0.5, 0.5])" ] }, { @@ -3141,7 +3132,7 @@ }, "outputs": [], "source": [ - "plot_kde_target_comparsion(\"abs_mid_ex\", years=[2006, 2010, 2013], clip=[-0.5, 0.5])\n" + "plot_kde_target_comparsion(\"abs_mid_ex\", years=[2006, 2010, 2013], clip=[-0.5, 0.5])" ] }, { @@ -3177,7 +3168,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"day\")\n" + "plot_kde_target(\"day\")" ] }, { @@ -3193,7 +3184,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"year\")\n" + "plot_kde_target(\"year\")" ] }, { @@ -3239,7 +3230,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"chg_ex_lead\", clip=[-5, 5])\n" + "plot_kde_target(\"chg_ex_lead\", clip=[-5, 5])" ] }, { @@ -3255,7 +3246,7 @@ }, "outputs": [], "source": [ - "plot_kde_target_comparsion(\"chg_ex_lead\", years=[2006, 2010, 2013], clip=[-5, 5])\n" + "plot_kde_target_comparsion(\"chg_ex_lead\", years=[2006, 2010, 2013], clip=[-5, 5])" ] }, { @@ -3291,7 +3282,7 @@ }, "outputs": [], "source": [ - "plot_kde_target_comparsion(\"chg_ex_lag\", years=[2006, 2010, 2013], clip=[-5, 5])\n" + "plot_kde_target_comparsion(\"chg_ex_lag\", years=[2006, 2010, 2013], clip=[-5, 5])" ] }, { @@ -3307,7 +3298,7 @@ }, "outputs": [], "source": [ - "plot_kde_target_comparsion(\"chg_ex_lag\", years=[2006, 2010, 2013], clip=[-5, 5])\n" + "plot_kde_target_comparsion(\"chg_ex_lag\", years=[2006, 2010, 2013], clip=[-5, 5])" ] }, { @@ -3334,7 +3325,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"log_bid_ex\", clip=[-5, 8])\n" + "plot_kde_target(\"log_bid_ex\", clip=[-5, 8])" ] }, { @@ -3345,7 +3336,7 @@ }, "outputs": [], "source": [ - "data.replace([np.inf, -np.inf], np.nan, inplace=True)\n" + "data.replace([np.inf, -np.inf], np.nan, inplace=True)" ] }, { @@ -3359,7 +3350,7 @@ "scaler = StandardScaler()\n", "data[\"log_bid_ex_scaled\"] = scaler.fit_transform(\n", " X=data[\"log_bid_ex\"].values.reshape(-1, 1)\n", - ")\n" + ")" ] }, { @@ -3374,7 +3365,7 @@ }, "outputs": [], "source": [ - "data[\"log_bid_ex_scaled\"].describe()\n" + "data[\"log_bid_ex_scaled\"].describe()" ] }, { @@ -3390,7 +3381,7 @@ }, "outputs": [], "source": [ - "plot_kde_target(\"log_bid_ex_scaled\", clip=[-5, 5])\n" + "plot_kde_target(\"log_bid_ex_scaled\", clip=[-5, 5])" ] }, { diff --git a/notebooks/3.0b-mb-explanatory-matched-unmatched.ipynb b/notebooks/3.0b-mb-explanatory-matched-unmatched.ipynb index a901d8bb..48aba670 100644 --- a/notebooks/3.0b-mb-explanatory-matched-unmatched.ipynb +++ b/notebooks/3.0b-mb-explanatory-matched-unmatched.ipynb @@ -9,25 +9,20 @@ "source": [ "from __future__ import annotations\n", "\n", - "import wandb\n", - "from pathlib import Path\n", - "import pandas as pd\n", - "\n", "import os\n", - "\n", - "from otc.features.build_features import (\n", - " features_classical_size,\n", - ")\n", - "\n", - "\n", + "from pathlib import Path\n", "from typing import List\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", + "import wandb\n", + "from tqdm.auto import tqdm\n", "\n", - "from tqdm.auto import 
tqdm" + "from otc.features.build_features import (\n", + " features_classical_size,\n", + ")" ] }, { @@ -52,7 +47,7 @@ "\n", "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n", "\n", - "run = wandb.init(project=\"thesis\", entity=\"fbv\")\n" + "run = wandb.init(project=\"thesis\", entity=\"fbv\")" ] }, { @@ -74,12 +69,14 @@ "artifact_dir_unlabelled = artifact_unlabelled.download()\n", "\n", "x_train_unlabelled = pd.read_parquet(\n", - " Path(artifact_dir_unlabelled, \"train_set.parquet\"), columns=[\"buy_sell\", *features_classical_size]\n", - " )\n", + " Path(artifact_dir_unlabelled, \"train_set.parquet\"),\n", + " columns=[\"buy_sell\", *features_classical_size],\n", + ")\n", "\n", "# labelled data\n", "x_train_labelled = pd.read_parquet(\n", - " Path(artifact_dir_labelled, \"train_set.parquet\"), columns=[\"buy_sell\", *features_classical_size]\n", + " Path(artifact_dir_labelled, \"train_set.parquet\"),\n", + " columns=[\"buy_sell\", *features_classical_size],\n", ")\n", "\n", "x_train_unlabelled[\"src\"] = \"unlabelled\"\n", @@ -139,7 +136,7 @@ "outputs": [], "source": [ "# slice to same time range as unlabelled trades\n", - "x_train_labelled = x_train_labelled.iloc[27248577 : 29510319]" + "x_train_labelled = x_train_labelled.iloc[27248577:29510319]" ] }, { @@ -176,18 +173,16 @@ "outputs": [], "source": [ "def plot_kde_src(var_name: str, clip: List[float] | None = None):\n", - " \"\"\"\n", - " Plot kde plots for labelled and unlabelled with regard to the feature 'var_name'.\n", + " \"\"\"Plot kde plots for labelled and unlabelled with regard to the feature 'var_name'.\n", "\n", " Args:\n", " var_name (str): name of feature\n", " clip (List[float] | None, optional): clipping range. Defaults to None.\n", " \"\"\"\n", - "\n", - " quantiles = np.linspace(.1, 1, 9, 0)\n", + " quantiles = np.linspace(0.1, 1, 9, 0)\n", " stats_unlabelled = data[data[\"src\"] == \"unlabelled\"][var_name].quantile(quantiles)\n", " stats_labelled = data[data[\"src\"] == \"labelled\"][var_name].quantile(quantiles)\n", - " \n", + "\n", " _, ax = plt.subplots()\n", " for i in [\"unlabelled\", \"labelled\"]:\n", " sns.kdeplot(\n", @@ -203,7 +198,9 @@ " sns.move_legend(ax, \"lower center\", bbox_to_anchor=(0.5, -0.3))\n", " plt.show()\n", "\n", - " stats = pd.concat([stats_unlabelled, stats_labelled], keys=[\"unlabelled\", \"labelled\"], axis=1)\n", + " stats = pd.concat(\n", + " [stats_unlabelled, stats_labelled], keys=[\"unlabelled\", \"labelled\"], axis=1\n", + " )\n", " print(stats)" ] }, @@ -256,8 +253,10 @@ "source": [ "def plot_hist(unlabelled, labelled, title):\n", " fig, ax = plt.subplots()\n", - " ax.hist(unlabelled, bins=50, alpha=0.5, label='unlabelled', density=True, range=[-2,2])\n", - " ax.hist(labelled, bins=50, alpha=0.5, label='labelled', density=True, range=[-2,2])\n", + " ax.hist(\n", + " unlabelled, bins=50, alpha=0.5, label=\"unlabelled\", density=True, range=[-2, 2]\n", + " )\n", + " ax.hist(labelled, bins=50, alpha=0.5, label=\"labelled\", density=True, range=[-2, 2])\n", " plt.title(title)\n", " plt.legend()\n", " plt.show()" @@ -398,7 +397,7 @@ "source": [ "var_name = \"spread\"\n", "\n", - "data[var_name] = data[\"ask_ex\"] - data[\"bid_ex\"] " + "data[var_name] = data[\"ask_ex\"] - data[\"bid_ex\"]" ] }, { @@ -445,7 +444,7 @@ "outputs": [], "source": [ "var_name = \"prc_delta\"\n", - "data[var_name] = (data[\"TRADE_PRICE\"] - data[\"price_ex_lead\"])" + "data[var_name] = data[\"TRADE_PRICE\"] - data[\"price_ex_lead\"]" ] }, { diff --git 
a/notebooks/3.0c-feature-engineering.ipynb b/notebooks/3.0c-feature-engineering.ipynb index 842e4f5b..88d1e553 100644 --- a/notebooks/3.0c-feature-engineering.ipynb +++ b/notebooks/3.0c-feature-engineering.ipynb @@ -18,15 +18,16 @@ "import gcsfs\n", "import google.auth\n", "import numpy as np\n", - "import numpy.typing as npt\n", "import pandas as pd\n", "import wandb\n", - "from catboost import CatBoostClassifier, Pool\n", + "from catboost import CatBoostClassifier\n", "from sklearn.exceptions import NotFittedError\n", "from sklearn.metrics import matthews_corrcoef\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import (OrdinalEncoder, PowerTransformer,\n", - " RobustScaler, StandardScaler)\n", + "from sklearn.preprocessing import (\n", + " OrdinalEncoder,\n", + " StandardScaler,\n", + ")\n", "from tqdm.auto import tqdm" ] }, @@ -39,7 +40,7 @@ "outputs": [], "source": [ "credentials, _ = google.auth.default()\n", - "fs = gcsfs.GCSFileSystem(project=\"thesis\", token=credentials)\n" + "fs = gcsfs.GCSFileSystem(project=\"thesis\", token=credentials)" ] }, { @@ -57,7 +58,7 @@ "outputs": [], "source": [ "# connect to weights and biases\n", - "run = wandb.init(project=\"thesis\", job_type=\"dataset-creation\", entity=\"fbv\")\n" + "run = wandb.init(project=\"thesis\", job_type=\"dataset-creation\", entity=\"fbv\")" ] }, { @@ -73,7 +74,7 @@ "\n", "exchange = \"ise\" # \"ise\" # \"cboe\"\n", "strategy = \"supervised\" # \"supervised\" #\"unsupervised\" # \"supervised\" # \"transfer\" # \"unsupervised\"\n", - "mode = \"none\" # \"none\" # \"log_standardized\"\n" + "mode = \"none\" # \"none\" # \"log_standardized\"" ] }, { @@ -92,7 +93,7 @@ "\n", "# load unscaled data\n", "artifact = run.use_artifact(dataset)\n", - "data_dir = artifact.download()\n" + "data_dir = artifact.download()" ] }, { @@ -127,7 +128,7 @@ " \"buy_sell\",\n", " \"day_vol\",\n", " \"myn\",\n", - "]\n" + "]" ] }, { @@ -161,7 +162,7 @@ " # load test set\n", " test = pd.read_parquet(\n", " Path(data_dir, \"test_set\"), engine=\"fastparquet\", columns=columns\n", - " )\n" + " )" ] }, { @@ -189,7 +190,7 @@ " \"price_ex_lag\",\n", " \"day_vol\",\n", " \"myn\",\n", - "]\n" + "]" ] }, { @@ -307,15 +308,14 @@ " data: pd.DataFrame,\n", " mode: Literal[\"log_standarized\", \"none\"] = \"log_standardized\",\n", ") -> pd.DataFrame:\n", - " \"\"\"\n", - " Create features, impute, and scale.\n", + " \"\"\"Create features, impute, and scale.\n", "\n", " Args:\n", " data (pd.DataFrame): input data frame.\n", + "\n", " Returns:\n", " pd.DataFrame: updated data frame.\n", " \"\"\"\n", - "\n", " # set up df, overwrite later\n", " x = pd.DataFrame(data={\"TRADE_PRICE\": data[\"TRADE_PRICE\"]}, index=data.index)\n", "\n", @@ -439,7 +439,7 @@ " data[\"ROOT\"].astype(str).values.reshape(-1, 1)\n", " )\n", " print(\"transform (val + test)\")\n", - " except NotFittedError as e:\n", + " except NotFittedError:\n", " x[num_cols] = scaler.fit_transform(x[num_cols])\n", " x[\"option_type\"] = oe_option_type.fit_transform(\n", " data[\"OPTION_TYPE\"].astype(str).values.reshape(-1, 1)\n", @@ -511,7 +511,7 @@ " x[\"root\"] = data[\"ROOT\"]\n", "\n", " x[\"buy_sell\"] = data[\"buy_sell\"].astype(\"int8\")\n", - " return x\n" + " return x" ] }, { @@ -566,7 +566,7 @@ " gc.collect()\n", " dataset.add_reference(output_path)\n", "\n", - "run.log_artifact(dataset)\n" + "run.log_artifact(dataset)" ] }, { @@ -580,7 +580,6 @@ "# save scaler to pickle\n", "\n", "if strategy == \"supervised\":\n", - "\n", " scalers = {\n", " 
\"scaler\": scaler,\n", " \"oe_option_type\": oe_option_type,\n", @@ -589,12 +588,12 @@ " }\n", " uri_scalers = f\"gs://thesis-bucket-option-trade-classification/data/preprocessed/{name}/scalers.sklearn\"\n", " with fs.open(uri_scalers, \"wb\") as f:\n", - " pickle.dump(scalers, f, protocol=4) \n", + " pickle.dump(scalers, f, protocol=4)\n", "\n", " # log scaler to wandb\n", " scaler = wandb.Artifact(name=f\"{name}_scaler\", type=\"scaler\")\n", " scaler.add_reference(uri_scalers)\n", - " run.log_artifact(scaler)\n" + " run.log_artifact(scaler)" ] }, { @@ -605,7 +604,7 @@ }, "outputs": [], "source": [ - "run.finish()\n" + "run.finish()" ] }, { @@ -656,7 +655,7 @@ " *features_classical,\n", " *features_size,\n", " \"buy_sell\", # add here and remove later\n", - "]\n" + "]" ] }, { @@ -674,7 +673,7 @@ " \"gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set_20.parquet\",\n", " engine=\"fastparquet\",\n", " columns=features_classical_size,\n", - ")\n" + ")" ] }, { @@ -689,7 +688,7 @@ "X = pd.concat([train, val])\n", "X.drop(columns=[\"buy_sell\"], inplace=True)\n", "# assign zeros to train set and ones to test set\n", - "y = [0] * len(train) + [1] * len(val)\n" + "y = [0] * len(train) + [1] * len(val)" ] }, { @@ -698,7 +697,7 @@ "metadata": {}, "outputs": [], "source": [ - "X.columns\n" + "X.columns" ] }, { @@ -715,7 +714,7 @@ " logging_level=\"Silent\",\n", " random_seed=42,\n", " eval_metric=\"Accuracy\",\n", - ")\n" + ")" ] }, { @@ -727,7 +726,7 @@ "X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=0.2, random_state=42, shuffle=True\n", ")\n", - "clf.fit(X_train, y_train, eval_set=(X_test, y_test))\n" + "clf.fit(X_train, y_train, eval_set=(X_test, y_test))" ] }, { @@ -736,7 +735,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_pred = clf.predict(X_test)\n" + "y_pred = clf.predict(X_test)" ] }, { @@ -746,7 +745,7 @@ "outputs": [], "source": [ "# use mcc as data is imbalanced 3/4 train set, 1/4 val set\n", - "print(matthews_corrcoef(y_test, y_pred))\n" + "print(matthews_corrcoef(y_test, y_pred))" ] }, { @@ -758,7 +757,7 @@ "feature_importance = clf.get_feature_importance(\n", " prettified=True, type=\"FeatureImportance\"\n", ")\n", - "feature_importance\n" + "feature_importance" ] }, { @@ -767,7 +766,7 @@ "metadata": {}, "outputs": [], "source": [ - "feature_importance.to_csv(\"feature_importance_gbm_classical_size.csv\")\n" + "feature_importance.to_csv(\"feature_importance_gbm_classical_size.csv\")" ] }, { @@ -795,7 +794,7 @@ " results.append({\"col\": col, \"static\": res.statistic, \"pvalue\": res.pvalue})\n", "\n", "results = pd.DataFrame(results)\n", - "results.to_csv(\"kolmogorov_smirnov.csv\")\n" + "results.to_csv(\"kolmogorov_smirnov.csv\")" ] }, { @@ -824,10 +823,11 @@ }, "outputs": [], "source": [ - "from otc.features.build_features import features_classical_size\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", - "from matplotlib import rc" + "from matplotlib import rc\n", + "\n", + "from otc.features.build_features import features_classical_size" ] }, { @@ -871,15 +871,30 @@ }, "outputs": [], "source": [ - "cols_clearname = [\"trade price\", \"bid (ex)\", \"ask (ex)\", \"ask (best)\",\n", - " \"bid (best)\", \"price lag (ex)\", \"price lead (ex)\",\n", - " \"price lag (all)\", \"price lead (all)\",\n", - " \"price chg. lead (ex)\", \"price chg. lag (ex)\",\n", - " \"price chg. lead (all)\", \"price chg. lag (all)\",\n", - " \"prox (ex)\", \"prox (best)\", \"bid ask size ratio (ex)\",\n", - " \"rel. 
bid size (ex)\", \"rel. ask size (ex)\", \"trade size\",\n", - " \"bid size (ex)\", \"ask size (ex)\", \"depth (ex)\"\n", - " ]" + "cols_clearname = [\n", + " \"trade price\",\n", + " \"bid (ex)\",\n", + " \"ask (ex)\",\n", + " \"ask (best)\",\n", + " \"bid (best)\",\n", + " \"price lag (ex)\",\n", + " \"price lead (ex)\",\n", + " \"price lag (all)\",\n", + " \"price lead (all)\",\n", + " \"price chg. lead (ex)\",\n", + " \"price chg. lag (ex)\",\n", + " \"price chg. lead (all)\",\n", + " \"price chg. lag (all)\",\n", + " \"prox (ex)\",\n", + " \"prox (best)\",\n", + " \"bid ask size ratio (ex)\",\n", + " \"rel. bid size (ex)\",\n", + " \"rel. ask size (ex)\",\n", + " \"trade size\",\n", + " \"bid size (ex)\",\n", + " \"ask size (ex)\",\n", + " \"depth (ex)\",\n", + "]" ] }, { @@ -924,12 +939,11 @@ "plt.rcParams.update(params)\n", "rc(\"text\", usetex=True)\n", "\n", - "plt.rc('text.latex', preamble=r'\\usepackage{amsmath}\\usepackage[utf8]{inputenc}')\n", + "plt.rc(\"text.latex\", preamble=r\"\\usepackage{amsmath}\\usepackage[utf8]{inputenc}\")\n", "\n", "cmap = mpl.colormaps.get_cmap(\"plasma\")\n", "\n", "\n", - "\n", "# https://ranocha.de/blog/colors/\n", "# Standard SciencePlots color cycle\n", "mpl.rcParams[\"axes.prop_cycle\"] = mpl.cycler(\n", @@ -938,11 +952,35 @@ "\n", "# line cyclers adapted to colourblind people\n", "from cycler import cycler\n", - "line_cycler = (cycler(color=[\"#E69F00\", \"#56B4E9\", \"#009E73\", \"#0072B2\", \"#D55E00\", \"#CC79A7\", \"#F0E442\"]) # + cycler(linestyle=[\"-\", \"--\", \"-.\", \":\", \"-\", \"--\", \"-.\"])\n", - " )\n", - "marker_cycler = (cycler(color=[\"#E69F00\", \"#56B4E9\", \"#009E73\", \"#0072B2\", \"#D55E00\", \"#CC79A7\", \"#F0E442\"]) +\n", - " cycler(linestyle=[\"none\", \"none\", \"none\", \"none\", \"none\", \"none\", \"none\"]) +\n", - " cycler(marker=[\"4\", \"2\", \"3\", \"1\", \"+\", \"x\", \".\"]))\n", + "\n", + "line_cycler = (\n", + " cycler(\n", + " color=[\n", + " \"#E69F00\",\n", + " \"#56B4E9\",\n", + " \"#009E73\",\n", + " \"#0072B2\",\n", + " \"#D55E00\",\n", + " \"#CC79A7\",\n", + " \"#F0E442\",\n", + " ]\n", + " ) # + cycler(linestyle=[\"-\", \"--\", \"-.\", \":\", \"-\", \"--\", \"-.\"])\n", + ")\n", + "marker_cycler = (\n", + " cycler(\n", + " color=[\n", + " \"#E69F00\",\n", + " \"#56B4E9\",\n", + " \"#009E73\",\n", + " \"#0072B2\",\n", + " \"#D55E00\",\n", + " \"#CC79A7\",\n", + " \"#F0E442\",\n", + " ]\n", + " )\n", + " + cycler(linestyle=[\"none\", \"none\", \"none\", \"none\", \"none\", \"none\", \"none\"])\n", + " + cycler(marker=[\"4\", \"2\", \"3\", \"1\", \"+\", \"x\", \".\"])\n", + ")\n", "\n", "plt.rc(\"axes\", prop_cycle=line_cycler)" ] @@ -971,25 +1009,22 @@ "index = 0\n", "\n", "for i, col in tqdm(enumerate(cols)):\n", - "\n", - " \n", " r = i // 4\n", " c = i % 4\n", "\n", - " \n", " ax[r][c].acorr(X[col].astype(float), usevlines=True, normed=True, maxlags=20, lw=1)\n", " ax[r][c].set_title(cols_clearname[index])\n", "\n", - " index +=1\n", + " index += 1\n", "\n", "# remove empty plots\n", "fig.delaxes(ax[5][2])\n", "fig.delaxes(ax[5][3])\n", "\n", "plt.savefig(\n", - " f\"../reports/Graphs/auto_corr_features.pdf\",\n", + " \"../reports/Graphs/auto_corr_features.pdf\",\n", " bbox_inches=\"tight\",\n", - ")\n" + ")" ] } ], diff --git a/notebooks/3.0d-mb-adv_val.ipynb b/notebooks/3.0d-mb-adv_val.ipynb index d9bdf0c8..cbd3abd0 100644 --- a/notebooks/3.0d-mb-adv_val.ipynb +++ b/notebooks/3.0d-mb-adv_val.ipynb @@ -23,10 +23,9 @@ "import pandas as pd\n", "import wandb\n", "from catboost import 
CatBoostClassifier, Pool\n", - "from tqdm.auto import tqdm\n", - "\n", "from sklearn.metrics import matthews_corrcoef\n", "from sklearn.model_selection import train_test_split\n", + "from tqdm.auto import tqdm\n", "\n", "sys.path.append(\"..\")\n", "from otc.features.build_features import (\n", @@ -34,7 +33,7 @@ " features_classical,\n", " features_classical_size,\n", " features_ml,\n", - ")\n" + ")" ] }, { @@ -48,7 +47,7 @@ "STRATEGY = \"supervised\" # \"supervised\" # \"transfer\"\n", "\n", "# ise-trained models, supervised/semisupervised\n", - "models = [\"classical\",\"classical-size\",\"ml\"]\n" + "models = [\"classical\", \"classical-size\", \"ml\"]" ] }, { @@ -70,7 +69,7 @@ "outputs": [], "source": [ "# set project name. Required to access files and artefacts\n", - "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n" + "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"" ] }, { @@ -90,7 +89,7 @@ "run = wandb.init(project=\"thesis\", entity=\"fbv\")\n", "\n", "artifact = run.use_artifact(dataset)\n", - "data_dir = artifact.download()\n" + "data_dir = artifact.download()" ] }, { @@ -120,7 +119,9 @@ "metadata": {}, "outputs": [], "source": [ - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=len(val), random_state=42)" + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=len(val), random_state=42\n", + ")" ] }, { @@ -149,19 +150,17 @@ "}\n", "\n", "for feature_str in tqdm(models):\n", - "\n", " fs = FEATURE_MAP.get(feature_str)\n", " # filter categorical features that are in subset and get cardinality\n", " cat_features_sub = [tup[0] for tup in features_categorical if tup[0] in fs]\n", - " \n", + "\n", " train_pool = Pool(\n", " data=X_train.loc[:, fs],\n", " label=y_train,\n", " cat_features=cat_features_sub,\n", " )\n", "\n", - " model = CatBoostClassifier(task_type = \"GPU\").fit(train_pool)\n", - "\n", + " model = CatBoostClassifier(task_type=\"GPU\").fit(train_pool)\n", "\n", " test_pool = Pool(\n", " data=X_test.loc[:, fs],\n", @@ -170,10 +169,9 @@ " )\n", "\n", " mcc = matthews_corrcoef(y_test, model.predict(test_pool))\n", - " \n", + "\n", " print(feature_str)\n", - " print(mcc)\n", - " " + " print(mcc)" ] }, { diff --git a/notebooks/4.0a-mb-logistic-regression.ipynb b/notebooks/4.0a-mb-logistic-regression.ipynb index 105eecfc..789bd1f8 100644 --- a/notebooks/4.0a-mb-logistic-regression.ipynb +++ b/notebooks/4.0a-mb-logistic-regression.ipynb @@ -9,23 +9,21 @@ }, "outputs": [], "source": [ + "import math\n", "import os\n", "import sys\n", - "\n", - "import math\n", "from pathlib import Path\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import torch\n", "import wandb\n", - "from torch import nn\n", "from torch import nn, optim\n", "from tqdm.auto import tqdm\n", "\n", "sys.path.append(\"..\")\n", - "from otc.data.dataset import TabDataset\n", "from otc.data.dataloader import TabDataLoader\n", + "from otc.data.dataset import TabDataset\n", "from otc.features.build_features import features_classical_size\n", "from otc.optim.early_stopping import EarlyStopping" ] @@ -55,7 +53,7 @@ "\n", "dataset = \"fbv/thesis/ise_supervised_log_standardized_clipped:latest\"\n", "artifact = run.use_artifact(dataset)\n", - "data_dir = artifact.download()\n" + "data_dir = artifact.download()" ] }, { @@ -71,11 +69,15 @@ "frac = 1\n", "\n", "# sample\n", - "X_train = pd.read_parquet(Path(data_dir, \"train_set.parquet\"), engine=\"fastparquet\").sample(frac=frac)\n", + "X_train = pd.read_parquet(\n", + " Path(data_dir, 
\"train_set.parquet\"), engine=\"fastparquet\"\n", + ").sample(frac=frac)\n", "y_train = X_train[\"buy_sell\"]\n", "X_train = X_train[features_classical_size]\n", "\n", - "X_val = pd.read_parquet(Path(data_dir, \"val_set.parquet\"), engine=\"fastparquet\").sample(frac=frac)\n", + "X_val = pd.read_parquet(Path(data_dir, \"val_set.parquet\"), engine=\"fastparquet\").sample(\n", + " frac=frac\n", + ")\n", "y_val = X_val[\"buy_sell\"]\n", "X_val = X_val[features_classical_size]\n", "\n", @@ -118,7 +120,7 @@ "test_data = TabDataset(X_test, y_test)\n", "\n", "dl_params = {\n", - " \"batch_size\": 32768, \n", + " \"batch_size\": 32768,\n", " \"device\": \"cuda\",\n", " \"shuffle\": True,\n", "}\n", @@ -129,7 +131,7 @@ " training_data.x_cont,\n", " training_data.weight,\n", " training_data.y,\n", - " **dl_params\n", + " **dl_params,\n", ")\n", "val_loader = TabDataLoader(\n", " val_data.x_cat, val_data.x_cont, val_data.weight, val_data.y, **dl_params\n", @@ -151,14 +153,15 @@ "source": [ "optim_params = {\"lr\": 1e-4, \"weight_decay\": 0.00001}\n", "\n", - "clf = LogisticRegression(input_size=X_train.shape[1],num_classes=1).to(\"cuda\")\n", + "clf = LogisticRegression(input_size=X_train.shape[1], num_classes=1).to(\"cuda\")\n", "\n", "criterion = nn.BCEWithLogitsLoss()\n", "\n", - "optimizer = optim.AdamW(clf.parameters(),\n", + "optimizer = optim.AdamW(\n", + " clf.parameters(),\n", " lr=optim_params[\"lr\"],\n", " weight_decay=optim_params[\"weight_decay\"],\n", - ")\n" + ")" ] }, { @@ -181,69 +184,65 @@ "\n", "\n", "for epoch in tqdm(range(epochs)):\n", - "\n", " # perform training\n", " loss_in_epoch_train = 0\n", "\n", " batch = 0\n", - " \n", + "\n", " for x_cat, x_cont, weights, targets in train_loader:\n", - " \n", " clf.train()\n", " optimizer.zero_grad()\n", "\n", - " with torch.autocast(device_type='cuda', dtype=torch.float16):\n", + " with torch.autocast(device_type=\"cuda\", dtype=torch.float16):\n", " logits = clf(x_cat, x_cont).flatten()\n", " train_loss = criterion(logits, targets)\n", "\n", " scaler.scale(train_loss).backward()\n", " scaler.step(optimizer)\n", " scaler.update()\n", - " \n", + "\n", " # add the mini-batch training loss to epoch loss\n", " loss_in_epoch_train += train_loss # .item()\n", " wandb.log({\"train_loss_step\": train_loss, \"epoch\": epoch, \"batch\": batch})\n", - " \n", + "\n", " batch += 1\n", - " step +=1\n", + " step += 1\n", "\n", " clf.eval()\n", " loss_in_epoch_val = 0.0\n", " correct = 0\n", - " \n", + "\n", " with torch.no_grad():\n", " for x_cat, x_cont, weights, targets in val_loader:\n", - " \n", " # for my implementation\n", " logits = clf(x_cat, x_cont).flatten()\n", " logits = logits.flatten()\n", "\n", " val_loss = criterion(logits, targets)\n", - " \n", + "\n", " # get probabilities and round to nearest integer\n", " preds = torch.sigmoid(logits).round()\n", " correct += (preds == targets).sum().item()\n", "\n", " loss_in_epoch_val += val_loss # val_loss #.item()\n", " wandb.log({\"val_loss_step\": val_loss, \"epoch\": epoch, \"batch\": batch})\n", - " \n", - " batch +=1 \n", + "\n", + " batch += 1\n", "\n", " # loss average over all batches\n", " train_loss = loss_in_epoch_train / len(train_loader)\n", " val_loss = loss_in_epoch_val / len(val_loader)\n", - " \n", + "\n", " # correct samples / no samples\n", " val_accuracy = correct / len(X_val)\n", " if best_accuracy < val_accuracy:\n", " best_accuracy = val_accuracy\n", " best_step = step\n", - " \n", - " \n", - " wandb.log({\"train_loss\": train_loss, 'epoch': epoch})\n", - " 
wandb.log({\"val_loss\": val_loss, 'epoch': epoch})\n", - " # wandb.log({\"val_accuracy\": val_accuracy, 'epoch': epoch}) \n", - " \n", + "\n", + " wandb.log({\"train_loss\": train_loss, \"epoch\": epoch})\n", + " wandb.log({\"val_loss\": val_loss, \"epoch\": epoch})\n", + " # wandb.log({\"val_accuracy\": val_accuracy, 'epoch': epoch})\n", + "\n", " print(f\"train:{train_loss} val:{val_loss}\")\n", " print(f\"val accuracy:{val_accuracy}\")\n", "\n", @@ -251,7 +250,7 @@ " early_stopping(-val_accuracy)\n", " if early_stopping.early_stop or math.isnan(train_loss) or math.isnan(val_loss):\n", " print(\"early stopping now.\")\n", - " break\n" + " break" ] }, { @@ -273,7 +272,7 @@ " # https://stackoverflow.com/a/66910866/5755604\n", " preds = torch.sigmoid(logits.squeeze())\n", " y_pred.append(preds.detach().cpu().numpy())\n", - " y_true.append(targets.detach().cpu().numpy()) \n", + " y_true.append(targets.detach().cpu().numpy())\n", "\n", "# round prediction to nearest int\n", "y_pred = np.rint(np.concatenate(y_pred))\n", diff --git a/notebooks/4.0b-mb-fttransformer.ipynb b/notebooks/4.0b-mb-fttransformer.ipynb index 162d7ecf..473a9f8a 100644 --- a/notebooks/4.0b-mb-fttransformer.ipynb +++ b/notebooks/4.0b-mb-fttransformer.ipynb @@ -10,17 +10,17 @@ "outputs": [], "source": [ "import glob\n", - "import os\n", "import math\n", + "import os\n", "import sys\n", "from pathlib import Path\n", "\n", "import numpy as np\n", "import pandas as pd\n", - "import wandb\n", "import torch\n", - "from torch import optim, nn\n", - "from tqdm.auto import tqdm\n" + "import wandb\n", + "from torch import nn, optim\n", + "from tqdm.auto import tqdm" ] }, { @@ -31,11 +31,11 @@ "outputs": [], "source": [ "sys.path.append(\"..\")\n", - "from otc.models.fttransformer import FeatureTokenizer, FTTransformer, Transformer\n", - "from otc.models.activation import ReGLU\n", - "from otc.data.dataset import TabDataset\n", "from otc.data.dataloader import TabDataLoader\n", + "from otc.data.dataset import TabDataset\n", "from otc.features.build_features import features_classical_size\n", + "from otc.models.activation import ReGLU\n", + "from otc.models.fttransformer import FeatureTokenizer, FTTransformer, Transformer\n", "from otc.optim.early_stopping import EarlyStopping\n", "from otc.optim.scheduler import CosineWarmupScheduler" ] @@ -65,7 +65,7 @@ "\n", "dataset = \"fbv/thesis/ise_supervised_log_standardized_clipped:latest\"\n", "artifact = run.use_artifact(dataset)\n", - "data_dir = artifact.download()\n" + "data_dir = artifact.download()" ] }, { @@ -98,7 +98,7 @@ "\n", "# eps = 0.1\n", "# y_train[np.where(y_train == 0)] = eps\n", - "# y_train[np.where(y_train == 1)] = 1.0 - eps\n" + "# y_train[np.where(y_train == 1)] = 1.0 - eps" ] }, { @@ -177,8 +177,8 @@ "optim_params = {\"lr\": 1e-4, \"weight_decay\": 0.00001}\n", "\n", "module_params = {\n", - " \"transformer\": Transformer(**transformer_kwargs), \n", - " \"feature_tokenizer\": FeatureTokenizer(**feature_tokenizer_kwargs), # noqa: E501\n", + " \"transformer\": Transformer(**transformer_kwargs),\n", + " \"feature_tokenizer\": FeatureTokenizer(**feature_tokenizer_kwargs),\n", " \"cat_features\": None,\n", " \"cat_cardinalities\": [],\n", "}\n", @@ -193,7 +193,7 @@ "# wandb.log(transformer_kwargs)\n", "# wandb.log(optim_params)\n", "# wandb.log(feature_tokenizer_kwargs)\n", - "# wandb.log(dl_params)\n" + "# wandb.log(dl_params)" ] }, { @@ -214,7 +214,7 @@ " training_data.x_cont,\n", " training_data.weight,\n", " training_data.y,\n", - " **dl_params\n", + " **dl_params,\n", 
")\n", "val_loader = TabDataLoader(\n", " val_data.x_cat, val_data.x_cont, val_data.weight, val_data.y, **dl_params\n", @@ -222,7 +222,7 @@ "\n", "test_loader = TabDataLoader(\n", " test_data.x_cat, test_data.x_cont, test_data.weight, test_data.y, **dl_params\n", - ")\n" + ")" ] }, { @@ -249,7 +249,7 @@ "\n", "scheduler = CosineWarmupScheduler(\n", " optimizer=optimizer, warmup=warmup, max_iters=max_iters\n", - ")\n" + ")" ] }, { @@ -262,7 +262,6 @@ "outputs": [], "source": [ "def checkpoint(model, filename):\n", - "\n", " # remove old files\n", " for filename in glob.glob(f\"checkpoints/{run.id}*\"):\n", " os.remove(filename)\n", @@ -273,7 +272,7 @@ "\n", " # save new file\n", " print(\"saving new checkpoints.\")\n", - " torch.save(model.state_dict(), os.path.join(dir_checkpoints, f\"{run.id}*\"))\n" + " torch.save(model.state_dict(), os.path.join(dir_checkpoints, f\"{run.id}*\"))" ] }, { @@ -300,14 +299,12 @@ "best_step = -1\n", "\n", "for epoch in tqdm(range(epochs)):\n", - "\n", " # perform training\n", " loss_in_epoch_train = 0\n", "\n", " batch = 0\n", "\n", " for x_cat, x_cont, _, targets in train_loader:\n", - "\n", " clf.train()\n", " optimizer.zero_grad()\n", "\n", @@ -324,7 +321,9 @@ "\n", " # add the mini-batch training loss to epoch loss\n", " loss_in_epoch_train += train_loss.item()\n", - " wandb.log({\"train_loss_step\": train_loss.item(), \"epoch\": epoch, \"batch\": batch})\n", + " wandb.log(\n", + " {\"train_loss_step\": train_loss.item(), \"epoch\": epoch, \"batch\": batch}\n", + " )\n", "\n", " batch += 1\n", " step += 1\n", @@ -335,7 +334,6 @@ "\n", " with torch.no_grad():\n", " for x_cat, x_cont, _, targets in val_loader:\n", - "\n", " # for my implementation\n", " logits = clf(x_cat, x_cont).flatten()\n", " logits = logits.flatten()\n", @@ -347,7 +345,9 @@ " correct += (preds == targets).sum().item()\n", "\n", " loss_in_epoch_val += val_loss.item()\n", - " wandb.log({\"val_loss_step\": val_loss.item(), \"epoch\": epoch, \"batch\": batch})\n", + " wandb.log(\n", + " {\"val_loss_step\": val_loss.item(), \"epoch\": epoch, \"batch\": batch}\n", + " )\n", "\n", " batch += 1\n", "\n", @@ -372,7 +372,7 @@ " early_stopping(-val_accuracy)\n", " if early_stopping.early_stop or math.isnan(train_loss) or math.isnan(val_loss):\n", " print(\"meh... 
early stopping\")\n", - " break\n" + " break" ] }, { @@ -385,7 +385,7 @@ "outputs": [], "source": [ "cp = glob.glob(f\"checkpoints/{run.id}*\")\n", - "print(cp)\n" + "print(cp)" ] }, { @@ -397,7 +397,7 @@ }, "outputs": [], "source": [ - "clf.load_state_dict(torch.load(cp[0]))\n" + "clf.load_state_dict(torch.load(cp[0]))" ] }, { @@ -412,7 +412,6 @@ "y_pred, y_true = [], []\n", "\n", "for x_cat, x_cont, _, targets in test_loader:\n", - "\n", " logits = clf(x_cat, x_cont).flatten()\n", " logits = logits.flatten()\n", "\n", @@ -420,14 +419,14 @@ " # https://stackoverflow.com/a/66910866/5755604\n", " preds = torch.sigmoid(logits.squeeze())\n", " y_pred.append(preds.detach().cpu().numpy())\n", - " y_true.append(targets.detach().cpu().numpy()) \n", + " y_true.append(targets.detach().cpu().numpy())\n", "\n", "# round prediction to nearest int\n", "y_pred = np.rint(np.concatenate(y_pred))\n", "y_true = np.concatenate(y_true)\n", "\n", - "acc = (y_pred == y_true).sum() / len(y_true) \n", - "print(acc)\n" + "acc = (y_pred == y_true).sum() / len(y_true)\n", + "print(acc)" ] } ], diff --git a/notebooks/4.0c-mb-feature-importances.ipynb b/notebooks/4.0c-mb-feature-importances.ipynb index a34f6032..be90b3f7 100644 --- a/notebooks/4.0c-mb-feature-importances.ipynb +++ b/notebooks/4.0c-mb-feature-importances.ipynb @@ -24,39 +24,33 @@ "outputs": [], "source": [ "import os\n", - "import sys\n", "import pickle\n", + "import sys\n", "from pathlib import Path\n", "\n", - "from catboost import CatBoostClassifier, Pool\n", - "\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", - "from matplotlib import rc\n", - "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "import pandas as pd\n", "import torch\n", + "from catboost import CatBoostClassifier, Pool\n", + "from matplotlib import rc\n", "from torch import nn\n", "\n", "sys.path.append(\"..\")\n", - "from otc.models.classical_classifier import ClassicalClassifier\n", - "\n", + "import wandb\n", "from sage import GroupedMarginalImputer, PermutationEstimator\n", + "from tqdm.auto import tqdm\n", "\n", + "from otc.data.dataloader import TabDataLoader\n", + "from otc.data.dataset import TabDataset\n", "from otc.features.build_features import (\n", " features_categorical,\n", " features_classical,\n", " features_classical_size,\n", " features_ml,\n", ")\n", - "\n", - "from otc.data.dataset import TabDataset\n", - "from otc.data.dataloader import TabDataLoader\n", - "from otc.features.build_features import features_classical_size\n", - "\n", - "import wandb\n", - "from tqdm.auto import tqdm" + "from otc.models.classical_classifier import ClassicalClassifier" ] }, { @@ -70,12 +64,12 @@ "source": [ "SEED = 42\n", "\n", - "np.random.seed(42) \n", + "np.random.seed(42)\n", "\n", "# set globally here\n", - "EXCHANGE = \"ise\" \n", - "STRATEGY = \"supervised\" \n", - "SUBSET = \"test\" \n", + "EXCHANGE = \"ise\"\n", + "STRATEGY = \"supervised\"\n", + "SUBSET = \"test\"\n", "\n", "\n", "# Change depending on model!\n", @@ -113,36 +107,44 @@ "outputs": [], "source": [ "def get_feature_groups(feature_names, feature_str):\n", - "\n", " fg_classical = {\n", - " 'chg_all_lead (grouped)': ['price_all_lead', 'chg_all_lead'],\n", - " 'chg_all_lag (grouped)': ['price_all_lag', 'chg_all_lag'],\n", - " 'chg_ex_lead (grouped)': ['price_ex_lead', 'chg_ex_lead'],\n", - " 'chg_ex_lag (grouped)': ['price_ex_lag', 'chg_ex_lag'],\n", - " 'quote_best (grouped)': ['BEST_ASK', 'BEST_BID', 'prox_best'],\n", - " 'quote_ex 
(grouped)': ['bid_ex', 'ask_ex','prox_ex' ],\n", - " 'TRADE_PRICE': ['TRADE_PRICE'],\n", - " }\n", - " \n", - " fg_size = {'size_ex (grouped)': [ 'bid_ask_size_ratio_ex', 'rel_bid_size_ex', 'rel_ask_size_ex', 'bid_size_ex', 'ask_size_ex','depth_ex'], 'TRADE_SIZE': ['TRADE_SIZE']}\n", - " \n", + " \"chg_all_lead (grouped)\": [\"price_all_lead\", \"chg_all_lead\"],\n", + " \"chg_all_lag (grouped)\": [\"price_all_lag\", \"chg_all_lag\"],\n", + " \"chg_ex_lead (grouped)\": [\"price_ex_lead\", \"chg_ex_lead\"],\n", + " \"chg_ex_lag (grouped)\": [\"price_ex_lag\", \"chg_ex_lag\"],\n", + " \"quote_best (grouped)\": [\"BEST_ASK\", \"BEST_BID\", \"prox_best\"],\n", + " \"quote_ex (grouped)\": [\"bid_ex\", \"ask_ex\", \"prox_ex\"],\n", + " \"TRADE_PRICE\": [\"TRADE_PRICE\"],\n", + " }\n", + "\n", + " fg_size = {\n", + " \"size_ex (grouped)\": [\n", + " \"bid_ask_size_ratio_ex\",\n", + " \"rel_bid_size_ex\",\n", + " \"rel_ask_size_ex\",\n", + " \"bid_size_ex\",\n", + " \"ask_size_ex\",\n", + " \"depth_ex\",\n", + " ],\n", + " \"TRADE_SIZE\": [\"TRADE_SIZE\"],\n", + " }\n", + "\n", " fg_ml = {\n", " \"STRK_PRC\": [\"STRK_PRC\"],\n", " \"ttm\": [\"ttm\"],\n", " \"option_type\": [\"option_type\"],\n", - " \"root\":[\"root\"],\n", - " \"myn\":[\"myn\"],\n", - " \"day_vol\":[\"day_vol\"], \n", - " \"issue_type\":[\"issue_type\"],\n", + " \"root\": [\"root\"],\n", + " \"myn\": [\"myn\"],\n", + " \"day_vol\": [\"day_vol\"],\n", + " \"issue_type\": [\"issue_type\"],\n", " }\n", - " \n", + "\n", " if feature_str.endswith(\"classical\"):\n", - " feature_groups = group_names = fg_classical \n", + " feature_groups = group_names = fg_classical\n", " if feature_str.endswith(\"classical-size\"):\n", - " feature_groups = group_names = {**fg_classical , **fg_size}\n", + " feature_groups = group_names = {**fg_classical, **fg_size}\n", " if feature_str.endswith(\"ml\"):\n", - " feature_groups = group_names = {**fg_classical, **fg_size, **fg_ml} \n", - " \n", + " feature_groups = group_names = {**fg_classical, **fg_size, **fg_ml}\n", "\n", " # Group indices\n", " groups = []\n", @@ -152,7 +154,7 @@ " ind_list.append(feature_names.index(feature))\n", " groups.append(ind_list)\n", "\n", - " return groups, group_names\n" + " return groups, group_names" ] }, { @@ -172,7 +174,11 @@ "artifact = run.use_artifact(dataset)\n", "data_dir = artifact.download()\n", "\n", - "data = pd.read_parquet(Path(data_dir, \"test_set.parquet\"), engine=\"fastparquet\", columns=[*features_classical_size, \"buy_sell\"])\n", + "data = pd.read_parquet(\n", + " Path(data_dir, \"test_set.parquet\"),\n", + " engine=\"fastparquet\",\n", + " columns=[*features_classical_size, \"buy_sell\"],\n", + ")\n", "\n", "y_test = data[\"buy_sell\"]\n", "X_test = data.drop(columns=\"buy_sell\")\n", @@ -223,20 +229,27 @@ "# compare benchmarks\n", "configs = [\n", " [(\"quote\", \"best\"), (\"quote\", \"ex\"), (\"rev_tick\", \"all\")],\n", - " [(\"trade_size\", \"ex\"), (\"quote\", \"best\"), (\"quote\", \"ex\"), (\"depth\", \"best\"), (\"depth\", \"ex\"), (\"rev_tick\", \"all\")] \n", + " [\n", + " (\"trade_size\", \"ex\"),\n", + " (\"quote\", \"best\"),\n", + " (\"quote\", \"ex\"),\n", + " (\"depth\", \"best\"),\n", + " (\"depth\", \"ex\"),\n", + " (\"rev_tick\", \"all\"),\n", + " ],\n", "]\n", "\n", "results = []\n", "for config in configs:\n", - " \n", - " groups, group_names = get_feature_groups(X_importance.columns.tolist(), \"classical-size\")\n", - " \n", + " groups, group_names = get_feature_groups(\n", + " X_importance.columns.tolist(), 
\"classical-size\"\n", + " )\n", + "\n", " clf = ClassicalClassifier(layers=config, random_state=SEED, strategy=\"random\")\n", " # only set headers etc, no leakage\n", " clf.fit(X=X_test.head(5), y=y_test.head(5))\n", - " \n", + "\n", " def call_classical(X):\n", - " \n", " pred = clf.predict_proba(X)\n", " # max_class = np.argmax(pred, axis=-1)\n", " # return max_class\n", @@ -245,12 +258,14 @@ " # apply group based imputation + estimate importances in terms of zero-one loss\n", " imputer = GroupedMarginalImputer(call_classical, X_importance.values, groups)\n", " estimator = PermutationEstimator(imputer, \"zero one\")\n", - " \n", + "\n", " # calculate values over entire test set\n", " sage_values = estimator(X_test.values, y_test.values.clip(0))\n", - " \n", + "\n", " # save sage values + std deviation to data frame\n", - " result = pd.DataFrame(index=group_names, data={\"values\": sage_values.values, \"std\": sage_values.std})\n", + " result = pd.DataFrame(\n", + " index=group_names, data={\"values\": sage_values.values, \"std\": sage_values.std}\n", + " )\n", " results.append(result)" ] }, @@ -274,7 +289,7 @@ "results_df = pd.concat(results, axis=1, keys=names)\n", "\n", "# flatten column names (required to save to parquet)\n", - "results_df.columns = [' '.join(col).strip() for col in results_df.columns.values]" + "results_df.columns = [\" \".join(col).strip() for col in results_df.columns.values]" ] }, { @@ -298,7 +313,9 @@ "source": [ "KEY = f\"{EXCHANGE}_{STRATEGY}_{SUBSET}_classical_feature_importance_{sample_size}\"\n", "\n", - "URI_FI_CLASSICAL = f\"gs://thesis-bucket-option-trade-classification/data/results/{KEY}.parquet\"\n", + "URI_FI_CLASSICAL = (\n", + " f\"gs://thesis-bucket-option-trade-classification/data/results/{KEY}.parquet\"\n", + ")\n", "\n", "results_df.to_parquet(URI_FI_CLASSICAL)\n", "\n", @@ -340,7 +357,11 @@ "artifact = run.use_artifact(dataset)\n", "data_dir = artifact.download()\n", "\n", - "data = pd.read_parquet(Path(data_dir, \"test_set.parquet\"), engine=\"fastparquet\", columns=[*features_ml, \"buy_sell\"])\n", + "data = pd.read_parquet(\n", + " Path(data_dir, \"test_set.parquet\"),\n", + " engine=\"fastparquet\",\n", + " columns=[*features_ml, \"buy_sell\"],\n", + ")\n", "\n", "y_test = data[\"buy_sell\"]\n", "X_test = data.drop(columns=\"buy_sell\")\n", @@ -372,64 +393,64 @@ }, "outputs": [], "source": [ - "configs = [(\"classical\", \"1gzk7msy_CatBoostClassifier_default.cbm:latest\"),\n", + "configs = [\n", + " (\"classical\", \"1gzk7msy_CatBoostClassifier_default.cbm:latest\"),\n", " (\"classical-size\", \"3vntumoi_CatBoostClassifier_default.cbm:latest\"),\n", " (\"ml\", \"2t5zo50f_CatBoostClassifier_default.cbm:latest\"),\n", " (\"semi-classical\", \"37lymmzc_CatBoostClassifier_default.cbm:latest\"),\n", " (\"semi-classical-size\", \"1vmti6db_CatBoostClassifier_default.cbm:latest\"),\n", - " (\"semi-ml\", \"t55nd8r0_CatBoostClassifier_default.cbm:latest\")]\n", + " (\"semi-ml\", \"t55nd8r0_CatBoostClassifier_default.cbm:latest\"),\n", + "]\n", "\n", "results = []\n", "\n", "for feature_str, model in configs:\n", - " \n", " # get feature names and slice to subset\n", " fs = FEATURE_MAP.get(feature_str)\n", " X_importance_fs = X_importance.loc[:, fs]\n", " X_importance_cols = X_importance_fs.columns.tolist()\n", - " \n", + "\n", " # calculate cat indices\n", " if feature_str.endswith(\"ml\"):\n", " cat_features = [t[0] for t in features_categorical]\n", " cat_idx = [X_importance_cols.index(f) for f in cat_features]\n", - " \n", + "\n", " # get 
groups\n", " groups, group_names = get_feature_groups(X_importance_cols, feature_str)\n", - " \n", + "\n", " # load model by identifier from wandb\n", " model_name = model.split(\"/\")[-1].split(\":\")[0]\n", - " \n", + "\n", " artifact = run.use_artifact(model)\n", " model_dir = artifact.download()\n", " clf = CatBoostClassifier()\n", " clf.load_model(fname=Path(model_dir, model_name))\n", - " \n", - " \n", + "\n", " # use callable instead of default catboost as it doesn't work with categoricals otherwise\n", - " pred=None\n", - " \n", + " pred = None\n", + "\n", " def call_catboost(X):\n", - " if feature_str.endswith(\"ml\"): \n", + " if feature_str.endswith(\"ml\"):\n", " # convert categorical to int\n", " X = pd.DataFrame(X, columns=X_importance.columns)\n", " # Update the selected columns in the original DataFrame\n", " X[cat_features] = X.iloc[:, cat_idx].astype(int)\n", " # pass cat indices\n", " return clf.predict_proba(Pool(X, cat_features=cat_idx))\n", - " else:\n", - " return clf.predict_proba(X)\n", - " \n", - " \n", + " return clf.predict_proba(X)\n", + "\n", " # apply group based imputation + estimate importances in terms of zero-one loss\n", " imputer = GroupedMarginalImputer(call_catboost, X_importance_fs, groups)\n", " # imputer = MarginalImputer(call_catboost, X_importance_fs)\n", " estimator = PermutationEstimator(imputer, \"zero one\")\n", - " \n", + "\n", " # calculate values over entire test set\n", - " sage_values = estimator(X_test.loc[:,fs].values, y_test.clip(0).values)\n", - " \n", + " sage_values = estimator(X_test.loc[:, fs].values, y_test.clip(0).values)\n", + "\n", " # save sage values + std deviation to data frame\n", - " result = pd.DataFrame(index=group_names, data={\"values\": sage_values.values, \"std\": sage_values.std})\n", + " result = pd.DataFrame(\n", + " index=group_names, data={\"values\": sage_values.values, \"std\": sage_values.std}\n", + " )\n", " # result = pd.DataFrame(index=X_importance_cols, data={\"values\": sage_values.values, \"std\": sage_values.std})\n", " results.append(result)" ] @@ -445,7 +466,7 @@ "source": [ "names = [f\"gbm({feature_str[0]})\" for feature_str in configs]\n", "results_df = pd.concat(results, axis=1, keys=names)\n", - "results_df.columns = [' '.join(col).strip() for col in results_df.columns.values]" + "results_df.columns = [\" \".join(col).strip() for col in results_df.columns.values]" ] }, { @@ -472,12 +493,14 @@ "# list to data frame + set human readable names\n", "names = [f\"gbm({feature_str[0]})\" for feature_str in configs]\n", "results_df = pd.concat(results, axis=1, keys=names)\n", - "results_df.columns = [' '.join(col).strip() for col in results_df.columns.values]\n", + "results_df.columns = [\" \".join(col).strip() for col in results_df.columns.values]\n", "\n", "# save to google clound and save identiifer\n", "KEY = f\"{EXCHANGE}_{STRATEGY}_{SUBSET}_gbm_feature_importance_{sample_size}\"\n", "\n", - "URI_FI_GBM = f\"gs://thesis-bucket-option-trade-classification/data/results/{KEY}.parquet\"\n", + "URI_FI_GBM = (\n", + " f\"gs://thesis-bucket-option-trade-classification/data/results/{KEY}.parquet\"\n", + ")\n", "\n", "results_df.to_parquet(URI_FI_GBM)\n", "\n", @@ -514,10 +537,10 @@ "source": [ "configs = [\n", " (\"classical\", \"3jpe46s1_TransformerClassifier_default.pkl:latest\"),\n", - " (\"classical-size\", \"1qx3ul4j_TransformerClassifier_default.pkl:latest\"), \n", + " (\"classical-size\", \"1qx3ul4j_TransformerClassifier_default.pkl:latest\"),\n", " (\"ml\", 
\"2h81aiow_TransformerClassifier_default.pkl:latest\"),\n", " (\"semi-classical\", \"12isqh2m_TransformerClassifier_default.pkl:latest\"),\n", - " (\"semi-classical-size\", \"2hv1nayy_TransformerClassifier_default.pkl:latest\"), \n", + " (\"semi-classical-size\", \"2hv1nayy_TransformerClassifier_default.pkl:latest\"),\n", " (\"semi-ml\", \"3jbqpp4r_TransformerClassifier_default.pkl:latest\"),\n", "]\n", "\n", @@ -531,32 +554,34 @@ " fs = FEATURE_MAP.get(feature_str)\n", " X_importance_fs = X_importance.loc[:, fs]\n", " X_importance_cols = X_importance_fs.columns.tolist()\n", - " \n", + "\n", " # calculate cat indices\n", " if feature_str.endswith(\"ml\"):\n", " cat_features = [t[0] for t in features_categorical]\n", " cat_idx = [X_importance_cols.index(f) for f in cat_features]\n", - " \n", + "\n", " # get groups\n", " groups, group_names = get_feature_groups(X_importance_cols, feature_str)\n", - " \n", + "\n", " model_name = model.split(\"/\")[-1].split(\":\")[0]\n", "\n", " artifact = run.use_artifact(model)\n", " model_dir = artifact.download()\n", "\n", - " with open(Path(model_dir, model_name), 'rb') as f:\n", + " with open(Path(model_dir, model_name), \"rb\") as f:\n", " clf = pickle.load(f)\n", - " \n", + "\n", " # apply group based imputation + estimate importances in terms of zero-one loss\n", " imputer = GroupedMarginalImputer(clf, X_importance_fs, groups)\n", " estimator = PermutationEstimator(imputer, \"zero one\")\n", - " \n", + "\n", " # calculate values over entire test set\n", - " sage_values = estimator(X_test.loc[:,fs].values, y_test.clip(0).values)\n", - " \n", + " sage_values = estimator(X_test.loc[:, fs].values, y_test.clip(0).values)\n", + "\n", " # save sage values + std deviation to data frame\n", - " result = pd.DataFrame(index=group_names, data={\"values\": sage_values.values, \"std\": sage_values.std})\n", + " result = pd.DataFrame(\n", + " index=group_names, data={\"values\": sage_values.values, \"std\": sage_values.std}\n", + " )\n", " results.append(result)" ] }, @@ -572,12 +597,14 @@ "# list to data frame + set human readable names\n", "names = [f\"fttransformer({feature_str[0]})\" for feature_str in configs]\n", "results_df = pd.concat(results, axis=1, keys=names)\n", - "results_df.columns = [' '.join(col).strip() for col in results_df.columns.values]\n", + "results_df.columns = [\" \".join(col).strip() for col in results_df.columns.values]\n", "\n", "# save to google clound and save identiifer\n", "KEY = f\"{EXCHANGE}_{STRATEGY}_{SUBSET}_fttransformer_feature_importance_{sample_size}\"\n", "\n", - "URI_FI_FTTRANSFORMER = f\"gs://thesis-bucket-option-trade-classification/data/results/{KEY}.parquet\"\n", + "URI_FI_FTTRANSFORMER = (\n", + " f\"gs://thesis-bucket-option-trade-classification/data/results/{KEY}.parquet\"\n", + ")\n", "\n", "results_df.to_parquet(URI_FI_FTTRANSFORMER)\n", "\n", @@ -627,7 +654,7 @@ "plt.rcParams.update(params)\n", "rc(\"text\", usetex=True)\n", "\n", - "plt.rc('text.latex', preamble=r'\\usepackage{amsmath}\\usepackage[utf8]{inputenc}')\n", + "plt.rc(\"text.latex\", preamble=r\"\\usepackage{amsmath}\\usepackage[utf8]{inputenc}\")\n", "\n", "CM = 1 / 2.54\n", "\n", @@ -651,10 +678,10 @@ "\n", "artifact = run.use_artifact(MODEL)\n", "model_dir = artifact.download()\n", - " \n", - "with open(Path(model_dir, model_name), 'rb') as f:\n", + "\n", + "with open(Path(model_dir, model_name), \"rb\") as f:\n", " model = pickle.load(f)\n", - " \n", + "\n", "clf = model.clf" ] }, @@ -672,7 +699,11 @@ "artifact = run.use_artifact(dataset)\n", 
"data_dir = artifact.download()\n", "\n", - "data = pd.read_parquet(Path(data_dir, \"test_set.parquet\"), engine=\"fastparquet\", columns=[*features_ml, \"buy_sell\"])\n", + "data = pd.read_parquet(\n", + " Path(data_dir, \"test_set.parquet\"),\n", + " engine=\"fastparquet\",\n", + " columns=[*features_ml, \"buy_sell\"],\n", + ")\n", "\n", "y_test = data[\"buy_sell\"]\n", "X_test = data.drop(columns=\"buy_sell\")" @@ -716,17 +747,72 @@ "\n", "\n", "# at mid\n", - "idx = [39342276, 39342363, 39342387, 39342437, 39342436, 39342428,\n", - " 39342464, 39342540, 39342608, 39342598, 39342620, 39342632,\n", - " 39342674, 39342781, 39342804, 39342824, 39342818, 39342821,\n", - " 39342861, 39342871, 39342894, 39342898, 39342931, 39342934,\n", - " 39342948, 39342954, 39342960, 39342969, 39342986, 39342987,\n", - " 39342991, 39342992, 39343036, 39343082, 39343100, 39343098,\n", - " 39343099, 39343101, 39343102, 39343109, 39343112, 39343124,\n", - " 39343128, 39343165, 39343193, 39343199, 39343211, 39343215,\n", - " 39343234, 39343242, 39343298, 39343346, 39343370, 39343390,\n", - " 39343412, 39343413, 39343415, 39343414, 39343426, 39343433,\n", - " 39343465, 39343464, 39343485, 39343498]" + "idx = [\n", + " 39342276,\n", + " 39342363,\n", + " 39342387,\n", + " 39342437,\n", + " 39342436,\n", + " 39342428,\n", + " 39342464,\n", + " 39342540,\n", + " 39342608,\n", + " 39342598,\n", + " 39342620,\n", + " 39342632,\n", + " 39342674,\n", + " 39342781,\n", + " 39342804,\n", + " 39342824,\n", + " 39342818,\n", + " 39342821,\n", + " 39342861,\n", + " 39342871,\n", + " 39342894,\n", + " 39342898,\n", + " 39342931,\n", + " 39342934,\n", + " 39342948,\n", + " 39342954,\n", + " 39342960,\n", + " 39342969,\n", + " 39342986,\n", + " 39342987,\n", + " 39342991,\n", + " 39342992,\n", + " 39343036,\n", + " 39343082,\n", + " 39343100,\n", + " 39343098,\n", + " 39343099,\n", + " 39343101,\n", + " 39343102,\n", + " 39343109,\n", + " 39343112,\n", + " 39343124,\n", + " 39343128,\n", + " 39343165,\n", + " 39343193,\n", + " 39343199,\n", + " 39343211,\n", + " 39343215,\n", + " 39343234,\n", + " 39343242,\n", + " 39343298,\n", + " 39343346,\n", + " 39343370,\n", + " 39343390,\n", + " 39343412,\n", + " 39343413,\n", + " 39343415,\n", + " 39343414,\n", + " 39343426,\n", + " 39343433,\n", + " 39343465,\n", + " 39343464,\n", + " 39343485,\n", + " 39343498,\n", + "]" ] }, { @@ -746,22 +832,22 @@ "cat_unique_counts = model.module_params[\"cat_cardinalities\"]\n", "\n", "dl_params = {\n", - " \"batch_size\": batch_size, \n", + " \"batch_size\": batch_size,\n", " \"shuffle\": False,\n", " \"device\": device,\n", "}\n", "\n", - "test_data = TabDataset(X_test[X_test.index.isin(idx)], y_test[y_test.index.isin(idx)], cat_features=cat_features, cat_unique_counts=cat_unique_counts)\n", + "test_data = TabDataset(\n", + " X_test[X_test.index.isin(idx)],\n", + " y_test[y_test.index.isin(idx)],\n", + " cat_features=cat_features,\n", + " cat_unique_counts=cat_unique_counts,\n", + ")\n", "\n", "\n", "test_loader = TabDataLoader(\n", - " test_data.x_cat,\n", - " test_data.x_cont,\n", - " test_data.weight,\n", - " test_data.y,\n", - " **dl_params\n", - ")\n", - "\n" + " test_data.x_cat, test_data.x_cont, test_data.weight, test_data.y, **dl_params\n", + ")" ] }, { @@ -824,20 +910,19 @@ "grads = []\n", "\n", "for i, block in enumerate(clf.transformer.blocks):\n", - "\n", " grad = block.attention.get_attn_gradients().detach()\n", " cam = block.attention.get_attn().detach()\n", - " \n", + "\n", " cams.append(cam)\n", " grads.append(grad)\n", - " 
\n", + "\n", " # reshape to [batch_size x num_head, num_tokens, num_tokens]\n", " cam = cam.reshape(-1, cam.shape[-1], cam.shape[-1])\n", " grad = grad.reshape(-1, grad.shape[-1], grad.shape[-1])\n", - " \n", + "\n", " # dot product\n", " cam = grad * cam\n", - " \n", + "\n", " # reshape to [batch_size, num_head, num_tokens, num_tokens]\n", " cam = cam.reshape(batch_size, -1, cam.shape[-1], cam.shape[-1])\n", " # clamp negative values, calculate mean over heads\n", @@ -874,10 +959,10 @@ "max_stack = 16\n", "\n", "for i in range(max_stack):\n", - " row = batch_probs[-i][0,1:]\n", + " row = batch_probs[-i][0, 1:]\n", " # row = test[np.newaxis,...]\n", " stack.append(row)\n", - " \n", + "\n", "stack_np = np.vstack(stack)" ] }, @@ -904,35 +989,37 @@ }, "outputs": [], "source": [ - "labels_sanitized = ['trade price',\n", - " 'bid (ex)',\n", - " 'ask (ex)',\n", - " 'ask (best)',\n", - " 'bid (best)',\n", - " 'price lag (ex)',\n", - " 'price lead (ex)',\n", - " 'price lag (all)',\n", - " 'price lead (all)',\n", - " 'chg lead (ex)',\n", - " 'chg lag (ex)',\n", - " 'chg lead (all)',\n", - " 'chg lag (all)',\n", - " 'prox (ex)',\n", - " 'prox (best)',\n", - " 'bid ask size ratio (ex)',\n", - " 'rel. bid size (ex)',\n", - " 'rel. ask size (ex)',\n", - " 'trade size',\n", - " 'bid size (ex)',\n", - " 'ask size (ex)',\n", - " 'depth (ex)',\n", - " 'strike price',\n", - " 'time to maturity',\n", - " 'moneyness',\n", - " 'day volume',\n", - " 'option type',\n", - " 'issue type',\n", - " 'root']" + "labels_sanitized = [\n", + " \"trade price\",\n", + " \"bid (ex)\",\n", + " \"ask (ex)\",\n", + " \"ask (best)\",\n", + " \"bid (best)\",\n", + " \"price lag (ex)\",\n", + " \"price lead (ex)\",\n", + " \"price lag (all)\",\n", + " \"price lead (all)\",\n", + " \"chg lead (ex)\",\n", + " \"chg lag (ex)\",\n", + " \"chg lead (all)\",\n", + " \"chg lag (all)\",\n", + " \"prox (ex)\",\n", + " \"prox (best)\",\n", + " \"bid ask size ratio (ex)\",\n", + " \"rel. bid size (ex)\",\n", + " \"rel. 
ask size (ex)\",\n", + " \"trade size\",\n", + " \"bid size (ex)\",\n", + " \"ask size (ex)\",\n", + " \"depth (ex)\",\n", + " \"strike price\",\n", + " \"time to maturity\",\n", + " \"moneyness\",\n", + " \"day volume\",\n", + " \"option type\",\n", + " \"issue type\",\n", + " \"root\",\n", + "]" ] }, { @@ -956,12 +1043,12 @@ }, "outputs": [], "source": [ - "fig, ax = plt.subplots(1, 2, figsize=(14*CM,10*CM), sharey=True)\n", - "ax[0].imshow(stack_np.T, cmap='Blues', interpolation='nearest')\n", + "fig, ax = plt.subplots(1, 2, figsize=(14 * CM, 10 * CM), sharey=True)\n", + "ax[0].imshow(stack_np.T, cmap=\"Blues\", interpolation=\"nearest\")\n", "ax[0].yaxis.set_ticks(list(range(len(labels_sanitized))))\n", "ax[0].set_yticklabels(labels_sanitized)\n", "ax[0].set_xlabel(\"At Quotes\")\n", - "ax[1].imshow(stack_np_copy.T, cmap='Blues', interpolation='nearest')\n", + "ax[1].imshow(stack_np_copy.T, cmap=\"Blues\", interpolation=\"nearest\")\n", "ax[1].yaxis.set_ticks(list(range(len(labels_sanitized))))\n", "ax[1].set_yticklabels(labels_sanitized, fontsize=\"x-small\")\n", "ax[1].set_xlabel(\"At Mid\")\n", @@ -978,7 +1065,7 @@ }, "outputs": [], "source": [ - "labels_detail = [\"$\\mathtt{[CLS]}$\", *labels_sanitized]" + "labels_detail = [r\"$\\mathtt{[CLS]}$\", *labels_sanitized]" ] }, { @@ -1014,7 +1101,7 @@ }, "outputs": [], "source": [ - "labels_left = ['$\\\\mathtt{[CLS]}$', *[\"...\"]*(len(labels_detail) - 1)]" + "labels_left = [\"$\\\\mathtt{[CLS]}$\", *[\"...\"] * (len(labels_detail) - 1)]" ] }, { @@ -1052,7 +1139,7 @@ "source": [ "from matplotlib.pyplot import cm\n", "\n", - "plt.figure(figsize=(3*CM,10*CM))\n", + "plt.figure(figsize=(3 * CM, 10 * CM))\n", "\n", "\n", "yoffset = 0\n", @@ -1076,7 +1163,7 @@ "h = 0\n", "\n", "cam = cams[l].reshape(batch_size, -1, cam.shape[-1], cam.shape[-1])\n", - "attention = cam[0,h,:,:]\n", + "attention = cam[0, h, :, :]\n", "attention /= attention.sum(axis=-1, keepdims=True)\n", "\n", "\n", @@ -1084,22 +1171,40 @@ "color = iter(cm.rainbow(np.linspace(0, 1, heads * layer)))\n", "\n", "for position, word in enumerate(labels_left):\n", - " plt.text(0, yoffset - position * word_height, word,\n", - " ha=\"right\", va=\"center\", size=\"x-small\")\n", + " plt.text(\n", + " 0,\n", + " yoffset - position * word_height,\n", + " word,\n", + " ha=\"right\",\n", + " va=\"center\",\n", + " size=\"x-small\",\n", + " )\n", "for position, word in enumerate(labels_detail):\n", - " plt.text(width, yoffset - position * word_height, word,\n", - " ha=\"left\", va=\"center\", size=\"x-small\")\n", + " plt.text(\n", + " width,\n", + " yoffset - position * word_height,\n", + " word,\n", + " ha=\"left\",\n", + " va=\"center\",\n", + " size=\"x-small\",\n", + " )\n", "# focus on cls token\n", "c = next(color)\n", "# CLS is prepended, get first row, similar to chefer\n", "for i, vec in enumerate(attention[0:1]):\n", " for j, el in enumerate(vec):\n", - " plt.plot([xoffset + pad, xoffset + width - pad],\n", - " [yoffset - word_height * i, yoffset - word_height * j],\n", - " color=c, linewidth=2, alpha=el.item())\n", - "plt.axis('off')\n", + " plt.plot(\n", + " [xoffset + pad, xoffset + width - pad],\n", + " [yoffset - word_height * i, yoffset - word_height * j],\n", + " color=c,\n", + " linewidth=2,\n", + " alpha=el.item(),\n", + " )\n", + "plt.axis(\"off\")\n", "plt.tight_layout()\n", - "plt.savefig(f\"../reports/Graphs/attention_head_{h+1}_layer_{l+1}_{key}.pdf\", bbox_inches=\"tight\")" + "plt.savefig(\n", + " 
f\"../reports/Graphs/attention_head_{h+1}_layer_{l+1}_{key}.pdf\", bbox_inches=\"tight\"\n", + ")" ] }, { @@ -1113,7 +1218,7 @@ "source": [ "from matplotlib.pyplot import cm\n", "\n", - "plt.figure(figsize=(36,6))\n", + "plt.figure(figsize=(36, 6))\n", "\n", "\n", "yoffset = 0\n", @@ -1141,14 +1246,13 @@ "color = iter(cm.rainbow(np.linspace(0, 1, heads * layer)))\n", "\n", "for l in range(layer):\n", - "\n", - " for h in range (heads):\n", + " for h in range(heads):\n", " # [batch x head x attn x dim attn]\n", "\n", " cam = cams[l].reshape(batch_size, -1, cam.shape[-1], cam.shape[-1])\n", "\n", " # [first in batch, head h, :,:]\n", - " attention = cam[0,h,:,:]\n", + " attention = cam[0, h, :, :]\n", "\n", " attention /= attention.sum(axis=-1, keepdims=True)\n", "\n", @@ -1165,17 +1269,23 @@ " c = next(color)\n", " for i, vec in enumerate(attention[0:1]):\n", " for j, el in enumerate(vec):\n", - " axes[l,h].plot([pad, width - pad], # x axis\n", - " [word_height * i, word_height * j],\n", - " color=c, linewidth=2, alpha=el.item())\n", - "\n", - " axes[l,h].set_title(f\"head {l+1,h+1}\", size='xx-small')\n", - "# fig.tight_layout()\n", - " axes[l,h].set_xticks([])\n", - " axes[l,h].set_yticks([])\n", + " axes[l, h].plot(\n", + " [pad, width - pad], # x axis\n", + " [word_height * i, word_height * j],\n", + " color=c,\n", + " linewidth=2,\n", + " alpha=el.item(),\n", + " )\n", + "\n", + " axes[l, h].set_title(f\"head {l+1,h+1}\", size=\"xx-small\")\n", + " # fig.tight_layout()\n", + " axes[l, h].set_xticks([])\n", + " axes[l, h].set_yticks([])\n", " # axes[l,h].axis('off')\n", "\n", - "plt.savefig(f\"../reports/Graphs/attention_heads_layer_all_{key}.pdf\", bbox_inches=\"tight\")" + "plt.savefig(\n", + " f\"../reports/Graphs/attention_heads_layer_all_{key}.pdf\", bbox_inches=\"tight\"\n", + ")" ] }, { @@ -1185,8 +1295,7 @@ "metadata": {}, "outputs": [], "source": [ - "\n", - "data = {\"grads\":grads, \"cams\":cams, \"final-scores\":stack_np_copy}" + "data = {\"grads\": grads, \"cams\": cams, \"final-scores\": stack_np_copy}" ] }, { @@ -1199,10 +1308,10 @@ "outputs": [], "source": [ "# Specify the file path where you want to save the pickle file\n", - "file_path = 'data.pickle'\n", + "file_path = \"data.pickle\"\n", "\n", "# Open the file in binary mode and write the dictionary to it\n", - "with open(file_path, 'wb') as file:\n", + "with open(file_path, \"wb\") as file:\n", " pickle.dump(data, file)" ] } diff --git a/notebooks/4.0e-mb-fttransformer-pretraining.ipynb b/notebooks/4.0e-mb-fttransformer-pretraining.ipynb index 9f3c32ef..68d8cb8d 100644 --- a/notebooks/4.0e-mb-fttransformer-pretraining.ipynb +++ b/notebooks/4.0e-mb-fttransformer-pretraining.ipynb @@ -11,26 +11,22 @@ "from pathlib import Path\n", "\n", "import pandas as pd\n", - "\n", - "from tqdm.auto import tqdm\n", - "\n", "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim\n", - "\n", "import wandb\n", + "from tqdm.auto import tqdm\n", "\n", + "from otc.data.dataloader import TabDataLoader\n", + "from otc.data.dataset import TabDataset\n", + "from otc.features.build_features import features_classical_size\n", "from otc.models.activation import ReGLU\n", "from otc.models.fttransformer import (\n", + " CLSHead,\n", " FeatureTokenizer,\n", " FTTransformer,\n", " Transformer,\n", - " CLSHead,\n", ")\n", - "\n", - "from otc.data.dataset import TabDataset\n", - "from otc.data.dataloader import TabDataLoader\n", - "from otc.features.build_features import features_classical_size\n", "from otc.optim.early_stopping 
import EarlyStopping\n", "from otc.optim.scheduler import CosineWarmupScheduler" ] @@ -64,12 +60,12 @@ "outputs": [], "source": [ "# preserve relative ordering, sample for testing ache\n", - "frac = 1 #0.05\n", + "frac = 1 # 0.05\n", "\n", "X_train = pd.read_parquet(Path(data_dir, \"train_set.parquet\"), engine=\"fastparquet\")\n", "X_train = X_train.sample(frac=frac, random_state=42)\n", "\n", - "y_train = X_train[\"buy_sell\"] # here: y = 0\n", + "y_train = X_train[\"buy_sell\"] # here: y = 0\n", "X_train = X_train[features_classical_size]" ] }, @@ -91,13 +87,12 @@ "outputs": [], "source": [ "def gen_perm(X):\n", - " \"\"\"\n", - " Generate index permutation.\n", - " \"\"\"\n", + " \"\"\"Generate index permutation.\"\"\"\n", " if X is None:\n", " return None\n", " return torch.randint_like(X, X.shape[0], dtype=torch.long)\n", "\n", + "\n", "x_cont_perm = gen_perm(x_cont)\n", "x_cat_perm = gen_perm(x_cat)" ] @@ -108,14 +103,13 @@ "metadata": {}, "outputs": [], "source": [ - "def gen_masks(X, perm, corrupt_probability = 0.15):\n", - " \"\"\"\n", - " Generate binary mask for detection.\n", - " \"\"\"\n", + "def gen_masks(X, perm, corrupt_probability=0.15):\n", + " \"\"\"Generate binary mask for detection.\"\"\"\n", " masks = torch.empty_like(X).bernoulli(p=corrupt_probability).bool()\n", - " new_masks = masks & (X != X[perm, torch.arange(X.shape[1], device=X.device)])\n", + " new_masks = masks & (X[perm, torch.arange(X.shape[1], device=X.device)] != X)\n", " return new_masks\n", "\n", + "\n", "# generate masks for numeric and for categorical features (optional)\n", "x_cont_mask = gen_masks(training_data.x_cont, x_cont_perm)\n", "\n", @@ -138,10 +132,9 @@ "x_cont[x_cont_mask] = x_cont_permuted[x_cont_mask]\n", "\n", "if x_cat is not None:\n", - "\n", " # along the 0 axis get elements based on perm_cat\n", " x_cat_permuted = torch.gather(x_cat, 0, x_cat_perm)\n", - " \n", + "\n", " # replace at mask\n", " x_cat[x_cat_mask] = x_cat_permuted[x_cat_mask]" ] @@ -166,7 +159,7 @@ "outputs": [], "source": [ "# split up into train (first 80 %) and val (last 20 %)\n", - "idx = int (len(x_cont) * 0.8)\n", + "idx = int(len(x_cont) * 0.8)\n", "\n", "x_cont_train, x_cont_val = torch.split(x_cont, idx, dim=0)\n", "masks_train, masks_val = torch.split(masks, idx, dim=0)\n", @@ -174,7 +167,7 @@ "if x_cat is not None:\n", " x_cat_train, x_cat_val = torch.split(x_cat, idx, dim=0)\n", "else:\n", - " x_cat_train, x_cat_val = None, None\n" + " x_cat_train, x_cat_val = None, None" ] }, { @@ -244,7 +237,7 @@ " \"n_tokens\": None,\n", " \"kv_compression_ratio\": None,\n", " \"kv_compression_sharing\": None,\n", - " \"head_activation\": nn.GELU, # nn.ReLU\n", + " \"head_activation\": nn.GELU, # nn.ReLU\n", " \"head_normalization\": nn.LayerNorm,\n", " \"d_out\": 1, # fix at 1, due to binary classification\n", "}\n", @@ -254,8 +247,8 @@ "optim_params = {\"lr\": 1e-4, \"weight_decay\": 0.00001}\n", "\n", "module_params = {\n", - " \"transformer\": Transformer(**transformer_kwargs), \n", - " \"feature_tokenizer\": FeatureTokenizer(**feature_tokenizer_kwargs), # noqa: E501\n", + " \"transformer\": Transformer(**transformer_kwargs),\n", + " \"feature_tokenizer\": FeatureTokenizer(**feature_tokenizer_kwargs),\n", " \"cat_features\": None,\n", " \"cat_cardinalities\": [],\n", "}\n", @@ -268,7 +261,7 @@ "clf_head = CLSHead(**head_kwargs)\n", "clf.transformer.head = clf_head\n", "\n", - "clf.to(device)\n" + "clf.to(device)" ] }, { @@ -277,19 +270,9 @@ "metadata": {}, "outputs": [], "source": [ - "train_loader = 
TabDataLoader(\n", - " x_cat_train,\n", - " x_cont_train,\n", - " masks_train, \n", - " **dl_params\n", - ")\n", + "train_loader = TabDataLoader(x_cat_train, x_cont_train, masks_train, **dl_params)\n", "\n", - "val_loader = TabDataLoader(\n", - " x_cat_val,\n", - " x_cont_val,\n", - " masks_val, \n", - " **dl_params\n", - ")" + "val_loader = TabDataLoader(x_cat_val, x_cont_val, masks_val, **dl_params)" ] }, { @@ -298,7 +281,8 @@ "metadata": {}, "outputs": [], "source": [ - "optimizer = optim.AdamW(clf.parameters(),\n", + "optimizer = optim.AdamW(\n", + " clf.parameters(),\n", " lr=optim_params[\"lr\"],\n", " weight_decay=optim_params[\"weight_decay\"],\n", ")\n", @@ -309,7 +293,9 @@ "print(f\"warmup steps: {warmup}\")\n", "print(max_iters)\n", "\n", - "scheduler = CosineWarmupScheduler(optimizer=optimizer, warmup=warmup, max_iters=max_iters)" + "scheduler = CosineWarmupScheduler(\n", + " optimizer=optimizer, warmup=warmup, max_iters=max_iters\n", + ")" ] }, { @@ -319,18 +305,17 @@ "outputs": [], "source": [ "def checkpoint(model):\n", - " \n", " # remove old files\n", " for fn in glob.glob(f\"checkpoints/{run.id}*\"):\n", - " os.remove(fn) \n", - " \n", + " os.remove(fn)\n", + "\n", " # create_dir\n", " dir_checkpoints = \"checkpoints/\"\n", - " os.makedirs(dir_checkpoints, exist_ok = True) \n", - " \n", + " os.makedirs(dir_checkpoints, exist_ok=True)\n", + "\n", " # save new file\n", " print(\"saving new checkpoints.\")\n", - " torch.save(model.state_dict(), os.path.join(dir_checkpoints,f\"{run.id}*\"))" + " torch.save(model.state_dict(), os.path.join(dir_checkpoints, f\"{run.id}*\"))" ] }, { @@ -349,75 +334,74 @@ "best_step = -1\n", "\n", "for epoch in tqdm(range(epochs)):\n", - "\n", " # perform training\n", " loss_in_epoch_train = 0\n", "\n", " batch = 0\n", - " \n", + "\n", " for x_cat, x_cont, masks in train_loader:\n", - " \n", " clf.train()\n", " optimizer.zero_grad()\n", - " \n", - " with torch.autocast(device_type='cuda', dtype=torch.float16):\n", + "\n", + " with torch.autocast(device_type=\"cuda\", dtype=torch.float16):\n", " logits = clf(x_cat, x_cont)\n", " train_loss = criterion(logits, masks.float())\n", "\n", " scaler.scale(train_loss).backward()\n", " scaler.step(optimizer)\n", " scaler.update()\n", - " \n", + "\n", " scheduler.step()\n", - " \n", + "\n", " # add the mini-batch training loss to epoch loss\n", " loss_in_epoch_train += train_loss.item()\n", - " \n", - " wandb.log({\"train_loss_step\": train_loss.item(), \"epoch\": epoch, \"batch\": batch})\n", + "\n", + " wandb.log(\n", + " {\"train_loss_step\": train_loss.item(), \"epoch\": epoch, \"batch\": batch}\n", + " )\n", "\n", " batch += 1\n", - " step +=1\n", + " step += 1\n", "\n", " clf.eval()\n", " loss_in_epoch_val = 0.0\n", " correct = 0\n", - " \n", + "\n", " with torch.no_grad():\n", " for x_cat, x_cont, masks in val_loader:\n", - "\n", " # for my implementation\n", " logits = clf(x_cat, x_cont)\n", " val_loss = criterion(logits, masks.float())\n", - " \n", "\n", " # hard_predictions = torch.zeros_like(logits, dtype=torch.long)\n", " # hard_predictions[logits > 0] = 1\n", " # correct += (hard_predictions.bool() == masks).sum() / hard_predictions.shape[0]\n", "\n", " loss_in_epoch_val += val_loss.item()\n", - " wandb.log({\"val_loss_step\": val_loss.item(), \"epoch\": epoch, \"batch\": batch})\n", - " \n", - " batch +=1 \n", + " wandb.log(\n", + " {\"val_loss_step\": val_loss.item(), \"epoch\": epoch, \"batch\": batch}\n", + " )\n", + "\n", + " batch += 1\n", "\n", " # correct / (rows * columns)\n", - " # 
val_accuracy = correct / (X_train.shape[0] * X_train.shape[1]) \n", - " \n", + " # val_accuracy = correct / (X_train.shape[0] * X_train.shape[1])\n", + "\n", " # loss average over all batches\n", " train_loss = loss_in_epoch_train / len(train_loader)\n", " val_loss = loss_in_epoch_val / len(val_loader)\n", - " \n", + "\n", " print(f\"train loss: {train_loss}\")\n", " print(f\"val loss: {val_loss}\")\n", - " \n", + "\n", " # correct samples / no samples\n", " # val_accuracy = correct / len(X_val)\n", " # if best_accuracy < val_accuracy:\n", " # checkpoint(clf, f\"checkpoints/{run.id}-{step}.ptx\")\n", " # best_accuracy = val_accuracy\n", " # best_step = step\n", - " \n", - " \n", - " wandb.log({\"train_loss\": train_loss, 'epoch': epoch})" + "\n", + " wandb.log({\"train_loss\": train_loss, \"epoch\": epoch})" ] }, { diff --git a/notebooks/5.0a-mb-batch-size-finder.ipynb b/notebooks/5.0a-mb-batch-size-finder.ipynb index 40b478e1..362d159b 100644 --- a/notebooks/5.0a-mb-batch-size-finder.ipynb +++ b/notebooks/5.0a-mb-batch-size-finder.ipynb @@ -6,12 +6,14 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", "from time import sleep\n", "from typing import Optional\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", + "\n", "from otc.data.dataloader import TabDataLoader\n", "from otc.models.activation import ReGLU\n", "from otc.models.fttransformer import (\n", @@ -20,9 +22,7 @@ " Transformer,\n", ")\n", "\n", - "import os\n", - "\n", - "os.environ[\"CUDA_LAUNCH_BLOCKING\"] = \"1\"\n" + "os.environ[\"CUDA_LAUNCH_BLOCKING\"] = \"1\"" ] }, { @@ -111,7 +111,6 @@ "\n", "\n", "def get_datasets(batch_size: int, num_workers: int = 2):\n", - "\n", " x_cat = torch.randint(0, CAT_CARDINALITY, (DATASET_SIZE, NUM_FEATURES_CAT))\n", " x_cont = torch.rand((DATASET_SIZE, NUM_FEATURES_CONT))\n", " weight = torch.ones((DATASET_SIZE, 1))\n", @@ -195,7 +194,7 @@ "\n", "\n", "if __name__ == \"__main__\":\n", - " main()\n" + " main()" ] }, { diff --git a/notebooks/6.0a-mb-results-fttransformer.ipynb b/notebooks/6.0a-mb-results-fttransformer.ipynb index f456233a..ff79bdf9 100644 --- a/notebooks/6.0a-mb-results-fttransformer.ipynb +++ b/notebooks/6.0a-mb-results-fttransformer.ipynb @@ -11,16 +11,14 @@ "outputs": [], "source": [ "import os\n", - "import sys\n", "import pickle\n", + "import sys\n", "from pathlib import Path\n", "\n", - "import google.auth\n", "import gcsfs\n", - "\n", + "import google.auth\n", "import pandas as pd\n", "import wandb\n", - "\n", "from tqdm.auto import tqdm\n", "\n", "sys.path.append(\"..\")\n", @@ -29,7 +27,7 @@ " features_classical,\n", " features_classical_size,\n", " features_ml,\n", - ")\n" + ")" ] }, { @@ -41,7 +39,7 @@ "outputs": [], "source": [ "# set globally here\n", - "EXCHANGE = \"cboe\" # \"ise\" # \"cboe\"\n", + "EXCHANGE = \"cboe\" # \"ise\" # \"cboe\"\n", "STRATEGY = \"transfer\" # \"supervised\"\n", "SUBSET = \"test\" # \"all\"\n", "\n", @@ -49,7 +47,7 @@ "# ise-trained models, supervised/semisupervised\n", "models = [\n", " (\"classical\", \"3jpe46s1_TransformerClassifier_default.pkl:latest\"),\n", - " (\"classical-size\", \"1qx3ul4j_TransformerClassifier_default.pkl:latest\"), \n", + " (\"classical-size\", \"1qx3ul4j_TransformerClassifier_default.pkl:latest\"),\n", " (\"ml\", \"2h81aiow_TransformerClassifier_default.pkl:latest\"),\n", "]" ] @@ -64,7 +62,7 @@ "source": [ "# key used for files and artefacts\n", "key = f\"{EXCHANGE}_fttransformer_{STRATEGY}_{SUBSET}\"\n", - "dataset = 
f\"fbv/thesis/{EXCHANGE}_{STRATEGY}_log_standardized_clipped:latest\"\n" + "dataset = f\"fbv/thesis/{EXCHANGE}_{STRATEGY}_log_standardized_clipped:latest\"" ] }, { @@ -99,7 +97,7 @@ "run = wandb.init(project=\"thesis\", entity=\"fbv\")\n", "\n", "artifact = run.use_artifact(dataset)\n", - "data_dir = artifact.download()\n" + "data_dir = artifact.download()" ] }, { @@ -121,7 +119,7 @@ " data = pd.read_parquet(Path(data_dir, \"test_set.parquet\"), engine=\"fastparquet\")\n", "\n", "y_test = data[\"buy_sell\"]\n", - "X_test = data.drop(columns=\"buy_sell\")\n" + "X_test = data.drop(columns=\"buy_sell\")" ] }, { @@ -153,26 +151,24 @@ "outputs": [], "source": [ "def count_parameters(model):\n", - " \"\"\"\n", - " Count number of parameters, that require gradient-update in model.\n", - " \n", + " \"\"\"Count number of parameters, that require gradient-update in model.\n", + "\n", " Found here: https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/9\n", " \"\"\"\n", " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n", "\n", - "for feature_str, model in tqdm(models):\n", "\n", + "for feature_str, model in tqdm(models):\n", " model_name = model.split(\"/\")[-1].split(\":\")[0]\n", "\n", " artifact = run.use_artifact(model)\n", " model_dir = artifact.download()\n", - " \n", - " with open(Path(model_dir, model_name), 'rb') as f:\n", + "\n", + " with open(Path(model_dir, model_name), \"rb\") as f:\n", " model = pickle.load(f)\n", - " \n", + "\n", " print(feature_str)\n", - " print(count_parameters(model.clf))\n", - " " + " print(count_parameters(model.clf))" ] }, { @@ -193,19 +189,18 @@ "}\n", "\n", "for feature_str, model in tqdm(models):\n", - "\n", " model_name = model.split(\"/\")[-1].split(\":\")[0]\n", "\n", " artifact = run.use_artifact(model)\n", " model_dir = artifact.download()\n", - " \n", - " with open(Path(model_dir, model_name), 'rb') as f:\n", + "\n", + " with open(Path(model_dir, model_name), \"rb\") as f:\n", " model = pickle.load(f)\n", "\n", " fs = FEATURE_MAP.get(feature_str)\n", " # filter categorical features that are in subset and get cardinality\n", " cat_features_sub = [tup[0] for tup in features_categorical if tup[0] in fs]\n", - " \n", + "\n", " result = pd.Series(\n", " data=model.predict(X_test.loc[:, fs]),\n", " index=X_test.index,\n", @@ -233,7 +228,7 @@ "result_set.add_reference(output_path, name=\"results\")\n", "run.log_artifact(result_set)\n", "\n", - "wandb.finish()\n" + "wandb.finish()" ] }, { @@ -257,17 +252,17 @@ "artifact = run.use_artifact(models[-1][-1])\n", "model_dir = artifact.download()\n", "\n", - "with open(Path(model_dir, model_name), 'rb') as f:\n", + "with open(Path(model_dir, model_name), \"rb\") as f:\n", " model = pickle.load(f)\n", - " \n", - "key = model_name.split(\".\")[0] + \"-embedding.ptx\"\n", + "\n", + "key = model_name.split(\".\")[0] + \"-embedding.ptx\"\n", "\n", "\n", "uri_embedding = f\"gs://thesis-bucket-option-trade-classification/data/results/{key}\"\n", "embeddings = model.clf.feature_tokenizer.cat_tokenizer.embeddings\n", "with fs.open(uri_embedding, \"wb\") as f:\n", " pickle.dump(embeddings, f, protocol=4)\n", - " \n", + "\n", "result_set = wandb.Artifact(name=key, type=\"results\")\n", "result_set.add_reference(uri_embedding, name=\"results\")" ] diff --git a/notebooks/6.0b-mb-results-classical-rules.ipynb b/notebooks/6.0b-mb-results-classical-rules.ipynb index b2bc11b4..f8f5494c 100644 --- a/notebooks/6.0b-mb-results-classical-rules.ipynb +++ 
b/notebooks/6.0b-mb-results-classical-rules.ipynb @@ -18,7 +18,7 @@ "\n", "sys.path.append(\"..\")\n", "from otc.features.build_features import features_classical_size\n", - "from otc.models.classical_classifier import ClassicalClassifier\n" + "from otc.models.classical_classifier import ClassicalClassifier" ] }, { @@ -32,10 +32,10 @@ "# set here globally\n", "seed = 42\n", "\n", - "exchange = \"ise\" # \"cboe\"\n", + "exchange = \"ise\" # \"cboe\"\n", "models = \"classical\"\n", - "subset = \"test\" # \"test\" # \"all\" # \"test\"\n", - "strategy = \"supervised\" # \"transfer\"\n" + "subset = \"test\" # \"test\" # \"all\" # \"test\"\n", + "strategy = \"supervised\" # \"transfer\"" ] }, { @@ -49,7 +49,7 @@ "# key used for files and artefacts\n", "key = f\"{exchange}_{models}_{strategy}_{subset}\"\n", "\n", - "dataset = f\"fbv/thesis/{exchange}_{strategy}_none:latest\"\n" + "dataset = f\"fbv/thesis/{exchange}_{strategy}_none:latest\"" ] }, { @@ -65,7 +65,7 @@ "\n", "# load unscaled data\n", "artifact = run.use_artifact(dataset)\n", - "data_dir = artifact.download()\n" + "data_dir = artifact.download()" ] }, { @@ -79,7 +79,7 @@ "columns = [\n", " *features_classical_size,\n", " \"buy_sell\",\n", - "]\n" + "]" ] }, { @@ -106,8 +106,8 @@ "elif subset == \"val\":\n", " data = pd.read_parquet(\n", " Path(data_dir, \"val_set.parquet\"), engine=\"fastparquet\", columns=columns\n", - " ) \n", - " \n", + " )\n", + "\n", "elif subset == \"test\":\n", " data = pd.read_parquet(\n", " Path(data_dir, \"test_set.parquet\"), engine=\"fastparquet\", columns=columns\n", @@ -116,10 +116,12 @@ "y_test = data[\"buy_sell\"].astype(\"int8\")\n", "\n", "\n", - "data[\"TRADE_SIZE\"] = data[\"TRADE_SIZE\"].astype('float32') # update dtype Int64 - Float32\n", + "data[\"TRADE_SIZE\"] = data[\"TRADE_SIZE\"].astype(\n", + " \"float32\"\n", + ") # update dtype Int64 - Float32\n", "X_test = data.drop(columns=\"buy_sell\")\n", "\n", - "del data\n" + "del data" ] }, { @@ -130,7 +132,7 @@ }, "outputs": [], "source": [ - "rules = [ #classical\n", + "rules = [ # classical\n", " [(\"tick\", \"ex\")],\n", " [(\"rev_tick\", \"ex\")],\n", " [(\"tick\", \"all\")],\n", @@ -149,7 +151,7 @@ " [(\"rev_emo\", \"best\")],\n", " [(\"clnv\", \"best\")],\n", " [(\"rev_clnv\", \"best\")],\n", - " [(\"quote\", \"best\"), (\"quote\", \"ex\"), (\"rev_tick\", \"all\")], # grauer (benchmark 1)\n", + " [(\"quote\", \"best\"), (\"quote\", \"ex\"), (\"rev_tick\", \"all\")], # grauer (benchmark 1)\n", " [\n", " (\"trade_size\", \"ex\"),\n", " (\"quote\", \"best\"),\n", @@ -157,14 +159,14 @@ " (\"depth\", \"best\"),\n", " (\"depth\", \"ex\"),\n", " (\"rev_tick\", \"all\"),\n", - " ], # grauer (benchmark 2) \n", + " ], # grauer (benchmark 2)\n", "]\n", "\n", "# generate names for array\n", "names = []\n", "for r in tqdm(rules):\n", " name = \"->\".join(\"%s(%s)\" % tup for tup in r)\n", - " names.append(name)\n" + " names.append(name)" ] }, { @@ -182,7 +184,7 @@ " # fit is only used to set sklearn attributes, no leakage\n", " clf.fit(X=X_test.head(5), y=y_test.head(5))\n", " result = clf.predict(X_test).astype(int)\n", - " results.append(result)\n" + " results.append(result)" ] }, { @@ -191,7 +193,7 @@ "metadata": {}, "outputs": [], "source": [ - "results = pd.DataFrame(dict(zip(names, results)), index=X_test.index)\n" + "results = pd.DataFrame(dict(zip(names, results)), index=X_test.index)" ] }, { @@ -203,7 +205,7 @@ "output_path = (\n", " f\"gs://thesis-bucket-option-trade-classification/data/results/{key}.parquet\"\n", ")\n", - 
"results.to_parquet(output_path)\n" + "results.to_parquet(output_path)" ] }, { @@ -219,7 +221,7 @@ "result_set.add_reference(output_path, name=\"results\")\n", "run.log_artifact(result_set)\n", "\n", - "wandb.finish()\n" + "wandb.finish()" ] } ], diff --git a/notebooks/6.0c-mb-results-universal.ipynb b/notebooks/6.0c-mb-results-universal.ipynb index f007e6d7..3a80a71e 100644 --- a/notebooks/6.0c-mb-results-universal.ipynb +++ b/notebooks/6.0c-mb-results-universal.ipynb @@ -10,7 +10,6 @@ "outputs": [], "source": [ "import os\n", - "import random\n", "import sys\n", "from pathlib import Path\n", "\n", @@ -22,11 +21,12 @@ "import warnings\n", "\n", "import wandb\n", - "from otc.metrics.metrics import effective_spread\n", + "from numpy.exceptions import VisibleDeprecationWarning\n", "from scipy.stats import wilcoxon\n", + "from statsmodels.stats.contingency_tables import mcnemar\n", "from tqdm.auto import tqdm\n", "\n", - "from statsmodels.stats.contingency_tables import mcnemar" + "from otc.metrics.metrics import effective_spread" ] }, { @@ -38,10 +38,10 @@ "outputs": [], "source": [ "# set here globally\n", - "EXCHANGE = \"cboe\" # \"ise\"\n", - "MODELS = [\"gbm\",\"fttransformer\"] # \"classical\", \"fttransformer\", \"gbm\"\n", + "EXCHANGE = \"cboe\" # \"ise\"\n", + "MODELS = [\"gbm\", \"fttransformer\"] # \"classical\", \"fttransformer\", \"gbm\"\n", "SUBSET = \"test\" # \"all\"\n", - "STRATEGY = \"transfer\" # \"supervised\" \n", + "STRATEGY = \"transfer\" # \"supervised\"\n", "\n", "RETRAIN = False" ] @@ -62,7 +62,7 @@ "run = wandb.init(project=\"thesis\", entity=\"fbv\")\n", "\n", "# load unscaled data\n", - "artifact = run.use_artifact(DATASET) \n", + "artifact = run.use_artifact(DATASET)\n", "data_dir = artifact.download()\n", "\n", "# load results\n", @@ -73,9 +73,9 @@ " results = f\"fbv/thesis/{EXCHANGE}_{model}_{STRATEGY}_{SUBSET}_retrain:latest\"\n", " else:\n", " results = f\"fbv/thesis/{EXCHANGE}_{model}_{STRATEGY}_{SUBSET}:latest\"\n", - " artifact = run.use_artifact(results) \n", + " artifact = run.use_artifact(results)\n", " result_dir = artifact.download()\n", - " result_dirs.append(result_dir)\n" + " result_dirs.append(result_dir)" ] }, { @@ -135,7 +135,7 @@ "\n", "X_print = eval_data\n", "\n", - "del results\n" + "del results" ] }, { @@ -151,10 +151,10 @@ " [\n", " # (\"fttransformer\", \"fttransformer(classical)\"),\n", " # (\"fttransformer\", \"fttransformer(classical-size)\"),\n", - " # (\"fttransformer\", \"fttransformer(ml)\"), \n", + " # (\"fttransformer\", \"fttransformer(ml)\"),\n", " (\"fttransformer\", \"fttransformer(semi-classical)\"),\n", " (\"fttransformer\", \"fttransformer(semi-classical-size)\"),\n", - " (\"fttransformer\", \"fttransformer(semi-ml)\"), \n", + " (\"fttransformer\", \"fttransformer(semi-ml)\"),\n", " # (\"gbm\", \"gbm(classical)\"),\n", " # (\"gbm\", \"gbm(classical-size)\"),\n", " # (\"gbm\", \"gbm(ml)\"),\n", @@ -162,10 +162,9 @@ " # # (\"gbm\", \"gbm(classical-size-retraining)\"),\n", " # # (\"gbm\", \"gbm(ml-retraining)\"),\n", " (\"gbm\", \"gbm(semi-classical)\"),\n", - " (\"gbm\",'gbm(semi-classical-size)'),\n", - " (\"gbm\",'gbm(semi-ml)'),\n", - "\n", - "# # viz\n", + " (\"gbm\", \"gbm(semi-classical-size)\"),\n", + " (\"gbm\", \"gbm(semi-ml)\"),\n", + " # # viz\n", " # (\"classical\", \"tick(all)\"),\n", " # (\"classical\", \"quote(best)\"),\n", " # (\"classical\", \"quote(ex)\"),\n", @@ -177,40 +176,35 @@ " # (\"classical\", \"depth(ex)\"),\n", " # (\"classical\", \"depth(best)\"),\n", " # (\"classical\", \"trade_size(ex)\"),\n", 
- "\n", - "# # batch 1 / detailled analysis\n", - "# (\"classical\", \"tick(ex)\"),\n", - "# (\"classical\", \"rev_tick(ex)\"),\n", - "# (\"classical\", \"quote(ex)\"),\n", - "# (\"classical\", \"lr(ex)\"),\n", - "# (\"classical\", \"rev_lr(ex)\"),\n", - "# (\"classical\", \"emo(ex)\"),\n", - "# (\"classical\", \"rev_emo(ex)\"),\n", - " \n", - "# # batch 2\n", - "# (\"classical\", \"clnv(ex)\"),\n", - "# (\"classical\", \"rev_clnv(ex)\"),\n", - "# (\"classical\", \"tick(all)\"),\n", - "# (\"classical\", \"rev_tick(all)\"),\n", - "# (\"classical\", \"quote(best)\"),\n", - "# (\"classical\", \"lr(best)\"),\n", - "# (\"classical\", \"rev_lr(best)\"),\n", - " \n", - "# # batch 3\n", - "# (\"classical\", \"emo(best)\"),\n", - "# (\"classical\", \"rev_emo(best)\"),\n", - "# (\"classical\", \"clnv(best)\"),\n", - "# (\"classical\", \"rev_clnv(best)\"), \n", + " # # batch 1 / detailled analysis\n", + " # (\"classical\", \"tick(ex)\"),\n", + " # (\"classical\", \"rev_tick(ex)\"),\n", + " # (\"classical\", \"quote(ex)\"),\n", + " # (\"classical\", \"lr(ex)\"),\n", + " # (\"classical\", \"rev_lr(ex)\"),\n", + " # (\"classical\", \"emo(ex)\"),\n", + " # (\"classical\", \"rev_emo(ex)\"),\n", + " # # batch 2\n", + " # (\"classical\", \"clnv(ex)\"),\n", + " # (\"classical\", \"rev_clnv(ex)\"),\n", + " # (\"classical\", \"tick(all)\"),\n", + " # (\"classical\", \"rev_tick(all)\"),\n", + " # (\"classical\", \"quote(best)\"),\n", + " # (\"classical\", \"lr(best)\"),\n", + " # (\"classical\", \"rev_lr(best)\"),\n", + " # # batch 3\n", + " # (\"classical\", \"emo(best)\"),\n", + " # (\"classical\", \"rev_emo(best)\"),\n", + " # (\"classical\", \"clnv(best)\"),\n", + " # (\"classical\", \"rev_clnv(best)\"),\n", " # (\"classical\", \"quote(best)->quote(ex)->rev_tick(all)\"),\n", " # (\n", " # \"classical\",\n", " # \"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all)\",\n", " # ),\n", - " \n", " # detailed analysis\n", - " \n", " ]\n", - "]\n" + "]" ] }, { @@ -222,7 +216,7 @@ "outputs": [], "source": [ "LUT = {\n", - " \"Trade_Size(ex)->Quote(Best)->Depth(Best)->Quote(Ex)->Depth(Ex)->Rev_Tick(All)\": \"\\gls{GBM}\",\n", + " \"Trade_Size(ex)->Quote(Best)->Depth(Best)->Quote(Ex)->Depth(Ex)->Rev_Tick(All)\": r\"\\gls{GBM}\",\n", " \"(Ex)\": \" (Ex)\",\n", " \"(Best)\": \" (Best)\",\n", " \"(Classical)\": \" (Classical)\",\n", @@ -231,12 +225,12 @@ " \"Trade_Size\": \"Trade Size\",\n", " \"Depth\": \"Depth\",\n", " \"->\": \" $\\\\to$ \",\n", - " \"Lr\": \"\\gls{LR}\",\n", - " \"Emo\": \"\\gls{EMO}\",\n", - " \"Clnv\": \"\\gls{CLNV}\",\n", + " \"Lr\": r\"\\gls{LR}\",\n", + " \"Emo\": r\"\\gls{EMO}\",\n", + " \"Clnv\": r\"\\gls{CLNV}\",\n", " \"OPTION_TYPE\": \"Option Type\",\n", - " \"_\": \"$\\_\",\n", - " \"Gbm\": \"\\gls{GBM}\",\n", + " \"_\": r\"$\\_\",\n", + " \"Gbm\": r\"\\gls{GBM}\",\n", "}\n", "\n", "LUT_INDEX = {\n", @@ -260,7 +254,7 @@ "\n", "\n", "def highlight_max(s, props=\"\"):\n", - " return np.where(s == np.nanmax(s.values), props, \"\")\n" + " return np.where(s == np.nanmax(s.values), props, \"\")" ] }, { @@ -290,7 +284,7 @@ " convert_css=True,\n", " )\n", " )\n", - " return res\n" + " return res" ] }, { @@ -302,7 +296,7 @@ "outputs": [], "source": [ "classifiers = results_data.columns.tolist()\n", - "criterions = list(LUT_INDEX)\n" + "criterions = list(LUT_INDEX)" ] }, { @@ -355,7 +349,7 @@ " label=f\"{KEY.lower()}-unclassfied\",\n", " bold_axis=0,\n", ")\n", - "unclassified\n" + "unclassified" ] }, { @@ -381,14 +375,14 @@ "results_data.replace(0, np.nan, 
inplace=True)\n", "# assume same filler for every column\n", "filler = pd.Series(\n", - " rng.choice(a=[-1, 1], size=results_data.shape[0]),\n", - " index=results_data.index,\n", - " # columns=results_data.columns,\n", + " rng.choice(a=[-1, 1], size=results_data.shape[0]),\n", + " index=results_data.index,\n", + " # columns=results_data.columns,\n", ")\n", "\n", "# do column-wise as we run out of memory otherwise\n", "for classifier in tqdm(classifiers):\n", - " results_data[classifier].fillna(filler, inplace=True)\n" + " results_data[classifier].fillna(filler, inplace=True)" ] }, { @@ -404,40 +398,64 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "X_print = pd.concat([X_print, results_data], axis=1)\n", "Y = X_print[[*results_data.columns.tolist(), \"buy_sell\"]]\n", "\n", - "import statsmodels\n", - "from statsmodels.stats.contingency_tables import mcnemar\n", "\n", "results = []\n", - "combinations = [(('gbm', 'gbm(semi-classical)'), ('fttransformer', 'fttransformer(semi-classical)')),\n", - " (('gbm', 'gbm(semi-classical-size)'), ('fttransformer', 'fttransformer(semi-classical-size)')),\n", - " (('gbm', 'gbm(semi-ml)'), ('fttransformer', 'fttransformer(semi-ml)'))]\n", + "combinations = [\n", + " (\n", + " (\"gbm\", \"gbm(semi-classical)\"),\n", + " (\"fttransformer\", \"fttransformer(semi-classical)\"),\n", + " ),\n", + " (\n", + " (\"gbm\", \"gbm(semi-classical-size)\"),\n", + " (\"fttransformer\", \"fttransformer(semi-classical-size)\"),\n", + " ),\n", + " ((\"gbm\", \"gbm(semi-ml)\"), (\"fttransformer\", \"fttransformer(semi-ml)\")),\n", + "]\n", "significance = 0.05\n", "\n", - "def get_contingency_table(Y, ground_truth, model_1, model_2):\n", "\n", + "def get_contingency_table(Y, ground_truth, model_1, model_2):\n", " Y_ = Y[[ground_truth, model_1, model_2]].copy().astype(int)\n", "\n", - " c_0_0 = np.where((Y_[model_1] == Y_[ground_truth]) & (Y_[model_2] == Y_[ground_truth]), 1, 0).sum()\n", - " c_0_1 = np.where((Y_[model_1] == Y_[ground_truth]) & (Y_[model_2] != Y_[ground_truth]), 1, 0).sum()\n", - " c_1_0 = np.where((Y_[model_1] != Y_[ground_truth]) & (Y_[model_2] == Y_[ground_truth]), 1, 0).sum()\n", - " c_1_1 = np.where((Y_[model_1] != Y_[ground_truth]) & (Y_[model_2] != Y_[ground_truth]), 1, 0).sum()\n", - " \n", + " c_0_0 = np.where(\n", + " (Y_[model_1] == Y_[ground_truth]) & (Y_[model_2] == Y_[ground_truth]), 1, 0\n", + " ).sum()\n", + " c_0_1 = np.where(\n", + " (Y_[model_1] == Y_[ground_truth]) & (Y_[model_2] != Y_[ground_truth]), 1, 0\n", + " ).sum()\n", + " c_1_0 = np.where(\n", + " (Y_[model_1] != Y_[ground_truth]) & (Y_[model_2] == Y_[ground_truth]), 1, 0\n", + " ).sum()\n", + " c_1_1 = np.where(\n", + " (Y_[model_1] != Y_[ground_truth]) & (Y_[model_2] != Y_[ground_truth]), 1, 0\n", + " ).sum()\n", + "\n", " # [both right, gbm right/transformer wrong, gbm wrong/transformer right, both wrong]\n", - " contingency_table = [[c_0_0, c_0_1],[c_1_0, c_1_1]]\n", + " contingency_table = [[c_0_0, c_0_1], [c_1_0, c_1_1]]\n", "\n", " return np.array(contingency_table)\n", "\n", - " \n", + "\n", "for combination in tqdm(combinations):\n", - " contingency_table = get_contingency_table(Y, 'buy_sell', combination[0], combination[1])\n", + " contingency_table = get_contingency_table(\n", + " Y, \"buy_sell\", combination[0], combination[1]\n", + " )\n", " test = mcnemar(contingency_table, exact=False, correction=True)\n", - " \n", - " results.append({\"contingency_table\": contingency_table, \"model_1\": combination[0], \"model_2\": combination[1], \"statistic\": 
test.statistic, \"p-value\": test.pvalue, \"significant\": test.pvalue < significance})\n", - " \n", + "\n", + " results.append(\n", + " {\n", + " \"contingency_table\": contingency_table,\n", + " \"model_1\": combination[0],\n", + " \"model_2\": combination[1],\n", + " \"statistic\": test.statistic,\n", + " \"p-value\": test.pvalue,\n", + " \"significant\": test.pvalue < significance,\n", + " }\n", + " )\n", + "\n", "pd.DataFrame(results).to_csv(f\"../models/{EXCHANGE}-mcnemar.csv\")" ] }, @@ -576,7 +594,7 @@ " \"year\",\n", " ],\n", " inplace=True,\n", - ")\n" + ")" ] }, { @@ -587,7 +605,7 @@ }, "outputs": [], "source": [ - "X_print = pd.concat([X_print, results_data], axis=1)\n" + "X_print = pd.concat([X_print, results_data], axis=1)" ] }, { @@ -617,7 +635,7 @@ "outputs": [], "source": [ "# FIXME: Find better approach\n", - "warnings.filterwarnings(\"ignore\", category=np.VisibleDeprecationWarning)\n", + "warnings.filterwarnings(\"ignore\", category=VisibleDeprecationWarning)\n", "\n", "result_dfs = []\n", "\n", @@ -650,7 +668,7 @@ " # )\n", "\n", " # store all result sets for later use\n", - " result_dfs.append(result_df)\n" + " result_dfs.append(result_df)" ] }, { @@ -690,7 +708,7 @@ " caption=(\"master-long\", \"master-short\"),\n", " label=f\"{KEY}-master\",\n", " bold_axis=0,\n", - ")\n" + ")" ] }, { @@ -730,7 +748,9 @@ "results = []\n", "\n", "# calculate true rel effective spread but not aggregated, convert to %\n", - "es_true = effective_spread(X_print[\"buy_sell\"], X_print[\"TRADE_PRICE\"], mid, mode=\"none\")\n", + "es_true = effective_spread(\n", + " X_print[\"buy_sell\"], X_print[\"TRADE_PRICE\"], mid, mode=\"none\"\n", + ")\n", "nom_true = np.nanmean(es_true)\n", "\n", "eps_true = np.empty(es_true.shape)\n", @@ -739,26 +759,31 @@ "\n", "\n", "for classifier in tqdm(classifiers):\n", - "\n", " # calculate pred rel effective spread but not aggregated convert to %\n", - " es_pred = effective_spread(X_print[classifier], X_print[\"TRADE_PRICE\"], mid, mode=\"none\")\n", - " \n", + " es_pred = effective_spread(\n", + " X_print[classifier], X_print[\"TRADE_PRICE\"], mid, mode=\"none\"\n", + " )\n", + "\n", " eps_pred = np.empty(es_pred.shape)\n", " np.divide(es_pred, mid, out=eps_pred, where=mid != 0)\n", "\n", - " wilcoxon_res = wilcoxon(eps_pred, eps_true, nan_policy=\"omit\", zero_method=\"zsplit\")\n", + " wilcoxon_res = wilcoxon(eps_pred, eps_true, nan_policy=\"omit\", zero_method=\"zsplit\")\n", "\n", " res = pd.Series(\n", - " {\n", - " \"nom_pred\": np.nanmean(es_pred),\n", - " \"rel_pred\": np.nanmean(eps_pred),\n", - " \"statistic\":wilcoxon_res.statistic,\n", - " \"pvalue\":wilcoxon_res.pvalue,\n", - " }, name=classifier\n", - " )\n", + " {\n", + " \"nom_pred\": np.nanmean(es_pred),\n", + " \"rel_pred\": np.nanmean(eps_pred),\n", + " \"statistic\": wilcoxon_res.statistic,\n", + " \"pvalue\": wilcoxon_res.pvalue,\n", + " },\n", + " name=classifier,\n", + " )\n", " results.append(res)\n", "\n", - "true_eff = pd.Series({\"nom_pred\":nom_true, \"rel_pred\": rel_true, \"statistic\":np.NaN, \"pvalue\":np.NaN}, name=\"true_eff\")\n", + "true_eff = pd.Series(\n", + " {\"nom_pred\": nom_true, \"rel_pred\": rel_true, \"statistic\": np.nan, \"pvalue\": np.nan},\n", + " name=\"true_eff\",\n", + ")\n", "\n", "results.append(true_eff)\n", "\n", @@ -773,7 +798,7 @@ }, "outputs": [], "source": [ - "results.T.style.format(\"{:.3f}\")\n" + "results.T.style.format(\"{:.3f}\")" ] }, { @@ -791,7 +816,7 @@ " label=f\"tab:eff-{KEY}\",\n", " caption=(f\"long-eff-{KEY}\", 
f\"short-eff-{KEY}\"),\n", " convert_css=True,\n", - ")\n" + ")" ] }, { @@ -821,7 +846,11 @@ "outputs": [], "source": [ "# classical baselines\n", - "view = [(\"fttransformer\", \"fttransformer(semi-classical)\"), (\"fttransformer\", \"fttransformer(semi-classical-size)\"), (\"fttransformer\", \"fttransformer(semi-ml)\")]\n", + "view = [\n", + " (\"fttransformer\", \"fttransformer(semi-classical)\"),\n", + " (\"fttransformer\", \"fttransformer(semi-classical-size)\"),\n", + " (\"fttransformer\", \"fttransformer(semi-ml)\"),\n", + "]\n", "\n", "base = master[\n", " [\n", @@ -838,7 +867,7 @@ "]\n", "\n", "# my ml models\n", - "revised = master[view]\n" + "revised = master[view]" ] }, { @@ -848,8 +877,7 @@ "outputs": [], "source": [ "def combine_results(revised: pd.DataFrame, base: pd.DataFrame) -> pd.DataFrame:\n", - " \"\"\"\n", - " Generate print layout like in Grauer et al.\n", + " \"\"\"Generate print layout like in Grauer et al.\n", "\n", " https://tex.stackexchange.com/questions/430283/table-with-numbers-in-parentheses-in-siunitx/430290#430290\n", "\n", @@ -864,11 +892,10 @@ " combo = pd.DataFrame(revised.values, index=revised.index, columns=midx)\n", "\n", " for i, mul_col in enumerate(combo.columns):\n", - "\n", " combo[(mul_col[0], \"pm\")] = (combo[mul_col] - base.iloc[:, i]).round(2)\n", " combo.sort_index(axis=1, inplace=True)\n", "\n", - " return combo\n" + " return combo" ] }, { @@ -897,7 +924,7 @@ " label=f\"tab:diff-{KEY}\",\n", " caption=(f\"long-diff-{KEY}\", f\"short-diff-{KEY}\"),\n", " convert_css=True,\n", - ")\n" + ")" ] }, { @@ -908,7 +935,7 @@ }, "outputs": [], "source": [ - "diff\n" + "diff" ] } ], diff --git a/notebooks/6.0d-mb-results-gradient-boosting.ipynb b/notebooks/6.0d-mb-results-gradient-boosting.ipynb index 31fcec96..1c14d52b 100644 --- a/notebooks/6.0d-mb-results-gradient-boosting.ipynb +++ b/notebooks/6.0d-mb-results-gradient-boosting.ipynb @@ -25,7 +25,7 @@ " features_classical,\n", " features_classical_size,\n", " features_ml,\n", - ")\n" + ")" ] }, { @@ -49,7 +49,7 @@ " (\"semi-classical\", \"37lymmzc_CatBoostClassifier_default.cbm:latest\"),\n", " (\"semi-classical-size\", \"1vmti6db_CatBoostClassifier_default.cbm:latest\"),\n", " (\"semi-ml\", \"t55nd8r0_CatBoostClassifier_default.cbm:latest\"),\n", - "]\n" + "]" ] }, { @@ -60,7 +60,7 @@ "source": [ "# key used for files and artefacts\n", "key = f\"{EXCHANGE}_gbm_{STRATEGY}_{SUBSET}\"\n", - "dataset = f\"fbv/thesis/{EXCHANGE}_{STRATEGY}_log_standardized_clipped:latest\"\n" + "dataset = f\"fbv/thesis/{EXCHANGE}_{STRATEGY}_log_standardized_clipped:latest\"" ] }, { @@ -72,7 +72,7 @@ "outputs": [], "source": [ "# set project name. 
Required to access files and artefacts\n", - "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n" + "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"" ] }, { @@ -92,7 +92,7 @@ "run = wandb.init(project=\"thesis\", entity=\"fbv\")\n", "\n", "artifact = run.use_artifact(dataset)\n", - "data_dir = artifact.download()\n" + "data_dir = artifact.download()" ] }, { @@ -125,7 +125,7 @@ " X_retrain = retrain_data.drop(columns=\"buy_sell\")\n", "\n", " weight_retrain = np.geomspace(0.001, 1, num=len(y_retrain))\n", - " timestamp_retrain = np.linspace(0, 1, len(y_retrain))\n" + " timestamp_retrain = np.linspace(0, 1, len(y_retrain))" ] }, { @@ -156,7 +156,6 @@ "}\n", "\n", "for feature_str, model in tqdm(models):\n", - "\n", " model_name = model.split(\"/\")[-1].split(\":\")[0]\n", "\n", " artifact = run.use_artifact(model)\n", @@ -198,7 +197,7 @@ " index=X_test.index,\n", " name=f\"gbm({feature_str}-retraining)\",\n", " )\n", - " results.append(result)\n" + " results.append(result)" ] }, { @@ -220,7 +219,7 @@ "result_set.add_reference(output_path, name=\"results\")\n", "run.log_artifact(result_set)\n", "\n", - "wandb.finish()\n" + "wandb.finish()" ] } ], diff --git a/notebooks/6.0e-mb-viz-universal.ipynb b/notebooks/6.0e-mb-viz-universal.ipynb index 9451c4f2..0d5eb7c0 100644 --- a/notebooks/6.0e-mb-viz-universal.ipynb +++ b/notebooks/6.0e-mb-viz-universal.ipynb @@ -9,23 +9,27 @@ }, "outputs": [], "source": [ - "import numpy as np\n", - "import matplotlib as mpl\n", - "import matplotlib.pyplot as plt\n", - "from matplotlib import rc\n", - "import torch\n", - "import pandas as pd\n", - "import matplotlib.dates as mdates\n", - "from matplotlib.dates import DateFormatter\n", - "import matplotlib.ticker as ticker\n", - "from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter, PercentFormatter,MaxNLocator\n", - "\n", "import json\n", "import os\n", "import pickle\n", "from pathlib import Path\n", + "\n", + "import matplotlib as mpl\n", + "import matplotlib.dates as mdates\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.ticker as ticker\n", + "import numpy as np\n", "import optuna\n", + "import pandas as pd\n", + "import torch\n", "import wandb\n", + "from matplotlib import rc\n", + "from matplotlib.dates import DateFormatter\n", + "from matplotlib.ticker import (\n", + " MaxNLocator,\n", + " PercentFormatter,\n", + " StrMethodFormatter,\n", + ")\n", "\n", "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"" ] @@ -51,7 +55,7 @@ "plt.rcParams.update(params)\n", "rc(\"text\", usetex=True)\n", "\n", - "plt.rc('text.latex', preamble=r'\\usepackage{amsmath}\\usepackage[utf8]{inputenc}')\n", + "plt.rc(\"text.latex\", preamble=r\"\\usepackage{amsmath}\\usepackage[utf8]{inputenc}\")\n", "\n", "CM = 1 / 2.54\n", "# cmap = plt.cm.get_cmap(\"viridis\")\n", @@ -73,11 +77,35 @@ "\n", "# line cyclers adapted to colourblind people\n", "from cycler import cycler\n", - "line_cycler = (cycler(color=[\"#E69F00\", \"#56B4E9\", \"#009E73\", \"#0072B2\", \"#D55E00\", \"#CC79A7\", \"#F0E442\"]) # + cycler(linestyle=[\"-\", \"--\", \"-.\", \":\", \"-\", \"--\", \"-.\"])\n", - " )\n", - "marker_cycler = (cycler(color=[\"#E69F00\", \"#56B4E9\", \"#009E73\", \"#0072B2\", \"#D55E00\", \"#CC79A7\", \"#F0E442\"]) +\n", - " cycler(linestyle=[\"none\", \"none\", \"none\", \"none\", \"none\", \"none\", \"none\"]) +\n", - " cycler(marker=[\"4\", \"2\", \"3\", \"1\", \"+\", \"x\", \".\"]))\n", + "\n", + "line_cycler = (\n", + " cycler(\n", + " color=[\n", + " \"#E69F00\",\n", + " 
\"#56B4E9\",\n", + " \"#009E73\",\n", + " \"#0072B2\",\n", + " \"#D55E00\",\n", + " \"#CC79A7\",\n", + " \"#F0E442\",\n", + " ]\n", + " ) # + cycler(linestyle=[\"-\", \"--\", \"-.\", \":\", \"-\", \"--\", \"-.\"])\n", + ")\n", + "marker_cycler = (\n", + " cycler(\n", + " color=[\n", + " \"#E69F00\",\n", + " \"#56B4E9\",\n", + " \"#009E73\",\n", + " \"#0072B2\",\n", + " \"#D55E00\",\n", + " \"#CC79A7\",\n", + " \"#F0E442\",\n", + " ]\n", + " )\n", + " + cycler(linestyle=[\"none\", \"none\", \"none\", \"none\", \"none\", \"none\", \"none\"])\n", + " + cycler(marker=[\"4\", \"2\", \"3\", \"1\", \"+\", \"x\", \".\"])\n", + ")\n", "\n", "plt.rc(\"axes\", prop_cycle=line_cycler)" ] @@ -97,8 +125,12 @@ }, "outputs": [], "source": [ - "accuracies_over_time_ise = pd.read_parquet(\"gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_all-classical-accurcies-over-time.parquet\")\n", - "accuracies_over_time_cboe = pd.read_parquet(\"gs://thesis-bucket-option-trade-classification/data/results/cboe_supervised_all-classical-accurcies-over-time.parquet\")" + "accuracies_over_time_ise = pd.read_parquet(\n", + " \"gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_all-classical-accurcies-over-time.parquet\"\n", + ")\n", + "accuracies_over_time_cboe = pd.read_parquet(\n", + " \"gs://thesis-bucket-option-trade-classification/data/results/cboe_supervised_all-classical-accurcies-over-time.parquet\"\n", + ")" ] }, { @@ -109,35 +141,79 @@ }, "outputs": [], "source": [ - "fig, ax = plt.subplots(2,1,figsize=(14*CM,10*CM), sharey=True, sharex=True, tight_layout=True)\n", + "fig, ax = plt.subplots(\n", + " 2, 1, figsize=(14 * CM, 10 * CM), sharey=True, sharex=True, tight_layout=True\n", + ")\n", "\n", "\n", - "ax[0].plot(accuracies_over_time_ise[\"tick(all)\"], label=\"$\\operatorname{tick}_{\\mathrm{all}}$\", lw=1)\n", - "ax[0].plot(accuracies_over_time_ise[\"quote(best)\"], label=\"$\\operatorname{quote}_{\\mathrm{nbbo}}$\", lw=1, zorder=20)\n", - "ax[0].plot(accuracies_over_time_ise[\"quote(best)->quote(ex)->rev_tick(all)\"], label=r\"$\\operatorname{gsu}_{\\mathrm{small}}$\", lw=1, zorder=50)\n", - "ax[0].plot(accuracies_over_time_ise[\"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all)\"], label=r\"$\\operatorname{gsu}_{\\mathrm{large}}$\", lw=1, zorder=100)\n", + "ax[0].plot(\n", + " accuracies_over_time_ise[\"tick(all)\"],\n", + " label=r\"$\\operatorname{tick}_{\\mathrm{all}}$\",\n", + " lw=1,\n", + ")\n", + "ax[0].plot(\n", + " accuracies_over_time_ise[\"quote(best)\"],\n", + " label=r\"$\\operatorname{quote}_{\\mathrm{nbbo}}$\",\n", + " lw=1,\n", + " zorder=20,\n", + ")\n", + "ax[0].plot(\n", + " accuracies_over_time_ise[\"quote(best)->quote(ex)->rev_tick(all)\"],\n", + " label=r\"$\\operatorname{gsu}_{\\mathrm{small}}$\",\n", + " lw=1,\n", + " zorder=50,\n", + ")\n", + "ax[0].plot(\n", + " accuracies_over_time_ise[\n", + " \"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all)\"\n", + " ],\n", + " label=r\"$\\operatorname{gsu}_{\\mathrm{large}}$\",\n", + " lw=1,\n", + " zorder=100,\n", + ")\n", "\n", - "ax[0].axvline(x=pd.Timestamp('2013-10-24'), linestyle='--', color='grey', linewidth=0.5)\n", - "ax[0].axvline(x=pd.Timestamp('2015-11-05'), linestyle='--', color='grey', linewidth=0.5)\n", + "ax[0].axvline(x=pd.Timestamp(\"2013-10-24\"), linestyle=\"--\", color=\"grey\", linewidth=0.5)\n", + "ax[0].axvline(x=pd.Timestamp(\"2015-11-05\"), linestyle=\"--\", color=\"grey\", linewidth=0.5)\n", "\n", "# ax[1].s\n", - 
"ax[1].plot(accuracies_over_time_cboe[\"tick(all)\"], label=\"$\\operatorname{tick}_{\\mathrm{all}}$\", lw=1)\n", - "ax[1].plot(accuracies_over_time_cboe[\"quote(best)\"], label=\"$\\operatorname{quote}_{\\mathrm{nbbo}}$\", lw=1, zorder=20)\n", - "ax[1].plot(accuracies_over_time_cboe[\"quote(best)->quote(ex)->rev_tick(all)\"], label=r\"$\\operatorname{gsu}_{\\mathrm{small}}$\", lw=1, zorder=50)\n", - "ax[1].plot(accuracies_over_time_cboe[\"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all)\"], label=r\"$\\operatorname{gsu}_{\\mathrm{large}}$\", lw=1, zorder=100)\n", + "ax[1].plot(\n", + " accuracies_over_time_cboe[\"tick(all)\"],\n", + " label=r\"$\\operatorname{tick}_{\\mathrm{all}}$\",\n", + " lw=1,\n", + ")\n", + "ax[1].plot(\n", + " accuracies_over_time_cboe[\"quote(best)\"],\n", + " label=r\"$\\operatorname{quote}_{\\mathrm{nbbo}}$\",\n", + " lw=1,\n", + " zorder=20,\n", + ")\n", + "ax[1].plot(\n", + " accuracies_over_time_cboe[\"quote(best)->quote(ex)->rev_tick(all)\"],\n", + " label=r\"$\\operatorname{gsu}_{\\mathrm{small}}$\",\n", + " lw=1,\n", + " zorder=50,\n", + ")\n", + "ax[1].plot(\n", + " accuracies_over_time_cboe[\n", + " \"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all)\"\n", + " ],\n", + " label=r\"$\\operatorname{gsu}_{\\mathrm{large}}$\",\n", + " lw=1,\n", + " zorder=100,\n", + ")\n", "\n", - "ax[1].axvline(x=pd.Timestamp('2015-11-05'), linestyle='--', color='grey', linewidth=0.5)\n", + "ax[1].axvline(x=pd.Timestamp(\"2015-11-05\"), linestyle=\"--\", color=\"grey\", linewidth=0.5)\n", "\n", - "#ax[1].legend(frameon=False, loc=\"lower center\", ncols=2, bbox_to_anchor=(0.5, -1))\n", + "# ax[1].legend(frameon=False, loc=\"lower center\", ncols=2, bbox_to_anchor=(0.5, -1))\n", "\n", "# y-axis\n", "ax[0].set_ylabel(\"Accuracy\")\n", "ax[1].set_ylabel(\"Accuracy\")\n", - "ax[0].set_ylim(0,100)\n", - "ax[0].yaxis.set_major_formatter(PercentFormatter(100.0,decimals=2))\n", + "ax[0].set_ylim(0, 100)\n", + "ax[0].yaxis.set_major_formatter(PercentFormatter(100.0, decimals=2))\n", "\n", "# first ise and last cboe\n", - "ax[0].set_xlim(accuracies_over_time_ise.index[0],accuracies_over_time_cboe.index[-1])\n", + "ax[0].set_xlim(accuracies_over_time_ise.index[0], accuracies_over_time_cboe.index[-1])\n", "\n", "# bins_dt = [pd.Timestamp(\"2000-01-01 00:00:00\"), pd.Timestamp(\"2013-10-24 23:59:00\"), pd.Timestamp(\"2015-11-05 23:59:00\"),pd.Timestamp(\"2099-12-31 23:59:59\")]\n", "# else:\n", @@ -145,10 +221,17 @@ "\n", "handles, labels = ax[1].get_legend_handles_labels()\n", "order = [0, 1, 2, 3]\n", - "ax[1].legend([handles[idx] for idx in order],[labels[idx] for idx in order], frameon=False, loc=\"lower center\", ncols=4, bbox_to_anchor=(0.5, -0.5))\n", + "ax[1].legend(\n", + " [handles[idx] for idx in order],\n", + " [labels[idx] for idx in order],\n", + " frameon=False,\n", + " loc=\"lower center\",\n", + " ncols=4,\n", + " bbox_to_anchor=(0.5, -0.5),\n", + ")\n", "\n", - "ax[0].set_title('ISE')\n", - "ax[1].set_title('CBOE')\n", + "ax[0].set_title(\"ISE\")\n", + "ax[1].set_title(\"CBOE\")\n", "\n", "# x-axis\n", "# ax.set_xlabel(\"Date\")\n", @@ -186,10 +269,10 @@ "\n", "artifact = run.use_artifact(MODEL)\n", "model_dir = artifact.download()\n", - " \n", - "with open(Path(model_dir, model_name), 'rb') as f:\n", + "\n", + "with open(Path(model_dir, model_name), \"rb\") as f:\n", " model = pickle.load(f)\n", - " \n", + "\n", "clf = model.clf" ] }, @@ -201,7 +284,10 @@ }, "outputs": [], "source": [ - "pretrain_data = 
[{'train_loss': d['train_loss'], 'val_loss': d['val_loss'], 'epoch': d['epoch']} for d in model._stats_pretrain_epoch]" + "pretrain_data = [\n", + " {\"train_loss\": d[\"train_loss\"], \"val_loss\": d[\"val_loss\"], \"epoch\": d[\"epoch\"]}\n", + " for d in model._stats_pretrain_epoch\n", + "]" ] }, { @@ -234,22 +320,22 @@ }, "outputs": [], "source": [ - "fig, axes = plt.subplots(1,figsize=(14*CM, 7*CM), sharex=True, sharey=True)\n", + "fig, axes = plt.subplots(1, figsize=(14 * CM, 7 * CM), sharex=True, sharey=True)\n", "\n", "axes.plot(stats_pretrain, lw=1)\n", "axes.set_ylabel(\"BCE Loss\")\n", "\n", "axes.set_xlabel(\"Epoch\")\n", - "axes.set_xlim([0,19])\n", - "#axes[1].plot(stats_exs.iloc[:,[5,6,7,8]], lw=1)\n", + "axes.set_xlim([0, 19])\n", + "# axes[1].plot(stats_exs.iloc[:,[5,6,7,8]], lw=1)\n", "\n", "# axes[0].set_title(\"ISE\")\n", "# axes[1].set_title(\"CBOE\")\n", "\n", - "#axes.set_xlim(pd.to_datetime(\"2005-05-02\"), pd.to_datetime(\"2017-10-31\"))\n", + "# axes.set_xlim(pd.to_datetime(\"2005-05-02\"), pd.to_datetime(\"2017-10-31\"))\n", "# axes.set_ylim([0,20.0])\n", "# axes.yaxis.set_major_formatter(PercentFormatter(100.0,decimals=2))\n", - "#axes.xaxis.set_major_formatter(DateFormatter('%b %Y'))\n", + "# axes.xaxis.set_major_formatter(DateFormatter('%b %Y'))\n", "\n", "# labels = [\"_\",\"_\",\"At Mid (ISE)\", \"At Mid (CBOE)\"]\n", "\n", @@ -257,14 +343,18 @@ "\n", "labels = [\"Loss (Train)\", \"Loss (Val)\"]\n", "\n", - "axes.legend(labels, frameon=False, loc = \"lower center\" ,bbox_to_anchor = (0, -0.7, 1, 1), ncols=2)\n", + "axes.legend(\n", + " labels, frameon=False, loc=\"lower center\", bbox_to_anchor=(0, -0.7, 1, 1), ncols=2\n", + ")\n", "\n", "# axes.legend()\n", "\n", "# axes.legend(labels, frameon=False, loc = \"lower center\",bbox_to_anchor=(0.5, -0.5), ncols=2)\n", "\n", "plt.tight_layout()\n", - "plt.savefig(\"../reports/Graphs/transformer_ise_pretrain_classical.pdf\", bbox_inches=\"tight\")" + "plt.savefig(\n", + " \"../reports/Graphs/transformer_ise_pretrain_classical.pdf\", bbox_inches=\"tight\"\n", + ")" ] }, { @@ -282,8 +372,12 @@ }, "outputs": [], "source": [ - "tsize_ise = pd.read_parquet(\"gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_all-accuracies-tsize-ex.parquet\")\n", - "tsize_cboe = pd.read_parquet(\"gs://thesis-bucket-option-trade-classification/data/results/cboe_supervised_all-accuracies-tsize-ex.parquet\")" + "tsize_ise = pd.read_parquet(\n", + " \"gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_all-accuracies-tsize-ex.parquet\"\n", + ")\n", + "tsize_cboe = pd.read_parquet(\n", + " \"gs://thesis-bucket-option-trade-classification/data/results/cboe_supervised_all-accuracies-tsize-ex.parquet\"\n", + ")" ] }, { @@ -294,8 +388,12 @@ }, "outputs": [], "source": [ - "stats_ise = pd.read_parquet('gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_all-classical-stats-over-time.parquet')\n", - "stats_cboe = pd.read_parquet('gs://thesis-bucket-option-trade-classification/data/results/cboe_supervised_all-classical-stats-over-time.parquet')" + "stats_ise = pd.read_parquet(\n", + " \"gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_all-classical-stats-over-time.parquet\"\n", + ")\n", + "stats_cboe = pd.read_parquet(\n", + " \"gs://thesis-bucket-option-trade-classification/data/results/cboe_supervised_all-classical-stats-over-time.parquet\"\n", + ")" ] }, { @@ -306,7 +404,7 @@ }, "outputs": [], "source": [ - "stats_exs = 
pd.concat([stats_ise,stats_cboe], axis=1)" + "stats_exs = pd.concat([stats_ise, stats_cboe], axis=1)" ] }, { @@ -317,7 +415,7 @@ }, "outputs": [], "source": [ - "tsize_exs = pd.concat([tsize_ise,tsize_cboe], axis=1)" + "tsize_exs = pd.concat([tsize_ise, tsize_cboe], axis=1)" ] }, { @@ -339,33 +437,45 @@ }, "outputs": [], "source": [ - "fig, ax = plt.subplots(2,1,figsize=(14*CM, 6*CM), sharex=True, sharey=True)\n", + "fig, ax = plt.subplots(2, 1, figsize=(14 * CM, 6 * CM), sharex=True, sharey=True)\n", "\n", - "ax[0].axvline(x=pd.to_datetime(\"2005-05-02\"), linestyle='--', color='grey', linewidth=0.5)\n", - "ax[0].axvline(x=pd.to_datetime(\"2013-10-24\"), linestyle='--', color='grey', linewidth=0.5)\n", - "ax[0].axvline(x=pd.to_datetime(\"2015-11-05\"), linestyle='--', color='grey', linewidth=0.5)\n", + "ax[0].axvline(\n", + " x=pd.to_datetime(\"2005-05-02\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", + "ax[0].axvline(\n", + " x=pd.to_datetime(\"2013-10-24\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", + "ax[0].axvline(\n", + " x=pd.to_datetime(\"2015-11-05\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", "\n", - "ax[0].plot(tsize_exs.iloc[:,1], lw=1, label=\"CBOE\")\n", - "ax[0].plot(tsize_exs.iloc[:,0], lw=1, label=\"ISE\")\n", + "ax[0].plot(tsize_exs.iloc[:, 1], lw=1, label=\"CBOE\")\n", + "ax[0].plot(tsize_exs.iloc[:, 0], lw=1, label=\"ISE\")\n", "ax[0].set_ylabel(\"Accuracy\")\n", "ax[0].set_xlabel(None)\n", "\n", - "ax[1].axvline(x=pd.to_datetime(\"2005-05-02\"), linestyle='--', color='grey', linewidth=0.5)\n", - "ax[1].axvline(x=pd.to_datetime(\"2013-10-24\"), linestyle='--', color='grey', linewidth=0.5)\n", - "ax[1].axvline(x=pd.to_datetime(\"2015-11-05\"), linestyle='--', color='grey', linewidth=0.5)\n", + "ax[1].axvline(\n", + " x=pd.to_datetime(\"2005-05-02\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", + "ax[1].axvline(\n", + " x=pd.to_datetime(\"2013-10-24\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", + "ax[1].axvline(\n", + " x=pd.to_datetime(\"2015-11-05\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", "\n", - "ax[1].plot(stats_exs.iloc[:,9], lw=1, label=\"CBOE\")\n", - "ax[1].plot(stats_exs.iloc[:,4], lw=1, label=\"ISE\")\n", + "ax[1].plot(stats_exs.iloc[:, 9], lw=1, label=\"CBOE\")\n", + "ax[1].plot(stats_exs.iloc[:, 4], lw=1, label=\"ISE\")\n", "ax[1].set_ylabel(\"Coverage\")\n", "ax[0].set_xlabel(None)\n", "\n", "ax[0].set_xlim(pd.to_datetime(\"2005-05-02\"), pd.to_datetime(\"2017-10-31\"))\n", - "ax[0].set_ylim(0,100)\n", - "ax[0].yaxis.set_major_formatter(PercentFormatter(100.0,decimals=2))\n", + "ax[0].set_ylim(0, 100)\n", + "ax[0].yaxis.set_major_formatter(PercentFormatter(100.0, decimals=2))\n", "# ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))\n", - "ax[0].xaxis.set_major_formatter(DateFormatter('%b %Y'))\n", + "ax[0].xaxis.set_major_formatter(DateFormatter(\"%b %Y\"))\n", "\n", - "ax[1].legend(frameon=False, loc = \"lower center\",bbox_to_anchor=(0.5, -0.7), ncols=2)\n", + "ax[1].legend(frameon=False, loc=\"lower center\", bbox_to_anchor=(0.5, -0.7), ncols=2)\n", "\n", "# plt.show()\n", "\n", @@ -387,8 +497,12 @@ }, "outputs": [], "source": [ - "na_over_time_ise = pd.read_parquet(\"gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_all-missing-over-time.parquet\")\n", - "na_over_time_cboe = pd.read_parquet(\"gs://thesis-bucket-option-trade-classification/data/results/cboe_supervised_all-missing-over-time.parquet\")" + 
"na_over_time_ise = pd.read_parquet(\n", + " \"gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_all-missing-over-time.parquet\"\n", + ")\n", + "na_over_time_cboe = pd.read_parquet(\n", + " \"gs://thesis-bucket-option-trade-classification/data/results/cboe_supervised_all-missing-over-time.parquet\"\n", + ")" ] }, { @@ -417,13 +531,24 @@ }, "outputs": [], "source": [ - "filter = ['tick(all)', 'quote(best)','quote(best)->quote(ex)->rev_tick(all)','trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all)']\n", + "filter = [\n", + " \"tick(all)\",\n", + " \"quote(best)\",\n", + " \"quote(best)->quote(ex)->rev_tick(all)\",\n", + " \"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all)\",\n", + "]\n", "\n", - "fig, axes = plt.subplots(2,1,figsize=(14*CM, 9*CM), sharex=True, sharey=True)\n", + "fig, axes = plt.subplots(2, 1, figsize=(14 * CM, 9 * CM), sharex=True, sharey=True)\n", "\n", - "axes[0].axvline(x=pd.to_datetime(\"2015-06-15\"), linestyle='--', color='grey', linewidth=0.5)\n", - "axes[0].axvline(x=pd.to_datetime(\"2016-10-12\"), linestyle='--', color='grey', linewidth=0.5)\n", - "axes[1].axvline(x=pd.to_datetime(\"2016-10-12\"), linestyle='--', color='grey', linewidth=0.5)\n", + "axes[0].axvline(\n", + " x=pd.to_datetime(\"2015-06-15\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", + "axes[0].axvline(\n", + " x=pd.to_datetime(\"2016-10-12\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", + "axes[1].axvline(\n", + " x=pd.to_datetime(\"2016-10-12\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", "\n", "\n", "axes[0].plot(1 - na_over_time_ise[filter], lw=1)\n", @@ -437,15 +562,27 @@ "axes[1].set_ylabel(\"Coverage\")\n", "axes[0].set_xlabel(None)\n", "# ax.set_ylim(45, 100)\n", - "axes[0].yaxis.set_major_formatter(PercentFormatter(1.0,decimals=2))\n", + "axes[0].yaxis.set_major_formatter(PercentFormatter(1.0, decimals=2))\n", "# ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))\n", - "axes[0].xaxis.set_major_formatter(DateFormatter('%b %Y'))\n", + "axes[0].xaxis.set_major_formatter(DateFormatter(\"%b %Y\"))\n", "\n", "ylim = axes[0].get_ylim()\n", "\n", "axes[0].set_xlim(pd.to_datetime(\"2005-05-02\"), pd.to_datetime(\"2017-10-31\"))\n", "\n", - "plt.legend([\"_\",\"$\\operatorname{tick}_{\\mathrm{all}}$\", \"$\\operatorname{quote}_{\\mathrm{nbbo}}$\",\"$\\operatorname{gsu}_{\\mathrm{small}}$\",\"$\\operatorname{gsu}_{\\mathrm{large}}$\"],frameon=False, loc = \"lower center\", bbox_to_anchor=(0.5, -0.5), ncols=4)\n", + "plt.legend(\n", + " [\n", + " \"_\",\n", + " r\"$\\operatorname{tick}_{\\mathrm{all}}$\",\n", + " r\"$\\operatorname{quote}_{\\mathrm{nbbo}}$\",\n", + " r\"$\\operatorname{gsu}_{\\mathrm{small}}$\",\n", + " r\"$\\operatorname{gsu}_{\\mathrm{large}}$\",\n", + " ],\n", + " frameon=False,\n", + " loc=\"lower center\",\n", + " bbox_to_anchor=(0.5, -0.5),\n", + " ncols=4,\n", + ")\n", "\n", "plt.tight_layout()\n", "\n", @@ -460,7 +597,7 @@ }, "outputs": [], "source": [ - "stats_exs['quote_best_mid_na'].mean()" + "stats_exs[\"quote_best_mid_na\"].mean()" ] }, { @@ -471,14 +608,18 @@ }, "outputs": [], "source": [ - "fig, axes = plt.subplots(1,figsize=(14*CM, 6*CM), sharex=True, sharey=True)\n", + "fig, axes = plt.subplots(1, figsize=(14 * CM, 6 * CM), sharex=True, sharey=True)\n", "\n", - "axes.axvline(x=pd.to_datetime(\"2015-06-15\"), linestyle='--', color='grey', linewidth=0.5)\n", - "axes.axvline(x=pd.to_datetime(\"2016-10-12\"), linestyle='--', 
color='grey', linewidth=0.5)\n", + "axes.axvline(\n", + " x=pd.to_datetime(\"2015-06-15\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", + "axes.axvline(\n", + " x=pd.to_datetime(\"2016-10-12\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", "# axes[1].axvline(x=pd.to_datetime(\"2016-10-12\"), linestyle='--', color='grey', linewidth=0.5)\n", "\n", - "axes.plot(stats_exs.iloc[:,[4,10]], lw=1)\n", - "#axes[1].plot(stats_exs.iloc[:,[5,6,7,8]], lw=1)\n", + "axes.plot(stats_exs.iloc[:, [4, 10]], lw=1)\n", + "# axes[1].plot(stats_exs.iloc[:,[5,6,7,8]], lw=1)\n", "\n", "# axes[0].set_title(\"ISE\")\n", "# axes[1].set_title(\"CBOE\")\n", @@ -488,14 +629,16 @@ "axes.invert_yaxis()\n", "\n", "axes.set_xlim(pd.to_datetime(\"2005-05-02\"), pd.to_datetime(\"2017-10-31\"))\n", - "axes.set_ylim([100-lim*100 for lim in ylim])\n", - "axes.yaxis.set_major_formatter(PercentFormatter(100.0,decimals=2))\n", - "axes.xaxis.set_major_formatter(DateFormatter('%b %Y'))\n", + "axes.set_ylim([100 - lim * 100 for lim in ylim])\n", + "axes.yaxis.set_major_formatter(PercentFormatter(100.0, decimals=2))\n", + "axes.xaxis.set_major_formatter(DateFormatter(\"%b %Y\"))\n", "axes.set_ylabel(\"Percentage\")\n", "\n", - "labels = [\"_\",\"_\",\"At Mid (ISE)\", \"At Mid (CBOE)\"]\n", + "labels = [\"_\", \"_\", \"At Mid (ISE)\", \"At Mid (CBOE)\"]\n", "\n", - "plt.legend(labels, frameon=False, loc = \"lower center\",bbox_to_anchor=(0.5, -0.5), ncols=2)\n", + "plt.legend(\n", + " labels, frameon=False, loc=\"lower center\", bbox_to_anchor=(0.5, -0.5), ncols=2\n", + ")\n", "\n", "plt.tight_layout()\n", "plt.savefig(\"../reports/Graphs/classical_at_mid_over_time.pdf\", bbox_inches=\"tight\")" @@ -509,25 +652,39 @@ }, "outputs": [], "source": [ - "fig, axes = plt.subplots(2,1,figsize=(14*CM, 9*CM), sharex=True, sharey=True)\n", + "fig, axes = plt.subplots(2, 1, figsize=(14 * CM, 9 * CM), sharex=True, sharey=True)\n", "\n", - "axes[0].axvline(x=pd.to_datetime(\"2015-06-15\"), linestyle='--', color='grey', linewidth=0.5)\n", - "axes[0].axvline(x=pd.to_datetime(\"2016-10-12\"), linestyle='--', color='grey', linewidth=0.5)\n", - "axes[1].axvline(x=pd.to_datetime(\"2016-10-12\"), linestyle='--', color='grey', linewidth=0.5)\n", + "axes[0].axvline(\n", + " x=pd.to_datetime(\"2015-06-15\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", + "axes[0].axvline(\n", + " x=pd.to_datetime(\"2016-10-12\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", + "axes[1].axvline(\n", + " x=pd.to_datetime(\"2016-10-12\"), linestyle=\"--\", color=\"grey\", linewidth=0.5\n", + ")\n", "\n", - "axes[0].plot(stats_exs.iloc[:,[0,1,2,3]], lw=1)\n", - "axes[1].plot(stats_exs.iloc[:,[5,6,7,8]], lw=1)\n", + "axes[0].plot(stats_exs.iloc[:, [0, 1, 2, 3]], lw=1)\n", + "axes[1].plot(stats_exs.iloc[:, [5, 6, 7, 8]], lw=1)\n", "\n", "axes[0].set_title(\"ISE\")\n", "axes[1].set_title(\"CBOE\")\n", "\n", "axes[0].set_xlim(pd.to_datetime(\"2005-05-02\"), pd.to_datetime(\"2017-10-31\"))\n", - "axes[0].yaxis.set_major_formatter(PercentFormatter(100.0,decimals=2))\n", - "axes[0].xaxis.set_major_formatter(DateFormatter('%b %Y'))\n", - "\n", - "labels = [\"_\",\"$\\operatorname{tick}_{\\mathrm{ex}}$\", \"$\\operatorname{tick}_{\\mathrm{all}}$\", \"$\\operatorname{quote}_{\\mathrm{nbbo}}$\", \"$\\operatorname{quote}_{\\mathrm{ex}}$\"]\n", + "axes[0].yaxis.set_major_formatter(PercentFormatter(100.0, decimals=2))\n", + "axes[0].xaxis.set_major_formatter(DateFormatter(\"%b %Y\"))\n", + "\n", + "labels = [\n", + " \"_\",\n", + 
" r\"$\\operatorname{tick}_{\\mathrm{ex}}$\",\n", + " r\"$\\operatorname{tick}_{\\mathrm{all}}$\",\n", + " r\"$\\operatorname{quote}_{\\mathrm{nbbo}}$\",\n", + " r\"$\\operatorname{quote}_{\\mathrm{ex}}$\",\n", + "]\n", "\n", - "plt.legend(labels, frameon=False, loc = \"lower center\",bbox_to_anchor=(0.5, -0.7), ncols=4)\n", + "plt.legend(\n", + " labels, frameon=False, loc=\"lower center\", bbox_to_anchor=(0.5, -0.7), ncols=4\n", + ")\n", "\n", "plt.tight_layout()\n", "plt.savefig(\"../reports/Graphs/not_applicable_over_time.pdf\", bbox_inches=\"tight\")" @@ -574,22 +731,22 @@ "outputs": [], "source": [ "# 100 linearly spaced numbers\n", - "x = np.linspace(-2,2,100)\n", + "x = np.linspace(-2, 2, 100)\n", "# the function, which is y = x^2 here\n", - "y = np.log(1 + np.exp(-2*x))\n", + "y = np.log(1 + np.exp(-2 * x))\n", "\n", "# setting the axes at the centre\n", "fig = plt.figure(figsize=(12 * CM, 6 * CM))\n", "ax = fig.add_subplot(1, 1, 1)\n", "\n", - "ax.xaxis.set_ticks_position('bottom')\n", - "ax.yaxis.set_ticks_position('left')\n", + "ax.xaxis.set_ticks_position(\"bottom\")\n", + "ax.yaxis.set_ticks_position(\"left\")\n", "\n", - "ax.set_xlabel(\"Margin $y-F_m(\\mathbf{x})$\")\n", + "ax.set_xlabel(r\"Margin $y-F_m(\\mathbf{x})$\")\n", "ax.set_ylabel(\"Loss\")\n", "\n", "# plot the function\n", - "plt.plot(x,y, label=\"cross-entropy loss\")\n", + "plt.plot(x, y, label=\"cross-entropy loss\")\n", "plt.legend(frameon=False)\n", "plt.savefig(\"../reports/Graphs/cross-entropy-loss.pdf\", bbox_inches=\"tight\")\n", "# show the p" @@ -633,8 +790,10 @@ "# y_2 = regr_2.predict(X_test)\n", "\n", "# Plot the results\n", - "plt.figure(figsize=(8*CM,6*CM))\n", - "plt.scatter(X, y, s=20, c=\"yellowgreen\", edgecolors=\"black\", linewidth=0.5, label=\"Data\")\n", + "plt.figure(figsize=(8 * CM, 6 * CM))\n", + "plt.scatter(\n", + " X, y, s=20, c=\"yellowgreen\", edgecolors=\"black\", linewidth=0.5, label=\"Data\"\n", + ")\n", "plt.plot(X_test, y_1, color=\"cornflowerblue\", label=\"Approximation\", linewidth=1)\n", "# plt.plot(X_test, y_2, color=\"yellowgreen\", label=\"max_depth=5\", linewidth=2)\n", "plt.xlabel(\"Feature\")\n", @@ -667,7 +826,7 @@ "def to_mpl(start: str, end: str):\n", " mpl_start = mdates.date2num(pd.to_datetime(start))\n", " mpl_end = mdates.date2num(pd.to_datetime(end))\n", - " return mpl_start, mpl_end - mpl_start\n" + " return mpl_start, mpl_end - mpl_start" ] }, { @@ -680,7 +839,7 @@ "outputs": [], "source": [ "def to_pos(span: tuple):\n", - " return span[0] + 0.5 * span[1]\n" + " return span[0] + 0.5 * span[1]" ] }, { @@ -701,7 +860,13 @@ "\n", "# ise pretraining\n", "span = [to_mpl(\"2013-04-23\", \"2013-10-24\")]\n", - "ax.broken_barh(span, (2.5, 1), facecolors=(168/255,209/255,238/255), edgecolor=\"black\", linewidth=0.8)\n", + "ax.broken_barh(\n", + " span,\n", + " (2.5, 1),\n", + " facecolors=(168 / 255, 209 / 255, 238 / 255),\n", + " edgecolor=\"black\",\n", + " linewidth=0.8,\n", + ")\n", "\n", "# ax.text(\n", "# x=to_pos(span[0]),\n", @@ -713,10 +878,16 @@ "# fontsize=\"small\",\n", "# )\n", "\n", - "arrow_properties = dict(facecolor='black', arrowstyle='->')\n", - "ax.annotate(\"train\", (to_pos(span[0]), 3), xytext=(30, 0),\n", - " textcoords='offset points', ha='center', va='center',\n", - " arrowprops=arrow_properties)\n", + "arrow_properties = dict(facecolor=\"black\", arrowstyle=\"->\")\n", + "ax.annotate(\n", + " \"train\",\n", + " (to_pos(span[0]), 3),\n", + " xytext=(30, 0),\n", + " textcoords=\"offset points\",\n", + " ha=\"center\",\n", + " 
va=\"center\",\n", + " arrowprops=arrow_properties,\n", + ")\n", "\n", "\n", "spans = [\n", @@ -729,7 +900,11 @@ "ax.broken_barh(\n", " spans,\n", " (1.2, 1),\n", - " facecolors=((168/255,209/255,238/255), (204/255,212/255,151/255), (239/255,171/255,170/255)),\n", + " facecolors=(\n", + " (168 / 255, 209 / 255, 238 / 255),\n", + " (204 / 255, 212 / 255, 151 / 255),\n", + " (239 / 255, 171 / 255, 170 / 255),\n", + " ),\n", " edgecolor=\"black\",\n", " linewidth=0.8,\n", ")\n", @@ -761,7 +936,7 @@ "bx.broken_barh(\n", " spans,\n", " (1.85, 1),\n", - " facecolors=(239/255,171/255,170/255),\n", + " facecolors=(239 / 255, 171 / 255, 170 / 255),\n", " edgecolor=\"black\",\n", " linewidth=0.8,\n", ")\n", @@ -780,7 +955,6 @@ " )\n", "\n", "\n", - "\n", "# Modify y-axis tick labels\n", "ax.set_yticks([1.7, 3], labels=[\"ISE\\n Labeled\", \"ISE\\n Unlabeled\"])\n", "bx.set_yticks([2.35], labels=[\"CBOE\\n Labeled\"])\n", @@ -794,7 +968,7 @@ "plt.xlabel(\"Date\")\n", "\n", "# plt.show()\n", - "plt.savefig(\"../reports/Graphs/train-test-split.pdf\", bbox_inches=\"tight\")\n" + "plt.savefig(\"../reports/Graphs/train-test-split.pdf\", bbox_inches=\"tight\")" ] }, { @@ -830,7 +1004,7 @@ " if titles:\n", " ax.set_title(titles[j])\n", " fig.colorbar(pcm, ax=axes)\n", - " plt.savefig(\"../reports/Graphs/attention-maps.pdf\", bbox_inches=\"tight\")\n" + " plt.savefig(\"../reports/Graphs/attention-maps.pdf\", bbox_inches=\"tight\")" ] }, { @@ -845,7 +1019,7 @@ "attention_weights = torch.rand(size=(2, 4, 10, 10))\n", "show_heatmaps(\n", " attention_weights, xlabel=\"Keys\", ylabel=\"Queries\", figsize=(12 * CM, 6 * CM)\n", - ")\n" + ")" ] }, { @@ -875,7 +1049,7 @@ "\n", " pos_encoding = angle_rads[np.newaxis, ...]\n", "\n", - " return pos_encoding\n" + " return pos_encoding" ] }, { @@ -900,7 +1074,7 @@ "plt.ylim((tokens, 0))\n", "plt.ylabel(\"token position $t$\")\n", "plt.colorbar()\n", - "plt.savefig(\"../reports/Graphs/positional-encoding.pdf\", bbox_inches=\"tight\")\n" + "plt.savefig(\"../reports/Graphs/positional-encoding.pdf\", bbox_inches=\"tight\")" ] }, { @@ -927,13 +1101,10 @@ "\n", "# import numpy as np\n", "# import matplotlib.pyplot as plt\n", + "from matplotlib.ticker import MaxNLocator # needed for integer only on axis\n", "from sklearn import datasets\n", - "from sklearn.svm import SVC\n", - "from sklearn.semi_supervised import LabelSpreading\n", "from sklearn.semi_supervised import SelfTrainingClassifier\n", - "\n", - "from matplotlib.ticker import MaxNLocator # needed for integer only on axis\n", - "from matplotlib.lines import Line2D # for creating the custom legend\n", + "from sklearn.svm import SVC\n", "\n", "iris = datasets.load_iris()\n", "\n", @@ -982,46 +1153,50 @@ "y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n", "xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", "\n", - "PROB_DOT_SCALE = 40 # modifier to scale the probability dots\n", - "PROB_DOT_SCALE_POWER = 3 # exponential used to increase/decrease size of prob dots\n", - "TRUE_DOT_SIZE = 50 #\n", + "PROB_DOT_SCALE = 40 # modifier to scale the probability dots\n", + "PROB_DOT_SCALE_POWER = 3 # exponential used to increase/decrease size of prob dots\n", + "TRUE_DOT_SIZE = 50 #\n", "\n", - "redish = '#d73027'\n", - "orangeish = '#fc8d59'\n", - "yellowish = '#fee090'\n", - "blueish = '#4575b4'\n", - "colormap = np.array([redish,blueish,orangeish])\n", + "redish = \"#d73027\"\n", + "orangeish = \"#fc8d59\"\n", + "yellowish = \"#fee090\"\n", + "blueish = \"#4575b4\"\n", + "colormap = 
np.array([redish, blueish, orangeish])\n", "\n", "color_map = {-1: (1, 1, 1), 0: (0, 0, 0.9), 1: (1, 0, 0), 2: (0.8, 0.6, 0)}\n", "\n", - "ax = plt.figure(figsize=(12*CM, 6*CM))\n", + "ax = plt.figure(figsize=(12 * CM, 6 * CM))\n", "\n", "classifiers = (rbf_svc, st30)\n", "for i, (clf, y_train, title) in enumerate(classifiers):\n", " # Plot the decision boundary. For that, we will assign a color to each\n", " # point in the mesh [x_min, x_max]x[y_min, y_max].\n", - " plt.subplot(1, 2, i+1)\n", + " plt.subplot(1, 2, i + 1)\n", " Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n", - " \n", - " z_proba = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])\n", + "\n", + " z_proba = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])\n", " # the size of each probability dot\n", - " Z_size = np.max(Z_proba, axis=1) \n", - " \n", + " Z_size = np.max(Z_proba, axis=1)\n", + "\n", " Z = Z.reshape(xx.shape)\n", - " \n", - " tri = plt.tricontourf(xx.flatten(), yy.flatten(), z_proba[:,1], levels=14, cmap=\"RdBu_r\")\n", - " plt.contour(xx, yy, z_proba[:,1].reshape(xx.shape), 15, linewidths=0.5, colors=\"k\")\n", - " \n", - " \n", + "\n", + " tri = plt.tricontourf(\n", + " xx.flatten(), yy.flatten(), z_proba[:, 1], levels=14, cmap=\"RdBu_r\"\n", + " )\n", + " plt.contour(xx, yy, z_proba[:, 1].reshape(xx.shape), 15, linewidths=0.5, colors=\"k\")\n", + "\n", " # Plot also the training points\n", " colors = [color_map[y] for y in y_train]\n", "\n", - " \n", - " plt.scatter(X[:, 0], X[:, 1], c=colors, s=20, edgecolors=\"black\", linewidth=0.5, zorder=10)\n", + " plt.scatter(\n", + " X[:, 0], X[:, 1], c=colors, s=20, edgecolors=\"black\", linewidth=0.5, zorder=10\n", + " )\n", "\n", " plt.title(title, y=-0.3)\n", "\n", - "plt.savefig(\"../reports/Graphs/semi-supervised-decision-boundary.pdf\", bbox_inches=\"tight\")\n", + "plt.savefig(\n", + " \"../reports/Graphs/semi-supervised-decision-boundary.pdf\", bbox_inches=\"tight\"\n", + ")\n", "# plt.suptitle(\"Unlabeled points are colored white\", y=0.1)\n", "plt.show()" ] @@ -1047,13 +1222,13 @@ "# set study globally here\n", "# study = \"1gzk7msy.optuna:v49\" # gbm classical\n", "# study = \"3vntumoi.optuna:v49\" # gbm classical-size\n", - "study = \"2t5zo50f.optuna:v49\" # gbm ml\n", + "study = \"2t5zo50f.optuna:v49\" # gbm ml\n", "\n", "# study = \"37lymmzc.optuna:v49\" # gbm semi-classical\n", "# study = \"1vmti6db.optuna:v49\" # gbm semi classical-size\n", "# study = \"t55nd8r0.optuna:v49\" # gbm semi ml\n", "\n", - "# transformer \n", + "# transformer\n", "# study = \"3jpe46s1.optuna:v9\" # transformer classical\n", "# study = \"1qx3ul4j.optuna:v9\" # transformer classical-size\n", "# study = \"2h81aiow.optuna:v9\" # transformer ml" @@ -1100,7 +1275,7 @@ }, "outputs": [], "source": [ - "file = open(f\"./artifacts/{study_id}.optuna:{version}/{study_id}.optuna\",'rb')\n", + "file = open(f\"./artifacts/{study_id}.optuna:{version}/{study_id}.optuna\", \"rb\")\n", "study = pickle.load(file)\n", "\n", "sampler = study.sampler\n", @@ -1118,7 +1293,20 @@ }, "outputs": [], "source": [ - "LUT_LABELS = {\"Objective Value\": \"Accuracy\", \"bagging_temperature\": \"Bagging Temp.\", \"depth\":\"Depth\", \"l2_leaf_reg\": \"$\\ell_2$ Leaf Reg.\" , \"learning_rate\" : \"$\\eta$\", \"random_strength\": \"Rand. 
Str.\", \"attention_dropout\": \"Att Dropout\", \"d_token\":\"$d_e$\",\"ffn_dropout\":\"FFN Dropout\", \"weight_decay\":\"$\\lambda$\",\"lr\": \"$\\eta$\",\"n_blocks\":\"$L$\"}" + "LUT_LABELS = {\n", + " \"Objective Value\": \"Accuracy\",\n", + " \"bagging_temperature\": \"Bagging Temp.\",\n", + " \"depth\": \"Depth\",\n", + " \"l2_leaf_reg\": r\"$\\ell_2$ Leaf Reg.\",\n", + " \"learning_rate\": r\"$\\eta$\",\n", + " \"random_strength\": \"Rand. Str.\",\n", + " \"attention_dropout\": \"Att Dropout\",\n", + " \"d_token\": \"$d_e$\",\n", + " \"ffn_dropout\": \"FFN Dropout\",\n", + " \"weight_decay\": r\"$\\lambda$\",\n", + " \"lr\": r\"$\\eta$\",\n", + " \"n_blocks\": \"$L$\",\n", + "}" ] }, { @@ -1130,36 +1318,32 @@ }, "outputs": [], "source": [ - "from typing import Callable\n", - "from typing import Dict\n", - "from typing import List\n", - "from typing import Optional\n", - "from typing import Sequence\n", - "from typing import Tuple\n", - "from typing import Union\n", + "from collections.abc import Sequence\n", + "from typing import Callable, Dict, List, Optional, Tuple, Union\n", "\n", "import numpy as np\n", - "\n", - "from optuna._experimental import experimental_func\n", "from optuna._imports import try_import\n", "from optuna.logging import get_logger\n", "from optuna.study import Study\n", "from optuna.trial import FrozenTrial\n", - "from optuna.visualization._contour import _AxisInfo\n", - "from optuna.visualization._contour import _ContourInfo\n", - "from optuna.visualization._contour import _get_contour_info\n", - "from optuna.visualization._contour import _SubContourInfo\n", + "from optuna.visualization._contour import (\n", + " _AxisInfo,\n", + " _ContourInfo,\n", + " _get_contour_info,\n", + " _SubContourInfo,\n", + ")\n", "from optuna.visualization.matplotlib._matplotlib_imports import _imports\n", "\n", - "\n", "with try_import() as _optuna_imports:\n", " import scipy\n", "\n", "if _imports.is_successful():\n", - " from optuna.visualization.matplotlib._matplotlib_imports import Axes\n", - " from optuna.visualization.matplotlib._matplotlib_imports import Colormap\n", - " from optuna.visualization.matplotlib._matplotlib_imports import ContourSet\n", - " from optuna.visualization.matplotlib._matplotlib_imports import plt\n", + " from optuna.visualization.matplotlib._matplotlib_imports import (\n", + " Axes,\n", + " Colormap,\n", + " ContourSet,\n", + " plt,\n", + " )\n", "\n", "_logger = get_logger(__name__)\n", "\n", @@ -1187,7 +1371,6 @@ " those of the Plotly-based :func:`~optuna.visualization.plot_contour`.\n", "\n", " Example:\n", - "\n", " The following code snippet shows how to plot the parameter relationship as contour plot.\n", "\n", " .. 
plot::\n", @@ -1228,15 +1411,12 @@ " The colormap is reversed when the ``target`` argument isn't :obj:`None` or ``direction``\n", " of :class:`~optuna.study.Study` is ``minimize``.\n", " \"\"\"\n", - "\n", " _imports.check()\n", " info = _get_contour_info(study, params, target, target_name)\n", " return _get_contour_plot(info)\n", "\n", "\n", - "\n", "def _get_contour_plot(info: _ContourInfo) -> \"Axes\":\n", - "\n", " sorted_params = info.sorted_params\n", " sub_plot_infos = info.sub_plot_infos\n", " reverse_scale = info.reverse_scale\n", @@ -1258,7 +1438,7 @@ " axcb.set_label(\"Accuracy\")\n", " else:\n", " # Set up the graph style.\n", - " fig, axs = plt.subplots(n_params, n_params, figsize=(15 *CM, 15 *CM))\n", + " fig, axs = plt.subplots(n_params, n_params, figsize=(15 * CM, 15 * CM))\n", " cmap = _set_cmap(reverse_scale)\n", "\n", " # Prepare data and draw contour plots.\n", @@ -1276,7 +1456,7 @@ "\n", " # Set the formatter for the colorbar\n", " axcb.ax.yaxis.set_major_formatter(formatter)\n", - " \n", + "\n", " axcb.set_label(\"Accuracy\")\n", "\n", " return axs\n", @@ -1324,7 +1504,6 @@ " List[Union[int, float]],\n", " List[Union[int, float]],\n", "]:\n", - "\n", " x_values = []\n", " y_values = []\n", " z_values = []\n", @@ -1344,7 +1523,6 @@ " axis: _AxisInfo,\n", " values: Sequence[Union[str, float]],\n", " ) -> Tuple[np.ndarray, List[str], List[int], List[Union[int, float]]]:\n", - "\n", " # Convert categorical values to int.\n", " cat_param_labels = [] # type: List[str]\n", " cat_param_pos = [] # type: List[int]\n", @@ -1359,19 +1537,25 @@ "\n", " # For x and y, create 1-D array of evenly spaced coordinates on linear or log scale.\n", " if axis.is_log:\n", - " ci = np.logspace(np.log10(axis.range[0]), np.log10(axis.range[1]), CONTOUR_POINT_NUM)\n", + " ci = np.logspace(\n", + " np.log10(axis.range[0]), np.log10(axis.range[1]), CONTOUR_POINT_NUM\n", + " )\n", " else:\n", " ci = np.linspace(axis.range[0], axis.range[1], CONTOUR_POINT_NUM)\n", "\n", " return ci, cat_param_labels, cat_param_pos, list(returned_values)\n", "\n", - " xi, cat_param_labels_x, cat_param_pos_x, transformed_x_values = _calculate_axis_data(\n", - " xaxis,\n", - " x_values,\n", + " xi, cat_param_labels_x, cat_param_pos_x, transformed_x_values = (\n", + " _calculate_axis_data(\n", + " xaxis,\n", + " x_values,\n", + " )\n", " )\n", - " yi, cat_param_labels_y, cat_param_pos_y, transformed_y_values = _calculate_axis_data(\n", - " yaxis,\n", - " y_values,\n", + " yi, cat_param_labels_y, cat_param_pos_y, transformed_y_values = (\n", + " _calculate_axis_data(\n", + " yaxis,\n", + " y_values,\n", + " )\n", " )\n", "\n", " # Calculate grid data points.\n", @@ -1379,7 +1563,9 @@ " # Create irregularly spaced map of trial values\n", " # and interpolate it with Plotly's interpolation formulation.\n", " if xaxis.name != yaxis.name:\n", - " zmap = _create_zmap(transformed_x_values, transformed_y_values, z_values, xi, yi)\n", + " zmap = _create_zmap(\n", + " transformed_x_values, transformed_y_values, z_values, xi, yi\n", + " )\n", " zi = _interpolate_zmap(zmap, CONTOUR_POINT_NUM)\n", "\n", " return (\n", @@ -1396,8 +1582,9 @@ " )\n", "\n", "\n", - "def _generate_contour_subplot(info: _SubContourInfo, ax: \"Axes\", cmap: \"Colormap\") -> \"ContourSet\":\n", - "\n", + "def _generate_contour_subplot(\n", + " info: _SubContourInfo, ax: \"Axes\", cmap: \"Colormap\"\n", + ") -> \"ContourSet\":\n", " if len(info.xaxis.indices) < 2 or len(info.yaxis.indices) < 2:\n", " ax.label_outer()\n", " return ax\n", @@ -1408,9 +1595,9 @@ 
" ax.set_xlim(info.xaxis.range[0], info.xaxis.range[1])\n", " ax.set_ylim(info.yaxis.range[0], info.yaxis.range[1])\n", "\n", - " ax.tick_params(axis='both', which='major', labelsize=\"small\")\n", + " ax.tick_params(axis=\"both\", which=\"major\", labelsize=\"small\")\n", " # ax.tick_params(axis='both', which='minor', labelsize=\"x-small\")\n", - " \n", + "\n", " if info.xaxis.name == info.yaxis.name:\n", " ax.label_outer()\n", " return ax\n", @@ -1427,46 +1614,46 @@ " y_values,\n", " z_values,\n", " ) = _calculate_griddata(info.xaxis, info.yaxis, info.z_values)\n", - " \n", + "\n", " # https://stackoverflow.com/a/55929839/5755604\n", " max_value = max(z_values)\n", " order = np.argsort(z_values)\n", - "# print(order)\n", - "# print(np.take(x_values, order))\n", - " \n", - "# print(np.arrange(x_values[order]))\n", - " \n", + " # print(order)\n", + " # print(np.take(x_values, order))\n", + "\n", + " # print(np.arrange(x_values[order]))\n", + "\n", " mask = np.array([z < max_value for z in z_values])\n", " # colors = ['black' if z < max_value else 'white' for z in z_values]\n", - " # marker = [\"o\" if z != max_value else \"x\" for z in z_values]\n", - " #x_values = np.take(x_values, order)\n", - " #y_values = np.take(y_values, order)\n", + " # marker = [\"o\" if z != max_value else \"x\" for z in z_values]\n", + " # x_values = np.take(x_values, order)\n", + " # y_values = np.take(y_values, order)\n", " # colors = np.take(colors, order)\n", - " \n", + "\n", " x_values = np.array(x_values)\n", " y_values = np.array(y_values)\n", - " \n", + "\n", " cs = None\n", " if len(zi) > 0:\n", " # print(info.xaxis.name)\n", " if info.xaxis.is_log:\n", " ax.set_xscale(\"log\")\n", - " ax.tick_params(axis='x', which='major', labelsize=\"xx-small\")\n", + " ax.tick_params(axis=\"x\", which=\"major\", labelsize=\"xx-small\")\n", " if info.yaxis.is_log:\n", " ax.set_yscale(\"log\")\n", - " ax.tick_params(axis='y', which='major', labelsize=\"xx-small\")\n", + " ax.tick_params(axis=\"y\", which=\"major\", labelsize=\"xx-small\")\n", " # if info.xaxis.name in [\"lambda\", \"lr\"]:\n", " # ax.ticklabel_format(style='sci', axis='x')\n", " # if info.xaxis.name in [\"weight_decay\"]:\n", "\n", - " # ax.ticklabel_format(style='sci', axis='x')\n", - " \n", - " # print(\"yes\")\n", - " # ax.xaxis.set_major_formatter(plt.NullFormatter())\n", - " # ax.yaxis.set_major_formatter(plt.NullFormatter())\n", - " # ax.set_xticks([])\n", - " # ax.set_yticks([])\n", - " \n", + " # ax.ticklabel_format(style='sci', axis='x')\n", + "\n", + " # print(\"yes\")\n", + " # ax.xaxis.set_major_formatter(plt.NullFormatter())\n", + " # ax.yaxis.set_major_formatter(plt.NullFormatter())\n", + " # ax.set_xticks([])\n", + " # ax.set_yticks([])\n", + "\n", " if info.xaxis.name != info.yaxis.name:\n", " # Contour the gridded data.\n", " ax.contour(xi, yi, zi, 15, linewidths=0.5, colors=\"k\")\n", @@ -1482,7 +1669,7 @@ " edgecolors=\"grey\",\n", " linewidth=0.5,\n", " # zorder=order,\n", - " )\n", + " )\n", " ax.scatter(\n", " x_values[~mask],\n", " y_values[~mask],\n", @@ -1492,7 +1679,7 @@ " edgecolors=\"grey\",\n", " linewidth=0.5,\n", " zorder=100,\n", - " ) \n", + " )\n", " if info.xaxis.is_cat:\n", " ax.set_xticks(x_cat_param_pos)\n", " ax.set_xticklabels(x_cat_param_label)\n", @@ -1510,7 +1697,6 @@ " xi: np.ndarray,\n", " yi: np.ndarray,\n", ") -> Dict[Tuple[int, int], float]:\n", - "\n", " # Creates z-map from trial values and params.\n", " # z-map is represented by hashmap of coordinate and trial value pairs.\n", " #\n", @@ -1530,8 
+1716,9 @@ " return zmap\n", "\n", "\n", - "def _interpolate_zmap(zmap: Dict[Tuple[int, int], float], contour_plot_num: int) -> np.ndarray:\n", - "\n", + "def _interpolate_zmap(\n", + " zmap: Dict[Tuple[int, int], float], contour_plot_num: int\n", + ") -> np.ndarray:\n", " # Implements interpolation formulation used in Plotly\n", " # to interpolate heatmaps and contour plots\n", " # https://github.com/plotly/plotly.js/blob/95b3bd1bb19d8dc226627442f8f66bce9576def8/src/traces/heatmap/interp2d.js#L15-L20\n", @@ -1562,7 +1749,10 @@ " b[grid_index] = zmap[(x, y)]\n", " else:\n", " for dx, dy in ((-1, 0), (1, 0), (0, -1), (0, 1)):\n", - " if 0 <= x + dx < contour_plot_num and 0 <= y + dy < contour_plot_num:\n", + " if (\n", + " 0 <= x + dx < contour_plot_num\n", + " and 0 <= y + dy < contour_plot_num\n", + " ):\n", " a_data.append(1)\n", " a_row.append(grid_index)\n", " a_col.append(grid_index)\n", @@ -1570,7 +1760,9 @@ " a_row.append(grid_index)\n", " a_col.append(grid_index + dy * contour_plot_num + dx)\n", "\n", - " z = scipy.sparse.linalg.spsolve(scipy.sparse.csc_matrix((a_data, (a_row, a_col))), b)\n", + " z = scipy.sparse.linalg.spsolve(\n", + " scipy.sparse.csc_matrix((a_data, (a_row, a_col))), b\n", + " )\n", "\n", " return z.reshape((contour_plot_num, contour_plot_num))" ] @@ -1585,7 +1777,9 @@ "outputs": [], "source": [ "axes = plot_contour(study)\n", - "plt.savefig(f\"../reports/Graphs/{study_id}-hyperparam-search-space.pdf\", bbox_inches=\"tight\")" + "plt.savefig(\n", + " f\"../reports/Graphs/{study_id}-hyperparam-search-space.pdf\", bbox_inches=\"tight\"\n", + ")" ] }, { @@ -1624,18 +1818,58 @@ }, "outputs": [], "source": [ - "fig, (ax2, ax4, ax3, ax1) = plt.subplots(4, 1, figsize=(12*CM, 15*CM))\n", - "\n", - "loss_train = learning_metrics[[\"default_train_loss\", \"activation_train_loss\", \"lr_scheduler_train_loss\", \"sample_weighting_train_loss\", \"label_smoothing_train_loss\"]].dropna(how=\"any\").reset_index(drop=True).rolling(20).mean()\n", + "fig, (ax2, ax4, ax3, ax1) = plt.subplots(4, 1, figsize=(12 * CM, 15 * CM))\n", + "\n", + "loss_train = (\n", + " learning_metrics[\n", + " [\n", + " \"default_train_loss\",\n", + " \"activation_train_loss\",\n", + " \"lr_scheduler_train_loss\",\n", + " \"sample_weighting_train_loss\",\n", + " \"label_smoothing_train_loss\",\n", + " ]\n", + " ]\n", + " .dropna(how=\"any\")\n", + " .reset_index(drop=True)\n", + " .rolling(20)\n", + " .mean()\n", + ")\n", "\n", - "ax2.plot(loss_train.index,loss_train[\"default_train_loss\"], label=\"Default\", linewidth=1, zorder=100)\n", - "ax2.plot(loss_train.index,loss_train[\"activation_train_loss\"], label=\"Activation\", linewidth=1)\n", - "ax2.plot(loss_train.index,loss_train[\"label_smoothing_train_loss\"], label=\"Label Smoothing\", linewidth=1)\n", - "ax2.plot(loss_train.index,loss_train[\"lr_scheduler_train_loss\"], label=\"Lr Schedule\", linewidth=1)\n", - "ax2.plot(loss_train.index,loss_train[\"sample_weighting_train_loss\"], label=\"Sample Weighting\", linewidth=1)\n", + "ax2.plot(\n", + " loss_train.index,\n", + " loss_train[\"default_train_loss\"],\n", + " label=\"Default\",\n", + " linewidth=1,\n", + " zorder=100,\n", + ")\n", + "ax2.plot(\n", + " loss_train.index,\n", + " loss_train[\"activation_train_loss\"],\n", + " label=\"Activation\",\n", + " linewidth=1,\n", + ")\n", + "ax2.plot(\n", + " loss_train.index,\n", + " loss_train[\"label_smoothing_train_loss\"],\n", + " label=\"Label Smoothing\",\n", + " linewidth=1,\n", + ")\n", + "ax2.plot(\n", + " loss_train.index,\n", + " 
loss_train[\"lr_scheduler_train_loss\"],\n", + " label=\"Lr Schedule\",\n", + " linewidth=1,\n", + ")\n", + "ax2.plot(\n", + " loss_train.index,\n", + " loss_train[\"sample_weighting_train_loss\"],\n", + " label=\"Sample Weighting\",\n", + " linewidth=1,\n", + ")\n", "ax2.set_ylabel(\"Log Loss (Train)\")\n", "\n", - "ax2.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))\n", + "ax2.xaxis.set_major_formatter(StrMethodFormatter(\"{x:,.0f}\"))\n", "# ax2.set_xlabel(\"Iteration\")\n", "\n", "n_epochs = 10\n", @@ -1643,26 +1877,66 @@ "step_size = int(max(learning_metrics[\"default_train_step\"]) / n_epochs)\n", "\n", "for i in range(step_size, step_size * n_epochs + 1, step_size):\n", - " ax2.axvline(x=i, linestyle='--', color='grey', linewidth=0.5)\n", + " ax2.axvline(x=i, linestyle=\"--\", color=\"grey\", linewidth=0.5)\n", "\n", "ax2.set_xlim(0, step_size * n_epochs - 1)\n", "\n", "\n", "#\n", "\n", - "acc_train = learning_metrics[[\"default_train_accuracy\", \"activation_train_accuracy\", \"lr_scheduler_train_accuracy\", \"sample_weighting_train_accuracy\", \"label_smoothing_train_accuracy\"]].dropna(how=\"any\").reset_index(drop=True).rolling(20).mean()\n", - "ax4.plot(acc_train.index, acc_train[\"default_train_accuracy\"], label=\"Default\", linewidth=1, zorder=100)\n", - "ax4.plot(acc_train.index, acc_train[\"activation_train_accuracy\"], label=\"Activation\", linewidth=1)\n", - "ax4.plot(acc_train.index, acc_train[\"label_smoothing_train_accuracy\"], label=\"Label Smoothing\", linewidth=1)\n", - "ax4.plot(acc_train.index, acc_train[\"lr_scheduler_train_accuracy\"], label=\"Lr Schedule\", linewidth=1)\n", - "ax4.plot(acc_train.index, acc_train[\"sample_weighting_train_accuracy\"], label=\"Sample Weighting\", linewidth=1)\n", + "acc_train = (\n", + " learning_metrics[\n", + " [\n", + " \"default_train_accuracy\",\n", + " \"activation_train_accuracy\",\n", + " \"lr_scheduler_train_accuracy\",\n", + " \"sample_weighting_train_accuracy\",\n", + " \"label_smoothing_train_accuracy\",\n", + " ]\n", + " ]\n", + " .dropna(how=\"any\")\n", + " .reset_index(drop=True)\n", + " .rolling(20)\n", + " .mean()\n", + ")\n", + "ax4.plot(\n", + " acc_train.index,\n", + " acc_train[\"default_train_accuracy\"],\n", + " label=\"Default\",\n", + " linewidth=1,\n", + " zorder=100,\n", + ")\n", + "ax4.plot(\n", + " acc_train.index,\n", + " acc_train[\"activation_train_accuracy\"],\n", + " label=\"Activation\",\n", + " linewidth=1,\n", + ")\n", + "ax4.plot(\n", + " acc_train.index,\n", + " acc_train[\"label_smoothing_train_accuracy\"],\n", + " label=\"Label Smoothing\",\n", + " linewidth=1,\n", + ")\n", + "ax4.plot(\n", + " acc_train.index,\n", + " acc_train[\"lr_scheduler_train_accuracy\"],\n", + " label=\"Lr Schedule\",\n", + " linewidth=1,\n", + ")\n", + "ax4.plot(\n", + " acc_train.index,\n", + " acc_train[\"sample_weighting_train_accuracy\"],\n", + " label=\"Sample Weighting\",\n", + " linewidth=1,\n", + ")\n", "\n", "n_epochs = 10\n", "\n", "step_size = int(len(acc_train) / n_epochs)\n", "\n", "for i in range(step_size, step_size * n_epochs + 1, step_size):\n", - " ax4.axvline(x=i, linestyle='--', color='grey', linewidth=0.5)\n", + " ax4.axvline(x=i, linestyle=\"--\", color=\"grey\", linewidth=0.5)\n", "\n", "ax4.set_xlim(0, step_size * n_epochs - 1)\n", "ax4.set_ylim(0.67, 0.80)\n", @@ -1670,17 +1944,45 @@ "\n", "ax4.set_ylabel(\"Accuracy (Train)\")\n", "ax4.set_xlabel(\"Iteration\")\n", - "ax4.yaxis.set_major_formatter(PercentFormatter(1.0,decimals=2))\n", - 
"ax4.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))\n", - "\n", - "loss_val = learning_metrics.groupby(\"default_epoch\")[[\"default_val_loss\", \"activation_val_loss\", \"lr_scheduler_val_loss\", \"sample_weighting_val_loss\", \"label_smoothing_val_loss\"]].mean()\n", - "\n", - "\n", - "ax3.plot(loss_val.index,loss_val[\"default_val_loss\"], label=\"Default\", linewidth=1, zorder=100)\n", - "ax3.plot(loss_val.index,loss_val[\"activation_val_loss\"], label=\"Activation\", linewidth=1)\n", - "ax3.plot(loss_val.index,loss_val[\"label_smoothing_val_loss\"], label=\"Label Smoothing\", linewidth=1)\n", - "ax3.plot(loss_val.index,loss_val[\"lr_scheduler_val_loss\"], label=\"Lr Schedule\", linewidth=1)\n", - "ax3.plot(loss_val.index,loss_val[\"sample_weighting_val_loss\"], label=\"Sample Weighting\", linewidth=1)\n", + "ax4.yaxis.set_major_formatter(PercentFormatter(1.0, decimals=2))\n", + "ax4.xaxis.set_major_formatter(StrMethodFormatter(\"{x:,.0f}\"))\n", + "\n", + "loss_val = learning_metrics.groupby(\"default_epoch\")[\n", + " [\n", + " \"default_val_loss\",\n", + " \"activation_val_loss\",\n", + " \"lr_scheduler_val_loss\",\n", + " \"sample_weighting_val_loss\",\n", + " \"label_smoothing_val_loss\",\n", + " ]\n", + "].mean()\n", + "\n", + "\n", + "ax3.plot(\n", + " loss_val.index,\n", + " loss_val[\"default_val_loss\"],\n", + " label=\"Default\",\n", + " linewidth=1,\n", + " zorder=100,\n", + ")\n", + "ax3.plot(\n", + " loss_val.index, loss_val[\"activation_val_loss\"], label=\"Activation\", linewidth=1\n", + ")\n", + "ax3.plot(\n", + " loss_val.index,\n", + " loss_val[\"label_smoothing_val_loss\"],\n", + " label=\"Label Smoothing\",\n", + " linewidth=1,\n", + ")\n", + "ax3.plot(\n", + " loss_val.index, loss_val[\"lr_scheduler_val_loss\"], label=\"Lr Schedule\", linewidth=1\n", + ")\n", + "ax3.plot(\n", + " loss_val.index,\n", + " loss_val[\"sample_weighting_val_loss\"],\n", + " label=\"Sample Weighting\",\n", + " linewidth=1,\n", + ")\n", "ax3.set_ylabel(\"Log Loss (Val)\")\n", "# ax3.set_xlabel(\"Step\")\n", "n_epochs = 10\n", @@ -1688,7 +1990,7 @@ "step_size = int(len(loss_val) / n_epochs)\n", "\n", "for i in range(step_size, step_size * n_epochs + 1, step_size):\n", - " ax3.axvline(x=i, linestyle='--', color='grey', linewidth=0.5)\n", + " ax3.axvline(x=i, linestyle=\"--\", color=\"grey\", linewidth=0.5)\n", "\n", "# ax.set_ylim(0.6, 0.72)\n", "\n", @@ -1696,14 +1998,45 @@ "ax3.set_ylabel(\"Log Loss (Val)\")\n", "ax3.set_xlim(0, step_size * n_epochs - 1)\n", "\n", - "ax3.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))\n", - "\n", - "acc_val = learning_metrics.groupby(\"default_epoch\")[[\"default_val_accuracy\", \"activation_val_accuracy\", \"lr_scheduler_val_accuracy\", \"sample_weighting_val_accuracy\", \"label_smoothing_val_accuracy\"]].mean()\n", - "ax1.plot(acc_val.index,acc_val[\"default_val_accuracy\"], label=\"Default\", linewidth=1, zorder=100)\n", - "ax1.plot(acc_val.index,acc_val[\"activation_val_accuracy\"], label=\"Activation\", linewidth=1)\n", - "ax1.plot(acc_val.index,acc_val[\"label_smoothing_val_accuracy\"], label=\"Label Smoothing\", linewidth=1)\n", - "ax1.plot(acc_val.index,acc_val[\"lr_scheduler_val_accuracy\"], label=\"Lr Schedule\", linewidth=1)\n", - "ax1.plot(acc_val.index,acc_val[\"sample_weighting_val_accuracy\"], label=\"Sample Weighting\", linewidth=1)\n", + "ax3.xaxis.set_major_formatter(StrMethodFormatter(\"{x:,.0f}\"))\n", + "\n", + "acc_val = learning_metrics.groupby(\"default_epoch\")[\n", + " [\n", + " \"default_val_accuracy\",\n", 
+ " \"activation_val_accuracy\",\n", + " \"lr_scheduler_val_accuracy\",\n", + " \"sample_weighting_val_accuracy\",\n", + " \"label_smoothing_val_accuracy\",\n", + " ]\n", + "].mean()\n", + "ax1.plot(\n", + " acc_val.index,\n", + " acc_val[\"default_val_accuracy\"],\n", + " label=\"Default\",\n", + " linewidth=1,\n", + " zorder=100,\n", + ")\n", + "ax1.plot(\n", + " acc_val.index, acc_val[\"activation_val_accuracy\"], label=\"Activation\", linewidth=1\n", + ")\n", + "ax1.plot(\n", + " acc_val.index,\n", + " acc_val[\"label_smoothing_val_accuracy\"],\n", + " label=\"Label Smoothing\",\n", + " linewidth=1,\n", + ")\n", + "ax1.plot(\n", + " acc_val.index,\n", + " acc_val[\"lr_scheduler_val_accuracy\"],\n", + " label=\"Lr Schedule\",\n", + " linewidth=1,\n", + ")\n", + "ax1.plot(\n", + " acc_val.index,\n", + " acc_val[\"sample_weighting_val_accuracy\"],\n", + " label=\"Sample Weighting\",\n", + " linewidth=1,\n", + ")\n", "ax1.set_ylabel(\"Log Loss (Val)\")\n", "ax1.set_xlabel(\"Epoch\")\n", "n_epochs = 10\n", @@ -1711,22 +2044,31 @@ "step_size = int(len(loss_val) / n_epochs)\n", "\n", "for i in range(step_size, step_size * n_epochs + 1, step_size):\n", - " ax1.axvline(x=i, linestyle='--', color='grey', linewidth=0.5)\n", + " ax1.axvline(x=i, linestyle=\"--\", color=\"grey\", linewidth=0.5)\n", "\n", "ax1.set_xlim(0, step_size * n_epochs - 1)\n", "# ax.set_ylim(0.6, 0.72)\n", "\n", "ax1.set_xlabel(\"Epoch\")\n", "ax1.set_ylabel(\"Accuracy (Val)\")\n", - "ax1.yaxis.set_major_formatter(PercentFormatter(1.0,decimals=2))\n", - "ax1.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))\n", + "ax1.yaxis.set_major_formatter(PercentFormatter(1.0, decimals=2))\n", + "ax1.xaxis.set_major_formatter(StrMethodFormatter(\"{x:,.0f}\"))\n", "\n", "handles, labels = ax1.get_legend_handles_labels()\n", - "fig.legend(handles, labels, loc='lower center', frameon=False, ncol=3, bbox_to_anchor = (0, -0.07, 1, 1))\n", + "fig.legend(\n", + " handles,\n", + " labels,\n", + " loc=\"lower center\",\n", + " frameon=False,\n", + " ncol=3,\n", + " bbox_to_anchor=(0, -0.07, 1, 1),\n", + ")\n", "\n", "plt.tight_layout()\n", "\n", - "plt.savefig(f\"../reports/Graphs/fttransformer-optimisations-loss-acc.pdf\", bbox_inches=\"tight\")" + "plt.savefig(\n", + " \"../reports/Graphs/fttransformer-optimisations-loss-acc.pdf\", bbox_inches=\"tight\"\n", + ")" ] }, { @@ -1757,17 +2099,19 @@ "outputs": [], "source": [ "# visualize learning curves\n", - "with open(Path(model_dir,model_name[:-4]+\"_training.json\"), 'r') as j:\n", - " contents = json.loads(j.read())\n", - " \n", + "with open(Path(model_dir, model_name[:-4] + \"_training.json\")) as j:\n", + " contents = json.loads(j.read())\n", + "\n", "# extract relevant keys\n", "iterations = contents.get(\"iterations\")\n", - "test_metrics = [d['name'] for d in contents['meta']['test_metrics'] ]\n", - "test_results = [d['test'] for d in iterations]\n", - "learn_metrics = [d['name'] for d in contents['meta']['learn_metrics'] ]\n", - "learn_results = [d['learn'] for d in iterations]\n", + "test_metrics = [d[\"name\"] for d in contents[\"meta\"][\"test_metrics\"]]\n", + "test_results = [d[\"test\"] for d in iterations]\n", + "learn_metrics = [d[\"name\"] for d in contents[\"meta\"][\"learn_metrics\"]]\n", + "learn_results = [d[\"learn\"] for d in iterations]\n", "\n", - "metrics_learn = pd.DataFrame(learn_results, columns=learn_metrics).add_suffix(\" (train)\")\n", + "metrics_learn = pd.DataFrame(learn_results, columns=learn_metrics).add_suffix(\n", + " \" (train)\"\n", + ")\n", 
"metrics_test = pd.DataFrame(test_results, columns=test_metrics).add_suffix(\" (val)\")\n", "\n", "learning_metrics = pd.concat([metrics_learn, metrics_test], axis=1)" @@ -1828,23 +2172,50 @@ }, "outputs": [], "source": [ - "fig, (ax2, ax1) = plt.subplots(2, 1, figsize=(12*CM,7.5*CM), sharex=True)\n", + "fig, (ax2, ax1) = plt.subplots(2, 1, figsize=(12 * CM, 7.5 * CM), sharex=True)\n", "\n", "# plot accuracy\n", - "ax1.plot(learning_metrics.index,learning_metrics[\"default_learn_acc\"], label=\"Train\", linewidth=1)\n", - "ax1.plot(learning_metrics.index,learning_metrics[\"default_val_acc\"], label=\"Val\",linewidth=1)\n", - "ax1.yaxis.set_major_formatter(PercentFormatter(1.0,decimals=2))\n", + "ax1.plot(\n", + " learning_metrics.index,\n", + " learning_metrics[\"default_learn_acc\"],\n", + " label=\"Train\",\n", + " linewidth=1,\n", + ")\n", + "ax1.plot(\n", + " learning_metrics.index,\n", + " learning_metrics[\"default_val_acc\"],\n", + " label=\"Val\",\n", + " linewidth=1,\n", + ")\n", + "ax1.yaxis.set_major_formatter(PercentFormatter(1.0, decimals=2))\n", "ax1.set_ylabel(\"Accuracy\")\n", - "ax1.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))\n", + "ax1.xaxis.set_major_formatter(StrMethodFormatter(\"{x:,.0f}\"))\n", "ax1.set_xlabel(\"Iterations\")\n", "\n", "# plot log loss\n", - "ax2.plot(learning_metrics.index,learning_metrics[\"default_learn_log\"], label=\"Train\",linewidth=1) \n", - "ax2.plot(learning_metrics.index,learning_metrics[\"default_val_log\"], label=\"Val\",linewidth=1) \n", + "ax2.plot(\n", + " learning_metrics.index,\n", + " learning_metrics[\"default_learn_log\"],\n", + " label=\"Train\",\n", + " linewidth=1,\n", + ")\n", + "ax2.plot(\n", + " learning_metrics.index,\n", + " learning_metrics[\"default_val_log\"],\n", + " label=\"Val\",\n", + " linewidth=1,\n", + ")\n", "ax2.set_ylabel(\"Log Loss\")\n", "\n", "handles, labels = ax1.get_legend_handles_labels()\n", - "fig.legend(handles, labels, loc='lower center', frameon=False, ncol=4, bbox_to_anchor = (0, -0.03, 1, 1))\n", + "fig.legend(\n", + " handles,\n", + " labels,\n", + " loc=\"lower center\",\n", + " frameon=False,\n", + " ncol=4,\n", + " bbox_to_anchor=(0, -0.03, 1, 1),\n", + ")\n", "\n", "plt.tight_layout()\n", "\n", @@ -1864,38 +2235,87 @@ }, "outputs": [], "source": [ - "fig, (ax2, ax1) = plt.subplots(2, 1, figsize=(12*CM,7.5*CM), sharex=True)\n", + "fig, (ax2, ax1) = plt.subplots(2, 1, figsize=(12 * CM, 7.5 * CM), sharex=True)\n", "\n", "# [\"default\", \"depth\", \"early_stopping\", \"border_count\", \"grow_policy\", \"exp_weighting\"]\n", "\n", "# plot accuracy\n", - "ax1.plot(learning_metrics.index,learning_metrics[\"default_val_acc\"], label=\"Default\", linewidth=1, zorder=100)\n", - "ax1.plot(learning_metrics.index,learning_metrics[\"early_stopping_val_acc\"], label=\"Early Stopping\", linewidth=1)\n", - "ax1.plot(learning_metrics.index,learning_metrics[\"grow_policy_val_acc\"], label=\"Grow Policy\", linewidth=1)\n", - "ax1.plot(learning_metrics.index,learning_metrics[\"exp_weighting_val_acc\"], label=\"Sample Weighting\", linewidth=1)\n", + "ax1.plot(\n", + " learning_metrics.index,\n", + " learning_metrics[\"default_val_acc\"],\n", + " label=\"Default\",\n", + " linewidth=1,\n", + " zorder=100,\n", + ")\n", + "ax1.plot(\n", + " learning_metrics.index,\n", + " learning_metrics[\"early_stopping_val_acc\"],\n", + " label=\"Early Stopping\",\n", + " linewidth=1,\n", + ")\n", + "ax1.plot(\n", + " learning_metrics.index,\n", + " learning_metrics[\"grow_policy_val_acc\"],\n", + " label=\"Grow 
Policy\",\n", + " linewidth=1,\n", + ")\n", + "ax1.plot(\n", + " learning_metrics.index,\n", + " learning_metrics[\"exp_weighting_val_acc\"],\n", + " label=\"Sample Weighting\",\n", + " linewidth=1,\n", + ")\n", "\n", - "ax1.yaxis.set_major_formatter(PercentFormatter(1.0,decimals=2))\n", + "ax1.yaxis.set_major_formatter(PercentFormatter(1.0, decimals=2))\n", "ax1.set_ylabel(\"Accuracy (Val)\")\n", - "ax1.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))\n", - "#ax1.set_ylim(0.57, 0.64)\n", + "ax1.xaxis.set_major_formatter(StrMethodFormatter(\"{x:,.0f}\"))\n", + "# ax1.set_ylim(0.57, 0.64)\n", "ax1.set_xlabel(\"Iterations\")\n", "\n", "# plot log loss\n", - "ax2.plot(learning_metrics.index,learning_metrics[\"default_val_log\"], label=\"Default\", linewidth=1, zorder=100)\n", - "ax2.plot(learning_metrics.index,learning_metrics[\"early_stopping_val_log\"], label=\"Early Stopping\", linewidth=1)\n", - "ax2.plot(learning_metrics.index,learning_metrics[\"grow_policy_val_log\"], label=\"Grow Policy\", linewidth=1)\n", - "ax2.plot(learning_metrics.index,learning_metrics[\"exp_weighting_val_log\"], label=\"Sample Weighting\", linewidth=1)\n", + "ax2.plot(\n", + " learning_metrics.index,\n", + " learning_metrics[\"default_val_log\"],\n", + " label=\"Default\",\n", + " linewidth=1,\n", + " zorder=100,\n", + ")\n", + "ax2.plot(\n", + " learning_metrics.index,\n", + " learning_metrics[\"early_stopping_val_log\"],\n", + " label=\"Early Stopping\",\n", + " linewidth=1,\n", + ")\n", + "ax2.plot(\n", + " learning_metrics.index,\n", + " learning_metrics[\"grow_policy_val_log\"],\n", + " label=\"Grow Policy\",\n", + " linewidth=1,\n", + ")\n", + "ax2.plot(\n", + " learning_metrics.index,\n", + " learning_metrics[\"exp_weighting_val_log\"],\n", + " label=\"Sample Weighting\",\n", + " linewidth=1,\n", + ")\n", "ax2.set_ylabel(\"Log Loss (Val)\")\n", "# ax2.set_ylim(0.55, 0.7)\n", "\n", "plt.tight_layout()\n", "\n", "handles, labels = ax2.get_legend_handles_labels()\n", - "fig.legend(handles, labels, loc='lower center', frameon=False, ncol=4, bbox_to_anchor = (0, -0.03, 1, 1))\n", + "fig.legend(\n", + " handles,\n", + " labels,\n", + " loc=\"lower center\",\n", + " frameon=False,\n", + " ncol=4,\n", + " bbox_to_anchor=(0, -0.03, 1, 1),\n", + ")\n", "\n", "plt.tight_layout()\n", "\n", - "plt.savefig(f\"../reports/Graphs/gbm-optimisations-loss-acc.pdf\", bbox_inches=\"tight\")" + "plt.savefig(\"../reports/Graphs/gbm-optimisations-loss-acc.pdf\", bbox_inches=\"tight\")" ] }, { @@ -1916,8 +2336,7 @@ "outputs": [], "source": [ "import numpy as np\n", - "from torch import optim\n", - "from torch import nn\n", + "from torch import nn, optim\n", "\n", "\n", "class CosineWarmupScheduler(optim.lr_scheduler._LRScheduler):\n", @@ -1934,7 +2353,7 @@ " lr_factor = 0.5 * (1 + np.cos(np.pi * epoch / self.max_num_iters))\n", " if epoch <= self.warmup:\n", " lr_factor *= epoch * 1.0 / self.warmup\n", - " return lr_factor\n" + " return lr_factor" ] }, { @@ -1984,7 +2403,7 @@ "source": [ "lr = 1e-3\n", "\n", - "factor = [scheduler.get_lr_factor(i) * lr for i in range(0, max_iters)]\n", + "factor = [scheduler.get_lr_factor(i) * lr for i in range(max_iters)]\n", "\n", "fig = plt.figure(figsize=(12 * CM, 3.5 * CM))\n", "\n", @@ -1992,7 +2411,7 @@ "plt.xlabel(\"Iteration\")\n", "plt.ylabel(\"Learning Rate\")\n", "\n", - "plt.savefig(f\"lr-lin-warmup-cosine-decay.pdf\", bbox_inches=\"tight\")" + "plt.savefig(\"lr-lin-warmup-cosine-decay.pdf\", bbox_inches=\"tight\")" ] }, { @@ -2015,12 +2434,21 @@ "# TODO: replace with 
versioned results\n", "sample_size = 256\n", "\n", - "fi_classical = pd.read_parquet(f\"gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_test_classical_feature_importance_{sample_size}.parquet\")\n", - "fi_gbm = pd.read_parquet(f\"gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_test_gbm_feature_importance_{sample_size}.parquet\")\n", - "fi_transformer = pd.read_parquet(f\"gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_test_fttransformer_feature_importance_{sample_size}.parquet\")\n", + "fi_classical = pd.read_parquet(\n", + " f\"gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_test_classical_feature_importance_{sample_size}.parquet\"\n", + ")\n", + "fi_gbm = pd.read_parquet(\n", + " f\"gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_test_gbm_feature_importance_{sample_size}.parquet\"\n", + ")\n", + "fi_transformer = pd.read_parquet(\n", + " f\"gs://thesis-bucket-option-trade-classification/data/results/ise_supervised_test_fttransformer_feature_importance_{sample_size}.parquet\"\n", + ")\n", "\n", "# set features to nan that are not part of dataset\n", - "fi_classical.loc[[\"size_ex (grouped)\", \"TRADE_SIZE\"],[\"quote(best)->quote(ex) values\",\"quote(best)->quote(ex) std\"]] = np.NaN" + "fi_classical.loc[\n", + " [\"size_ex (grouped)\", \"TRADE_SIZE\"],\n", + " [\"quote(best)->quote(ex) values\", \"quote(best)->quote(ex) std\"],\n", + "] = np.nan" ] }, { @@ -2053,39 +2481,118 @@ }, "outputs": [], "source": [ - "fig, axes = plt.subplots(1, 3, figsize=(18*CM, 12*CM), sharex=False, sharey=True)\n", + "fig, axes = plt.subplots(1, 3, figsize=(18 * CM, 12 * CM), sharex=False, sharey=True)\n", "\n", "# adapted from here: https://stackoverflow.com/a/15214551/5755604\n", "ind = np.arange(len(fi))\n", "width = 0.25\n", "\n", - "semi = \"\"#\"semi-\"\n", + "semi = \"\" # \"semi-\"\n", "\n", - "axes[0].barh(ind, fi[\"quote(best)->quote(ex)->rev_tick(all) values\"], width, xerr=fi[\"quote(best)->quote(ex)->rev_tick(all) std\"], label=f\"GSU\")\n", - "axes[0].barh(ind+width, fi[f\"gbm({semi}classical) values\"], width, xerr=fi[f\"gbm({semi}classical) std\"], label=f\"{semi}GBRT\")\n", - "axes[0].barh(ind+width + width, fi[f\"fttransformer({semi}classical) values\"], width, xerr=fi[f\"fttransformer({semi}classical) std\"], label=f\"{semi}Transformer\")\n", + "axes[0].barh(\n", + " ind,\n", + " fi[\"quote(best)->quote(ex)->rev_tick(all) values\"],\n", + " width,\n", + " xerr=fi[\"quote(best)->quote(ex)->rev_tick(all) std\"],\n", + " label=\"GSU\",\n", + ")\n", + "axes[0].barh(\n", + " ind + width,\n", + " fi[f\"gbm({semi}classical) values\"],\n", + " width,\n", + " xerr=fi[f\"gbm({semi}classical) std\"],\n", + " label=f\"{semi}GBRT\",\n", + ")\n", + "axes[0].barh(\n", + " ind + width + width,\n", + " fi[f\"fttransformer({semi}classical) values\"],\n", + " width,\n", + " xerr=fi[f\"fttransformer({semi}classical) std\"],\n", + " label=f\"{semi}Transformer\",\n", + ")\n", "# axes[0].barh(ind+width + width + width, fi[\"fttransformer(semi-classical) values\"], width, xerr=fi[\"fttransformer(semi-classical) std\"], label=\"Transformer (Pre-Train)\")\n", - "axes[0].axvline(0, color='black', linestyle='--', linewidth=0.5)\n", + "axes[0].axvline(0, color=\"black\", linestyle=\"--\", linewidth=0.5)\n", "axes[0].set_xlim([-0.15, 0.15])\n", "\n", - "axes[1].barh(ind, fi[\"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all) values\"], width, 
xerr=fi[\"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all) std\"], label=\"GSU\")\n", - "axes[1].barh(ind+width, fi[f\"gbm({semi}classical-size) values\"], width, xerr=fi[f\"gbm({semi}classical-size) std\"], label=f\"{semi}GBRT\")\n", - "axes[1].barh(ind+width + width, fi[f\"fttransformer({semi}classical-size) values\"], width, xerr=fi[f\"fttransformer({semi}classical-size) std\"], label=f\"{semi}Transformer\")\n", + "axes[1].barh(\n", + " ind,\n", + " fi[\n", + " \"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all) values\"\n", + " ],\n", + " width,\n", + " xerr=fi[\n", + " \"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all) std\"\n", + " ],\n", + " label=\"GSU\",\n", + ")\n", + "axes[1].barh(\n", + " ind + width,\n", + " fi[f\"gbm({semi}classical-size) values\"],\n", + " width,\n", + " xerr=fi[f\"gbm({semi}classical-size) std\"],\n", + " label=f\"{semi}GBRT\",\n", + ")\n", + "axes[1].barh(\n", + " ind + width + width,\n", + " fi[f\"fttransformer({semi}classical-size) values\"],\n", + " width,\n", + " xerr=fi[f\"fttransformer({semi}classical-size) std\"],\n", + " label=f\"{semi}Transformer\",\n", + ")\n", "# axes[1].barh(ind+width + width + width, fi[\"fttransformer(semi-classical-size) values\"], width, xerr=fi[\"fttransformer(semi-classical-size) std\"], label=\"Transformer (Pre-Train)\")\n", - "axes[1].axvline(0, color='black', linestyle='--', linewidth=0.5)\n", + "axes[1].axvline(0, color=\"black\", linestyle=\"--\", linewidth=0.5)\n", "axes[1].set_xlim([-0.15, 0.15])\n", "\n", - "axes[2].barh(ind, fi[\"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all) values\"], width, xerr=fi[\"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all) std\"], label=\"GSU\")\n", - "axes[2].barh(ind+width, fi[f\"gbm({semi}ml) values\"], width, xerr=fi[f\"gbm({semi}ml) std\"], label=f\"{semi}GBRT\")\n", - "axes[2].barh(ind+width + width, fi[f\"fttransformer({semi}ml) values\"], width, xerr=fi[f\"fttransformer({semi}ml) std\"], label=f\"{semi}Transformer\")\n", + "axes[2].barh(\n", + " ind,\n", + " fi[\n", + " \"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all) values\"\n", + " ],\n", + " width,\n", + " xerr=fi[\n", + " \"trade_size(ex)->quote(best)->quote(ex)->depth(best)->depth(ex)->rev_tick(all) std\"\n", + " ],\n", + " label=\"GSU\",\n", + ")\n", + "axes[2].barh(\n", + " ind + width,\n", + " fi[f\"gbm({semi}ml) values\"],\n", + " width,\n", + " xerr=fi[f\"gbm({semi}ml) std\"],\n", + " label=f\"{semi}GBRT\",\n", + ")\n", + "axes[2].barh(\n", + " ind + width + width,\n", + " fi[f\"fttransformer({semi}ml) values\"],\n", + " width,\n", + " xerr=fi[f\"fttransformer({semi}ml) std\"],\n", + " label=f\"{semi}Transformer\",\n", + ")\n", "# axes[2].barh(ind+width + width + width, fi[\"fttransformer(semi-ml) values\"], width, xerr=fi[\"fttransformer(semi-ml) std\"], label=\"Transformer (Pre-Train)\")\n", - "axes[2].axvline(0, color='black', linestyle='--', linewidth=0.5)\n", + "axes[2].axvline(0, color=\"black\", linestyle=\"--\", linewidth=0.5)\n", "axes[2].set_xlim([-0.15, 0.15])\n", "\n", "\n", "# set y-labels\n", - "labels = ['Price Lead All (Group)', 'Price Lag All (Group)', 'Price Lead Ex (Group)', 'Price Lag Ex (Group)', 'Quotes NBBO (Group)', 'Quotes Ex (Group)', 'Trade Price', \"Quotes Size (Group)\", 'Trade Size', 'Strike Price', 'Time To Maturity', 'Option Type', 'Root', 'Moneyness', \"Day Volume\", 'Issue Type']\n", - "axes[0].set(yticks=ind 
+ width, yticklabels=labels, ylim=[2*width - 1, len(fi)])\n", + "labels = [\n", + " \"Price Lead All (Group)\",\n", + " \"Price Lag All (Group)\",\n", + " \"Price Lead Ex (Group)\",\n", + " \"Price Lag Ex (Group)\",\n", + " \"Quotes NBBO (Group)\",\n", + " \"Quotes Ex (Group)\",\n", + " \"Trade Price\",\n", + " \"Quotes Size (Group)\",\n", + " \"Trade Size\",\n", + " \"Strike Price\",\n", + " \"Time To Maturity\",\n", + " \"Option Type\",\n", + " \"Root\",\n", + " \"Moneyness\",\n", + " \"Day Volume\",\n", + " \"Issue Type\",\n", + "]\n", + "axes[0].set(yticks=ind + width, yticklabels=labels, ylim=[2 * width - 1, len(fi)])\n", "\n", "# set x-labels\n", "axes[0].set_xlabel(r\"SAGE Value\")\n", @@ -2098,8 +2605,15 @@ "axes[2].set_title(\"FS Option\")\n", "\n", "handles, labels = axes[0].get_legend_handles_labels()\n", - "labels = [l.replace(\"semi-\",\"(Semi) \") for l in labels]\n", - "fig.legend(handles, labels, loc = \"lower center\", frameon=False, bbox_to_anchor=(0.5, -0.05), ncols=3)\n", + "labels = [l.replace(\"semi-\", \"(Semi) \") for l in labels]\n", + "fig.legend(\n", + " handles,\n", + " labels,\n", + " loc=\"lower center\",\n", + " frameon=False,\n", + " bbox_to_anchor=(0.5, -0.05),\n", + " ncols=3,\n", + ")\n", "\n", "plt.tight_layout()\n", "\n", diff --git a/notebooks/6.0f-mb-viz-gradient-boosting.ipynb b/notebooks/6.0f-mb-viz-gradient-boosting.ipynb index 6c7e5414..0bfc039c 100644 --- a/notebooks/6.0f-mb-viz-gradient-boosting.ipynb +++ b/notebooks/6.0f-mb-viz-gradient-boosting.ipynb @@ -14,7 +14,6 @@ "import sys\n", "from pathlib import Path\n", "\n", - "\n", "import numpy as np\n", "import pandas as pd\n", "import wandb\n", @@ -25,7 +24,7 @@ "from otc.features.build_features import (\n", " features_categorical,\n", " features_classical,\n", - ")\n" + ")" ] }, { @@ -52,7 +51,7 @@ "source": [ "# key used for files and artefacts\n", "key = f\"{EXCHANGE}_gbm_{STRATEGY}_{SUBSET}_viz\"\n", - "dataset = f\"fbv/thesis/{EXCHANGE}_{STRATEGY}_log_standardized_clipped:latest\"\n" + "dataset = f\"fbv/thesis/{EXCHANGE}_{STRATEGY}_log_standardized_clipped:latest\"" ] }, { @@ -65,7 +64,7 @@ "outputs": [], "source": [ "# set project name. 
Required to access files and artefacts\n", - "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n" + "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"" ] }, { @@ -86,7 +85,7 @@ "run = wandb.init(project=\"thesis\", entity=\"fbv\")\n", "\n", "artifact = run.use_artifact(dataset)\n", - "data_dir = artifact.download()\n" + "data_dir = artifact.download()" ] }, { @@ -98,28 +97,29 @@ }, "outputs": [], "source": [ - "\n", "train = pd.read_parquet(Path(data_dir, \"train_set.parquet\"), engine=\"fastparquet\")\n", "y_train = train[\"buy_sell\"]\n", "X_train = train.drop(columns=\"buy_sell\")\n", "timestamp_train = np.linspace(0, 1, len(y_train))\n", "weights_exp_train = np.geomspace(0.001, 1, num=len(y_train))\n", "\n", - "cat_features_sub = [tup[0] for tup in features_categorical if tup[0] in features_classical]\n", + "cat_features_sub = [\n", + " tup[0] for tup in features_categorical if tup[0] in features_classical\n", + "]\n", "\n", "train_pool_uni = Pool(\n", - " data=X_train.loc[:, features_classical],\n", - " label=y_train,\n", - " cat_features=cat_features_sub,\n", - " timestamp=timestamp_train,\n", + " data=X_train.loc[:, features_classical],\n", + " label=y_train,\n", + " cat_features=cat_features_sub,\n", + " timestamp=timestamp_train,\n", ")\n", "\n", "train_pool_exp = Pool(\n", - " data=X_train.loc[:, features_classical],\n", - " label=y_train,\n", - " cat_features=cat_features_sub,\n", - " timestamp=timestamp_train,\n", - " weight=weights_exp_train,\n", + " data=X_train.loc[:, features_classical],\n", + " label=y_train,\n", + " cat_features=cat_features_sub,\n", + " timestamp=timestamp_train,\n", + " weight=weights_exp_train,\n", ")\n", "\n", "val = pd.read_parquet(Path(data_dir, \"val_set.parquet\"), engine=\"fastparquet\")\n", @@ -128,11 +128,11 @@ "timestamp_val = np.linspace(0, 1, len(y_val))\n", "\n", "val_pool_uni = Pool(\n", - " data=X_val.loc[:, features_classical],\n", - " label=y_val,\n", - " cat_features=cat_features_sub,\n", - " timestamp=timestamp_val,\n", - ") " + " data=X_val.loc[:, features_classical],\n", + " label=y_val,\n", + " cat_features=cat_features_sub,\n", + " timestamp=timestamp_val,\n", + ")" ] }, { @@ -154,23 +154,30 @@ " \"logging_level\": \"Silent\",\n", " \"task_type\": \"GPU\",\n", " \"random_seed\": 42,\n", - " \"eval_metric\": \"Accuracy\"\n", + " \"eval_metric\": \"Accuracy\",\n", "}\n", "\n", - "settings = [{\"iterations\": 5}, {\"iterations\": 100}, {\"iterations\": 1000}, {\"iterations\": 2000}]\n", - "[setting.update(kwargs_shared) for setting in settings] \n", - " \n", + "settings = [\n", + " {\"iterations\": 5},\n", + " {\"iterations\": 100},\n", + " {\"iterations\": 1000},\n", + " {\"iterations\": 2000},\n", + "]\n", + "[setting.update(kwargs_shared) for setting in settings]\n", + "\n", "results = []\n", "\n", "for setting in tqdm(settings):\n", " clf = CatBoostClassifier(**setting)\n", " clf.fit(train_pool_uni, eval_set=val_pool_uni)\n", - " \n", + "\n", " proba_predictions = clf.predict_proba(val_pool_uni)\n", " positive_class_prob = proba_predictions[:, 1]\n", " y_val_mapped = (y_val + 1) // 2\n", - " \n", - " result = -np.log(positive_class_prob) * y_val_mapped - np.log(1 - positive_class_prob) * (1 - y_val_mapped)\n", + "\n", + " result = -np.log(positive_class_prob) * y_val_mapped - np.log(\n", + " 1 - positive_class_prob\n", + " ) * (1 - y_val_mapped)\n", " results.append(result)" ] }, @@ -182,13 +189,11 @@ }, "outputs": [], "source": [ - "dfs = pd.concat(results, axis=1, keys = [\"iter_5\", \"iter_100\", 
\"iter_1000\", \"iter_2000\"])\n", + "dfs = pd.concat(results, axis=1, keys=[\"iter_5\", \"iter_100\", \"iter_1000\", \"iter_2000\"])\n", "key = f\"{EXCHANGE}_gbm_{STRATEGY}_{SUBSET}_viz_dist_loss\"\n", "\n", - "output_path = (\n", - " f\"gs://thesis-bucket-option-trade-classification/data/results/{key}-viz-dist-loss.parquet\"\n", - ")\n", - "dfs.columns = ['_'.join(col).rstrip('_') for col in dfs.columns.values]\n", + "output_path = f\"gs://thesis-bucket-option-trade-classification/data/results/{key}-viz-dist-loss.parquet\"\n", + "dfs.columns = [\"_\".join(col).rstrip(\"_\") for col in dfs.columns.values]\n", "dfs.to_parquet(output_path)\n", "\n", "# Log the artifact to save it as an output of this run\n", @@ -196,7 +201,7 @@ "result_set.add_reference(output_path, name=\"results\")\n", "run.log_artifact(result_set)\n", "\n", - "wandb.finish()\n" + "wandb.finish()" ] }, { @@ -240,11 +245,32 @@ "\n", "\n", "# complete config\n", - "settings = [{}, kwargs_depth, kwargs_earl_stopping, kwargs_border_count, kwargs_growth_strategy, {}]\n", - "[setting.update(kwargs_shared) for setting in settings] \n", + "settings = [\n", + " {},\n", + " kwargs_depth,\n", + " kwargs_earl_stopping,\n", + " kwargs_border_count,\n", + " kwargs_growth_strategy,\n", + " {},\n", + "]\n", + "[setting.update(kwargs_shared) for setting in settings]\n", "# set pools\n", - "pools = [train_pool_uni, train_pool_uni, train_pool_uni, train_pool_uni, train_pool_uni, train_pool_exp]\n", - "identifier = [\"default\", \"depth\", \"early_stopping\", \"border_count\", \"grow_policy\", \"exp_weighting\"]" + "pools = [\n", + " train_pool_uni,\n", + " train_pool_uni,\n", + " train_pool_uni,\n", + " train_pool_uni,\n", + " train_pool_uni,\n", + " train_pool_exp,\n", + "]\n", + "identifier = [\n", + " \"default\",\n", + " \"depth\",\n", + " \"early_stopping\",\n", + " \"border_count\",\n", + " \"grow_policy\",\n", + " \"exp_weighting\",\n", + "]" ] }, { @@ -281,14 +307,21 @@ "\n", "for result in results:\n", " key = list(result.keys())[0]\n", - " \n", + "\n", " learn_acc = result[key][\"learn\"][\"Accuracy\"]\n", " learn_log = result[key][\"learn\"][\"Logloss\"]\n", " val_acc = result[key][\"validation\"][\"Accuracy\"]\n", " val_log = result[key][\"validation\"][\"Logloss\"]\n", - " \n", - " df = pd.DataFrame({\"learn_acc\" :learn_acc, \"learn_log\":learn_log, \"val_acc\": val_acc, \"val_log\": val_log})\n", - " df.name=key\n", + "\n", + " df = pd.DataFrame(\n", + " {\n", + " \"learn_acc\": learn_acc,\n", + " \"learn_log\": learn_log,\n", + " \"val_acc\": val_acc,\n", + " \"val_log\": val_log,\n", + " }\n", + " )\n", + " df.name = key\n", " dfs.append(df)" ] }, @@ -300,12 +333,10 @@ }, "outputs": [], "source": [ - "dfs = pd.concat(dfs, axis=1, keys = identifier)\n", + "dfs = pd.concat(dfs, axis=1, keys=identifier)\n", "\n", - "output_path = (\n", - " f\"gs://thesis-bucket-option-trade-classification/data/results/{key}-viz-losses.parquet\"\n", - ")\n", - "dfs.columns = ['_'.join(col).rstrip('_') for col in dfs.columns.values]\n", + "output_path = f\"gs://thesis-bucket-option-trade-classification/data/results/{key}-viz-losses.parquet\"\n", + "dfs.columns = [\"_\".join(col).rstrip(\"_\") for col in dfs.columns.values]\n", "dfs.to_parquet(output_path)\n", "\n", "# Log the artifact to save it as an output of this run\n", @@ -313,7 +344,7 @@ "result_set.add_reference(output_path, name=\"results\")\n", "run.log_artifact(result_set)\n", "\n", - "wandb.finish()\n" + "wandb.finish()" ] } ], diff --git a/notebooks/6.0g-mb-viz-fttransformer.ipynb 
b/notebooks/6.0g-mb-viz-fttransformer.ipynb index 36a99e9a..2cca37bc 100644 --- a/notebooks/6.0g-mb-viz-fttransformer.ipynb +++ b/notebooks/6.0g-mb-viz-fttransformer.ipynb @@ -16,22 +16,20 @@ "\n", "import numpy as np\n", "import pandas as pd\n", - "\n", - "import wandb\n", "import torch\n", - "from torch import optim, nn\n", + "import wandb\n", + "from torch import nn, optim\n", "from tqdm.auto import tqdm\n", "\n", "sys.path.append(\"..\")\n", + "from otc.data.dataloader import TabDataLoader\n", + "from otc.data.dataset import TabDataset\n", "from otc.features.build_features import (\n", " features_classical,\n", ")\n", + "from otc.models.activation import GeGLU, ReGLU\n", "from otc.models.fttransformer import FeatureTokenizer, FTTransformer, Transformer\n", - "from otc.models.activation import ReGLU, GeGLU\n", - "from otc.data.dataset import TabDataset\n", - "from otc.data.dataloader import TabDataLoader\n", - "from otc.features.build_features import features_classical\n", - "from otc.optim.scheduler import CosineWarmupScheduler\n" + "from otc.optim.scheduler import CosineWarmupScheduler" ] }, { @@ -43,7 +41,7 @@ "# set globally here\n", "EXCHANGE = \"ise\" # \"cboe\"\n", "STRATEGY = \"supervised\" # \"transfer\"\n", - "SUBSET = \"test\" # \"all\"\n" + "SUBSET = \"test\" # \"all\"" ] }, { @@ -54,7 +52,7 @@ "source": [ "# key used for files and artefacts\n", "key = f\"{EXCHANGE}_fttransformer_{STRATEGY}_{SUBSET}_viz\"\n", - "dataset = f\"fbv/thesis/{EXCHANGE}_{STRATEGY}_log_standardized_clipped:latest\"\n" + "dataset = f\"fbv/thesis/{EXCHANGE}_{STRATEGY}_log_standardized_clipped:latest\"" ] }, { @@ -66,7 +64,7 @@ "outputs": [], "source": [ "# set project name. Required to access files and artefacts\n", - "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n" + "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"" ] }, { @@ -91,7 +89,7 @@ "val = pd.read_parquet(Path(data_dir, \"val_set.parquet\"), engine=\"fastparquet\")\n", "y_val = val[\"buy_sell\"]\n", "X_val = val.drop(columns=\"buy_sell\")\n", - "X_val = X_val.loc[:, features_classical]\n" + "X_val = X_val.loc[:, features_classical]" ] }, { @@ -141,7 +139,7 @@ " \"sample_weighting\",\n", " \"label_smoothing\",\n", " \"lr_scheduler\",\n", - "]\n" + "]" ] }, { @@ -180,7 +178,6 @@ "results = []\n", "\n", "for i, setting in enumerate(tqdm(settings)):\n", - "\n", " result = []\n", "\n", " transformer_kwargs = {\n", @@ -238,7 +235,7 @@ " training_data.x_cont,\n", " training_data.weight,\n", " training_data.y,\n", - " **dl_params\n", + " **dl_params,\n", " )\n", "\n", " val_loader = TabDataLoader(\n", @@ -314,7 +311,6 @@ "results = []\n", "\n", "for i, setting in enumerate(tqdm(settings)):\n", - "\n", " result = []\n", "\n", " transformer_kwargs = {\n", @@ -372,7 +368,7 @@ " training_data.x_cont,\n", " training_data.weight,\n", " training_data.y,\n", - " **dl_params\n", + " **dl_params,\n", " )\n", "\n", " val_loader = TabDataLoader(\n", @@ -421,13 +417,11 @@ " val_step = 0\n", "\n", " for epoch in range(epochs):\n", - "\n", " train_batch = 0\n", "\n", " results_epoch = []\n", "\n", " for x_cat, x_cont, weights, targets in train_loader:\n", - "\n", " clf.train()\n", " optimizer.zero_grad()\n", " with torch.autocast(device_type=\"cuda\", dtype=torch.float16):\n", @@ -471,7 +465,6 @@ "\n", " with torch.no_grad():\n", " for x_cat, x_cont, weights, targets in val_loader:\n", - "\n", " # for my implementation\n", " logits = clf(x_cat, x_cont).flatten()\n", " logits = logits.flatten()\n", @@ -507,7 +500,7 @@ " gc.collect()\n", " 
torch.cuda.empty_cache()\n", "\n", - " results.append({identifier[i]: result})\n" + " results.append({identifier[i]: result})" ] }, { @@ -522,7 +515,7 @@ " key = list(result.keys())[0]\n", " df = pd.DataFrame(result[key])\n", " df.name = key\n", - " dfs.append(df)\n" + " dfs.append(df)" ] }, { @@ -545,7 +538,7 @@ "result_set.add_reference(output_path, name=\"results\")\n", "run.log_artifact(result_set)\n", "\n", - "wandb.finish()\n" + "wandb.finish()" ] }, { @@ -555,7 +548,7 @@ "outputs": [], "source": [ "filter_col = [col for col in dfs if col.endswith(\"val_loss\")]\n", - "dfs[filter_col].dropna().reset_index(drop=True).plot()\n" + "dfs[filter_col].dropna().reset_index(drop=True).plot()" ] } ], diff --git a/notebooks/6.0h-mb-viz-embeddings.ipynb b/notebooks/6.0h-mb-viz-embeddings.ipynb index f6028b3a..c9bb7cc8 100644 --- a/notebooks/6.0h-mb-viz-embeddings.ipynb +++ b/notebooks/6.0h-mb-viz-embeddings.ipynb @@ -8,23 +8,20 @@ }, "outputs": [], "source": [ - "import gcsfs\n", - "import google.auth\n", - "\n", - "\n", "import json\n", "import os\n", "import pickle\n", "from pathlib import Path\n", "\n", - "from adjustText import adjust_text\n", - "\n", + "import gcsfs\n", + "import google.auth\n", + "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "from matplotlib import rc\n", "import torch\n", - "import wandb" + "import wandb\n", + "from adjustText import adjust_text\n", + "from matplotlib import rc" ] }, { @@ -48,7 +45,10 @@ }, "outputs": [], "source": [ - "fs.get(\"gs://thesis-bucket-option-trade-classification/data/raw/matched_samples_ise_quotes_extended.csv\", \"ise_matched.csv\")" + "fs.get(\n", + " \"gs://thesis-bucket-option-trade-classification/data/raw/matched_samples_ise_quotes_extended.csv\",\n", + " \"ise_matched.csv\",\n", + ")" ] }, { @@ -57,8 +57,8 @@ "metadata": {}, "outputs": [], "source": [ - "sec_id_root = pd.read_csv(\"ise_matched.csv\",usecols=[\"ROOT\", \"secid_OM\"])\n", - "sec_id_root = sec_id_root.drop_duplicates(keep=\"last\",subset=\"ROOT\")" + "sec_id_root = pd.read_csv(\"ise_matched.csv\", usecols=[\"ROOT\", \"secid_OM\"])\n", + "sec_id_root = sec_id_root.drop_duplicates(keep=\"last\", subset=\"ROOT\")" ] }, { @@ -69,8 +69,10 @@ }, "outputs": [], "source": [ - "security_names = pd.read_csv('../data/security_name.csv')\n", - "security_names = security_names[[\"secid\", \"issuer\"]].drop_duplicates(subset=\"secid\", keep=\"last\")" + "security_names = pd.read_csv(\"../data/security_name.csv\")\n", + "security_names = security_names[[\"secid\", \"issuer\"]].drop_duplicates(\n", + " subset=\"secid\", keep=\"last\"\n", + ")" ] }, { @@ -92,7 +94,9 @@ }, "outputs": [], "source": [ - "label = pd.read_csv('../models/metadata.tsv', sep='\\t', header=None).rename({0:\"label\"},axis=1)" + "label = pd.read_csv(\"../models/metadata.tsv\", sep=\"\\t\", header=None).rename(\n", + " {0: \"label\"}, axis=1\n", + ")" ] }, { @@ -116,7 +120,9 @@ "source": [ "label_commented = label_merged[\"label\"]\n", "\n", - "commented_label = label_merged[\"label\"].astype(str) + \" (\" + label_merged[\"issuer\"].astype(str) + \")\"\n", + "commented_label = (\n", + " label_merged[\"label\"].astype(str) + \" (\" + label_merged[\"issuer\"].astype(str) + \")\"\n", + ")\n", "# skip issue type and option type\n", "label_commented.iloc[8:] = commented_label.iloc[8:]" ] @@ -129,7 +135,7 @@ }, "outputs": [], "source": [ - "label_commented.to_csv('../models/metadata_clearlabels.tsv',sep=\"\\t\")" + 
"label_commented.to_csv(\"../models/metadata_clearlabels.tsv\", sep=\"\\t\")" ] }, { @@ -150,7 +156,7 @@ "plt.rcParams.update(params)\n", "rc(\"text\", usetex=True)\n", "\n", - "plt.rc('text.latex', preamble=r'\\usepackage{amsmath}\\usepackage[utf8]{inputenc}')\n", + "plt.rc(\"text.latex\", preamble=r\"\\usepackage{amsmath}\\usepackage[utf8]{inputenc}\")\n", "\n", "CM = 1 / 2.54" ] @@ -193,8 +199,8 @@ "\n", "artifact = run.use_artifact(model)\n", "model_dir = artifact.download()\n", - " \n", - "with open(Path(model_dir, model_name), 'rb') as f:\n", + "\n", + "with open(Path(model_dir, model_name), \"rb\") as f:\n", " model = pickle.load(f)\n", "\n", "embeddings = model.clf.feature_tokenizer.cat_tokenizer.embeddings.weight.to(\"cpu\")" @@ -219,9 +225,9 @@ "source": [ "# as done https://github.com/pytorch/pytorch/issues/51445\n", "f = open(\"tensors.tsv\", mode=\"a\")\n", - "for x in embeddings: \n", - " x = [str(i.item()) for i in x] \n", - " f.write('\\t'.join(x) + '\\n')\n", + "for x in embeddings:\n", + " x = [str(i.item()) for i in x]\n", + " f.write(\"\\t\".join(x) + \"\\n\")\n", "f.close()" ] }, @@ -242,7 +248,7 @@ "outputs": [], "source": [ "# generate t-sne projection using save to bookmark feature https://projector.tensorflow.org/\n", - "with open('../models/state.txt') as f:\n", + "with open(\"../models/state.txt\") as f:\n", " d = json.load(f)" ] }, @@ -252,9 +258,11 @@ "metadata": {}, "outputs": [], "source": [ - "tsne_projections = pd.DataFrame(d[0]['projections'])\n", + "tsne_projections = pd.DataFrame(d[0][\"projections\"])\n", "# get labels from scalers\n", - "label = pd.read_csv('../models/metadata.tsv', sep='\\t', header=None).rename({0:\"label\"},axis=1)" + "label = pd.read_csv(\"../models/metadata.tsv\", sep=\"\\t\", header=None).rename(\n", + " {0: \"label\"}, axis=1\n", + ")" ] }, { @@ -266,9 +274,7 @@ "outputs": [], "source": [ "def cos_dist_norm(matrix_of_vectors: torch.Tensor):\n", - " \"\"\"\n", - " Compute the cosine distance ([0, 2]) between two vectors that have been normalized to unit norm.\n", - " \"\"\"\n", + " \"\"\"Compute the cosine distance ([0, 2]) between two vectors that have been normalized to unit norm.\"\"\"\n", " return 1 - matrix_of_vectors @ matrix_of_vectors.T" ] }, @@ -281,8 +287,7 @@ "outputs": [], "source": [ "def cos_sim(matrix_of_vectors: torch.Tensor):\n", - " \"\"\"\n", - " Computes cosine similarities for between all vectors, extremely useful for comparing\n", + " \"\"\"Computes cosine similarities for between all vectors, extremely useful for comparing\n", " similarities between embeddings when doing deep embedding learning.\n", "\n", " Adapted from: https://github.com/dalisson/pairwise_cosine_distance_pytorch/blob/master/pairwise_cosine_similarity.py\n", @@ -290,7 +295,7 @@ " and:\n", " https://github.com/tensorflow/tensorboard/blob/00eeb7adcbf341ec25b49c37abee1cfe395ea1f9/tensorboard/plugins/projector/vz_projector/vz-projector-inspector-panel.ts#L398\n", " https://github.com/tensorflow/tensorboard/blob/00eeb7adcbf341ec25b49c37abee1cfe395ea1f9/tensorboard/plugins/projector/vz_projector/vector.ts#L64\n", - " \n", + "\n", " input:\n", " matrix_of_vectors: tensor with shape (n_vectors, vector_size)\n", "\n", @@ -300,11 +305,10 @@ " row[0,0] is 1 and row[0,42] is the similarity between the first\n", " element in the input and the 43th element in the input.\n", " \"\"\"\n", - "\n", " dot_product = matrix_of_vectors @ matrix_of_vectors.t()\n", " norms = torch.sqrt(torch.einsum(\"ii->i\", dot_product))\n", " similarities = dot_product / 
(norms[None] * norms[..., None])\n", - " return similarities\n" + " return similarities" ] }, { @@ -316,9 +320,7 @@ "outputs": [], "source": [ "def cos_dist(matrix_of_vectors: torch.Tensor):\n", - " \"\"\"\n", - " Compute the cosine distance ([0, 2]) between two vectors.\n", - " \"\"\"\n", + " \"\"\"Compute the cosine distance ([0, 2]) between two vectors.\"\"\"\n", " return 1 - cos_sim(matrix_of_vectors)" ] }, @@ -422,7 +424,6 @@ "texts = []\n", "\n", "for i, cond in enumerate(mask):\n", - "\n", " if cond:\n", " l = label[\"label\"].iloc[i]\n", " factor = 1.5 if l == key else 1\n", @@ -449,7 +450,7 @@ "\n", "fig.tight_layout()\n", "\n", - "plt.savefig(f\"../reports/Graphs/categorical_embeddings_{key}.pdf\", bbox_inches=\"tight\")\n" + "plt.savefig(f\"../reports/Graphs/categorical_embeddings_{key}.pdf\", bbox_inches=\"tight\")" ] } ], diff --git a/notebooks/6.0i-mb-discussion.ipynb b/notebooks/6.0i-mb-discussion.ipynb index 8576cd68..04896590 100644 --- a/notebooks/6.0i-mb-discussion.ipynb +++ b/notebooks/6.0i-mb-discussion.ipynb @@ -10,16 +10,13 @@ "outputs": [], "source": [ "import os\n", - "import random\n", "import sys\n", "from pathlib import Path\n", "\n", "import numpy as np\n", "import pandas as pd\n", - "from sklearn.metrics import accuracy_score\n", "\n", "sys.path.append(\"..\")\n", - "import warnings\n", "\n", "import wandb\n", "from tqdm.auto import tqdm\n", @@ -96,14 +93,14 @@ " test = pd.read_parquet(\n", " Path(data_dir, \"test_set.parquet\"), engine=\"fastparquet\", columns=columns\n", " )\n", - " \n", + "\n", "elif strategy == \"transfer\":\n", " # load test set\n", " test = pd.read_parquet(\n", " Path(data_dir, \"test_set.parquet\"), engine=\"fastparquet\", columns=columns\n", " )\n", "\n", - " \n", + "\n", "elif strategy == \"unsupervised\":\n", " # load unlabelled training set\n", " train = pd.read_parquet(\n", @@ -120,19 +117,26 @@ "outputs": [], "source": [ "def summarize_stats(df):\n", - " summary_stats = pd.DataFrame(index=df.columns) # Create an empty DataFrame with column names as index\n", + " summary_stats = pd.DataFrame(\n", + " index=df.columns\n", + " ) # Create an empty DataFrame with column names as index\n", "\n", " # Calculate summary statistics\n", " # summary_stats['Count'] = df.count()\n", " # summary_stats['Nunique'] = df.nunique()\n", "\n", " summary_stats = df.describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])\n", - " summary_stats = summary_stats.transpose() # Transpose the table to have columns as variables\n", + " summary_stats = (\n", + " summary_stats.transpose()\n", + " ) # Transpose the table to have columns as variables\n", "\n", " # Rename the columns\n", - " summary_stats.rename(columns={'mean': 'Mean', 'std': 'SD', '50%': 'Median'},\n", - " inplace=True)\n", - " return summary_stats[[\"Mean\", \"SD\", \"1%\", \"5%\", \"25%\", \"Median\", \"75%\",\"95%\", \"99%\"]]" + " summary_stats.rename(\n", + " columns={\"mean\": \"Mean\", \"std\": \"SD\", \"50%\": \"Median\"}, inplace=True\n", + " )\n", + " return summary_stats[\n", + " [\"Mean\", \"SD\", \"1%\", \"5%\", \"25%\", \"Median\", \"75%\", \"95%\", \"99%\"]\n", + " ]" ] }, { @@ -196,10 +200,10 @@ "outputs": [], "source": [ "# set here globally\n", - "EXCHANGE = \"ise\" # \"ise\"\n", - "MODELS = [\"classical\"] # \"classical\", \"fttransformer\", \"gbm\"\n", + "EXCHANGE = \"ise\" # \"ise\"\n", + "MODELS = [\"classical\"] # \"classical\", \"fttransformer\", \"gbm\"\n", "SUBSET = \"all\" # \"all\"\n", - "STRATEGY = \"supervised\" # \"supervised\" \n", + "STRATEGY = 
\"supervised\" # \"supervised\"\n", "\n", "RETRAIN = False" ] @@ -220,7 +224,7 @@ "run = wandb.init(project=\"thesis\", entity=\"fbv\")\n", "\n", "# load unscaled data\n", - "artifact = run.use_artifact(DATASET) \n", + "artifact = run.use_artifact(DATASET)\n", "data_dir = artifact.download()\n", "\n", "# load results\n", @@ -231,9 +235,9 @@ " results = f\"fbv/thesis/{EXCHANGE}_{model}_{STRATEGY}_{SUBSET}_retrain:latest\"\n", " else:\n", " results = f\"fbv/thesis/{EXCHANGE}_{model}_{STRATEGY}_{SUBSET}:latest\"\n", - " artifact = run.use_artifact(results) \n", + " artifact = run.use_artifact(results)\n", " result_dir = artifact.download()\n", - " result_dirs.append(result_dir)\n" + " result_dirs.append(result_dir)" ] }, { @@ -297,7 +301,7 @@ "\n", "X_print = eval_data\n", "\n", - "del results\n" + "del results" ] }, { @@ -440,14 +444,16 @@ }, "outputs": [], "source": [ - "pivot_table = pd.pivot_table(X_print, \n", - " values='values',\n", - " columns='prox_q_binned',\n", - " index='TRADE_SIZE_binned',\n", - " aggfunc=sum,\n", - " fill_value=0,\n", - " margins=True)\n", - "pivot_table.div(pivot_table.iloc[:,-1], axis=0 )" + "pivot_table = pd.pivot_table(\n", + " X_print,\n", + " values=\"values\",\n", + " columns=\"prox_q_binned\",\n", + " index=\"TRADE_SIZE_binned\",\n", + " aggfunc=sum,\n", + " fill_value=0,\n", + " margins=True,\n", + ")\n", + "pivot_table.div(pivot_table.iloc[:, -1], axis=0)" ] }, { @@ -465,14 +471,16 @@ }, "outputs": [], "source": [ - "pivot_table = pd.pivot_table(X_print, \n", - " values='values',\n", - " columns='myn_binned',\n", - " index='TRADE_SIZE_binned',\n", - " aggfunc=sum,\n", - " fill_value=0,\n", - " margins=True)\n", - "pivot_table.div(pivot_table.iloc[:,-1], axis=0 )" + "pivot_table = pd.pivot_table(\n", + " X_print,\n", + " values=\"values\",\n", + " columns=\"myn_binned\",\n", + " index=\"TRADE_SIZE_binned\",\n", + " aggfunc=sum,\n", + " fill_value=0,\n", + " margins=True,\n", + ")\n", + "pivot_table.div(pivot_table.iloc[:, -1], axis=0)" ] }, { @@ -491,14 +499,16 @@ "outputs": [], "source": [ "# savickas: trades with longer maturity tend to be smaller\n", - "pivot_table = pd.pivot_table(X_print, \n", - " values='values',\n", - " index='ttm_binned',\n", - " columns='TRADE_SIZE_binned',\n", - " aggfunc=sum,\n", - " fill_value=0,\n", - " margins=True)\n", - "pivot_table.div(pivot_table.iloc[:,-1], axis=0 )" + "pivot_table = pd.pivot_table(\n", + " X_print,\n", + " values=\"values\",\n", + " index=\"ttm_binned\",\n", + " columns=\"TRADE_SIZE_binned\",\n", + " aggfunc=sum,\n", + " fill_value=0,\n", + " margins=True,\n", + ")\n", + "pivot_table.div(pivot_table.iloc[:, -1], axis=0)" ] }, { @@ -516,14 +526,16 @@ }, "outputs": [], "source": [ - "pivot_table = pd.pivot_table(X_print, \n", - " values='values',\n", - " index='issue_type',\n", - " columns=None,\n", - " aggfunc=sum,\n", - " fill_value=0,\n", - " margins=True)\n", - "pivot_table.div(pivot_table.iloc[-1], axis=1)\n" + "pivot_table = pd.pivot_table(\n", + " X_print,\n", + " values=\"values\",\n", + " index=\"issue_type\",\n", + " columns=None,\n", + " aggfunc=sum,\n", + " fill_value=0,\n", + " margins=True,\n", + ")\n", + "pivot_table.div(pivot_table.iloc[-1], axis=1)" ] }, { @@ -541,13 +553,15 @@ }, "outputs": [], "source": [ - "pivot_table = pd.pivot_table(X_print, \n", - " values='values',\n", - " index='prox_q_binned',\n", - " columns=None,\n", - " aggfunc=sum,\n", - " fill_value=0,\n", - " margins=True)\n", + "pivot_table = pd.pivot_table(\n", + " X_print,\n", + " values=\"values\",\n", + " 
index=\"prox_q_binned\",\n", + " columns=None,\n", + " aggfunc=sum,\n", + " fill_value=0,\n", + " margins=True,\n", + ")\n", "pivot_table.div(pivot_table.iloc[-1], axis=1)" ] }, @@ -570,7 +584,9 @@ "results = []\n", "\n", "# calculate true rel effective spread but not aggregated, convert to %\n", - "es_true = effective_spread(X_print[\"buy_sell\"], X_print[\"TRADE_PRICE\"], mid, mode=\"none\")\n", + "es_true = effective_spread(\n", + " X_print[\"buy_sell\"], X_print[\"TRADE_PRICE\"], mid, mode=\"none\"\n", + ")\n", "nom_true = np.nanmean(es_true)\n", "\n", "eps_true = np.empty(es_true.shape)\n", @@ -579,26 +595,31 @@ "\n", "\n", "for classifier in tqdm(classifiers):\n", - "\n", " # calculate pred rel effective spread but not aggregated convert to %\n", - " es_pred = effective_spread(X_print[classifier], X_print[\"TRADE_PRICE\"], mid, mode=\"none\")\n", - " \n", + " es_pred = effective_spread(\n", + " X_print[classifier], X_print[\"TRADE_PRICE\"], mid, mode=\"none\"\n", + " )\n", + "\n", " eps_pred = np.empty(es_pred.shape)\n", " np.divide(es_pred, mid, out=eps_pred, where=mid != 0)\n", "\n", - " wilcoxon_res = wilcoxon(eps_pred, eps_true, nan_policy=\"omit\", zero_method=\"zsplit\")\n", + " wilcoxon_res = wilcoxon(eps_pred, eps_true, nan_policy=\"omit\", zero_method=\"zsplit\")\n", "\n", " res = pd.Series(\n", - " {\n", - " \"nom_pred\": np.nanmean(es_pred),\n", - " \"rel_pred\": np.nanmean(eps_pred),\n", - " \"statistic\":wilcoxon_res.statistic,\n", - " \"pvalue\":wilcoxon_res.pvalue,\n", - " }, name=classifier\n", - " )\n", + " {\n", + " \"nom_pred\": np.nanmean(es_pred),\n", + " \"rel_pred\": np.nanmean(eps_pred),\n", + " \"statistic\": wilcoxon_res.statistic,\n", + " \"pvalue\": wilcoxon_res.pvalue,\n", + " },\n", + " name=classifier,\n", + " )\n", " results.append(res)\n", "\n", - "true_eff = pd.Series({\"nom_pred\":nom_true, \"rel_pred\": rel_true, \"statistic\":np.NaN, \"pvalue\":np.NaN}, name=\"true_eff\")\n", + "true_eff = pd.Series(\n", + " {\"nom_pred\": nom_true, \"rel_pred\": rel_true, \"statistic\": np.nan, \"pvalue\": np.nan},\n", + " name=\"true_eff\",\n", + ")\n", "\n", "results.append(true_eff)\n", "\n", @@ -613,7 +634,7 @@ }, "outputs": [], "source": [ - "results.T.style.format(\"{:.3f}\")\n" + "results.T.style.format(\"{:.3f}\")" ] }, { @@ -631,7 +652,7 @@ " label=f\"tab:eff-{KEY}\",\n", " caption=(f\"long-eff-{KEY}\", f\"short-eff-{KEY}\"),\n", " convert_css=True,\n", - ")\n" + ")" ] } ], diff --git a/src/otc/models/fttransformer.py b/src/otc/models/fttransformer.py index 627e5cca..590a6791 100644 --- a/src/otc/models/fttransformer.py +++ b/src/otc/models/fttransformer.py @@ -34,10 +34,8 @@ def _is_glu_activation(activation: Callable[..., nn.Module]) -> bool: bool: truth value. """ return ( - isinstance(activation, str) - and activation.endswith("GLU") - or activation in [ReGLU, GeGLU] - ) + isinstance(activation, str) and activation.endswith("GLU") + ) or activation in [ReGLU, GeGLU] def _all_or_none(values: list[Any]) -> bool: