From 4de1e0b5eba7e474f90b0e61b08b47cad804e005 Mon Sep 17 00:00:00 2001
From: Kamil Tagowski
Date: Wed, 15 Feb 2023 09:23:28 +0100
Subject: [PATCH] fix: Fix validate lightning models inference tutorial

---
 .github/workflows/ghp_deploy.yml              |   2 +-
 .../validate_lightning_models_inference.ipynb | 247 +++---------------
 2 files changed, 37 insertions(+), 212 deletions(-)

diff --git a/.github/workflows/ghp_deploy.yml b/.github/workflows/ghp_deploy.yml
index d282442b..25327df1 100644
--- a/.github/workflows/ghp_deploy.yml
+++ b/.github/workflows/ghp_deploy.yml
@@ -2,7 +2,7 @@ name: CD
 on:
   push:
     branches:
-      ["main", "master", "271-create-documentation-and-library-presentation"]
+      ["main", "master", "271-create-documentation-and-library-presentation", "fix/fix-inference-tutorial"]
   workflow_dispatch:
 jobs:
   deploy_ghp:
diff --git a/nbs/01_Tutorials/validate_lightning_models_inference.ipynb b/nbs/01_Tutorials/validate_lightning_models_inference.ipynb
index f1ee04e0..00270def 100644
--- a/nbs/01_Tutorials/validate_lightning_models_inference.ipynb
+++ b/nbs/01_Tutorials/validate_lightning_models_inference.ipynb
@@ -30,8 +30,15 @@
    "outputs": [],
    "source": [
     "#| hide\n",
-    "%load_ext autoreload\n",
-    "%autoreload 2"
+    "import os\n",
+    "import warnings\n",
+    "\n",
+    "# disable warnings\n",
+    "warnings.simplefilter(\"ignore\")\n",
+    "# set os environ variable for multiprocesses\n",
+    "os.environ[\"PYTHONWARNINGS\"] = \"ignore\"\n",
+    "\n",
+    "os.chdir(\"..\")"
    ]
   },
   {
@@ -39,21 +46,8 @@
    "execution_count": null,
    "id": "1019b750-cebe-438b-b1ab-434d6f756864",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/conda/envs/embeddings/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "#| eval: false\n",
-    "import os\n",
-    "\n",
-    "os.chdir(\"..\")\n",
     "from typing import Any, Dict\n",
     "\n",
     "import pytorch_lightning as pl\n",
@@ -75,25 +69,15 @@
    "execution_count": null,
    "id": "2d0e06e2-3c5a-420b-b065-31d5ccd6b255",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2023-02-13 23:05:26,246 - embeddings.utils.utils - WARNING - String 'hf-internal-testing/tiny-albert' contains '/'. Replacing it with '__'. Cleaned_text: hf-internal-testing__tiny-albert.\n",
-      "2023-02-13 23:05:26,247 - embeddings.utils.utils - WARNING - String 'clarin-pl/polemo2-official' contains '/'. Replacing it with '__'. Cleaned_text: clarin-pl__polemo2-official.\n",
-      "2023-02-13 23:05:26,254 - embeddings.utils.utils - WARNING - String 'hf-internal-testing/tiny-albert' contains '/'. Replacing it with '__'. Cleaned_text: hf-internal-testing__tiny-albert.\n",
-      "2023-02-13 23:05:26,256 - embeddings.utils.utils - WARNING - String 'clarin-pl/polemo2-official' contains '/'. Replacing it with '__'. Cleaned_text: clarin-pl__polemo2-official.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "#| eval: false\n",
+    "#|exec_doc\n",
+    "\n",
     "embedding_name_or_path = \"hf-internal-testing/tiny-albert\"\n",
     "dataset_name = \"clarin-pl/polemo2-official\"\n",
     "\n",
     "dataset_path = build_output_path(DATASET_PATH, embedding_name_or_path, dataset_name)\n",
-    "output_path = build_output_path(RESULTS_PATH, embedding_name_or_path, dataset_name)"
+    "output_path = build_output_path(\".\", embedding_name_or_path, dataset_name, mkdirs=True)"
    ]
   },
   {
@@ -109,26 +93,10 @@
    "execution_count": null,
    "id": "095d1c88-900f-4275-a879-f9efdb73265a",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using custom data configuration default-e0c1ce6ddfd81769\n",
-      "Found cached dataset polemo2-official (/root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70)\n",
-      "100%|██████████| 3/3 [00:00<00:00, 817.23it/s]\n",
-      "Loading cached split indices for dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-a54edce9681df8b7.arrow and /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-09cf731207f31628.arrow\n",
-      "Loading cached split indices for dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-c48721732fabb729.arrow and /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-f9d782422a65c7e6.arrow\n",
-      "Loading cached split indices for dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-0db6321193feb3ec.arrow and /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-4e6c26839c3e4adf.arrow\n",
-      "Loading cached processed dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-e32b75da1d28bfd0.arrow\n",
-      "Loading cached processed dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-98cbedcc70a23855.arrow\n",
-      "Loading cached processed dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-b2cbb8ab856bac0f.arrow\n",
-      " \r"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "#| eval: false\n",
+    "#|exec_doc\n",
+    "\n",
     "def preprocess_data(path: str) -> Dict[str, Any]:\n",
     "    pipeline = HuggingFacePreprocessingPipeline(\n",
     "        dataset_name=dataset_name,\n",
@@ -141,7 +109,7 @@
     "        persist_path=path,\n",
     "        sample_missing_splits=None,\n",
     "        ignore_test_subset=False,\n",
-    "        downsample_splits=(0.01, 0.01, 0.05),\n",
+    "        downsample_splits=(0.01, 0.01, 0.01),\n",
     "        seed=441,\n",
     "    )\n",
     "    pipeline.run()\n",
@@ -171,7 +139,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| eval: false\n",
+    "#|exec_doc\n",
+    "\n",
     "config = LightningAdvancedConfig(\n",
     "    finetune_last_n_layers=0,\n",
     "    task_train_kwargs={\"max_epochs\": 1, \"deterministic\": True,},\n",
@@ -199,140 +168,10 @@
    "execution_count": null,
    "id": "148a0089-f461-4948-93fa-04f2e34ac9e0",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:00<00:00, 6.73ba/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00, 25.69ba/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00, 26.51ba/s]\n",
-      "Casting the dataset: 100%|██████████| 1/1 [00:00<00:00, 113.84ba/s]\n",
-      "Casting the dataset: 100%|██████████| 1/1 [00:00<00:00, 68.70ba/s]\n",
-      "Casting the dataset: 100%|██████████| 1/1 [00:00<00:00, 103.15ba/s]\n",
-      "Some weights of the model checkpoint at hf-internal-testing/tiny-albert were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.dense.weight']\n",
-      "- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
-      "- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
-      "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at hf-internal-testing/tiny-albert and are newly initialized: ['classifier.bias', 'albert.pooler.weight', 'albert.pooler.bias', 'classifier.weight']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
-      "GPU available: True, used: False\n",
-      "TPU available: False, using: 0 TPU cores\n",
-      "IPU available: False, using: 0 IPUs\n",
-      "/opt/conda/envs/embeddings/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1579: UserWarning: GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`.\n",
-      "  rank_zero_warn(\n",
-      "\n",
-      "  | Name          | Type                            | Params\n",
-      "------------------------------------------------------------------\n",
-      "0 | model         | AlbertForSequenceClassification | 352 K \n",
-      "1 | metrics       | MetricCollection                | 0     \n",
-      "2 | train_metrics | MetricCollection                | 0     \n",
-      "3 | val_metrics   | MetricCollection                | 0     \n",
-      "4 | test_metrics  | MetricCollection                | 0     \n",
-      "------------------------------------------------------------------\n",
-      "132       Trainable params\n",
-      "352 K     Non-trainable params\n",
-      "352 K     Total params\n",
-      "1.410     Total estimated model params size (MB)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Validation sanity check: 0%|          | 0/1 [00:00