From 93cff75536a3cc3b12f07a5638268f16774dc603 Mon Sep 17 00:00:00 2001 From: vfdev Date: Sun, 24 Nov 2024 22:38:38 +0100 Subject: [PATCH 01/10] try updating gpu tests GHA (#3306) --- .github/workflows/gpu-tests.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 81862e1f67bd..0a72711fbddc 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -16,7 +16,7 @@ concurrency: group: gpu-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }} cancel-in-progress: true -# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml +# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job_v2.yml jobs: gpu-tests: @@ -25,7 +25,7 @@ jobs: pytorch-channel: [pytorch, pytorch-nightly] fail-fast: false env: - DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1" + DOCKER_IMAGE: "pytorch/almalinux-builder:cuda12.4" REPOSITORY: ${{ github.repository }} PR_NUMBER: ${{ github.event.pull_request.number }} runs-on: linux.8xlarge.nvidia.gpu @@ -40,7 +40,7 @@ jobs: echo "::endgroup::" - name: Checkout repository (pytorch/test-infra) - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: pytorch/test-infra @@ -55,7 +55,7 @@ jobs: docker-image: ${{ env.DOCKER_IMAGE }} - name: Checkout repository (${{ github.repository }}) - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ github.repository }} @@ -102,9 +102,9 @@ jobs: # Install PyTorch if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then - pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu121 + pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu124 else - pip install 
--upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121 + pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124 fi python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())" @@ -139,7 +139,7 @@ jobs: - name: Run examples in container continue-on-error: false run: | - SCRIPT=$(cat << EOF + script=$(cat << EOF set -xe From 36ff817506f0172516b66addfeb097892a04e933 Mon Sep 17 00:00:00 2001 From: vfdev Date: Sun, 24 Nov 2024 22:40:41 +0100 Subject: [PATCH 02/10] Update pyproject.toml (#3305) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a6aae5458ad8..016c0d9eb13b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.black] line-length = 120 -target-version = ['py38', 'py39'] +target-version = ['py39', 'py311'] include = '\.pyi?$' exclude = ''' From 9e2763e097e08a42caa9b7e03a88b6fff38621a5 Mon Sep 17 00:00:00 2001 From: vfdev Date: Tue, 3 Dec 2024 11:38:46 +0100 Subject: [PATCH 03/10] Update requirements-dev.txt (#3310) --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index d475e556cdff..91b560e56530 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -21,7 +21,7 @@ mlflow neptune-client>=0.16.17 tensorboard torchvision -pynvml +pynvml<12 # pynvml module was removed in 12.X, is not developed or maintained. We should replace pynvml with something else. 
clearml scikit-image py-rouge From 1c3b9e975073bd4be47533fe98adf537b2ea67b4 Mon Sep 17 00:00:00 2001 From: vfdev Date: Tue, 3 Dec 2024 13:07:30 +0100 Subject: [PATCH 04/10] Fixed GPU tests exec scripts and failing metrics (#3301) * Fixed GPU tests and failing metrics * Updated timeout param * Updated infra cuda12.1 -> cuda12.4 * Add tmate for debug * Disable sudo * Attempt to debug tmate! * Attempt to use bash in step * Update gpu-tests.yml * Skip failing test and remove tmate debugging * Fixed formatting --------- Co-authored-by: Sadra Barikbin --- .github/workflows/gpu-tests.yml | 2 +- .../clustering/calinski_harabasz_score.py | 4 ++-- .../metrics/clustering/davies_bouldin_score.py | 4 ++-- ignite/metrics/clustering/silhouette_score.py | 4 ++-- .../metrics/regression/kendall_correlation.py | 4 ++-- .../metrics/regression/spearman_correlation.py | 4 ++-- tests/common_test_functionality.sh | 5 ++--- .../metrics/test_classification_report.py | 17 +++++++++++++++++ tests/ignite/metrics/test_hsic.py | 4 ++-- tests/run_cpu_tests.sh | 5 ++--- tests/run_gpu_tests.sh | 10 +++++----- 11 files changed, 39 insertions(+), 24 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 0a72711fbddc..13c628ad302c 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -124,7 +124,7 @@ jobs: uses: nick-fields/retry@v2.9.0 with: max_attempts: 5 - timeout_minutes: 25 + timeout_minutes: 45 shell: bash command: docker exec -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2' new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2' diff --git a/ignite/metrics/clustering/calinski_harabasz_score.py b/ignite/metrics/clustering/calinski_harabasz_score.py index fe58ac461517..79f8dc99ba50 100644 --- a/ignite/metrics/clustering/calinski_harabasz_score.py +++ b/ignite/metrics/clustering/calinski_harabasz_score.py @@ -11,8 +11,8 @@ def _calinski_harabasz_score(features: 
Tensor, labels: Tensor) -> float: from sklearn.metrics import calinski_harabasz_score - np_features = features.numpy() - np_labels = labels.numpy() + np_features = features.cpu().numpy() + np_labels = labels.cpu().numpy() score = calinski_harabasz_score(np_features, np_labels) return score diff --git a/ignite/metrics/clustering/davies_bouldin_score.py b/ignite/metrics/clustering/davies_bouldin_score.py index b34ec69f51ad..afea0518951b 100644 --- a/ignite/metrics/clustering/davies_bouldin_score.py +++ b/ignite/metrics/clustering/davies_bouldin_score.py @@ -11,8 +11,8 @@ def _davies_bouldin_score(features: Tensor, labels: Tensor) -> float: from sklearn.metrics import davies_bouldin_score - np_features = features.numpy() - np_labels = labels.numpy() + np_features = features.cpu().numpy() + np_labels = labels.cpu().numpy() score = davies_bouldin_score(np_features, np_labels) return score diff --git a/ignite/metrics/clustering/silhouette_score.py b/ignite/metrics/clustering/silhouette_score.py index 39b28c5d0409..48a59d583ec4 100644 --- a/ignite/metrics/clustering/silhouette_score.py +++ b/ignite/metrics/clustering/silhouette_score.py @@ -111,7 +111,7 @@ def __init__( def _silhouette_score(self, features: Tensor, labels: Tensor) -> float: from sklearn.metrics import silhouette_score - np_features = features.numpy() - np_labels = labels.numpy() + np_features = features.cpu().numpy() + np_labels = labels.cpu().numpy() score = silhouette_score(np_features, np_labels, **self._silhouette_kwargs) return score diff --git a/ignite/metrics/regression/kendall_correlation.py b/ignite/metrics/regression/kendall_correlation.py index 7ad87b224024..34d876a36599 100644 --- a/ignite/metrics/regression/kendall_correlation.py +++ b/ignite/metrics/regression/kendall_correlation.py @@ -16,8 +16,8 @@ def _get_kendall_tau(variant: str = "b") -> Callable[[Tensor, Tensor], float]: raise ValueError(f"variant accepts 'b' or 'c', got {variant!r}.") def _tau(predictions: Tensor, targets: Tensor) -> 
float: - np_preds = predictions.flatten().numpy() - np_targets = targets.flatten().numpy() + np_preds = predictions.flatten().cpu().numpy() + np_targets = targets.flatten().cpu().numpy() r = kendalltau(np_preds, np_targets, variant=variant).statistic return r diff --git a/ignite/metrics/regression/spearman_correlation.py b/ignite/metrics/regression/spearman_correlation.py index 7f126d6e56be..cbd89f67c9d0 100644 --- a/ignite/metrics/regression/spearman_correlation.py +++ b/ignite/metrics/regression/spearman_correlation.py @@ -12,8 +12,8 @@ def _spearman_r(predictions: Tensor, targets: Tensor) -> float: from scipy.stats import spearmanr - np_preds = predictions.flatten().numpy() - np_targets = targets.flatten().numpy() + np_preds = predictions.flatten().cpu().numpy() + np_targets = targets.flatten().cpu().numpy() r = spearmanr(np_preds, np_targets).statistic return r diff --git a/tests/common_test_functionality.sh b/tests/common_test_functionality.sh index 6e60947f927b..91003eddc092 100644 --- a/tests/common_test_functionality.sh +++ b/tests/common_test_functionality.sh @@ -85,7 +85,6 @@ run_tests() { skip_distrib_opt="" fi - echo [pytest] > pytest.ini ; echo "cache_dir=${cache_dir}" >> pytest.ini # Assemble options for the pytest command @@ -103,8 +102,8 @@ run_tests() { # Run the command if [ "$trap_deselected_exit_code" -eq "1" ]; then - CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}" || { exit_code=$?; if [ "$exit_code" -eq ${last_failed_no_failures_code} ]; then echo "All tests deselected"; else exit $exit_code; fi; } + eval "pytest ${pytest_args}" || { exit_code=$?; if [ "$exit_code" -eq ${last_failed_no_failures_code} ]; then echo "All tests deselected"; else exit $exit_code; fi; } else - CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}" + eval "pytest ${pytest_args}" fi } diff --git a/tests/ignite/metrics/test_classification_report.py b/tests/ignite/metrics/test_classification_report.py index 87e328c8051e..cae8b5145f55 100644 --- 
a/tests/ignite/metrics/test_classification_report.py +++ b/tests/ignite/metrics/test_classification_report.py @@ -164,6 +164,23 @@ def update(engine, i): @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") @pytest.mark.skipif(Version(torch.__version__) < Version("1.7.0"), reason="Skip if < 1.7.0") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + pytest.skip("Temporarily skip failing test. See https://github.com/pytorch/ignite/pull/3301") + # When run with 2 devices: + # tests/ignite/metrics/test_classification_report.py::test_distrib_nccl_gpu Fatal Python error: Aborted + # Thread 0x00007fac95c95700 (most recent call first): + # + + # Thread 0x00007facbb89b700 (most recent call first): + # + + # Thread 0x00007fae637f4700 (most recent call first): + # File "", line 534 in read + # File "", line 567 in from_io + # File "", line 1160 in _thread_receiver + # File "", line 341 in run + # File "", line 411 in _perform_spawn + device = idist.device() _test_integration_multiclass(device, True) _test_integration_multiclass(device, False) diff --git a/tests/ignite/metrics/test_hsic.py b/tests/ignite/metrics/test_hsic.py index 57af5fa2862c..28fe5c1f97db 100644 --- a/tests/ignite/metrics/test_hsic.py +++ b/tests/ignite/metrics/test_hsic.py @@ -139,10 +139,10 @@ def test_integration(self, sigma_x: float, sigma_y: float): metric_devices.append(device) for metric_device in metric_devices: - x = torch.randn((n_iters * batch_size, n_dims_x)).float().to(device) + x = torch.randn((n_iters * batch_size, n_dims_x), device=device).float() lin = nn.Linear(n_dims_x, n_dims_y).to(device) - y = torch.sin(lin(x) * 100) + torch.randn(n_iters * batch_size, n_dims_y) * 1e-4 + y = torch.sin(lin(x) * 100) + torch.randn(n_iters * batch_size, n_dims_y, device=x.device) * 1e-4 def data_loader(i, input_x, input_y): return input_x[i * batch_size : (i + 1) * batch_size], input_y[i * batch_size : (i + 1) * batch_size] diff --git a/tests/run_cpu_tests.sh 
b/tests/run_cpu_tests.sh index 8d387f5542e7..f52988a68183 100644 --- a/tests/run_cpu_tests.sh +++ b/tests/run_cpu_tests.sh @@ -6,8 +6,7 @@ skip_distrib_tests=${SKIP_DISTRIB_TESTS:-0} use_last_failed=${USE_LAST_FAILED:-0} match_tests_expression=${1:-""} - -run_tests \ +CUDA_VISIBLE_DEVICES="" run_tests \ --core_args "--tx 4*popen//python=python -vvv tests/ignite" \ --cache_dir ".cpu-not-distrib" \ --skip_distrib_tests "${skip_distrib_tests}" \ @@ -21,7 +20,7 @@ if [ "${skip_distrib_tests}" -eq "1" ]; then fi # Run 2 processes with --dist=each -run_tests \ +CUDA_VISIBLE_DEVICES="" run_tests \ --core_args "-m distributed -vvv tests/ignite" \ --world_size 2 \ --cache_dir ".cpu-distrib" \ diff --git a/tests/run_gpu_tests.sh b/tests/run_gpu_tests.sh index 26497f19c83e..c86d1d0746ee 100644 --- a/tests/run_gpu_tests.sh +++ b/tests/run_gpu_tests.sh @@ -2,26 +2,26 @@ source "$(dirname "$0")/common_test_functionality.sh" set -xeu -skip_distrib_tests=${SKIP_DISTRIB_TESTS:-1} +# https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02 +skip_distrib_tests=${SKIP_DISTRIB_TESTS:-0} use_last_failed=${USE_LAST_FAILED:-0} ngpus=${1:-1} match_tests_expression=${2:-""} if [ -z "$match_tests_expression" ]; then - cuda_pattern="cuda" + cuda_pattern="cuda or nccl or gloo" else - cuda_pattern="cuda and $match_tests_expression" + cuda_pattern="(cuda or nccl or gloo) and $match_tests_expression" fi run_tests \ - --core_args "-vvv tests/ignite" \ + --core_args "-vvv tests/ignite -m 'not distributed'" \ --cache_dir ".gpu-cuda" \ --skip_distrib_tests "${skip_distrib_tests}" \ --use_coverage 1 \ --match_tests_expression "${cuda_pattern}" \ --use_last_failed ${use_last_failed} -# https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02 if [ "${skip_distrib_tests}" -eq "1" ]; then exit 0 fi From 4f462109858291f3499e8fab809d91e69a9d9532 Mon Sep 17 00:00:00 2001 From: vfdev Date: Tue, 3 Dec 2024 13:39:54 +0100 Subject: [PATCH 05/10] Updated 
GpuInfo metric, pynvml<12 (#3311) --- ignite/contrib/engines/common.py | 2 +- ignite/metrics/gpu_info.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ignite/contrib/engines/common.py b/ignite/contrib/engines/common.py index 09f769a18d0f..bcfa54be55ea 100644 --- a/ignite/contrib/engines/common.py +++ b/ignite/contrib/engines/common.py @@ -78,7 +78,7 @@ def setup_common_training_handlers( lr_scheduler: learning rate scheduler as native torch LRScheduler or ignite's parameter scheduler. with_gpu_stats: if True, :class:`~ignite.metrics.GpuInfo` is attached to the - trainer. This requires `pynvml` package to be installed. + trainer. This requires `pynvml<12` package to be installed. output_names: list of names associated with `update_function` output dictionary. with_pbars: if True, two progress bars on epochs and optionally on iterations are attached. Default, True. diff --git a/ignite/metrics/gpu_info.py b/ignite/metrics/gpu_info.py index 96ed4f07c57c..d13bbd8a1dae 100644 --- a/ignite/metrics/gpu_info.py +++ b/ignite/metrics/gpu_info.py @@ -10,7 +10,7 @@ class GpuInfo(Metric): """Provides GPU information: a) used memory percentage, b) gpu utilization percentage values as Metric - on each iterations. + on each iteration. This metric requires `pynvml <https://pypi.org/project/pynvml>`_ package of version `<12`. .. Note :: @@ -39,7 +39,7 @@ def __init__(self) -> None: except ImportError: raise ModuleNotFoundError( "This contrib module requires pynvml to be installed. 
" - "Please install it with command: \n pip install pynvml" + "Please install it with command: \n pip install 'pynvml<12'" ) # Let's check available devices if not torch.cuda.is_available(): From 6f8ad2a16b2d82fd6b2b83b849b86160fe8a8b6a Mon Sep 17 00:00:00 2001 From: Fabio Bonassi Date: Tue, 3 Dec 2024 17:28:22 +0100 Subject: [PATCH 06/10] =?UTF-8?q?Give=20the=20option=20to=20terminate=20th?= =?UTF-8?q?e=20engine=20without=20firing=20Events.COMPLET=E2=80=A6=20(#330?= =?UTF-8?q?9)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Give the option to terminate the engine without firing Events.COMPLETED. The default behaviour is not changed. Note that even though Events.COMPLETED is not fired, its timer is updated. * Update ignite/engine/engine.py Co-authored-by: vfdev * Update ignite/engine/engine.py Co-authored-by: vfdev * Update ignite/engine/engine.py Co-authored-by: vfdev * Update ignite/engine/engine.py Co-authored-by: vfdev * Update ignite/engine/events.py Co-authored-by: vfdev * Argument `skip_event_completed` renamed to `skip_completed` * - Fixed docs broken links. - Do not update self.state.times[Events.COMPLETED.name] if terminated - Fixed unit test * Update ignite/engine/engine.py Co-authored-by: vfdev * Refactoring and patching. - Engine time logging moved out of the if clause. In the log message "completed" has been replaced with "finished" to avoid confusion. - Same changes applied to the method `_internal_run_legacy()` * Restored .gitignore Sorry for accidentally including it into the previous commit! 
* Update ignite/engine/events.py * Fixed typo in test_engine.py * Parametrized test for engine.terminate(skip_completed) * Update event table * Fixed documentation --------- Co-authored-by: vfdev --- ignite/engine/engine.py | 41 +++++++++++++------ ignite/engine/events.py | 25 ++++++++---- tests/ignite/contrib/engines/test_common.py | 1 - tests/ignite/engine/test_engine.py | 45 ++++++++++++++------- 4 files changed, 76 insertions(+), 36 deletions(-) diff --git a/ignite/engine/engine.py b/ignite/engine/engine.py index 27a949cacca2..e2a148986075 100644 --- a/ignite/engine/engine.py +++ b/ignite/engine/engine.py @@ -140,6 +140,7 @@ def __init__(self, process_function: Callable[["Engine", Any], Any]): self._process_function = process_function self.last_event_name: Optional[Events] = None self.should_terminate = False + self.skip_completed_after_termination = False self.should_terminate_single_epoch = False self.should_interrupt = False self.state = State() @@ -538,7 +539,7 @@ def call_interrupt(): self.logger.info("interrupt signaled. Engine will interrupt the run after current iteration is finished.") self.should_interrupt = True - def terminate(self) -> None: + def terminate(self, skip_completed: bool = False) -> None: """Sends terminate signal to the engine, so that it terminates completely the run. The run is terminated after the event on which ``terminate`` method was called. The following events are triggered: @@ -547,6 +548,9 @@ def terminate(self) -> None: - :attr:`~ignite.engine.events.Events.TERMINATE` - :attr:`~ignite.engine.events.Events.COMPLETED` + Args: + skip_completed: if True, the event :attr:`~ignite.engine.events.Events.COMPLETED` is not fired after + :attr:`~ignite.engine.events.Events.TERMINATE`. Default is False. Examples: .. testcode:: @@ -617,9 +621,12 @@ def terminate(): .. versionchanged:: 0.4.10 Behaviour changed, for details see https://github.com/pytorch/ignite/issues/2669 + .. 
versionchanged:: 0.5.2 + Added `skip_completed` flag """ self.logger.info("Terminate signaled. Engine will stop after current iteration is finished.") self.should_terminate = True + self.skip_completed_after_termination = skip_completed def terminate_epoch(self) -> None: """Sends terminate signal to the engine, so that it terminates the current epoch. The run @@ -993,13 +1000,17 @@ def _internal_run_as_gen(self) -> Generator[Any, None, State]: time_taken = time.time() - start_time # time is available for handlers but must be updated after fire self.state.times[Events.COMPLETED.name] = time_taken - handlers_start_time = time.time() - self._fire_event(Events.COMPLETED) - time_taken += time.time() - handlers_start_time - # update time wrt handlers - self.state.times[Events.COMPLETED.name] = time_taken + + # do not fire Events.COMPLETED if we terminated the run with flag `skip_completed=True` + if not (self.should_terminate and self.skip_completed_after_termination): + handlers_start_time = time.time() + self._fire_event(Events.COMPLETED) + time_taken += time.time() - handlers_start_time + # update time wrt handlers + self.state.times[Events.COMPLETED.name] = time_taken + hours, mins, secs = _to_hours_mins_secs(time_taken) - self.logger.info(f"Engine run complete. Time taken: {hours:02d}:{mins:02d}:{secs:06.3f}") + self.logger.info(f"Engine run finished. 
Time taken: {hours:02d}:{mins:02d}:{secs:06.3f}") except BaseException as e: self._dataloader_iter = None @@ -1174,13 +1185,17 @@ def _internal_run_legacy(self) -> State: time_taken = time.time() - start_time # time is available for handlers but must be updated after fire self.state.times[Events.COMPLETED.name] = time_taken - handlers_start_time = time.time() - self._fire_event(Events.COMPLETED) - time_taken += time.time() - handlers_start_time - # update time wrt handlers - self.state.times[Events.COMPLETED.name] = time_taken + + # do not fire Events.COMPLETED if we terminated the run with flag `skip_completed=True` + if not (self.should_terminate and self.skip_completed_after_termination): + handlers_start_time = time.time() + self._fire_event(Events.COMPLETED) + time_taken += time.time() - handlers_start_time + # update time wrt handlers + self.state.times[Events.COMPLETED.name] = time_taken + hours, mins, secs = _to_hours_mins_secs(time_taken) - self.logger.info(f"Engine run complete. Time taken: {hours:02d}:{mins:02d}:{secs:06.3f}") + self.logger.info(f"Engine run finished. Time taken: {hours:02d}:{mins:02d}:{secs:06.3f}") except BaseException as e: self._dataloader_iter = None diff --git a/ignite/engine/events.py b/ignite/engine/events.py index 9dd99348492b..87622d3415cc 100644 --- a/ignite/engine/events.py +++ b/ignite/engine/events.py @@ -259,36 +259,47 @@ class Events(EventEnum): - TERMINATE_SINGLE_EPOCH : triggered when the run is about to end the current epoch, after receiving a :meth:`~ignite.engine.engine.Engine.terminate_epoch()` or :meth:`~ignite.engine.engine.Engine.terminate()` call. + - EPOCH_COMPLETED : triggered when the epoch is ended. Note that this is triggered even + when :meth:`~ignite.engine.engine.Engine.terminate_epoch()` is called. - TERMINATE : triggered when the run is about to end completely, after receiving :meth:`~ignite.engine.engine.Engine.terminate()` call. - - EPOCH_COMPLETED : triggered when the epoch is ended. 
Note that this is triggered even - when :meth:`~ignite.engine.engine.Engine.terminate_epoch()` is called. - - COMPLETED : triggered when engine's run is completed + - COMPLETED : triggered when engine's run is completed or terminated with + :meth:`~ignite.engine.engine.Engine.terminate()`, unless the flag + `skip_completed` is set to True. The table below illustrates which events are triggered when various termination methods are called. .. list-table:: - :widths: 24 25 33 18 + :widths: 35 38 28 20 20 :header-rows: 1 * - Method - - EVENT_COMPLETED - TERMINATE_SINGLE_EPOCH + - EPOCH_COMPLETED - TERMINATE + - COMPLETED * - no termination - - ✔ - ✗ + - ✔ - ✗ + - ✔ * - :meth:`~ignite.engine.engine.Engine.terminate_epoch()` - ✔ - ✔ - ✗ + - ✔ * - :meth:`~ignite.engine.engine.Engine.terminate()` - ✗ - ✔ - ✔ + - ✔ + * - :meth:`~ignite.engine.engine.Engine.terminate()` with `skip_completed=True` + - ✗ + - ✔ + - ✔ + - ✗ Since v0.3.0, Events become more flexible and allow to pass an event filter to the Engine: @@ -357,7 +368,7 @@ class CustomEvents(EventEnum): STARTED = "started" """triggered when engine's run is started.""" COMPLETED = "completed" - """triggered when engine's run is completed""" + """triggered when engine's run is completed, or after receiving terminate() call.""" ITERATION_STARTED = "iteration_started" """triggered when an iteration is started.""" diff --git a/tests/ignite/contrib/engines/test_common.py b/tests/ignite/contrib/engines/test_common.py index d0100be9e8da..e14042e62c15 100644 --- a/tests/ignite/contrib/engines/test_common.py +++ b/tests/ignite/contrib/engines/test_common.py @@ -8,7 +8,6 @@ from torch.utils.data.distributed import DistributedSampler import ignite.distributed as idist - import ignite.handlers as handlers from ignite.contrib.engines.common import ( _setup_logging, diff --git a/tests/ignite/engine/test_engine.py b/tests/ignite/engine/test_engine.py index 130212426504..fcb0299aa22d 100644 --- a/tests/ignite/engine/test_engine.py +++ 
b/tests/ignite/engine/test_engine.py @@ -40,11 +40,14 @@ class TestEngine: def set_interrupt_resume_enabled(self, interrupt_resume_enabled): Engine.interrupt_resume_enabled = interrupt_resume_enabled - def test_terminate(self): + @pytest.mark.parametrize("skip_completed", [True, False]) + def test_terminate(self, skip_completed): engine = Engine(lambda e, b: 1) assert not engine.should_terminate - engine.terminate() + assert not engine.skip_completed_after_termination + engine.terminate(skip_completed) assert engine.should_terminate + assert engine.skip_completed_after_termination == skip_completed def test_invalid_process_raises_with_invalid_signature(self): with pytest.raises(ValueError, match=r"Engine must be given a processing function in order to run"): @@ -236,25 +239,32 @@ def check_iter_and_data(): assert num_calls_check_iter_epoch == 1 @pytest.mark.parametrize( - "terminate_event, e, i", + "terminate_event, e, i, skip_completed", [ - (Events.STARTED, 0, 0), - (Events.EPOCH_STARTED(once=2), 2, None), - (Events.EPOCH_COMPLETED(once=2), 2, None), - (Events.GET_BATCH_STARTED(once=12), None, 12), - (Events.GET_BATCH_COMPLETED(once=12), None, 12), - (Events.ITERATION_STARTED(once=14), None, 14), - (Events.ITERATION_COMPLETED(once=14), None, 14), + (Events.STARTED, 0, 0, True), + (Events.EPOCH_STARTED(once=2), 2, None, True), + (Events.EPOCH_COMPLETED(once=2), 2, None, True), + (Events.GET_BATCH_STARTED(once=12), None, 12, True), + (Events.GET_BATCH_COMPLETED(once=12), None, 12, False), + (Events.ITERATION_STARTED(once=14), None, 14, True), + (Events.ITERATION_COMPLETED(once=14), None, 14, True), + (Events.STARTED, 0, 0, False), + (Events.EPOCH_STARTED(once=2), 2, None, False), + (Events.EPOCH_COMPLETED(once=2), 2, None, False), + (Events.GET_BATCH_STARTED(once=12), None, 12, False), + (Events.GET_BATCH_COMPLETED(once=12), None, 12, False), + (Events.ITERATION_STARTED(once=14), None, 14, False), + (Events.ITERATION_COMPLETED(once=14), None, 14, False), ], ) - def 
test_terminate_events_sequence(self, terminate_event, e, i): + def test_terminate_events_sequence(self, terminate_event, e, i, skip_completed): engine = RecordedEngine(MagicMock(return_value=1)) data = range(10) max_epochs = 5 @engine.on(terminate_event) def call_terminate(): - engine.terminate() + engine.terminate(skip_completed) @engine.on(Events.EXCEPTION_RAISED) def assert_no_exceptions(ee): @@ -271,10 +281,15 @@ def assert_no_exceptions(ee): if e is None: e = i // len(data) + 1 + if skip_completed: + assert engine.called_events[-1] == (e, i, Events.TERMINATE) + assert engine.called_events[-2] == (e, i, terminate_event) + else: + assert engine.called_events[-1] == (e, i, Events.COMPLETED) + assert engine.called_events[-2] == (e, i, Events.TERMINATE) + assert engine.called_events[-3] == (e, i, terminate_event) + assert engine.called_events[0] == (0, 0, Events.STARTED) - assert engine.called_events[-1] == (e, i, Events.COMPLETED) - assert engine.called_events[-2] == (e, i, Events.TERMINATE) - assert engine.called_events[-3] == (e, i, terminate_event) assert engine._dataloader_iter is None @pytest.mark.parametrize("data, epoch_length", [(None, 10), (range(10), None)]) From b636374108e425f86c8e050528d8fd240694806e Mon Sep 17 00:00:00 2001 From: Fabio Bonassi Date: Mon, 9 Dec 2024 14:20:29 +0100 Subject: [PATCH 07/10] Allow to terminate an epoch without firing `Events.EPOCH_COMPLETED` (#3313) * Added optional flag skip_epoch_completed to Engine.terminate_epoch() * Improved docs for terminate() and terminate_epoch() * Make the internal attribute skip_completed_after_termination private * - Merged flags "should_terminate" and "_skip_completed_after_termination". - Merged flags "should_terminate_single_epoch" and "_skip_epoch_completed_after_termination". 
* Union[bool, str] instead of the pipe operator for compatibility with older Python versions * Raise an RuntimeError when terminate_epoch() is called on Events.STARTED or Events.EPOCH_STARTED * Ignoring comparison-overlap warning from mypy to keep the code simple * Apply suggestions from code review * Update engine.py --------- Co-authored-by: vfdev --- ignite/engine/engine.py | 79 +++++++++++++++++++++--------- ignite/engine/events.py | 12 +++-- tests/ignite/engine/test_engine.py | 71 ++++++++++++++++++++------- 3 files changed, 119 insertions(+), 43 deletions(-) diff --git a/ignite/engine/engine.py b/ignite/engine/engine.py index e2a148986075..f3f95c9a2e27 100644 --- a/ignite/engine/engine.py +++ b/ignite/engine/engine.py @@ -139,9 +139,12 @@ def __init__(self, process_function: Callable[["Engine", Any], Any]): self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__) self._process_function = process_function self.last_event_name: Optional[Events] = None - self.should_terminate = False - self.skip_completed_after_termination = False - self.should_terminate_single_epoch = False + # should_terminate flag: False - don't terminate, True - terminate, + # "skip_completed" - terminate and skip the event "COMPLETED" + self.should_terminate: Union[bool, str] = False + # should_terminate_single_epoch flag: False - don't terminate, True - terminate, + # "skip_epoch_completed" - terminate and skip the event "EPOCH_COMPLETED" + self.should_terminate_single_epoch: Union[bool, str] = False self.should_interrupt = False self.state = State() self._state_dict_user_keys: List[str] = [] @@ -546,7 +549,7 @@ def terminate(self, skip_completed: bool = False) -> None: - ... 
- Terminating event - :attr:`~ignite.engine.events.Events.TERMINATE` - - :attr:`~ignite.engine.events.Events.COMPLETED` + - :attr:`~ignite.engine.events.Events.COMPLETED` (unless `skip_completed=True`) Args: skip_completed: if True, the event :attr:`~ignite.engine.events.Events.COMPLETED` is not fired after @@ -625,25 +628,31 @@ def terminate(): Added `skip_completed` flag """ self.logger.info("Terminate signaled. Engine will stop after current iteration is finished.") - self.should_terminate = True - self.skip_completed_after_termination = skip_completed + self.should_terminate = "skip_completed" if skip_completed else True - def terminate_epoch(self) -> None: + def terminate_epoch(self, skip_epoch_completed: bool = False) -> None: """Sends terminate signal to the engine, so that it terminates the current epoch. The run continues from the next epoch. The following events are triggered: - ... - Event on which ``terminate_epoch`` method is called - :attr:`~ignite.engine.events.Events.TERMINATE_SINGLE_EPOCH` - - :attr:`~ignite.engine.events.Events.EPOCH_COMPLETED` + - :attr:`~ignite.engine.events.Events.EPOCH_COMPLETED` (unless `skip_epoch_completed=True`) - :attr:`~ignite.engine.events.Events.EPOCH_STARTED` - ... + + Args: + skip_epoch_completed: if True, the event :attr:`~ignite.engine.events.Events.EPOCH_COMPLETED` + is not fired after :attr:`~ignite.engine.events.Events.TERMINATE_SINGLE_EPOCH`. Default is False. + + .. versionchanged:: 0.5.2 + Added `skip_epoch_completed` flag """ self.logger.info( "Terminate current epoch is signaled. " "Current epoch iteration will stop after current iteration is finished." 
) - self.should_terminate_single_epoch = True + self.should_terminate_single_epoch = "skip_epoch_completed" if skip_epoch_completed else True def _handle_exception(self, e: BaseException) -> None: if Events.EXCEPTION_RAISED in self._event_handlers: @@ -982,11 +991,17 @@ def _internal_run_as_gen(self) -> Generator[Any, None, State]: # time is available for handlers but must be updated after fire self.state.times[Events.EPOCH_COMPLETED.name] = epoch_time_taken - handlers_start_time = time.time() - self._fire_event(Events.EPOCH_COMPLETED) - epoch_time_taken += time.time() - handlers_start_time - # update time wrt handlers - self.state.times[Events.EPOCH_COMPLETED.name] = epoch_time_taken + if self.should_terminate_single_epoch != "skip_epoch_completed": # type: ignore[comparison-overlap] + handlers_start_time = time.time() + self._fire_event(Events.EPOCH_COMPLETED) + epoch_time_taken += time.time() - handlers_start_time + # update time wrt handlers + self.state.times[Events.EPOCH_COMPLETED.name] = epoch_time_taken + + if self.should_terminate_single_epoch: + # We skip raising _EngineTerminateSingleEpochException exception on Events.EPOCH_COMPLETED + # as epoch is already completed and nothing to terminate + self.should_terminate_single_epoch = False yield from self._maybe_terminate_or_interrupt() hours, mins, secs = _to_hours_mins_secs(epoch_time_taken) @@ -997,12 +1012,19 @@ def _internal_run_as_gen(self) -> Generator[Any, None, State]: except _EngineTerminateException: self._fire_event(Events.TERMINATE) + except _EngineTerminateSingleEpochException: + raise RuntimeError( + "The method terminate_epoch() should not be called on Event.STARTED or Event.EPOCH_STARTED." 
+ " If this is a desired behaviour, please open a feature request on " + "https://github.com/pytorch/ignite/issues/new/choose" + ) + time_taken = time.time() - start_time # time is available for handlers but must be updated after fire self.state.times[Events.COMPLETED.name] = time_taken # do not fire Events.COMPLETED if we terminated the run with flag `skip_completed=True` - if not (self.should_terminate and self.skip_completed_after_termination): + if self.should_terminate != "skip_completed": # type: ignore[comparison-overlap] handlers_start_time = time.time() self._fire_event(Events.COMPLETED) time_taken += time.time() - handlers_start_time @@ -1121,7 +1143,6 @@ def _run_once_on_dataset_as_gen(self) -> Generator[State, None, float]: except _EngineTerminateSingleEpochException: self._fire_event(Events.TERMINATE_SINGLE_EPOCH, iter_counter=iter_counter) - self.should_terminate_single_epoch = False self._setup_dataloader_iter() except _EngineTerminateException as e: @@ -1167,11 +1188,17 @@ def _internal_run_legacy(self) -> State: # time is available for handlers but must be updated after fire self.state.times[Events.EPOCH_COMPLETED.name] = epoch_time_taken - handlers_start_time = time.time() - self._fire_event(Events.EPOCH_COMPLETED) - epoch_time_taken += time.time() - handlers_start_time - # update time wrt handlers - self.state.times[Events.EPOCH_COMPLETED.name] = epoch_time_taken + if self.should_terminate_single_epoch != "skip_epoch_completed": # type: ignore[comparison-overlap] + handlers_start_time = time.time() + self._fire_event(Events.EPOCH_COMPLETED) + epoch_time_taken += time.time() - handlers_start_time + # update time wrt handlers + self.state.times[Events.EPOCH_COMPLETED.name] = epoch_time_taken + + if self.should_terminate_single_epoch: + # We skip raising _EngineTerminateSingleEpochException exception on Events.EPOCH_COMPLETED + # as epoch is already completed and nothing to terminate + self.should_terminate_single_epoch = False 
self._maybe_terminate_legacy() hours, mins, secs = _to_hours_mins_secs(epoch_time_taken) @@ -1182,12 +1209,19 @@ def _internal_run_legacy(self) -> State: except _EngineTerminateException: self._fire_event(Events.TERMINATE) + except _EngineTerminateSingleEpochException: + raise RuntimeError( + "The method terminate_epoch() should not be called on Event.STARTED or Event.EPOCH_STARTED." + " If this is a desired behaviour, please open a feature request on " + "https://github.com/pytorch/ignite/issues/new/choose" + ) + time_taken = time.time() - start_time # time is available for handlers but must be updated after fire self.state.times[Events.COMPLETED.name] = time_taken # do not fire Events.COMPLETED if we terminated the run with flag `skip_completed=True` - if not (self.should_terminate and self.skip_completed_after_termination): + if self.should_terminate != "skip_completed": # type: ignore[comparison-overlap] handlers_start_time = time.time() self._fire_event(Events.COMPLETED) time_taken += time.time() - handlers_start_time @@ -1292,7 +1326,6 @@ def _run_once_on_dataset_legacy(self) -> float: except _EngineTerminateSingleEpochException: self._fire_event(Events.TERMINATE_SINGLE_EPOCH, iter_counter=iter_counter) - self.should_terminate_single_epoch = False self._setup_dataloader_iter() except _EngineTerminateException as e: diff --git a/ignite/engine/events.py b/ignite/engine/events.py index 87622d3415cc..7a348f947624 100644 --- a/ignite/engine/events.py +++ b/ignite/engine/events.py @@ -259,8 +259,9 @@ class Events(EventEnum): - TERMINATE_SINGLE_EPOCH : triggered when the run is about to end the current epoch, after receiving a :meth:`~ignite.engine.engine.Engine.terminate_epoch()` or :meth:`~ignite.engine.engine.Engine.terminate()` call. - - EPOCH_COMPLETED : triggered when the epoch is ended. Note that this is triggered even - when :meth:`~ignite.engine.engine.Engine.terminate_epoch()` is called. + - EPOCH_COMPLETED : triggered when the epoch is ended. 
This is triggered even + when :meth:`~ignite.engine.engine.Engine.terminate_epoch()` is called, + unless the flag `skip_epoch_completed` is set to True. - TERMINATE : triggered when the run is about to end completely, after receiving :meth:`~ignite.engine.engine.Engine.terminate()` call. @@ -272,7 +273,7 @@ class Events(EventEnum): The table below illustrates which events are triggered when various termination methods are called. .. list-table:: - :widths: 35 38 28 20 20 + :widths: 38 38 28 20 20 :header-rows: 1 * - Method @@ -290,6 +291,11 @@ class Events(EventEnum): - ✔ - ✗ - ✔ + * - :meth:`~ignite.engine.engine.Engine.terminate_epoch()` with `skip_epoch_completed=True` + - ✔ + - ✗ + - ✗ + - ✔ * - :meth:`~ignite.engine.engine.Engine.terminate()` - ✗ - ✔ diff --git a/tests/ignite/engine/test_engine.py b/tests/ignite/engine/test_engine.py index fcb0299aa22d..76e1ad837605 100644 --- a/tests/ignite/engine/test_engine.py +++ b/tests/ignite/engine/test_engine.py @@ -44,10 +44,13 @@ def set_interrupt_resume_enabled(self, interrupt_resume_enabled): def test_terminate(self, skip_completed): engine = Engine(lambda e, b: 1) assert not engine.should_terminate - assert not engine.skip_completed_after_termination + engine.terminate(skip_completed) - assert engine.should_terminate - assert engine.skip_completed_after_termination == skip_completed + + if skip_completed: + assert engine.should_terminate == "skip_completed" + else: + assert engine.should_terminate == True # noqa: E712 def test_invalid_process_raises_with_invalid_signature(self): with pytest.raises(ValueError, match=r"Engine must be given a processing function in order to run"): @@ -292,8 +295,11 @@ def assert_no_exceptions(ee): assert engine.called_events[0] == (0, 0, Events.STARTED) assert engine._dataloader_iter is None - @pytest.mark.parametrize("data, epoch_length", [(None, 10), (range(10), None)]) - def test_terminate_epoch_stops_mid_epoch(self, data, epoch_length): + @pytest.mark.parametrize( + "data, 
epoch_length, skip_epoch_completed", + [(None, 10, False), (range(10), None, False), (None, 10, True), (range(10), None, True)], + ) + def test_terminate_epoch_stops_mid_epoch(self, data, epoch_length, skip_epoch_completed): real_epoch_length = epoch_length if data is None else len(data) iteration_to_stop = real_epoch_length + 4 @@ -301,7 +307,7 @@ def test_terminate_epoch_stops_mid_epoch(self, data, epoch_length): def start_of_iteration_handler(engine): if engine.state.iteration == iteration_to_stop: - engine.terminate_epoch() + engine.terminate_epoch(skip_epoch_completed) max_epochs = 3 engine.add_event_handler(Events.ITERATION_STARTED, start_of_iteration_handler) @@ -312,15 +318,23 @@ def start_of_iteration_handler(engine): assert state.epoch == max_epochs @pytest.mark.parametrize( - "terminate_epoch_event, i", + "terminate_epoch_event, i, skip_epoch_completed", [ - (Events.GET_BATCH_STARTED(once=12), 12), - (Events.GET_BATCH_COMPLETED(once=12), 12), - (Events.ITERATION_STARTED(once=14), 14), - (Events.ITERATION_COMPLETED(once=14), 14), + (Events.GET_BATCH_STARTED(once=12), 12, False), + (Events.GET_BATCH_COMPLETED(once=12), 12, False), + (Events.ITERATION_STARTED(once=14), 14, False), + (Events.ITERATION_COMPLETED(once=14), 14, False), + (Events.GET_BATCH_STARTED(once=12), 12, True), + (Events.GET_BATCH_COMPLETED(once=12), 12, True), + (Events.ITERATION_STARTED(once=14), 14, True), + (Events.ITERATION_COMPLETED(once=14), 14, True), + (Events.STARTED, 30, False), + (Events.STARTED, 30, True), + (Events.EPOCH_STARTED(once=2), 10, False), + (Events.EPOCH_STARTED(once=2), 10, True), ], ) - def test_terminate_epoch_events_sequence(self, terminate_epoch_event, i): + def test_terminate_epoch_events_sequence(self, terminate_epoch_event, i, skip_epoch_completed): engine = RecordedEngine(MagicMock(return_value=1)) data = range(10) max_epochs = 3 @@ -331,31 +345,54 @@ def test_terminate_epoch_events_sequence(self, terminate_epoch_event, i): 
@engine.on(terminate_epoch_event) def call_terminate_epoch(): + assert not engine.should_terminate_single_epoch nonlocal call_count if call_count < 1: - engine.terminate_epoch() + engine.terminate_epoch(skip_epoch_completed) + if skip_epoch_completed: + assert engine.should_terminate_single_epoch == "skip_epoch_completed" + else: + assert engine.should_terminate_single_epoch == True # noqa: E712 + call_count += 1 + @engine.on(Events.EPOCH_STARTED) + def check_skip_reset(): + if terminate_epoch_event != Events.EPOCH_STARTED: + assert engine.should_terminate_single_epoch == False # noqa: E712 + @engine.on(Events.TERMINATE_SINGLE_EPOCH) def check_previous_events(iter_counter): e = i // len(data) + 1 - assert engine.called_events[0] == (0, 0, Events.STARTED) assert engine.called_events[-2] == (e, i, terminate_epoch_event) assert engine.called_events[-1] == (e, i, Events.TERMINATE_SINGLE_EPOCH) + if skip_epoch_completed: + assert engine.should_terminate_single_epoch == "skip_epoch_completed" + else: + assert engine.should_terminate_single_epoch == True # noqa: E712 @engine.on(Events.EPOCH_COMPLETED) def check_previous_events2(): e = i // len(data) + 1 if e == engine.state.epoch and i == engine.state.iteration: + assert not skip_epoch_completed + assert isinstance(engine.should_terminate_single_epoch, bool) assert engine.called_events[-3] == (e, i, terminate_epoch_event) assert engine.called_events[-2] == (e, i, Events.TERMINATE_SINGLE_EPOCH) assert engine.called_events[-1] == (e, i, Events.EPOCH_COMPLETED) - engine.run(data, max_epochs=max_epochs) + if terminate_epoch_event in [Events.STARTED, Events.EPOCH_STARTED]: + with pytest.raises(RuntimeError): + engine.run(data, max_epochs=max_epochs) + else: + engine.run(data, max_epochs=max_epochs) + + assert engine.state.epoch == max_epochs + assert (max_epochs - 1) * len(data) < engine.state.iteration < max_epochs * len(data) - assert engine.state.epoch == max_epochs - assert (max_epochs - 1) * len(data) < 
engine.state.iteration < max_epochs * len(data) + epoch_completed_events = [e for e in engine.called_events if e[2] == Events.EPOCH_COMPLETED.name] + assert len(epoch_completed_events) == max_epochs - skip_epoch_completed @pytest.mark.parametrize("data", [None, "mock_data_loader"]) def test_iteration_events_are_fired(self, data): From a3d691c3c2badce2a8e5d540538af2c7fed7811d Mon Sep 17 00:00:00 2001 From: Tudor Gulin Date: Mon, 9 Dec 2024 15:58:35 +0200 Subject: [PATCH 08/10] Fix deprecated statement (#3307) * fix-deprecated-warning Replaced torch.cuda.amp.autocast with torch.amp.autocast("cuda",...). * autopep8 fix * Update torch version to 1.12.0 * Address PR comments * Revert unwanted changes * Fix regex * Revert change in CycleGAN_with_torch_cuda_amp * Fix regex in test_create_supervised * Update ignite/engine/__init__.py * Update tests/ignite/engine/test_create_supervised.py --------- Co-authored-by: Gulin7 Co-authored-by: vfdev --- examples/cifar10/main.py | 7 +++-- .../benchmark_torch_cuda_amp.py | 5 ++-- examples/cifar10_qat/main.py | 5 ++-- .../CycleGAN_with_torch_cuda_amp.ipynb | 5 ++-- .../classification/imagenet/main.py | 9 +++--- .../segmentation/pascal_voc2012/main.py | 9 +++--- examples/transformers/main.py | 7 +++-- ignite/engine/__init__.py | 12 ++++---- tests/ignite/engine/test_create_supervised.py | 30 +++++++++---------- 9 files changed, 48 insertions(+), 41 deletions(-) diff --git a/examples/cifar10/main.py b/examples/cifar10/main.py index b64b81c1d036..b8dbce5d9601 100644 --- a/examples/cifar10/main.py +++ b/examples/cifar10/main.py @@ -7,7 +7,8 @@ import torch.nn as nn import torch.optim as optim import utils -from torch.cuda.amp import autocast, GradScaler +from torch.amp import autocast +from torch.cuda.amp import GradScaler import ignite import ignite.distributed as idist @@ -299,7 +300,7 @@ def train_step(engine, batch): model.train() - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): y_pred = model(x) loss = 
criterion(y_pred, y) @@ -355,7 +356,7 @@ def evaluate_step(engine: Engine, batch): x = x.to(device, non_blocking=True) y = y.to(device, non_blocking=True) - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): output = model(x) return output, y diff --git a/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py b/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py index 992f305bf24a..746d7eb54c49 100644 --- a/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py +++ b/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py @@ -1,6 +1,7 @@ import fire import torch -from torch.cuda.amp import autocast, GradScaler +from torch.amp import autocast +from torch.cuda.amp import GradScaler from torch.nn import CrossEntropyLoss from torch.optim import SGD from torchvision.models import wide_resnet50_2 @@ -34,7 +35,7 @@ def train_step(engine, batch): optimizer.zero_grad() # Runs the forward pass with autocasting. - with autocast(): + with autocast("cuda"): y_pred = model(x) loss = criterion(y_pred, y) diff --git a/examples/cifar10_qat/main.py b/examples/cifar10_qat/main.py index f965ce1e6e4d..7b8366a2a63f 100644 --- a/examples/cifar10_qat/main.py +++ b/examples/cifar10_qat/main.py @@ -6,7 +6,8 @@ import torch.nn as nn import torch.optim as optim import utils -from torch.cuda.amp import autocast, GradScaler +from torch.amp import autocast +from torch.cuda.amp import GradScaler import ignite import ignite.distributed as idist @@ -283,7 +284,7 @@ def train_step(engine, batch): model.train() - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): y_pred = model(x) loss = criterion(y_pred, y) diff --git a/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb b/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb index 614c8528b8d6..f6271eaf3bda 100644 --- a/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb +++ b/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb @@ -887,7 +887,7 @@ "id": "JE8dLeEfIl_Z" }, 
"source": [ - "We will use [`torch.cuda.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.autocast) and [`torch.cuda.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)." + "We will use [`torch.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.amp.autocast) and [`torch.cuda.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)." ] }, { @@ -896,7 +896,8 @@ "id": "vrJls4p-FRcA" }, "source": [ - "from torch.cuda.amp import autocast, GradScaler\n", + "from torch.cuda.amp import GradScaler\n", + "from torch.amp import autocast\n", "\n", "from ignite.utils import convert_tensor\n", "import torch.nn.functional as F\n", diff --git a/examples/references/classification/imagenet/main.py b/examples/references/classification/imagenet/main.py index 85c20c08a62b..defb4ddc1510 100644 --- a/examples/references/classification/imagenet/main.py +++ b/examples/references/classification/imagenet/main.py @@ -6,9 +6,10 @@ import torch try: - from torch.cuda.amp import autocast, GradScaler + from torch.amp import autocast + from torch.cuda.amp import GradScaler except ImportError: - raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.6.0") + raise RuntimeError("Please, use recent PyTorch version, e.g. 
>=1.12.0") import dataflow as data import utils @@ -144,7 +145,7 @@ def create_trainer(model, optimizer, criterion, train_sampler, config, logger, w def training_step(engine, batch): model.train() x, y = prepare_batch(batch, device=device, non_blocking=True) - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): y_pred = model(x) y_pred = model_output_transform(y_pred) loss = criterion(y_pred, y) / accumulation_steps @@ -235,7 +236,7 @@ def create_evaluator(model, metrics, config, with_clearml, tag="val"): @torch.no_grad() def evaluate_step(engine, batch): model.eval() - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): x, y = prepare_batch(batch, device=config.device, non_blocking=True) y_pred = model(x) y_pred = model_output_transform(y_pred) diff --git a/examples/references/segmentation/pascal_voc2012/main.py b/examples/references/segmentation/pascal_voc2012/main.py index 20afebbb7d36..b6fbc7ad494a 100644 --- a/examples/references/segmentation/pascal_voc2012/main.py +++ b/examples/references/segmentation/pascal_voc2012/main.py @@ -6,9 +6,10 @@ import torch try: - from torch.cuda.amp import autocast, GradScaler + from torch.amp import autocast + from torch.cuda.amp import GradScaler except ImportError: - raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.6.0") + raise RuntimeError("Please, use recent PyTorch version, e.g. 
>=1.12.0") import dataflow as data import utils @@ -191,7 +192,7 @@ def create_trainer(model, optimizer, criterion, train_sampler, config, logger, w def forward_pass(batch): model.train() x, y = prepare_batch(batch, device=device, non_blocking=True) - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): y_pred = model(x) y_pred = model_output_transform(y_pred) loss = criterion(y_pred, y) / accumulation_steps @@ -272,7 +273,7 @@ def create_evaluator(model, metrics, config, with_clearml, tag="val"): @torch.no_grad() def evaluate_step(engine, batch): model.eval() - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): x, y = prepare_batch(batch, device=config.device, non_blocking=True) y_pred = model(x) y_pred = model_output_transform(y_pred) diff --git a/examples/transformers/main.py b/examples/transformers/main.py index cd1a84d2195b..f8118eabf90e 100644 --- a/examples/transformers/main.py +++ b/examples/transformers/main.py @@ -7,7 +7,8 @@ import torch.nn as nn import torch.optim as optim import utils -from torch.cuda.amp import autocast, GradScaler +from torch.amp import autocast +from torch.cuda.amp import GradScaler import ignite import ignite.distributed as idist @@ -309,7 +310,7 @@ def train_step(engine, batch): model.train() - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): y_pred = model(input_batch) loss = criterion(y_pred, labels) @@ -373,7 +374,7 @@ def evaluate_step(engine, batch): input_batch = {k: v.to(device, non_blocking=True, dtype=torch.long) for k, v in batch[0].items()} labels = labels.to(device, non_blocking=True, dtype=torch.float) - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): output = model(input_batch) return output, labels diff --git a/ignite/engine/__init__.py b/ignite/engine/__init__.py index cbaac4e16cb7..6e82bc2f6bc7 100644 --- a/ignite/engine/__init__.py +++ b/ignite/engine/__init__.py @@ -185,9 +185,9 @@ def 
supervised_training_step_amp( """ try: - from torch.cuda.amp import autocast + from torch.amp import autocast except ImportError: - raise ImportError("Please install torch>=1.6.0 to use amp_mode='amp'.") + raise ImportError("Please install torch>=1.12.0 to use amp_mode='amp'.") if gradient_accumulation_steps <= 0: raise ValueError( @@ -200,7 +200,7 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to optimizer.zero_grad() model.train() x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) - with autocast(enabled=True): + with autocast("cuda", enabled=True): output = model_fn(model, x) y_pred = model_transform(output) loss = loss_fn(y_pred, y) @@ -726,15 +726,15 @@ def supervised_evaluation_step_amp( Added `model_fn` to customize model's application on the sample """ try: - from torch.cuda.amp import autocast + from torch.amp import autocast except ImportError: - raise ImportError("Please install torch>=1.6.0 to use amp_mode='amp'.") + raise ImportError("Please install torch>=1.12.0 to use amp_mode='amp'.") def evaluate_step(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[torch.Tensor]]: model.eval() with torch.no_grad(): x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) - with autocast(enabled=True): + with autocast("cuda", enabled=True): output = model_fn(model, x) y_pred = model_transform(output) return output_transform(x, y, y_pred) diff --git a/tests/ignite/engine/test_create_supervised.py b/tests/ignite/engine/test_create_supervised.py index 54938167601a..4f07c95929e0 100644 --- a/tests/ignite/engine/test_create_supervised.py +++ b/tests/ignite/engine/test_create_supervised.py @@ -168,7 +168,7 @@ def _(): trainer.run(data) -@pytest.mark.skipif(Version(torch.__version__) < Version("1.6.0"), reason="Skip if < 1.6.0") +@pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") def test_create_supervised_training_scalar_assignment(): with 
mock.patch("ignite.engine._check_arg") as check_arg_mock: check_arg_mock.return_value = None, torch.cuda.amp.GradScaler(enabled=False) @@ -447,21 +447,21 @@ def test_create_supervised_trainer_apex_error(): def mock_torch_cuda_amp_module(): with patch.dict( "sys.modules", - {"torch.cuda.amp": None, "torch.cuda.amp.grad_scaler": None, "torch.cuda.amp.autocast_mode": None}, + {"torch.amp": None, "torch.cuda.amp": None, "torch.amp.autocast_mode": None}, ): yield torch def test_create_supervised_trainer_amp_error(mock_torch_cuda_amp_module): - with pytest.raises(ImportError, match="Please install torch>=1.6.0 to use amp_mode='amp'."): + with pytest.raises(ImportError, match="Please install torch>=1.12.0 to use amp_mode='amp'."): _test_create_supervised_trainer_wrong_accumulation(trainer_device="cpu", amp_mode="amp") - with pytest.raises(ImportError, match="Please install torch>=1.6.0 to use amp_mode='amp'."): + with pytest.raises(ImportError, match="Please install torch>=1.12.0 to use amp_mode='amp'."): _test_create_supervised_trainer(amp_mode="amp") with pytest.raises(ImportError, match="Please install torch>=1.6.0 to use scaler argument."): _test_create_supervised_trainer(amp_mode="amp", scaler=True) -@pytest.mark.skipif(Version(torch.__version__) < Version("1.5.0"), reason="Skip if < 1.5.0") +@pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") def test_create_supervised_trainer_scaler_not_amp(): scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available()) @@ -501,7 +501,7 @@ def test_create_supervised_trainer_on_mps(): _test_create_mocked_supervised_trainer(model_device=model_device, trainer_device=trainer_device) -@pytest.mark.skipif(Version(torch.__version__) < Version("1.6.0"), reason="Skip if < 1.6.0") +@pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") @pytest.mark.skipif(not torch.cuda.is_available(), reason="Skip if no GPU") def 
test_create_supervised_trainer_on_cuda_amp(): model_device = trainer_device = "cuda" @@ -517,7 +517,7 @@ def test_create_supervised_trainer_on_cuda_amp(): _test_create_mocked_supervised_trainer(model_device=model_device, trainer_device=trainer_device, amp_mode="amp") -@pytest.mark.skipif(Version(torch.__version__) < Version("1.6.0"), reason="Skip if < 1.6.0") +@pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") @pytest.mark.skipif(not torch.cuda.is_available(), reason="Skip if no GPU") def test_create_supervised_trainer_on_cuda_amp_scaler(): model_device = trainer_device = "cuda" @@ -630,8 +630,8 @@ def test_create_supervised_evaluator(): _test_mocked_supervised_evaluator() # older versions didn't have the autocast method so we skip the test for older builds - if Version(torch.__version__) >= Version("1.6.0"): - with mock.patch("torch.cuda.amp.autocast") as mock_torch_cuda_amp_module: + if Version(torch.__version__) >= Version("1.12.0"): + with mock.patch("torch.amp.autocast") as mock_torch_cuda_amp_module: _test_create_evaluation_step_amp(mock_torch_cuda_amp_module) @@ -640,8 +640,8 @@ def test_create_supervised_evaluator_on_cpu(): _test_mocked_supervised_evaluator(evaluator_device="cpu") # older versions didn't have the autocast method so we skip the test for older builds - if Version(torch.__version__) >= Version("1.6.0"): - with mock.patch("torch.cuda.amp.autocast") as mock_torch_cuda_amp_module: + if Version(torch.__version__) >= Version("1.12.0"): + with mock.patch("torch.amp.autocast") as mock_torch_cuda_amp_module: _test_create_evaluation_step(mock_torch_cuda_amp_module, evaluator_device="cpu") _test_create_evaluation_step_amp(mock_torch_cuda_amp_module, evaluator_device="cpu") @@ -651,8 +651,8 @@ def test_create_supervised_evaluator_traced_on_cpu(): _test_mocked_supervised_evaluator(evaluator_device="cpu", trace=True) # older versions didn't have the autocast method so we skip the test for older builds - if 
Version(torch.__version__) >= Version("1.6.0"): - with mock.patch("torch.cuda.amp.autocast") as mock_torch_cuda_amp_module: + if Version(torch.__version__) >= Version("1.12.0"): + with mock.patch("torch.amp.autocast") as mock_torch_cuda_amp_module: _test_create_evaluation_step(mock_torch_cuda_amp_module, evaluator_device="cpu", trace=True) @@ -682,7 +682,7 @@ def test_create_supervised_evaluator_on_mps_with_model_on_cpu(): _test_mocked_supervised_evaluator(evaluator_device="mps") -@pytest.mark.skipif(Version(torch.__version__) < Version("1.6.0"), reason="Skip if < 1.6.0") +@pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") @pytest.mark.skipif(not torch.cuda.is_available(), reason="Skip if no GPU") def test_create_supervised_evaluator_on_cuda_amp(): model_device = evaluator_device = "cuda" @@ -691,7 +691,7 @@ def test_create_supervised_evaluator_on_cuda_amp(): def test_create_supervised_evaluator_amp_error(mock_torch_cuda_amp_module): - with pytest.raises(ImportError, match="Please install torch>=1.6.0 to use amp_mode='amp'."): + with pytest.raises(ImportError, match="Please install torch>=1.12.0 to use amp_mode='amp'."): _test_create_supervised_evaluator(amp_mode="amp") From d2f935d14625f3e8c7ffa38628a1585fe5e76db9 Mon Sep 17 00:00:00 2001 From: vfdev Date: Mon, 16 Dec 2024 15:58:46 +0100 Subject: [PATCH 09/10] Fixed failing test_roc_auc.py::test_check_compute_fn test (#3316) Test is failing due to a scikit-learn behaviour change: ValueError exception -> UndefinedMetricWarning between 1.5 and 1.6 --- tests/ignite/metrics/test_roc_auc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ignite/metrics/test_roc_auc.py b/tests/ignite/metrics/test_roc_auc.py index 1e60c480ca18..8695f188b5fb 100644 --- a/tests/ignite/metrics/test_roc_auc.py +++ b/tests/ignite/metrics/test_roc_auc.py @@ -4,6 +4,7 @@ import pytest import sklearn import torch +from sklearn.exceptions import UndefinedMetricWarning from 
sklearn.metrics import roc_auc_score import ignite.distributed as idist @@ -112,7 +113,7 @@ def test_check_compute_fn(): em = ROC_AUC(check_compute_fn=True) em.reset() - with pytest.warns(EpochMetricWarning, match=r"Probably, there can be a problem with `compute_fn`"): + with pytest.warns((UndefinedMetricWarning, EpochMetricWarning), match=r"Only one class.+present in y_true"): em.update(output) em = ROC_AUC(check_compute_fn=False) From 06fe8bd914711e9554c823c684b065d01710df82 Mon Sep 17 00:00:00 2001 From: vfdev Date: Tue, 17 Dec 2024 00:32:20 +0100 Subject: [PATCH 10/10] Update pytorch-version-tests.yml (#3315) * Update pytorch-version-tests.yml * Update pytorch-version-tests.yml * Update pytorch-version-tests.yml --- .github/workflows/pytorch-version-tests.yml | 23 +++++++++++---------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pytorch-version-tests.yml b/.github/workflows/pytorch-version-tests.yml index e14d46f27619..1456a045e7e6 100644 --- a/.github/workflows/pytorch-version-tests.yml +++ b/.github/workflows/pytorch-version-tests.yml @@ -15,24 +15,25 @@ jobs: max-parallel: 5 fail-fast: false matrix: - # Here we keep python 3.8 tests until the end of the 2024 and - # will drop python version and related pytorch versions - python-version: [3.8, 3.9, "3.10"] + python-version: [3.9, "3.10", "3.11"] pytorch-version: - [2.4.1, 2.3.1, 2.2.2, 2.0.1, 1.13.1, 1.12.1, 1.10.0, 1.8.1] + [2.4.1, 2.3.1, 2.2.2, 2.0.1, 1.13.1, 1.12.1, 1.10.0] exclude: - # disabling python 3.9 support with PyTorch 1.7.1 and 1.8.1, to stop repeated pytorch-version test fail. 
- # https://github.com/pytorch/ignite/issues/2383 - - pytorch-version: 1.8.1 - python-version: 3.9 - - pytorch-version: 1.8.1 - python-version: "3.10" - - pytorch-version: 1.10.0 python-version: "3.10" + - pytorch-version: 1.10.0 + python-version: "3.11" - pytorch-version: 1.11.0 python-version: "3.10" + - pytorch-version: 1.11.0 + python-version: "3.11" + - pytorch-version: 1.12.1 + python-version: "3.11" + # Conda fails to install cpuonly version and few cpu distributed tests are + # failing with unrelated errors + - pytorch-version: 1.13.1 + python-version: "3.11" steps: - uses: actions/checkout@v4