diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 81862e1f67bd..13c628ad302c 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -16,7 +16,7 @@ concurrency: group: gpu-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }} cancel-in-progress: true -# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml +# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job_v2.yml jobs: gpu-tests: @@ -25,7 +25,7 @@ jobs: pytorch-channel: [pytorch, pytorch-nightly] fail-fast: false env: - DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1" + DOCKER_IMAGE: "pytorch/almalinux-builder:cuda12.4" REPOSITORY: ${{ github.repository }} PR_NUMBER: ${{ github.event.pull_request.number }} runs-on: linux.8xlarge.nvidia.gpu @@ -40,7 +40,7 @@ jobs: echo "::endgroup::" - name: Checkout repository (pytorch/test-infra) - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: pytorch/test-infra @@ -55,7 +55,7 @@ jobs: docker-image: ${{ env.DOCKER_IMAGE }} - name: Checkout repository (${{ github.repository }}) - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ github.repository }} @@ -102,9 +102,9 @@ jobs: # Install PyTorch if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then - pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu121 + pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu124 else - pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121 + pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124 fi python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())" @@ -124,7 +124,7 @@ jobs: uses: nick-fields/retry@v2.9.0 with: max_attempts: 5 - timeout_minutes: 25 + timeout_minutes: 45 shell: bash command: docker exec -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2' new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2' @@ -139,7 +139,7 @@ jobs: - name: Run examples in container continue-on-error: false run: | - SCRIPT=$(cat << EOF + script=$(cat << EOF set -xe diff --git a/.github/workflows/pytorch-version-tests.yml b/.github/workflows/pytorch-version-tests.yml index e14d46f27619..1456a045e7e6 100644 --- a/.github/workflows/pytorch-version-tests.yml +++ b/.github/workflows/pytorch-version-tests.yml @@ -15,24 +15,25 @@ jobs: max-parallel: 5 fail-fast: false matrix: - # Here we keep python 3.8 tests until the end of the 2024 and - # will drop python version and related pytorch versions - python-version: [3.8, 3.9, "3.10"] + python-version: [3.9, "3.10", "3.11"] pytorch-version: - [2.4.1, 2.3.1, 2.2.2, 2.0.1, 1.13.1, 1.12.1, 1.10.0, 1.8.1] + [2.4.1, 2.3.1, 2.2.2, 2.0.1, 1.13.1, 1.12.1, 1.10.0] exclude: - # disabling python 3.9 support with PyTorch 1.7.1 and 1.8.1, to stop repeated pytorch-version test fail. - # https://github.com/pytorch/ignite/issues/2383 - - pytorch-version: 1.8.1 - python-version: 3.9 - - pytorch-version: 1.8.1 - python-version: "3.10" - - pytorch-version: 1.10.0 python-version: "3.10" + - pytorch-version: 1.10.0 + python-version: "3.11" - pytorch-version: 1.11.0 python-version: "3.10" + - pytorch-version: 1.11.0 + python-version: "3.11" + - pytorch-version: 1.12.1 + python-version: "3.11" + # Conda fails to install cpuonly version and few cpu distributed tests are + # failing with unrelated errors + - pytorch-version: 1.13.1 + python-version: "3.11" steps: - uses: actions/checkout@v4 diff --git a/examples/cifar10/main.py b/examples/cifar10/main.py index b64b81c1d036..b8dbce5d9601 100644 --- a/examples/cifar10/main.py +++ b/examples/cifar10/main.py @@ -7,7 +7,8 @@ import torch.nn as nn import torch.optim as optim import utils -from torch.cuda.amp import autocast, GradScaler +from torch.amp import autocast +from torch.cuda.amp import GradScaler import ignite import ignite.distributed as idist @@ -299,7 +300,7 @@ def train_step(engine, batch): model.train() - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): y_pred = model(x) loss = criterion(y_pred, y) @@ -355,7 +356,7 @@ def evaluate_step(engine: Engine, batch): x = x.to(device, non_blocking=True) y = y.to(device, non_blocking=True) - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): output = model(x) return output, y diff --git a/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py b/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py index 992f305bf24a..746d7eb54c49 100644 --- a/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py +++ b/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py @@ -1,6 +1,7 @@ import fire import torch -from torch.cuda.amp import autocast, GradScaler +from torch.amp import autocast +from torch.cuda.amp import GradScaler from torch.nn import CrossEntropyLoss from torch.optim import SGD from torchvision.models import wide_resnet50_2 @@ -34,7 +35,7 @@ def train_step(engine, batch): optimizer.zero_grad() # Runs the forward pass with autocasting. - with autocast(): + with autocast("cuda"): y_pred = model(x) loss = criterion(y_pred, y) diff --git a/examples/cifar10_qat/main.py b/examples/cifar10_qat/main.py index f965ce1e6e4d..7b8366a2a63f 100644 --- a/examples/cifar10_qat/main.py +++ b/examples/cifar10_qat/main.py @@ -6,7 +6,8 @@ import torch.nn as nn import torch.optim as optim import utils -from torch.cuda.amp import autocast, GradScaler +from torch.amp import autocast +from torch.cuda.amp import GradScaler import ignite import ignite.distributed as idist @@ -283,7 +284,7 @@ def train_step(engine, batch): model.train() - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): y_pred = model(x) loss = criterion(y_pred, y) diff --git a/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb b/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb index 614c8528b8d6..f6271eaf3bda 100644 --- a/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb +++ b/examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb @@ -887,7 +887,7 @@ "id": "JE8dLeEfIl_Z" }, "source": [ - "We will use [`torch.cuda.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.autocast) and [`torch.cuda.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)." + "We will use [`torch.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.amp.autocast) and [`torch.cuda.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)." ] }, { @@ -896,7 +896,8 @@ "id": "vrJls4p-FRcA" }, "source": [ - "from torch.cuda.amp import autocast, GradScaler\n", + "from torch.cuda.amp import GradScaler\n", + "from torch.amp import autocast\n", "\n", "from ignite.utils import convert_tensor\n", "import torch.nn.functional as F\n", diff --git a/examples/references/classification/imagenet/main.py b/examples/references/classification/imagenet/main.py index 85c20c08a62b..defb4ddc1510 100644 --- a/examples/references/classification/imagenet/main.py +++ b/examples/references/classification/imagenet/main.py @@ -6,9 +6,10 @@ import torch try: - from torch.cuda.amp import autocast, GradScaler + from torch.amp import autocast + from torch.cuda.amp import GradScaler except ImportError: - raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.6.0") + raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.12.0") import dataflow as data import utils @@ -144,7 +145,7 @@ def create_trainer(model, optimizer, criterion, train_sampler, config, logger, w def training_step(engine, batch): model.train() x, y = prepare_batch(batch, device=device, non_blocking=True) - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): y_pred = model(x) y_pred = model_output_transform(y_pred) loss = criterion(y_pred, y) / accumulation_steps @@ -235,7 +236,7 @@ def create_evaluator(model, metrics, config, with_clearml, tag="val"): @torch.no_grad() def evaluate_step(engine, batch): model.eval() - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): x, y = prepare_batch(batch, device=config.device, non_blocking=True) y_pred = model(x) y_pred = model_output_transform(y_pred) diff --git a/examples/references/segmentation/pascal_voc2012/main.py b/examples/references/segmentation/pascal_voc2012/main.py index 20afebbb7d36..b6fbc7ad494a 100644 --- a/examples/references/segmentation/pascal_voc2012/main.py +++ b/examples/references/segmentation/pascal_voc2012/main.py @@ -6,9 +6,10 @@ import torch try: - from torch.cuda.amp import autocast, GradScaler + from torch.amp import autocast + from torch.cuda.amp import GradScaler except ImportError: - raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.6.0") + raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.12.0") import dataflow as data import utils @@ -191,7 +192,7 @@ def create_trainer(model, optimizer, criterion, train_sampler, config, logger, w def forward_pass(batch): model.train() x, y = prepare_batch(batch, device=device, non_blocking=True) - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): y_pred = model(x) y_pred = model_output_transform(y_pred) loss = criterion(y_pred, y) / accumulation_steps @@ -272,7 +273,7 @@ def create_evaluator(model, metrics, config, with_clearml, tag="val"): @torch.no_grad() def evaluate_step(engine, batch): model.eval() - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): x, y = prepare_batch(batch, device=config.device, non_blocking=True) y_pred = model(x) y_pred = model_output_transform(y_pred) diff --git a/examples/transformers/main.py b/examples/transformers/main.py index cd1a84d2195b..f8118eabf90e 100644 --- a/examples/transformers/main.py +++ b/examples/transformers/main.py @@ -7,7 +7,8 @@ import torch.nn as nn import torch.optim as optim import utils -from torch.cuda.amp import autocast, GradScaler +from torch.amp import autocast +from torch.cuda.amp import GradScaler import ignite import ignite.distributed as idist @@ -309,7 +310,7 @@ def train_step(engine, batch): model.train() - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): y_pred = model(input_batch) loss = criterion(y_pred, labels) @@ -373,7 +374,7 @@ def evaluate_step(engine, batch): input_batch = {k: v.to(device, non_blocking=True, dtype=torch.long) for k, v in batch[0].items()} labels = labels.to(device, non_blocking=True, dtype=torch.float) - with autocast(enabled=with_amp): + with autocast("cuda", enabled=with_amp): output = model(input_batch) return output, labels diff --git a/ignite/contrib/engines/common.py b/ignite/contrib/engines/common.py index 09f769a18d0f..bcfa54be55ea 100644 --- a/ignite/contrib/engines/common.py +++ b/ignite/contrib/engines/common.py @@ -78,7 +78,7 @@ def setup_common_training_handlers( lr_scheduler: learning rate scheduler as native torch LRScheduler or ignite's parameter scheduler. with_gpu_stats: if True, :class:`~ignite.metrics.GpuInfo` is attached to the - trainer. This requires `pynvml` package to be installed. + trainer. This requires `pynvml<12` package to be installed. output_names: list of names associated with `update_function` output dictionary. with_pbars: if True, two progress bars on epochs and optionally on iterations are attached. Default, True. diff --git a/ignite/engine/__init__.py b/ignite/engine/__init__.py index cbaac4e16cb7..6e82bc2f6bc7 100644 --- a/ignite/engine/__init__.py +++ b/ignite/engine/__init__.py @@ -185,9 +185,9 @@ def supervised_training_step_amp( """ try: - from torch.cuda.amp import autocast + from torch.amp import autocast except ImportError: - raise ImportError("Please install torch>=1.6.0 to use amp_mode='amp'.") + raise ImportError("Please install torch>=1.12.0 to use amp_mode='amp'.") if gradient_accumulation_steps <= 0: raise ValueError( @@ -200,7 +200,7 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to optimizer.zero_grad() model.train() x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) - with autocast(enabled=True): + with autocast("cuda", enabled=True): output = model_fn(model, x) y_pred = model_transform(output) loss = loss_fn(y_pred, y) @@ -726,15 +726,15 @@ def supervised_evaluation_step_amp( Added `model_fn` to customize model's application on the sample """ try: - from torch.cuda.amp import autocast + from torch.amp import autocast except ImportError: - raise ImportError("Please install torch>=1.6.0 to use amp_mode='amp'.") + raise ImportError("Please install torch>=1.12.0 to use amp_mode='amp'.") def evaluate_step(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[torch.Tensor]]: model.eval() with torch.no_grad(): x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) - with autocast(enabled=True): + with autocast("cuda", enabled=True): output = model_fn(model, x) y_pred = model_transform(output) return output_transform(x, y, y_pred) diff --git a/ignite/engine/engine.py b/ignite/engine/engine.py index 27a949cacca2..f3f95c9a2e27 100644 --- a/ignite/engine/engine.py +++ b/ignite/engine/engine.py @@ -139,8 +139,12 @@ def __init__(self, process_function: Callable[["Engine", Any], Any]): self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__) self._process_function = process_function self.last_event_name: Optional[Events] = None - self.should_terminate = False - self.should_terminate_single_epoch = False + # should_terminate flag: False - don't terminate, True - terminate, + # "skip_completed" - terminate and skip the event "COMPLETED" + self.should_terminate: Union[bool, str] = False + # should_terminate_single_epoch flag: False - don't terminate, True - terminate, + # "skip_epoch_completed" - terminate and skip the event "EPOCH_COMPLETED" + self.should_terminate_single_epoch: Union[bool, str] = False self.should_interrupt = False self.state = State() self._state_dict_user_keys: List[str] = [] @@ -538,15 +542,18 @@ def call_interrupt(): self.logger.info("interrupt signaled. Engine will interrupt the run after current iteration is finished.") self.should_interrupt = True - def terminate(self) -> None: + def terminate(self, skip_completed: bool = False) -> None: """Sends terminate signal to the engine, so that it terminates completely the run. The run is terminated after the event on which ``terminate`` method was called. The following events are triggered: - ... - Terminating event - :attr:`~ignite.engine.events.Events.TERMINATE` - - :attr:`~ignite.engine.events.Events.COMPLETED` + - :attr:`~ignite.engine.events.Events.COMPLETED` (unless `skip_completed=True`) + Args: + skip_completed: if True, the event :attr:`~ignite.engine.events.Events.COMPLETED` is not fired after + :attr:`~ignite.engine.events.Events.TERMINATE`. Default is False. Examples: .. testcode:: @@ -617,26 +624,35 @@ def terminate(): .. versionchanged:: 0.4.10 Behaviour changed, for details see https://github.com/pytorch/ignite/issues/2669 + .. versionchanged:: 0.5.2 + Added `skip_completed` flag """ self.logger.info("Terminate signaled. Engine will stop after current iteration is finished.") - self.should_terminate = True + self.should_terminate = "skip_completed" if skip_completed else True - def terminate_epoch(self) -> None: + def terminate_epoch(self, skip_epoch_completed: bool = False) -> None: """Sends terminate signal to the engine, so that it terminates the current epoch. The run continues from the next epoch. The following events are triggered: - ... - Event on which ``terminate_epoch`` method is called - :attr:`~ignite.engine.events.Events.TERMINATE_SINGLE_EPOCH` - - :attr:`~ignite.engine.events.Events.EPOCH_COMPLETED` + - :attr:`~ignite.engine.events.Events.EPOCH_COMPLETED` (unless `skip_epoch_completed=True`) - :attr:`~ignite.engine.events.Events.EPOCH_STARTED` - ... + + Args: + skip_epoch_completed: if True, the event :attr:`~ignite.engine.events.Events.EPOCH_COMPLETED` + is not fired after :attr:`~ignite.engine.events.Events.TERMINATE_SINGLE_EPOCH`. Default is False. + + .. versionchanged:: 0.5.2 + Added `skip_epoch_completed` flag """ self.logger.info( "Terminate current epoch is signaled. " "Current epoch iteration will stop after current iteration is finished." ) - self.should_terminate_single_epoch = True + self.should_terminate_single_epoch = "skip_epoch_completed" if skip_epoch_completed else True def _handle_exception(self, e: BaseException) -> None: if Events.EXCEPTION_RAISED in self._event_handlers: @@ -975,11 +991,17 @@ def _internal_run_as_gen(self) -> Generator[Any, None, State]: # time is available for handlers but must be updated after fire self.state.times[Events.EPOCH_COMPLETED.name] = epoch_time_taken - handlers_start_time = time.time() - self._fire_event(Events.EPOCH_COMPLETED) - epoch_time_taken += time.time() - handlers_start_time - # update time wrt handlers - self.state.times[Events.EPOCH_COMPLETED.name] = epoch_time_taken + if self.should_terminate_single_epoch != "skip_epoch_completed": # type: ignore[comparison-overlap] + handlers_start_time = time.time() + self._fire_event(Events.EPOCH_COMPLETED) + epoch_time_taken += time.time() - handlers_start_time + # update time wrt handlers + self.state.times[Events.EPOCH_COMPLETED.name] = epoch_time_taken + + if self.should_terminate_single_epoch: + # We skip raising _EngineTerminateSingleEpochException exception on Events.EPOCH_COMPLETED + # as epoch is already completed and nothing to terminate + self.should_terminate_single_epoch = False yield from self._maybe_terminate_or_interrupt() hours, mins, secs = _to_hours_mins_secs(epoch_time_taken) @@ -990,16 +1012,27 @@ def _internal_run_as_gen(self) -> Generator[Any, None, State]: except _EngineTerminateException: self._fire_event(Events.TERMINATE) + except _EngineTerminateSingleEpochException: + raise RuntimeError( + "The method terminate_epoch() should not be called on Event.STARTED or Event.EPOCH_STARTED." + "If this is a desired behaviour, please open a feature request on" + "https://github.com/pytorch/ignite/issues/new/choose" + ) + time_taken = time.time() - start_time # time is available for handlers but must be updated after fire self.state.times[Events.COMPLETED.name] = time_taken - handlers_start_time = time.time() - self._fire_event(Events.COMPLETED) - time_taken += time.time() - handlers_start_time - # update time wrt handlers - self.state.times[Events.COMPLETED.name] = time_taken + + # do not fire Events.COMPLETED if we terminated the run with flag `skip_completed=True` + if self.should_terminate != "skip_completed": # type: ignore[comparison-overlap] + handlers_start_time = time.time() + self._fire_event(Events.COMPLETED) + time_taken += time.time() - handlers_start_time + # update time wrt handlers + self.state.times[Events.COMPLETED.name] = time_taken + hours, mins, secs = _to_hours_mins_secs(time_taken) - self.logger.info(f"Engine run complete. Time taken: {hours:02d}:{mins:02d}:{secs:06.3f}") + self.logger.info(f"Engine run finished. Time taken: {hours:02d}:{mins:02d}:{secs:06.3f}") except BaseException as e: self._dataloader_iter = None @@ -1110,7 +1143,6 @@ def _run_once_on_dataset_as_gen(self) -> Generator[State, None, float]: except _EngineTerminateSingleEpochException: self._fire_event(Events.TERMINATE_SINGLE_EPOCH, iter_counter=iter_counter) - self.should_terminate_single_epoch = False self._setup_dataloader_iter() except _EngineTerminateException as e: @@ -1156,11 +1188,17 @@ def _internal_run_legacy(self) -> State: # time is available for handlers but must be updated after fire self.state.times[Events.EPOCH_COMPLETED.name] = epoch_time_taken - handlers_start_time = time.time() - self._fire_event(Events.EPOCH_COMPLETED) - epoch_time_taken += time.time() - handlers_start_time - # update time wrt handlers - self.state.times[Events.EPOCH_COMPLETED.name] = epoch_time_taken + if self.should_terminate_single_epoch != "skip_epoch_completed": # type: ignore[comparison-overlap] + handlers_start_time = time.time() + self._fire_event(Events.EPOCH_COMPLETED) + epoch_time_taken += time.time() - handlers_start_time + # update time wrt handlers + self.state.times[Events.EPOCH_COMPLETED.name] = epoch_time_taken + + if self.should_terminate_single_epoch: + # We skip raising _EngineTerminateSingleEpochException exception on Events.EPOCH_COMPLETED + # as epoch is already completed and nothing to terminate + self.should_terminate_single_epoch = False self._maybe_terminate_legacy() hours, mins, secs = _to_hours_mins_secs(epoch_time_taken) @@ -1171,16 +1209,27 @@ def _internal_run_legacy(self) -> State: except _EngineTerminateException: self._fire_event(Events.TERMINATE) + except _EngineTerminateSingleEpochException: + raise RuntimeError( + "The method terminate_epoch() should not be called on Event.STARTED or Event.EPOCH_STARTED." + "If this is a desired behaviour, please open a feature request on" + "https://github.com/pytorch/ignite/issues/new/choose" + ) + time_taken = time.time() - start_time # time is available for handlers but must be updated after fire self.state.times[Events.COMPLETED.name] = time_taken - handlers_start_time = time.time() - self._fire_event(Events.COMPLETED) - time_taken += time.time() - handlers_start_time - # update time wrt handlers - self.state.times[Events.COMPLETED.name] = time_taken + + # do not fire Events.COMPLETED if we terminated the run with flag `skip_completed=True` + if self.should_terminate != "skip_completed": # type: ignore[comparison-overlap] + handlers_start_time = time.time() + self._fire_event(Events.COMPLETED) + time_taken += time.time() - handlers_start_time + # update time wrt handlers + self.state.times[Events.COMPLETED.name] = time_taken + hours, mins, secs = _to_hours_mins_secs(time_taken) - self.logger.info(f"Engine run complete. Time taken: {hours:02d}:{mins:02d}:{secs:06.3f}") + self.logger.info(f"Engine run finished. Time taken: {hours:02d}:{mins:02d}:{secs:06.3f}") except BaseException as e: self._dataloader_iter = None @@ -1277,7 +1326,6 @@ def _run_once_on_dataset_legacy(self) -> float: except _EngineTerminateSingleEpochException: self._fire_event(Events.TERMINATE_SINGLE_EPOCH, iter_counter=iter_counter) - self.should_terminate_single_epoch = False self._setup_dataloader_iter() except _EngineTerminateException as e: diff --git a/ignite/engine/events.py b/ignite/engine/events.py index 9dd99348492b..7a348f947624 100644 --- a/ignite/engine/events.py +++ b/ignite/engine/events.py @@ -259,36 +259,53 @@ class Events(EventEnum): - TERMINATE_SINGLE_EPOCH : triggered when the run is about to end the current epoch, after receiving a :meth:`~ignite.engine.engine.Engine.terminate_epoch()` or :meth:`~ignite.engine.engine.Engine.terminate()` call. + - EPOCH_COMPLETED : triggered when the epoch is ended. This is triggered even + when :meth:`~ignite.engine.engine.Engine.terminate_epoch()` is called, + unless the flag `skip_epoch_completed` is set to True. - TERMINATE : triggered when the run is about to end completely, after receiving :meth:`~ignite.engine.engine.Engine.terminate()` call. - - EPOCH_COMPLETED : triggered when the epoch is ended. Note that this is triggered even - when :meth:`~ignite.engine.engine.Engine.terminate_epoch()` is called. - - COMPLETED : triggered when engine's run is completed + - COMPLETED : triggered when engine's run is completed or terminated with + :meth:`~ignite.engine.engine.Engine.terminate()`, unless the flag + `skip_completed` is set to True. The table below illustrates which events are triggered when various termination methods are called. .. list-table:: - :widths: 24 25 33 18 + :widths: 38 38 28 20 20 :header-rows: 1 * - Method - - EVENT_COMPLETED - TERMINATE_SINGLE_EPOCH + - EPOCH_COMPLETED - TERMINATE + - COMPLETED * - no termination - - ✔ - ✗ + - ✔ - ✗ + - ✔ * - :meth:`~ignite.engine.engine.Engine.terminate_epoch()` - ✔ - ✔ - ✗ + - ✔ + * - :meth:`~ignite.engine.engine.Engine.terminate_epoch()` with `skip_epoch_completed=True` + - ✔ + - ✗ + - ✗ + - ✔ * - :meth:`~ignite.engine.engine.Engine.terminate()` - ✗ - ✔ - ✔ + - ✔ + * - :meth:`~ignite.engine.engine.Engine.terminate()` with `skip_completed=True` + - ✗ + - ✔ + - ✔ + - ✗ Since v0.3.0, Events become more flexible and allow to pass an event filter to the Engine: @@ -357,7 +374,7 @@ class CustomEvents(EventEnum): STARTED = "started" """triggered when engine's run is started.""" COMPLETED = "completed" - """triggered when engine's run is completed""" + """triggered when engine's run is completed, or after receiving terminate() call.""" ITERATION_STARTED = "iteration_started" """triggered when an iteration is started.""" diff --git a/ignite/metrics/gpu_info.py b/ignite/metrics/gpu_info.py index 96ed4f07c57c..d13bbd8a1dae 100644 --- a/ignite/metrics/gpu_info.py +++ b/ignite/metrics/gpu_info.py @@ -10,7 +10,7 @@ class GpuInfo(Metric): """Provides GPU information: a) used memory percentage, b) gpu utilization percentage values as Metric - on each iterations. + on each iterations. This metric requires `pynvml `_ package of version `<12`. .. Note :: @@ -39,7 +39,7 @@ def __init__(self) -> None: except ImportError: raise ModuleNotFoundError( "This contrib module requires pynvml to be installed. " - "Please install it with command: \n pip install pynvml" + "Please install it with command: \n pip install 'pynvml<12'" ) # Let's check available devices if not torch.cuda.is_available(): diff --git a/pyproject.toml b/pyproject.toml index a6aae5458ad8..016c0d9eb13b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.black] line-length = 120 -target-version = ['py38', 'py39'] +target-version = ['py39', 'py311'] include = '\.pyi?$' exclude = ''' diff --git a/requirements-dev.txt b/requirements-dev.txt index d475e556cdff..91b560e56530 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -21,7 +21,7 @@ mlflow neptune-client>=0.16.17 tensorboard torchvision -pynvml +pynvml<12 # pynvml module was removed in 12.X, is not developed or maintained. We should replace pynvml with something else. clearml scikit-image py-rouge diff --git a/tests/ignite/contrib/engines/test_common.py b/tests/ignite/contrib/engines/test_common.py index d0100be9e8da..e14042e62c15 100644 --- a/tests/ignite/contrib/engines/test_common.py +++ b/tests/ignite/contrib/engines/test_common.py @@ -8,7 +8,6 @@ from torch.utils.data.distributed import DistributedSampler import ignite.distributed as idist - import ignite.handlers as handlers from ignite.contrib.engines.common import ( _setup_logging, diff --git a/tests/ignite/engine/test_create_supervised.py b/tests/ignite/engine/test_create_supervised.py index 54938167601a..4f07c95929e0 100644 --- a/tests/ignite/engine/test_create_supervised.py +++ b/tests/ignite/engine/test_create_supervised.py @@ -168,7 +168,7 @@ def _(): trainer.run(data) -@pytest.mark.skipif(Version(torch.__version__) < Version("1.6.0"), reason="Skip if < 1.6.0") +@pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") def test_create_supervised_training_scalar_assignment(): with mock.patch("ignite.engine._check_arg") as check_arg_mock: check_arg_mock.return_value = None, torch.cuda.amp.GradScaler(enabled=False) @@ -447,21 +447,21 @@ def test_create_supervised_trainer_apex_error(): def mock_torch_cuda_amp_module(): with patch.dict( "sys.modules", - {"torch.cuda.amp": None, "torch.cuda.amp.grad_scaler": None, "torch.cuda.amp.autocast_mode": None}, + {"torch.amp": None, "torch.cuda.amp": None, "torch.amp.autocast_mode": None}, ): yield torch def test_create_supervised_trainer_amp_error(mock_torch_cuda_amp_module): - with pytest.raises(ImportError, match="Please install torch>=1.6.0 to use amp_mode='amp'."): + with pytest.raises(ImportError, match="Please install torch>=1.12.0 to use amp_mode='amp'."): _test_create_supervised_trainer_wrong_accumulation(trainer_device="cpu", amp_mode="amp") - with pytest.raises(ImportError, match="Please install torch>=1.6.0 to use amp_mode='amp'."): + with pytest.raises(ImportError, match="Please install torch>=1.12.0 to use amp_mode='amp'."): _test_create_supervised_trainer(amp_mode="amp") with pytest.raises(ImportError, match="Please install torch>=1.6.0 to use scaler argument."): _test_create_supervised_trainer(amp_mode="amp", scaler=True) -@pytest.mark.skipif(Version(torch.__version__) < Version("1.5.0"), reason="Skip if < 1.5.0") +@pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") def test_create_supervised_trainer_scaler_not_amp(): scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available()) @@ -501,7 +501,7 @@ def test_create_supervised_trainer_on_mps(): _test_create_mocked_supervised_trainer(model_device=model_device, trainer_device=trainer_device) -@pytest.mark.skipif(Version(torch.__version__) < Version("1.6.0"), reason="Skip if < 1.6.0") +@pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") @pytest.mark.skipif(not torch.cuda.is_available(), reason="Skip if no GPU") def test_create_supervised_trainer_on_cuda_amp(): model_device = trainer_device = "cuda" @@ -517,7 +517,7 @@ def test_create_supervised_trainer_on_cuda_amp(): _test_create_mocked_supervised_trainer(model_device=model_device, trainer_device=trainer_device, amp_mode="amp") -@pytest.mark.skipif(Version(torch.__version__) < Version("1.6.0"), reason="Skip if < 1.6.0") +@pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") @pytest.mark.skipif(not torch.cuda.is_available(), reason="Skip if no GPU") def test_create_supervised_trainer_on_cuda_amp_scaler(): model_device = trainer_device = "cuda" @@ -630,8 +630,8 @@ def test_create_supervised_evaluator(): _test_mocked_supervised_evaluator() # older versions didn't have the autocast method so we skip the test for older builds - if Version(torch.__version__) >= Version("1.6.0"): - with mock.patch("torch.cuda.amp.autocast") as mock_torch_cuda_amp_module: + if Version(torch.__version__) >= Version("1.12.0"): + with mock.patch("torch.amp.autocast") as mock_torch_cuda_amp_module: _test_create_evaluation_step_amp(mock_torch_cuda_amp_module) @@ -640,8 +640,8 @@ def test_create_supervised_evaluator_on_cpu(): _test_mocked_supervised_evaluator(evaluator_device="cpu") # older versions didn't have the autocast method so we skip the test for older builds - if Version(torch.__version__) >= Version("1.6.0"): - with mock.patch("torch.cuda.amp.autocast") as mock_torch_cuda_amp_module: + if Version(torch.__version__) >= Version("1.12.0"): + with mock.patch("torch.amp.autocast") as mock_torch_cuda_amp_module: _test_create_evaluation_step(mock_torch_cuda_amp_module, evaluator_device="cpu") _test_create_evaluation_step_amp(mock_torch_cuda_amp_module, evaluator_device="cpu") @@ -651,8 +651,8 @@ def test_create_supervised_evaluator_traced_on_cpu(): _test_mocked_supervised_evaluator(evaluator_device="cpu", trace=True) # older versions didn't have the autocast method so we skip the test for older builds - if Version(torch.__version__) >= Version("1.6.0"): - with mock.patch("torch.cuda.amp.autocast") as mock_torch_cuda_amp_module: + if Version(torch.__version__) >= Version("1.12.0"): + with mock.patch("torch.amp.autocast") as mock_torch_cuda_amp_module: _test_create_evaluation_step(mock_torch_cuda_amp_module, evaluator_device="cpu", trace=True) @@ -682,7 +682,7 @@ def test_create_supervised_evaluator_on_mps_with_model_on_cpu(): _test_mocked_supervised_evaluator(evaluator_device="mps") -@pytest.mark.skipif(Version(torch.__version__) < Version("1.6.0"), reason="Skip if < 1.6.0") +@pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0") @pytest.mark.skipif(not torch.cuda.is_available(), reason="Skip if no GPU") def test_create_supervised_evaluator_on_cuda_amp(): model_device = evaluator_device = "cuda" @@ -691,7 +691,7 @@ def test_create_supervised_evaluator_on_cuda_amp(): def test_create_supervised_evaluator_amp_error(mock_torch_cuda_amp_module): - with pytest.raises(ImportError, match="Please install torch>=1.6.0 to use amp_mode='amp'."): + with pytest.raises(ImportError, match="Please install torch>=1.12.0 to use amp_mode='amp'."): _test_create_supervised_evaluator(amp_mode="amp") diff --git a/tests/ignite/engine/test_engine.py b/tests/ignite/engine/test_engine.py index 130212426504..76e1ad837605 100644 --- a/tests/ignite/engine/test_engine.py +++ b/tests/ignite/engine/test_engine.py @@ -40,11 +40,17 @@ class TestEngine: def set_interrupt_resume_enabled(self, interrupt_resume_enabled): Engine.interrupt_resume_enabled = interrupt_resume_enabled - def test_terminate(self): + @pytest.mark.parametrize("skip_completed", [True, False]) + def test_terminate(self, skip_completed): engine = Engine(lambda e, b: 1) assert not engine.should_terminate - engine.terminate() - assert engine.should_terminate + + engine.terminate(skip_completed) + + if skip_completed: + assert engine.should_terminate == "skip_completed" + else: + assert engine.should_terminate == True # noqa: E712 def test_invalid_process_raises_with_invalid_signature(self): with pytest.raises(ValueError, match=r"Engine must be given a processing function in order to run"): @@ -236,25 +242,32 @@ def check_iter_and_data(): assert num_calls_check_iter_epoch == 1 @pytest.mark.parametrize( - "terminate_event, e, i", + "terminate_event, e, i, skip_completed", [ - (Events.STARTED, 0, 0), - (Events.EPOCH_STARTED(once=2), 2, None), - (Events.EPOCH_COMPLETED(once=2), 2, None), - (Events.GET_BATCH_STARTED(once=12), None, 12), - (Events.GET_BATCH_COMPLETED(once=12), None, 12), - (Events.ITERATION_STARTED(once=14), None, 14), - (Events.ITERATION_COMPLETED(once=14), None, 14), + (Events.STARTED, 0, 0, True), + (Events.EPOCH_STARTED(once=2), 2, None, True), + (Events.EPOCH_COMPLETED(once=2), 2, None, True), + (Events.GET_BATCH_STARTED(once=12), None, 12, True), + (Events.GET_BATCH_COMPLETED(once=12), None, 12, False), + (Events.ITERATION_STARTED(once=14), None, 14, True), + (Events.ITERATION_COMPLETED(once=14), None, 14, True), + (Events.STARTED, 0, 0, False), + (Events.EPOCH_STARTED(once=2), 2, None, False), + (Events.EPOCH_COMPLETED(once=2), 2, None, False), + (Events.GET_BATCH_STARTED(once=12), None, 12, False), + (Events.GET_BATCH_COMPLETED(once=12), None, 12, False), + (Events.ITERATION_STARTED(once=14), None, 14, False), + (Events.ITERATION_COMPLETED(once=14), None, 14, False), ], ) - def test_terminate_events_sequence(self, terminate_event, e, i): + def test_terminate_events_sequence(self, terminate_event, e, i, skip_completed): engine = RecordedEngine(MagicMock(return_value=1)) data = range(10) max_epochs = 5 @engine.on(terminate_event) def call_terminate(): - engine.terminate() + engine.terminate(skip_completed) @engine.on(Events.EXCEPTION_RAISED) def assert_no_exceptions(ee): @@ -271,14 +284,22 @@ def assert_no_exceptions(ee): if e is None: e = i // len(data) + 1 + if skip_completed: + assert engine.called_events[-1] == (e, i, Events.TERMINATE) + assert engine.called_events[-2] == (e, i, terminate_event) + else: + assert engine.called_events[-1] == (e, i, Events.COMPLETED) + assert engine.called_events[-2] == (e, i, Events.TERMINATE) + assert engine.called_events[-3] == (e, i, terminate_event) + assert engine.called_events[0] == (0, 0, Events.STARTED) - assert engine.called_events[-1] == (e, i, Events.COMPLETED) - assert engine.called_events[-2] == (e, i, Events.TERMINATE) - assert engine.called_events[-3] == (e, i, terminate_event) assert engine._dataloader_iter is None - @pytest.mark.parametrize("data, epoch_length", [(None, 10), (range(10), None)]) - def test_terminate_epoch_stops_mid_epoch(self, data, epoch_length): + @pytest.mark.parametrize( + "data, epoch_length, skip_epoch_completed", + [(None, 10, False), (range(10), None, False), (None, 10, True), (range(10), None, True)], + ) + def test_terminate_epoch_stops_mid_epoch(self, data, epoch_length, skip_epoch_completed): real_epoch_length = epoch_length if data is None else len(data) iteration_to_stop = real_epoch_length + 4 @@ -286,7 +307,7 @@ def test_terminate_epoch_stops_mid_epoch(self, data, epoch_length): def start_of_iteration_handler(engine): if engine.state.iteration == iteration_to_stop: - engine.terminate_epoch() + engine.terminate_epoch(skip_epoch_completed) max_epochs = 3 engine.add_event_handler(Events.ITERATION_STARTED, start_of_iteration_handler) @@ -297,15 +318,23 @@ def start_of_iteration_handler(engine): assert state.epoch == max_epochs @pytest.mark.parametrize( - "terminate_epoch_event, i", + "terminate_epoch_event, i, skip_epoch_completed", [ - (Events.GET_BATCH_STARTED(once=12), 12), - (Events.GET_BATCH_COMPLETED(once=12), 12), - (Events.ITERATION_STARTED(once=14), 14), - (Events.ITERATION_COMPLETED(once=14), 14), + (Events.GET_BATCH_STARTED(once=12), 12, False), + (Events.GET_BATCH_COMPLETED(once=12), 12, False), + (Events.ITERATION_STARTED(once=14), 14, False), + (Events.ITERATION_COMPLETED(once=14), 14, False), + (Events.GET_BATCH_STARTED(once=12), 12, True), + (Events.GET_BATCH_COMPLETED(once=12), 12, True), + (Events.ITERATION_STARTED(once=14), 14, True), + (Events.ITERATION_COMPLETED(once=14), 14, True), + (Events.STARTED, 30, False), + (Events.STARTED, 30, True), + (Events.EPOCH_STARTED(once=2), 10, False), + (Events.EPOCH_STARTED(once=2), 10, True), ], ) - def test_terminate_epoch_events_sequence(self, terminate_epoch_event, i): + def test_terminate_epoch_events_sequence(self, terminate_epoch_event, i, skip_epoch_completed): engine = RecordedEngine(MagicMock(return_value=1)) data = range(10) max_epochs = 3 @@ -316,31 +345,54 @@ def test_terminate_epoch_events_sequence(self, terminate_epoch_event, i): @engine.on(terminate_epoch_event) def call_terminate_epoch(): + assert not engine.should_terminate_single_epoch nonlocal call_count if call_count < 1: - engine.terminate_epoch() + engine.terminate_epoch(skip_epoch_completed) + if skip_epoch_completed: + assert engine.should_terminate_single_epoch == "skip_epoch_completed" + else: + assert engine.should_terminate_single_epoch == True # noqa: E712 + call_count += 1 + @engine.on(Events.EPOCH_STARTED) + def check_skip_reset(): + if terminate_epoch_event != Events.EPOCH_STARTED: + assert engine.should_terminate_single_epoch == False # noqa: E712 + @engine.on(Events.TERMINATE_SINGLE_EPOCH) def check_previous_events(iter_counter): e = i // len(data) + 1 - assert engine.called_events[0] == (0, 0, Events.STARTED) assert engine.called_events[-2] == (e, i, terminate_epoch_event) assert engine.called_events[-1] == (e, i, Events.TERMINATE_SINGLE_EPOCH) + if skip_epoch_completed: + assert engine.should_terminate_single_epoch == "skip_epoch_completed" + else: + assert engine.should_terminate_single_epoch == True # noqa: E712 @engine.on(Events.EPOCH_COMPLETED) def check_previous_events2(): e = i // len(data) + 1 if e == engine.state.epoch and i == engine.state.iteration: + assert not skip_epoch_completed + assert isinstance(engine.should_terminate_single_epoch, bool) assert engine.called_events[-3] == (e, i, terminate_epoch_event) assert engine.called_events[-2] == (e, i, Events.TERMINATE_SINGLE_EPOCH) assert engine.called_events[-1] == (e, i, Events.EPOCH_COMPLETED) - engine.run(data, max_epochs=max_epochs) + if terminate_epoch_event in [Events.STARTED, Events.EPOCH_STARTED]: + with pytest.raises(RuntimeError): + engine.run(data, max_epochs=max_epochs) + else: + engine.run(data, max_epochs=max_epochs) + + assert engine.state.epoch == max_epochs + assert (max_epochs - 1) * len(data) < engine.state.iteration < max_epochs * len(data) - assert engine.state.epoch == max_epochs - assert (max_epochs - 1) * len(data) < engine.state.iteration < max_epochs * len(data) + epoch_completed_events = [e for e in engine.called_events if e[2] == Events.EPOCH_COMPLETED.name] + assert len(epoch_completed_events) == max_epochs - skip_epoch_completed @pytest.mark.parametrize("data", [None, "mock_data_loader"]) def test_iteration_events_are_fired(self, data): diff --git a/tests/ignite/metrics/test_classification_report.py b/tests/ignite/metrics/test_classification_report.py index 87e328c8051e..cae8b5145f55 100644 --- a/tests/ignite/metrics/test_classification_report.py +++ b/tests/ignite/metrics/test_classification_report.py @@ -164,6 +164,23 @@ def update(engine, i): @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") @pytest.mark.skipif(Version(torch.__version__) < Version("1.7.0"), reason="Skip if < 1.7.0") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + pytest.skip("Temporarily skip failing test. See https://github.com/pytorch/ignite/pull/3301") + # When run with 2 devices: + # tests/ignite/metrics/test_classification_report.py::test_distrib_nccl_gpu Fatal Python error: Aborted + # Thread 0x00007fac95c95700 (most recent call first): + # + + # Thread 0x00007facbb89b700 (most recent call first): + # + + # Thread 0x00007fae637f4700 (most recent call first): + # File "", line 534 in read + # File "", line 567 in from_io + # File "", line 1160 in _thread_receiver + # File "", line 341 in run + # File "", line 411 in _perform_spawn + device = idist.device() _test_integration_multiclass(device, True) _test_integration_multiclass(device, False) diff --git a/tests/ignite/metrics/test_roc_auc.py b/tests/ignite/metrics/test_roc_auc.py index 1e60c480ca18..8695f188b5fb 100644 --- a/tests/ignite/metrics/test_roc_auc.py +++ b/tests/ignite/metrics/test_roc_auc.py @@ -4,6 +4,7 @@ import pytest import sklearn import torch +from sklearn.exceptions import UndefinedMetricWarning from sklearn.metrics import roc_auc_score import ignite.distributed as idist @@ -112,7 +113,7 @@ def test_check_compute_fn(): em = ROC_AUC(check_compute_fn=True) em.reset() - with pytest.warns(EpochMetricWarning, match=r"Probably, there can be a problem with `compute_fn`"): + with pytest.warns((UndefinedMetricWarning, EpochMetricWarning), match=r"Only one class.+present in y_true"): em.update(output) em = ROC_AUC(check_compute_fn=False)