From 130fb58608f96472120139c1e723bb737d8f0aa5 Mon Sep 17 00:00:00 2001
From: Jagadish Krishnamoorthy
Date: Tue, 29 Oct 2024 06:47:43 -0700
Subject: [PATCH] [cifar ds training]: Set cuda device during initialization
 of distributed backend. (#931)

* Set cuda device during initialization of distributed backend.

The commit is needed to avoid GPU 0 being set as the compute stream via
torch.cuda.current_stream() during initialization across all GPUs.

Signed-off-by: Jagadish Krishnamoorthy

* Use device-agnostic accelerator API.

Signed-off-by: Jagadish Krishnamoorthy

---------

Signed-off-by: Jagadish Krishnamoorthy
---
 training/cifar/cifar10_deepspeed.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/training/cifar/cifar10_deepspeed.py b/training/cifar/cifar10_deepspeed.py
index 521a75cdf..9888544d5 100755
--- a/training/cifar/cifar10_deepspeed.py
+++ b/training/cifar/cifar10_deepspeed.py
@@ -1,4 +1,5 @@
 import argparse
+import os

 import deepspeed
 import torch
@@ -279,6 +280,8 @@ def test(model_engine, testset, local_device, target_dtype, test_batch_size=4):
 def main(args):
     # Initialize DeepSpeed distributed backend.
     deepspeed.init_distributed()
+    _local_rank = int(os.environ.get("LOCAL_RANK"))
+    get_accelerator().set_device(_local_rank)

     ########################################################################
     # Step1. Data Preparation.
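
Note (not part of the patch): a minimal standalone sketch of the pattern the change introduces, i.e. binding each process to its local device through the device-agnostic accelerator API right after deepspeed.init_distributed(), so streams and tensors created during setup do not all land on GPU 0. The init_for_rank helper name and the "0" fallback for LOCAL_RANK are illustrative assumptions; a launcher such as deepspeed or torchrun is assumed to export LOCAL_RANK and the other distributed environment variables.

    # Sketch: per-rank device binding with DeepSpeed's accelerator abstraction.
    import os

    import deepspeed
    import torch
    from deepspeed.accelerator import get_accelerator


    def init_for_rank():
        # Initialize the DeepSpeed distributed backend (NCCL/RCCL under the hood).
        deepspeed.init_distributed()
        # Pin this process to its local device. Without this, every rank's
        # current device defaults to device 0 during initialization.
        local_rank = int(os.environ.get("LOCAL_RANK", "0"))  # "0" fallback is illustrative
        get_accelerator().set_device(local_rank)
        return local_rank


    if __name__ == "__main__":
        rank = init_for_rank()
        # Tensors created after set_device() land on this rank's own device.
        x = torch.ones(4, device=get_accelerator().device_name(rank))
        print(f"rank {rank}: tensor on {x.device}")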