diff --git a/test/dtypes/test_nf4.py b/test/dtypes/test_nf4.py
index 221eed8216..a42a209a38 100644
--- a/test/dtypes/test_nf4.py
+++ b/test/dtypes/test_nf4.py
@@ -43,7 +43,7 @@
     to_nf4,
 )
 from torchao.testing.utils import skip_if_rocm
-from torchao.utils import torch_version_at_least
+from torchao.utils import get_current_accelerator_device, torch_version_at_least
 
 bnb_available = False
 
@@ -57,6 +57,7 @@
 logging.basicConfig(
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
 )
 
+_DEVICE = get_current_accelerator_device()
 
 def _build_input_weight(embed_dim: int, device: torch.device, dtype: torch.dtype):
@@ -68,7 +69,7 @@ def _build_input_weight(embed_dim: int, device: torch.device, dtype: torch.dtype
 
 def _build_bnb_linear(input_weight, device):
     assert bnb_available, "Needs bitsandbytes support"
-    param = bnb.nn.Params4bit(input_weight, requires_grad=False, quant_type="nf4").cuda(
+    param = bnb.nn.Params4bit(input_weight, requires_grad=False, quant_type="nf4").to(
         device
     )
     bnb_linear = bnb.nn.LinearNF4(
@@ -121,7 +122,7 @@ def test_backward_dtype_match(self, dtype: torch.dtype):
         assert nf4_tensor.grad is None
 
     @unittest.skipIf(not bnb_available, "Need bnb availble")
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     @unittest.skipIf(
         torch_version_at_least("2.7.0"), reason="Failing in CI"
     )  # TODO: fix this
@@ -130,7 +131,7 @@ def test_backward_dtype_match(self, dtype: torch.dtype):
     def test_reconstruction_qlora_vs_bnb(self, dtype: torch.dtype):
         # From https://github.com/drisspg/transformer_nuggets/blob/f05afad68ad9086d342268f46a7f344617a02314/test/test_qlora.py#L65C1-L81C47
         torch.manual_seed(0)
-        device = "cuda"
+        device = _DEVICE
         embed_dim = 512
         input_weight = _build_input_weight(embed_dim, device, dtype)
         nf4_weight = to_nf4(input_weight)
@@ -147,7 +148,7 @@ def test_reconstruction_qlora_vs_bnb(self, dtype: torch.dtype):
         assert (nugs_diff - bnb_diff).abs() < 2e-1
 
     @unittest.skipIf(not bnb_available, "Need bnb availble")
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     @skip_if_rocm("ROCm enablement in progress")
     @unittest.skipIf(
         torch_version_at_least("2.7.0"), reason="Failing in CI"
@@ -160,12 +161,12 @@ def test_nf4_bnb_linear(self, dtype: torch.dtype):
         """
         torch.manual_seed(0)
         dim = 512
-        device = "cuda"
+        device = _DEVICE
         input_weight = _build_input_weight(dim, device, dtype)
         nf4_weight = to_nf4(input_weight)
         bnb_linear = _build_bnb_linear(input_weight, device)
 
-        inp = torch.randn(2, 512, dtype=dtype, device="cuda")
+        inp = torch.randn(2, 512, dtype=dtype, device=_DEVICE)
 
         out_nf4 = linear_nf4(inp, nf4_weight).sum()
         out_bnb = bnb_linear(inp).sum()
@@ -176,11 +177,11 @@
         assert err_native < 0.5 * dim
         assert err_bnb < 0.5 * dim
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need cuda for test")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU for test")
    @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
    def test_load_from_state_dicts(self, dtype: torch.dtype):
        """Tests loading to and from different module state dicts"""
-        input_tensor = torch.rand(64, device="cuda", dtype=dtype)
+        input_tensor = torch.rand(64, device=_DEVICE, dtype=dtype)
         base_mod = self.TestMod(input_tensor, 32, 2)
 
         dummy_dict = {"param": input_tensor}
@@ -222,27 +223,27 @@ def test_to_copy(self, dtype: torch.dtype):
         nf4_to_dtype = input_tensor_nf4.to(dtype)
         torch.testing.assert_allclose(input_tensor, nf4_to_dtype, atol=0.13, rtol=0.13)
 
-        if torch.cuda.is_available():
-            input_tensor = torch.rand(128, device="cuda")
+        if torch.accelerator.is_available():
+            input_tensor = torch.rand(128, device=_DEVICE)
             input_tensor_nf4 = to_nf4(input_tensor, 32, 2)
             nf4_to_dtype = input_tensor_nf4.to(dtype)
             torch.testing.assert_allclose(
                 input_tensor, nf4_to_dtype, atol=0.13, rtol=0.13
             )
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need cuda for test")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU for test")
     def test_to_copy_device(self):
         input_tensor = torch.rand(128, device="cpu")
         t = to_nf4(input_tensor, 32, 2)
         assert t.device == torch.device("cpu")
-        z = t.cuda()
-        assert z.device.type == "cuda"  # Because the device could be cuda:0
+        z = t.to(_DEVICE)
+        assert z.device.type == _DEVICE.type  # Because the device could be e.g. cuda:0
         x = z.cpu()
         assert x.device == torch.device("cpu")
 
-        input_tensor = torch.rand(128, device="cuda")
+        input_tensor = torch.rand(128, device=_DEVICE)
         t = to_nf4(input_tensor, 32, 2)
-        assert t.device.type == "cuda"
+        assert t.device.type == _DEVICE.type
 
     @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
     def test_to_dtype(self, dtype: torch.dtype):
@@ -252,10 +253,10 @@ def test_to_dtype(self, dtype: torch.dtype):
         assert type(input_tensor_nf4.to(dtype)) is torch.Tensor
         assert input_tensor_nf4.to(dtype).dtype is dtype
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
     def test_smoketest_linear(self, dtype: torch.dtype):
-        a = torch.randn(32, 32, dtype=dtype, device="cuda")
+        a = torch.randn(32, 32, dtype=dtype, device=_DEVICE)
         a_nf4 = torchao.dtypes.to_nf4(a, 16, 2)
         inp = torch.randn(2, 32, 32, dtype=a.dtype, device=a.device)
         _ = torch.nn.functional.linear(inp, a)
@@ -272,17 +273,17 @@ def test_smoketest_linear_compile(self, dtype: torch.dtype):
             self.skipTest("test requires SM capability of at least (8, 0).")
         if version.parse(torch.__version__) < version.parse("2.3.0"):
             self.skipTest("test requires 2.3.0 and above for tracing NF4Tensor")
-        a = torch.randn(32, 32, dtype=dtype, device="cuda")
+        a = torch.randn(32, 32, dtype=dtype, device=_DEVICE)
         a_nf4 = torchao.dtypes.to_nf4(a, 16, 2)
         inp = torch.randn(2, 32, 32, dtype=a.dtype, device=a.device)
         _ = torch.compile(torch.nn.functional.linear, mode="max-autotune")(inp, a_nf4)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
     @parametrize("shape", [(16, 16), (32, 16)])
     @parametrize("chunk_size", [8, 16, 32])
     def test_chunk_size_equivalence(self, dtype: torch.dtype, shape, chunk_size):
-        a = torch.randn(shape, device="cuda", dtype=dtype)
+        a = torch.randn(shape, device=_DEVICE, dtype=dtype)
         with unittest.mock.patch("torchao.dtypes.nf4tensor.CHUNK_SIZE", chunk_size):
             nf4_patched = to_nf4(a, 16, 2)
         # This will be essentially no chunking since the numel is alot smaller than default chunk_size
@@ -290,19 +291,19 @@ def test_chunk_size_equivalence(self, dtype: torch.dtype, shape, chunk_size):
         torch.testing.assert_close(nf4_patched.quantized_data, nf4_base.quantized_data)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     @parametrize("input_size", [(512 * 512,), (512, 512)])
     def test_empty_like(self, input_size: Union[Tuple[int], int]):
-        nf4_tensor = to_nf4(torch.rand(input_size, device="cuda"))
+        nf4_tensor = to_nf4(torch.rand(input_size, device=_DEVICE))
         new_tensor = torch.empty_like(nf4_tensor, device="cpu")
         self.assertTrue(isinstance(new_tensor, NF4Tensor))
         self.assertEqual(new_tensor.get_device(), -1)  # that it's on CPU
         self.assertEqual(new_tensor.size(), nf4_tensor.size())
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     @parametrize("compile", [False, True])
     def test_quantize_api(self, compile):
-        nf4_linear = nn.Linear(512, 512, device="cuda")
+        nf4_linear = nn.Linear(512, 512, device=_DEVICE)
         torchao.quantize_(nf4_linear, nf4_weight_only())
         assert isinstance(nf4_linear.weight, NF4Tensor)
 
@@ -313,14 +314,14 @@ def test_quantize_api(self, compile):
             nf4_linear.compile()
             ref_linear.compile()
 
-        nf4_x = torch.randn(2, 512, device="cuda").requires_grad_()
+        nf4_x = torch.randn(2, 512, device=_DEVICE).requires_grad_()
         ref_x = nf4_x.detach().clone().requires_grad_()
 
         nf4_out = nf4_linear(nf4_x)
         ref_out = ref_linear(ref_x)
         self.assertEqual(nf4_out, ref_out)
 
-        grad_out = torch.randn(2, 512, device="cuda")
+        grad_out = torch.randn(2, 512, device=_DEVICE)
         nf4_out.backward(grad_out)
         ref_out.backward(grad_out)
         self.assertEqual(nf4_x.grad, ref_x.grad)
@@ -511,7 +512,7 @@ def test_tensor_as_strided_invalid(self, input_size: Union[Tuple[int], int]):
                 nf4_tensor, nf4_tensor.size(), stride, nf4_tensor.storage_offset()
             )
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_pin_memory(self):
         nf4_tensor = to_nf4(torch.randn(512 * 512))
         self.assertFalse(nf4_tensor.is_pinned())
@@ -519,35 +520,35 @@
         nf4_tensor = nf4_tensor.pin_memory()
         self.assertTrue(nf4_tensor.is_pinned())
 
-        nf4_tensor = to_nf4(torch.randn(512 * 512, device="cuda"))
+        nf4_tensor = to_nf4(torch.randn(512 * 512, device=_DEVICE))
         self.assertFalse(nf4_tensor.is_pinned())
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_to_cuda(self):
         nf4_tensor = to_nf4(torch.randn(512 * 512))
         self.assertEqual(nf4_tensor.device.type, "cpu")
-        nf4_tensor = nf4_tensor.to("cuda", non_blocking=True)
-        self.assertEqual(nf4_tensor.device.type, "cuda")
+        nf4_tensor = nf4_tensor.to(_DEVICE, non_blocking=True)
+        self.assertEqual(nf4_tensor.device.type, _DEVICE.type)
         self.assertEqual(type(nf4_tensor), NF4Tensor)
         nf4_tensor.get_original_weight()  # make sure we can dequantize
 
         nf4_tensor = to_nf4(torch.randn(512 * 512))
         self.assertEqual(nf4_tensor.device.type, "cpu")
-        nf4_tensor = nf4_tensor.to("cuda")
-        self.assertEqual(nf4_tensor.device.type, "cuda")
+        nf4_tensor = nf4_tensor.to(_DEVICE)
+        self.assertEqual(nf4_tensor.device.type, _DEVICE.type)
         self.assertEqual(type(nf4_tensor), NF4Tensor)
         nf4_tensor.get_original_weight()
 
         nf4_tensor = to_nf4(torch.randn(512 * 512))
         self.assertEqual(nf4_tensor.device.type, "cpu")
-        nf4_tensor = nf4_tensor.to("cuda", torch.bfloat16)
-        self.assertEqual(nf4_tensor.device.type, "cuda")
+        nf4_tensor = nf4_tensor.to(_DEVICE, torch.bfloat16)
+        self.assertEqual(nf4_tensor.device.type, _DEVICE.type)
         self.assertEqual(nf4_tensor.dtype, torch.bfloat16)
         self.assertEqual(type(nf4_tensor), torch.Tensor)  # dequantized
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_to_cpu(self):
-        nf4_tensor = to_nf4(torch.randn(512 * 512, device="cuda"))
+        nf4_tensor = to_nf4(torch.randn(512 * 512, device=_DEVICE))
         nf4_tensor = nf4_tensor.cpu()
         self.assertEqual(nf4_tensor.device.type, "cpu")
         for attr in _INNER_TENSOR_NAMES_FOR_SHARDING:
@@ -555,16 +556,16 @@ def test_to_cpu(self):
             self.assertEqual(inner_tensor.device.type, "cpu")
         nf4_tensor.get_original_weight()  # make sure we can dequantize
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_to_module(self):
         linear = nn.Linear(512, 512, bias=False)
         linear.weight = nn.Parameter(
             to_nf4(linear.weight.detach()), requires_grad=False
         )
-        linear.cuda()
-        self.assertEqual(linear.weight.device.type, "cuda")
+        linear.to(_DEVICE)
+        self.assertEqual(linear.weight.device.type, _DEVICE.type)
         weight = linear.weight.get_original_weight()
-        self.assertEqual(weight.device.type, "cuda")
+        self.assertEqual(weight.device.type, _DEVICE.type)
         linear.cpu()
         self.assertEqual(linear.weight.device.type, "cpu")
@@ -575,20 +576,20 @@ def test_to_module(self):
         linear.weight = nn.Parameter(
             to_nf4(linear.weight.detach()), requires_grad=False
         )
-        linear.to("cuda")
-        self.assertEqual(linear.weight.device.type, "cuda")
+        linear.to(_DEVICE)
+        self.assertEqual(linear.weight.device.type, _DEVICE.type)
         weight = linear.weight.get_original_weight()
-        self.assertEqual(weight.device.type, "cuda")
+        self.assertEqual(weight.device.type, _DEVICE.type)
         linear.to("cpu")
         self.assertEqual(linear.weight.device.type, "cpu")
         weight = linear.weight.get_original_weight()
         self.assertEqual(weight.device.type, "cpu")
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     @parametrize("input_size", [512 * 512, (512 * 512,), (512, 512)])
     def test_tensor_deepcopy(self, input_size: Union[Tuple[int], int]):
-        nf4_orig = to_nf4(torch.randn(input_size, device="cuda"))
+        nf4_orig = to_nf4(torch.randn(input_size, device=_DEVICE))
         nf4_clone = copy.deepcopy(nf4_orig)
         self.assertEqual(
             nf4_clone.get_original_weight(), nf4_orig.get_original_weight()
         )
@@ -678,7 +679,7 @@ def _test_qlora_fsdp2(
             dropout_p=0,
         )
         torch.manual_seed(42)
-        with torch.device("cuda"):
+        with torch.device(_DEVICE):
             base_model = Transformer(model_args)
         for layer in base_model.layers:
             # attention with lora adapters
@@ -732,7 +733,7 @@ def _test_qlora_fsdp2(
 
         torch.manual_seed(42 + self.rank + 1)
         for iter_idx in range(5):
-            inp = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda")
+            inp = torch.randint(0, vocab_size, (batch_size, seq_len), device=_DEVICE)
             fsdp_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
             fsdp_loss = fsdp_model(inp).sum()
             fsdp_loss.backward()
@@ -756,7 +757,7 @@ def world_size(self) -> int:
         return 2
 
     @skip_if_lt_x_gpu(2)
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_comm(self):
         self.run_subtests(
             {"input_size": [512, 2048]},
@@ -767,7 +768,7 @@ def _test_comm(self, input_size: int):
         from torch.distributed._composable.fsdp import fully_shard
         from torch.distributed._tensor import distribute_tensor
 
-        model = nn.Linear(input_size, input_size, device="cuda")
+        model = nn.Linear(input_size, input_size, device=_DEVICE)
         origin_tensor = model.weight
         origin_nf4_tensor = to_nf4(origin_tensor)
         model = fully_shard(model)
diff --git a/test/dtypes/test_uintx.py b/test/dtypes/test_uintx.py
index 3172381a3a..c8b4f6b7df 100644
--- a/test/dtypes/test_uintx.py
+++ b/test/dtypes/test_uintx.py
@@ -17,6 +17,7 @@
     dequantize_affine,
     quantize_affine,
 )
+from torchao.utils import get_current_accelerator_device
 
 dtypes = (
     torch.uint1,
@@ -29,7 +30,8 @@
 )
 group_sizes = [32, 64, 128]
 
-devices = ["cpu", "cuda"]
+_DEVICE = get_current_accelerator_device()
+devices = ["cpu", _DEVICE]
 
 
 @pytest.fixture(autouse=True)
@@ -59,15 +61,15 @@ def forward(self, x):
 
 @pytest.mark.parametrize("dtype", dtypes)
 @pytest.mark.parametrize("group_size", group_sizes)
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 def test_uintx_quant_on_cpu_then_move_to_cuda(dtype, group_size):
     scale = 512
     fp16_mod_on_cpu = Linear16(scale, "cpu")
     quantize_(fp16_mod_on_cpu, UIntXWeightOnlyConfig(dtype, group_size=group_size))
     test_input_on_cpu = torch.randn(scale * 2, dtype=torch.float16, device="cpu")
     output_on_cpu = fp16_mod_on_cpu(test_input_on_cpu)
-    fp16_mod_on_cuda = fp16_mod_on_cpu.to("cuda")
-    test_input_on_cuda = test_input_on_cpu.to("cuda")
+    fp16_mod_on_cuda = fp16_mod_on_cpu.to(_DEVICE)
+    test_input_on_cuda = test_input_on_cpu.to(_DEVICE)
     output_on_cuda = fp16_mod_on_cuda(test_input_on_cuda)
     assert torch.allclose(output_on_cpu, output_on_cuda.cpu(), atol=1.0e-3), (
         "The output of the model on CPU and CUDA should be close"
@@ -77,7 +79,7 @@ def test_uintx_quant_on_cpu_then_move_to_cuda(dtype, group_size):
 @pytest.mark.parametrize("dtype", dtypes)
 @pytest.mark.parametrize("group_size", group_sizes)
 @pytest.mark.parametrize("device", devices)
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 def test_uintx_weight_only_model_quant(dtype, group_size, device):
     scale = 512
     fp16 = Linear16(scale, device)
@@ -91,7 +93,7 @@ def test_uintx_weight_only_model_quant(dtype, group_size, device):
 @pytest.mark.parametrize("dtype", dtypes)
 @pytest.mark.parametrize("group_size", group_sizes)
 @pytest.mark.parametrize("device", devices)
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 def test_uintx_weight_only_quant(dtype, group_size, device):
     input_float = torch.randn((1, 256), dtype=torch.float16, device=device)
     mapping_type = MappingType.SYMMETRIC
@@ -125,26 +127,26 @@
 
 @pytest.mark.parametrize("dtype", dtypes)
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="Need GPU available")
 def test_uintx_target_dtype(dtype):
-    linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
+    linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
     # make sure it runs
     quantize_(linear, UIntXWeightOnlyConfig(dtype))
-    linear(torch.randn(1, 128, dtype=torch.bfloat16, device="cuda"))
+    linear(torch.randn(1, 128, dtype=torch.bfloat16, device=_DEVICE))
 
 
 @pytest.mark.parametrize("dtype", dtypes)
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="Need GPU available")
 def test_uintx_target_dtype_compile(dtype):
-    linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
+    linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
     # make sure it runs
     quantize_(linear, UIntXWeightOnlyConfig(dtype))
     linear = torch.compile(linear)
-    linear(torch.randn(1, 128, dtype=torch.bfloat16, device="cuda"))
+    linear(torch.randn(1, 128, dtype=torch.bfloat16, device=_DEVICE))
 
 
 @pytest.mark.parametrize("dtype", dtypes)
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="Need GPU available")
 def test_uintx_model_size(dtype):
     from torchao.utils import get_model_size_in_bytes
 
@@ -161,7 +163,7 @@ def test_uintx_model_size(dtype):
         torch.uint7: (7 / 8 + 1 / 16 + 1 / 32) / 2,
     }
     linear = torch.nn.Sequential(
-        torch.nn.Linear(128, 256, bias=False, dtype=torch.bfloat16, device="cuda")
+        torch.nn.Linear(128, 256, bias=False, dtype=torch.bfloat16, device=_DEVICE)
     )
     bf16_size = get_model_size_in_bytes(linear)
     # make sure it runs
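
Note on the helper used throughout this patch: `get_current_accelerator_device` is imported from `torchao.utils`, but its definition is not part of the diff. As a hedged sketch (not necessarily torchao's actual implementation), a device-agnostic helper built on the `torch.accelerator` API available in recent PyTorch releases could look like this:

```python
import torch


def get_current_accelerator_device() -> torch.device:
    # Hypothetical sketch: torch.accelerator abstracts over accelerator
    # backends (cuda, xpu, mps, ...), so tests can use one _DEVICE constant
    # instead of hard-coding "cuda".
    if torch.accelerator.is_available():
        # current_accelerator() returns e.g. torch.device("cuda") or
        # torch.device("xpu").
        return torch.accelerator.current_accelerator()
    return torch.device("cpu")
```

Because the helper returns a `torch.device`, call sites in the patch compare `device.type` (e.g. `z.device.type == _DEVICE.type`) rather than full device equality, since a materialized tensor's device may carry an index such as `cuda:0`.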