Merge upstream #77

Open · wants to merge 665 commits into base: amd-develop
Changes from 1 commit
Commits
665 commits
57d182a
Fix memory pool using in GEMM profiler (#556)
aakhundov Apr 10, 2023
fda4c60
dynamic seq (#560)
Apr 11, 2023
ca2d572
Fix MSVC compiler complaints (#551)
Apr 11, 2023
216cd17
Fix MSVC compiler narrowing conversion errors for cuda/gemm_epilogue_…
Apr 11, 2023
d67c97f
Add dynamic_seq_len and dynamic_num_head support in b2b bmm. (#530)
ipiszy Apr 11, 2023
c360bf8
Fix reduce ops with last input dim IntVar (#563)
aakhundov Apr 12, 2023
d5e6538
Build Cache CI Integration (#541)
kadeng Apr 12, 2023
1517314
More robust cutlass include dir generation in FBCUDA (#565)
kadeng Apr 12, 2023
c9f0f4d
add conv1d op (#562)
tenpercent Apr 13, 2023
b4b0140
arange as model param (#566)
mortzur Apr 13, 2023
f4416e7
fixed an infinite loop in move_view_ops transformation (#570)
chenyang78 Apr 13, 2023
0f1165e
Remove hack in [fx/aten]2ait (#516)
muchulee8 Apr 13, 2023
e6ad08a
Fix _fuse_strided_op_and_cat: no GEMM+concat fusion with dim>=rank (#…
Apr 13, 2023
a58d757
refactor
fsx950223 Apr 14, 2023
e17d032
merge updates
fsx950223 Apr 14, 2023
2d8511c
Accept JaggedIntVar in padded_dense_to_jagged (#577)
aakhundov Apr 14, 2023
27ec1e1
implement per op benchmark backbone (#555)
tenpercent Apr 14, 2023
c7d36cb
Add SM90 kernel foundations (#575)
aakhundov Apr 14, 2023
0e71c60
add reshape for `to` converter (#576)
frank-wei Apr 14, 2023
a4ba50a
Allow split without dim as input (#578)
ZhengkaiZ Apr 14, 2023
7fa1852
Replace infer_shape for split op. (#568)
muchulee8 Apr 15, 2023
c27702d
Add Identity op. (#579)
muchulee8 Apr 15, 2023
b0f9116
Build cache: Do not cache failed builds (#586)
kadeng Apr 17, 2023
3b537f9
Add BatchNorm FE module (#587)
Apr 17, 2023
4d6d469
add identity for `to` and `contiguous` (#588)
frank-wei Apr 17, 2023
535d064
In-place weight updates tracing (#569)
khabinov Apr 18, 2023
5c4800e
Remove hardcoded dtype float16 in acc_ops_clone (#585)
feikou Apr 18, 2023
55e150f
Fix acc_ops converter on std when keepdim=False (#593)
tissue3 Apr 19, 2023
7b97317
Fix race condition in FBCUDA (#591)
kadeng Apr 20, 2023
be47ae5
Upgrade CUTLASS to 3.1 (#584)
aakhundov Apr 20, 2023
7142046
benchmark: concat (#592)
chenyang78 Apr 20, 2023
aa84406
Ldm update (#573)
terrychenism Apr 21, 2023
d227f60
Eliminated redundant permute pairs (#529)
PivovarA Apr 21, 2023
95053be
Back out "Ldm update" (#603)
muchulee8 Apr 22, 2023
7f30588
Add name for directory created by test_vanilla_attention (#602)
muchulee8 Apr 22, 2023
d17d5f2
Fix MSVC compiler narrowing conversion errors for cuda/gemm_epilogue_…
Apr 23, 2023
e139f40
Mask select converter (#601)
wushirong Apr 23, 2023
26afef7
Skip incompatible tests outside CI (#606)
aakhundov Apr 24, 2023
3b19c9b
Add env_variables context manager to test_utils (#607)
aakhundov Apr 24, 2023
ae495ab
Add CUDA_SM90 test env to test_utils (#608)
aakhundov Apr 24, 2023
c0a97af
Ldm update (#611)
terrychenism Apr 24, 2023
5a73b00
Add testname for test_var:test_batched_var (#612)
muchulee8 Apr 24, 2023
d5c84d2
Prepare for CUTLASS 3.x integration (#614)
aakhundov Apr 24, 2023
e1ec740
Set skip_on_empty=True from filter_test_cases_by_params (#609)
aakhundov Apr 25, 2023
4a15037
Add MaxPool3d FE module (#595)
Apr 26, 2023
24a5603
Add GELU FE module (#600)
Apr 26, 2023
d6af8b0
Fill MultiScaleBlock gap (#622)
Apr 26, 2023
d761c94
Update infer_shape for argmax (#623)
muchulee8 Apr 26, 2023
1e60f7b
move use_cuda from global to class CrossAttention level (#626)
hl475 Apr 26, 2023
ad45691
Introduce AIT_USE_FAST_MATH environment flag (#627)
Apr 26, 2023
c0dd23e
Apply `transform_permute_to_reshape` when input tensor has smaller th…
Apr 27, 2023
73624b8
use current stream in mem_eff_attention kernel (#628)
mortzur Apr 27, 2023
62426cc
Minor fix to makefile normalization (#630)
kadeng Apr 27, 2023
7e930fd
Add SM90 CUTLASS 3.x kernels to gemm_rcr (#617)
aakhundov Apr 27, 2023
747852a
Add SM90 CUTLASS 3.x kernels to gemm_rrr (#621)
aakhundov Apr 27, 2023
1d9d2ed
Add SM90 CUTLASS 3.x kernels to gemm_rcr_bias (#620)
aakhundov Apr 27, 2023
9487143
Small fixes to b2b bmm kernels (#564)
ipiszy Apr 27, 2023
1d5a942
Fix AIT topk converter (#631)
wushirong Apr 28, 2023
7ec28ff
Support fp32 accumulation (#632)
kadeng Apr 28, 2023
e6f12eb
bias support (#633)
frank-wei Apr 28, 2023
5661170
Add PatchEmbed FE (#629)
Apr 28, 2023
6ad3351
Refactor test_bmm(_add) with filter_test_cases_by_test_env (#635)
aakhundov Apr 28, 2023
bace6d8
Update infer_shape for batch_gather (#624)
muchulee8 Apr 29, 2023
d9ced25
Add SM90 CUTLASS 3.x kernels to bmm_xxx and bmm_xxx_add (#637)
aakhundov Apr 30, 2023
32dd05c
dynamic_slice infer_shape (#598)
muchulee8 Apr 30, 2023
aa7010f
Replace hasattr with getattr in aitemplate/AITemplate/python/aitempla…
r-barnes May 1, 2023
be49644
Fix circleci test errors (#639)
ipiszy May 2, 2023
b041e0e
Fix internal CI issue (#640)
ipiszy May 2, 2023
a4b533c
optionally print python callstack stats for model compilation (#642)
May 3, 2023
c435114
Add VisionTransformerBasicHead (#636)
May 3, 2023
60df7ce
Duplicate LDM from Example to FB Folder (#646)
henryhu6 May 3, 2023
693f4b9
Remove manually set Parameter names (#638)
May 3, 2023
bc16a15
classic_b2b_bmm: add multi-head and strides (#625)
kadeng May 4, 2023
2707974
Fix naming error caused by dump_program(). (#649)
ipiszy May 4, 2023
091ccdb
SD example fix (#654)
terrychenism May 4, 2023
109b98a
remove eliminate_permutations for now (#655)
chenyang78 May 4, 2023
ea7ed4c
jagged SHA and MHA module support (#651)
frank-wei May 4, 2023
332eccd
Add BF16 support for ads model (#656)
wushirong May 4, 2023
83b39e7
_dlclose support for Windows (#657)
May 4, 2023
971c2c1
Add MultiscaleVisionTransformers FE + validate MVIT 21 block config E…
May 5, 2023
27d9cb6
Make GEMM ProfilerMemoryPool size computation generic (#653)
aakhundov May 5, 2023
2f8c15f
Add Windows support for owned constants (#658)
May 6, 2023
09ce751
simple multi-stream (#615)
May 6, 2023
b912ca9
Use device 0 for profiling if devices list is empty (#667)
aakhundov May 6, 2023
6bc3253
Allow setting jagged tensor total_length upper bound from offsets (#634)
aakhundov May 7, 2023
6c3a88d
Increase CircleCI no_output_timeout to 20m (#668)
aakhundov May 7, 2023
76b6b71
Integrate CUDA 12.1 (#661)
aakhundov May 7, 2023
aeaddeb
Sync CUTLASS with upstream (#662)
aakhundov May 7, 2023
d7c2877
Add SM90 CUTLASS 3.x kernels to gemm_rcr_bias_relu (#663)
aakhundov May 7, 2023
cf3639b
Fix test logging errors (#664)
May 9, 2023
d7af2cf
fix bugs
fsx950223 May 9, 2023
2f782fb
revert some changes
fsx950223 May 9, 2023
45b579e
format fx2ait code
fsx950223 May 9, 2023
c312272
Add SM90 CUTLASS 3.x kernels to remaining gemm_rcr_bias_activation op…
aakhundov May 9, 2023
3fd5d10
Add bf16 support to classic_b2b_bmm op (#665)
aakhundov May 9, 2023
ce167f2
merge_updates
fsx950223 May 9, 2023
4593c4e
Add bf16 support to fmha_style_b2b_bmm op (#666)
aakhundov May 9, 2023
d915218
Windows test_standalone.py fix (#676)
May 9, 2023
7a57a02
Refactor infrastructure (#671)
May 9, 2023
cc5cbf2
introduce get_include_directories() in target_def.py files (#672)
May 9, 2023
9a1b4f9
Add is_view_of for identity op (#677)
muchulee8 May 9, 2023
47cdc54
MSVC fix (#675)
May 10, 2023
cd5862e
Fix get_positive_dim for IntVar (#680)
tissue3 May 10, 2023
5f63da8
Add bf16 support to full op (#679)
tissue3 May 10, 2023
d897342
introduce get_host_compiler_options() and get_device_compiler_options…
May 11, 2023
0481d1b
speed up MultiScaleBlock (#681)
May 11, 2023
d1d66fb
Introduce a CMake compiler engine (#674)
May 11, 2023
f49b6e4
update ci (#589)
fsx950223 May 11, 2023
87124bd
Fix codegen condition check issue
wushirong May 11, 2023
1e79696
merge updates
fsx950223 May 12, 2023
7615743
Fix the profiler bug in bmm_xxx_add SM90 kernels (#682)
aakhundov May 12, 2023
7cdc6f5
refactor op level benchmark (#660)
frank-wei May 12, 2023
1ffbc1f
Add attribute to allow tensor not to participate in constant folding …
muchulee8 May 12, 2023
e429979
Fix a tricky fused_elementwise alignment issue. (#693)
ipiszy May 14, 2023
d468fbc
add ops
fsx950223 May 15, 2023
5c92661
Fix-forward - multiscaleblock test failure (#694)
May 15, 2023
4332686
Fix codegen duplicate dim decl issue (#692)
qxy11 May 15, 2023
5847ce1
Add fast_math to LowerSettings and AIT Target (#695)
ipiszy May 15, 2023
e6a8dfc
fix bugs
fsx950223 May 16, 2023
5968179
Fix use_fast_math (#697)
ipiszy May 16, 2023
bc49d20
Add layernorm CUDA kernel based on Welford's algorithm (#698)
aakhundov May 16, 2023
f64cb6e
fix profiler group bug
fsx950223 May 17, 2023
3c28258
fix bugs
fsx950223 May 17, 2023
334877d
CUDA debug log utility class (#688)
kadeng May 17, 2023
d56544b
refactor sm detection; add quadro card name (#690)
tenpercent May 18, 2023
f738b9b
add nix-shell config (#691)
tenpercent May 18, 2023
b9d77bd
Stable Diffusion dynamic input shape, include/exclude constants, load…
hlky May 18, 2023
9dc346d
Dump function properties at profiling time. (#684)
ipiszy May 18, 2023
a0ddbe0
add graph mode
fsx950223 May 18, 2023
d7f10b7
set target_has_graph_mode to true
fsx950223 May 18, 2023
7de589f
Add missing __init__.py files (#702)
ymwangg May 18, 2023
5da4ae2
add missing Meta headers for __init__.py files (#705)
May 19, 2023
473e3f3
Add a flag for elementwise computation in float32 (#700)
aakhundov May 19, 2023
02b04e4
Add SM90 CUTLASS 3.x kernels to gemm_rcr_bias_broadcast ops (#687)
aakhundov May 19, 2023
d7e7996
Add SM90 CUTLASS 3.x kernels to perm102_bmm ops (#689)
aakhundov May 19, 2023
0b8561a
Add support for keyword argument in AITModule (#707)
henryhu6 May 20, 2023
9c26601
adjust launch config for group_layernorm kernels (#708)
chenyang78 May 21, 2023
46a81f2
remove identity ops from the graph (#703)
chenyang78 May 21, 2023
1fb2b33
fx2ait: explicitly ensure workdir's existence (#714)
kflu May 22, 2023
39455d8
enable bmm_rrr for concat + concat fusion (#716)
chenyang78 May 22, 2023
fdb0a98
support dynamic batch size
fsx950223 May 23, 2023
f86fda0
fix a bug
fsx950223 May 23, 2023
b300446
Sync CUTLASS version with upstream (#706)
aakhundov May 23, 2023
28dda09
Merge branch 'merge_upstream_update' into merge_upstream
fsx950223 May 23, 2023
fd804db
Merge remote-tracking branch 'upstream/main' into mlperf
fsx950223 May 23, 2023
6f4e747
check if the tensor is none before copying (#704)
chengscott May 23, 2023
326aae6
Fix 02_vision_model example (#718)
ymwangg May 23, 2023
329f152
fix bert example (#710)
tenpercent May 23, 2023
4c87930
Added a pass to fuse expand + bmm (#715)
chenyang78 May 24, 2023
560041b
rename layers
fsx950223 May 24, 2023
420c83a
Fix flaky test_batch_norm (#720)
aakhundov May 24, 2023
be81c39
Stable Diffusion ControlNet (#713)
hlky May 24, 2023
8f622a1
Fix fx2ait max/avg_pool with stride=None (#701)
ymwangg May 24, 2023
7f3811e
nvcc options (#721)
chengscott May 25, 2023
201937e
2/n support layernorm and elementwise op (#711)
frank-wei May 25, 2023
08bc584
fix bugs
fsx950223 May 25, 2023
a57e3bb
Merge remote-tracking branch 'upstream/main' into mlperf
fsx950223 May 25, 2023
02460f6
Back out "fix bert example" (#728)
aakhundov May 25, 2023
89711d9
Stable Diffusion Alt fixes #722 #723 (#724)
hlky May 25, 2023
126bb1d
Sync CUTLASS with upstream again (#727)
aakhundov May 27, 2023
50d8266
add missing script
fsx950223 May 29, 2023
c246a86
Split test_strided_layernorm test cases (#729)
aakhundov May 29, 2023
42790e3
fix stable diffusion example
fsx950223 May 31, 2023
bd41366
Merge remote-tracking branch 'upstream/main' into mlperf
fsx950223 May 31, 2023
cc1dceb
Merge branch 'mlperf' into merge_upstream
fsx950223 May 31, 2023
a68c83f
format code
fsx950223 May 31, 2023
dacd043
update enabled ci types
fsx950223 May 31, 2023
395dc79
Grouped Classic B2B BMM 1 ( copy base impl ) (#736)
kadeng May 31, 2023
f0e6d8c
Add SM90-related profiler extensions (#732)
aakhundov May 31, 2023
3714b40
Disable residual in SM90 kernels of gemm_rcr / rrr (#733)
aakhundov May 31, 2023
707d818
Pass bias vector via epilogue schedule in SM90 gemm_rcr_bias (#734)
aakhundov May 31, 2023
6204f62
Pass bias vector via epilogue schedule in SM90 gemm_rcr_bias_activati…
aakhundov May 31, 2023
e5b2dac
Merge remote-tracking branch 'upstream/main' into merge_upstream
fsx950223 Jun 1, 2023
a54b4c9
fix gemm hardswish
fsx950223 Jun 1, 2023
2146627
fix hstu unit test in fx2ait (#743)
frank-wei Jun 1, 2023
daf1624
update conv2d (#725)
fsx950223 Jun 1, 2023
d49a2cd
Add BFloat16 for debugging and messaging (#744)
muchulee8 Jun 1, 2023
98e6a13
Grouped Classic B2B BMM op 2 ( padded -> jagged operator implementati…
kadeng Jun 2, 2023
22a74c7
Add support of nn.functional.hardtanh (#739)
mcremon-meta Jun 2, 2023
c320d26
Eliminate elementwise no-ops (*/1, +-0) (#746)
AlbertDachiChen Jun 6, 2023
f05c8e9
Update README.md (#745)
eltociear Jun 7, 2023
329fe8a
Add PG509 as a detected CUDA GPU (#749)
erjiang Jun 7, 2023
2f2912f
Move XRayVideo related FE modules to frontend/nn + disambiguate test …
Jun 7, 2023
8d7d819
add missing copy
carlushuang Jun 9, 2023
e63e7e2
separate cuda and rocm graph
fsx950223 Jun 2, 2023
1ee2625
update setup.py
fsx950223 Jun 9, 2023
2bdc21b
fix compile bugs
fsx950223 Jun 9, 2023
e23d04f
Fix SD compilation example to use user provided H and W (#755)
apivovarov Jun 10, 2023
fff93a1
Add --model-name param to SD download_pipeline script (#757)
apivovarov Jun 11, 2023
db2a9e9
Include split+cat in fuse_split optimization (#740)
erjiang Jun 11, 2023
1c32db2
frontend.nn.attention dtype (#759)
hlky Jun 12, 2023
5d5f5f3
Increase recursion limit in dump_program using try except (#761)
hl475 Jun 12, 2023
3fe7be3
Daily `arc lint --take BLACK`
Jun 12, 2023
82accdd
Map VAE params without AIT_AutoencoderKL (#760)
hlky Jun 12, 2023
cbb8be8
small updates
fsx950223 Jun 13, 2023
5f44043
merge updates
fsx950223 Jun 13, 2023
b3d6705
format code
fsx950223 Jun 13, 2023
f91c59c
Add # usort:skip to make both internal and OSS lint happy (#763)
hl475 Jun 13, 2023
4d00081
Fix SD image quality on some GPUs (#765)
apivovarov Jun 13, 2023
8992f3f
Add support to List[List[Tensor]] Input shape (#756)
henryhu6 Jun 13, 2023
1ec9d9a
Add ait profile timeout to lower_settings (#764)
qxy11 Jun 14, 2023
c1e9b42
fix quick gelu shape
fsx950223 Jun 15, 2023
2ae8184
correctly run bmm_rrr tests (#770)
chenyang78 Jun 15, 2023
72faba8
upstream gemm and embeddings (#726)
fsx950223 Jun 15, 2023
9ee885c
add attention backend (#741)
fsx950223 Jun 15, 2023
e98d2dd
Add bf16 support to upsampling2d nearest (#750)
henryhu6 Jun 15, 2023
d09eeaa
merge updates
fsx950223 Jun 16, 2023
4f53879
fix a bug
fsx950223 Jun 16, 2023
908d861
Add jagged_lengths_to_offsets op (#766)
aakhundov Jun 19, 2023
a996ec6
Grouped classic b2b bmm op - tuned version (#771)
kadeng Jun 19, 2023
9282317
Add jagged_lengths_to_presences op (#767)
aakhundov Jun 19, 2023
eea99c1
rename to warp size
fsx950223 Jun 20, 2023
2db5b42
native CUDA development helper (#772)
kadeng Jun 20, 2023
33f2797
Standalone test cases (#773)
kadeng Jun 20, 2023
373000f
pool2d upstream (#775)
fsx950223 Jun 21, 2023
f5896d0
upstream norm (#774)
fsx950223 Jun 21, 2023
f0f676f
Download SD model without token by default (#769)
apivovarov Jun 23, 2023
9336061
Fix invalid escape sequence (#790)
r-barnes Jun 24, 2023
57c5e03
Re-sync with internal repository (#793)
facebook-github-bot Jun 24, 2023
b5bd10d
tensor upstream (#776)
fsx950223 Jun 25, 2023
e3a2388
Support List[Tensor] arg for from_two_input_lists (#792)
henryhu6 Jun 26, 2023
3c3b256
Adding support for relational operations (#783)
AlbertDachiChen Jun 27, 2023
5e30494
skip fusing slice with a strided op if the relevant tensor accessors …
chenyang78 Jun 27, 2023
0cf1c2e
Add support for where operator (#791)
AlbertDachiChen Jun 27, 2023
dccb361
Add test cases to cover `split_large_slice_scatter` (#794)
y-sq Jun 27, 2023
a20384e
Add init_random_weights test util (#800)
Jun 27, 2023
eb4c375
Sync CUTLASS with upstream (#803)
aakhundov Jun 27, 2023
b080f5c
Fix SD Alternative pipeline README to use demo_alt.py (#784)
apivovarov Jun 29, 2023
79d10cd
Split slice_scatter into multiple ones if it has too many inputs (#801)
y-sq Jun 30, 2023
039bb9f
update frontend and mk_ck_lib (#777)
fsx950223 Jun 30, 2023
774c646
merge upstream
fsx950223 Jun 30, 2023
c9add3a
update dockerfile
fsx950223 Oct 19, 2023
f4c3c92
fix a profiler bug
fsx950223 Oct 24, 2023
fcd9302
fix dockerfile
fsx950223 Nov 24, 2023
0f319a4
pin diffusers and transformers
dejay-vu Dec 12, 2023
fb5a110
merge sdxl
fsx950223 Dec 22, 2023
ebefd83
fix bugs
fsx950223 Dec 25, 2023
7e828a7
optimize performance
fsx950223 Jan 3, 2024
1829cbb
optimize performance
fsx950223 Jan 3, 2024
8e2293e
fix pool bugs
fsx950223 Jan 3, 2024
51e10dd
fix download model bug
fsx950223 Jan 4, 2024
5a6775c
remove rocm hack
fsx950223 Jan 4, 2024
db1851c
optimizer sdxl performance
fsx950223 Jan 5, 2024
318f3aa
enable hipgraph on MI300
fsx950223 Jan 5, 2024
892e32a
fix negative prompt
fsx950223 Jan 5, 2024
65c08c2
fix profile bug
fsx950223 Jan 9, 2024
5958997
fix profiler
fsx950223 Jan 10, 2024
f54c2b5
add interwave and pipeline tuning
fsx950223 Jan 17, 2024
0a8beb7
fix a profiler bug
fsx950223 Jan 18, 2024
40248f2
fix revision
fsx950223 Jan 30, 2024
Refactor test_bmm(_add) with filter_test_cases_by_test_env (facebookincubator#635)

Summary:
Pull Request resolved: facebookincubator#635

Almost all unit tests from `test_bmm` and `test_bmm_add` are running on *both* V100 and A100 hosts. This is inefficient, as only a small fraction of the tests must run on A100. In this diff, the tests are refactored to rely on `filter_test_cases_by_test_env` instead of `filter_test_cases_by_params`, which leads to a more frugal use of A100 hosts. See the test plan for the before / after numbers.

Reviewed By: hl475

Differential Revision: D45405007

fbshipit-source-id: 98ce1c39d09b057586f759f22b9eedd6bcd4dbd5
aakhundov authored and facebook-github-bot committed Apr 28, 2023
commit 6ad335187006781d01d86ab5acdf2636dad6a8b8
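To make the shape of this refactor easier to follow, the sketch below condenses the before/after pattern visible in the diff: the parameterized, `_TEST_PARAMS`-driven test is replaced by explicitly named per-dtype methods plus a single module-level filter call. The class names `BMMTestCaseBefore`/`BMMTestCaseAfter` and the stubbed `_test_rcr` body are illustrative only; the imports, decorator, and test method names mirror the diff.

```python
import unittest

from aitemplate.testing.test_utils import (
    filter_test_cases_by_params,
    filter_test_cases_by_test_env,
    TestEnv,
)
from parameterized import parameterized

# Before: one parameterized test fans out over dtypes, and
# filter_test_cases_by_params decides which dtypes are generated
# for the current test environment.
_TEST_PARAMS = {
    TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
    TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
}


class BMMTestCaseBefore(unittest.TestCase):
    def _test_rcr(self, bs, ms, N, K, test_name, dtype="float16"):
        pass  # stub; the real helper builds and runs the AIT module

    @parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
    def test_bmm_0_dtype(self, dtype):
        self._test_rcr([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)


# After: each dtype/environment combination gets an explicitly named method
# (the _sm80 / _bf16 / _rocm suffixes encode where it should run), and one
# module-level call prunes the class to the current test environment.
class BMMTestCaseAfter(unittest.TestCase):
    def _test_rcr(self, bs, ms, N, K, test_name, dtype="float16"):
        pass  # stub

    def test_bmm_0_fp32_sm80(self, dtype="float32"):
        self._test_rcr([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)

    def test_bmm_0_bf16(self, dtype="bfloat16"):
        self._test_rcr([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)


filter_test_cases_by_test_env(BMMTestCaseAfter)
```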
193 changes: 132 additions & 61 deletions tests/unittest/ops/test_bmm.py
@@ -21,21 +21,12 @@
from aitemplate.frontend import Tensor
from aitemplate.testing import detect_target
from aitemplate.testing.test_utils import (
filter_test_cases_by_params,
filter_test_cases_by_test_env,
get_random_torch_tensor,
get_torch_empty_tensor,
TestEnv,
)
from aitemplate.utils import shape_utils

from parameterized import parameterized


_TEST_PARAMS = {
TestEnv.CUDA_LESS_THAN_SM80: [("float16")],
TestEnv.CUDA_SM80: [("float32"), ("bfloat16")],
}


class BMMTestCase(unittest.TestCase):
def _test_rcr(self, bs, ms, N, K, test_name, dtype="float16"):
@@ -68,15 +59,15 @@ def _test_rcr(self, bs, ms, N, K, test_name, dtype="float16"):

def test_rcr(self):
self._test_rcr([1024], [128], N=512, K=256, test_name="static")
if detect_target().name() == "cuda":
self._test_rcr([1, 5, 977, 1024], [32], N=512, K=256, test_name="dynamic_b")
self._test_rcr([1], [100, 200, 300], N=512, K=256, test_name="dynamic_m")
self._test_rcr(
[1, 2, 5], [100, 200, 300], N=512, K=256, test_name="dynamic_bm"
)
self._test_rcr([0], [128], N=512, K=256, test_name="zero_batch")
self._test_rcr([1], [128], N=512, K=0, test_name="zero_k")
self._test_rcr([1], [128], N=0, K=8, test_name="zero_n")
self._test_rcr([1, 5, 977, 1024], [32], N=512, K=256, test_name="dynamic_b")
self._test_rcr([1], [100, 200, 300], N=512, K=256, test_name="dynamic_m")
self._test_rcr([1, 2, 5], [100, 200, 300], N=512, K=256, test_name="dynamic_bm")
self._test_rcr([0], [128], N=512, K=256, test_name="zero_batch")
self._test_rcr([1], [128], N=512, K=0, test_name="zero_k")
self._test_rcr([1], [128], N=0, K=8, test_name="zero_n")

def test_rcr_rocm(self):
self._test_rcr([1024], [128], N=512, K=256, test_name="static")

def _test_crr(self, bs, ks, M, N, test_name, dtype="float16"):
target = detect_target()
@@ -107,10 +98,12 @@ def _test_crr(self, bs, ks, M, N, test_name, dtype="float16"):

def test_crr(self):
self._test_crr([1024], [128], M=256, N=512, test_name="static")
if detect_target().name() == "cuda":
self._test_crr([3, 977, 1024], [128], M=256, N=512, test_name="dynamic_b")
self._test_crr([5], [45, 56, 78], M=256, N=512, test_name="dynamic_k")
self._test_crr([1, 2, 5], [3, 6, 8], M=256, N=512, test_name="dynamic_bk")
self._test_crr([3, 977, 1024], [128], M=256, N=512, test_name="dynamic_b")
self._test_crr([5], [45, 56, 78], M=256, N=512, test_name="dynamic_k")
self._test_crr([1, 2, 5], [3, 6, 8], M=256, N=512, test_name="dynamic_bk")

def test_crr_rocm(self):
self._test_crr([1024], [128], M=256, N=512, test_name="static")

def _test_rrr(self, bs, ms, K, N, test_name, dtype="float16"):
target = detect_target()
@@ -138,10 +131,12 @@ def _test_rrr(self, bs, ms, K, N, test_name, dtype="float16"):

def test_rrr(self):
self._test_rrr([87], [23], K=256, N=512, test_name="static")
if detect_target().name() == "cuda":
self._test_rrr([2, 5, 99], [23], K=128, N=512, test_name="dynamic_b")
self._test_rrr([77], [4, 7, 9], K=8, N=512, test_name="dynamic_m")
self._test_rrr([2, 5, 7], [1, 7, 9], K=256, N=512, test_name="dynamic_bm")
self._test_rrr([2, 5, 99], [23], K=128, N=512, test_name="dynamic_b")
self._test_rrr([77], [4, 7, 9], K=8, N=512, test_name="dynamic_m")
self._test_rrr([2, 5, 7], [1, 7, 9], K=256, N=512, test_name="dynamic_bm")

def test_rrr_rocm(self):
self._test_rrr([87], [23], K=256, N=512, test_name="static")

def _test_ccr(self, bs, M, N, K, test_name, dtype="float16"):
target = detect_target()
@@ -166,8 +161,10 @@ def _test_ccr(self, bs, M, N, K, test_name, dtype="float16"):

def test_ccr(self):
self._test_ccr([77], M=256, N=64, K=128, test_name="static")
if detect_target().name() == "cuda":
self._test_ccr([1, 9, 101], M=256, N=64, K=128, test_name="dynamic_b")
self._test_ccr([1, 9, 101], M=256, N=64, K=128, test_name="dynamic_b")

def test_ccr_rocm(self):
self._test_ccr([77], M=256, N=64, K=128, test_name="static")

def _test_rcc(self, bs, ms, N, K, test_name, dtype="float16"):
target = detect_target()
@@ -200,15 +197,15 @@ def _test_rcc(self, bs, ms, N, K, test_name, dtype="float16"):

def test_rcc(self):
self._test_rcc([1024], [128], N=512, K=256, test_name="static")
if detect_target().name() == "cuda":
self._test_rcc([1, 5, 977, 1024], [32], N=512, K=256, test_name="dynamic_b")
self._test_rcc([1], [100, 200, 300], N=512, K=256, test_name="dynamic_m")
self._test_rcc(
[1, 2, 5], [100, 200, 300], N=512, K=256, test_name="dynamic_bm"
)
self._test_rcc([0], [128], N=512, K=256, test_name="zero_batch")
self._test_rcc([1], [128], N=512, K=0, test_name="zero_k")
self._test_rcc([1], [128], N=0, K=8, test_name="zero_n")
self._test_rcc([1, 5, 977, 1024], [32], N=512, K=256, test_name="dynamic_b")
self._test_rcc([1], [100, 200, 300], N=512, K=256, test_name="dynamic_m")
self._test_rcc([1, 2, 5], [100, 200, 300], N=512, K=256, test_name="dynamic_bm")
self._test_rcc([0], [128], N=512, K=256, test_name="zero_batch")
self._test_rcc([1], [128], N=512, K=0, test_name="zero_k")
self._test_rcc([1], [128], N=0, K=8, test_name="zero_n")

def test_rcc_rocm(self):
self._test_rcc([1024], [128], N=512, K=256, test_name="static")

def _test_crc(self, bs, ks, M, N, test_name, dtype="float16"):
target = detect_target()
@@ -240,10 +237,12 @@ def _test_crc(self, bs, ks, M, N, test_name, dtype="float16"):

def test_crc(self):
self._test_crc([1024], [128], M=256, N=512, test_name="static")
if detect_target().name() == "cuda":
self._test_crc([3, 977, 1024], [128], M=256, N=512, test_name="dynamic_b")
self._test_crc([5], [45, 56, 78], M=256, N=512, test_name="dynamic_k")
self._test_crc([1, 2, 5], [3, 6, 8], M=256, N=512, test_name="dynamic_bk")
self._test_crc([3, 977, 1024], [128], M=256, N=512, test_name="dynamic_b")
self._test_crc([5], [45, 56, 78], M=256, N=512, test_name="dynamic_k")
self._test_crc([1, 2, 5], [3, 6, 8], M=256, N=512, test_name="dynamic_bk")

def test_crc_rocm(self):
self._test_crc([1024], [128], M=256, N=512, test_name="static")

def _test_rrc(self, bs, ms, K, N, test_name, dtype="float16"):
target = detect_target()
@@ -272,10 +271,12 @@ def _test_rrc(self, bs, ms, K, N, test_name, dtype="float16"):

def test_rrc(self):
self._test_rrc([87], [23], K=256, N=512, test_name="static")
if detect_target().name() == "cuda":
self._test_rrc([2, 5, 99], [23], K=128, N=512, test_name="dynamic_b")
self._test_rrc([77], [4, 7, 9], K=8, N=512, test_name="dynamic_m")
self._test_rrc([2, 5, 7], [1, 7, 9], K=256, N=512, test_name="dynamic_bm")
self._test_rrc([2, 5, 99], [23], K=128, N=512, test_name="dynamic_b")
self._test_rrc([77], [4, 7, 9], K=8, N=512, test_name="dynamic_m")
self._test_rrc([2, 5, 7], [1, 7, 9], K=256, N=512, test_name="dynamic_bm")

def test_rrc_rocm(self):
self._test_rrc([87], [23], K=256, N=512, test_name="static")

def _test_ccc(self, bs, M, N, K, test_name, dtype="float16"):
target = detect_target()
@@ -302,12 +303,37 @@ def _test_ccc(self, bs, M, N, K, test_name, dtype="float16"):

def test_ccc(self):
self._test_ccc([77], M=256, N=64, K=128, test_name="static")
if detect_target().name() == "cuda":
self._test_ccc([1, 9, 101], M=256, N=64, K=128, test_name="dynamic_b")
self._test_ccc([1, 9, 101], M=256, N=64, K=128, test_name="dynamic_b")

def test_ccc_rocm(self):
self._test_ccc([77], M=256, N=64, K=128, test_name="static")

def test_bmm_0_fp32_sm80(self, dtype="float32"):
self._test_rcr([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
self._test_rcr(
[1, 5, 77, 128],
[32],
N=16,
K=64,
test_name=f"dynamic_b_{dtype}",
dtype=dtype,
)
self._test_crr(
[1, 2, 5],
[3, 6, 8],
M=24,
N=64,
test_name=f"dynamic_bk_{dtype}",
dtype=dtype,
)
self._test_rrr(
[8], [4, 7, 9], K=64, N=32, test_name=f"dynamic_m_{dtype}", dtype=dtype
)
self._test_ccr(
[1, 9, 11], M=64, N=32, K=16, test_name=f"dynamic_b_{dtype}", dtype=dtype
)

@parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
def test_bmm_0_dtype(self, dtype):
def test_bmm_0_bf16(self, dtype="bfloat16"):
self._test_rcr([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
self._test_rcr(
[1, 5, 77, 128],
@@ -332,9 +358,32 @@ def test_bmm_0_dtype(self, dtype):
[1, 9, 11], M=64, N=32, K=16, test_name=f"dynamic_b_{dtype}", dtype=dtype
)

@parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
@unittest.skipIf(detect_target().name() == "rocm", "Not supported by ROCM.")
def test_bmm_1_dtype(self, dtype):
def test_bmm_1_fp32_sm80(self, dtype="float32"):
self._test_rcc([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
self._test_rcc(
[1, 5, 77, 128],
[32],
N=16,
K=64,
test_name=f"dynamic_b_{dtype}",
dtype=dtype,
)
self._test_crc(
[1, 2, 5],
[3, 6, 8],
M=24,
N=64,
test_name=f"dynamic_bk_{dtype}",
dtype=dtype,
)
self._test_rrc(
[8], [4, 7, 9], K=64, N=32, test_name=f"dynamic_m_{dtype}", dtype=dtype
)
self._test_ccc(
[1, 9, 11], M=64, N=32, K=16, test_name=f"dynamic_b_{dtype}", dtype=dtype
)

def test_bmm_1_bf16(self, dtype="bfloat16"):
self._test_rcc([128], [64], N=8, K=64, test_name=f"static_{dtype}", dtype=dtype)
self._test_rcc(
[1, 5, 77, 128],
@@ -727,8 +776,7 @@ def test_ccc(self):
self._test_ccr([8, 16], [8, 32, 8], "2d_broadcastable_a")
self._test_ccr([8, 8, 16], [32, 8], "2d_broadcastable_b")

@parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
def test_bmm_broadcast_0_dtype(self, dtype):
def test_bmm_broadcast_0_fp32_sm80(self, dtype="float32"):
self._test_rcr([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
self._test_rcr([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
self._test_crr([1, 8, 16], [2, 8, 32], f"broadcastable_a_{dtype}", dtype=dtype)
@@ -738,8 +786,27 @@ def test_bmm_broadcast_0_dtype(self, dtype):
self._test_ccr([1, 8, 16], [2, 32, 8], f"broadcastable_a_{dtype}", dtype=dtype)
self._test_ccr([8, 8, 16], [32, 8], f"2d_broadcastable_b_{dtype}", dtype=dtype)

@parameterized.expand(**filter_test_cases_by_params(_TEST_PARAMS))
def test_bmm_broadcast_1_dtype(self, dtype):
def test_bmm_broadcast_0_bf16(self, dtype="bfloat16"):
self._test_rcr([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
self._test_rcr([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
self._test_crr([1, 8, 16], [2, 8, 32], f"broadcastable_a_{dtype}", dtype=dtype)
self._test_crr([8, 8, 16], [8, 32], f"2d_broadcastable_b_{dtype}", dtype=dtype)
self._test_rrr([2, 16, 8], [1, 8, 32], f"broadcastable_b_{dtype}", dtype=dtype)
self._test_rrr([16, 8], [8, 8, 32], f"2d_broadcastable_a_{dtype}", dtype=dtype)
self._test_ccr([1, 8, 16], [2, 32, 8], f"broadcastable_a_{dtype}", dtype=dtype)
self._test_ccr([8, 8, 16], [32, 8], f"2d_broadcastable_b_{dtype}", dtype=dtype)

def test_bmm_broadcast_1_fp32_sm80(self, dtype="float32"):
self._test_rcc([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
self._test_rcc([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
self._test_crc([1, 8, 16], [2, 8, 32], f"broadcastable_a_{dtype}", dtype=dtype)
self._test_crc([8, 8, 16], [8, 32], f"2d_broadcastable_b_{dtype}", dtype=dtype)
self._test_rrc([2, 16, 8], [1, 8, 32], f"broadcastable_b_{dtype}", dtype=dtype)
self._test_rrc([16, 8], [8, 8, 32], f"2d_broadcastable_a_{dtype}", dtype=dtype)
self._test_ccc([1, 8, 16], [2, 32, 8], f"broadcastable_a_{dtype}", dtype=dtype)
self._test_ccc([8, 8, 16], [32, 8], f"2d_broadcastable_b_{dtype}", dtype=dtype)

def test_bmm_broadcast_1_bf16(self, dtype="bfloat16"):
self._test_rcc([2, 16, 8], [1, 32, 8], f"broadcastable_b_{dtype}", dtype=dtype)
self._test_rcc([16, 8], [8, 32, 8], f"2d_broadcastable_a_{dtype}", dtype=dtype)
self._test_crc([1, 8, 16], [2, 8, 32], f"broadcastable_a_{dtype}", dtype=dtype)
@@ -772,7 +839,7 @@ def test_rcr_fail(self, dtype="float16"):
try:
module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
raise AssertionError(
"Shouldn't be able to run be imcompatible tensor shape!"
"Shouldn't be able to run be incompatible tensor shape!"
)
except RuntimeError:
pass
@@ -800,7 +867,7 @@ def test_rrr_fail(self, dtype="float16"):
try:
module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
raise AssertionError(
"Shouldn't be able to run be imcompatible tensor shape!"
"Shouldn't be able to run be incompatible tensor shape!"
)
except RuntimeError:
pass
@@ -828,11 +895,15 @@ def test_rcc_fail(self, dtype="float16"):
try:
module.run_with_tensors({"input_0": X_pt, "input_1": W_pt}, [y])
raise AssertionError(
"Shouldn't be able to run be imcompatible tensor shape!"
"Shouldn't be able to run be incompatible tensor shape!"
)
except RuntimeError:
pass


filter_test_cases_by_test_env(BMMTestCase)
filter_test_cases_by_test_env(BMMBroadcastTestCase)


if __name__ == "__main__":
unittest.main()
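The new scheme relies on the method-name suffixes above (`_sm80`, `_bf16`, `_rocm`) together with the module-level `filter_test_cases_by_test_env` calls at the end of the file. As a rough, hypothetical illustration of that idea only (the helper `filter_by_env_suffix`, the suffix table, and the demo class below are assumptions, not AITemplate's actual implementation), a suffix-based filter could look roughly like this:

```python
import unittest

# Assumed suffix set for this sketch; AITemplate's real helper may differ.
_ENV_SUFFIXES = ("_sm80", "_rocm", "_bf16")


def filter_by_env_suffix(test_class, enabled_suffixes):
    """Remove test_* methods whose suffix is recognized but not enabled."""
    for name in list(vars(test_class)):
        if not name.startswith("test_"):
            continue
        suffix = next((s for s in _ENV_SUFFIXES if name.endswith(s)), None)
        # Tests without a recognized suffix run everywhere; suffixed tests
        # run only when their environment is enabled on this host.
        if suffix is not None and suffix not in enabled_suffixes:
            delattr(test_class, name)


class DemoTestCase(unittest.TestCase):
    def test_generic(self):
        pass  # runs in every environment

    def test_heavy_sm80(self):
        pass  # intended for A100-class (SM80) hosts only

    def test_amd_rocm(self):
        pass  # intended for ROCm hosts only


# Example: on an SM80 host, keep the generic and _sm80 tests, drop _rocm.
filter_by_env_suffix(DemoTestCase, enabled_suffixes={"_sm80", "_bf16"})

if __name__ == "__main__":
    unittest.main()
```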