From 9c6f205368355516965d3f7dcad5e519d7e57db5 Mon Sep 17 00:00:00 2001 From: lijialin03 Date: Tue, 4 Nov 2025 12:45:52 +0000 Subject: [PATCH 1/3] [API compatibility] update paddle LayerNorm api --- python/paddle/nn/layer/norm.py | 61 +++- .../test_layer_norm_op_v2_dygraph.py | 296 ++++++++++++++++++ 2 files changed, 352 insertions(+), 5 deletions(-) create mode 100644 test/legacy_test/test_layer_norm_op_v2_dygraph.py diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index b0315dd8936891..f26599bfbec648 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -35,6 +35,7 @@ from paddle import _C_ops, in_dynamic_mode, pir_utils from paddle.device import get_all_custom_device_type +from paddle.utils.decorator_utils import param_one_alias from ...base import dygraph_utils from ...base.data_feeder import check_variable_and_dtype @@ -64,6 +65,7 @@ DataLayoutND, DTypeLike, ParamAttrLike, + PlaceLike, ShapeLike, ) @@ -602,13 +604,34 @@ class LayerNorm(Layer): which is expected to be of that specific size. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-05. + alias: ``eps``. + elementwise_affine(bool, optional): Whether to apply element-wise affine transformation + (i.e., learnable scale and bias). If set to ``False``, both the scale (:math:`g`) and + bias (:math:`b`) parameters will be disabled, regardless of the settings of `weight_attr` + and `bias_attr`. This parameter acts as a master switch. Defaults to True. + **Note: This argument must be passed as a keyword argument.** + bias(bool, optional): Whether to include a learnable bias term in the layer. This setting + only takes effect when `elementwise_affine` is ``True``. If set to ``False``, no bias + parameter will be created, even if `bias_attr` is specified. Defaults to True. + **Note: This argument must be passed as a keyword argument.** weight_attr(ParamAttr|bool|None, optional): The parameter attribute for the learnable - gain :math:`g`. If False, weight is None. If is None, a default :code:`ParamAttr` would be added as scale. The - :attr:`param_attr` is initialized as 1 if it is added. Default: None. For more information, please refer to :ref:`api_paddle_ParamAttr` . + gain :math:`g` (scale). This setting only takes effect when `elementwise_affine` is ``True``. + - If set to ``False``, no gain parameter will be created. + - If set to ``None`` or ``True``, a default :code:`ParamAttr` will be used, and the + parameter will be initialized to 1. + - If set to a custom :code:`ParamAttr` object, it will be used to configure the parameter. + Default: None. + **Note: This argument must be passed as a keyword argument.** bias_attr(ParamAttr|bool|None, optional): The parameter attribute for the learnable - bias :math:`b`. If is False, bias is None. If is None, a default :code:`ParamAttr` would be added as bias. The - :attr:`bias_attr` is initialized as 0 if it is added. Default: None. For more information, please refer to :ref:`api_paddle_ParamAttr` . + bias :math:`b`. This setting only takes effect when both `elementwise_affine` and `bias` are ``True``. + - If set to ``False``, no bias parameter will be created. + - If set to ``None`` or ``True``, a default :code:`ParamAttr` will be used, and the + parameter will be initialized to 0. + - If set to a custom :code:`ParamAttr` object, it will be used to configure the parameter. + Default: None. 
+ **Note: This argument must be passed as a keyword argument.** name(str|None, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name` . + **Note: This argument must be passed as a keyword argument.** Shape: - x: 2-D, 3-D, 4-D or 5-D tensor. @@ -642,10 +665,16 @@ class LayerNorm(Layer): weight: Tensor | None bias: Tensor | None + @param_one_alias(["epsilon", "eps"]) def __init__( self, normalized_shape: int | Sequence[int], epsilon: float = 1e-5, + *, + elementwise_affine: bool = True, + bias: bool = True, + device: PlaceLike | None = None, + dtype: DTypeLike | None = None, weight_attr: bool | ParamAttr | None = None, bias_attr: bool | ParamAttr | None = None, name: str | None = None, @@ -656,6 +685,21 @@ def __init__( self._normalized_shape = list(normalized_shape) self._epsilon = epsilon + self._device = device + self._dtype = ( + self._helper.get_default_dtype() if dtype is None else dtype + ) + + if not elementwise_affine: + weight_attr = False + bias_attr = False + else: + weight_attr = weight_attr if weight_attr is not False else None + if not bias: + bias_attr = False + else: + bias_attr = bias_attr if bias_attr is not False else None + self._weight_attr = weight_attr self._bias_attr = bias_attr param_shape = [np.prod(self._normalized_shape)] @@ -665,15 +709,22 @@ def __init__( else: self.weight = self.create_parameter( attr=self._weight_attr, + dtype=self._dtype, shape=param_shape, default_initializer=Constant(1.0), + device=self._device, ) if bias_attr is False: self.bias = None else: self.bias = self.create_parameter( - attr=self._bias_attr, shape=param_shape, is_bias=True + attr=self._bias_attr, + dtype=self._dtype, + shape=param_shape, + default_initializer=Constant(0.0), + device=self._device, + is_bias=True, ) def forward(self, input: Tensor) -> Tensor: diff --git a/test/legacy_test/test_layer_norm_op_v2_dygraph.py b/test/legacy_test/test_layer_norm_op_v2_dygraph.py new file mode 100644 index 00000000000000..5e759e9b7a3ec0 --- /dev/null +++ b/test/legacy_test/test_layer_norm_op_v2_dygraph.py @@ -0,0 +1,296 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +from functools import reduce +from operator import mul + +import numpy as np + +import paddle +from paddle import nn + + +def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): + x_shape = x.shape + N = reduce(mul, x_shape[0:begin_norm_axis], 1) + D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1) + x.shape = [N, D] + + mean = np.mean(x, axis=1) + var = np.var(x, axis=1) + epsilon + output = np.divide( + (x - mean.reshape([N, 1])), (np.sqrt(var)).reshape([N, 1]) + ) + if scale is not None: + output = scale.reshape([1, D]) * output + if beta is not None: + output = output + beta.reshape([1, D]) + + x.shape, output.shape = x_shape, x_shape + return output + + +class TestLayerNormOp(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.x_shape = [2, 6, 6, 3] + self.epsilon = 1e-5 + self.begin_norm_axis = 1 + + @unittest.skipIf( + not paddle.in_dynamic_mode(), "test is only for dynamic mode" + ) + def test_basic_fp32(self): + """test basic functionality with float32.""" + x_np = np.random.random(self.x_shape).astype('float32') + scale_np = np.random.random( + self.x_shape[self.begin_norm_axis :] + ).astype('float32') + bias_np = np.random.random(self.x_shape[self.begin_norm_axis :]).astype( + 'float32' + ) + scale = paddle.to_tensor(scale_np).reshape(-1) + bias = paddle.to_tensor(bias_np).reshape(-1) + + ln = nn.LayerNorm( + normalized_shape=self.x_shape[self.begin_norm_axis :], + weight_attr=nn.initializer.Assign(scale), + bias_attr=nn.initializer.Assign(bias), + epsilon=self.epsilon, + ) + + x_pd = paddle.to_tensor(x_np) + y_pd = ln(x_pd) + expect_res = _reference_layer_norm_naive( + x_np, scale_np, bias_np, self.epsilon, self.begin_norm_axis + ) + + np.testing.assert_allclose( + y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 + ) + + @unittest.skipIf( + not paddle.in_dynamic_mode(), "test is only for dynamic mode" + ) + def test_no_scale_no_bias_fp32(self): + """test the case when both scale and bias are disabled (FP32).""" + x_np = np.random.random(self.x_shape).astype('float32') + x_pd = paddle.to_tensor(x_np) + + ln = nn.LayerNorm( + normalized_shape=self.x_shape[self.begin_norm_axis :], + elementwise_affine=False, + epsilon=self.epsilon, + ) + y_pd = ln(x_pd) + + expect_res = _reference_layer_norm_naive( + x_np, None, None, self.epsilon, self.begin_norm_axis + ) + np.testing.assert_allclose( + y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 + ) + + @unittest.skipIf( + not paddle.in_dynamic_mode(), "test is only for dynamic mode" + ) + def test_with_scale_no_bias_fp32(self): + """test the case when only scale is enabled (FP32).""" + x_np = np.random.random(self.x_shape).astype('float32') + scale_np = np.random.random( + self.x_shape[self.begin_norm_axis :] + ).astype('float32') + scale = paddle.to_tensor(scale_np).reshape(-1) + + ln = nn.LayerNorm( + normalized_shape=self.x_shape[self.begin_norm_axis :], + elementwise_affine=True, + bias_attr=False, + epsilon=self.epsilon, + ) + with paddle.no_grad(): + ln.weight.set_value(scale) + + x_pd = paddle.to_tensor(x_np) + y_pd = ln(x_pd) + + expect_res = _reference_layer_norm_naive( + x_np, scale_np, None, self.epsilon, self.begin_norm_axis + ) + np.testing.assert_allclose( + y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 + ) + + @unittest.skipIf( + not paddle.in_dynamic_mode(), "test is only for dynamic mode" + ) + def test_no_scale_with_bias_fp32(self): + """test the case when only bias is enabled (FP32).""" + x_np = np.random.random(self.x_shape).astype('float32') + bias_np = 
np.random.random(self.x_shape[self.begin_norm_axis :]).astype( + 'float32' + ) + bias = paddle.to_tensor(bias_np).reshape(-1) + + ln = nn.LayerNorm( + normalized_shape=self.x_shape[self.begin_norm_axis :], + elementwise_affine=True, + weight_attr=False, + epsilon=self.epsilon, + ) + with paddle.no_grad(): + ln.bias.set_value(bias) + + x_pd = paddle.to_tensor(x_np) + y_pd = ln(x_pd) + + expect_res = _reference_layer_norm_naive( + x_np, None, bias_np, self.epsilon, self.begin_norm_axis + ) + np.testing.assert_allclose( + y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 + ) + + def test_bf16_forward_backward(self): + """test forward and backward pass with bfloat16 precision.""" + place = paddle.CUDAPlace(0) + + with paddle.base.dygraph.guard(place): + x_np = np.random.random(self.x_shape).astype('float32') + scale_np = np.random.random( + self.x_shape[self.begin_norm_axis :] + ).astype('float32') + bias_np = np.random.random( + self.x_shape[self.begin_norm_axis :] + ).astype('float32') + + x = paddle.to_tensor(x_np).cast(paddle.bfloat16) + x.stop_gradient = False + + scale = paddle.to_tensor(scale_np).cast(paddle.bfloat16).reshape(-1) + bias = paddle.to_tensor(bias_np).cast(paddle.bfloat16).reshape(-1) + + ln = nn.LayerNorm( + normalized_shape=self.x_shape[self.begin_norm_axis :], + weight_attr=nn.initializer.Assign(scale), + bias_attr=nn.initializer.Assign(bias), + epsilon=self.epsilon, + ) + ln.to(device='cuda') + + y = ln(x) + loss = y.sum() + loss.backward() + + self.assertIsNotNone(x.grad) + self.assertIsNotNone(ln.weight.grad) + self.assertIsNotNone(ln.bias.grad) + + +class TestLayerNormParam(unittest.TestCase): + def setUp(self): + self.normalized_shape = [6] + self.x_tensor = paddle.randn([2, 4, 4, 6]) + + def test_elementwise_affine_false(self): + """test that when elementwise_affine=False, no learnable parameters are created.""" + layer = nn.LayerNorm( + normalized_shape=self.normalized_shape, elementwise_affine=False + ) + self.assertIsNone(layer.weight) + self.assertIsNone(layer.bias) + + out = layer(self.x_tensor) + self.assertEqual(out.shape, self.x_tensor.shape) + + @unittest.skipIf( + not paddle.in_dynamic_mode(), "test is only for dynamic mode" + ) + def test_elementwise_affine_true(self): + """test that when elementwise_affine=True and attr=None, parameters are created with default initialization.""" + layer = nn.LayerNorm( + normalized_shape=self.normalized_shape, elementwise_affine=True + ) + self.assertIsNotNone(layer.weight) + self.assertIsNotNone(layer.bias) + + expected_weight = paddle.ones([6]) + expected_bias = paddle.zeros([6]) + self.assertTrue(paddle.allclose(layer.weight, expected_weight)) + self.assertTrue(paddle.allclose(layer.bias, expected_bias)) + + def test_bias_false(self): + """test that when bias=False, the bias parameter is disabled even if elementwise_affine=True.""" + layer = nn.LayerNorm( + normalized_shape=self.normalized_shape, + elementwise_affine=True, + bias=False, + ) + self.assertIsNotNone(layer.weight) + self.assertIsNone(layer.bias) + + @unittest.skipIf( + not paddle.in_dynamic_mode(), "test is only for dynamic mode" + ) + def test_attr_custom_initialization(self): + """test that weight_attr and bias_attr can be used to customize the initialization of the weight parameter.""" + weight_attr = paddle.nn.initializer.Constant(value=2.0) + bias_attr = paddle.nn.initializer.Constant(value=3.0) + layer = nn.LayerNorm( + normalized_shape=self.normalized_shape, + elementwise_affine=True, + weight_attr=weight_attr, + bias_attr=bias_attr, + ) + + 
expected_weight = paddle.full([6], 2.0) + expected_bias = paddle.full([6], 3.0) + self.assertTrue(paddle.allclose(layer.weight, expected_weight)) + self.assertTrue(paddle.allclose(layer.bias, expected_bias)) + + def test_alias(self): + """test parameter alias epsilon/eps""" + layer_epsilon = nn.LayerNorm( + normalized_shape=self.normalized_shape, + elementwise_affine=True, + epsilon=1e-5, + ) + layer_eps = nn.LayerNorm( + normalized_shape=self.normalized_shape, + elementwise_affine=True, + eps=1e-5, + ) + + out_epsilon = layer_epsilon(self.x_tensor) + out_eps = layer_eps(self.x_tensor) + + np.testing.assert_array_equal(out_epsilon.numpy(), out_eps.numpy()) + + def test_errors(self): + """test for errors.""" + layer_norm = nn.LayerNorm(self.normalized_shape) + x1 = np.random.random([3, *self.normalized_shape]).astype('float32') + with self.assertRaises(ValueError): + layer_norm(x1) + with self.assertRaises(TypeError): + nn.LayerNorm(self.normalized_shape, 1e-5, None, None, "name") + with self.assertRaises(TypeError): + nn.LayerNorm( + self.normalized_shape, 1e-5, False, "cpu", paddle.float32 + ) + + +if __name__ == '__main__': + unittest.main() From e8c1f8a0ae7ee4af109f7b247c810d2d25f91638 Mon Sep 17 00:00:00 2001 From: lijialin03 Date: Mon, 10 Nov 2025 07:19:59 +0000 Subject: [PATCH 2/3] update:modify tests to fit static mode --- .../test_layer_norm_op_v2_dygraph.py | 360 +++++++++--------- 1 file changed, 182 insertions(+), 178 deletions(-) diff --git a/test/legacy_test/test_layer_norm_op_v2_dygraph.py b/test/legacy_test/test_layer_norm_op_v2_dygraph.py index 5e759e9b7a3ec0..a6d8b9dd00f357 100644 --- a/test/legacy_test/test_layer_norm_op_v2_dygraph.py +++ b/test/legacy_test/test_layer_norm_op_v2_dygraph.py @@ -17,6 +17,7 @@ from operator import mul import numpy as np +from op_test import get_places import paddle from paddle import nn @@ -48,161 +49,161 @@ def setUp(self): self.x_shape = [2, 6, 6, 3] self.epsilon = 1e-5 self.begin_norm_axis = 1 + self.places = get_places() - @unittest.skipIf( - not paddle.in_dynamic_mode(), "test is only for dynamic mode" - ) def test_basic_fp32(self): """test basic functionality with float32.""" - x_np = np.random.random(self.x_shape).astype('float32') - scale_np = np.random.random( - self.x_shape[self.begin_norm_axis :] - ).astype('float32') - bias_np = np.random.random(self.x_shape[self.begin_norm_axis :]).astype( - 'float32' - ) - scale = paddle.to_tensor(scale_np).reshape(-1) - bias = paddle.to_tensor(bias_np).reshape(-1) - - ln = nn.LayerNorm( - normalized_shape=self.x_shape[self.begin_norm_axis :], - weight_attr=nn.initializer.Assign(scale), - bias_attr=nn.initializer.Assign(bias), - epsilon=self.epsilon, - ) - - x_pd = paddle.to_tensor(x_np) - y_pd = ln(x_pd) - expect_res = _reference_layer_norm_naive( - x_np, scale_np, bias_np, self.epsilon, self.begin_norm_axis - ) - - np.testing.assert_allclose( - y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 - ) + for place in self.places: + with paddle.base.dygraph.guard(place): + x_np = np.random.random(self.x_shape).astype('float32') + scale_np = np.random.random( + self.x_shape[self.begin_norm_axis :] + ).astype('float32') + bias_np = np.random.random( + self.x_shape[self.begin_norm_axis :] + ).astype('float32') + scale = paddle.to_tensor(scale_np).reshape(-1) + bias = paddle.to_tensor(bias_np).reshape(-1) + + ln = nn.LayerNorm( + normalized_shape=self.x_shape[self.begin_norm_axis :], + weight_attr=nn.initializer.Assign(scale), + bias_attr=nn.initializer.Assign(bias), + epsilon=self.epsilon, + ) + + x_pd = 
paddle.to_tensor(x_np) + y_pd = ln(x_pd) + expect_res = _reference_layer_norm_naive( + x_np, scale_np, bias_np, self.epsilon, self.begin_norm_axis + ) + + np.testing.assert_allclose( + y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 + ) - @unittest.skipIf( - not paddle.in_dynamic_mode(), "test is only for dynamic mode" - ) def test_no_scale_no_bias_fp32(self): """test the case when both scale and bias are disabled (FP32).""" - x_np = np.random.random(self.x_shape).astype('float32') - x_pd = paddle.to_tensor(x_np) - - ln = nn.LayerNorm( - normalized_shape=self.x_shape[self.begin_norm_axis :], - elementwise_affine=False, - epsilon=self.epsilon, - ) - y_pd = ln(x_pd) - - expect_res = _reference_layer_norm_naive( - x_np, None, None, self.epsilon, self.begin_norm_axis - ) - np.testing.assert_allclose( - y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 - ) + for place in self.places: + with paddle.base.dygraph.guard(place): + x_np = np.random.random(self.x_shape).astype('float32') + x_pd = paddle.to_tensor(x_np) + + ln = nn.LayerNorm( + normalized_shape=self.x_shape[self.begin_norm_axis :], + elementwise_affine=False, + epsilon=self.epsilon, + ) + y_pd = ln(x_pd) + + expect_res = _reference_layer_norm_naive( + x_np, None, None, self.epsilon, self.begin_norm_axis + ) + np.testing.assert_allclose( + y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 + ) - @unittest.skipIf( - not paddle.in_dynamic_mode(), "test is only for dynamic mode" - ) def test_with_scale_no_bias_fp32(self): """test the case when only scale is enabled (FP32).""" - x_np = np.random.random(self.x_shape).astype('float32') - scale_np = np.random.random( - self.x_shape[self.begin_norm_axis :] - ).astype('float32') - scale = paddle.to_tensor(scale_np).reshape(-1) - - ln = nn.LayerNorm( - normalized_shape=self.x_shape[self.begin_norm_axis :], - elementwise_affine=True, - bias_attr=False, - epsilon=self.epsilon, - ) - with paddle.no_grad(): - ln.weight.set_value(scale) - - x_pd = paddle.to_tensor(x_np) - y_pd = ln(x_pd) - - expect_res = _reference_layer_norm_naive( - x_np, scale_np, None, self.epsilon, self.begin_norm_axis - ) - np.testing.assert_allclose( - y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 - ) + for place in self.places: + with paddle.base.dygraph.guard(place): + x_np = np.random.random(self.x_shape).astype('float32') + scale_np = np.random.random( + self.x_shape[self.begin_norm_axis :] + ).astype('float32') + scale = paddle.to_tensor(scale_np).reshape(-1) + + ln = nn.LayerNorm( + normalized_shape=self.x_shape[self.begin_norm_axis :], + elementwise_affine=True, + bias_attr=False, + epsilon=self.epsilon, + ) + with paddle.no_grad(): + ln.weight.set_value(scale) + + x_pd = paddle.to_tensor(x_np) + y_pd = ln(x_pd) + + expect_res = _reference_layer_norm_naive( + x_np, scale_np, None, self.epsilon, self.begin_norm_axis + ) + np.testing.assert_allclose( + y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 + ) - @unittest.skipIf( - not paddle.in_dynamic_mode(), "test is only for dynamic mode" - ) def test_no_scale_with_bias_fp32(self): """test the case when only bias is enabled (FP32).""" - x_np = np.random.random(self.x_shape).astype('float32') - bias_np = np.random.random(self.x_shape[self.begin_norm_axis :]).astype( - 'float32' - ) - bias = paddle.to_tensor(bias_np).reshape(-1) - - ln = nn.LayerNorm( - normalized_shape=self.x_shape[self.begin_norm_axis :], - elementwise_affine=True, - weight_attr=False, - epsilon=self.epsilon, - ) - with paddle.no_grad(): - ln.bias.set_value(bias) - - x_pd = paddle.to_tensor(x_np) - y_pd = ln(x_pd) - 
- expect_res = _reference_layer_norm_naive( - x_np, None, bias_np, self.epsilon, self.begin_norm_axis - ) - np.testing.assert_allclose( - y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 - ) + for place in self.places: + with paddle.base.dygraph.guard(place): + x_np = np.random.random(self.x_shape).astype('float32') + bias_np = np.random.random( + self.x_shape[self.begin_norm_axis :] + ).astype('float32') + bias = paddle.to_tensor(bias_np).reshape(-1) + + ln = nn.LayerNorm( + normalized_shape=self.x_shape[self.begin_norm_axis :], + elementwise_affine=True, + weight_attr=False, + epsilon=self.epsilon, + ) + with paddle.no_grad(): + ln.bias.set_value(bias) + + x_pd = paddle.to_tensor(x_np) + y_pd = ln(x_pd) + + expect_res = _reference_layer_norm_naive( + x_np, None, bias_np, self.epsilon, self.begin_norm_axis + ) + np.testing.assert_allclose( + y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 + ) def test_bf16_forward_backward(self): """test forward and backward pass with bfloat16 precision.""" - place = paddle.CUDAPlace(0) - - with paddle.base.dygraph.guard(place): - x_np = np.random.random(self.x_shape).astype('float32') - scale_np = np.random.random( - self.x_shape[self.begin_norm_axis :] - ).astype('float32') - bias_np = np.random.random( - self.x_shape[self.begin_norm_axis :] - ).astype('float32') - - x = paddle.to_tensor(x_np).cast(paddle.bfloat16) - x.stop_gradient = False - - scale = paddle.to_tensor(scale_np).cast(paddle.bfloat16).reshape(-1) - bias = paddle.to_tensor(bias_np).cast(paddle.bfloat16).reshape(-1) - - ln = nn.LayerNorm( - normalized_shape=self.x_shape[self.begin_norm_axis :], - weight_attr=nn.initializer.Assign(scale), - bias_attr=nn.initializer.Assign(bias), - epsilon=self.epsilon, - ) - ln.to(device='cuda') - - y = ln(x) - loss = y.sum() - loss.backward() - - self.assertIsNotNone(x.grad) - self.assertIsNotNone(ln.weight.grad) - self.assertIsNotNone(ln.bias.grad) + for place in self.places: + with paddle.base.dygraph.guard(place): + x_np = np.random.random(self.x_shape).astype('float32') + scale_np = np.random.random( + self.x_shape[self.begin_norm_axis :] + ).astype('float32') + bias_np = np.random.random( + self.x_shape[self.begin_norm_axis :] + ).astype('float32') + + x = paddle.to_tensor(x_np).cast(paddle.bfloat16) + x.stop_gradient = False + + scale = ( + paddle.to_tensor(scale_np).cast(paddle.bfloat16).reshape(-1) + ) + bias = ( + paddle.to_tensor(bias_np).cast(paddle.bfloat16).reshape(-1) + ) + + ln = nn.LayerNorm( + normalized_shape=self.x_shape[self.begin_norm_axis :], + weight_attr=nn.initializer.Assign(scale), + bias_attr=nn.initializer.Assign(bias), + epsilon=self.epsilon, + ) + + y = ln(x) + loss = y.sum() + loss.backward() + + self.assertIsNotNone(x.grad) + self.assertIsNotNone(ln.weight.grad) + self.assertIsNotNone(ln.bias.grad) class TestLayerNormParam(unittest.TestCase): def setUp(self): self.normalized_shape = [6] self.x_tensor = paddle.randn([2, 4, 4, 6]) + self.places = get_places() def test_elementwise_affine_false(self): """test that when elementwise_affine=False, no learnable parameters are created.""" @@ -215,21 +216,21 @@ def test_elementwise_affine_false(self): out = layer(self.x_tensor) self.assertEqual(out.shape, self.x_tensor.shape) - @unittest.skipIf( - not paddle.in_dynamic_mode(), "test is only for dynamic mode" - ) def test_elementwise_affine_true(self): """test that when elementwise_affine=True and attr=None, parameters are created with default initialization.""" - layer = nn.LayerNorm( - normalized_shape=self.normalized_shape, 
elementwise_affine=True - ) - self.assertIsNotNone(layer.weight) - self.assertIsNotNone(layer.bias) - - expected_weight = paddle.ones([6]) - expected_bias = paddle.zeros([6]) - self.assertTrue(paddle.allclose(layer.weight, expected_weight)) - self.assertTrue(paddle.allclose(layer.bias, expected_bias)) + for place in self.places: + with paddle.base.dygraph.guard(place): + layer = nn.LayerNorm( + normalized_shape=self.normalized_shape, + elementwise_affine=True, + ) + self.assertIsNotNone(layer.weight) + self.assertIsNotNone(layer.bias) + + expected_weight = paddle.ones([6]) + expected_bias = paddle.zeros([6]) + self.assertTrue(paddle.allclose(layer.weight, expected_weight)) + self.assertTrue(paddle.allclose(layer.bias, expected_bias)) def test_bias_false(self): """test that when bias=False, the bias parameter is disabled even if elementwise_affine=True.""" @@ -241,42 +242,45 @@ def test_bias_false(self): self.assertIsNotNone(layer.weight) self.assertIsNone(layer.bias) - @unittest.skipIf( - not paddle.in_dynamic_mode(), "test is only for dynamic mode" - ) def test_attr_custom_initialization(self): """test that weight_attr and bias_attr can be used to customize the initialization of the weight parameter.""" - weight_attr = paddle.nn.initializer.Constant(value=2.0) - bias_attr = paddle.nn.initializer.Constant(value=3.0) - layer = nn.LayerNorm( - normalized_shape=self.normalized_shape, - elementwise_affine=True, - weight_attr=weight_attr, - bias_attr=bias_attr, - ) - - expected_weight = paddle.full([6], 2.0) - expected_bias = paddle.full([6], 3.0) - self.assertTrue(paddle.allclose(layer.weight, expected_weight)) - self.assertTrue(paddle.allclose(layer.bias, expected_bias)) + for place in self.places: + with paddle.base.dygraph.guard(place): + weight_attr = paddle.nn.initializer.Constant(value=2.0) + bias_attr = paddle.nn.initializer.Constant(value=3.0) + layer = nn.LayerNorm( + normalized_shape=self.normalized_shape, + elementwise_affine=True, + weight_attr=weight_attr, + bias_attr=bias_attr, + ) + + expected_weight = paddle.full([6], 2.0) + expected_bias = paddle.full([6], 3.0) + self.assertTrue(paddle.allclose(layer.weight, expected_weight)) + self.assertTrue(paddle.allclose(layer.bias, expected_bias)) def test_alias(self): """test parameter alias epsilon/eps""" - layer_epsilon = nn.LayerNorm( - normalized_shape=self.normalized_shape, - elementwise_affine=True, - epsilon=1e-5, - ) - layer_eps = nn.LayerNorm( - normalized_shape=self.normalized_shape, - elementwise_affine=True, - eps=1e-5, - ) - - out_epsilon = layer_epsilon(self.x_tensor) - out_eps = layer_eps(self.x_tensor) - - np.testing.assert_array_equal(out_epsilon.numpy(), out_eps.numpy()) + for place in self.places: + with paddle.base.dygraph.guard(place): + layer_epsilon = nn.LayerNorm( + normalized_shape=self.normalized_shape, + elementwise_affine=True, + epsilon=1e-5, + ) + layer_eps = nn.LayerNorm( + normalized_shape=self.normalized_shape, + elementwise_affine=True, + eps=1e-5, + ) + + out_epsilon = layer_epsilon(self.x_tensor) + out_eps = layer_eps(self.x_tensor) + + np.testing.assert_array_equal( + out_epsilon.numpy(), out_eps.numpy() + ) def test_errors(self): """test for errors.""" From 9ce0b33a2c55f9749b9423656ebd90079928e136 Mon Sep 17 00:00:00 2001 From: lijialin03 Date: Mon, 10 Nov 2025 09:46:21 +0000 Subject: [PATCH 3/3] update:optimize tests --- test/legacy_test/test_layer_norm_op_v2.py | 160 ++++++++++ .../test_layer_norm_op_v2_dygraph.py | 300 ------------------ 2 files changed, 160 insertions(+), 300 deletions(-) delete 
mode 100644 test/legacy_test/test_layer_norm_op_v2_dygraph.py

diff --git a/test/legacy_test/test_layer_norm_op_v2.py b/test/legacy_test/test_layer_norm_op_v2.py
index a8bfec46252114..652d9d8194aa24 100644
--- a/test/legacy_test/test_layer_norm_op_v2.py
+++ b/test/legacy_test/test_layer_norm_op_v2.py
@@ -159,6 +159,166 @@ def compute_v4(x):
     )
+
+class TestLayerNormParam(unittest.TestCase):
+    def setUp(self):
+        self.normalized_shape = [6]
+        self.x_shape = [2, 4, 4, 6]
+        self.epsilon = 1e-5
+        self.places = get_places()
+
+    def test_elementwise_affine_false(self):
+        """test that when elementwise_affine=False, weight and bias parameters are not created."""
+        for p in self.places:
+            with base.dygraph.guard(p):
+                layer = paddle.nn.LayerNorm(
+                    normalized_shape=self.normalized_shape,
+                    elementwise_affine=False,
+                )
+                self.assertIsNone(
+                    layer.weight,
+                    "Weight should be None when elementwise_affine=False",
+                )
+                self.assertIsNone(
+                    layer.bias,
+                    "Bias should be None when elementwise_affine=False",
+                )
+
+                x_tensor = paddle.randn(self.x_shape)
+                out = layer(x_tensor)
+                self.assertEqual(out.shape, x_tensor.shape)
+
+    def test_elementwise_affine_true(self):
+        """test that when elementwise_affine=True and attr=None, parameters are created with default initialization."""
+        for place in self.places:
+            with paddle.base.dygraph.guard(place):
+                layer = paddle.nn.LayerNorm(
+                    normalized_shape=self.normalized_shape,
+                    elementwise_affine=True,
+                )
+                self.assertIsNotNone(
+                    layer.weight,
+                    "Weight should not be None when elementwise_affine=True",
+                )
+                self.assertIsNotNone(
+                    layer.bias,
+                    "Bias should not be None when elementwise_affine=True",
+                )
+
+                expected_weight = paddle.ones(self.normalized_shape)
+                expected_bias = paddle.zeros(self.normalized_shape)
+
+                self.assertTrue(paddle.allclose(layer.weight, expected_weight))
+                self.assertTrue(paddle.allclose(layer.bias, expected_bias))
+
+    def test_bias_false(self):
+        """test that when bias=False, the bias parameter is disabled even if elementwise_affine=True."""
+        for p in self.places:
+            with base.dygraph.guard(p):
+                layer = paddle.nn.LayerNorm(
+                    normalized_shape=self.normalized_shape,
+                    elementwise_affine=True,
+                    bias=False,
+                )
+                self.assertIsNotNone(
+                    layer.weight,
+                    "Weight should exist when elementwise_affine=True",
+                )
+                self.assertIsNone(
+                    layer.bias, "Bias should be None when bias=False"
+                )
+
+    def test_weight_and_bias_false(self):
+        """test that when elementwise_affine=True, weight_attr=False and bias_attr=False are overridden and both parameters are still created."""
+        for p in self.places:
+            with base.dygraph.guard(p):
+                layer = paddle.nn.LayerNorm(
+                    normalized_shape=self.normalized_shape,
+                    elementwise_affine=True,
+                    weight_attr=False,
+                    bias_attr=False,
+                )
+                self.assertIsNotNone(
+                    layer.weight,
+                    "Weight should not be None when elementwise_affine=True although weight_attr=False",
+                )
+                self.assertIsNotNone(
+                    layer.bias,
+                    "Bias should not be None when elementwise_affine=True although bias_attr=False",
+                )
+
+    def test_custom_initialization(self):
+        """test custom initialization using weight_attr and bias_attr."""
+        for p in self.places:
+            with base.dygraph.guard(p):
+                weight_val = 2.5
+                bias_val = -1.0
+                weight_initializer = paddle.nn.initializer.Constant(
+                    value=weight_val
+                )
+                bias_initializer = paddle.nn.initializer.Constant(
+                    value=bias_val
+                )
+
+                layer = paddle.nn.LayerNorm(
+                    normalized_shape=self.normalized_shape,
+                    elementwise_affine=True,
+                    weight_attr=weight_initializer,
+                    bias_attr=bias_initializer,
+                )
+
+                expected_weight = paddle.full(
+                    self.normalized_shape, 
weight_val, dtype=layer.weight.dtype + ) + expected_bias = paddle.full( + self.normalized_shape, bias_val, dtype=layer.bias.dtype + ) + + self.assertTrue( + paddle.allclose(layer.weight, expected_weight), + f"Weight initialization failed. Got {layer.weight.numpy()}, expected {expected_weight.numpy()}", + ) + self.assertTrue( + paddle.allclose(layer.bias, expected_bias), + f"Bias initialization failed. Got {layer.bias.numpy()}, expected {expected_bias.numpy()}", + ) + + def test_alias(self): + """test parameter alias epsilon/eps""" + for place in self.places: + with paddle.base.dygraph.guard(place): + layer_epsilon = paddle.nn.LayerNorm( + normalized_shape=self.normalized_shape, + elementwise_affine=True, + epsilon=1e-5, + ) + layer_eps = paddle.nn.LayerNorm( + normalized_shape=self.normalized_shape, + elementwise_affine=True, + eps=1e-5, + ) + + x_tensor = paddle.randn(self.x_shape) + out_epsilon = layer_epsilon(x_tensor) + out_eps = layer_eps(x_tensor) + + np.testing.assert_array_equal( + out_epsilon.numpy(), out_eps.numpy() + ) + + def test_errors(self): + """test for errors.""" + layer_norm = paddle.nn.LayerNorm(self.normalized_shape) + x1 = np.random.random([3, *self.normalized_shape]).astype('float32') + with self.assertRaises(TypeError): + layer_norm(x1) + with self.assertRaises(TypeError): + paddle.nn.LayerNorm(self.normalized_shape, 1e-5, None, None, "name") + with self.assertRaises(TypeError): + paddle.nn.LayerNorm( + self.normalized_shape, 1e-5, False, "cpu", paddle.float32 + ) + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/test/legacy_test/test_layer_norm_op_v2_dygraph.py b/test/legacy_test/test_layer_norm_op_v2_dygraph.py deleted file mode 100644 index a6d8b9dd00f357..00000000000000 --- a/test/legacy_test/test_layer_norm_op_v2_dygraph.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest -from functools import reduce -from operator import mul - -import numpy as np -from op_test import get_places - -import paddle -from paddle import nn - - -def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): - x_shape = x.shape - N = reduce(mul, x_shape[0:begin_norm_axis], 1) - D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1) - x.shape = [N, D] - - mean = np.mean(x, axis=1) - var = np.var(x, axis=1) + epsilon - output = np.divide( - (x - mean.reshape([N, 1])), (np.sqrt(var)).reshape([N, 1]) - ) - if scale is not None: - output = scale.reshape([1, D]) * output - if beta is not None: - output = output + beta.reshape([1, D]) - - x.shape, output.shape = x_shape, x_shape - return output - - -class TestLayerNormOp(unittest.TestCase): - def setUp(self): - paddle.disable_static() - self.x_shape = [2, 6, 6, 3] - self.epsilon = 1e-5 - self.begin_norm_axis = 1 - self.places = get_places() - - def test_basic_fp32(self): - """test basic functionality with float32.""" - for place in self.places: - with paddle.base.dygraph.guard(place): - x_np = np.random.random(self.x_shape).astype('float32') - scale_np = np.random.random( - self.x_shape[self.begin_norm_axis :] - ).astype('float32') - bias_np = np.random.random( - self.x_shape[self.begin_norm_axis :] - ).astype('float32') - scale = paddle.to_tensor(scale_np).reshape(-1) - bias = paddle.to_tensor(bias_np).reshape(-1) - - ln = nn.LayerNorm( - normalized_shape=self.x_shape[self.begin_norm_axis :], - weight_attr=nn.initializer.Assign(scale), - bias_attr=nn.initializer.Assign(bias), - epsilon=self.epsilon, - ) - - x_pd = paddle.to_tensor(x_np) - y_pd = ln(x_pd) - expect_res = _reference_layer_norm_naive( - x_np, scale_np, bias_np, self.epsilon, self.begin_norm_axis - ) - - np.testing.assert_allclose( - y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 - ) - - def test_no_scale_no_bias_fp32(self): - """test the case when both scale and bias are disabled (FP32).""" - for place in self.places: - with paddle.base.dygraph.guard(place): - x_np = np.random.random(self.x_shape).astype('float32') - x_pd = paddle.to_tensor(x_np) - - ln = nn.LayerNorm( - normalized_shape=self.x_shape[self.begin_norm_axis :], - elementwise_affine=False, - epsilon=self.epsilon, - ) - y_pd = ln(x_pd) - - expect_res = _reference_layer_norm_naive( - x_np, None, None, self.epsilon, self.begin_norm_axis - ) - np.testing.assert_allclose( - y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 - ) - - def test_with_scale_no_bias_fp32(self): - """test the case when only scale is enabled (FP32).""" - for place in self.places: - with paddle.base.dygraph.guard(place): - x_np = np.random.random(self.x_shape).astype('float32') - scale_np = np.random.random( - self.x_shape[self.begin_norm_axis :] - ).astype('float32') - scale = paddle.to_tensor(scale_np).reshape(-1) - - ln = nn.LayerNorm( - normalized_shape=self.x_shape[self.begin_norm_axis :], - elementwise_affine=True, - bias_attr=False, - epsilon=self.epsilon, - ) - with paddle.no_grad(): - ln.weight.set_value(scale) - - x_pd = paddle.to_tensor(x_np) - y_pd = ln(x_pd) - - expect_res = _reference_layer_norm_naive( - x_np, scale_np, None, self.epsilon, self.begin_norm_axis - ) - np.testing.assert_allclose( - y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 - ) - - def test_no_scale_with_bias_fp32(self): - """test the case when only bias is enabled (FP32).""" - for place in self.places: - with paddle.base.dygraph.guard(place): - x_np = np.random.random(self.x_shape).astype('float32') - bias_np = 
np.random.random( - self.x_shape[self.begin_norm_axis :] - ).astype('float32') - bias = paddle.to_tensor(bias_np).reshape(-1) - - ln = nn.LayerNorm( - normalized_shape=self.x_shape[self.begin_norm_axis :], - elementwise_affine=True, - weight_attr=False, - epsilon=self.epsilon, - ) - with paddle.no_grad(): - ln.bias.set_value(bias) - - x_pd = paddle.to_tensor(x_np) - y_pd = ln(x_pd) - - expect_res = _reference_layer_norm_naive( - x_np, None, bias_np, self.epsilon, self.begin_norm_axis - ) - np.testing.assert_allclose( - y_pd.numpy(), expect_res, rtol=1e-5, atol=1e-4 - ) - - def test_bf16_forward_backward(self): - """test forward and backward pass with bfloat16 precision.""" - for place in self.places: - with paddle.base.dygraph.guard(place): - x_np = np.random.random(self.x_shape).astype('float32') - scale_np = np.random.random( - self.x_shape[self.begin_norm_axis :] - ).astype('float32') - bias_np = np.random.random( - self.x_shape[self.begin_norm_axis :] - ).astype('float32') - - x = paddle.to_tensor(x_np).cast(paddle.bfloat16) - x.stop_gradient = False - - scale = ( - paddle.to_tensor(scale_np).cast(paddle.bfloat16).reshape(-1) - ) - bias = ( - paddle.to_tensor(bias_np).cast(paddle.bfloat16).reshape(-1) - ) - - ln = nn.LayerNorm( - normalized_shape=self.x_shape[self.begin_norm_axis :], - weight_attr=nn.initializer.Assign(scale), - bias_attr=nn.initializer.Assign(bias), - epsilon=self.epsilon, - ) - - y = ln(x) - loss = y.sum() - loss.backward() - - self.assertIsNotNone(x.grad) - self.assertIsNotNone(ln.weight.grad) - self.assertIsNotNone(ln.bias.grad) - - -class TestLayerNormParam(unittest.TestCase): - def setUp(self): - self.normalized_shape = [6] - self.x_tensor = paddle.randn([2, 4, 4, 6]) - self.places = get_places() - - def test_elementwise_affine_false(self): - """test that when elementwise_affine=False, no learnable parameters are created.""" - layer = nn.LayerNorm( - normalized_shape=self.normalized_shape, elementwise_affine=False - ) - self.assertIsNone(layer.weight) - self.assertIsNone(layer.bias) - - out = layer(self.x_tensor) - self.assertEqual(out.shape, self.x_tensor.shape) - - def test_elementwise_affine_true(self): - """test that when elementwise_affine=True and attr=None, parameters are created with default initialization.""" - for place in self.places: - with paddle.base.dygraph.guard(place): - layer = nn.LayerNorm( - normalized_shape=self.normalized_shape, - elementwise_affine=True, - ) - self.assertIsNotNone(layer.weight) - self.assertIsNotNone(layer.bias) - - expected_weight = paddle.ones([6]) - expected_bias = paddle.zeros([6]) - self.assertTrue(paddle.allclose(layer.weight, expected_weight)) - self.assertTrue(paddle.allclose(layer.bias, expected_bias)) - - def test_bias_false(self): - """test that when bias=False, the bias parameter is disabled even if elementwise_affine=True.""" - layer = nn.LayerNorm( - normalized_shape=self.normalized_shape, - elementwise_affine=True, - bias=False, - ) - self.assertIsNotNone(layer.weight) - self.assertIsNone(layer.bias) - - def test_attr_custom_initialization(self): - """test that weight_attr and bias_attr can be used to customize the initialization of the weight parameter.""" - for place in self.places: - with paddle.base.dygraph.guard(place): - weight_attr = paddle.nn.initializer.Constant(value=2.0) - bias_attr = paddle.nn.initializer.Constant(value=3.0) - layer = nn.LayerNorm( - normalized_shape=self.normalized_shape, - elementwise_affine=True, - weight_attr=weight_attr, - bias_attr=bias_attr, - ) - - expected_weight = 
paddle.full([6], 2.0) - expected_bias = paddle.full([6], 3.0) - self.assertTrue(paddle.allclose(layer.weight, expected_weight)) - self.assertTrue(paddle.allclose(layer.bias, expected_bias)) - - def test_alias(self): - """test parameter alias epsilon/eps""" - for place in self.places: - with paddle.base.dygraph.guard(place): - layer_epsilon = nn.LayerNorm( - normalized_shape=self.normalized_shape, - elementwise_affine=True, - epsilon=1e-5, - ) - layer_eps = nn.LayerNorm( - normalized_shape=self.normalized_shape, - elementwise_affine=True, - eps=1e-5, - ) - - out_epsilon = layer_epsilon(self.x_tensor) - out_eps = layer_eps(self.x_tensor) - - np.testing.assert_array_equal( - out_epsilon.numpy(), out_eps.numpy() - ) - - def test_errors(self): - """test for errors.""" - layer_norm = nn.LayerNorm(self.normalized_shape) - x1 = np.random.random([3, *self.normalized_shape]).astype('float32') - with self.assertRaises(ValueError): - layer_norm(x1) - with self.assertRaises(TypeError): - nn.LayerNorm(self.normalized_shape, 1e-5, None, None, "name") - with self.assertRaises(TypeError): - nn.LayerNorm( - self.normalized_shape, 1e-5, False, "cpu", paddle.float32 - ) - - -if __name__ == '__main__': - unittest.main()
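
Usage sketch (not part of the patches above): the snippet below only exercises constructor behavior that the series documents and that the tests assert, assuming the three patches are applied on top of current Paddle. It covers the new `eps` alias for `epsilon`, the keyword-only `elementwise_affine` switch, and the keyword-only `bias` switch.

import paddle
from paddle import nn

x = paddle.randn([2, 4, 4, 6])

# Default behavior: learnable scale (initialized to 1.0) and bias
# (initialized to 0.0) over the normalized axes.
ln = nn.LayerNorm(normalized_shape=[6])
y = ln(x)

# 'eps' is accepted as an alias for 'epsilon' via the param_one_alias decorator.
ln_eps = nn.LayerNorm(normalized_shape=[6], eps=1e-6)

# elementwise_affine=False disables both parameters entirely.
ln_plain = nn.LayerNorm(normalized_shape=[6], elementwise_affine=False)
assert ln_plain.weight is None and ln_plain.bias is None

# bias=False (keyword-only) keeps the scale but drops the bias parameter.
ln_no_bias = nn.LayerNorm(normalized_shape=[6], bias=False)
assert ln_no_bias.weight is not None and ln_no_bias.bias is None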