From 7f5c2f4990f4f645453cd40b0afcd854c1c7b07d Mon Sep 17 00:00:00 2001
From: 南霄

Inplace add y to x. Returns: x += y if accum=True else x.copy_(y).

Fused RMSNorm forward and block quantization.
    out: quantization data
    scale: quantization scale
    rms: reciprocal of the root mean square of the input, calculated over the last dimension
    transpose_output: quantization data of transposed gradient
    transpose_scale: quantization scale of transposed gradient

Norm and gate in linear attention.
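For reference, the fused RMSNorm + block-quantization outputs described above can be sanity-checked against a plain PyTorch sketch. This is a minimal, unfused reference under stated assumptions, not the linghe Triton kernel: the function name rms_norm_block_quant_ref, eps, block_size, the 2-D input shape, and the float8_e4m3fn target range (absmax 448) are illustrative choices, and the transposed-gradient outputs (transpose_output, transpose_scale) of the backward path are omitted.

    import torch

    def rms_norm_block_quant_ref(x, weight, eps=1e-6, block_size=128):
        # Assumes x is 2-D (tokens, hidden) with hidden divisible by block_size,
        # and a PyTorch build that provides torch.float8_e4m3fn.
        # rms: reciprocal of the root mean square over the last dimension.
        rms = torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) + eps)
        y = x.float() * rms * weight.float()

        # Blockwise quantization: one scale per block_size elements of the
        # last dimension, mapped into the float8_e4m3fn range (absmax 448).
        m, n = y.shape
        blocks = y.view(m, n // block_size, block_size)
        amax = blocks.abs().amax(dim=-1, keepdim=True).clamp_min(1e-12)
        scale = amax / 448.0
        out = (blocks / scale).to(torch.float8_e4m3fn).view(m, n)
        return out, scale.squeeze(-1), rms.squeeze(-1)

The inplace-add semantics above reduce to x.add_(y) when accum=True and x.copy_(y) otherwise.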
Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.facade.add.InplaceAddFunction", "modulename": "linghe.facade.add", "qualname": "InplaceAddFunction", "kind": "class", "doc": "Base class to create custom autograd.Function.
To create a custom autograd.Function, subclass this class and implement\nthe :meth:forward and :meth:backward static methods. Then, to use your custom\nop in the forward pass, call the class method apply. Do not call\n:meth:forward directly.
To ensure correctness and best performance, make sure you are calling the\ncorrect methods on ctx and validating your backward function using\n:func:torch.autograd.gradcheck.
See :ref:extending-autograd for more details on how to use this class.
Examples::
\n\n>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)\n>>> class Exp(Function):\n>>> @staticmethod\n>>> def forward(ctx, i):\n>>> result = i.exp()\n>>> ctx.save_for_backward(result)\n>>> return result\n>>>\n>>> @staticmethod\n>>> def backward(ctx, grad_output):\n>>> result, = ctx.saved_tensors\n>>> return grad_output * result\n>>>\n>>> # Use it by calling the apply method:\n>>> # xdoctest: +SKIP\n>>> output = Exp.apply(input)\n\n", "bases": "torch.autograd.function.Function"}, {"fullname": "linghe.facade.add.InplaceAddFunction.forward", "modulename": "linghe.facade.add", "qualname": "InplaceAddFunction.forward", "kind": "function", "doc": "Define the forward of the custom autograd Function.
\n\nThis function is to be overridden by all subclasses.\nThere are two ways to define forward:
\n\nUsage 1 (Combined forward and ctx)::
\n\n@staticmethod\ndef forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:\n pass\n\n\ncombining-forward-context for more detailsUsage 2 (Separate forward and ctx)::
\n\n@staticmethod\ndef forward(*args: Any, **kwargs: Any) -> Any:\n pass\n\n@staticmethod\ndef setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> None:\n pass\n\n\ntorch.autograd.Function.setup_context\nstaticmethod to handle setting up the ctx object.\noutput is the output of the forward, inputs are a Tuple of inputs\nto the forward.extending-autograd for more detailsThe context can be used to store arbitrary data that can be then\nretrieved during the backward pass. Tensors should not be stored\ndirectly on ctx (though this is not currently enforced for\nbackward compatibility). Instead, tensors should be saved either with\n:func:ctx.save_for_backward if they are intended to be used in\nbackward (equivalently, vjp) or :func:ctx.save_for_forward\nif they are intended to be used for in jvp.
Define a formula for differentiating the operation with backward mode automatic differentiation.
\n\nThis function is to be overridden by all subclasses.\n(Defining this function is equivalent to defining the vjp function.)
It must accept a context :attr:ctx as the first argument, followed by\nas many outputs as the :func:forward returned (None will be passed in\nfor non tensor outputs of the forward function),\nand it should return as many tensors, as there were inputs to\n:func:forward. Each argument is the gradient w.r.t the given output,\nand each returned value should be the gradient w.r.t. the\ncorresponding input. If an input is not a Tensor or is a Tensor not\nrequiring grads, you can just pass None as a gradient for that input.
The context can be used to retrieve tensors saved during the forward\npass. It also has an attribute :attr:ctx.needs_input_grad as a tuple\nof booleans representing whether each input needs gradient. E.g.,\n:func:backward will have ctx.needs_input_grad[0] = True if the\nfirst input to :func:forward needs gradient computed w.r.t. the\noutput.
Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.facade.fp32_linear.FusedFp32GEMM", "modulename": "linghe.facade.fp32_linear", "qualname": "FusedFp32GEMM", "kind": "class", "doc": "Base class to create custom autograd.Function.
To create a custom autograd.Function, subclass this class and implement\nthe :meth:forward and :meth:backward static methods. Then, to use your custom\nop in the forward pass, call the class method apply. Do not call\n:meth:forward directly.
To ensure correctness and best performance, make sure you are calling the\ncorrect methods on ctx and validating your backward function using\n:func:torch.autograd.gradcheck.
See :ref:extending-autograd for more details on how to use this class.
Examples::
\n\n>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)\n>>> class Exp(Function):\n>>> @staticmethod\n>>> def forward(ctx, i):\n>>> result = i.exp()\n>>> ctx.save_for_backward(result)\n>>> return result\n>>>\n>>> @staticmethod\n>>> def backward(ctx, grad_output):\n>>> result, = ctx.saved_tensors\n>>> return grad_output * result\n>>>\n>>> # Use it by calling the apply method:\n>>> # xdoctest: +SKIP\n>>> output = Exp.apply(input)\n\n", "bases": "torch.autograd.function.Function"}, {"fullname": "linghe.facade.fp32_linear.FusedFp32GEMM.forward", "modulename": "linghe.facade.fp32_linear", "qualname": "FusedFp32GEMM.forward", "kind": "function", "doc": "Define the forward of the custom autograd Function.
\n\nThis function is to be overridden by all subclasses.\nThere are two ways to define forward:
\n\nUsage 1 (Combined forward and ctx)::
\n\n@staticmethod\ndef forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:\n pass\n\n\ncombining-forward-context for more detailsUsage 2 (Separate forward and ctx)::
\n\n@staticmethod\ndef forward(*args: Any, **kwargs: Any) -> Any:\n pass\n\n@staticmethod\ndef setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> None:\n pass\n\n\ntorch.autograd.Function.setup_context\nstaticmethod to handle setting up the ctx object.\noutput is the output of the forward, inputs are a Tuple of inputs\nto the forward.extending-autograd for more detailsThe context can be used to store arbitrary data that can be then\nretrieved during the backward pass. Tensors should not be stored\ndirectly on ctx (though this is not currently enforced for\nbackward compatibility). Instead, tensors should be saved either with\n:func:ctx.save_for_backward if they are intended to be used in\nbackward (equivalently, vjp) or :func:ctx.save_for_forward\nif they are intended to be used for in jvp.
Define a formula for differentiating the operation with backward mode automatic differentiation.
\n\nThis function is to be overridden by all subclasses.\n(Defining this function is equivalent to defining the vjp function.)
It must accept a context :attr:ctx as the first argument, followed by\nas many outputs as the :func:forward returned (None will be passed in\nfor non tensor outputs of the forward function),\nand it should return as many tensors, as there were inputs to\n:func:forward. Each argument is the gradient w.r.t the given output,\nand each returned value should be the gradient w.r.t. the\ncorresponding input. If an input is not a Tensor or is a Tensor not\nrequiring grads, you can just pass None as a gradient for that input.
The context can be used to retrieve tensors saved during the forward\npass. It also has an attribute :attr:ctx.needs_input_grad as a tuple\nof booleans representing whether each input needs gradient. E.g.,\n:func:backward will have ctx.needs_input_grad[0] = True if the\nfirst input to :func:forward needs gradient computed w.r.t. the\noutput.
Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.facade.loss.SoftmaxCrossEntropyFunction", "modulename": "linghe.facade.loss", "qualname": "SoftmaxCrossEntropyFunction", "kind": "class", "doc": "Base class to create custom autograd.Function.
To create a custom autograd.Function, subclass this class and implement\nthe :meth:forward and :meth:backward static methods. Then, to use your custom\nop in the forward pass, call the class method apply. Do not call\n:meth:forward directly.
To ensure correctness and best performance, make sure you are calling the\ncorrect methods on ctx and validating your backward function using\n:func:torch.autograd.gradcheck.
See :ref:extending-autograd for more details on how to use this class.
Examples::
\n\n>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)\n>>> class Exp(Function):\n>>> @staticmethod\n>>> def forward(ctx, i):\n>>> result = i.exp()\n>>> ctx.save_for_backward(result)\n>>> return result\n>>>\n>>> @staticmethod\n>>> def backward(ctx, grad_output):\n>>> result, = ctx.saved_tensors\n>>> return grad_output * result\n>>>\n>>> # Use it by calling the apply method:\n>>> # xdoctest: +SKIP\n>>> output = Exp.apply(input)\n\n", "bases": "torch.autograd.function.Function"}, {"fullname": "linghe.facade.loss.SoftmaxCrossEntropyFunction.forward", "modulename": "linghe.facade.loss", "qualname": "SoftmaxCrossEntropyFunction.forward", "kind": "function", "doc": "Define the forward of the custom autograd Function.
\n\nThis function is to be overridden by all subclasses.\nThere are two ways to define forward:
\n\nUsage 1 (Combined forward and ctx)::
\n\n@staticmethod\ndef forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:\n pass\n\n\ncombining-forward-context for more detailsUsage 2 (Separate forward and ctx)::
\n\n@staticmethod\ndef forward(*args: Any, **kwargs: Any) -> Any:\n pass\n\n@staticmethod\ndef setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> None:\n pass\n\n\ntorch.autograd.Function.setup_context\nstaticmethod to handle setting up the ctx object.\noutput is the output of the forward, inputs are a Tuple of inputs\nto the forward.extending-autograd for more detailsThe context can be used to store arbitrary data that can be then\nretrieved during the backward pass. Tensors should not be stored\ndirectly on ctx (though this is not currently enforced for\nbackward compatibility). Instead, tensors should be saved either with\n:func:ctx.save_for_backward if they are intended to be used in\nbackward (equivalently, vjp) or :func:ctx.save_for_forward\nif they are intended to be used for in jvp.
Define a formula for differentiating the operation with backward mode automatic differentiation.
\n\nThis function is to be overridden by all subclasses.\n(Defining this function is equivalent to defining the vjp function.)
It must accept a context :attr:ctx as the first argument, followed by\nas many outputs as the :func:forward returned (None will be passed in\nfor non tensor outputs of the forward function),\nand it should return as many tensors, as there were inputs to\n:func:forward. Each argument is the gradient w.r.t the given output,\nand each returned value should be the gradient w.r.t. the\ncorresponding input. If an input is not a Tensor or is a Tensor not\nrequiring grads, you can just pass None as a gradient for that input.
The context can be used to retrieve tensors saved during the forward\npass. It also has an attribute :attr:ctx.needs_input_grad as a tuple\nof booleans representing whether each input needs gradient. E.g.,\n:func:backward will have ctx.needs_input_grad[0] = True if the\nfirst input to :func:forward needs gradient computed w.r.t. the\noutput.
Base class to create custom autograd.Function.
To create a custom autograd.Function, subclass this class and implement\nthe :meth:forward and :meth:backward static methods. Then, to use your custom\nop in the forward pass, call the class method apply. Do not call\n:meth:forward directly.
To ensure correctness and best performance, make sure you are calling the\ncorrect methods on ctx and validating your backward function using\n:func:torch.autograd.gradcheck.
See :ref:extending-autograd for more details on how to use this class.
Examples::
\n\n>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)\n>>> class Exp(Function):\n>>> @staticmethod\n>>> def forward(ctx, i):\n>>> result = i.exp()\n>>> ctx.save_for_backward(result)\n>>> return result\n>>>\n>>> @staticmethod\n>>> def backward(ctx, grad_output):\n>>> result, = ctx.saved_tensors\n>>> return grad_output * result\n>>>\n>>> # Use it by calling the apply method:\n>>> # xdoctest: +SKIP\n>>> output = Exp.apply(input)\n\n", "bases": "torch.autograd.function.Function"}, {"fullname": "linghe.facade.loss.GradScalingFunction.forward", "modulename": "linghe.facade.loss", "qualname": "GradScalingFunction.forward", "kind": "function", "doc": "Define the forward of the custom autograd Function.
\n\nThis function is to be overridden by all subclasses.\nThere are two ways to define forward:
\n\nUsage 1 (Combined forward and ctx)::
\n\n@staticmethod\ndef forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:\n pass\n\n\ncombining-forward-context for more detailsUsage 2 (Separate forward and ctx)::
\n\n@staticmethod\ndef forward(*args: Any, **kwargs: Any) -> Any:\n pass\n\n@staticmethod\ndef setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> None:\n pass\n\n\ntorch.autograd.Function.setup_context\nstaticmethod to handle setting up the ctx object.\noutput is the output of the forward, inputs are a Tuple of inputs\nto the forward.extending-autograd for more detailsThe context can be used to store arbitrary data that can be then\nretrieved during the backward pass. Tensors should not be stored\ndirectly on ctx (though this is not currently enforced for\nbackward compatibility). Instead, tensors should be saved either with\n:func:ctx.save_for_backward if they are intended to be used in\nbackward (equivalently, vjp) or :func:ctx.save_for_forward\nif they are intended to be used for in jvp.
Define a formula for differentiating the operation with backward mode automatic differentiation.
\n\nThis function is to be overridden by all subclasses.\n(Defining this function is equivalent to defining the vjp function.)
It must accept a context :attr:ctx as the first argument, followed by\nas many outputs as the :func:forward returned (None will be passed in\nfor non tensor outputs of the forward function),\nand it should return as many tensors, as there were inputs to\n:func:forward. Each argument is the gradient w.r.t the given output,\nand each returned value should be the gradient w.r.t. the\ncorresponding input. If an input is not a Tensor or is a Tensor not\nrequiring grads, you can just pass None as a gradient for that input.
The context can be used to retrieve tensors saved during the forward\npass. It also has an attribute :attr:ctx.needs_input_grad as a tuple\nof booleans representing whether each input needs gradient. E.g.,\n:func:backward will have ctx.needs_input_grad[0] = True if the\nfirst input to :func:forward needs gradient computed w.r.t. the\noutput.
Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.facade.norm.RMSNormFunction", "modulename": "linghe.facade.norm", "qualname": "RMSNormFunction", "kind": "class", "doc": "Base class to create custom autograd.Function.
To create a custom autograd.Function, subclass this class and implement\nthe :meth:forward and :meth:backward static methods. Then, to use your custom\nop in the forward pass, call the class method apply. Do not call\n:meth:forward directly.
To ensure correctness and best performance, make sure you are calling the\ncorrect methods on ctx and validating your backward function using\n:func:torch.autograd.gradcheck.
See :ref:extending-autograd for more details on how to use this class.
Examples::
\n\n>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)\n>>> class Exp(Function):\n>>> @staticmethod\n>>> def forward(ctx, i):\n>>> result = i.exp()\n>>> ctx.save_for_backward(result)\n>>> return result\n>>>\n>>> @staticmethod\n>>> def backward(ctx, grad_output):\n>>> result, = ctx.saved_tensors\n>>> return grad_output * result\n>>>\n>>> # Use it by calling the apply method:\n>>> # xdoctest: +SKIP\n>>> output = Exp.apply(input)\n\n", "bases": "torch.autograd.function.Function"}, {"fullname": "linghe.facade.norm.RMSNormFunction.forward", "modulename": "linghe.facade.norm", "qualname": "RMSNormFunction.forward", "kind": "function", "doc": "Define the forward of the custom autograd Function.
\n\nThis function is to be overridden by all subclasses.\nThere are two ways to define forward:
\n\nUsage 1 (Combined forward and ctx)::
\n\n@staticmethod\ndef forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:\n pass\n\n\ncombining-forward-context for more detailsUsage 2 (Separate forward and ctx)::
\n\n@staticmethod\ndef forward(*args: Any, **kwargs: Any) -> Any:\n pass\n\n@staticmethod\ndef setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> None:\n pass\n\n\ntorch.autograd.Function.setup_context\nstaticmethod to handle setting up the ctx object.\noutput is the output of the forward, inputs are a Tuple of inputs\nto the forward.extending-autograd for more detailsThe context can be used to store arbitrary data that can be then\nretrieved during the backward pass. Tensors should not be stored\ndirectly on ctx (though this is not currently enforced for\nbackward compatibility). Instead, tensors should be saved either with\n:func:ctx.save_for_backward if they are intended to be used in\nbackward (equivalently, vjp) or :func:ctx.save_for_forward\nif they are intended to be used for in jvp.
Define a formula for differentiating the operation with backward mode automatic differentiation.
\n\nThis function is to be overridden by all subclasses.\n(Defining this function is equivalent to defining the vjp function.)
It must accept a context :attr:ctx as the first argument, followed by\nas many outputs as the :func:forward returned (None will be passed in\nfor non tensor outputs of the forward function),\nand it should return as many tensors, as there were inputs to\n:func:forward. Each argument is the gradient w.r.t the given output,\nand each returned value should be the gradient w.r.t. the\ncorresponding input. If an input is not a Tensor or is a Tensor not\nrequiring grads, you can just pass None as a gradient for that input.
The context can be used to retrieve tensors saved during the forward\npass. It also has an attribute :attr:ctx.needs_input_grad as a tuple\nof booleans representing whether each input needs gradient. E.g.,\n:func:backward will have ctx.needs_input_grad[0] = True if the\nfirst input to :func:forward needs gradient computed w.r.t. the\noutput.
Base class to create custom autograd.Function.
To create a custom autograd.Function, subclass this class and implement\nthe :meth:forward and :meth:backward static methods. Then, to use your custom\nop in the forward pass, call the class method apply. Do not call\n:meth:forward directly.
To ensure correctness and best performance, make sure you are calling the\ncorrect methods on ctx and validating your backward function using\n:func:torch.autograd.gradcheck.
See :ref:extending-autograd for more details on how to use this class.
Examples::
\n\n>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)\n>>> class Exp(Function):\n>>> @staticmethod\n>>> def forward(ctx, i):\n>>> result = i.exp()\n>>> ctx.save_for_backward(result)\n>>> return result\n>>>\n>>> @staticmethod\n>>> def backward(ctx, grad_output):\n>>> result, = ctx.saved_tensors\n>>> return grad_output * result\n>>>\n>>> # Use it by calling the apply method:\n>>> # xdoctest: +SKIP\n>>> output = Exp.apply(input)\n\n", "bases": "torch.autograd.function.Function"}, {"fullname": "linghe.facade.norm.GroupNormGateFunction.forward", "modulename": "linghe.facade.norm", "qualname": "GroupNormGateFunction.forward", "kind": "function", "doc": "Define the forward of the custom autograd Function.
\n\nThis function is to be overridden by all subclasses.\nThere are two ways to define forward:
\n\nUsage 1 (Combined forward and ctx)::
\n\n@staticmethod\ndef forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:\n pass\n\n\ncombining-forward-context for more detailsUsage 2 (Separate forward and ctx)::
\n\n@staticmethod\ndef forward(*args: Any, **kwargs: Any) -> Any:\n pass\n\n@staticmethod\ndef setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> None:\n pass\n\n\ntorch.autograd.Function.setup_context\nstaticmethod to handle setting up the ctx object.\noutput is the output of the forward, inputs are a Tuple of inputs\nto the forward.extending-autograd for more detailsThe context can be used to store arbitrary data that can be then\nretrieved during the backward pass. Tensors should not be stored\ndirectly on ctx (though this is not currently enforced for\nbackward compatibility). Instead, tensors should be saved either with\n:func:ctx.save_for_backward if they are intended to be used in\nbackward (equivalently, vjp) or :func:ctx.save_for_forward\nif they are intended to be used for in jvp.
Define a formula for differentiating the operation with backward mode automatic differentiation.
\n\nThis function is to be overridden by all subclasses.\n(Defining this function is equivalent to defining the vjp function.)
It must accept a context :attr:ctx as the first argument, followed by\nas many outputs as the :func:forward returned (None will be passed in\nfor non tensor outputs of the forward function),\nand it should return as many tensors, as there were inputs to\n:func:forward. Each argument is the gradient w.r.t the given output,\nand each returned value should be the gradient w.r.t. the\ncorresponding input. If an input is not a Tensor or is a Tensor not\nrequiring grads, you can just pass None as a gradient for that input.
The context can be used to retrieve tensors saved during the forward\npass. It also has an attribute :attr:ctx.needs_input_grad as a tuple\nof booleans representing whether each input needs gradient. E.g.,\n:func:backward will have ctx.needs_input_grad[0] = True if the\nfirst input to :func:forward needs gradient computed w.r.t. the\noutput.
Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.facade.rope.QkNormHalfRopeFunction", "modulename": "linghe.facade.rope", "qualname": "QkNormHalfRopeFunction", "kind": "class", "doc": "Base class to create custom autograd.Function.
To create a custom autograd.Function, subclass this class and implement\nthe :meth:forward and :meth:backward static methods. Then, to use your custom\nop in the forward pass, call the class method apply. Do not call\n:meth:forward directly.
To ensure correctness and best performance, make sure you are calling the\ncorrect methods on ctx and validating your backward function using\n:func:torch.autograd.gradcheck.
See :ref:extending-autograd for more details on how to use this class.
Examples::
\n\n>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)\n>>> class Exp(Function):\n>>> @staticmethod\n>>> def forward(ctx, i):\n>>> result = i.exp()\n>>> ctx.save_for_backward(result)\n>>> return result\n>>>\n>>> @staticmethod\n>>> def backward(ctx, grad_output):\n>>> result, = ctx.saved_tensors\n>>> return grad_output * result\n>>>\n>>> # Use it by calling the apply method:\n>>> # xdoctest: +SKIP\n>>> output = Exp.apply(input)\n\n", "bases": "torch.autograd.function.Function"}, {"fullname": "linghe.facade.rope.QkNormHalfRopeFunction.forward", "modulename": "linghe.facade.rope", "qualname": "QkNormHalfRopeFunction.forward", "kind": "function", "doc": "Define the forward of the custom autograd Function.
\n\nThis function is to be overridden by all subclasses.\nThere are two ways to define forward:
\n\nUsage 1 (Combined forward and ctx)::
\n\n@staticmethod\ndef forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:\n pass\n\n\ncombining-forward-context for more detailsUsage 2 (Separate forward and ctx)::
\n\n@staticmethod\ndef forward(*args: Any, **kwargs: Any) -> Any:\n pass\n\n@staticmethod\ndef setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> None:\n pass\n\n\ntorch.autograd.Function.setup_context\nstaticmethod to handle setting up the ctx object.\noutput is the output of the forward, inputs are a Tuple of inputs\nto the forward.extending-autograd for more detailsThe context can be used to store arbitrary data that can be then\nretrieved during the backward pass. Tensors should not be stored\ndirectly on ctx (though this is not currently enforced for\nbackward compatibility). Instead, tensors should be saved either with\n:func:ctx.save_for_backward if they are intended to be used in\nbackward (equivalently, vjp) or :func:ctx.save_for_forward\nif they are intended to be used for in jvp.
Define a formula for differentiating the operation with backward mode automatic differentiation.
\n\nThis function is to be overridden by all subclasses.\n(Defining this function is equivalent to defining the vjp function.)
It must accept a context :attr:ctx as the first argument, followed by\nas many outputs as the :func:forward returned (None will be passed in\nfor non tensor outputs of the forward function),\nand it should return as many tensors, as there were inputs to\n:func:forward. Each argument is the gradient w.r.t the given output,\nand each returned value should be the gradient w.r.t. the\ncorresponding input. If an input is not a Tensor or is a Tensor not\nrequiring grads, you can just pass None as a gradient for that input.
The context can be used to retrieve tensors saved during the forward\npass. It also has an attribute :attr:ctx.needs_input_grad as a tuple\nof booleans representing whether each input needs gradient. E.g.,\n:func:backward will have ctx.needs_input_grad[0] = True if the\nfirst input to :func:forward needs gradient computed w.r.t. the\noutput.
Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.facade.transpose.TransposeDim01Function", "modulename": "linghe.facade.transpose", "qualname": "TransposeDim01Function", "kind": "class", "doc": "Base class to create custom autograd.Function.
To create a custom autograd.Function, subclass this class and implement\nthe :meth:forward and :meth:backward static methods. Then, to use your custom\nop in the forward pass, call the class method apply. Do not call\n:meth:forward directly.
To ensure correctness and best performance, make sure you are calling the\ncorrect methods on ctx and validating your backward function using\n:func:torch.autograd.gradcheck.
See :ref:extending-autograd for more details on how to use this class.
Examples::
\n\n>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)\n>>> class Exp(Function):\n>>> @staticmethod\n>>> def forward(ctx, i):\n>>> result = i.exp()\n>>> ctx.save_for_backward(result)\n>>> return result\n>>>\n>>> @staticmethod\n>>> def backward(ctx, grad_output):\n>>> result, = ctx.saved_tensors\n>>> return grad_output * result\n>>>\n>>> # Use it by calling the apply method:\n>>> # xdoctest: +SKIP\n>>> output = Exp.apply(input)\n\n", "bases": "torch.autograd.function.Function"}, {"fullname": "linghe.facade.transpose.TransposeDim01Function.forward", "modulename": "linghe.facade.transpose", "qualname": "TransposeDim01Function.forward", "kind": "function", "doc": "Define the forward of the custom autograd Function.
\n\nThis function is to be overridden by all subclasses.\nThere are two ways to define forward:
\n\nUsage 1 (Combined forward and ctx)::
\n\n@staticmethod\ndef forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:\n pass\n\n\ncombining-forward-context for more detailsUsage 2 (Separate forward and ctx)::
\n\n@staticmethod\ndef forward(*args: Any, **kwargs: Any) -> Any:\n pass\n\n@staticmethod\ndef setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> None:\n pass\n\n\ntorch.autograd.Function.setup_context\nstaticmethod to handle setting up the ctx object.\noutput is the output of the forward, inputs are a Tuple of inputs\nto the forward.extending-autograd for more detailsThe context can be used to store arbitrary data that can be then\nretrieved during the backward pass. Tensors should not be stored\ndirectly on ctx (though this is not currently enforced for\nbackward compatibility). Instead, tensors should be saved either with\n:func:ctx.save_for_backward if they are intended to be used in\nbackward (equivalently, vjp) or :func:ctx.save_for_forward\nif they are intended to be used for in jvp.
Define a formula for differentiating the operation with backward mode automatic differentiation.
\n\nThis function is to be overridden by all subclasses.\n(Defining this function is equivalent to defining the vjp function.)
It must accept a context :attr:ctx as the first argument, followed by\nas many outputs as the :func:forward returned (None will be passed in\nfor non tensor outputs of the forward function),\nand it should return as many tensors, as there were inputs to\n:func:forward. Each argument is the gradient w.r.t the given output,\nand each returned value should be the gradient w.r.t. the\ncorresponding input. If an input is not a Tensor or is a Tensor not\nrequiring grads, you can just pass None as a gradient for that input.
The context can be used to retrieve tensors saved during the forward\npass. It also has an attribute :attr:ctx.needs_input_grad as a tuple\nof booleans representing whether each input needs gradient. E.g.,\n:func:backward will have ctx.needs_input_grad[0] = True if the\nfirst input to :func:forward needs gradient computed w.r.t. the\noutput.
Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.gemm.fp32_gemm.fp32_gemm_kernel", "modulename": "linghe.gemm.fp32_gemm", "qualname": "fp32_gemm_kernel", "kind": "function", "doc": "\n", "signature": "(\ta_ptr,\tb_ptr,\tc_ptr,\tM,\tN: int,\tK: int,\tBLOCK_SIZE_K: int,\tBLOCK_SIZE_M: int,\tBLOCK_SIZE_N: int):", "funcdef": "def"}, {"fullname": "linghe.gemm.fp32_gemm.triton_fp32_gemm", "modulename": "linghe.gemm.fp32_gemm", "qualname": "triton_fp32_gemm", "kind": "function", "doc": "\n", "signature": "(a: torch.Tensor, b: torch.Tensor):", "funcdef": "def"}, {"fullname": "linghe.gemm.fp32_gemm.scaled_fp32_gemm_kernel", "modulename": "linghe.gemm.fp32_gemm", "qualname": "scaled_fp32_gemm_kernel", "kind": "function", "doc": "\n", "signature": "(\ta_ptr,\tb_ptr,\tscale_ptr,\tc_ptr,\tM,\tN: int,\tK: int,\tBLOCK_SIZE_K: int,\tBLOCK_SIZE_M: int,\tBLOCK_SIZE_N: int):", "funcdef": "def"}, {"fullname": "linghe.gemm.fp32_gemm.triton_scaled_fp32_gemm", "modulename": "linghe.gemm.fp32_gemm", "qualname": "triton_scaled_fp32_gemm", "kind": "function", "doc": "\n", "signature": "(a: torch.Tensor, b: torch.Tensor, scale: torch.Tensor):", "funcdef": "def"}, {"fullname": "linghe.gemm.fp32_gemm.fp32_gemm_for_backward_kernel", "modulename": "linghe.gemm.fp32_gemm", "qualname": "fp32_gemm_for_backward_kernel", "kind": "function", "doc": "\n", "signature": "(\ta_ptr,\tb_ptr,\tc_ptr,\tM,\tN: int,\tK: int,\tACCUM: int,\tBLOCK_SIZE_K: int,\tBLOCK_SIZE_M: int,\tBLOCK_SIZE_N: int):", "funcdef": "def"}, {"fullname": "linghe.gemm.fp32_gemm.triton_fp32_gemm_for_backward", "modulename": "linghe.gemm.fp32_gemm", "qualname": "triton_fp32_gemm_for_backward", "kind": "function", "doc": "\n", "signature": "(\ta: torch.Tensor,\tb: torch.Tensor,\tc: Optional[torch.Tensor] = None,\taccum=False):", "funcdef": "def"}, {"fullname": "linghe.gemm.fp32_gemm.fp32_gemm_for_update_kernel", "modulename": "linghe.gemm.fp32_gemm", "qualname": "fp32_gemm_for_update_kernel", "kind": "function", "doc": "\n", "signature": "(\ta_ptr,\tb_ptr,\tc_ptr,\tM,\tN: int,\tK: int,\tBLOCK_SIZE_K: int,\tBLOCK_SIZE_M: int,\tBLOCK_SIZE_N: int):", "funcdef": "def"}, {"fullname": "linghe.gemm.fp32_gemm.triton_fp32_gemm_for_update", "modulename": "linghe.gemm.fp32_gemm", "qualname": "triton_fp32_gemm_for_update", "kind": "function", "doc": "\n", "signature": "(a: torch.Tensor, b: torch.Tensor):", "funcdef": "def"}, {"fullname": "linghe.gemm.fp32_gemm.scaled_fp32_gemm_for_update_kernel", "modulename": "linghe.gemm.fp32_gemm", "qualname": "scaled_fp32_gemm_for_update_kernel", "kind": "function", "doc": "\n", "signature": "(\ta_ptr,\tb_ptr,\tscale_ptr,\tc_ptr,\tM,\tN: int,\tK: int,\tBLOCK_SIZE_K: int,\tBLOCK_SIZE_M: int,\tBLOCK_SIZE_N: int):", "funcdef": "def"}, {"fullname": "linghe.gemm.fp32_gemm.triton_scaled_fp32_gemm_for_update", "modulename": "linghe.gemm.fp32_gemm", "qualname": "triton_scaled_fp32_gemm_for_update", "kind": "function", "doc": "\n", "signature": "(a: torch.Tensor, b: torch.Tensor, scale: torch.Tensor):", "funcdef": "def"}, {"fullname": "linghe.quant", "modulename": "linghe.quant", "kind": "module", "doc": "\n"}, {"fullname": "linghe.quant.block", "modulename": "linghe.quant.block", "kind": "module", "doc": "\n"}, {"fullname": "linghe.quant.block.block", "modulename": "linghe.quant.block.block", "kind": "module", "doc": "Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.quant.block.block.block_quant_kernel", "modulename": "linghe.quant.block.block", "qualname": "block_quant_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, y_ptr, s_ptr, M, N, BLOCK_SIZE: int, ROUND: int):", "funcdef": "def"}, {"fullname": "linghe.quant.block.block.block_quant", "modulename": "linghe.quant.block.block", "qualname": "block_quant", "kind": "function", "doc": "\n", "signature": "(x, block_size=128, round_scale=False):", "funcdef": "def"}, {"fullname": "linghe.quant.block.group", "modulename": "linghe.quant.block.group", "kind": "module", "doc": "Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.quant.block.group.group_quant_kernel", "modulename": "linghe.quant.block.group", "qualname": "group_quant_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, y_ptr, s_ptr, N, BLOCK_SIZE: int, K: int, ROUND: int):", "funcdef": "def"}, {"fullname": "linghe.quant.block.group.triton_group_quant", "modulename": "linghe.quant.block.group", "qualname": "triton_group_quant", "kind": "function", "doc": "\n", "signature": "(x, dtype=torch.float8_e4m3fn, group_size=128, round_scale=False):", "funcdef": "def"}, {"fullname": "linghe.quant.block.group.persist_group_quant_kernel", "modulename": "linghe.quant.block.group", "qualname": "persist_group_quant_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, y_ptr, s_ptr, N, BLOCK_SIZE: int, B: int, K: int, ROUND: int):", "funcdef": "def"}, {"fullname": "linghe.quant.block.group.triton_persist_group_quant", "modulename": "linghe.quant.block.group", "qualname": "triton_persist_group_quant", "kind": "function", "doc": "\n", "signature": "(x, dtype=torch.float8_e4m3fn, group_size=128, round_scale=False):", "funcdef": "def"}, {"fullname": "linghe.quant.channel", "modulename": "linghe.quant.channel", "kind": "module", "doc": "\n"}, {"fullname": "linghe.quant.channel.channel", "modulename": "linghe.quant.channel.channel", "kind": "module", "doc": "Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.quant.channel.channel.row_quant_kernel", "modulename": "linghe.quant.channel.channel", "qualname": "row_quant_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, q_ptr, s_ptr, M, N, BLOCK_SIZE: int, ROUND: int):", "funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.triton_row_quant", "modulename": "linghe.quant.channel.channel", "qualname": "triton_row_quant", "kind": "function", "doc": "\n", "signature": "(x, round_scale=False):", "funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.deprecated_tokenwise_row_quant_kernel", "modulename": "linghe.quant.channel.channel", "qualname": "deprecated_tokenwise_row_quant_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, out_ptr, scale_ptr, M, T: int, N: int, ROUND: int):", "funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.triton_deprecated_tokenwise_row_quant", "modulename": "linghe.quant.channel.channel", "qualname": "triton_deprecated_tokenwise_row_quant", "kind": "function", "doc": "\n", "signature": "(x, out=None, scale=None, round_scale=False):", "funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.tokenwise_row_quant_kernel", "modulename": "linghe.quant.channel.channel", "qualname": "tokenwise_row_quant_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, out_ptr, scale_ptr, N: int, ROUND: int):", "funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.triton_tokenwise_row_quant", "modulename": "linghe.quant.channel.channel", "qualname": "triton_tokenwise_row_quant", "kind": "function", "doc": "\n", "signature": "(x, out=None, scale=None, round_scale=False):", "funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.transpose_row_quant_kernel", "modulename": "linghe.quant.channel.channel", "qualname": "transpose_row_quant_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, q_ptr, s_ptr, M, N, H: int, W: int, ROUND: int):", "funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.triton_transpose_row_quant", "modulename": "linghe.quant.channel.channel", "qualname": "triton_transpose_row_quant", "kind": "function", "doc": "\n", "signature": "(x, side=0, round_scale=False):", "funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.triton_channel_quant_nt", "modulename": "linghe.quant.channel.channel", "qualname": "triton_channel_quant_nt", "kind": "function", "doc": "\n", "signature": "(x, w):", "funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.triton_channel_quant_nn", "modulename": "linghe.quant.channel.channel", "qualname": "triton_channel_quant_nn", "kind": "function", "doc": "\n", "signature": "(y, w):", "funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.triton_channel_quant_tn", "modulename": "linghe.quant.channel.channel", "qualname": "triton_channel_quant_tn", "kind": "function", "doc": "\n", "signature": "(y, x):", "funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.channel_quant_forward", "modulename": "linghe.quant.channel.channel", "qualname": "channel_quant_forward", "kind": "function", "doc": "\n", "signature": "(x, w):", "funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.channel_quant_backward", "modulename": "linghe.quant.channel.channel", "qualname": "channel_quant_backward", "kind": "function", "doc": "\n", "signature": "(y, w):", "funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.channel_quant_update", "modulename": "linghe.quant.channel.channel", "qualname": "channel_quant_update", "kind": "function", "doc": "\n", "signature": "(y, x):", 
"funcdef": "def"}, {"fullname": "linghe.quant.channel.channel.fp8_channel_f_and_b", "modulename": "linghe.quant.channel.channel", "qualname": "fp8_channel_f_and_b", "kind": "function", "doc": "\n", "signature": "(x, w, y):", "funcdef": "def"}, {"fullname": "linghe.utils", "modulename": "linghe.utils", "kind": "module", "doc": "\n"}, {"fullname": "linghe.utils.add", "modulename": "linghe.utils.add", "kind": "module", "doc": "Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.utils.add.inplace_add_kernel", "modulename": "linghe.utils.add", "qualname": "inplace_add_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, y_ptr, M, N, H: int, W: int, EVEN: int, ACCUM: int):", "funcdef": "def"}, {"fullname": "linghe.utils.add.triton_inplace_add", "modulename": "linghe.utils.add", "qualname": "triton_inplace_add", "kind": "function", "doc": "inplace add y to x\nArgs:\n x: Tensor\n y: Tensor\n accum: whether accum y to x
\n\nReturns: x += y if accum=True else x.copy_(y)
\n", "signature": "(x: torch.Tensor, y: torch.Tensor, accum: bool = True):", "funcdef": "def"}, {"fullname": "linghe.utils.dot", "modulename": "linghe.utils.dot", "kind": "module", "doc": "Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.utils.dot.dot_kernel", "modulename": "linghe.utils.dot", "qualname": "dot_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, y_ptr, sum_ptr, M, N, H: int, W: int):", "funcdef": "def"}, {"fullname": "linghe.utils.dot.triton_dot", "modulename": "linghe.utils.dot", "qualname": "triton_dot", "kind": "function", "doc": "\n", "signature": "(x, y):", "funcdef": "def"}, {"fullname": "linghe.utils.dot.mix_precise_dot_kernel", "modulename": "linghe.utils.dot", "qualname": "mix_precise_dot_kernel", "kind": "function", "doc": "\n", "signature": "(\tx_ptr,\tq_ptr,\tsum_ptr,\tsmooth_scale_ptr,\tquant_scale_ptr,\tM,\tN,\tH: int,\tW: int):", "funcdef": "def"}, {"fullname": "linghe.utils.dot.triton_mix_precise_dot", "modulename": "linghe.utils.dot", "qualname": "triton_mix_precise_dot", "kind": "function", "doc": "\n", "signature": "(x, q, smooth_scale, quant_scale, reverse=False):", "funcdef": "def"}, {"fullname": "linghe.utils.gather", "modulename": "linghe.utils.gather", "kind": "module", "doc": "Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.utils.gather.block_count_kernel", "modulename": "linghe.utils.gather", "qualname": "block_count_kernel", "kind": "function", "doc": "\n", "signature": "(map_ptr, count_ptr, M, B, T: int, b: int, E: int):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.make_row_id_map_kernel", "modulename": "linghe.utils.gather", "qualname": "make_row_id_map_kernel", "kind": "function", "doc": "\n", "signature": "(map_ptr, count_ptr, output_ptr, M, B, P, T: int, b: int, E: int):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.triton_make_row_id_map", "modulename": "linghe.utils.gather", "qualname": "triton_make_row_id_map", "kind": "function", "doc": "\n", "signature": "(routing_map: torch.Tensor, multiple_of: int = 1):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.make_row_id_map_and_indices_kernel", "modulename": "linghe.utils.gather", "qualname": "make_row_id_map_and_indices_kernel", "kind": "function", "doc": "\n", "signature": "(\tmap_ptr,\tcount_ptr,\trow_map_ptr,\trow_indices_ptr,\tM,\tB,\tP,\tT: int,\tb: int,\tE: int):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.triton_make_row_id_map_and_indices", "modulename": "linghe.utils.gather", "qualname": "triton_make_row_id_map_and_indices", "kind": "function", "doc": "\n", "signature": "(routing_map: torch.Tensor, num_out_tokens: int, multiple_of: int = 1):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.index_select_kernel", "modulename": "linghe.utils.gather", "qualname": "index_select_kernel", "kind": "function", "doc": "\n", "signature": "(\tx_ptr,\tout_ptr,\tscale_ptr,\tscale_out_ptr,\tindex_ptr,\tM,\tT,\tN: int,\tSCALE: int):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.triton_index_select", "modulename": "linghe.utils.gather", "qualname": "triton_index_select", "kind": "function", "doc": "\n", "signature": "(x, indices, scale=None, out=None, scale_out=None):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.permute_with_mask_map_kernel", "modulename": "linghe.utils.gather", "qualname": "permute_with_mask_map_kernel", "kind": "function", "doc": "\n", "signature": "(\tdata_ptr,\tscale_ptr,\tprobs_ptr,\tmask_map_ptr,\toutput_data_ptr,\toutput_scale_ptr,\toutput_probs_ptr,\tnum_experts: int,\tN: int,\ths: int,\tSCALE: int,\tPROB: int):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.fill_padded_token_with_zero_kernel", "modulename": "linghe.utils.gather", "qualname": "fill_padded_token_with_zero_kernel", "kind": "function", "doc": "\n", "signature": "(\tdata_ptr,\tscale_ptr,\tprobs_ptr,\tmax_indices_ptr,\ttoken_per_expert_ptr,\tN: int,\ths: int,\tSCALE: int,\tPROB: int):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.triton_permute_with_mask_map", "modulename": "linghe.utils.gather", "qualname": "triton_permute_with_mask_map", "kind": "function", "doc": "\n", "signature": "(\tinp: torch.Tensor,\tscale: torch.Tensor,\tprobs: torch.Tensor,\trow_id_map: torch.Tensor,\tnum_out_tokens: int,\tcontiguous: bool = True,\ttokens_per_expert: Optional[torch.Tensor] = None):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.batch_smooth_transpose_smooth_permute_kernel", "modulename": "linghe.utils.gather", "qualname": "batch_smooth_transpose_smooth_permute_kernel", "kind": "function", "doc": "\n", "signature": "(\tx_ptr,\tscale_ptr,\toss_ptr,\tss_ptr,\tindex_ptr,\tcount_ptr,\taccum_ptr,\tq_ptr,\tqs_ptr,\tN: int,\tE: int,\tH: int,\tW: int,\tSMOOTHED: int,\tROUND: int):", "funcdef": "def"}, {"fullname": 
"linghe.utils.gather.triton_batch_transpose_smooth_permute_with_indices", "modulename": "linghe.utils.gather", "qualname": "triton_batch_transpose_smooth_permute_with_indices", "kind": "function", "doc": "\n", "signature": "(\tx,\tscale,\torg_smooth_scale,\tsmooth_scales,\tindices,\ttoken_count_per_expert,\tsplits,\tx_q=None,\tx_scale=None,\tround_scale=False):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.smooth_weighted_permute_with_indices_kernel", "modulename": "linghe.utils.gather", "qualname": "smooth_weighted_permute_with_indices_kernel", "kind": "function", "doc": "\n", "signature": "(\tgrads_ptr,\ttokens_ptr,\tq_ptr,\tss_ptr,\tqs_ptr,\tcount_ptr,\taccum_ptr,\tindex_ptr,\tsum_ptr,\tM,\tN: int,\tREVERSE: int,\tROUND: int):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.triton_smooth_weighted_permute_with_indices", "modulename": "linghe.utils.gather", "qualname": "triton_smooth_weighted_permute_with_indices", "kind": "function", "doc": "\n", "signature": "(\tgrads,\ttokens,\tsmooth_scales,\ttoken_count_per_expert,\tindices,\tx_q=None,\tx_scale=None,\tx_sum=None,\treverse=False,\tround_scale=False):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.smooth_permute_with_indices_kernel", "modulename": "linghe.utils.gather", "qualname": "smooth_permute_with_indices_kernel", "kind": "function", "doc": "\n", "signature": "(\tgrads_data_ptr,\tgrads_scale_ptr,\tq_ptr,\tss_ptr,\tqs_ptr,\tcount_ptr,\taccum_ptr,\tindex_ptr,\tN: int,\ths: int,\tREVERSE: int,\tROUND: int,\tGROUP: int):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.triton_smooth_permute_with_indices", "modulename": "linghe.utils.gather", "qualname": "triton_smooth_permute_with_indices", "kind": "function", "doc": "\n", "signature": "(\tgrad_data,\tgrad_scale,\tsmooth_scales,\ttoken_count_per_expert,\tindices,\tx_q=None,\tx_scale=None,\treverse=False,\tround_scale=False):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.smooth_permute_with_mask_map_kernel", "modulename": "linghe.utils.gather", "qualname": "smooth_permute_with_mask_map_kernel", "kind": "function", "doc": "\n", "signature": "(\tgrads_data_ptr,\tquant_data_ptr,\tmask_map_ptr,\tgrads_scale_ptr,\tsmooth_scale_ptr,\tquant_scale_ptr,\tM,\tT,\tN: int,\ths: int,\tREVERSE: int,\tROUND: int):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.triton_smooth_permute_with_mask_map", "modulename": "linghe.utils.gather", "qualname": "triton_smooth_permute_with_mask_map", "kind": "function", "doc": "\n", "signature": "(\tinp: torch.Tensor,\trow_id_map: torch.Tensor,\tscale: torch.Tensor,\tnum_tokens: int,\tnum_experts: int,\tnum_out_tokens: int,\thidden_size: int,\tsmooth_scales: torch.Tensor,\treverse=True,\tround_scale=False):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.deprecated_smooth_permute_with_mask_map_kernel", "modulename": "linghe.utils.gather", "qualname": "deprecated_smooth_permute_with_mask_map_kernel", "kind": "function", "doc": "\n", "signature": "(\tgrads_data_ptr,\tquant_data_ptr,\tmask_map_ptr,\tsmooth_scale_ptr,\tquant_scale_ptr,\tM,\tT,\tN: int,\tREVERSE: int,\tROUND: int):", "funcdef": "def"}, {"fullname": "linghe.utils.gather.triton_deprecated_smooth_permute_with_mask_map", "modulename": "linghe.utils.gather", "qualname": "triton_deprecated_smooth_permute_with_mask_map", "kind": "function", "doc": "\n", "signature": "(\tinp: torch.Tensor,\trow_id_map: torch.Tensor,\tnum_tokens: int,\tnum_experts: int,\tnum_out_tokens: int,\thidden_size: int,\tsmooth_scales: torch.Tensor,\treverse=True,\tround_scale=False):", 
"funcdef": "def"}, {"fullname": "linghe.utils.loss", "modulename": "linghe.utils.loss", "kind": "module", "doc": "Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.utils.loss.softmax_cross_entropy_forward_kernel", "modulename": "linghe.utils.loss", "qualname": "softmax_cross_entropy_forward_kernel", "kind": "function", "doc": "\n", "signature": "(\tlogit_ptr,\tlabel_ptr,\tloss_ptr,\tsum_exp_ptr,\tmax_logit_ptr,\tN,\tB: int):", "funcdef": "def"}, {"fullname": "linghe.utils.loss.triton_softmax_cross_entropy_forward", "modulename": "linghe.utils.loss", "qualname": "triton_softmax_cross_entropy_forward", "kind": "function", "doc": "\n", "signature": "(logits, labels):", "funcdef": "def"}, {"fullname": "linghe.utils.loss.softmax_cross_entropy_backward_kernel", "modulename": "linghe.utils.loss", "qualname": "softmax_cross_entropy_backward_kernel", "kind": "function", "doc": "\n", "signature": "(\tlogit_ptr,\tlabel_ptr,\tsum_exp_ptr,\tmax_logit_ptr,\tinput_grad_ptr,\toutput_grad_ptr,\tN,\tB: int):", "funcdef": "def"}, {"fullname": "linghe.utils.loss.triton_softmax_cross_entropy_backward", "modulename": "linghe.utils.loss", "qualname": "triton_softmax_cross_entropy_backward", "kind": "function", "doc": "\n", "signature": "(logits, labels, sum_exp, max_logit, input_grad, output_grad=None):", "funcdef": "def"}, {"fullname": "linghe.utils.norm", "modulename": "linghe.utils.norm", "kind": "module", "doc": "\n"}, {"fullname": "linghe.utils.norm.rms_norm_forward_kernel", "modulename": "linghe.utils.norm", "qualname": "rms_norm_forward_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, weight_ptr, out_ptr, eps, M, T, N: int, W: int):", "funcdef": "def"}, {"fullname": "linghe.utils.norm.triton_rms_norm_forward", "modulename": "linghe.utils.norm", "qualname": "triton_rms_norm_forward", "kind": "function", "doc": "\n", "signature": "(x, weight, eps=1e-06, out=None):", "funcdef": "def"}, {"fullname": "linghe.utils.norm.rms_norm_backward_kernel", "modulename": "linghe.utils.norm", "qualname": "rms_norm_backward_kernel", "kind": "function", "doc": "\n", "signature": "(\tgrad_output_ptr,\tx_ptr,\tw_ptr,\tdx_ptr,\tdw_ptr,\teps,\tM,\tT,\tN: int,\tW: int):", "funcdef": "def"}, {"fullname": "linghe.utils.norm.triton_rms_norm_backward", "modulename": "linghe.utils.norm", "qualname": "triton_rms_norm_backward", "kind": "function", "doc": "\n", "signature": "(grad_output, x, w, eps=1e-06):", "funcdef": "def"}, {"fullname": "linghe.utils.norm.rms_norm_and_block_quant_forward_kernel", "modulename": "linghe.utils.norm", "qualname": "rms_norm_and_block_quant_forward_kernel", "kind": "function", "doc": "\n", "signature": "(\tx_ptr,\tweight_ptr,\tout_ptr,\tscale_ptr,\ttranspose_output_ptr,\ttranspose_scale_ptr,\trms_ptr,\teps,\tM,\tT: int,\tN: int,\tnb: int,\tW: int,\tH: int,\tROUND: int):", "funcdef": "def"}, {"fullname": "linghe.utils.norm.rms_norm_and_block_quant_forward_n_kernel", "modulename": "linghe.utils.norm", "qualname": "rms_norm_and_block_quant_forward_n_kernel", "kind": "function", "doc": "\n", "signature": "(\tx_ptr,\tweight_ptr,\tout_ptr,\tscale_ptr,\trms_ptr,\teps,\tM: int,\tT: int,\tN: int,\tnb: int,\tW: int,\tROUND: int):", "funcdef": "def"}, {"fullname": "linghe.utils.norm.rms_norm_and_block_quant_forward_t_kernel", "modulename": "linghe.utils.norm", "qualname": "rms_norm_and_block_quant_forward_t_kernel", "kind": "function", "doc": "\n", "signature": "(\tx_ptr,\tweight_ptr,\ttranspose_output_ptr,\ttranspose_scale_ptr,\trms_ptr,\tM,\tN,\tW: int,\tROUND: int):", "funcdef": "def"}, {"fullname": "linghe.utils.norm.triton_rms_norm_and_block_quant_forward", "modulename": "linghe.utils.norm", "qualname": 
"triton_rms_norm_and_block_quant_forward", "kind": "function", "doc": "Fused RMSNorm forward and block quantization.\nArgs:\n x: Input tensor, shape [M, N]\n weight: RMSNorm weight, shape [N]\n eps: epsilon value for L2 normalization.\n out: output of quantization data\n scale: output of quantization scale.\n rms: output of rms\n round_scale: Set whether to force power of 2 scales.\n output_mode: one of {0, 1, 2}.\n 0: only output non-transpose tensor\n 1: only output transposed tensor\n 2: return both\nReturns:\n out: quantization data\n scale: quantization scale\n rms: Reciprocal of the root mean square of the input calculated over the last dimension.\n transpose_output: quantization data of transposed gradient\n transpose_scale: quantization scale of transposed gradient
\n", "signature": "(\tx: torch.Tensor,\tweight: torch.Tensor,\teps: float = 1e-06,\tout: Optional[torch.Tensor] = None,\tscale: Optional[torch.Tensor] = None,\trms: Optional[torch.Tensor] = None,\tround_scale: bool = False,\toutput_mode: int = 2):", "funcdef": "def"}, {"fullname": "linghe.utils.norm.group_norm_gate_forward_kernel", "modulename": "linghe.utils.norm", "qualname": "group_norm_gate_forward_kernel", "kind": "function", "doc": "\n", "signature": "(\tx_ptr,\tgate_ptr,\tweight_ptr,\tout_ptr,\teps,\tbs,\tlength,\tDIM: int,\tD: int,\tGROUP_SIZE: int):", "funcdef": "def"}, {"fullname": "linghe.utils.norm.triton_group_norm_gate_forward", "modulename": "linghe.utils.norm", "qualname": "triton_group_norm_gate_forward", "kind": "function", "doc": "norm and gate in linear attention\nArgs:\n x:\n gate:\n weight:\n eps:\n group_size:
\n\nReturns:
\n", "signature": "(x: torch.Tensor, gate, weight, eps=1e-06, group_size=4):", "funcdef": "def"}, {"fullname": "linghe.utils.norm.group_rms_gate_backward_kernel", "modulename": "linghe.utils.norm", "qualname": "group_rms_gate_backward_kernel", "kind": "function", "doc": "\n", "signature": "(\tgrad_output_ptr,\tx_ptr,\tgate_ptr,\tw_ptr,\tdx_ptr,\tdg_ptr,\tdw_ptr,\teps,\tbs,\tlength,\tDIM: int,\tD: int,\tGROUP_SIZE: int,\tT: int):", "funcdef": "def"}, {"fullname": "linghe.utils.norm.triton_group_norm_gate_backward", "modulename": "linghe.utils.norm", "qualname": "triton_group_norm_gate_backward", "kind": "function", "doc": "\n", "signature": "(grad_output, x, gate, weight, eps=1e-06, group_size=4):", "funcdef": "def"}, {"fullname": "linghe.utils.rearange", "modulename": "linghe.utils.rearange", "kind": "module", "doc": "Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.utils.rearange.split_and_cat_kernel", "modulename": "linghe.utils.rearange", "qualname": "split_and_cat_kernel", "kind": "function", "doc": "\n", "signature": "(\tx_ptr,\ty_ptr,\tscale_ptr,\tscale_output_ptr,\tcount_ptr,\taccum_ptr,\trev_accum_ptr,\tindex_ptr,\tM,\tN: int,\tSCALE: int,\tK: int):", "funcdef": "def"}, {"fullname": "linghe.utils.rearange.triton_split_and_cat", "modulename": "linghe.utils.rearange", "qualname": "triton_split_and_cat", "kind": "function", "doc": "\n", "signature": "(x, counts, indices, scales=None):", "funcdef": "def"}, {"fullname": "linghe.utils.reduce", "modulename": "linghe.utils.reduce", "kind": "module", "doc": "Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.utils.reduce.abs_max_kernel", "modulename": "linghe.utils.reduce", "qualname": "abs_max_kernel", "kind": "function", "doc": "\n", "signature": "(\tx_ptr,\tscale_ptr,\tsmooth_scale_ptr,\toutput_ptr,\tmin_value,\tM,\tN,\tH: int,\tW: int,\tEVEN: int,\tQUANTIZED: int):", "funcdef": "def"}, {"fullname": "linghe.utils.reduce.triton_abs_max", "modulename": "linghe.utils.reduce", "qualname": "triton_abs_max", "kind": "function", "doc": "\n", "signature": "(x, scale=None, smooth_scale=None, min_value=1e-30, axis=0):", "funcdef": "def"}, {"fullname": "linghe.utils.reduce.batch_count_zero_kernel", "modulename": "linghe.utils.reduce", "qualname": "batch_count_zero_kernel", "kind": "function", "doc": "\n", "signature": "(input_ptrs, size_ptr, count_ptr, B: int):", "funcdef": "def"}, {"fullname": "linghe.utils.reduce.triton_batch_count_zero", "modulename": "linghe.utils.reduce", "qualname": "triton_batch_count_zero", "kind": "function", "doc": "\n", "signature": "(xs):", "funcdef": "def"}, {"fullname": "linghe.utils.reduce.batch_sum_with_ord_kernel", "modulename": "linghe.utils.reduce", "qualname": "batch_sum_with_ord_kernel", "kind": "function", "doc": "\n", "signature": "(input_ptrs, size_ptr, count_ptr, B: int, ORD: int):", "funcdef": "def"}, {"fullname": "linghe.utils.reduce.triton_batch_sum_with_ord", "modulename": "linghe.utils.reduce", "qualname": "triton_batch_sum_with_ord", "kind": "function", "doc": "\n", "signature": "(xs, ord=2):", "funcdef": "def"}, {"fullname": "linghe.utils.rope", "modulename": "linghe.utils.rope", "kind": "module", "doc": "Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.utils.rope.half_rope_forward_kernel", "modulename": "linghe.utils.rope", "qualname": "half_rope_forward_kernel", "kind": "function", "doc": "\n", "signature": "(\tq_ptr,\tk_ptr,\tfreqs_ptr,\tqo_ptr,\tko_ptr,\tB,\tq_stride,\tk_stride,\tH: int,\th: int,\tD: int,\td: int):", "funcdef": "def"}, {"fullname": "linghe.utils.rope.triton_half_rope_forward", "modulename": "linghe.utils.rope", "qualname": "triton_half_rope_forward", "kind": "function", "doc": "\n", "signature": "(q, k, freqs):", "funcdef": "def"}, {"fullname": "linghe.utils.rope.half_rope_backward_kernel", "modulename": "linghe.utils.rope", "qualname": "half_rope_backward_kernel", "kind": "function", "doc": "\n", "signature": "(q_ptr, k_ptr, freqs_ptr, B, H: int, h: int, D: int, d: int):", "funcdef": "def"}, {"fullname": "linghe.utils.rope.triton_half_rope_backward", "modulename": "linghe.utils.rope", "qualname": "triton_half_rope_backward", "kind": "function", "doc": "\n", "signature": "(q_grad, k_grad, freqs, inplace=False):", "funcdef": "def"}, {"fullname": "linghe.utils.rope.qk_norm_and_half_rope_forward_kernel", "modulename": "linghe.utils.rope", "qualname": "qk_norm_and_half_rope_forward_kernel", "kind": "function", "doc": "\n", "signature": "(\tqkv_ptr,\tq_norm_weight_ptr,\tk_norm_weight_ptr,\tfreqs_ptr,\tqo_ptr,\tko_ptr,\tvo_ptr,\tB,\tstride,\teps,\tH: int,\th: int,\tD: int,\td: int,\tinterleave: int):", "funcdef": "def"}, {"fullname": "linghe.utils.rope.triton_qk_norm_and_half_rope_forward", "modulename": "linghe.utils.rope", "qualname": "triton_qk_norm_and_half_rope_forward", "kind": "function", "doc": "\n", "signature": "(\tqkv,\tq_norm_weight,\tk_norm_weight,\tfreqs,\tH=32,\th=4,\teps=1e-06,\tinterleave=True,\ttranspose=False):", "funcdef": "def"}, {"fullname": "linghe.utils.rope.qk_norm_and_half_rope_backward_kernel", "modulename": "linghe.utils.rope", "qualname": "qk_norm_and_half_rope_backward_kernel", "kind": "function", "doc": "\n", "signature": "(\tgq_ptr,\tgk_ptr,\tgv_ptr,\tqkv_ptr,\tq_norm_weight_ptr,\tk_norm_weight_ptr,\tfreqs_ptr,\tdqkv_ptr,\tdqw_ptr,\tdkw_ptr,\tB,\tstride,\teps,\tH: int,\th: int,\tD: int,\td: int,\tinterleave: int):", "funcdef": "def"}, {"fullname": "linghe.utils.rope.triton_qk_norm_and_half_rope_backward", "modulename": "linghe.utils.rope", "qualname": "triton_qk_norm_and_half_rope_backward", "kind": "function", "doc": "\n", "signature": "(\tgq,\tgk,\tgv,\tqkv,\tq_norm_weight,\tk_norm_weight,\tfreqs,\teps=1e-06,\ttranspose=False,\tinterleave=True):", "funcdef": "def"}, {"fullname": "linghe.utils.scatter", "modulename": "linghe.utils.scatter", "kind": "module", "doc": "Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.utils.scatter.aligned_scatter_add_kernel", "modulename": "linghe.utils.scatter", "qualname": "aligned_scatter_add_kernel", "kind": "function", "doc": "\n", "signature": "(\tx_ptr,\to_ptr,\tindices_ptr,\tweights_ptr,\tM,\tN: int,\tK: int,\tSCALE: int):", "funcdef": "def"}, {"fullname": "linghe.utils.scatter.triton_aligned_scatter_add", "modulename": "linghe.utils.scatter", "qualname": "triton_aligned_scatter_add", "kind": "function", "doc": "\n", "signature": "(x, outputs, indices, weights=None):", "funcdef": "def"}, {"fullname": "linghe.utils.scatter.scatter_add_kernel", "modulename": "linghe.utils.scatter", "qualname": "scatter_add_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, o_ptr, indices_ptr, M, T, N: int):", "funcdef": "def"}, {"fullname": "linghe.utils.scatter.fp32_to_bf16_kernel", "modulename": "linghe.utils.scatter", "qualname": "fp32_to_bf16_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, o_ptr, M, T, N: int):", "funcdef": "def"}, {"fullname": "linghe.utils.scatter.triton_scatter_add", "modulename": "linghe.utils.scatter", "qualname": "triton_scatter_add", "kind": "function", "doc": "\n", "signature": "(x, outputs, indices):", "funcdef": "def"}, {"fullname": "linghe.utils.scatter.unpermute_with_mask_map_kernel", "modulename": "linghe.utils.scatter", "qualname": "unpermute_with_mask_map_kernel", "kind": "function", "doc": "\n", "signature": "(\tgrads_ptr,\tprobs_ptr,\tmask_map_ptr,\toutput_ptr,\toutput_probs_ptr,\tnum_experts: int,\tN: int,\tPROB: int):", "funcdef": "def"}, {"fullname": "linghe.utils.scatter.triton_unpermute_with_mask_map", "modulename": "linghe.utils.scatter", "qualname": "triton_unpermute_with_mask_map", "kind": "function", "doc": "\n", "signature": "(grad: torch.Tensor, row_id_map: torch.Tensor, probs: torch.Tensor):", "funcdef": "def"}, {"fullname": "linghe.utils.silu", "modulename": "linghe.utils.silu", "kind": "module", "doc": "Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.utils.silu.silu_and_block_quant_forward_kernel", "modulename": "linghe.utils.silu", "qualname": "silu_and_block_quant_forward_kernel", "kind": "function", "doc": "\n", "signature": "(\tx_ptr,\tout_ptr,\tscale_ptr,\ttranspose_output_ptr,\ttranspose_scale_ptr,\tM,\tn: int,\tROUND: int,\tOUTPUT_MODE: int):", "funcdef": "def"}, {"fullname": "linghe.utils.silu.triton_silu_and_block_quant_forward", "modulename": "linghe.utils.silu", "qualname": "triton_silu_and_block_quant_forward", "kind": "function", "doc": "\n", "signature": "(x, out=None, scale=None, round_scale=False, output_mode=2):", "funcdef": "def"}, {"fullname": "linghe.utils.silu.silu_and_block_quant_backward_kernel", "modulename": "linghe.utils.silu", "qualname": "silu_and_block_quant_backward_kernel", "kind": "function", "doc": "\n", "signature": "(\tg_ptr,\tx_ptr,\tdx_ptr,\tdx_scale_ptr,\ttranspose_dx_ptr,\ttranspose_dx_scale_ptr,\tM,\tn: int,\tROUND: int):", "funcdef": "def"}, {"fullname": "linghe.utils.silu.triton_silu_and_block_quant_backward", "modulename": "linghe.utils.silu", "qualname": "triton_silu_and_block_quant_backward", "kind": "function", "doc": "\n", "signature": "(g, x, round_scale=False):", "funcdef": "def"}, {"fullname": "linghe.utils.silu.batch_weighted_silu_and_block_quant_forward_kernel", "modulename": "linghe.utils.silu", "qualname": "batch_weighted_silu_and_block_quant_forward_kernel", "kind": "function", "doc": "\n", "signature": "(\tx_ptr,\tweight_ptr,\tout_ptr,\tscale_ptr,\ttranspose_output_ptr,\ttranspose_scale_ptr,\tcount_ptr,\taccum_ptr,\tn: int,\tE: int,\tROUND: int,\tOUTPUT_MODE: int):", "funcdef": "def"}, {"fullname": "linghe.utils.silu.triton_batch_weighted_silu_and_block_quant_forward", "modulename": "linghe.utils.silu", "qualname": "triton_batch_weighted_silu_and_block_quant_forward", "kind": "function", "doc": "\n", "signature": "(\tx,\tweight,\tcounts,\tsplits=None,\tout=None,\tscale=None,\tround_scale=False,\toutput_mode=2):", "funcdef": "def"}, {"fullname": "linghe.utils.silu.batch_weighted_silu_and_block_quant_backward_kernel", "modulename": "linghe.utils.silu", "qualname": "batch_weighted_silu_and_block_quant_backward_kernel", "kind": "function", "doc": "\n", "signature": "(\tg_ptr,\tx_ptr,\tweight_ptr,\tcount_ptr,\taccum_ptr,\tdx_ptr,\tdx_scale_ptr,\ttranspose_dx_ptr,\ttranspose_dx_scale_ptr,\tdw_ptr,\tn: int,\tE: int,\tROUND: int):", "funcdef": "def"}, {"fullname": "linghe.utils.silu.triton_batch_weighted_silu_and_block_quant_backward", "modulename": "linghe.utils.silu", "qualname": "triton_batch_weighted_silu_and_block_quant_backward", "kind": "function", "doc": "\n", "signature": "(g, x, weight, counts, splits=None, round_scale=False):", "funcdef": "def"}, {"fullname": "linghe.utils.transpose", "modulename": "linghe.utils.transpose", "kind": "module", "doc": "Copyright (c) Ant Financial Service Group and its affiliates.
\n"}, {"fullname": "linghe.utils.transpose.deprecated_transpose_kernel", "modulename": "linghe.utils.transpose", "qualname": "deprecated_transpose_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, t_ptr, M, N, H: int, W: int, EVEN: int):", "funcdef": "def"}, {"fullname": "linghe.utils.transpose.triton_depracated_transpose", "modulename": "linghe.utils.transpose", "qualname": "triton_depracated_transpose", "kind": "function", "doc": "\n", "signature": "(x):", "funcdef": "def"}, {"fullname": "linghe.utils.transpose.transpose_kernel", "modulename": "linghe.utils.transpose", "qualname": "transpose_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, t_ptr, M, N, H: int, W: int, EVEN: int):", "funcdef": "def"}, {"fullname": "linghe.utils.transpose.transpose_dim_0_1_kernel", "modulename": "linghe.utils.transpose", "qualname": "transpose_dim_0_1_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, t_ptr, B, M, b_stride, m_stride, N: int):", "funcdef": "def"}, {"fullname": "linghe.utils.transpose.triton_transpose", "modulename": "linghe.utils.transpose", "qualname": "triton_transpose", "kind": "function", "doc": "\n", "signature": "(x, dim0=None, dim1=None):", "funcdef": "def"}, {"fullname": "linghe.utils.transpose.transpose_and_pad_kernel", "modulename": "linghe.utils.transpose", "qualname": "transpose_and_pad_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, t_ptr, M, N, P, H: int, W: int, EVEN: int):", "funcdef": "def"}, {"fullname": "linghe.utils.transpose.triton_transpose_and_pad", "modulename": "linghe.utils.transpose", "qualname": "triton_transpose_and_pad", "kind": "function", "doc": "\n", "signature": "(x, out=None, pad=True):", "funcdef": "def"}, {"fullname": "linghe.utils.transpose.batch_transpose_kernel", "modulename": "linghe.utils.transpose", "qualname": "batch_transpose_kernel", "kind": "function", "doc": "\n", "signature": "(xs_ptr, xts_ptr, M, N, H: int, W: int):", "funcdef": "def"}, {"fullname": "linghe.utils.transpose.triton_batch_transpose", "modulename": "linghe.utils.transpose", "qualname": "triton_batch_transpose", "kind": "function", "doc": "\n", "signature": "(xs, xts=None):", "funcdef": "def"}, {"fullname": "linghe.utils.transpose.batch_transpose_and_pad_kernel", "modulename": "linghe.utils.transpose", "qualname": "batch_transpose_and_pad_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, t_ptr, count_ptr, accum_ptr, pad_accum_ptr, N, H: int, W: int):", "funcdef": "def"}, {"fullname": "linghe.utils.transpose.triton_batch_transpose_and_pad", "modulename": "linghe.utils.transpose", "qualname": "triton_batch_transpose_and_pad", "kind": "function", "doc": "\n", "signature": "(x, count_list, x_t=None, pad=True):", "funcdef": "def"}, {"fullname": "linghe.utils.transpose.configs", "modulename": "linghe.utils.transpose", "qualname": "configs", "kind": "variable", "doc": "\n", "default_value": "[<triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, 
<triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, 
<triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>, <triton.Config object>]"}, {"fullname": "linghe.utils.transpose.opt_transpose_kernel", "modulename": "linghe.utils.transpose", "qualname": "opt_transpose_kernel", "kind": "function", "doc": "\n", "signature": "(x_ptr, t_ptr, M, N, D, H: int, W: int):", "funcdef": "def"}, {"fullname": "linghe.utils.transpose.triton_opt_transpose", "modulename": "linghe.utils.transpose", "qualname": "triton_opt_transpose", "kind": "function", "doc": "\n", "signature": "(x):", "funcdef": "def"}]; + + // mirrored in build-search-index.js (part 1) + // Also split on html tags. this is a cheap heuristic, but good enough. + elasticlunr.tokenizer.setSeperator(/[\s\-.;&_'"=,()]+|<[^>]*>/); + + let searchIndex; + if (docs._isPrebuiltIndex) { + console.info("using precompiled search index"); + searchIndex = elasticlunr.Index.load(docs); + } else { + console.time("building search index"); + // mirrored in build-search-index.js (part 2) + searchIndex = elasticlunr(function () { + this.pipeline.remove(elasticlunr.stemmer); + this.pipeline.remove(elasticlunr.stopWordFilter); + this.addField("qualname"); + this.addField("fullname"); + this.addField("annotation"); + this.addField("default_value"); + this.addField("signature"); + this.addField("bases"); + this.addField("doc"); + this.setRef("fullname"); + }); + for (let doc of docs) { + searchIndex.addDoc(doc); + } + console.timeEnd("building search index"); + } + + return (term) => searchIndex.search(term, { + fields: { + qualname: {boost: 4}, + fullname: {boost: 2}, + annotation: {boost: 2}, + default_value: {boost: 2}, + signature: {boost: 2}, + bases: {boost: 2}, + doc: {boost: 1}, + }, + expand: true + }); +})(); \ No newline at end of file diff --git a/linghe/__init__.py b/linghe/__init__.py index e69de29..8b13789 100644 --- a/linghe/__init__.py +++ b/linghe/__init__.py @@ -0,0 +1 @@ + diff --git a/linghe/utils/add.py b/linghe/utils/add.py index 907c852..18201a6 100644 --- a/linghe/utils/add.py +++ b/linghe/utils/add.py @@ -3,6 +3,8 @@ Copyright (c) Ant Financial Service Group and its affiliates. 
""" +import torch +from typing import Iterable, Optional, Tuple import triton import triton.language as tl @@ -43,7 +45,16 @@ def inplace_add_kernel(x_ptr, y_ptr, M, N, H: tl.constexpr, W: tl.constexpr, rid * H + tl.arange(0, H)[None, :] < M)) -def triton_inplace_add(x, y, accum=True): +def triton_inplace_add(x: torch.Tensor, y: torch.Tensor, accum : bool = True): + """ + inplace add y to x + Args: + x: Tensor + y: Tensor + accum: whether accum y to x + + Returns: x += y if accum=True else x.copy_(y) + """ N = x.shape[-1] M = x.numel() // N # M, N = x.shape @@ -64,63 +75,3 @@ def triton_inplace_add(x, y, accum=True): num_warps=num_warps ) return x - - -@triton.jit -def block_add_kernel(x_ptr, y_ptr, M, N, H: tl.constexpr, W: tl.constexpr, - EVEN: tl.constexpr, ACCUM: tl.constexpr): - rid = tl.program_id(axis=0) - cid = tl.program_id(axis=1) - offs = rid * H * N + cid * W + tl.arange(0, H)[:, None] * N + tl.arange(0, - W)[ - None, :] - if ACCUM: - if EVEN: - x = tl.load(x_ptr + offs) - y = tl.load(y_ptr + offs).to(tl.float32) - tl.store(x_ptr + offs, x + y) - else: - x = tl.load(x_ptr + offs, - mask=(cid * W + tl.arange(0, W)[None, :] < N) & ( - rid * H + tl.arange(0, H)[:, None] < M)) - y = tl.load(y_ptr + offs, - mask=(cid * W + tl.arange(0, W)[None, :] < N) & ( - rid * H + tl.arange(0, H)[:, None] < M)) - tl.store(x_ptr + offs, x + y, - mask=(cid * W + tl.arange(0, W)[:, None] < N) & ( - rid * H + tl.arange(0, H)[None, :] < M)) - else: - if EVEN: - y = tl.load(y_ptr + offs).to(tl.float32) - tl.store(x_ptr + offs, y) - else: - y = tl.load(y_ptr + offs, - mask=(cid * W + tl.arange(0, W)[None, :] < N) & ( - rid * H + tl.arange(0, H)[:, None] < M)) - tl.store(x_ptr + offs, y, - mask=(cid * W + tl.arange(0, W)[:, None] < N) & ( - rid * H + tl.arange(0, H)[None, :] < M)) - - -def triton_block_add(x, y, accum=True): - shape = x.shape[-1] - N = shape - M = x.numel() // N - # M, N = x.shape - H = 128 - W = 128 - EVEN = M % H == 0 and N % W == 0 - num_stages = 2 - num_warps = 8 - - grid = (triton.cdiv(M, H), triton.cdiv(N, W)) - block_add_kernel[grid]( - x, y, - M, N, - H, W, - EVEN, - accum, - num_stages=num_stages, - num_warps=num_warps - ) - return x diff --git a/linghe/utils/norm.py b/linghe/utils/norm.py index 085af0a..a627c78 100644 --- a/linghe/utils/norm.py +++ b/linghe/utils/norm.py @@ -2,7 +2,7 @@ import torch import triton import triton.language as tl - +from typing import Optional @triton.jit @@ -256,10 +256,35 @@ def rms_norm_and_block_quant_forward_t_kernel(x_ptr, -def triton_rms_norm_and_block_quant_forward(x, weight, eps=1e-6, - out=None, scale=None, rms=None, - round_scale=False, - output_mode=2): +def triton_rms_norm_and_block_quant_forward(x: torch.Tensor, + weight: torch.Tensor, + eps: float = 1e-6, + out: Optional[torch.Tensor] = None, + scale: Optional[torch.Tensor] = None, + rms: Optional[torch.Tensor] = None, + round_scale: bool = False, + output_mode: int = 2): + """ + Fused RMSNorm forward and block quantization. + Args: + x: Input tensor, shape [M, N] + weight: RMSNorm weight, shape [N] + eps: epsilon value for L2 normalization. + out: output of quantization data + scale: output of quantization scale. + rms: output of rms + round_scale: Set whether to force power of 2 scales. + output_mode: one of {0, 1, 2}. + 0: only output non-transpose tensor + 1: only output transposed tensor + 2: return both + Returns: + out: quantization data + scale: quantization scale + rms: Reciprocal of the root mean square of the input calculated over the last dimension. 
+ transpose_output: quantization data of transposed gradient + transpose_scale: quantization scale of transposed gradient + """ # row-wise read, row-wise write M, N = x.shape assert N <= 8192 and 8192 % N == 0 @@ -375,7 +400,19 @@ def group_norm_gate_forward_kernel(x_ptr, gate_ptr, weight_ptr, out_ptr, eps, bs weight: [dim] output: [length, bs, dim] """ -def triton_group_norm_gate_forward(x, gate, weight, eps=1e-6, group_size=4): +def triton_group_norm_gate_forward(x: torch.Tensor, gate, weight, eps=1e-6, group_size=4): + """ + norm and gate in linear attention + Args: + x: + gate: + weight: + eps: + group_size: + + Returns: + + """ # row-wise read, row-wise write length, bs, dim = gate.shape assert dim <= 8192 and triton.next_power_of_2(dim) == dim and triton.next_power_of_2(group_size) == group_size diff --git a/setup.py b/setup.py index cd094eb..bc21ae2 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ license="MIT", license_files=("LICENSE",), description="LLM traning kernels", - URL="https://code.alipay.com/pia/linghe", + URL="https://github.com/inclusionAI/linghe", packages=find_packages(), install_requires=[], python_requires=">=3.8", From 9aeb62b296215f18005f0ab84f3596557bc64aeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8D=97=E9=9C=84?=
@@ -66,4 +66,4 @@ Examples can be found in tests.
## Api Reference
---
-Please refer to [API doc](asserts/api.md)
\ No newline at end of file
+Please refer to [API](https://inclusionai.github.io/linghe/)
\ No newline at end of file
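For context on the docstring added to linghe/utils/add.py above: a minimal usage sketch of triton_inplace_add, assuming a CUDA device, matching shapes, and an installed linghe package; the tensor shapes and dtypes are illustrative and not taken from the patch.

    import torch
    from linghe.utils.add import triton_inplace_add

    # Illustrative CUDA tensors; the wrapper flattens all leading dims and
    # treats the last dimension as N.
    x = torch.randn(1024, 4096, device="cuda", dtype=torch.float32)
    y = torch.randn(1024, 4096, device="cuda", dtype=torch.float32)

    triton_inplace_add(x, y, accum=True)   # x += y, written back into x in place
    triton_inplace_add(x, y, accum=False)  # x.copy_(y): overwrite x with y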
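Likewise, a hedged sketch of triton_rms_norm_and_block_quant_forward from linghe/utils/norm.py. The five-value unpacking follows the Returns list in the new docstring (output_mode=2 emits both the quantized tensor and its transposed counterpart); the hidden size of 4096 and the bfloat16 dtype are assumptions chosen to satisfy the kernel's assert that N <= 8192 and 8192 % N == 0.

    import torch
    from linghe.utils.norm import triton_rms_norm_and_block_quant_forward

    # Hypothetical activation of shape [M, N]; N = 4096 divides 8192 as required.
    x = torch.randn(2048, 4096, device="cuda", dtype=torch.bfloat16)
    weight = torch.ones(4096, device="cuda", dtype=torch.bfloat16)

    out, scale, rms, t_out, t_scale = triton_rms_norm_and_block_quant_forward(
        x, weight, eps=1e-6, round_scale=False, output_mode=2
    )
    # out/scale: block-quantized data and scales; rms: reciprocal RMS per row;
    # t_out/t_scale: quantized data and scales of the transposed output.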
diff --git a/asserts/linghe.png b/asserts/linghe.png
index fc1f51c173a79586100e2e0480a64f7d212a2ba4..c1778e43919880f41f19e1cb3a1886548b21063c 100644
GIT binary patch
literal 61328