LoongArch64: Fixed snrm2_lsx.S and cnrm2_lsx.S
When the data type is single-precision real or single-precision complex,
converting it to double precision does not prevent overflow (as exposed by the LAPACK tests).
The only robust fix is to follow the approach of the C kernels: find the maximum
absolute value in the array and divide each element by that maximum before
accumulating the sum of squares.
XiWeiGu committed Feb 12, 2025
1 parent 9e75d6b commit 68f9c81
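The fix rests on the standard rescaling identity for the Euclidean norm; with m the largest absolute value in the array:

\[
\|x\|_2 = m \sqrt{\sum_{i=1}^{n} \left(\frac{x_i}{m}\right)^2}, \qquad m = \max_i |x_i| .
\]

Every scaled term (x_i/m)^2 is at most 1, so the single-precision accumulator never exceeds n; the final multiplication by m restores the true norm, which is why both kernels now end with a sqrt followed by a multiply rather than a double-to-float conversion.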
Showing 2 changed files with 98 additions and 54 deletions.
75 changes: 47 additions & 28 deletions kernel/loongarch64/cnrm2_lsx.S
@@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VX4 $vr21
 #define res1 $vr19
 #define res2 $vr20
+#define RCP $f2
+#define VALPHA $vr3
 
 PROLOGUE

@@ -55,10 +57,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     LDINT INCX, 0(INCX)
 #endif
 
-    vxor.v res1, res1, res1
-    vxor.v res2, res2, res2
     bge $r0, N, .L999
     beq $r0, INCX, .L999
+    addi.d $sp, $sp, -32
+    st.d $ra, $sp, 0
+    st.d N, $sp, 8
+    st.d X, $sp, 16
+    st.d INCX, $sp, 24
+    bl camax_k
+    ld.d $ra, $sp, 0
+    ld.d N, $sp, 8
+    ld.d X, $sp, 16
+    ld.d INCX, $sp, 24
+    addi.d $sp, $sp, 32
+
+    frecip.s RCP, $f0
+    vreplvei.w VALPHA, $vr2, 0
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    fcmp.ceq.s $fcc0, $f0, $f19
+    bcnez $fcc0, .L999
     li.d TEMP, 1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
@@ -69,16 +87,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .L10:
-    vld VX0, X, 0 * SIZE
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
-    vld VX0, X, 4 * SIZE
-    vfcvtl.d.s VX3, VX0
-    vfcvth.d.s VX4, VX0
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
     addi.d I, I, -1
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX0, VX0, VALPHA
+    vfmul.s VX1, VX1, VALPHA
+
+    vfmadd.s res1, VX0, VX0, res1
+    vfmadd.s res2, VX1, VX1, res2
+
     addi.d X, X, 8 * SIZE
     blt $r0, I, .L10
     b .L996
@@ -99,10 +116,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
     add.d X, X, INCX
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res1, VX0, VX0, res1
+
     ld.w t1, X, 0 * SIZE
     ld.w t2, X, 1 * SIZE
     add.d X, X, INCX
@@ -113,19 +129,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
     add.d X, X, INCX
-    vfcvtl.d.s VX3, VX0
-    vfcvth.d.s VX4, VX0
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res2, VX0, VX0, res2
+
     addi.d I, I, -1
     blt $r0, I, .L21
     b .L996
     .align 3
 
 .L996:
-    vfadd.d res1, res1, res2
-    vreplvei.d VX1, res1, 1
-    vfadd.d res1, VX1, res1
+    vfadd.s res1, res1, res2
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
     .align 3
 
 .L997:
@@ -137,18 +156,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     fld.s a1, X, 0 * SIZE
     fld.s a2, X, 1 * SIZE
     addi.d I, I, -1
-    fcvt.d.s a1, a1
-    fcvt.d.s a2, a2
-    fmadd.d res, a1, a1, res
-    fmadd.d res, a2, a2, res
+    fmul.s a1, a1, RCP
+    fmul.s a2, a2, RCP
+    fmadd.s res, a1, a1, res
+    fmadd.s res, a2, a2, res
     add.d X, X, INCX
     blt $r0, I, .L998
     .align 3
 
 .L999:
-    fsqrt.d res, res
+    fsqrt.s res, res
+    fmul.s $f0, res, $f0
     move $r4, $r17
-    fcvt.s.d $f0, $f19
     jirl $r0, $r1, 0x0
     .align 3

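To make the new control flow easier to follow, here is a minimal C sketch of what the rewritten cnrm2 kernel computes, assuming unit stride and interleaved (re, im) storage. The function name is illustrative, and the scale is taken here as the largest absolute component, which may differ in detail from camax_k's exact semantics:

#include <math.h>
#include <stddef.h>

/* Minimal sketch of the rewritten cnrm2 flow (illustrative names). */
static float cnrm2_scaled(size_t n, const float *x)
{
    size_t i;
    float m = 0.0f, rcp, sum = 0.0f;

    /* Pass 1: find the scale, as the bl camax_k call does. */
    for (i = 0; i < 2 * n; i++) {
        float a = fabsf(x[i]);
        if (a > m) m = a;
    }
    if (m == 0.0f)              /* mirrors the fcmp.ceq.s + bcnez early exit */
        return 0.0f;

    rcp = 1.0f / m;             /* mirrors frecip.s RCP, $f0 */

    /* Pass 2: accumulate squares of the scaled elements; every term is
     * at most 1, so the single-precision accumulator cannot overflow. */
    for (i = 0; i < 2 * n; i++) {
        float s = x[i] * rcp;   /* vfmul.s VX0, VX0, VALPHA */
        sum += s * s;           /* vfmadd.s res1, VX0, VX0, res1 */
    }

    return m * sqrtf(sum);      /* fsqrt.s followed by fmul.s $f0, res, $f0 */
}

The zero check matters: without it, an all-zero input would compute 1/0 and scale by infinity, whereas the early branch returns 0 directly.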
77 changes: 51 additions & 26 deletions kernel/loongarch64/snrm2_lsx.S
@@ -52,17 +52,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* Don't change following FR unless you know the effects. */
 #define res1 $vr19
 #define res2 $vr20
+#define RCP $f2
+#define VALPHA $vr3
 
+// The optimization for snrm2 cannot simply involve
+// extending the data type from float to double and
+// then summing the squares of the data. LAPACK tests
+// have shown that this approach can still lead to data overflow.
+// Instead, we need to find the maximum absolute value in the entire
+// array and divide each data element by this maximum value before
+// performing the calculation. This approach can avoid overflow (and does not require extending the data type).
+
 PROLOGUE
 
 #ifdef F_INTERFACE
     LDINT N, 0(N)
     LDINT INCX, 0(INCX)
 #endif
-    vxor.v res1, res1, res1
-    vxor.v res2, res2, res2
     bge $r0, N, .L999
     beq $r0, INCX, .L999
+
+    addi.d $sp, $sp, -32
+    st.d $ra, $sp, 0
+    st.d N, $sp, 8
+    st.d X, $sp, 16
+    st.d INCX, $sp, 24
+    bl samax_k
+    ld.d $ra, $sp, 0
+    ld.d N, $sp, 8
+    ld.d X, $sp, 16
+    ld.d INCX, $sp, 24
+    addi.d $sp, $sp, 32
+
+    frecip.s RCP, $f0
+    vreplvei.w VALPHA, $vr2, 0
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    fcmp.ceq.s $fcc0, $f0, $f19
+    bcnez $fcc0, .L999
     li.d TEMP, SIZE
     slli.d INCX, INCX, BASE_SHIFT
     srai.d I, N, 3
@@ -75,14 +102,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vld VX5, X, 4 * SIZE
     addi.d I, I, -1
     addi.d X, X, 8 * SIZE
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfcvtl.d.s VX3, VX5
-    vfcvth.d.s VX4, VX5
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+
+    vfmul.s VX0, VX0, VALPHA
+    vfmul.s VX5, VX5, VALPHA
+
+    vfmadd.s res1, VX0, VX0, res1
+    vfmadd.s res2, VX5, VX5, res2
     blt $r0, I, .L10
     b .L996
     .align 3
@@ -104,10 +129,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t2, 1
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res1, VX0, VX0, res1
+
     ld.w t1, X, 0
     add.d X, X, INCX
     ld.w t2, X, 0
@@ -120,19 +144,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t2, 1
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
-    vfcvtl.d.s VX3, VX0
-    vfcvth.d.s VX4, VX0
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res2, VX0, VX0, res2
     addi.d I, I, -1
     blt $r0, I, .L21
     b .L996
     .align 3
 
 .L996:
-    vfadd.d res1, res1, res2
-    vreplvei.d VX1, res1, 1
-    vfadd.d res1, VX1, res1
+    vfadd.s res1, res1, res2
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
     .align 3
 
 .L997:
@@ -143,16 +168,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L998:
     fld.s $f15, X, 0
     addi.d I, I, -1
-    fcvt.d.s $f15, $f15
-    fmadd.d $f19, $f15, $f15, $f19
+    fmul.s $f15, $f15, RCP
+    fmadd.s $f19, $f15, $f15, $f19
     add.d X, X, INCX
     blt $r0, I, .L998
     .align 3
 
 .L999:
-    fsqrt.d $f19, $f19
+    fsqrt.s $f19, $f19
+    fmul.s $f0, $f19, $f0
     move $r4, $r17
-    fcvt.s.d $f0, $f19
     jirl $r0, $r1, 0x0
     .align 3

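As a quick way to exercise the regime this commit fixes, the following check compares cblas_snrm2 against the analytically known norm for values near FLT_MAX, where a naive single-precision sum of squares overflows. It assumes an OpenBLAS build that includes this commit; the tolerance is arbitrary. Link with -lopenblas -lm:

#include <cblas.h>
#include <float.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    enum { N = 8 };
    float x[N];
    for (int i = 0; i < N; i++)
        x[i] = FLT_MAX / 4.0f;  /* squaring this already overflows float */

    float got  = cblas_snrm2(N, x, 1);
    /* True norm: (FLT_MAX/4) * sqrt(8) ~= 2.4e38, still representable. */
    float want = (FLT_MAX / 4.0f) * sqrtf((float)N);

    printf("snrm2 = %g, expected ~%g\n", got, want);
    return fabsf(got - want) > 1e-3f * want;  /* nonzero exit on mismatch */
}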
