From 67157af26bf60bc4b0431e2b14469ded67238582 Mon Sep 17 00:00:00 2001 From: Frank Wessels Date: Fri, 23 Aug 2024 01:30:40 -0700 Subject: [PATCH] Make ARM SVE code vector length agnostic (#285) * Make ARM SVE code vector length agnostic * Generate correct matrix for code-gen based on actual vector length (for 256 bits and below) * Missing changes in reedsolomon.go * Fix build for testing on amd64 --- _gen/gen-arm-sve.go | 141 +++++++++++ _gen/gen.go | 1 + _gen/go.mod | 2 +- _gen/go.sum | 2 + galois.go | 17 +- galois_amd64_test.go | 2 +- galois_arm64.go | 8 +- galois_arm64_test.go | 4 +- galois_gen_arm64.s | 546 +++++++++++++++++++++++++------------------ galois_test.go | 18 +- options.go | 2 + reedsolomon.go | 10 +- 12 files changed, 499 insertions(+), 254 deletions(-) diff --git a/_gen/gen-arm-sve.go b/_gen/gen-arm-sve.go index 64f37b4..4f0ed0e 100644 --- a/_gen/gen-arm-sve.go +++ b/_gen/gen-arm-sve.go @@ -9,6 +9,7 @@ import ( "log" "os" "regexp" + "slices" "strconv" "strings" @@ -359,3 +360,143 @@ func genArmSve() { fromAvx2ToSve() addEarlyExit("Sve") } + +func assemble(sve string) string { + opcode, err := sve_as.Assemble(sve) + if err != nil { + return fmt.Sprintf(" WORD $0x00000000 // %s", sve) + } else { + return fmt.Sprintf(" WORD $0x%08x // %s", opcode, sve) + } +} + +func addArmSveVectorLength() (addInits []string) { + const filename = "../galois_gen_arm64.s" + asmOut := &bytes.Buffer{} + + file, err := os.Open(filename) + if err != nil { + log.Fatal(err) + } + defer file.Close() + + // Create a scanner to read the file line by line + scanner := bufio.NewScanner(file) + + routine := "" + addInits = make([]string, 0) + + // Iterate over each line + for scanner.Scan() { + line := scanner.Text() + + if strings.HasPrefix(line, "TEXT ·") { + routine = line + } + + correctShift := func(shift, vl string) { + if strings.Contains(line, " // lsr ") && strings.HasSuffix(strings.TrimSpace(line), ", "+shift) { + instr := strings.Split(strings.TrimSpace(line), "// lsr ")[1] + 
args := strings.Split(instr, ", ") + if len(args) == 3 && args[0] == args[1] { + // keep the original right shift, but reverse its effect with a left shift (effectively + // clearing out the lower bits so we cannot do e.g. "half loops"), then divide by the rdvl result + line += "\n" + line += assemble(fmt.Sprintf("lsl %s, %s, %s", args[0], args[1], shift)) + "\n" + line += assemble(fmt.Sprintf("rdvl x16, %s", vl)) + "\n" + line += assemble(fmt.Sprintf("udiv %s, %s, x16", args[0], args[1])) + } + } + } + + correctShift("#6", "#2") + correctShift("#5", "#1") + + if strings.Contains(line, " // add ") && strings.HasSuffix(strings.TrimSpace(line), "#64") { + instr := strings.Split(strings.TrimSpace(line), "// add ")[1] + args := strings.Split(instr, ", ") + if len(args) == 3 && args[0] == args[1] { + line = assemble(fmt.Sprintf("addvl %s, %s, #2", args[0], args[1])) + } + } + + if strings.Contains(line, " // add ") && strings.HasSuffix(strings.TrimSpace(line), "#32") { + instr := strings.Split(strings.TrimSpace(line), "// add ")[1] + args := strings.Split(instr, ", ") + if len(args) == 3 && args[0] == args[1] { + line = assemble(fmt.Sprintf("addvl %s, %s, #1", args[0], args[1])) + } + } + + if strings.Contains(line, " // add ") && strings.HasSuffix(strings.TrimSpace(line), "#4") { + // mark routine as needing initialization of register 17 + addInits = append(addInits, routine) + line = assemble("add x15, x15, x17") + } + + asmOut.WriteString(line + "\n") + } + + // Check for any errors that occurred during scanning + if err = scanner.Err(); err != nil { + log.Fatal(err) + } else if err = os.WriteFile(filename, asmOut.Bytes(), 0644); err != nil { + log.Fatal(err) + } + + return +} + +func addArmSveInitializations(addInits []string) { + + const filename = "../galois_gen_arm64.s" + asmOut := &bytes.Buffer{} + + file, err := os.Open(filename) + if err != nil { + log.Fatal(err) + } + defer file.Close() + + // Create a scanner to read the file line by line + scanner := bufio.NewScanner(file) + routine := "" + 
checkNextLine := false + + // Iterate over each line + for scanner.Scan() { + line := scanner.Text() + + if strings.HasPrefix(line, "TEXT ·") { + routine = line + } + + if strings.Contains(line, "// Load number of input shards") { + checkNextLine = true + } else { + if checkNextLine { + needsInit := slices.Contains(addInits, routine) + if needsInit { + line += "\n" + line += assemble("rdvl x17, #1") + "\n" + line += assemble("lsr x17, x17, #3") + } + checkNextLine = false + } + } + + asmOut.WriteString(line + "\n") + } + + // Check for any errors that occurred during scanning + if err = scanner.Err(); err != nil { + log.Fatal(err) + } else if err = os.WriteFile(filename, asmOut.Bytes(), 0644); err != nil { + log.Fatal(err) + } +} + +func genArmSveAllVl() { + addInits := addArmSveVectorLength() + addArmSveInitializations(addInits) +} diff --git a/_gen/gen.go b/_gen/gen.go index 2fa5797..0fb751d 100644 --- a/_gen/gen.go +++ b/_gen/gen.go @@ -93,6 +93,7 @@ func main() { if pshufb { genArmSve() genArmNeon() + genArmSveAllVl() } Generate() } diff --git a/_gen/go.mod b/_gen/go.mod index d1406bf..df9d785 100644 --- a/_gen/go.mod +++ b/_gen/go.mod @@ -9,7 +9,7 @@ require ( require ( github.com/fwessels/avxTwo2sve v0.0.0-20240611172111-6b8528700471 // indirect - github.com/fwessels/sve-as v0.0.0-20240611015707-daffc010447f // indirect + github.com/fwessels/sve-as v0.0.0-20240817192210-83d5dbff9505 // indirect golang.org/x/mod v0.6.0 // indirect golang.org/x/sys v0.1.0 // indirect golang.org/x/tools v0.2.0 // indirect diff --git a/_gen/go.sum b/_gen/go.sum index 4938f10..6da8d85 100644 --- a/_gen/go.sum +++ b/_gen/go.sum @@ -2,6 +2,8 @@ github.com/fwessels/avxTwo2sve v0.0.0-20240611172111-6b8528700471 h1:omdgAKxePZx github.com/fwessels/avxTwo2sve v0.0.0-20240611172111-6b8528700471/go.mod h1:9+ibRsEIs0vLXkalKCGEbZfVS4fafeIvMvM9GvIsdeQ= github.com/fwessels/sve-as v0.0.0-20240611015707-daffc010447f 
h1:HQud3yIU82LdkQzHEYiSJs73wCHjprIqeZE9JvSjKbQ= github.com/fwessels/sve-as v0.0.0-20240611015707-daffc010447f/go.mod h1:j3s7EY79XxNMyjx/54Vo6asZafWU4yijB+KIfj4hrh8= +github.com/fwessels/sve-as v0.0.0-20240817192210-83d5dbff9505 h1:oKLoVXrXDsNNTdNLsSbEu18Vy0Z0b1yeanl5TG4qSyU= +github.com/fwessels/sve-as v0.0.0-20240817192210-83d5dbff9505/go.mod h1:j3s7EY79XxNMyjx/54Vo6asZafWU4yijB+KIfj4hrh8= github.com/klauspost/asmfmt v1.3.1 h1:7xZi1N7s9gTLbqiM8KUv8TLyysavbTRGBT5/ly0bRtw= github.com/klauspost/asmfmt v1.3.1/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE= github.com/mmcloughlin/avo v0.5.1-0.20221128045730-bf1d05562091 h1:C2c8ttOBeyhs1SvyCXVPCFd0EqtPiTKGnMWQ+JkM0Lc= diff --git a/galois.go b/galois.go index 9b36395..bbc521f 100644 --- a/galois.go +++ b/galois.go @@ -910,14 +910,14 @@ func galExp(a byte, n int) byte { return expTable[uint8(logResult)] } -func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte { +func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs, vectorLength int, dst []byte) []byte { if !codeGen { panic("codegen not enabled") } total := inputs * outputs // Duplicated in+out - wantBytes := total * 32 * 2 + wantBytes := total * vectorLength * 2 if cap(dst) < wantBytes { dst = AllocAligned(1, wantBytes)[0] } else { @@ -925,15 +925,16 @@ func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byt } for i, row := range matrixRows[:outputs] { for j, idx := range row[inIdx : inIdx+inputs] { - dstIdx := (j*outputs + i) * 64 + dstIdx := (j*outputs + i) * vectorLength * 2 dstPart := dst[dstIdx:] - dstPart = dstPart[:64] + dstPart = dstPart[:vectorLength*2] lo := mulTableLow[idx][:] hi := mulTableHigh[idx][:] - copy(dstPart[:16], lo) - copy(dstPart[16:32], lo) - copy(dstPart[32:48], hi) - copy(dstPart[48:64], hi) + + for k := 0; k < vectorLength; k += 16 { + copy(dstPart[k:k+16], lo) + copy(dstPart[vectorLength*2-(k+16):vectorLength*2-k], hi) + } } } return dst diff --git 
a/galois_amd64_test.go b/galois_amd64_test.go index 23ed18d..1e053c1 100644 --- a/galois_amd64_test.go +++ b/galois_amd64_test.go @@ -10,6 +10,6 @@ import ( func TestGenGalois(t *testing.T) { if defaultOptions.useAVX2 { - testGenGaloisUpto10x10(t, galMulSlicesAvx2, galMulSlicesAvx2Xor) + testGenGaloisUpto10x10(t, galMulSlicesAvx2, galMulSlicesAvx2Xor, 32) } } diff --git a/galois_arm64.go b/galois_arm64.go index 08f1ae8..e34f39a 100644 --- a/galois_arm64.go +++ b/galois_arm64.go @@ -17,8 +17,12 @@ func getVectorLength() (vl, pl uint64) func init() { if defaultOptions.useSVE { - if vl, _ := getVectorLength(); vl != 256 { - defaultOptions.useSVE = false // Temp fix: disable SVE for non-256 vector widths (ie Graviton4) + if vl, _ := getVectorLength(); vl <= 256 { + // vl is in bits; store the vector length in bytes + defaultOptions.vectorLength = int(vl) >> 3 + } else { + // disable SVE for hardware implementations over 256 bits (only known to be Fujitsu A64FX atm) + defaultOptions.useSVE = false } } } diff --git a/galois_arm64_test.go b/galois_arm64_test.go index 736d46b..a096e8a 100644 --- a/galois_arm64_test.go +++ b/galois_arm64_test.go @@ -11,9 +11,9 @@ import ( func TestGenGalois(t *testing.T) { if defaultOptions.useSVE { - testGenGaloisUpto10x10(t, galMulSlicesSve, galMulSlicesSveXor) + testGenGaloisUpto10x10(t, galMulSlicesSve, galMulSlicesSveXor, defaultOptions.vectorLength) } if defaultOptions.useNEON { - testGenGaloisUpto10x10(t, galMulSlicesNeon, galMulSlicesNeonXor) + testGenGaloisUpto10x10(t, galMulSlicesNeon, galMulSlicesNeonXor, 32) } } diff --git a/galois_gen_arm64.s b/galois_gen_arm64.s index 335b94c..dd974c1 100644 --- a/galois_gen_arm64.s +++ b/galois_gen_arm64.s @@ -13,6 +13,9 @@ TEXT ·mulSve_10x1_64(SB), $0-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x1_64_end MOVD 
in_base+24(FP), R3 @@ -55,7 +58,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs WORD $0x85804026 // ldr z6, [x1] WORD $0x85804425 // ldr z5, [x1, #1, MUL VL] - WORD $0x91010021 // add x1, x1, #64 + WORD $0x04215041 // addvl x1, x1, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -77,7 +80,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 1 to 1 outputs WORD $0x85804086 // ldr z6, [x4] WORD $0x85804485 // ldr z5, [x4, #1, MUL VL] - WORD $0x91010084 // add x4, x4, #64 + WORD $0x04245044 // addvl x4, x4, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -101,7 +104,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 2 to 1 outputs WORD $0x858040a6 // ldr z6, [x5] WORD $0x858044a5 // ldr z5, [x5, #1, MUL VL] - WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04255045 // addvl x5, x5, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -125,7 +128,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 3 to 1 outputs WORD $0x85804106 // ldr z6, [x8] WORD $0x85804505 // ldr z5, [x8, #1, MUL VL] - WORD $0x91010108 // add x8, x8, #64 + WORD $0x04285048 // addvl x8, x8, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -149,7 +152,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 4 to 1 outputs WORD $0x85804126 // ldr z6, [x9] WORD $0x85804525 // ldr z5, [x9, #1, MUL VL] - WORD $0x91010129 // add x9, x9, #64 + WORD $0x04295049 // addvl x9, x9, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -173,7 +176,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 5 to 1 outputs WORD $0x85804146 // ldr z6, [x10] WORD 
$0x85804545 // ldr z5, [x10, #1, MUL VL] - WORD $0x9101014a // add x10, x10, #64 + WORD $0x042a504a // addvl x10, x10, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -197,7 +200,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 6 to 1 outputs WORD $0x85804166 // ldr z6, [x11] WORD $0x85804565 // ldr z5, [x11, #1, MUL VL] - WORD $0x9101016b // add x11, x11, #64 + WORD $0x042b504b // addvl x11, x11, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -221,7 +224,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 7 to 1 outputs WORD $0x85804186 // ldr z6, [x12] WORD $0x85804585 // ldr z5, [x12, #1, MUL VL] - WORD $0x9101018c // add x12, x12, #64 + WORD $0x042c504c // addvl x12, x12, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -245,7 +248,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 8 to 1 outputs WORD $0x858041a6 // ldr z6, [x13] WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL] - WORD $0x910101ad // add x13, x13, #64 + WORD $0x042d504d // addvl x13, x13, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -269,7 +272,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 9 to 1 outputs WORD $0x85804066 // ldr z6, [x3] WORD $0x85804465 // ldr z5, [x3, #1, MUL VL] - WORD $0x91010063 // add x3, x3, #64 + WORD $0x04235043 // addvl x3, x3, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -291,7 +294,7 @@ mulSve_10x1_64_store: // Store 1 outputs WORD $0xe58041c0 // str z0, [x14] WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] - WORD $0x910101ce // add x14, x14, #64 + WORD $0x042e504e // addvl x14, x14, #2 // Prepare for next loop WORD 
$0xf1000400 // subs x0, x0, #1 @@ -309,6 +312,9 @@ TEXT ·mulSve_10x1_64Xor(SB), $0-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x1_64Xor_end MOVD in_base+24(FP), R3 @@ -355,7 +361,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 0 to 1 outputs WORD $0x85804026 // ldr z6, [x1] WORD $0x85804425 // ldr z5, [x1, #1, MUL VL] - WORD $0x91010021 // add x1, x1, #64 + WORD $0x04215041 // addvl x1, x1, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -379,7 +385,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 1 to 1 outputs WORD $0x85804086 // ldr z6, [x4] WORD $0x85804485 // ldr z5, [x4, #1, MUL VL] - WORD $0x91010084 // add x4, x4, #64 + WORD $0x04245044 // addvl x4, x4, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -403,7 +409,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 2 to 1 outputs WORD $0x858040a6 // ldr z6, [x5] WORD $0x858044a5 // ldr z5, [x5, #1, MUL VL] - WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04255045 // addvl x5, x5, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -427,7 +433,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 3 to 1 outputs WORD $0x85804106 // ldr z6, [x8] WORD $0x85804505 // ldr z5, [x8, #1, MUL VL] - WORD $0x91010108 // add x8, x8, #64 + WORD $0x04285048 // addvl x8, x8, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -451,7 +457,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 4 to 1 outputs WORD $0x85804126 // ldr z6, [x9] WORD $0x85804525 
// ldr z5, [x9, #1, MUL VL] - WORD $0x91010129 // add x9, x9, #64 + WORD $0x04295049 // addvl x9, x9, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -475,7 +481,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 5 to 1 outputs WORD $0x85804146 // ldr z6, [x10] WORD $0x85804545 // ldr z5, [x10, #1, MUL VL] - WORD $0x9101014a // add x10, x10, #64 + WORD $0x042a504a // addvl x10, x10, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -499,7 +505,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 6 to 1 outputs WORD $0x85804166 // ldr z6, [x11] WORD $0x85804565 // ldr z5, [x11, #1, MUL VL] - WORD $0x9101016b // add x11, x11, #64 + WORD $0x042b504b // addvl x11, x11, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -523,7 +529,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 7 to 1 outputs WORD $0x85804186 // ldr z6, [x12] WORD $0x85804585 // ldr z5, [x12, #1, MUL VL] - WORD $0x9101018c // add x12, x12, #64 + WORD $0x042c504c // addvl x12, x12, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -547,7 +553,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 8 to 1 outputs WORD $0x858041a6 // ldr z6, [x13] WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL] - WORD $0x910101ad // add x13, x13, #64 + WORD $0x042d504d // addvl x13, x13, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -571,7 +577,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 9 to 1 outputs WORD $0x85804066 // ldr z6, [x3] WORD $0x85804465 // ldr z5, [x3, #1, MUL VL] - WORD $0x91010063 // add x3, x3, #64 + WORD $0x04235043 // addvl x3, x3, #2 WORD 
$0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -593,7 +599,7 @@ mulSve_10x1_64Xor_store: // Store 1 outputs WORD $0xe58041c0 // str z0, [x14] WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] - WORD $0x910101ce // add x14, x14, #64 + WORD $0x042e504e // addvl x14, x14, #2 // Prepare for next loop WORD $0xf1000400 // subs x0, x0, #1 @@ -611,6 +617,9 @@ TEXT ·mulSve_10x2_64(SB), $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x2_64_end MOVD in_base+24(FP), R3 @@ -655,7 +664,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs WORD $0x85804029 // ldr z9, [x1] WORD $0x8580442b // ldr z11, [x1, #1, MUL VL] - WORD $0x91010021 // add x1, x1, #64 + WORD $0x04215041 // addvl x1, x1, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -685,7 +694,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 1 to 2 outputs WORD $0x85804089 // ldr z9, [x4] WORD $0x8580448b // ldr z11, [x4, #1, MUL VL] - WORD $0x91010084 // add x4, x4, #64 + WORD $0x04245044 // addvl x4, x4, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -719,7 +728,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 2 to 2 outputs WORD $0x858040a9 // ldr z9, [x5] WORD $0x858044ab // ldr z11, [x5, #1, MUL VL] - WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04255045 // addvl x5, x5, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -753,7 +762,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 3 to 2 outputs WORD $0x85804109 // ldr z9, [x8] WORD $0x8580450b // ldr 
z11, [x8, #1, MUL VL] - WORD $0x91010108 // add x8, x8, #64 + WORD $0x04285048 // addvl x8, x8, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -787,7 +796,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 4 to 2 outputs WORD $0x85804129 // ldr z9, [x9] WORD $0x8580452b // ldr z11, [x9, #1, MUL VL] - WORD $0x91010129 // add x9, x9, #64 + WORD $0x04295049 // addvl x9, x9, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -821,7 +830,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 5 to 2 outputs WORD $0x85804149 // ldr z9, [x10] WORD $0x8580454b // ldr z11, [x10, #1, MUL VL] - WORD $0x9101014a // add x10, x10, #64 + WORD $0x042a504a // addvl x10, x10, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -855,7 +864,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 6 to 2 outputs WORD $0x85804169 // ldr z9, [x11] WORD $0x8580456b // ldr z11, [x11, #1, MUL VL] - WORD $0x9101016b // add x11, x11, #64 + WORD $0x042b504b // addvl x11, x11, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -889,7 +898,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 7 to 2 outputs WORD $0x85804189 // ldr z9, [x12] WORD $0x8580458b // ldr z11, [x12, #1, MUL VL] - WORD $0x9101018c // add x12, x12, #64 + WORD $0x042c504c // addvl x12, x12, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -923,7 +932,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 8 to 2 outputs WORD $0x858041a9 // ldr z9, [x13] WORD $0x858045ab // ldr z11, [x13, #1, MUL VL] - WORD $0x910101ad // add x13, x13, #64 + WORD $0x042d504d // addvl x13, x13, #2 WORD 
$0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -957,7 +966,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 9 to 2 outputs WORD $0x85804069 // ldr z9, [x3] WORD $0x8580446b // ldr z11, [x3, #1, MUL VL] - WORD $0x91010063 // add x3, x3, #64 + WORD $0x04235043 // addvl x3, x3, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -989,10 +998,10 @@ mulSve_10x2_64_store: // Store 2 outputs WORD $0xe58041e0 // str z0, [x15] WORD $0xe58045e1 // str z1, [x15, #1, MUL VL] - WORD $0x910101ef // add x15, x15, #64 + WORD $0x042f504f // addvl x15, x15, #2 WORD $0xe58041c2 // str z2, [x14] WORD $0xe58045c3 // str z3, [x14, #1, MUL VL] - WORD $0x910101ce // add x14, x14, #64 + WORD $0x042e504e // addvl x14, x14, #2 // Prepare for next loop WORD $0xf1000400 // subs x0, x0, #1 @@ -1010,6 +1019,9 @@ TEXT ·mulSve_10x2_64Xor(SB), $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x2_64Xor_end MOVD in_base+24(FP), R3 @@ -1060,7 +1072,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 0 to 2 outputs WORD $0x85804029 // ldr z9, [x1] WORD $0x8580442b // ldr z11, [x1, #1, MUL VL] - WORD $0x91010021 // add x1, x1, #64 + WORD $0x04215041 // addvl x1, x1, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1094,7 +1106,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 1 to 2 outputs WORD $0x85804089 // ldr z9, [x4] WORD $0x8580448b // ldr z11, [x4, #1, MUL VL] - WORD $0x91010084 // add x4, x4, #64 + WORD $0x04245044 // addvl x4, x4, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 
// and z9.d, z9.d, z4.d @@ -1128,7 +1140,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 2 to 2 outputs WORD $0x858040a9 // ldr z9, [x5] WORD $0x858044ab // ldr z11, [x5, #1, MUL VL] - WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04255045 // addvl x5, x5, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1162,7 +1174,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 3 to 2 outputs WORD $0x85804109 // ldr z9, [x8] WORD $0x8580450b // ldr z11, [x8, #1, MUL VL] - WORD $0x91010108 // add x8, x8, #64 + WORD $0x04285048 // addvl x8, x8, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1196,7 +1208,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 4 to 2 outputs WORD $0x85804129 // ldr z9, [x9] WORD $0x8580452b // ldr z11, [x9, #1, MUL VL] - WORD $0x91010129 // add x9, x9, #64 + WORD $0x04295049 // addvl x9, x9, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1230,7 +1242,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 5 to 2 outputs WORD $0x85804149 // ldr z9, [x10] WORD $0x8580454b // ldr z11, [x10, #1, MUL VL] - WORD $0x9101014a // add x10, x10, #64 + WORD $0x042a504a // addvl x10, x10, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1264,7 +1276,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 6 to 2 outputs WORD $0x85804169 // ldr z9, [x11] WORD $0x8580456b // ldr z11, [x11, #1, MUL VL] - WORD $0x9101016b // add x11, x11, #64 + WORD $0x042b504b // addvl x11, x11, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1298,7 +1310,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 
bytes from input 7 to 2 outputs WORD $0x85804189 // ldr z9, [x12] WORD $0x8580458b // ldr z11, [x12, #1, MUL VL] - WORD $0x9101018c // add x12, x12, #64 + WORD $0x042c504c // addvl x12, x12, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1332,7 +1344,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 8 to 2 outputs WORD $0x858041a9 // ldr z9, [x13] WORD $0x858045ab // ldr z11, [x13, #1, MUL VL] - WORD $0x910101ad // add x13, x13, #64 + WORD $0x042d504d // addvl x13, x13, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1366,7 +1378,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 9 to 2 outputs WORD $0x85804069 // ldr z9, [x3] WORD $0x8580446b // ldr z11, [x3, #1, MUL VL] - WORD $0x91010063 // add x3, x3, #64 + WORD $0x04235043 // addvl x3, x3, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1398,10 +1410,10 @@ mulSve_10x2_64Xor_store: // Store 2 outputs WORD $0xe58041e0 // str z0, [x15] WORD $0xe58045e1 // str z1, [x15, #1, MUL VL] - WORD $0x910101ef // add x15, x15, #64 + WORD $0x042f504f // addvl x15, x15, #2 WORD $0xe58041c2 // str z2, [x14] WORD $0xe58045c3 // str z3, [x14, #1, MUL VL] - WORD $0x910101ce // add x14, x14, #64 + WORD $0x042e504e // addvl x14, x14, #2 // Prepare for next loop WORD $0xf1000400 // subs x0, x0, #1 @@ -1419,6 +1431,9 @@ TEXT ·mulSve_10x3_64(SB), $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x3_64_end MOVD in_base+24(FP), R0 @@ -1461,6 +1476,9 @@ TEXT ·mulSve_10x3_64(SB), $8-88 // Reload length to save a register MOVD n+80(FP), R6 WORD $0xd346fcc6 // lsr x6, x6, #6 + 
WORD $0xd37ae4c6 // lsl x6, x6, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad008c6 // udiv x6, x6, x16 // Load number of input shards MOVD in_len+32(FP), R16 @@ -1469,7 +1487,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs WORD $0x8580406b // ldr z11, [x3] WORD $0x8580446d // ldr z13, [x3, #1, MUL VL] - WORD $0x91010063 // add x3, x3, #64 + WORD $0x04235043 // addvl x3, x3, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1507,7 +1525,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 1 to 3 outputs WORD $0x8580402b // ldr z11, [x1] WORD $0x8580442d // ldr z13, [x1, #1, MUL VL] - WORD $0x91010021 // add x1, x1, #64 + WORD $0x04215041 // addvl x1, x1, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1551,7 +1569,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 2 to 3 outputs WORD $0x8580408b // ldr z11, [x4] WORD $0x8580448d // ldr z13, [x4, #1, MUL VL] - WORD $0x91010084 // add x4, x4, #64 + WORD $0x04245044 // addvl x4, x4, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1595,7 +1613,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 3 to 3 outputs WORD $0x858040ab // ldr z11, [x5] WORD $0x858044ad // ldr z13, [x5, #1, MUL VL] - WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04255045 // addvl x5, x5, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1639,7 +1657,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 4 to 3 outputs WORD $0x8580410b // ldr z11, [x8] WORD $0x8580450d // ldr z13, [x8, #1, MUL VL] - WORD $0x91010108 // add x8, x8, #64 + WORD $0x04285048 // addvl x8, x8, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // 
lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1683,7 +1701,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 5 to 3 outputs WORD $0x8580412b // ldr z11, [x9] WORD $0x8580452d // ldr z13, [x9, #1, MUL VL] - WORD $0x91010129 // add x9, x9, #64 + WORD $0x04295049 // addvl x9, x9, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1727,7 +1745,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 6 to 3 outputs WORD $0x8580414b // ldr z11, [x10] WORD $0x8580454d // ldr z13, [x10, #1, MUL VL] - WORD $0x9101014a // add x10, x10, #64 + WORD $0x042a504a // addvl x10, x10, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1771,7 +1789,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 7 to 3 outputs WORD $0x8580416b // ldr z11, [x11] WORD $0x8580456d // ldr z13, [x11, #1, MUL VL] - WORD $0x9101016b // add x11, x11, #64 + WORD $0x042b504b // addvl x11, x11, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1815,7 +1833,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 8 to 3 outputs WORD $0x8580418b // ldr z11, [x12] WORD $0x8580458d // ldr z13, [x12, #1, MUL VL] - WORD $0x9101018c // add x12, x12, #64 + WORD $0x042c504c // addvl x12, x12, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1859,7 +1877,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 9 to 3 outputs WORD $0x8580400b // ldr z11, [x0] WORD $0x8580440d // ldr z13, [x0, #1, MUL VL] - WORD $0x91010000 // add x0, x0, #64 + WORD $0x04205040 // addvl x0, x0, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1901,13 
+1919,13 @@ mulSve_10x3_64_store: // Store 3 outputs WORD $0xe58041c0 // str z0, [x14] WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] - WORD $0x910101ce // add x14, x14, #64 + WORD $0x042e504e // addvl x14, x14, #2 WORD $0xe58041e2 // str z2, [x15] WORD $0xe58045e3 // str z3, [x15, #1, MUL VL] - WORD $0x910101ef // add x15, x15, #64 + WORD $0x042f504f // addvl x15, x15, #2 WORD $0xe58041a4 // str z4, [x13] WORD $0xe58045a5 // str z5, [x13, #1, MUL VL] - WORD $0x910101ad // add x13, x13, #64 + WORD $0x042d504d // addvl x13, x13, #2 // Prepare for next loop WORD $0xf10004c6 // subs x6, x6, #1 @@ -1925,6 +1943,9 @@ TEXT ·mulSve_10x3_64Xor(SB), $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x3_64Xor_end MOVD in_base+24(FP), R0 @@ -1967,6 +1988,9 @@ TEXT ·mulSve_10x3_64Xor(SB), $8-88 // Reload length to save a register MOVD n+80(FP), R6 WORD $0xd346fcc6 // lsr x6, x6, #6 + WORD $0xd37ae4c6 // lsl x6, x6, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad008c6 // udiv x6, x6, x16 // Load number of input shards MOVD in_len+32(FP), R16 @@ -1983,7 +2007,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 0 to 3 outputs WORD $0x8580406b // ldr z11, [x3] WORD $0x8580446d // ldr z13, [x3, #1, MUL VL] - WORD $0x91010063 // add x3, x3, #64 + WORD $0x04235043 // addvl x3, x3, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2027,7 +2051,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 1 to 3 outputs WORD $0x8580402b // ldr z11, [x1] WORD $0x8580442d // ldr z13, [x1, #1, MUL VL] - WORD $0x91010021 // add x1, x1, #64 + WORD $0x04215041 // addvl x1, x1, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, 
z6.d @@ -2071,7 +2095,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 2 to 3 outputs WORD $0x8580408b // ldr z11, [x4] WORD $0x8580448d // ldr z13, [x4, #1, MUL VL] - WORD $0x91010084 // add x4, x4, #64 + WORD $0x04245044 // addvl x4, x4, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2115,7 +2139,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 3 to 3 outputs WORD $0x858040ab // ldr z11, [x5] WORD $0x858044ad // ldr z13, [x5, #1, MUL VL] - WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04255045 // addvl x5, x5, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2159,7 +2183,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 4 to 3 outputs WORD $0x8580410b // ldr z11, [x8] WORD $0x8580450d // ldr z13, [x8, #1, MUL VL] - WORD $0x91010108 // add x8, x8, #64 + WORD $0x04285048 // addvl x8, x8, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2203,7 +2227,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 5 to 3 outputs WORD $0x8580412b // ldr z11, [x9] WORD $0x8580452d // ldr z13, [x9, #1, MUL VL] - WORD $0x91010129 // add x9, x9, #64 + WORD $0x04295049 // addvl x9, x9, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2247,7 +2271,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 6 to 3 outputs WORD $0x8580414b // ldr z11, [x10] WORD $0x8580454d // ldr z13, [x10, #1, MUL VL] - WORD $0x9101014a // add x10, x10, #64 + WORD $0x042a504a // addvl x10, x10, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2291,7 +2315,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 
bytes from input 7 to 3 outputs WORD $0x8580416b // ldr z11, [x11] WORD $0x8580456d // ldr z13, [x11, #1, MUL VL] - WORD $0x9101016b // add x11, x11, #64 + WORD $0x042b504b // addvl x11, x11, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2335,7 +2359,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 8 to 3 outputs WORD $0x8580418b // ldr z11, [x12] WORD $0x8580458d // ldr z13, [x12, #1, MUL VL] - WORD $0x9101018c // add x12, x12, #64 + WORD $0x042c504c // addvl x12, x12, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2379,7 +2403,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 9 to 3 outputs WORD $0x8580400b // ldr z11, [x0] WORD $0x8580440d // ldr z13, [x0, #1, MUL VL] - WORD $0x91010000 // add x0, x0, #64 + WORD $0x04205040 // addvl x0, x0, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2421,13 +2445,13 @@ mulSve_10x3_64Xor_store: // Store 3 outputs WORD $0xe58041c0 // str z0, [x14] WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] - WORD $0x910101ce // add x14, x14, #64 + WORD $0x042e504e // addvl x14, x14, #2 WORD $0xe58041e2 // str z2, [x15] WORD $0xe58045e3 // str z3, [x15, #1, MUL VL] - WORD $0x910101ef // add x15, x15, #64 + WORD $0x042f504f // addvl x15, x15, #2 WORD $0xe58041a4 // str z4, [x13] WORD $0xe58045a5 // str z5, [x13, #1, MUL VL] - WORD $0x910101ad // add x13, x13, #64 + WORD $0x042d504d // addvl x13, x13, #2 // Prepare for next loop WORD $0xf10004c6 // subs x6, x6, #1 @@ -2446,6 +2470,9 @@ TEXT ·mulSve_10x4(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ 
mulSve_10x4_end MOVD in_base+24(FP), R3 @@ -2480,11 +2507,13 @@ TEXT ·mulSve_10x4(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x4_loop: // Load and process 32 bytes from input 0 to 4 outputs WORD $0x85804027 // ldr z7, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2514,7 +2543,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 1 to 4 outputs WORD $0x85804087 // ldr z7, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2548,7 +2577,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 2 to 4 outputs WORD $0x858040a7 // ldr z7, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2582,7 +2611,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 3 to 4 outputs WORD $0x85804107 // ldr z7, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2616,7 +2645,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 4 to 4 outputs WORD $0x85804127 // ldr z7, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2650,7 +2679,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 5 to 4 outputs WORD $0x85804147 // ldr z7, [x10] - WORD $0x9100814a // add x10, 
x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2684,7 +2713,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 6 to 4 outputs WORD $0x85804167 // ldr z7, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2718,7 +2747,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 7 to 4 outputs WORD $0x85804187 // ldr z7, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2752,7 +2781,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 8 to 4 outputs WORD $0x858041a7 // ldr z7, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2786,7 +2815,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 9 to 4 outputs WORD $0x85804067 // ldr z7, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2827,7 +2856,7 @@ mulSve_10x4_store: WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x4_loop @@ -2844,6 +2873,9 @@ TEXT ·mulSve_10x4Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, 
x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x4Xor_end MOVD in_base+24(FP), R3 @@ -2878,11 +2910,13 @@ TEXT ·mulSve_10x4Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x4Xor_loop: // Load and process 32 bytes from input 0 to 4 outputs WORD $0x85804027 // ldr z7, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2924,7 +2958,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 1 to 4 outputs WORD $0x85804087 // ldr z7, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2958,7 +2992,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 2 to 4 outputs WORD $0x858040a7 // ldr z7, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2992,7 +3026,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 3 to 4 outputs WORD $0x85804107 // ldr z7, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3026,7 +3060,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 4 to 4 outputs WORD $0x85804127 // ldr z7, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3060,7 +3094,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 5 to 4 outputs 
WORD $0x85804147 // ldr z7, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3094,7 +3128,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 6 to 4 outputs WORD $0x85804167 // ldr z7, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3128,7 +3162,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 7 to 4 outputs WORD $0x85804187 // ldr z7, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3162,7 +3196,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 8 to 4 outputs WORD $0x858041a7 // ldr z7, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3196,7 +3230,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 9 to 4 outputs WORD $0x85804067 // ldr z7, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3237,7 +3271,7 @@ mulSve_10x4Xor_store: WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x4Xor_loop @@ -3254,6 +3288,9 @@ TEXT ·mulSve_10x5(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl 
x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x5_end MOVD in_base+24(FP), R3 @@ -3288,11 +3325,13 @@ TEXT ·mulSve_10x5(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x5_loop: // Load and process 32 bytes from input 0 to 5 outputs WORD $0x85804028 // ldr z8, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3327,7 +3366,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 1 to 5 outputs WORD $0x85804088 // ldr z8, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3367,7 +3406,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 2 to 5 outputs WORD $0x858040a8 // ldr z8, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3407,7 +3446,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 3 to 5 outputs WORD $0x85804108 // ldr z8, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3447,7 +3486,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 4 to 5 outputs WORD $0x85804128 // ldr z8, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3487,7 +3526,7 @@ mulSve_10x5_loop: 
// Load and process 32 bytes from input 5 to 5 outputs WORD $0x85804148 // ldr z8, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3527,7 +3566,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 6 to 5 outputs WORD $0x85804168 // ldr z8, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3567,7 +3606,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 7 to 5 outputs WORD $0x85804188 // ldr z8, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3607,7 +3646,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 8 to 5 outputs WORD $0x858041a8 // ldr z8, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3647,7 +3686,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 9 to 5 outputs WORD $0x85804068 // ldr z8, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3696,7 +3735,7 @@ mulSve_10x5_store: WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x5_loop @@ -3713,6 +3752,9 @@ TEXT ·mulSve_10x5Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // 
lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x5Xor_end MOVD in_base+24(FP), R3 @@ -3747,11 +3789,13 @@ TEXT ·mulSve_10x5Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x5Xor_loop: // Load and process 32 bytes from input 0 to 5 outputs WORD $0x85804028 // ldr z8, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3801,7 +3845,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 1 to 5 outputs WORD $0x85804088 // ldr z8, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3841,7 +3885,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 2 to 5 outputs WORD $0x858040a8 // ldr z8, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3881,7 +3925,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 3 to 5 outputs WORD $0x85804108 // ldr z8, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3921,7 +3965,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 4 to 5 outputs WORD $0x85804128 // ldr z8, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // 
and z9.d, z9.d, z5.d @@ -3961,7 +4005,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 5 to 5 outputs WORD $0x85804148 // ldr z8, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -4001,7 +4045,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 6 to 5 outputs WORD $0x85804168 // ldr z8, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -4041,7 +4085,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 7 to 5 outputs WORD $0x85804188 // ldr z8, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -4081,7 +4125,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 8 to 5 outputs WORD $0x858041a8 // ldr z8, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -4121,7 +4165,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 9 to 5 outputs WORD $0x85804068 // ldr z8, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -4170,7 +4214,7 @@ mulSve_10x5Xor_store: WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x5Xor_loop @@ -4187,6 +4231,9 @@ TEXT ·mulSve_10x6(SB), 
NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x6_end MOVD in_base+24(FP), R3 @@ -4221,11 +4268,13 @@ TEXT ·mulSve_10x6(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x6_loop: // Load and process 32 bytes from input 0 to 6 outputs WORD $0x85804029 // ldr z9, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4265,7 +4314,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 1 to 6 outputs WORD $0x85804089 // ldr z9, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4311,7 +4360,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 2 to 6 outputs WORD $0x858040a9 // ldr z9, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4357,7 +4406,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 3 to 6 outputs WORD $0x85804109 // ldr z9, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4403,7 +4452,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 4 to 6 outputs WORD $0x85804129 // ldr z9, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc952a // lsr 
z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4449,7 +4498,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 5 to 6 outputs WORD $0x85804149 // ldr z9, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4495,7 +4544,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 6 to 6 outputs WORD $0x85804169 // ldr z9, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4541,7 +4590,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 7 to 6 outputs WORD $0x85804189 // ldr z9, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4587,7 +4636,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 8 to 6 outputs WORD $0x858041a9 // ldr z9, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4633,7 +4682,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 9 to 6 outputs WORD $0x85804069 // ldr z9, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4690,7 +4739,7 @@ mulSve_10x6_store: WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, 
#1 BNE mulSve_10x6_loop @@ -4707,6 +4756,9 @@ TEXT ·mulSve_10x6Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x6Xor_end MOVD in_base+24(FP), R3 @@ -4741,11 +4793,13 @@ TEXT ·mulSve_10x6Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs WORD $0x85804029 // ldr z9, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4803,7 +4857,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 1 to 6 outputs WORD $0x85804089 // ldr z9, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4849,7 +4903,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 2 to 6 outputs WORD $0x858040a9 // ldr z9, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4895,7 +4949,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 3 to 6 outputs WORD $0x85804109 // ldr z9, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4941,7 +4995,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 4 to 6 outputs WORD $0x85804129 // ldr z9, [x9] - WORD 
$0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4987,7 +5041,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 5 to 6 outputs WORD $0x85804149 // ldr z9, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -5033,7 +5087,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 6 to 6 outputs WORD $0x85804169 // ldr z9, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -5079,7 +5133,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 7 to 6 outputs WORD $0x85804189 // ldr z9, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -5125,7 +5179,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 8 to 6 outputs WORD $0x858041a9 // ldr z9, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -5171,7 +5225,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 9 to 6 outputs WORD $0x85804069 // ldr z9, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -5228,7 +5282,7 @@ mulSve_10x6Xor_store: WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] // Prepare for next loop 
- WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x6Xor_loop @@ -5245,6 +5299,9 @@ TEXT ·mulSve_10x7(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x7_end MOVD in_base+24(FP), R3 @@ -5279,11 +5336,13 @@ TEXT ·mulSve_10x7(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x7_loop: // Load and process 32 bytes from input 0 to 7 outputs WORD $0x8580402a // ldr z10, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5328,7 +5387,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 1 to 7 outputs WORD $0x8580408a // ldr z10, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5380,7 +5439,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 2 to 7 outputs WORD $0x858040aa // ldr z10, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5432,7 +5491,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 3 to 7 outputs WORD $0x8580410a // ldr z10, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5484,7 +5543,7 @@ 
mulSve_10x7_loop: // Load and process 32 bytes from input 4 to 7 outputs WORD $0x8580412a // ldr z10, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5536,7 +5595,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 5 to 7 outputs WORD $0x8580414a // ldr z10, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5588,7 +5647,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 6 to 7 outputs WORD $0x8580416a // ldr z10, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5640,7 +5699,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 7 to 7 outputs WORD $0x8580418a // ldr z10, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5692,7 +5751,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 8 to 7 outputs WORD $0x858041aa // ldr z10, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5744,7 +5803,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 9 to 7 outputs WORD $0x8580406a // ldr z10, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ 
-5809,7 +5868,7 @@ mulSve_10x7_store: WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x7_loop @@ -5826,6 +5885,9 @@ TEXT ·mulSve_10x7Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x7Xor_end MOVD in_base+24(FP), R3 @@ -5860,11 +5922,13 @@ TEXT ·mulSve_10x7Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x7Xor_loop: // Load and process 32 bytes from input 0 to 7 outputs WORD $0x8580402a // ldr z10, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5930,7 +5994,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 1 to 7 outputs WORD $0x8580408a // ldr z10, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5982,7 +6046,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 2 to 7 outputs WORD $0x858040aa // ldr z10, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6034,7 +6098,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 3 to 7 outputs WORD $0x8580410a // ldr z10, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc954b // 
lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6086,7 +6150,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 4 to 7 outputs WORD $0x8580412a // ldr z10, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6138,7 +6202,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 5 to 7 outputs WORD $0x8580414a // ldr z10, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6190,7 +6254,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 6 to 7 outputs WORD $0x8580416a // ldr z10, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6242,7 +6306,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 7 to 7 outputs WORD $0x8580418a // ldr z10, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6294,7 +6358,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 8 to 7 outputs WORD $0x858041aa // ldr z10, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6346,7 +6410,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 9 to 7 outputs WORD $0x8580406a // ldr z10, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // 
addvl x3, x3, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6411,7 +6475,7 @@ mulSve_10x7Xor_store: WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x7Xor_loop @@ -6428,6 +6492,9 @@ TEXT ·mulSve_10x8(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x8_end MOVD in_base+24(FP), R3 @@ -6462,11 +6529,13 @@ TEXT ·mulSve_10x8(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x8_loop: // Load and process 32 bytes from input 0 to 8 outputs WORD $0x8580402b // ldr z11, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6516,7 +6585,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 1 to 8 outputs WORD $0x8580408b // ldr z11, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6574,7 +6643,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 2 to 8 outputs WORD $0x858040ab // ldr z11, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6632,7 +6701,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 3 to 8 outputs 
WORD $0x8580410b // ldr z11, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6690,7 +6759,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 4 to 8 outputs WORD $0x8580412b // ldr z11, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6748,7 +6817,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 5 to 8 outputs WORD $0x8580414b // ldr z11, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6806,7 +6875,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 6 to 8 outputs WORD $0x8580416b // ldr z11, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6864,7 +6933,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 7 to 8 outputs WORD $0x8580418b // ldr z11, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6922,7 +6991,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 8 to 8 outputs WORD $0x858041ab // ldr z11, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6980,7 +7049,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from 
input 9 to 8 outputs WORD $0x8580406b // ldr z11, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7053,7 +7122,7 @@ mulSve_10x8_store: WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x8_loop @@ -7070,6 +7139,9 @@ TEXT ·mulSve_10x8Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x8Xor_end MOVD in_base+24(FP), R3 @@ -7104,11 +7176,13 @@ TEXT ·mulSve_10x8Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x8Xor_loop: // Load and process 32 bytes from input 0 to 8 outputs WORD $0x8580402b // ldr z11, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7182,7 +7256,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 1 to 8 outputs WORD $0x8580408b // ldr z11, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7240,7 +7314,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 2 to 8 outputs WORD $0x858040ab // ldr z11, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD 
$0x0428318c // and z12.d, z12.d, z8.d @@ -7298,7 +7372,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 3 to 8 outputs WORD $0x8580410b // ldr z11, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7356,7 +7430,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 4 to 8 outputs WORD $0x8580412b // ldr z11, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7414,7 +7488,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 5 to 8 outputs WORD $0x8580414b // ldr z11, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7472,7 +7546,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 6 to 8 outputs WORD $0x8580416b // ldr z11, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7530,7 +7604,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 7 to 8 outputs WORD $0x8580418b // ldr z11, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7588,7 +7662,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 8 to 8 outputs WORD $0x858041ab // ldr z11, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD 
$0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7646,7 +7720,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 9 to 8 outputs WORD $0x8580406b // ldr z11, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7719,7 +7793,7 @@ mulSve_10x8Xor_store: WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x8Xor_loop @@ -7736,6 +7810,9 @@ TEXT ·mulSve_10x9(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x9_end MOVD in_base+24(FP), R3 @@ -7770,11 +7847,13 @@ TEXT ·mulSve_10x9(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x9_loop: // Load and process 32 bytes from input 0 to 9 outputs WORD $0x8580402c // ldr z12, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -7829,7 +7908,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 1 to 9 outputs WORD $0x8580408c // ldr z12, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -7893,7 +7972,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 2 to 9 outputs WORD $0x858040ac // ldr z12, [x5] - WORD $0x910080a5 // add 
x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -7957,7 +8036,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 3 to 9 outputs WORD $0x8580410c // ldr z12, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8021,7 +8100,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 4 to 9 outputs WORD $0x8580412c // ldr z12, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8085,7 +8164,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 5 to 9 outputs WORD $0x8580414c // ldr z12, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8149,7 +8228,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 6 to 9 outputs WORD $0x8580416c // ldr z12, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8213,7 +8292,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 7 to 9 outputs WORD $0x8580418c // ldr z12, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8277,7 +8356,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 8 to 9 outputs WORD $0x858041ac // ldr z12, [x13] - WORD 
$0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8341,7 +8420,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 9 to 9 outputs WORD $0x8580406c // ldr z12, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8422,7 +8501,7 @@ mulSve_10x9_store: WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x9_loop @@ -8439,6 +8518,9 @@ TEXT ·mulSve_10x9Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x9Xor_end MOVD in_base+24(FP), R3 @@ -8473,11 +8555,13 @@ TEXT ·mulSve_10x9Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x9Xor_loop: // Load and process 32 bytes from input 0 to 9 outputs WORD $0x8580402c // ldr z12, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8559,7 +8643,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 1 to 9 outputs WORD $0x8580408c // ldr z12, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8623,7 +8707,7 @@ 
mulSve_10x9Xor_loop: // Load and process 32 bytes from input 2 to 9 outputs WORD $0x858040ac // ldr z12, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8687,7 +8771,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 3 to 9 outputs WORD $0x8580410c // ldr z12, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8751,7 +8835,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 4 to 9 outputs WORD $0x8580412c // ldr z12, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8815,7 +8899,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 5 to 9 outputs WORD $0x8580414c // ldr z12, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8879,7 +8963,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 6 to 9 outputs WORD $0x8580416c // ldr z12, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8943,7 +9027,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 7 to 9 outputs WORD $0x8580418c // ldr z12, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, 
z13.d, z9.d @@ -9007,7 +9091,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 8 to 9 outputs WORD $0x858041ac // ldr z12, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -9071,7 +9155,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 9 to 9 outputs WORD $0x8580406c // ldr z12, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -9152,7 +9236,7 @@ mulSve_10x9Xor_store: WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x9Xor_loop @@ -9169,6 +9253,9 @@ TEXT ·mulSve_10x10(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x10_end MOVD in_base+24(FP), R3 @@ -9203,11 +9290,13 @@ TEXT ·mulSve_10x10(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x10_loop: // Load and process 32 bytes from input 0 to 10 outputs WORD $0x8580402d // ldr z13, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9267,7 +9356,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 1 to 10 outputs WORD $0x8580408d // ldr z13, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 
WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9337,7 +9426,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 2 to 10 outputs WORD $0x858040ad // ldr z13, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9407,7 +9496,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 3 to 10 outputs WORD $0x8580410d // ldr z13, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9477,7 +9566,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 4 to 10 outputs WORD $0x8580412d // ldr z13, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9547,7 +9636,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 5 to 10 outputs WORD $0x8580414d // ldr z13, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9617,7 +9706,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 6 to 10 outputs WORD $0x8580416d // ldr z13, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9687,7 +9776,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 7 to 10 outputs WORD $0x8580418d // ldr z13, [x12] - WORD $0x9100818c // add x12, x12, #32 + 
WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9757,7 +9846,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 8 to 10 outputs WORD $0x858041ad // ldr z13, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9827,7 +9916,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 9 to 10 outputs WORD $0x8580406d // ldr z13, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9916,7 +10005,7 @@ mulSve_10x10_store: WORD $0xe5ef40c9 // st1d { z9.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x10_loop @@ -9933,6 +10022,9 @@ TEXT ·mulSve_10x10Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x10Xor_end MOVD in_base+24(FP), R3 @@ -9967,11 +10059,13 @@ TEXT ·mulSve_10x10Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs WORD $0x8580402d // ldr z13, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10061,7 +10155,7 @@ 
mulSve_10x10Xor_loop: // Load and process 32 bytes from input 1 to 10 outputs WORD $0x8580408d // ldr z13, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10131,7 +10225,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 2 to 10 outputs WORD $0x858040ad // ldr z13, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10201,7 +10295,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 3 to 10 outputs WORD $0x8580410d // ldr z13, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10271,7 +10365,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 4 to 10 outputs WORD $0x8580412d // ldr z13, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10341,7 +10435,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 5 to 10 outputs WORD $0x8580414d // ldr z13, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10411,7 +10505,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 6 to 10 outputs WORD $0x8580416d // ldr z13, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD 
$0x042a31ce // and z14.d, z14.d, z10.d @@ -10481,7 +10575,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 7 to 10 outputs WORD $0x8580418d // ldr z13, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10551,7 +10645,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 8 to 10 outputs WORD $0x858041ad // ldr z13, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10621,7 +10715,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 9 to 10 outputs WORD $0x8580406d // ldr z13, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10710,7 +10804,7 @@ mulSve_10x10Xor_store: WORD $0xe5ef40c9 // st1d { z9.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x10Xor_loop diff --git a/galois_test.go b/galois_test.go index 580b216..4b151b2 100644 --- a/galois_test.go +++ b/galois_test.go @@ -235,7 +235,7 @@ func TestSliceGalAdd(t *testing.T) { } } -func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int) { +func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int, vectorLength int) { // reference versions galMulSliceRef := func(c byte, in, out []byte) { @@ -270,7 +270,7 @@ func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f f } } 
- m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), nil) + m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), vectorLength, nil) end := start + f(m, inputs, outputs, start, stop) if end != stop { @@ -297,7 +297,7 @@ func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f f } } -func testGenGaloisXor(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int) { +func testGenGaloisXor(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int, vectorLength int) { // reference version galMulSliceXorRef := func(c byte, in, out []byte) { @@ -327,7 +327,7 @@ func testGenGaloisXor(t *testing.T, matrixRows [][]byte, size, start, stop int, } } - m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), nil) + m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), vectorLength, nil) end := start + f(m, inputs, outputs, start, stop) if end != stop { @@ -363,7 +363,7 @@ func testGenGaloisEarlyAbort(t *testing.T, matrixRows [][]byte, size int, f func } } -func testGenGaloisUpto10x10(t *testing.T, f, fXor func(matrix []byte, in, out [][]byte, start, stop int) int) { +func testGenGaloisUpto10x10(t *testing.T, f, fXor func(matrix []byte, in, out [][]byte, start, stop int) int, vectorLength int) { for output := 1; output <= codeGenMaxOutputs; output++ { for input := 1; input <= codeGenMaxInputs; input++ { @@ -386,15 +386,15 @@ func testGenGaloisUpto10x10(t *testing.T, f, fXor func(matrix []byte, in, out [] const limit = 1024 for ; size < limit; size += stepsize { // test full range - testGenGalois(t, matrixRows, size, 0, size, f) - testGenGaloisXor(t, matrixRows, size, 0, size, fXor) + testGenGalois(t, matrixRows, size, 0, size, f, vectorLength) + testGenGaloisXor(t, matrixRows, size, 0, size, fXor, vectorLength) if size >= stepsize*2 && size < limit-stepsize*2 { start := stepsize stop := size - 
start // test partial range - testGenGalois(t, matrixRows, size, start, stop, f) - testGenGaloisXor(t, matrixRows, size, start, stop, fXor) + testGenGalois(t, matrixRows, size, start, stop, f, vectorLength) + testGenGaloisXor(t, matrixRows, size, start, stop, fXor, vectorLength) } } } diff --git a/options.go b/options.go index 377137e..cde2555 100644 --- a/options.go +++ b/options.go @@ -24,6 +24,7 @@ type options struct { useSSE2, useNEON, useSVE bool + vectorLength int useJerasureMatrix bool usePAR1Matrix bool @@ -55,6 +56,7 @@ var defaultOptions = options{ useAvxGNFI: cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI), useNEON: cpuid.CPU.Supports(cpuid.ASIMD), useSVE: cpuid.CPU.Supports(cpuid.SVE), + vectorLength: 32, // default vector length is 32 bytes (256 bits) for AVX2 code gen } // leopardMode controls the use of leopard GF in encoding and decoding. diff --git a/reedsolomon.go b/reedsolomon.go index 3b6f5b7..443543f 100644 --- a/reedsolomon.go +++ b/reedsolomon.go @@ -833,7 +833,7 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC start += (*galMulGFNI)(m, inputs, outputs, 0, byteCount) end = len(inputs[0]) } else if galMulGen, _, ok := r.hasCodeGen(byteCount, len(inputs), len(outputs)); ok { - m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) + m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.o.vectorLength, r.getTmpSlice()) start += (*galMulGen)(m, inputs, outputs, 0, byteCount) r.putTmpSlice(m) end = len(inputs[0]) @@ -864,7 +864,7 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC start = (*galMulGFNIXor)(m, inPer, outPer, 0, byteCount) } } else { - m = genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m) + m = genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), r.o.vectorLength, m) if inIdx == 0 { start = (*galMulGen)(m, inPer, outPer, 0, byteCount) } else { @@ -914,7 +914,7 @@ func (r *reedSolomon) 
codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte var tmp [codeGenMaxInputs * codeGenMaxOutputs]uint64 gfniMatrix = genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), tmp[:]) } else if useCodeGen { - genMatrix = genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) + genMatrix = genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.o.vectorLength, r.getTmpSlice()) defer r.putTmpSlice(genMatrix) } else if galMulGFNI, galMulGFNIXor, useGFNI := r.canGFNI(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); useGFNI && byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards { @@ -1025,7 +1025,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b outPer = outPer[:codeGenMaxOutputs] } // Generate local matrix - m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), r.o.vectorLength, tmp) tmp = tmp[len(m):] plan = append(plan, state{ input: inPer, @@ -1056,7 +1056,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b inPer = inPer[:codeGenMaxInputs] } // Generate local matrix - m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), r.o.vectorLength, tmp) tmp = tmp[len(m):] //fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound) plan = append(plan, state{