diff --git a/go.mod b/go.mod
index 019edd94c..b9baa3327 100644
--- a/go.mod
+++ b/go.mod
@@ -5,10 +5,10 @@ require (
 	github.com/pkg/errors v0.9.1
 	github.com/urfave/cli v1.22.15
 	github.com/xtaci/kcp-go/v5 v5.6.8
-	github.com/xtaci/qpp v1.1.5
+	github.com/xtaci/qpp v1.1.6
 	github.com/xtaci/smux v1.5.24
 	github.com/xtaci/tcpraw v1.2.25
-	golang.org/x/crypto v0.24.0
+	golang.org/x/crypto v0.25.0
 )
 
 require (
@@ -16,13 +16,13 @@ require (
 	github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
 	github.com/google/gopacket v1.1.19 // indirect
 	github.com/klauspost/cpuid/v2 v2.2.8 // indirect
-	github.com/klauspost/reedsolomon v1.12.1 // indirect
+	github.com/klauspost/reedsolomon v1.12.2 // indirect
 	github.com/russross/blackfriday/v2 v2.1.0 // indirect
 	github.com/templexxx/cpu v0.1.0 // indirect
 	github.com/templexxx/xorsimd v0.4.2 // indirect
 	github.com/tjfoc/gmsm v1.4.1 // indirect
-	golang.org/x/net v0.26.0 // indirect
-	golang.org/x/sys v0.21.0 // indirect
+	golang.org/x/net v0.27.0 // indirect
+	golang.org/x/sys v0.22.0 // indirect
 )
 
 go 1.22.3
diff --git a/go.sum b/go.sum
index 6d4ea8659..558e43ab2 100644
--- a/go.sum
+++ b/go.sum
@@ -37,6 +37,8 @@ github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS
 github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
 github.com/klauspost/reedsolomon v1.12.1 h1:NhWgum1efX1x58daOBGCFWcxtEhOhXKKl1HAPQUp03Q=
 github.com/klauspost/reedsolomon v1.12.1/go.mod h1:nEi5Kjb6QqtbofI6s+cbG/j1da11c96IBYBSnVGtuBs=
+github.com/klauspost/reedsolomon v1.12.2 h1:TC0hlL/tTRxiMNnqHCzKsY11E0fIIKGCoZ2vQoPKIEM=
+github.com/klauspost/reedsolomon v1.12.2/go.mod h1:nEi5Kjb6QqtbofI6s+cbG/j1da11c96IBYBSnVGtuBs=
 github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
@@ -83,6 +85,8 @@ github.com/xtaci/qpp v1.1.4 h1:J4uUJy+7KVFWTduuIQr/MCiD9Ik8x7AblOtYbwMhC+s=
 github.com/xtaci/qpp v1.1.4/go.mod h1:dJS3usaXNMbWxZSWCAdxz01UgJcz9wXDkd4BccDY/V0=
 github.com/xtaci/qpp v1.1.5 h1:EpFDE3lpCkOtGVIQTxww0tjQEOOCohuBuISL+c7/iWw=
 github.com/xtaci/qpp v1.1.5/go.mod h1:dJS3usaXNMbWxZSWCAdxz01UgJcz9wXDkd4BccDY/V0=
+github.com/xtaci/qpp v1.1.6 h1:c0eVP1bbHQ+mDpPhT1FtKhKDGhikJGDw+tC/aVh9/pw=
+github.com/xtaci/qpp v1.1.6/go.mod h1:dJS3usaXNMbWxZSWCAdxz01UgJcz9wXDkd4BccDY/V0=
 github.com/xtaci/smux v1.5.24 h1:77emW9dtnOxxOQ5ltR+8BbsX1kzcOxQ5gB+aaV9hXOY=
 github.com/xtaci/smux v1.5.24/go.mod h1:OMlQbT5vcgl2gb49mFkYo6SMf+zP3rcjcwQz7ZU7IGY=
 github.com/xtaci/tcpraw v1.2.25 h1:VDlqo0op17JeXBM6e2G9ocCNLOJcw9mZbobMbJjo0vk=
@@ -93,6 +97,8 @@ golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPh
 golang.org/x/crypto v0.0.0-20201012173705-84dcc777aaee/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI=
 golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM=
+golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30=
+golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
 golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
@@ -108,6 +114,8 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
 golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
 golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
 golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
+golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys=
+golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -119,6 +127,8 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w
 golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
 golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
+golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
diff --git a/vendor/github.com/klauspost/reedsolomon/README.md b/vendor/github.com/klauspost/reedsolomon/README.md
index bdcb9e787..d94512a3a 100644
--- a/vendor/github.com/klauspost/reedsolomon/README.md
+++ b/vendor/github.com/klauspost/reedsolomon/README.md
@@ -25,6 +25,10 @@ Using Go modules is recommended.
 
 # Changes
 
+## 2024
+
+* Auto-generation of SVE and NEON routines for ARM, based on the AVX2 code. This gives a 2x speedup for SVE (measured on AWS Graviton 3) and a 1.5x speedup over the existing NEON-accelerated code.
+
 ## 2022
 
 * [GFNI](https://github.com/klauspost/reedsolomon/pull/224) support for amd64, for up to 3x faster processing.
@@ -558,8 +562,9 @@ The removed code may not be infringing and even after `-tags=nopshufb` there may
 * [Reed-Solomon Erasure Coding in Haskell](https://github.com/NicolasT/reedsolomon). Haskell port of the package with similar performance.
 * [reed-solomon-erasure](https://github.com/darrenldl/reed-solomon-erasure). Compatible Rust implementation.
 * [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests.
-* [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations.
+* [Screaming Fast Galois Field Arithmetic](https://www.snia.org/sites/default/files/files2/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations.
 * [Leopard-RS](https://github.com/catid/leopard) C library used as basis for GF16 implementation.
+* [reed-solomon-simd](https://github.com/AndersTrier/reed-solomon-simd). Rust implementation of Leopard-RS.
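The SNIA paper linked above describes the technique behind every generated kernel in this diff: a GF(2^8) multiply is done as two 16-entry table lookups per byte, one per nibble, which maps directly onto `VPSHUFB` on amd64 and `TBL` on NEON/SVE. Here is a minimal scalar sketch of that split-table method, assuming the 0x11d field polynomial; `galMulSlow` and `mulSliceNibbleTables` are illustrative names, not the package's API:

```go
package main

import "fmt"

// galMulSlow multiplies a and b in GF(2^8), reducing by the 0x11d
// polynomial bit by bit. It is used here only to build the nibble tables.
func galMulSlow(a, b byte) byte {
	var p byte
	for b > 0 {
		if b&1 != 0 {
			p ^= a
		}
		carry := a & 0x80
		a <<= 1
		if carry != 0 {
			a ^= 0x1d // low 8 bits of the 0x11d polynomial
		}
		b >>= 1
	}
	return p
}

// mulSliceNibbleTables XORs c*in[i] into out[i] for every byte, using the
// same 16-entry low/high nibble tables the SIMD kernels keep in registers.
func mulSliceNibbleTables(c byte, in, out []byte) {
	var low, high [16]byte
	for n := 0; n < 16; n++ {
		low[n] = galMulSlow(c, byte(n))     // c * 0x0n
		high[n] = galMulSlow(c, byte(n)<<4) // c * 0xn0
	}
	for i, v := range in {
		// v == (v>>4)<<4 ^ (v&15), and multiplication by c is linear
		// over XOR, so two lookups and one XOR yield c*v.
		out[i] ^= low[v&0x0f] ^ high[v>>4]
	}
}

func main() {
	in := []byte{0x00, 0x01, 0x57, 0xff}
	out := make([]byte, len(in))
	mulSliceNibbleTables(0x1b, in, out)
	fmt.Printf("% x\n", out) // c*0 == 0 and c*1 == c, so expect 00 1b ...
}
```

The SVE bodies later in this diff do the same thing 64 bytes at a time: the `lsr ..., #4` / `and ..., z2.d` pairs split the nibbles, the two `tbl` instructions index the per-nibble tables loaded from the matrix, and the `eor`s accumulate the products into the output registers (the `Xor` variants first load the existing output, mirroring the upstream AVX2 code).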
# License diff --git a/vendor/github.com/klauspost/reedsolomon/galois.go b/vendor/github.com/klauspost/reedsolomon/galois.go index 697f9ca67..9b3639502 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois.go +++ b/vendor/github.com/klauspost/reedsolomon/galois.go @@ -910,8 +910,8 @@ func galExp(a byte, n int) byte { return expTable[uint8(logResult)] } -func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte { - if !avx2CodeGen { +func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte { + if !codeGen { panic("codegen not enabled") } total := inputs * outputs @@ -942,7 +942,7 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) var gf2p811dMulMatrices = [256]uint64{0, 0x102040810204080, 0x8001828488102040, 0x8103868c983060c0, 0x408041c2c4881020, 0x418245cad4a850a0, 0xc081c3464c983060, 0xc183c74e5cb870e0, 0x2040a061e2c48810, 0x2142a469f2e4c890, 0xa04122e56ad4a850, 0xa14326ed7af4e8d0, 0x60c0e1a3264c9830, 0x61c2e5ab366cd8b0, 0xe0c16327ae5cb870, 0xe1c3672fbe7cf8f0, 0x102050b071e2c488, 0x112254b861c28408, 0x9021d234f9f2e4c8, 0x9123d63ce9d2a448, 0x50a01172b56ad4a8, 0x51a2157aa54a9428, 0xd0a193f63d7af4e8, 0xd1a397fe2d5ab468, 0x3060f0d193264c98, 0x3162f4d983060c18, 0xb06172551b366cd8, 0xb163765d0b162c58, 0x70e0b11357ae5cb8, 0x71e2b51b478e1c38, 0xf0e13397dfbe7cf8, 0xf1e3379fcf9e3c78, 0x8810a8d83871e2c4, 0x8912acd02851a244, 0x8112a5cb061c284, 0x9132e54a0418204, 0xc890e91afcf9f2e4, 0xc992ed12ecd9b264, 0x48916b9e74e9d2a4, 0x49936f9664c99224, 0xa85008b9dab56ad4, 0xa9520cb1ca952a54, 0x28518a3d52a54a94, 0x29538e3542850a14, 0xe8d0497b1e3d7af4, 0xe9d24d730e1d3a74, 0x68d1cbff962d5ab4, 0x69d3cff7860d1a34, 0x9830f8684993264c, 0x9932fc6059b366cc, 0x18317aecc183060c, 0x19337ee4d1a3468c, 0xd8b0b9aa8d1b366c, 0xd9b2bda29d3b76ec, 0x58b13b2e050b162c, 0x59b33f26152b56ac, 0xb8705809ab57ae5c, 0xb9725c01bb77eedc, 0x3871da8d23478e1c, 0x3973de853367ce9c, 0xf8f019cb6fdfbe7c, 0xf9f21dc37ffffefc, 0x78f19b4fe7cf9e3c, 0x79f39f47f7efdebc, 0xc488d46c1c3871e2, 0xc58ad0640c183162, 0x448956e8942851a2, 0x458b52e084081122, 0x840895aed8b061c2, 0x850a91a6c8902142, 0x409172a50a04182, 0x50b132240800102, 0xe4c8740dfefcf9f2, 0xe5ca7005eedcb972, 0x64c9f68976ecd9b2, 0x65cbf28166cc9932, 0xa44835cf3a74e9d2, 0xa54a31c72a54a952, 0x2449b74bb264c992, 0x254bb343a2448912, 0xd4a884dc6ddab56a, 0xd5aa80d47dfaf5ea, 0x54a90658e5ca952a, 0x55ab0250f5ead5aa, 0x9428c51ea952a54a, 0x952ac116b972e5ca, 0x1429479a2142850a, 0x152b43923162c58a, 0xf4e824bd8f1e3d7a, 0xf5ea20b59f3e7dfa, 0x74e9a639070e1d3a, 0x75eba231172e5dba, 0xb468657f4b962d5a, 0xb56a61775bb66dda, 0x3469e7fbc3860d1a, 0x356be3f3d3a64d9a, 0x4c987cb424499326, 0x4d9a78bc3469d3a6, 0xcc99fe30ac59b366, 0xcd9bfa38bc79f3e6, 0xc183d76e0c18306, 0xd1a397ef0e1c386, 0x8c19bff268d1a346, 0x8d1bbbfa78f1e3c6, 0x6cd8dcd5c68d1b36, 0x6ddad8ddd6ad5bb6, 0xecd95e514e9d3b76, 0xeddb5a595ebd7bf6, 0x2c589d1702050b16, 0x2d5a991f12254b96, 0xac591f938a152b56, 0xad5b1b9b9a356bd6, 0x5cb82c0455ab57ae, 0x5dba280c458b172e, 0xdcb9ae80ddbb77ee, 0xddbbaa88cd9b376e, 0x1c386dc69123478e, 0x1d3a69ce8103070e, 0x9c39ef42193367ce, 0x9d3beb4a0913274e, 0x7cf88c65b76fdfbe, 0x7dfa886da74f9f3e, 0xfcf90ee13f7ffffe, 0xfdfb0ae92f5fbf7e, 0x3c78cda773e7cf9e, 0x3d7ac9af63c78f1e, 0xbc794f23fbf7efde, 0xbd7b4b2bebd7af5e, 0xe2c46a368e1c3871, 0xe3c66e3e9e3c78f1, 0x62c5e8b2060c1831, 0x63c7ecba162c58b1, 0xa2442bf44a942851, 0xa3462ffc5ab468d1, 0x2245a970c2840811, 0x2347ad78d2a44891, 0xc284ca576cd8b061, 0xc386ce5f7cf8f0e1, 0x428548d3e4c89021, 0x43874cdbf4e8d0a1, 
0x82048b95a850a041, 0x83068f9db870e0c1, 0x205091120408001, 0x3070d193060c081, 0xf2e43a86fffefcf9, 0xf3e63e8eefdebc79, 0x72e5b80277eedcb9, 0x73e7bc0a67ce9c39, 0xb2647b443b76ecd9, 0xb3667f4c2b56ac59, 0x3265f9c0b366cc99, 0x3367fdc8a3468c19, 0xd2a49ae71d3a74e9, 0xd3a69eef0d1a3469, 0x52a51863952a54a9, 0x53a71c6b850a1429, 0x9224db25d9b264c9, 0x9326df2dc9922449, 0x122559a151a24489, 0x13275da941820409, 0x6ad4c2eeb66ddab5, 0x6bd6c6e6a64d9a35, 0xead5406a3e7dfaf5, 0xebd744622e5dba75, 0x2a54832c72e5ca95, 0x2b56872462c58a15, 0xaa5501a8faf5ead5, 0xab5705a0ead5aa55, 0x4a94628f54a952a5, 0x4b96668744891225, 0xca95e00bdcb972e5, 0xcb97e403cc993265, 0xa14234d90214285, 0xb16274580010205, 0x8a15a1c9183162c5, 0x8b17a5c108112245, 0x7af4925ec78f1e3d, 0x7bf69656d7af5ebd, 0xfaf510da4f9f3e7d, 0xfbf714d25fbf7efd, 0x3a74d39c03070e1d, 0x3b76d79413274e9d, 0xba7551188b172e5d, 0xbb7755109b376edd, 0x5ab4323f254b962d, 0x5bb63637356bd6ad, 0xdab5b0bbad5bb66d, 0xdbb7b4b3bd7bf6ed, 0x1a3473fde1c3860d, 0x1b3677f5f1e3c68d, 0x9a35f17969d3a64d, 0x9b37f57179f3e6cd, 0x264cbe5a92244993, 0x274eba5282040913, 0xa64d3cde1a3469d3, 0xa74f38d60a142953, 0x66ccff9856ac59b3, 0x67cefb90468c1933, 0xe6cd7d1cdebc79f3, 0xe7cf7914ce9c3973, 0x60c1e3b70e0c183, 0x70e1a3360c08103, 0x860d9cbff8f0e1c3, 0x870f98b7e8d0a143, 0x468c5ff9b468d1a3, 0x478e5bf1a4489123, 0xc68ddd7d3c78f1e3, 0xc78fd9752c58b163, 0x366ceeeae3c68d1b, 0x376eeae2f3e6cd9b, 0xb66d6c6e6bd6ad5b, 0xb76f68667bf6eddb, 0x76ecaf28274e9d3b, 0x77eeab20376eddbb, 0xf6ed2dacaf5ebd7b, 0xf7ef29a4bf7efdfb, 0x162c4e8b0102050b, 0x172e4a831122458b, 0x962dcc0f8912254b, 0x972fc807993265cb, 0x56ac0f49c58a152b, 0x57ae0b41d5aa55ab, 0xd6ad8dcd4d9a356b, 0xd7af89c55dba75eb, 0xae5c1682aa55ab57, 0xaf5e128aba75ebd7, 0x2e5d940622458b17, 0x2f5f900e3265cb97, 0xeedc57406eddbb77, 0xefde53487efdfbf7, 0x6eddd5c4e6cd9b37, 0x6fdfd1ccf6eddbb7, 0x8e1cb6e348912347, 0x8f1eb2eb58b163c7, 0xe1d3467c0810307, 0xf1f306fd0a14387, 0xce9cf7218c193367, 0xcf9ef3299c3973e7, 0x4e9d75a504091327, 0x4f9f71ad142953a7, 0xbe7c4632dbb76fdf, 0xbf7e423acb972f5f, 0x3e7dc4b653a74f9f, 0x3f7fc0be43870f1f, 0xfefc07f01f3f7fff, 0xfffe03f80f1f3f7f, 0x7efd8574972f5fbf, 0x7fff817c870f1f3f, 0x9e3ce6533973e7cf, 0x9f3ee25b2953a74f, 0x1e3d64d7b163c78f, 0x1f3f60dfa143870f, 0xdebca791fdfbf7ef, 0xdfbea399eddbb76f, 0x5ebd251575ebd7af, 0x5fbf211d65cb972f} func genGFNIMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []uint64) []uint64 { - if !avx2CodeGen { + if !codeGen { panic("codegen not enabled") } total := inputs * outputs diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s index ad253a65a..8ff74bf43 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s @@ -126,7 +126,6 @@ TEXT ·mulAvxTwo_1x1_64(SB), $0-88 MOVQ in_base+24(FP), CX MOVQ (CX), CX MOVQ out_base+48(FP), DX - MOVQ out_base+48(FP), DX MOVQ (DX), DX MOVQ start+72(FP), BX @@ -366,7 +365,6 @@ TEXT ·mulAvxTwo_1x1_64Xor(SB), $0-88 MOVQ in_base+24(FP), CX MOVQ (CX), CX MOVQ out_base+48(FP), DX - MOVQ out_base+48(FP), DX MOVQ (DX), DX MOVQ start+72(FP), BX @@ -428,7 +426,6 @@ TEXT ·mulAvxTwo_1x2_64(SB), $0-88 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), BX MOVQ start+72(FP), DI @@ -709,7 +706,6 @@ TEXT ·mulAvxTwo_1x2_64Xor(SB), $0-88 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), BX MOVQ start+72(FP), DI 
@@ -788,7 +784,6 @@ TEXT ·mulAvxTwo_1x3_64(SB), $0-88 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), BX @@ -1110,7 +1105,6 @@ TEXT ·mulAvxTwo_1x3_64Xor(SB), $0-88 MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), BX @@ -5164,7 +5158,6 @@ TEXT ·mulAvxTwo_2x1_64(SB), $0-88 MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX MOVQ (BX), BX MOVQ start+72(FP), SI @@ -5461,7 +5454,6 @@ TEXT ·mulAvxTwo_2x1_64Xor(SB), $0-88 MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX MOVQ (BX), BX MOVQ start+72(FP), SI @@ -5542,7 +5534,6 @@ TEXT ·mulAvxTwo_2x2_64(SB), $0-88 MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), SI MOVQ start+72(FP), R8 @@ -5900,7 +5891,6 @@ TEXT ·mulAvxTwo_2x2_64Xor(SB), $0-88 MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), SI MOVQ start+72(FP), R8 @@ -6008,7 +5998,6 @@ TEXT ·mulAvxTwo_2x3_64(SB), $0-88 MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), SI @@ -6427,7 +6416,6 @@ TEXT ·mulAvxTwo_2x3_64Xor(SB), $0-88 MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), SI @@ -11886,7 +11874,6 @@ TEXT ·mulAvxTwo_3x1_64(SB), $0-88 MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI MOVQ (DI), DI MOVQ start+72(FP), R8 @@ -12240,7 +12227,6 @@ TEXT ·mulAvxTwo_3x1_64Xor(SB), $0-88 MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI MOVQ (DI), DI MOVQ start+72(FP), R8 @@ -12346,7 +12332,6 @@ TEXT ·mulAvxTwo_3x2_64(SB), $0-88 MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), DI MOVQ start+72(FP), R9 @@ -12781,7 +12766,6 @@ TEXT ·mulAvxTwo_3x2_64Xor(SB), $0-88 MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), DI MOVQ start+72(FP), R9 @@ -12918,7 +12902,6 @@ TEXT ·mulAvxTwo_3x3_64(SB), $0-88 MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), DI @@ -13434,7 +13417,6 @@ TEXT ·mulAvxTwo_3x3_64Xor(SB), $0-88 MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), DI @@ -20286,7 +20268,6 @@ TEXT ·mulAvxTwo_4x1_64(SB), $0-88 MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 MOVQ (R8), R8 MOVQ start+72(FP), R9 @@ -20697,7 +20678,6 @@ TEXT ·mulAvxTwo_4x1_64Xor(SB), $0-88 MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 MOVQ (R8), R8 MOVQ start+72(FP), R9 @@ -20824,7 +20804,6 @@ TEXT ·mulAvxTwo_4x2_64(SB), $0-88 MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R8 MOVQ start+72(FP), R10 @@ -21336,7 +21315,6 @@ TEXT ·mulAvxTwo_4x2_64Xor(SB), $0-88 MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R8 MOVQ start+72(FP), R10 @@ -21502,7 +21480,6 @@ TEXT ·mulAvxTwo_4x3_64(SB), $0-88 MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 
48(R8), R8 @@ -22115,7 +22092,6 @@ TEXT ·mulAvxTwo_4x3_64Xor(SB), $0-88 MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R8 @@ -30216,7 +30192,6 @@ TEXT ·mulAvxTwo_5x1_64(SB), $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 MOVQ (R9), R9 MOVQ start+72(FP), R10 @@ -30684,7 +30659,6 @@ TEXT ·mulAvxTwo_5x1_64Xor(SB), $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 MOVQ (R9), R9 MOVQ start+72(FP), R10 @@ -30832,7 +30806,6 @@ TEXT ·mulAvxTwo_5x2_64(SB), $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R9 MOVQ start+72(FP), R11 @@ -31421,7 +31394,6 @@ TEXT ·mulAvxTwo_5x2_64Xor(SB), $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R9 MOVQ start+72(FP), R11 @@ -31616,7 +31588,6 @@ TEXT ·mulAvxTwo_5x3_64(SB), $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R9 @@ -32326,7 +32297,6 @@ TEXT ·mulAvxTwo_5x3_64Xor(SB), $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R9 @@ -41669,7 +41639,6 @@ TEXT ·mulAvxTwo_6x1_64(SB), $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 MOVQ (R10), R10 MOVQ start+72(FP), R11 @@ -42194,7 +42163,6 @@ TEXT ·mulAvxTwo_6x1_64Xor(SB), $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 MOVQ (R10), R10 MOVQ start+72(FP), R11 @@ -42363,7 +42331,6 @@ TEXT ·mulAvxTwo_6x2_64(SB), $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R10 MOVQ start+72(FP), R12 @@ -43029,7 +42996,6 @@ TEXT ·mulAvxTwo_6x2_64Xor(SB), $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R10 MOVQ start+72(FP), R12 @@ -43253,7 +43219,6 @@ TEXT ·mulAvxTwo_6x3_64(SB), $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R10 @@ -44060,7 +44025,6 @@ TEXT ·mulAvxTwo_6x3_64Xor(SB), $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R10 @@ -54644,7 +54608,6 @@ TEXT ·mulAvxTwo_7x1_64(SB), $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 MOVQ (R11), R11 MOVQ start+72(FP), R12 @@ -55226,7 +55189,6 @@ TEXT ·mulAvxTwo_7x1_64Xor(SB), $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 MOVQ (R11), R11 MOVQ start+72(FP), R12 @@ -55416,7 +55378,6 @@ TEXT ·mulAvxTwo_7x2_64(SB), $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R11 MOVQ start+72(FP), R13 @@ -56159,7 +56120,6 @@ TEXT ·mulAvxTwo_7x2_64Xor(SB), $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R11 MOVQ start+72(FP), R13 @@ -56412,7 +56372,6 @@ TEXT ·mulAvxTwo_7x3_64(SB), $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R11 @@ -57316,7 +57275,6 @@ TEXT 
·mulAvxTwo_7x3_64Xor(SB), $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R11 @@ -69146,7 +69104,6 @@ TEXT ·mulAvxTwo_8x1_64(SB), $0-88 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 MOVQ (R12), R12 MOVQ start+72(FP), R13 @@ -69785,7 +69742,6 @@ TEXT ·mulAvxTwo_8x1_64Xor(SB), $0-88 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 MOVQ (R12), R12 MOVQ start+72(FP), R13 @@ -69996,7 +69952,6 @@ TEXT ·mulAvxTwo_8x2_64(SB), $0-88 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R12 MOVQ start+72(FP), R14 @@ -70816,7 +70771,6 @@ TEXT ·mulAvxTwo_8x2_64Xor(SB), $0-88 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R12 MOVQ start+72(FP), R14 @@ -71098,7 +71052,6 @@ TEXT ·mulAvxTwo_8x3_64(SB), $0-88 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R12 @@ -72099,7 +72052,6 @@ TEXT ·mulAvxTwo_8x3_64Xor(SB), $0-88 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R12 @@ -85180,7 +85132,6 @@ TEXT ·mulAvxTwo_9x1_64(SB), $0-88 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R13 MOVQ start+72(FP), R14 @@ -85876,7 +85827,6 @@ TEXT ·mulAvxTwo_9x1_64Xor(SB), $0-88 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R13 MOVQ start+72(FP), R14 @@ -86108,7 +86058,6 @@ TEXT ·mulAvxTwo_9x2_64(SB), $0-88 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R13 MOVQ start+72(FP), R15 @@ -87005,7 +86954,6 @@ TEXT ·mulAvxTwo_9x2_64Xor(SB), $0-88 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R13 MOVQ start+72(FP), R15 @@ -87316,7 +87264,6 @@ TEXT ·mulAvxTwo_9x3_64(SB), $8-88 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 @@ -88414,7 +88361,6 @@ TEXT ·mulAvxTwo_9x3_64Xor(SB), $8-88 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 @@ -102755,7 +102701,6 @@ TEXT ·mulAvxTwo_10x1_64(SB), $0-88 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 - MOVQ out_base+48(FP), R14 MOVQ (R14), R14 MOVQ start+72(FP), R15 @@ -103508,7 +103453,6 @@ TEXT ·mulAvxTwo_10x1_64Xor(SB), $0-88 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 - MOVQ out_base+48(FP), R14 MOVQ (R14), R14 MOVQ start+72(FP), R15 @@ -103761,7 +103705,6 @@ TEXT ·mulAvxTwo_10x2_64(SB), $8-88 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 - MOVQ out_base+48(FP), R14 MOVQ (R14), R15 MOVQ 24(R14), R14 MOVQ start+72(FP), BP @@ -104735,7 +104678,6 @@ TEXT ·mulAvxTwo_10x2_64Xor(SB), $8-88 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 - MOVQ out_base+48(FP), R14 MOVQ (R14), R15 MOVQ 24(R14), R14 MOVQ start+72(FP), BP @@ -105075,7 +105017,6 @@ TEXT ·mulAvxTwo_10x3_64(SB), $8-88 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 @@ 
-106284,7 +106225,6 @@ TEXT ·mulAvxTwo_10x3_64Xor(SB), $8-88 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.go new file mode 100644 index 000000000..2f8719038 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.go @@ -0,0 +1,125 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. + +//go:build !noasm && !appengine && !gccgo && !nopshufb + +package reedsolomon + +//go:noescape +func mulSve_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x5Xor(matrix []byte, in [][]byte, out [][]byte, 
start int, n int) + +//go:noescape +func mulNeon_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s b/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s new file mode 100644 index 000000000..335b94c36 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s @@ -0,0 +1,26958 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. + +//go:build !appengine && !noasm && !nogen && !nopshufb && gc + +#include "textflag.h" + +// func mulSve_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x1_64_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R14 + MOVD start+72(FP), R15 + + // Add start offset to output + WORD $0x8b0f01ce // add x14, x14, x15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd28001ef // mov x15, #15 + WORD $0x05e039e2 // mov z2.d, x15 + WORD $0x05212042 // dup z2.b, z2.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + WORD $0x85804026 // ldr z6, [x1] + WORD $0x85804425 // ldr z5, [x1, #1, MUL VL] + WORD $0x91010021 // add x1, x1, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85804043 // ldr z3, [x2] + WORD $0x85804444 // ldr z4, [x2, #1, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33080 // eor z0.d, z4.d, z3.d + WORD $0x04a530c1 // eor 
z1.d, z6.d, z5.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 1 to 1 outputs + WORD $0x85804086 // ldr z6, [x4] + WORD $0x85804485 // ldr z5, [x4, #1, MUL VL] + WORD $0x91010084 // add x4, x4, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85804843 // ldr z3, [x2, #2, MUL VL] + WORD $0x85804c44 // ldr z4, [x2, #3, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 2 to 1 outputs + WORD $0x858040a6 // ldr z6, [x5] + WORD $0x858044a5 // ldr z5, [x5, #1, MUL VL] + WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85805043 // ldr z3, [x2, #4, MUL VL] + WORD $0x85805444 // ldr z4, [x2, #5, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 3 to 1 outputs + WORD $0x85804106 // ldr z6, [x8] + WORD $0x85804505 // ldr z5, [x8, #1, MUL VL] + WORD $0x91010108 // add x8, x8, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85805843 // ldr z3, [x2, #6, MUL VL] + WORD $0x85805c44 // ldr z4, [x2, #7, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 4 to 1 outputs + WORD $0x85804126 // ldr z6, [x9] + WORD $0x85804525 // ldr z5, [x9, #1, MUL VL] + WORD $0x91010129 // add x9, x9, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85814043 // ldr z3, [x2, #8, MUL VL] + WORD $0x85814444 // ldr z4, [x2, #9, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD 
$0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 5 to 1 outputs + WORD $0x85804146 // ldr z6, [x10] + WORD $0x85804545 // ldr z5, [x10, #1, MUL VL] + WORD $0x9101014a // add x10, x10, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85814843 // ldr z3, [x2, #10, MUL VL] + WORD $0x85814c44 // ldr z4, [x2, #11, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 6 to 1 outputs + WORD $0x85804166 // ldr z6, [x11] + WORD $0x85804565 // ldr z5, [x11, #1, MUL VL] + WORD $0x9101016b // add x11, x11, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85815043 // ldr z3, [x2, #12, MUL VL] + WORD $0x85815444 // ldr z4, [x2, #13, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 7 to 1 outputs + WORD $0x85804186 // ldr z6, [x12] + WORD $0x85804585 // ldr z5, [x12, #1, MUL VL] + WORD $0x9101018c // add x12, x12, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85815843 // ldr z3, [x2, #14, MUL VL] + WORD $0x85815c44 // ldr z4, [x2, #15, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 8 to 1 outputs + WORD $0x858041a6 // ldr z6, [x13] + WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL] + WORD $0x910101ad // add x13, x13, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and 
z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85824043 // ldr z3, [x2, #16, MUL VL] + WORD $0x85824444 // ldr z4, [x2, #17, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 9 to 1 outputs + WORD $0x85804066 // ldr z6, [x3] + WORD $0x85804465 // ldr z5, [x3, #1, MUL VL] + WORD $0x91010063 // add x3, x3, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85824843 // ldr z3, [x2, #18, MUL VL] + WORD $0x85824c44 // ldr z4, [x2, #19, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + +mulSve_10x1_64_store: + // Store 1 outputs + WORD $0xe58041c0 // str z0, [x14] + WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] + WORD $0x910101ce // add x14, x14, #64 + + // Prepare for next loop + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x1_64_loop + +mulSve_10x1_64_end: + RET + +// func mulSve_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x1_64Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R14 + MOVD start+72(FP), R15 + + // Add start offset to output + WORD $0x8b0f01ce // add x14, x14, x15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd28001ef // mov x15, #15 + WORD $0x05e039e2 // mov z2.d, x15 + WORD $0x05212042 // dup z2.b, z2.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x1_64Xor_loop: + // Load 1 outputs + WORD $0x858041c0 // ldr z0, [x14] + WORD $0x858045c1 // ldr z1, [x14, #1, MUL VL] + + // Load and process 64 bytes from input 0 to 1 outputs + WORD $0x85804026 // ldr z6, [x1] + WORD $0x85804425 // ldr z5, [x1, #1, MUL VL] + WORD $0x91010021 // add x1, x1, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // 
and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85804043 // ldr z3, [x2] + WORD $0x85804444 // ldr z4, [x2, #1, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 1 to 1 outputs + WORD $0x85804086 // ldr z6, [x4] + WORD $0x85804485 // ldr z5, [x4, #1, MUL VL] + WORD $0x91010084 // add x4, x4, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85804843 // ldr z3, [x2, #2, MUL VL] + WORD $0x85804c44 // ldr z4, [x2, #3, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 2 to 1 outputs + WORD $0x858040a6 // ldr z6, [x5] + WORD $0x858044a5 // ldr z5, [x5, #1, MUL VL] + WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85805043 // ldr z3, [x2, #4, MUL VL] + WORD $0x85805444 // ldr z4, [x2, #5, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 3 to 1 outputs + WORD $0x85804106 // ldr z6, [x8] + WORD $0x85804505 // ldr z5, [x8, #1, MUL VL] + WORD $0x91010108 // add x8, x8, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85805843 // ldr z3, [x2, #6, MUL VL] + WORD $0x85805c44 // ldr z4, [x2, #7, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 4 to 1 outputs + WORD $0x85804126 // ldr z6, [x9] + WORD 
$0x85804525 // ldr z5, [x9, #1, MUL VL] + WORD $0x91010129 // add x9, x9, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85814043 // ldr z3, [x2, #8, MUL VL] + WORD $0x85814444 // ldr z4, [x2, #9, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 5 to 1 outputs + WORD $0x85804146 // ldr z6, [x10] + WORD $0x85804545 // ldr z5, [x10, #1, MUL VL] + WORD $0x9101014a // add x10, x10, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85814843 // ldr z3, [x2, #10, MUL VL] + WORD $0x85814c44 // ldr z4, [x2, #11, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 6 to 1 outputs + WORD $0x85804166 // ldr z6, [x11] + WORD $0x85804565 // ldr z5, [x11, #1, MUL VL] + WORD $0x9101016b // add x11, x11, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85815043 // ldr z3, [x2, #12, MUL VL] + WORD $0x85815444 // ldr z4, [x2, #13, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 7 to 1 outputs + WORD $0x85804186 // ldr z6, [x12] + WORD $0x85804585 // ldr z5, [x12, #1, MUL VL] + WORD $0x9101018c // add x12, x12, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85815843 // ldr z3, [x2, #14, MUL VL] + WORD $0x85815c44 // ldr z4, [x2, #15, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD 
$0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 8 to 1 outputs + WORD $0x858041a6 // ldr z6, [x13] + WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL] + WORD $0x910101ad // add x13, x13, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85824043 // ldr z3, [x2, #16, MUL VL] + WORD $0x85824444 // ldr z4, [x2, #17, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 9 to 1 outputs + WORD $0x85804066 // ldr z6, [x3] + WORD $0x85804465 // ldr z5, [x3, #1, MUL VL] + WORD $0x91010063 // add x3, x3, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85824843 // ldr z3, [x2, #18, MUL VL] + WORD $0x85824c44 // ldr z4, [x2, #19, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + +mulSve_10x1_64Xor_store: + // Store 1 outputs + WORD $0xe58041c0 // str z0, [x14] + WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] + WORD $0x910101ce // add x14, x14, #64 + + // Prepare for next loop + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x1_64Xor_loop + +mulSve_10x1_64Xor_end: + RET + +// func mulSve_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x2_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x2_64_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R15 + MOVD 24(R14), R14 + MOVD start+72(FP), R6 + + // Add start offset to output + WORD $0x8b0601ef // add x15, x15, x6 + WORD $0x8b0601ce // add x14, x14, x6 + + // Add start offset to input + WORD $0x8b060021 // add x1, x1, x6 + WORD $0x8b060084 // add x4, x4, x6 + WORD $0x8b0600a5 // add x5, x5, x6 + WORD $0x8b060108 // add x8, x8, x6 + WORD $0x8b060129 // add x9, x9, x6 + WORD $0x8b06014a // add x10, x10, x6 + WORD $0x8b06016b // add x11, x11, x6 + WORD $0x8b06018c // add x12, x12, x6 + WORD $0x8b0601ad // add x13, x13, x6 + WORD $0x8b060063 // add x3, x3, x6 + WORD $0xd28001e6 // mov x6, #15 + WORD 
$0x05e038c4 // mov z4.d, x6 + WORD $0x05212084 // dup z4.b, z4.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + WORD $0x85804029 // ldr z9, [x1] + WORD $0x8580442b // ldr z11, [x1, #1, MUL VL] + WORD $0x91010021 // add x1, x1, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85804045 // ldr z5, [x2] + WORD $0x85804446 // ldr z6, [x2, #1, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a530c0 // eor z0.d, z6.d, z5.d + WORD $0x04a73101 // eor z1.d, z8.d, z7.d + WORD $0x85804845 // ldr z5, [x2, #2, MUL VL] + WORD $0x85804c46 // ldr z6, [x2, #3, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a530c2 // eor z2.d, z6.d, z5.d + WORD $0x04a73103 // eor z3.d, z8.d, z7.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 1 to 2 outputs + WORD $0x85804089 // ldr z9, [x4] + WORD $0x8580448b // ldr z11, [x4, #1, MUL VL] + WORD $0x91010084 // add x4, x4, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85805045 // ldr z5, [x2, #4, MUL VL] + WORD $0x85805446 // ldr z6, [x2, #5, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85805845 // ldr z5, [x2, #6, MUL VL] + WORD $0x85805c46 // ldr z6, [x2, #7, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 2 to 2 outputs + WORD $0x858040a9 // ldr z9, [x5] + WORD $0x858044ab // ldr z11, [x5, #1, MUL VL] + WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85814045 // ldr z5, [x2, #8, MUL VL] + WORD $0x85814446 // ldr z6, [x2, #9, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, 
z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85814845 // ldr z5, [x2, #10, MUL VL] + WORD $0x85814c46 // ldr z6, [x2, #11, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 3 to 2 outputs + WORD $0x85804109 // ldr z9, [x8] + WORD $0x8580450b // ldr z11, [x8, #1, MUL VL] + WORD $0x91010108 // add x8, x8, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85815045 // ldr z5, [x2, #12, MUL VL] + WORD $0x85815446 // ldr z6, [x2, #13, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85815845 // ldr z5, [x2, #14, MUL VL] + WORD $0x85815c46 // ldr z6, [x2, #15, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 4 to 2 outputs + WORD $0x85804129 // ldr z9, [x9] + WORD $0x8580452b // ldr z11, [x9, #1, MUL VL] + WORD $0x91010129 // add x9, x9, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85824045 // ldr z5, [x2, #16, MUL VL] + WORD $0x85824446 // ldr z6, [x2, #17, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85824845 // ldr z5, [x2, #18, MUL VL] + WORD $0x85824c46 // ldr z6, [x2, #19, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 5 to 2 outputs + WORD $0x85804149 // ldr z9, [x10] + WORD $0x8580454b // ldr z11, [x10, 
#1, MUL VL] + WORD $0x9101014a // add x10, x10, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85825045 // ldr z5, [x2, #20, MUL VL] + WORD $0x85825446 // ldr z6, [x2, #21, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85825845 // ldr z5, [x2, #22, MUL VL] + WORD $0x85825c46 // ldr z6, [x2, #23, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 6 to 2 outputs + WORD $0x85804169 // ldr z9, [x11] + WORD $0x8580456b // ldr z11, [x11, #1, MUL VL] + WORD $0x9101016b // add x11, x11, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85834045 // ldr z5, [x2, #24, MUL VL] + WORD $0x85834446 // ldr z6, [x2, #25, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85834845 // ldr z5, [x2, #26, MUL VL] + WORD $0x85834c46 // ldr z6, [x2, #27, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 7 to 2 outputs + WORD $0x85804189 // ldr z9, [x12] + WORD $0x8580458b // ldr z11, [x12, #1, MUL VL] + WORD $0x9101018c // add x12, x12, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85835045 // ldr z5, [x2, #28, MUL VL] + WORD $0x85835446 // ldr z6, [x2, #29, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor 
z1.d, z1.d, z8.d + WORD $0x85835845 // ldr z5, [x2, #30, MUL VL] + WORD $0x85835c46 // ldr z6, [x2, #31, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 8 to 2 outputs + WORD $0x858041a9 // ldr z9, [x13] + WORD $0x858045ab // ldr z11, [x13, #1, MUL VL] + WORD $0x910101ad // add x13, x13, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85844045 // ldr z5, [x2, #32, MUL VL] + WORD $0x85844446 // ldr z6, [x2, #33, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85844845 // ldr z5, [x2, #34, MUL VL] + WORD $0x85844c46 // ldr z6, [x2, #35, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 9 to 2 outputs + WORD $0x85804069 // ldr z9, [x3] + WORD $0x8580446b // ldr z11, [x3, #1, MUL VL] + WORD $0x91010063 // add x3, x3, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85845045 // ldr z5, [x2, #36, MUL VL] + WORD $0x85845446 // ldr z6, [x2, #37, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85845845 // ldr z5, [x2, #38, MUL VL] + WORD $0x85845c46 // ldr z6, [x2, #39, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + +mulSve_10x2_64_store: + // Store 2 outputs + WORD $0xe58041e0 // str z0, [x15] + WORD $0xe58045e1 // str z1, [x15, #1, MUL VL] + WORD $0x910101ef // add x15, x15, #64 + WORD $0xe58041c2 // str z2, [x14] + WORD $0xe58045c3 // str z3, [x14, #1, MUL VL] + WORD $0x910101ce // add x14, 
x14, #64
+
+	// Prepare for next loop
+	WORD $0xf1000400 // subs x0, x0, #1
+	BNE mulSve_10x2_64_loop
+
+mulSve_10x2_64_end:
+	RET
+
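All of these generated kernels implement the same GF(2^8) trick: each input byte is split into its low and high nibble (`lsr` plus `and` against the 0x0f splat), each nibble indexes a 16-byte lookup table held in a vector register (`tbl`), and the two lookups XOR together to give the product of that byte with one matrix coefficient; the products are then XOR-folded into the output accumulators (`eor`). The `Xor` variants differ only in seeding the accumulators from the existing output instead of overwriting it. Below is a minimal scalar sketch in Go of what a mulSve_10x2-style kernel computes; the names and table layout are illustrative assumptions, not the generated code's ABI.

```go
package gfref

// mulAdd2 is a scalar reference for a mulSve_10x2-style kernel: it multiplies
// up to ten input shards by per-(input,output) GF(2^8) coefficients and folds
// the products into two output shards. tables[i][o] holds the two 16-byte
// nibble tables for the coefficient applied to input i and output o
// (tables[i][o][0] is indexed by the low nibble, tables[i][o][1] by the high).
func mulAdd2(tables [][2][2][16]byte, in [][]byte, out [2][]byte, xor bool) {
	for o := 0; o < 2; o++ {
		for pos := range out[o] {
			var acc byte
			if xor { // the ...Xor variant accumulates into the existing output
				acc = out[o][pos]
			}
			for i := range in {
				b := in[i][pos]
				t := &tables[i][o]
				// Two 16-entry lookups replace a 256-entry multiply table:
				// mul(c, b) == t[0][b&0x0f] ^ t[1][b>>4].
				acc ^= t[0][b&0x0f] ^ t[1][b>>4]
			}
			out[o][pos] = acc
		}
	}
}
```

Keeping each table at 16 bytes is what makes the `tbl` form pay off: a single vector register holds an entire table, so one instruction performs the lookup for every byte lane at once.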
+// func mulSve_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
+TEXT ·mulSve_10x2_64Xor(SB), $8-88
+	// Loading no tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 89 YMM used
+	MOVD n+80(FP), R0
+	MOVD matrix_base+0(FP), R2
+	WORD $0xd346fc00 // lsr x0, x0, #6
+	WORD $0xea00001f // tst x0, x0
+	BEQ mulSve_10x2_64Xor_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD (R14), R15
+	MOVD 24(R14), R14
+	MOVD start+72(FP), R6
+
+	// Add start offset to output
+	WORD $0x8b0601ef // add x15, x15, x6
+	WORD $0x8b0601ce // add x14, x14, x6
+
+	// Add start offset to input
+	WORD $0x8b060021 // add x1, x1, x6
+	WORD $0x8b060084 // add x4, x4, x6
+	WORD $0x8b0600a5 // add x5, x5, x6
+	WORD $0x8b060108 // add x8, x8, x6
+	WORD $0x8b060129 // add x9, x9, x6
+	WORD $0x8b06014a // add x10, x10, x6
+	WORD $0x8b06016b // add x11, x11, x6
+	WORD $0x8b06018c // add x12, x12, x6
+	WORD $0x8b0601ad // add x13, x13, x6
+	WORD $0x8b060063 // add x3, x3, x6
+	WORD $0xd28001e6 // mov x6, #15
+	WORD $0x05e038c4 // mov z4.d, x6
+	WORD $0x05212084 // dup z4.b, z4.b[0]
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulSve_10x2_64Xor_loop:
+	// Load 2 outputs
+	WORD $0x858041e0 // ldr z0, [x15]
+	WORD $0x858045e1 // ldr z1, [x15, #1, MUL VL]
+	WORD $0x858041c2 // ldr z2, [x14]
+	WORD $0x858045c3 // ldr z3, [x14, #1, MUL VL]
+
+	// Load and process 64 bytes from input 0 to 2 outputs
+	WORD $0x85804029 // ldr z9, [x1]
+	WORD $0x8580442b // ldr z11, [x1, #1, MUL VL]
+	WORD $0x91010021 // add x1, x1, #64
+	WORD $0x04fc952a // lsr z10.d, z9.d, #4
+	WORD $0x04fc956c // lsr z12.d, z11.d, #4
+	WORD $0x04243129 // and z9.d, z9.d, z4.d
+	WORD $0x0424316b // and z11.d, z11.d, z4.d
+	WORD $0x0424314a // and z10.d, z10.d, z4.d
+	WORD $0x0424318c // and z12.d, z12.d, z4.d
+	WORD $0x85804045 // ldr z5, [x2]
+	WORD $0x85804446 // ldr z6, [x2, #1, MUL VL]
+	WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b
+	WORD $0x052930a5 // tbl z5.b, z5.b, z9.b
+	WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b
+	WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b
+	WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+	WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+	WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+	WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+	WORD $0x85804845 // ldr z5, [x2, #2, MUL VL]
+	WORD $0x85804c46 // ldr z6, [x2, #3, MUL VL]
+	WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b
+	WORD $0x052930a5 // tbl z5.b, z5.b, z9.b
+	WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b
+	WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b
+	WORD $0x04a53042 // eor z2.d, z2.d, z5.d
+	WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+	WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+	WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulSve_10x2_64Xor_store
+
+	// Load and process 64 bytes from input 1 to 2 outputs
+	WORD $0x85804089 // ldr z9, [x4]
+	WORD $0x8580448b // ldr z11, [x4, #1, MUL VL]
+	WORD $0x91010084 // add x4, x4, #64
+	WORD $0x04fc952a // lsr z10.d, z9.d, #4
+	WORD $0x04fc956c // lsr z12.d, z11.d, #4
+	WORD $0x04243129 // and z9.d, z9.d, z4.d
+	WORD $0x0424316b // and z11.d, z11.d, z4.d
+	WORD $0x0424314a // and z10.d, z10.d, z4.d
+	WORD $0x0424318c // and z12.d, z12.d, z4.d
+	WORD $0x85805045 // ldr z5, [x2, #4, MUL VL]
+	WORD $0x85805446 // ldr z6, [x2, #5, MUL VL]
+	WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b
+	WORD $0x052930a5 // tbl z5.b, z5.b, z9.b
+	WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b
+	WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b
+	WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+	WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+	WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+	WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+	WORD $0x85805845 // ldr z5, [x2, #6, MUL VL]
+	WORD $0x85805c46 // ldr z6, [x2, #7, MUL VL]
+	WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b
+	WORD $0x052930a5 // tbl z5.b, z5.b, z9.b
+	WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b
+	WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b
+	WORD $0x04a53042 // eor z2.d, z2.d, z5.d
+	WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+	WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+	WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulSve_10x2_64Xor_store
+
+	// Load and process 64 bytes from input 2 to 2 outputs
+	WORD $0x858040a9 // ldr z9, [x5]
+	WORD $0x858044ab // ldr z11, [x5, #1, MUL VL]
+	WORD $0x910100a5 // add x5, x5, #64
+	WORD $0x04fc952a // lsr z10.d, z9.d, #4
+	WORD $0x04fc956c // lsr z12.d, z11.d, #4
+	WORD $0x04243129 // and z9.d, z9.d, z4.d
+	WORD $0x0424316b // and z11.d, z11.d, z4.d
+	WORD $0x0424314a // and z10.d, z10.d, z4.d
+	WORD $0x0424318c // and z12.d, z12.d, z4.d
+	WORD $0x85814045 // ldr z5, [x2, #8, MUL VL]
+	WORD $0x85814446 // ldr z6, [x2, #9, MUL VL]
+	WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b
+	WORD $0x052930a5 // tbl z5.b, z5.b, z9.b
+	WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b
+	WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b
+	WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+	WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+	WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+	WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+	WORD $0x85814845 // ldr z5, [x2, #10, MUL VL]
+	WORD $0x85814c46 // ldr z6, [x2, #11, MUL VL]
+	WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b
+	WORD $0x052930a5 // tbl z5.b, z5.b, z9.b
+	WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b
+	WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b
+	WORD $0x04a53042 // eor z2.d, z2.d, z5.d
+	WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+	WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+	WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulSve_10x2_64Xor_store
+
+	// Load and process 64 bytes from input 3 to 2 outputs
+	WORD $0x85804109 // ldr z9, [x8]
+	WORD $0x8580450b // ldr z11, [x8, #1, MUL VL]
+	WORD $0x91010108 // add x8, x8, #64
+	WORD $0x04fc952a // lsr z10.d, z9.d, #4
+	WORD $0x04fc956c // lsr z12.d, z11.d, #4
+	WORD $0x04243129 // and z9.d, z9.d, z4.d
+	WORD $0x0424316b // and z11.d, z11.d, z4.d
+	WORD $0x0424314a // and z10.d, z10.d, z4.d
+	WORD $0x0424318c // and z12.d, z12.d, z4.d
+	WORD $0x85815045 // ldr z5, [x2, #12, MUL VL]
+	WORD $0x85815446 // ldr z6, [x2, #13, MUL VL]
+	WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b
+	WORD $0x052930a5 // tbl z5.b, z5.b, z9.b
+	WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b
+	WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b
+	WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+	WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+	WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+	WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+	WORD $0x85815845 // ldr z5, [x2, #14, MUL VL]
+	WORD $0x85815c46 // ldr z6, [x2, #15, MUL VL]
+	WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b
+	WORD $0x052930a5 // tbl z5.b, z5.b, z9.b
+	WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b
+	WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b
+	WORD
$0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 4 to 2 outputs + WORD $0x85804129 // ldr z9, [x9] + WORD $0x8580452b // ldr z11, [x9, #1, MUL VL] + WORD $0x91010129 // add x9, x9, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85824045 // ldr z5, [x2, #16, MUL VL] + WORD $0x85824446 // ldr z6, [x2, #17, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85824845 // ldr z5, [x2, #18, MUL VL] + WORD $0x85824c46 // ldr z6, [x2, #19, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 5 to 2 outputs + WORD $0x85804149 // ldr z9, [x10] + WORD $0x8580454b // ldr z11, [x10, #1, MUL VL] + WORD $0x9101014a // add x10, x10, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85825045 // ldr z5, [x2, #20, MUL VL] + WORD $0x85825446 // ldr z6, [x2, #21, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85825845 // ldr z5, [x2, #22, MUL VL] + WORD $0x85825c46 // ldr z6, [x2, #23, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 6 to 2 outputs + WORD $0x85804169 // ldr z9, [x11] + WORD $0x8580456b // ldr z11, [x11, #1, MUL VL] + WORD $0x9101016b // add x11, x11, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85834045 // ldr z5, [x2, #24, MUL VL] + 
WORD $0x85834446 // ldr z6, [x2, #25, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85834845 // ldr z5, [x2, #26, MUL VL] + WORD $0x85834c46 // ldr z6, [x2, #27, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 7 to 2 outputs + WORD $0x85804189 // ldr z9, [x12] + WORD $0x8580458b // ldr z11, [x12, #1, MUL VL] + WORD $0x9101018c // add x12, x12, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85835045 // ldr z5, [x2, #28, MUL VL] + WORD $0x85835446 // ldr z6, [x2, #29, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85835845 // ldr z5, [x2, #30, MUL VL] + WORD $0x85835c46 // ldr z6, [x2, #31, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 8 to 2 outputs + WORD $0x858041a9 // ldr z9, [x13] + WORD $0x858045ab // ldr z11, [x13, #1, MUL VL] + WORD $0x910101ad // add x13, x13, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85844045 // ldr z5, [x2, #32, MUL VL] + WORD $0x85844446 // ldr z6, [x2, #33, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85844845 // ldr z5, [x2, #34, MUL VL] + WORD $0x85844c46 // ldr z6, [x2, #35, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor 
z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 9 to 2 outputs + WORD $0x85804069 // ldr z9, [x3] + WORD $0x8580446b // ldr z11, [x3, #1, MUL VL] + WORD $0x91010063 // add x3, x3, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85845045 // ldr z5, [x2, #36, MUL VL] + WORD $0x85845446 // ldr z6, [x2, #37, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85845845 // ldr z5, [x2, #38, MUL VL] + WORD $0x85845c46 // ldr z6, [x2, #39, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + +mulSve_10x2_64Xor_store: + // Store 2 outputs + WORD $0xe58041e0 // str z0, [x15] + WORD $0xe58045e1 // str z1, [x15, #1, MUL VL] + WORD $0x910101ef // add x15, x15, #64 + WORD $0xe58041c2 // str z2, [x14] + WORD $0xe58045c3 // str z3, [x14, #1, MUL VL] + WORD $0x910101ce // add x14, x14, #64 + + // Prepare for next loop + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x2_64Xor_loop + +mulSve_10x2_64Xor_end: + RET + +// func mulSve_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x3_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 130 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x3_64_end + MOVD in_base+24(FP), R0 + MOVD (R0), R3 + MOVD 24(R0), R1 + MOVD 48(R0), R4 + MOVD 72(R0), R5 + MOVD 96(R0), R8 + MOVD 120(R0), R9 + MOVD 144(R0), R10 + MOVD 168(R0), R11 + MOVD 192(R0), R12 + MOVD 216(R0), R0 + MOVD out_base+48(FP), R13 + MOVD (R13), R14 + MOVD 24(R13), R15 + MOVD 48(R13), R13 + MOVD start+72(FP), R6 + + // Add start offset to output + WORD $0x8b0601ce // add x14, x14, x6 + WORD $0x8b0601ef // add x15, x15, x6 + WORD $0x8b0601ad // add x13, x13, x6 + + // Add start offset to input + WORD $0x8b060063 // add x3, x3, x6 + WORD $0x8b060021 // add x1, x1, x6 + WORD $0x8b060084 // add x4, x4, x6 + WORD $0x8b0600a5 // add x5, x5, x6 + WORD $0x8b060108 // add x8, x8, x6 + WORD $0x8b060129 // add x9, x9, x6 + WORD $0x8b06014a // add x10, x10, x6 + WORD $0x8b06016b // add x11, x11, x6 + WORD $0x8b06018c // add x12, x12, x6 + WORD $0x8b060000 // add x0, x0, x6 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c6 // mov z6.d, x6 + WORD $0x052120c6 // dup z6.b, z6.b[0] + + // Reload length to save a register + MOVD n+80(FP), R6 + WORD $0xd346fcc6 // lsr x6, x6, #6 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x3_64_loop: + // Load and process 64 bytes from input 0 to 3 
outputs + WORD $0x8580406b // ldr z11, [x3] + WORD $0x8580446d // ldr z13, [x3, #1, MUL VL] + WORD $0x91010063 // add x3, x3, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85804047 // ldr z7, [x2] + WORD $0x85804448 // ldr z8, [x2, #1, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73100 // eor z0.d, z8.d, z7.d + WORD $0x04a93141 // eor z1.d, z10.d, z9.d + WORD $0x85804847 // ldr z7, [x2, #2, MUL VL] + WORD $0x85804c48 // ldr z8, [x2, #3, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73102 // eor z2.d, z8.d, z7.d + WORD $0x04a93143 // eor z3.d, z10.d, z9.d + WORD $0x85805047 // ldr z7, [x2, #4, MUL VL] + WORD $0x85805448 // ldr z8, [x2, #5, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73104 // eor z4.d, z8.d, z7.d + WORD $0x04a93145 // eor z5.d, z10.d, z9.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 1 to 3 outputs + WORD $0x8580402b // ldr z11, [x1] + WORD $0x8580442d // ldr z13, [x1, #1, MUL VL] + WORD $0x91010021 // add x1, x1, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85805847 // ldr z7, [x2, #6, MUL VL] + WORD $0x85805c48 // ldr z8, [x2, #7, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85814047 // ldr z7, [x2, #8, MUL VL] + WORD $0x85814448 // ldr z8, [x2, #9, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85814847 // ldr z7, [x2, #10, MUL VL] + WORD $0x85814c48 // ldr z8, [x2, #11, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 2 to 3 outputs + WORD $0x8580408b // ldr z11, [x4] + WORD $0x8580448d // ldr z13, [x4, #1, MUL VL] + WORD $0x91010084 // 
add x4, x4, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85815047 // ldr z7, [x2, #12, MUL VL] + WORD $0x85815448 // ldr z8, [x2, #13, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85815847 // ldr z7, [x2, #14, MUL VL] + WORD $0x85815c48 // ldr z8, [x2, #15, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85824047 // ldr z7, [x2, #16, MUL VL] + WORD $0x85824448 // ldr z8, [x2, #17, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 3 to 3 outputs + WORD $0x858040ab // ldr z11, [x5] + WORD $0x858044ad // ldr z13, [x5, #1, MUL VL] + WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85824847 // ldr z7, [x2, #18, MUL VL] + WORD $0x85824c48 // ldr z8, [x2, #19, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85825047 // ldr z7, [x2, #20, MUL VL] + WORD $0x85825448 // ldr z8, [x2, #21, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85825847 // ldr z7, [x2, #22, MUL VL] + WORD $0x85825c48 // ldr z8, [x2, #23, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x3_64_store + 
+ // Load and process 64 bytes from input 4 to 3 outputs + WORD $0x8580410b // ldr z11, [x8] + WORD $0x8580450d // ldr z13, [x8, #1, MUL VL] + WORD $0x91010108 // add x8, x8, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85834047 // ldr z7, [x2, #24, MUL VL] + WORD $0x85834448 // ldr z8, [x2, #25, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85834847 // ldr z7, [x2, #26, MUL VL] + WORD $0x85834c48 // ldr z8, [x2, #27, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85835047 // ldr z7, [x2, #28, MUL VL] + WORD $0x85835448 // ldr z8, [x2, #29, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 5 to 3 outputs + WORD $0x8580412b // ldr z11, [x9] + WORD $0x8580452d // ldr z13, [x9, #1, MUL VL] + WORD $0x91010129 // add x9, x9, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85835847 // ldr z7, [x2, #30, MUL VL] + WORD $0x85835c48 // ldr z8, [x2, #31, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85844047 // ldr z7, [x2, #32, MUL VL] + WORD $0x85844448 // ldr z8, [x2, #33, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85844847 // ldr z7, [x2, #34, MUL VL] + WORD $0x85844c48 // ldr z8, [x2, #35, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d 
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 6 to 3 outputs + WORD $0x8580414b // ldr z11, [x10] + WORD $0x8580454d // ldr z13, [x10, #1, MUL VL] + WORD $0x9101014a // add x10, x10, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85845047 // ldr z7, [x2, #36, MUL VL] + WORD $0x85845448 // ldr z8, [x2, #37, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85845847 // ldr z7, [x2, #38, MUL VL] + WORD $0x85845c48 // ldr z8, [x2, #39, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85854047 // ldr z7, [x2, #40, MUL VL] + WORD $0x85854448 // ldr z8, [x2, #41, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 7 to 3 outputs + WORD $0x8580416b // ldr z11, [x11] + WORD $0x8580456d // ldr z13, [x11, #1, MUL VL] + WORD $0x9101016b // add x11, x11, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85854847 // ldr z7, [x2, #42, MUL VL] + WORD $0x85854c48 // ldr z8, [x2, #43, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85855047 // ldr z7, [x2, #44, MUL VL] + WORD $0x85855448 // ldr z8, [x2, #45, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85855847 // ldr z7, [x2, #46, MUL VL] + WORD $0x85855c48 // ldr z8, [x2, #47, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + 
WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 8 to 3 outputs + WORD $0x8580418b // ldr z11, [x12] + WORD $0x8580458d // ldr z13, [x12, #1, MUL VL] + WORD $0x9101018c // add x12, x12, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85864047 // ldr z7, [x2, #48, MUL VL] + WORD $0x85864448 // ldr z8, [x2, #49, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85864847 // ldr z7, [x2, #50, MUL VL] + WORD $0x85864c48 // ldr z8, [x2, #51, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85865047 // ldr z7, [x2, #52, MUL VL] + WORD $0x85865448 // ldr z8, [x2, #53, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 9 to 3 outputs + WORD $0x8580400b // ldr z11, [x0] + WORD $0x8580440d // ldr z13, [x0, #1, MUL VL] + WORD $0x91010000 // add x0, x0, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85865847 // ldr z7, [x2, #54, MUL VL] + WORD $0x85865c48 // ldr z8, [x2, #55, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85874047 // ldr z7, [x2, #56, MUL VL] + WORD $0x85874448 // ldr z8, [x2, #57, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85874847 
// ldr z7, [x2, #58, MUL VL]
+	WORD $0x85874c48 // ldr z8, [x2, #59, MUL VL]
+	WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b
+	WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b
+	WORD $0x052e310a // tbl z10.b, z8.b, z14.b
+	WORD $0x052c3108 // tbl z8.b, z8.b, z12.b
+	WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+	WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+	WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+	WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+
+mulSve_10x3_64_store:
+	// Store 3 outputs
+	WORD $0xe58041c0 // str z0, [x14]
+	WORD $0xe58045c1 // str z1, [x14, #1, MUL VL]
+	WORD $0x910101ce // add x14, x14, #64
+	WORD $0xe58041e2 // str z2, [x15]
+	WORD $0xe58045e3 // str z3, [x15, #1, MUL VL]
+	WORD $0x910101ef // add x15, x15, #64
+	WORD $0xe58041a4 // str z4, [x13]
+	WORD $0xe58045a5 // str z5, [x13, #1, MUL VL]
+	WORD $0x910101ad // add x13, x13, #64
+
+	// Prepare for next loop
+	WORD $0xf10004c6 // subs x6, x6, #1
+	BNE mulSve_10x3_64_loop
+
+mulSve_10x3_64_end:
+	RET
+
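For reference, the 16-byte nibble tables that the `tbl` instructions index can be derived from ordinary GF(2^8) multiplication. The sketch below assumes the field polynomial 0x11d that is commonly used for Reed-Solomon coding (an assumption, not stated in this diff) and relies on the multiply being linear over XOR, so mul(c, b) = mul(c, b&0x0f) ^ mul(c, b&0xf0).

```go
package gfref

// gfMul multiplies two GF(2^8) elements by shift-and-xor ("Russian peasant")
// multiplication, reducing modulo the polynomial 0x11d. The polynomial choice
// is an assumption for illustration; the vendored tables are precomputed
// upstream.
func gfMul(a, b byte) byte {
	var p byte
	for b > 0 {
		if b&1 != 0 {
			p ^= a
		}
		carry := a&0x80 != 0
		a <<= 1
		if carry {
			a ^= 0x1d // low byte of 0x11d, applied when the x^8 bit is shifted out
		}
		b >>= 1
	}
	return p
}

// nibbleTables builds the two 16-byte tables for coefficient c that a
// tbl-based kernel would load into vector registers.
func nibbleTables(c byte) (low, high [16]byte) {
	for n := 0; n < 16; n++ {
		low[n] = gfMul(c, byte(n))     // contribution of bits 0..3 of each input byte
		high[n] = gfMul(c, byte(n)<<4) // contribution of bits 4..7
	}
	return
}
```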
+// func mulSve_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
+TEXT ·mulSve_10x3_64Xor(SB), $8-88
+	// Loading no tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 130 YMM used
+	MOVD n+80(FP), R0
+	MOVD matrix_base+0(FP), R2
+	WORD $0xd346fc00 // lsr x0, x0, #6
+	WORD $0xea00001f // tst x0, x0
+	BEQ mulSve_10x3_64Xor_end
+	MOVD in_base+24(FP), R0
+	MOVD (R0), R3
+	MOVD 24(R0), R1
+	MOVD 48(R0), R4
+	MOVD 72(R0), R5
+	MOVD 96(R0), R8
+	MOVD 120(R0), R9
+	MOVD 144(R0), R10
+	MOVD 168(R0), R11
+	MOVD 192(R0), R12
+	MOVD 216(R0), R0
+	MOVD out_base+48(FP), R13
+	MOVD (R13), R14
+	MOVD 24(R13), R15
+	MOVD 48(R13), R13
+	MOVD start+72(FP), R6
+
+	// Add start offset to output
+	WORD $0x8b0601ce // add x14, x14, x6
+	WORD $0x8b0601ef // add x15, x15, x6
+	WORD $0x8b0601ad // add x13, x13, x6
+
+	// Add start offset to input
+	WORD $0x8b060063 // add x3, x3, x6
+	WORD $0x8b060021 // add x1, x1, x6
+	WORD $0x8b060084 // add x4, x4, x6
+	WORD $0x8b0600a5 // add x5, x5, x6
+	WORD $0x8b060108 // add x8, x8, x6
+	WORD $0x8b060129 // add x9, x9, x6
+	WORD $0x8b06014a // add x10, x10, x6
+	WORD $0x8b06016b // add x11, x11, x6
+	WORD $0x8b06018c // add x12, x12, x6
+	WORD $0x8b060000 // add x0, x0, x6
+	WORD $0xd28001e6 // mov x6, #15
+	WORD $0x05e038c6 // mov z6.d, x6
+	WORD $0x052120c6 // dup z6.b, z6.b[0]
+
+	// Reload length to save a register
+	MOVD n+80(FP), R6
+	WORD $0xd346fcc6 // lsr x6, x6, #6
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulSve_10x3_64Xor_loop:
+	// Load 3 outputs
+	WORD $0x858041c0 // ldr z0, [x14]
+	WORD $0x858045c1 // ldr z1, [x14, #1, MUL VL]
+	WORD $0x858041e2 // ldr z2, [x15]
+	WORD $0x858045e3 // ldr z3, [x15, #1, MUL VL]
+	WORD $0x858041a4 // ldr z4, [x13]
+	WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL]
+
+	// Load and process 64 bytes from input 0 to 3 outputs
+	WORD $0x8580406b // ldr z11, [x3]
+	WORD $0x8580446d // ldr z13, [x3, #1, MUL VL]
+	WORD $0x91010063 // add x3, x3, #64
+	WORD $0x04fc956c // lsr z12.d, z11.d, #4
+	WORD $0x04fc95ae // lsr z14.d, z13.d, #4
+	WORD $0x0426316b // and z11.d, z11.d, z6.d
+	WORD $0x042631ad // and z13.d, z13.d, z6.d
+	WORD $0x0426318c // and z12.d, z12.d, z6.d
+	WORD $0x042631ce // and z14.d, z14.d, z6.d
+	WORD $0x85804047 // ldr z7, [x2]
+	WORD $0x85804448 // ldr z8, [x2, #1, MUL VL]
+	WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b
+	WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b
+	WORD $0x052e310a // tbl z10.b, z8.b, z14.b
+	WORD $0x052c3108 // tbl z8.b, z8.b, z12.b
+	WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+	WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+	WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+	WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+	WORD $0x85804847 // ldr z7, [x2, #2, MUL VL]
+	WORD $0x85804c48 // ldr z8, [x2, #3, MUL VL]
+	WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b
+	WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b
+	WORD $0x052e310a // tbl z10.b, z8.b, z14.b
+	WORD $0x052c3108 // tbl z8.b, z8.b, z12.b
+	WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+	WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+	WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+	WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+	WORD $0x85805047 // ldr z7, [x2, #4, MUL VL]
+	WORD $0x85805448 // ldr z8, [x2, #5, MUL VL]
+	WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b
+	WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b
+	WORD $0x052e310a // tbl z10.b, z8.b, z14.b
+	WORD $0x052c3108 // tbl z8.b, z8.b, z12.b
+	WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+	WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+	WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+	WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulSve_10x3_64Xor_store
+
+	// Load and process 64 bytes from input 1 to 3 outputs
+	WORD $0x8580402b // ldr z11, [x1]
+	WORD $0x8580442d // ldr z13, [x1, #1, MUL VL]
+	WORD $0x91010021 // add x1, x1, #64
+	WORD $0x04fc956c // lsr z12.d, z11.d, #4
+	WORD $0x04fc95ae // lsr z14.d, z13.d, #4
+	WORD $0x0426316b // and z11.d, z11.d, z6.d
+	WORD $0x042631ad // and z13.d, z13.d, z6.d
+	WORD $0x0426318c // and z12.d, z12.d, z6.d
+	WORD $0x042631ce // and z14.d, z14.d, z6.d
+	WORD $0x85805847 // ldr z7, [x2, #6, MUL VL]
+	WORD $0x85805c48 // ldr z8, [x2, #7, MUL VL]
+	WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b
+	WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b
+	WORD $0x052e310a // tbl z10.b, z8.b, z14.b
+	WORD $0x052c3108 // tbl z8.b, z8.b, z12.b
+	WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+	WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+	WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+	WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+	WORD $0x85814047 // ldr z7, [x2, #8, MUL VL]
+	WORD $0x85814448 // ldr z8, [x2, #9, MUL VL]
+	WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b
+	WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b
+	WORD $0x052e310a // tbl z10.b, z8.b, z14.b
+	WORD $0x052c3108 // tbl z8.b, z8.b, z12.b
+	WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+	WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+	WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+	WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+	WORD $0x85814847 // ldr z7, [x2, #10, MUL VL]
+	WORD $0x85814c48 // ldr z8, [x2, #11, MUL VL]
+	WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b
+	WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b
+	WORD $0x052e310a // tbl z10.b, z8.b, z14.b
+	WORD $0x052c3108 // tbl z8.b, z8.b, z12.b
+	WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+	WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+	WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+	WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulSve_10x3_64Xor_store
+
+	// Load and process 64 bytes from input 2 to 3 outputs
+	WORD $0x8580408b // ldr z11, [x4]
+	WORD $0x8580448d // ldr z13, [x4, #1, MUL VL]
+	WORD $0x91010084 // add x4, x4, #64
+	WORD $0x04fc956c // lsr z12.d, z11.d, #4
+	WORD $0x04fc95ae // lsr z14.d, z13.d, #4
+	WORD $0x0426316b // and z11.d, z11.d, z6.d
+	WORD $0x042631ad // and z13.d, z13.d, z6.d
+	WORD $0x0426318c // and z12.d, z12.d, z6.d
+	WORD $0x042631ce // and z14.d, z14.d, z6.d
+	WORD $0x85815047 // ldr z7, [x2, #12, MUL VL]
+	WORD $0x85815448 // ldr z8, [x2, #13, MUL VL]
+	WORD $0x052d30e9 // tbl z9.b, z7.b,
z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85815847 // ldr z7, [x2, #14, MUL VL] + WORD $0x85815c48 // ldr z8, [x2, #15, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85824047 // ldr z7, [x2, #16, MUL VL] + WORD $0x85824448 // ldr z8, [x2, #17, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 3 to 3 outputs + WORD $0x858040ab // ldr z11, [x5] + WORD $0x858044ad // ldr z13, [x5, #1, MUL VL] + WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85824847 // ldr z7, [x2, #18, MUL VL] + WORD $0x85824c48 // ldr z8, [x2, #19, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85825047 // ldr z7, [x2, #20, MUL VL] + WORD $0x85825448 // ldr z8, [x2, #21, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85825847 // ldr z7, [x2, #22, MUL VL] + WORD $0x85825c48 // ldr z8, [x2, #23, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 4 to 3 outputs + WORD $0x8580410b // ldr z11, [x8] + WORD $0x8580450d // ldr z13, [x8, #1, MUL VL] + WORD $0x91010108 // add x8, x8, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD 
$0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85834047 // ldr z7, [x2, #24, MUL VL] + WORD $0x85834448 // ldr z8, [x2, #25, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85834847 // ldr z7, [x2, #26, MUL VL] + WORD $0x85834c48 // ldr z8, [x2, #27, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85835047 // ldr z7, [x2, #28, MUL VL] + WORD $0x85835448 // ldr z8, [x2, #29, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 5 to 3 outputs + WORD $0x8580412b // ldr z11, [x9] + WORD $0x8580452d // ldr z13, [x9, #1, MUL VL] + WORD $0x91010129 // add x9, x9, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85835847 // ldr z7, [x2, #30, MUL VL] + WORD $0x85835c48 // ldr z8, [x2, #31, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85844047 // ldr z7, [x2, #32, MUL VL] + WORD $0x85844448 // ldr z8, [x2, #33, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85844847 // ldr z7, [x2, #34, MUL VL] + WORD $0x85844c48 // ldr z8, [x2, #35, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 6 to 3 outputs + WORD $0x8580414b // ldr z11, [x10] + WORD $0x8580454d // ldr z13, [x10, #1, MUL VL] + WORD $0x9101014a // add x10, x10, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD 
$0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85845047 // ldr z7, [x2, #36, MUL VL] + WORD $0x85845448 // ldr z8, [x2, #37, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85845847 // ldr z7, [x2, #38, MUL VL] + WORD $0x85845c48 // ldr z8, [x2, #39, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85854047 // ldr z7, [x2, #40, MUL VL] + WORD $0x85854448 // ldr z8, [x2, #41, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 7 to 3 outputs + WORD $0x8580416b // ldr z11, [x11] + WORD $0x8580456d // ldr z13, [x11, #1, MUL VL] + WORD $0x9101016b // add x11, x11, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85854847 // ldr z7, [x2, #42, MUL VL] + WORD $0x85854c48 // ldr z8, [x2, #43, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85855047 // ldr z7, [x2, #44, MUL VL] + WORD $0x85855448 // ldr z8, [x2, #45, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85855847 // ldr z7, [x2, #46, MUL VL] + WORD $0x85855c48 // ldr z8, [x2, #47, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 8 to 3 
outputs + WORD $0x8580418b // ldr z11, [x12] + WORD $0x8580458d // ldr z13, [x12, #1, MUL VL] + WORD $0x9101018c // add x12, x12, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85864047 // ldr z7, [x2, #48, MUL VL] + WORD $0x85864448 // ldr z8, [x2, #49, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85864847 // ldr z7, [x2, #50, MUL VL] + WORD $0x85864c48 // ldr z8, [x2, #51, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85865047 // ldr z7, [x2, #52, MUL VL] + WORD $0x85865448 // ldr z8, [x2, #53, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 9 to 3 outputs + WORD $0x8580400b // ldr z11, [x0] + WORD $0x8580440d // ldr z13, [x0, #1, MUL VL] + WORD $0x91010000 // add x0, x0, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85865847 // ldr z7, [x2, #54, MUL VL] + WORD $0x85865c48 // ldr z8, [x2, #55, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85874047 // ldr z7, [x2, #56, MUL VL] + WORD $0x85874448 // ldr z8, [x2, #57, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85874847 // ldr z7, [x2, #58, MUL VL] + WORD $0x85874c48 // ldr z8, [x2, #59, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, 
z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + +mulSve_10x3_64Xor_store: + // Store 3 outputs + WORD $0xe58041c0 // str z0, [x14] + WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] + WORD $0x910101ce // add x14, x14, #64 + WORD $0xe58041e2 // str z2, [x15] + WORD $0xe58045e3 // str z3, [x15, #1, MUL VL] + WORD $0x910101ef // add x15, x15, #64 + WORD $0xe58041a4 // str z4, [x13] + WORD $0xe58045a5 // str z5, [x13, #1, MUL VL] + WORD $0x910101ad // add x13, x13, #64 + + // Prepare for next loop + WORD $0xf10004c6 // subs x6, x6, #1 + BNE mulSve_10x3_64Xor_loop + +mulSve_10x3_64Xor_end: + RET + +// func mulSve_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x4(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x4_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c4 // mov z4.d, x6 + WORD $0x05212084 // dup z4.b, z4.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + WORD $0x85804027 // ldr z7, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85804045 // ldr z5, [x2] + WORD $0x85804446 // ldr z6, [x2, #1, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a530c0 // eor z0.d, z6.d, z5.d + WORD $0x85804845 // ldr z5, [x2, #2, MUL VL] + WORD $0x85804c46 // ldr z6, [x2, #3, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a530c1 // eor z1.d, z6.d, z5.d + WORD $0x85805045 // ldr z5, [x2, #4, MUL VL] + WORD $0x85805446 // ldr z6, [x2, #5, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a530c2 // eor z2.d, z6.d, z5.d + WORD $0x85805845 // ldr z5, [x2, #6, MUL VL] + WORD $0x85805c46 // ldr z6, [x2, #7, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a530c3 // eor z3.d, z6.d, z5.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 1 to 4 outputs + WORD $0x85804087 // ldr z7, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85814045 // ldr z5, [x2, #8, MUL VL] + WORD $0x85814446 // ldr z6, [x2, #9, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD 
$0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85814845 // ldr z5, [x2, #10, MUL VL] + WORD $0x85814c46 // ldr z6, [x2, #11, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85815045 // ldr z5, [x2, #12, MUL VL] + WORD $0x85815446 // ldr z6, [x2, #13, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85815845 // ldr z5, [x2, #14, MUL VL] + WORD $0x85815c46 // ldr z6, [x2, #15, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 2 to 4 outputs + WORD $0x858040a7 // ldr z7, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85824045 // ldr z5, [x2, #16, MUL VL] + WORD $0x85824446 // ldr z6, [x2, #17, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85824845 // ldr z5, [x2, #18, MUL VL] + WORD $0x85824c46 // ldr z6, [x2, #19, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85825045 // ldr z5, [x2, #20, MUL VL] + WORD $0x85825446 // ldr z6, [x2, #21, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85825845 // ldr z5, [x2, #22, MUL VL] + WORD $0x85825c46 // ldr z6, [x2, #23, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 3 to 4 outputs + WORD $0x85804107 // ldr z7, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85834045 // ldr z5, [x2, #24, MUL VL] + WORD $0x85834446 // ldr z6, [x2, #25, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85834845 // ldr z5, [x2, #26, MUL VL] + WORD $0x85834c46 // ldr z6, [x2, #27, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85835045 // ldr z5, [x2, #28, MUL VL] + WORD $0x85835446 // ldr z6, [x2, #29, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85835845 // ldr z5, [x2, #30, MUL VL] + WORD $0x85835c46 // ldr z6, [x2, #31, MUL 
VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 4 to 4 outputs + WORD $0x85804127 // ldr z7, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85844045 // ldr z5, [x2, #32, MUL VL] + WORD $0x85844446 // ldr z6, [x2, #33, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85844845 // ldr z5, [x2, #34, MUL VL] + WORD $0x85844c46 // ldr z6, [x2, #35, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85845045 // ldr z5, [x2, #36, MUL VL] + WORD $0x85845446 // ldr z6, [x2, #37, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85845845 // ldr z5, [x2, #38, MUL VL] + WORD $0x85845c46 // ldr z6, [x2, #39, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 5 to 4 outputs + WORD $0x85804147 // ldr z7, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85854045 // ldr z5, [x2, #40, MUL VL] + WORD $0x85854446 // ldr z6, [x2, #41, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85854845 // ldr z5, [x2, #42, MUL VL] + WORD $0x85854c46 // ldr z6, [x2, #43, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85855045 // ldr z5, [x2, #44, MUL VL] + WORD $0x85855446 // ldr z6, [x2, #45, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85855845 // ldr z5, [x2, #46, MUL VL] + WORD $0x85855c46 // ldr z6, [x2, #47, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 6 to 4 outputs + WORD $0x85804167 // ldr z7, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85864045 // ldr z5, [x2, #48, MUL VL] + WORD $0x85864446 // ldr z6, [x2, #49, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD 
$0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85864845 // ldr z5, [x2, #50, MUL VL] + WORD $0x85864c46 // ldr z6, [x2, #51, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85865045 // ldr z5, [x2, #52, MUL VL] + WORD $0x85865446 // ldr z6, [x2, #53, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85865845 // ldr z5, [x2, #54, MUL VL] + WORD $0x85865c46 // ldr z6, [x2, #55, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 7 to 4 outputs + WORD $0x85804187 // ldr z7, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85874045 // ldr z5, [x2, #56, MUL VL] + WORD $0x85874446 // ldr z6, [x2, #57, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85874845 // ldr z5, [x2, #58, MUL VL] + WORD $0x85874c46 // ldr z6, [x2, #59, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85875045 // ldr z5, [x2, #60, MUL VL] + WORD $0x85875446 // ldr z6, [x2, #61, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85875845 // ldr z5, [x2, #62, MUL VL] + WORD $0x85875c46 // ldr z6, [x2, #63, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 8 to 4 outputs + WORD $0x858041a7 // ldr z7, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85884045 // ldr z5, [x2, #64, MUL VL] + WORD $0x85884446 // ldr z6, [x2, #65, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85884845 // ldr z5, [x2, #66, MUL VL] + WORD $0x85884c46 // ldr z6, [x2, #67, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85885045 // ldr z5, [x2, #68, MUL VL] + WORD $0x85885446 // ldr z6, [x2, #69, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85885845 // ldr z5, [x2, #70, MUL VL] + WORD $0x85885c46 // ldr z6, [x2, #71, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, 
z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x4_store + + // Load and process 32 bytes from input 9 to 4 outputs + WORD $0x85804067 // ldr z7, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85894045 // ldr z5, [x2, #72, MUL VL] + WORD $0x85894446 // ldr z6, [x2, #73, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85894845 // ldr z5, [x2, #74, MUL VL] + WORD $0x85894c46 // ldr z6, [x2, #75, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85895045 // ldr z5, [x2, #76, MUL VL] + WORD $0x85895446 // ldr z6, [x2, #77, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85895845 // ldr z5, [x2, #78, MUL VL] + WORD $0x85895c46 // ldr z6, [x2, #79, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + +mulSve_10x4_store: + // Store 4 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x4_loop + +mulSve_10x4_end: + RET + +// func mulSve_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x4Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x4Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c4 // mov z4.d, x6 + WORD $0x05212084 // dup z4.b, z4.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + WORD $0x85804027 // ldr z7, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // 
and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804045 // ldr z5, [x2] + WORD $0x85804446 // ldr z6, [x2, #1, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804845 // ldr z5, [x2, #2, MUL VL] + WORD $0x85804c46 // ldr z6, [x2, #3, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805045 // ldr z5, [x2, #4, MUL VL] + WORD $0x85805446 // ldr z6, [x2, #5, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805845 // ldr z5, [x2, #6, MUL VL] + WORD $0x85805c46 // ldr z6, [x2, #7, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 1 to 4 outputs + WORD $0x85804087 // ldr z7, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85814045 // ldr z5, [x2, #8, MUL VL] + WORD $0x85814446 // ldr z6, [x2, #9, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85814845 // ldr z5, [x2, #10, MUL VL] + WORD $0x85814c46 // ldr z6, [x2, #11, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85815045 // ldr z5, [x2, #12, MUL VL] + WORD $0x85815446 // ldr z6, [x2, #13, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85815845 // ldr z5, [x2, #14, MUL VL] + WORD $0x85815c46 // ldr z6, [x2, #15, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 2 to 4 outputs + WORD $0x858040a7 // ldr z7, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85824045 // ldr z5, [x2, #16, MUL VL] + WORD $0x85824446 // ldr z6, [x2, #17, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85824845 // ldr z5, [x2, #18, MUL VL] + WORD $0x85824c46 // ldr z6, [x2, #19, MUL VL] + WORD $0x052730a5 
// tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85825045 // ldr z5, [x2, #20, MUL VL] + WORD $0x85825446 // ldr z6, [x2, #21, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85825845 // ldr z5, [x2, #22, MUL VL] + WORD $0x85825c46 // ldr z6, [x2, #23, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 3 to 4 outputs + WORD $0x85804107 // ldr z7, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85834045 // ldr z5, [x2, #24, MUL VL] + WORD $0x85834446 // ldr z6, [x2, #25, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85834845 // ldr z5, [x2, #26, MUL VL] + WORD $0x85834c46 // ldr z6, [x2, #27, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85835045 // ldr z5, [x2, #28, MUL VL] + WORD $0x85835446 // ldr z6, [x2, #29, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85835845 // ldr z5, [x2, #30, MUL VL] + WORD $0x85835c46 // ldr z6, [x2, #31, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 4 to 4 outputs + WORD $0x85804127 // ldr z7, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85844045 // ldr z5, [x2, #32, MUL VL] + WORD $0x85844446 // ldr z6, [x2, #33, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85844845 // ldr z5, [x2, #34, MUL VL] + WORD $0x85844c46 // ldr z6, [x2, #35, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85845045 // ldr z5, [x2, #36, MUL VL] + WORD $0x85845446 // ldr z6, [x2, #37, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85845845 // ldr z5, [x2, #38, MUL VL] + WORD $0x85845c46 // ldr z6, [x2, #39, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $5, R16 + BEQ 
mulSve_10x4Xor_store + + // Load and process 32 bytes from input 5 to 4 outputs + WORD $0x85804147 // ldr z7, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85854045 // ldr z5, [x2, #40, MUL VL] + WORD $0x85854446 // ldr z6, [x2, #41, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85854845 // ldr z5, [x2, #42, MUL VL] + WORD $0x85854c46 // ldr z6, [x2, #43, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85855045 // ldr z5, [x2, #44, MUL VL] + WORD $0x85855446 // ldr z6, [x2, #45, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85855845 // ldr z5, [x2, #46, MUL VL] + WORD $0x85855c46 // ldr z6, [x2, #47, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 6 to 4 outputs + WORD $0x85804167 // ldr z7, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85864045 // ldr z5, [x2, #48, MUL VL] + WORD $0x85864446 // ldr z6, [x2, #49, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85864845 // ldr z5, [x2, #50, MUL VL] + WORD $0x85864c46 // ldr z6, [x2, #51, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85865045 // ldr z5, [x2, #52, MUL VL] + WORD $0x85865446 // ldr z6, [x2, #53, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85865845 // ldr z5, [x2, #54, MUL VL] + WORD $0x85865c46 // ldr z6, [x2, #55, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 7 to 4 outputs + WORD $0x85804187 // ldr z7, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85874045 // ldr z5, [x2, #56, MUL VL] + WORD $0x85874446 // ldr z6, [x2, #57, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85874845 // ldr z5, [x2, #58, MUL VL] + WORD $0x85874c46 // ldr z6, [x2, #59, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + 
WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85875045 // ldr z5, [x2, #60, MUL VL] + WORD $0x85875446 // ldr z6, [x2, #61, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85875845 // ldr z5, [x2, #62, MUL VL] + WORD $0x85875c46 // ldr z6, [x2, #63, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 8 to 4 outputs + WORD $0x858041a7 // ldr z7, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85884045 // ldr z5, [x2, #64, MUL VL] + WORD $0x85884446 // ldr z6, [x2, #65, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85884845 // ldr z5, [x2, #66, MUL VL] + WORD $0x85884c46 // ldr z6, [x2, #67, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85885045 // ldr z5, [x2, #68, MUL VL] + WORD $0x85885446 // ldr z6, [x2, #69, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85885845 // ldr z5, [x2, #70, MUL VL] + WORD $0x85885c46 // ldr z6, [x2, #71, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 9 to 4 outputs + WORD $0x85804067 // ldr z7, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85894045 // ldr z5, [x2, #72, MUL VL] + WORD $0x85894446 // ldr z6, [x2, #73, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85894845 // ldr z5, [x2, #74, MUL VL] + WORD $0x85894c46 // ldr z6, [x2, #75, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85895045 // ldr z5, [x2, #76, MUL VL] + WORD $0x85895446 // ldr z6, [x2, #77, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85895845 // ldr z5, [x2, #78, MUL VL] + WORD $0x85895c46 // ldr z6, [x2, #79, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + +mulSve_10x4Xor_store: + // Store 4 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] 
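
The st1d just above and the three that follow show how the generated code walks out [][]byte: R14 holds the base of the slice-header array, each Go slice header is 24 bytes (pointer/len/cap) on a 64-bit target, so the data pointer of output k is fetched with MOVD 24*k(R14), and st1d rescales the running element counter x15 (the byte offset shifted right by 3 in the prologue) back into a byte address via "lsl #3". A minimal Go sketch of the same store, assuming four 32-byte accumulators and an 8-byte element index (names are illustrative, not the library's API):

	// storeOutputs models "MOVD 24*k(R14), R6" followed by
	// "st1d { zK.d }, p0, [x6, x15, lsl #3]" for k = 0..3.
	// elem is the x15 element counter; elem*8 recovers the byte offset.
	func storeOutputs(out [][]byte, elem int, acc *[4][32]byte) {
		for k := 0; k < 4; k++ {
			copy(out[k][elem*8:], acc[k][:])
		}
	}

After each pass the assembly advances x15 by 4 (four 8-byte elements, i.e. the 32 bytes just produced) and counts down the remaining blocks with subs before branching back.
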
+ MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x4Xor_loop + +mulSve_10x4Xor_end: + RET + +// func mulSve_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x5(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x5_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c5 // mov z5.d, x6 + WORD $0x052120a5 // dup z5.b, z5.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + WORD $0x85804028 // ldr z8, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85804046 // ldr z6, [x2] + WORD $0x85804447 // ldr z7, [x2, #1, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e0 // eor z0.d, z7.d, z6.d + WORD $0x85804846 // ldr z6, [x2, #2, MUL VL] + WORD $0x85804c47 // ldr z7, [x2, #3, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e1 // eor z1.d, z7.d, z6.d + WORD $0x85805046 // ldr z6, [x2, #4, MUL VL] + WORD $0x85805447 // ldr z7, [x2, #5, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e2 // eor z2.d, z7.d, z6.d + WORD $0x85805846 // ldr z6, [x2, #6, MUL VL] + WORD $0x85805c47 // ldr z7, [x2, #7, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e3 // eor z3.d, z7.d, z6.d + WORD $0x85814046 // ldr z6, [x2, #8, MUL VL] + WORD $0x85814447 // ldr z7, [x2, #9, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e4 // eor z4.d, z7.d, z6.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 1 to 5 outputs + WORD $0x85804088 // ldr z8, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85814846 // ldr z6, [x2, #10, MUL VL] + WORD $0x85814c47 // ldr z7, [x2, #11, MUL VL] + WORD $0x052830c6 // tbl z6.b, 
z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85815046 // ldr z6, [x2, #12, MUL VL] + WORD $0x85815447 // ldr z7, [x2, #13, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85815846 // ldr z6, [x2, #14, MUL VL] + WORD $0x85815c47 // ldr z7, [x2, #15, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85824046 // ldr z6, [x2, #16, MUL VL] + WORD $0x85824447 // ldr z7, [x2, #17, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85824846 // ldr z6, [x2, #18, MUL VL] + WORD $0x85824c47 // ldr z7, [x2, #19, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 2 to 5 outputs + WORD $0x858040a8 // ldr z8, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85825046 // ldr z6, [x2, #20, MUL VL] + WORD $0x85825447 // ldr z7, [x2, #21, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85825846 // ldr z6, [x2, #22, MUL VL] + WORD $0x85825c47 // ldr z7, [x2, #23, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85834046 // ldr z6, [x2, #24, MUL VL] + WORD $0x85834447 // ldr z7, [x2, #25, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85834846 // ldr z6, [x2, #26, MUL VL] + WORD $0x85834c47 // ldr z7, [x2, #27, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85835046 // ldr z6, [x2, #28, MUL VL] + WORD $0x85835447 // ldr z7, [x2, #29, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 3 to 5 outputs + WORD $0x85804108 // ldr z8, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85835846 // ldr z6, [x2, #30, MUL VL] + WORD $0x85835c47 // ldr z7, [x2, #31, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85844046 // ldr z6, [x2, #32, MUL VL] + WORD $0x85844447 // ldr 
z7, [x2, #33, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85844846 // ldr z6, [x2, #34, MUL VL] + WORD $0x85844c47 // ldr z7, [x2, #35, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85845046 // ldr z6, [x2, #36, MUL VL] + WORD $0x85845447 // ldr z7, [x2, #37, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85845846 // ldr z6, [x2, #38, MUL VL] + WORD $0x85845c47 // ldr z7, [x2, #39, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 4 to 5 outputs + WORD $0x85804128 // ldr z8, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85854046 // ldr z6, [x2, #40, MUL VL] + WORD $0x85854447 // ldr z7, [x2, #41, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85854846 // ldr z6, [x2, #42, MUL VL] + WORD $0x85854c47 // ldr z7, [x2, #43, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85855046 // ldr z6, [x2, #44, MUL VL] + WORD $0x85855447 // ldr z7, [x2, #45, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85855846 // ldr z6, [x2, #46, MUL VL] + WORD $0x85855c47 // ldr z7, [x2, #47, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85864046 // ldr z6, [x2, #48, MUL VL] + WORD $0x85864447 // ldr z7, [x2, #49, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 5 to 5 outputs + WORD $0x85804148 // ldr z8, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85864846 // ldr z6, [x2, #50, MUL VL] + WORD $0x85864c47 // ldr z7, [x2, #51, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85865046 // ldr z6, [x2, #52, MUL VL] + WORD $0x85865447 // ldr z7, [x2, #53, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD 
$0x85865846 // ldr z6, [x2, #54, MUL VL] + WORD $0x85865c47 // ldr z7, [x2, #55, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85874046 // ldr z6, [x2, #56, MUL VL] + WORD $0x85874447 // ldr z7, [x2, #57, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85874846 // ldr z6, [x2, #58, MUL VL] + WORD $0x85874c47 // ldr z7, [x2, #59, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 6 to 5 outputs + WORD $0x85804168 // ldr z8, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85875046 // ldr z6, [x2, #60, MUL VL] + WORD $0x85875447 // ldr z7, [x2, #61, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85875846 // ldr z6, [x2, #62, MUL VL] + WORD $0x85875c47 // ldr z7, [x2, #63, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85884046 // ldr z6, [x2, #64, MUL VL] + WORD $0x85884447 // ldr z7, [x2, #65, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85884846 // ldr z6, [x2, #66, MUL VL] + WORD $0x85884c47 // ldr z7, [x2, #67, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85885046 // ldr z6, [x2, #68, MUL VL] + WORD $0x85885447 // ldr z7, [x2, #69, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 7 to 5 outputs + WORD $0x85804188 // ldr z8, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85885846 // ldr z6, [x2, #70, MUL VL] + WORD $0x85885c47 // ldr z7, [x2, #71, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85894046 // ldr z6, [x2, #72, MUL VL] + WORD $0x85894447 // ldr z7, [x2, #73, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85894846 // ldr z6, [x2, #74, MUL VL] + WORD $0x85894c47 // ldr z7, [x2, #75, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, 
z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85895046 // ldr z6, [x2, #76, MUL VL] + WORD $0x85895447 // ldr z7, [x2, #77, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85895846 // ldr z6, [x2, #78, MUL VL] + WORD $0x85895c47 // ldr z7, [x2, #79, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 8 to 5 outputs + WORD $0x858041a8 // ldr z8, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x858a4046 // ldr z6, [x2, #80, MUL VL] + WORD $0x858a4447 // ldr z7, [x2, #81, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x858a4846 // ldr z6, [x2, #82, MUL VL] + WORD $0x858a4c47 // ldr z7, [x2, #83, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x858a5046 // ldr z6, [x2, #84, MUL VL] + WORD $0x858a5447 // ldr z7, [x2, #85, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x858a5846 // ldr z6, [x2, #86, MUL VL] + WORD $0x858a5c47 // ldr z7, [x2, #87, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x858b4046 // ldr z6, [x2, #88, MUL VL] + WORD $0x858b4447 // ldr z7, [x2, #89, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 9 to 5 outputs + WORD $0x85804068 // ldr z8, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x858b4846 // ldr z6, [x2, #90, MUL VL] + WORD $0x858b4c47 // ldr z7, [x2, #91, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x858b5046 // ldr z6, [x2, #92, MUL VL] + WORD $0x858b5447 // ldr z7, [x2, #93, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x858b5846 // ldr z6, [x2, #94, MUL VL] + WORD $0x858b5c47 // ldr z7, [x2, #95, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x858c4046 // ldr z6, [x2, #96, MUL VL] + WORD $0x858c4447 // ldr z7, [x2, #97, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 
// tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x858c4846 // ldr z6, [x2, #98, MUL VL] + WORD $0x858c4c47 // ldr z7, [x2, #99, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + +mulSve_10x5_store: + // Store 5 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x5_loop + +mulSve_10x5_end: + RET + +// func mulSve_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x5Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x5Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c5 // mov z5.d, x6 + WORD $0x052120a5 // dup z5.b, z5.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + WORD $0x85804028 // ldr z8, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804046 // ldr z6, [x2] + WORD $0x85804447 // ldr z7, [x2, #1, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804846 // ldr z6, [x2, #2, MUL VL] + WORD $0x85804c47 // ldr z7, [x2, #3, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805046 // ldr z6, [x2, #4, MUL VL] + WORD $0x85805447 // ldr z7, [x2, #5, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d 
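
Each ldr/ldr/tbl/tbl/eor/eor group in these kernels is one GF(2^8) constant multiplication split into two 4-bit lookups: the input byte's low nibble (masked with the 0x0f splat in z5) indexes one 16-entry table, the high nibble (lsr #4, same mask) indexes a second, and both results are XORed into the accumulator. The Xor variants additionally preload each accumulator from the existing output with ld1d before folding the inputs in, and the CMP/BEQ pairs after each input stop early once in_len shards have been consumed. A compact Go model of both flavors, with a hypothetical tables accessor standing in for the real matrix layout (which the assembly indexes in vector-length MUL VL slots):

	// process models one 32-byte pass of mulSve_10x5 (xor=false) and
	// mulSve_10x5Xor (xor=true) over the len(ins) present shards.
	func process(outs, ins [][]byte, off int, xor bool,
		tables func(in, out int) (lo, hi *[16]byte)) {
		for o := range outs {
			acc := outs[o][off : off+32]
			if !xor {
				// the plain variant instead seeds each accumulator
				// from input 0 (eor zN.d, z7.d, z6.d); zeroing first
				// is equivalent
				for j := range acc {
					acc[j] = 0
				}
			}
			for i := range ins {
				lo, hi := tables(i, o)
				for j, b := range ins[i][off : off+32] {
					acc[j] ^= lo[b&0x0f] ^ hi[b>>4] // tbl + tbl + eor + eor
				}
			}
		}
	}
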
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805846 // ldr z6, [x2, #6, MUL VL] + WORD $0x85805c47 // ldr z7, [x2, #7, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814046 // ldr z6, [x2, #8, MUL VL] + WORD $0x85814447 // ldr z7, [x2, #9, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 1 to 5 outputs + WORD $0x85804088 // ldr z8, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85814846 // ldr z6, [x2, #10, MUL VL] + WORD $0x85814c47 // ldr z7, [x2, #11, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85815046 // ldr z6, [x2, #12, MUL VL] + WORD $0x85815447 // ldr z7, [x2, #13, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85815846 // ldr z6, [x2, #14, MUL VL] + WORD $0x85815c47 // ldr z7, [x2, #15, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85824046 // ldr z6, [x2, #16, MUL VL] + WORD $0x85824447 // ldr z7, [x2, #17, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85824846 // ldr z6, [x2, #18, MUL VL] + WORD $0x85824c47 // ldr z7, [x2, #19, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 2 to 5 outputs + WORD $0x858040a8 // ldr z8, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85825046 // ldr z6, [x2, #20, MUL VL] + WORD $0x85825447 // ldr z7, [x2, #21, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85825846 // ldr z6, [x2, #22, MUL VL] + WORD $0x85825c47 // ldr z7, [x2, #23, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85834046 // ldr z6, [x2, #24, MUL VL] + WORD $0x85834447 // ldr z7, [x2, #25, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD 
+ WORD $0x85834846 // ldr z6, [x2, #26, MUL VL]
+ WORD $0x85834c47 // ldr z7, [x2, #27, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x85835046 // ldr z6, [x2, #28, MUL VL]
+ WORD $0x85835447 // ldr z7, [x2, #29, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63084 // eor z4.d, z4.d, z6.d
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ // Check for early termination
+ CMP $3, R16
+ BEQ mulSve_10x5Xor_store
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ WORD $0x85804108 // ldr z8, [x8]
+ WORD $0x91008108 // add x8, x8, #32
+ WORD $0x04fc9509 // lsr z9.d, z8.d, #4
+ WORD $0x04253108 // and z8.d, z8.d, z5.d
+ WORD $0x04253129 // and z9.d, z9.d, z5.d
+ WORD $0x85835846 // ldr z6, [x2, #30, MUL VL]
+ WORD $0x85835c47 // ldr z7, [x2, #31, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x85844046 // ldr z6, [x2, #32, MUL VL]
+ WORD $0x85844447 // ldr z7, [x2, #33, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x85844846 // ldr z6, [x2, #34, MUL VL]
+ WORD $0x85844c47 // ldr z7, [x2, #35, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x85845046 // ldr z6, [x2, #36, MUL VL]
+ WORD $0x85845447 // ldr z7, [x2, #37, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x85845846 // ldr z6, [x2, #38, MUL VL]
+ WORD $0x85845c47 // ldr z7, [x2, #39, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63084 // eor z4.d, z4.d, z6.d
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ // Check for early termination
+ CMP $4, R16
+ BEQ mulSve_10x5Xor_store
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ WORD $0x85804128 // ldr z8, [x9]
+ WORD $0x91008129 // add x9, x9, #32
+ WORD $0x04fc9509 // lsr z9.d, z8.d, #4
+ WORD $0x04253108 // and z8.d, z8.d, z5.d
+ WORD $0x04253129 // and z9.d, z9.d, z5.d
+ WORD $0x85854046 // ldr z6, [x2, #40, MUL VL]
+ WORD $0x85854447 // ldr z7, [x2, #41, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x85854846 // ldr z6, [x2, #42, MUL VL]
+ WORD $0x85854c47 // ldr z7, [x2, #43, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x85855046 // ldr z6, [x2, #44, MUL VL]
+ WORD $0x85855447 // ldr z7, [x2, #45, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x85855846 // ldr z6, [x2, #46, MUL VL]
+ WORD $0x85855c47 // ldr z7, [x2, #47, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x85864046 // ldr z6, [x2, #48, MUL VL]
+ WORD $0x85864447 // ldr z7, [x2, #49, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63084 // eor z4.d, z4.d, z6.d
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ // Check for early termination
+ CMP $5, R16
+ BEQ mulSve_10x5Xor_store
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ WORD $0x85804148 // ldr z8, [x10]
+ WORD $0x9100814a // add x10, x10, #32
+ WORD $0x04fc9509 // lsr z9.d, z8.d, #4
+ WORD $0x04253108 // and z8.d, z8.d, z5.d
+ WORD $0x04253129 // and z9.d, z9.d, z5.d
+ WORD $0x85864846 // ldr z6, [x2, #50, MUL VL]
+ WORD $0x85864c47 // ldr z7, [x2, #51, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x85865046 // ldr z6, [x2, #52, MUL VL]
+ WORD $0x85865447 // ldr z7, [x2, #53, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x85865846 // ldr z6, [x2, #54, MUL VL]
+ WORD $0x85865c47 // ldr z7, [x2, #55, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x85874046 // ldr z6, [x2, #56, MUL VL]
+ WORD $0x85874447 // ldr z7, [x2, #57, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x85874846 // ldr z6, [x2, #58, MUL VL]
+ WORD $0x85874c47 // ldr z7, [x2, #59, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63084 // eor z4.d, z4.d, z6.d
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ // Check for early termination
+ CMP $6, R16
+ BEQ mulSve_10x5Xor_store
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ WORD $0x85804168 // ldr z8, [x11]
+ WORD $0x9100816b // add x11, x11, #32
+ WORD $0x04fc9509 // lsr z9.d, z8.d, #4
+ WORD $0x04253108 // and z8.d, z8.d, z5.d
+ WORD $0x04253129 // and z9.d, z9.d, z5.d
+ WORD $0x85875046 // ldr z6, [x2, #60, MUL VL]
+ WORD $0x85875447 // ldr z7, [x2, #61, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x85875846 // ldr z6, [x2, #62, MUL VL]
+ WORD $0x85875c47 // ldr z7, [x2, #63, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x85884046 // ldr z6, [x2, #64, MUL VL]
+ WORD $0x85884447 // ldr z7, [x2, #65, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x85884846 // ldr z6, [x2, #66, MUL VL]
+ WORD $0x85884c47 // ldr z7, [x2, #67, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x85885046 // ldr z6, [x2, #68, MUL VL]
+ WORD $0x85885447 // ldr z7, [x2, #69, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63084 // eor z4.d, z4.d, z6.d
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ // Check for early termination
+ CMP $7, R16
+ BEQ mulSve_10x5Xor_store
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ WORD $0x85804188 // ldr z8, [x12]
+ WORD $0x9100818c // add x12, x12, #32
+ WORD $0x04fc9509 // lsr z9.d, z8.d, #4
+ WORD $0x04253108 // and z8.d, z8.d, z5.d
+ WORD $0x04253129 // and z9.d, z9.d, z5.d
+ WORD $0x85885846 // ldr z6, [x2, #70, MUL VL]
+ WORD $0x85885c47 // ldr z7, [x2, #71, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x85894046 // ldr z6, [x2, #72, MUL VL]
+ WORD $0x85894447 // ldr z7, [x2, #73, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x85894846 // ldr z6, [x2, #74, MUL VL]
+ WORD $0x85894c47 // ldr z7, [x2, #75, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x85895046 // ldr z6, [x2, #76, MUL VL]
+ WORD $0x85895447 // ldr z7, [x2, #77, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x85895846 // ldr z6, [x2, #78, MUL VL]
+ WORD $0x85895c47 // ldr z7, [x2, #79, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63084 // eor z4.d, z4.d, z6.d
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ // Check for early termination
+ CMP $8, R16
+ BEQ mulSve_10x5Xor_store
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ WORD $0x858041a8 // ldr z8, [x13]
+ WORD $0x910081ad // add x13, x13, #32
+ WORD $0x04fc9509 // lsr z9.d, z8.d, #4
+ WORD $0x04253108 // and z8.d, z8.d, z5.d
+ WORD $0x04253129 // and z9.d, z9.d, z5.d
+ WORD $0x858a4046 // ldr z6, [x2, #80, MUL VL]
+ WORD $0x858a4447 // ldr z7, [x2, #81, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x858a4846 // ldr z6, [x2, #82, MUL VL]
+ WORD $0x858a4c47 // ldr z7, [x2, #83, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x858a5046 // ldr z6, [x2, #84, MUL VL]
+ WORD $0x858a5447 // ldr z7, [x2, #85, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x858a5846 // ldr z6, [x2, #86, MUL VL]
+ WORD $0x858a5c47 // ldr z7, [x2, #87, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x858b4046 // ldr z6, [x2, #88, MUL VL]
+ WORD $0x858b4447 // ldr z7, [x2, #89, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63084 // eor z4.d, z4.d, z6.d
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ // Check for early termination
+ CMP $9, R16
+ BEQ mulSve_10x5Xor_store
+
+ // Load and process 32 bytes from input 9 to 5 outputs
+ WORD $0x85804068 // ldr z8, [x3]
+ WORD $0x91008063 // add x3, x3, #32
+ WORD $0x04fc9509 // lsr z9.d, z8.d, #4
+ WORD $0x04253108 // and z8.d, z8.d, z5.d
+ WORD $0x04253129 // and z9.d, z9.d, z5.d
+ WORD $0x858b4846 // ldr z6, [x2, #90, MUL VL]
+ WORD $0x858b4c47 // ldr z7, [x2, #91, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x858b5046 // ldr z6, [x2, #92, MUL VL]
+ WORD $0x858b5447 // ldr z7, [x2, #93, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x858b5846 // ldr z6, [x2, #94, MUL VL]
+ WORD $0x858b5c47 // ldr z7, [x2, #95, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x858c4046 // ldr z6, [x2, #96, MUL VL]
+ WORD $0x858c4447 // ldr z7, [x2, #97, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x858c4846 // ldr z6, [x2, #98, MUL VL]
+ WORD $0x858c4c47 // ldr z7, [x2, #99, MUL VL]
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x04a63084 // eor z4.d, z4.d, z6.d
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+
+mulSve_10x5Xor_store:
+ // Store 5 outputs
+ MOVD (R14), R6
+ WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3]
+ MOVD 24(R14), R6
+ WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3]
+ MOVD 48(R14), R6
+ WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3]
+ MOVD 72(R14), R6
+ WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3]
+ MOVD 96(R14), R6
+ WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3]
+
+ // Prepare for next loop
+ WORD $0x910011ef // add x15, x15, #4
+ WORD $0xf1000400 // subs x0, x0, #1
+ BNE mulSve_10x5Xor_loop
+
+mulSve_10x5Xor_end:
+ RET
+
+// func mulSve_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
+TEXT ·mulSve_10x6(SB), NOSPLIT, $8-88
+ WORD $0x25d8e3e0 // ptrue p0.d
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 131 YMM used
+ MOVD n+80(FP), R0
+ MOVD matrix_base+0(FP), R2
+ WORD $0xd345fc00 // lsr x0, x0, #5
+ WORD $0xea00001f // tst x0, x0
+ BEQ mulSve_10x6_end
+ MOVD in_base+24(FP), R3
+ MOVD (R3), R1
+ MOVD 24(R3), R4
+ MOVD 48(R3), R5
+ MOVD 72(R3), R8
+ MOVD 96(R3), R9
+ MOVD 120(R3), R10
+ MOVD 144(R3), R11
+ MOVD 168(R3), R12
+ MOVD 192(R3), R13
+ MOVD 216(R3), R3
+ MOVD out_base+48(FP), R14
+ MOVD start+72(FP), R15
+
+ // Add start offset to input
+ WORD $0x8b0f0021 // add x1, x1, x15
+ WORD $0x8b0f0084 // add x4, x4, x15
+ WORD $0x8b0f00a5 // add x5, x5, x15
+ WORD $0x8b0f0108 // add x8, x8, x15
+ WORD $0x8b0f0129 // add x9, x9, x15
+ WORD $0x8b0f014a // add x10, x10, x15
+ WORD $0x8b0f016b // add x11, x11, x15
+ WORD $0x8b0f018c // add x12, x12, x15
+ WORD $0x8b0f01ad // add x13, x13, x15
+ WORD $0x8b0f0063 // add x3, x3, x15
+ WORD $0xd343fdef // lsr x15, x15, #3
+ WORD $0xd28001e6 // mov x6, #15
+ WORD $0x05e038c6 // mov z6.d, x6
+ WORD $0x052120c6 // dup z6.b, z6.b[0]
+
+ // Load number of input shards
+ MOVD in_len+32(FP), R16
+
+mulSve_10x6_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ WORD $0x85804029 // ldr z9, [x1]
+ WORD $0x91008021 // add x1, x1, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x85804047 // ldr z7, [x2]
+ WORD $0x85804448 // ldr z8, [x2, #1, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73100 // eor z0.d, z8.d, z7.d
+ WORD $0x85804847 // ldr z7, [x2, #2, MUL VL]
+ WORD $0x85804c48 // ldr z8, [x2, #3, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73101 // eor z1.d, z8.d, z7.d
+ WORD $0x85805047 // ldr z7, [x2, #4, MUL VL]
+ WORD $0x85805448 // ldr z8, [x2, #5, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73102 // eor z2.d, z8.d, z7.d
+ WORD $0x85805847 // ldr z7, [x2, #6, MUL VL]
+ WORD $0x85805c48 // ldr z8, [x2, #7, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73103 // eor z3.d, z8.d, z7.d
+ WORD $0x85814047 // ldr z7, [x2, #8, MUL VL]
+ WORD $0x85814448 // ldr z8, [x2, #9, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73104 // eor z4.d, z8.d, z7.d
+ WORD $0x85814847 // ldr z7, [x2, #10, MUL VL]
+ WORD $0x85814c48 // ldr z8, [x2, #11, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73105 // eor z5.d, z8.d, z7.d
+ // Check for early termination
+ CMP $1, R16
+ BEQ mulSve_10x6_store
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ WORD $0x85804089 // ldr z9, [x4]
+ WORD $0x91008084 // add x4, x4, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x85815047 // ldr z7, [x2, #12, MUL VL]
+ WORD $0x85815448 // ldr z8, [x2, #13, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x85815847 // ldr z7, [x2, #14, MUL VL]
+ WORD $0x85815c48 // ldr z8, [x2, #15, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x85824047 // ldr z7, [x2, #16, MUL VL]
+ WORD $0x85824448 // ldr z8, [x2, #17, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x85824847 // ldr z7, [x2, #18, MUL VL]
+ WORD $0x85824c48 // ldr z8, [x2, #19, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x85825047 // ldr z7, [x2, #20, MUL VL]
+ WORD $0x85825448 // ldr z8, [x2, #21, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x85825847 // ldr z7, [x2, #22, MUL VL]
+ WORD $0x85825c48 // ldr z8, [x2, #23, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $2, R16
+ BEQ mulSve_10x6_store
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ WORD $0x858040a9 // ldr z9, [x5]
+ WORD $0x910080a5 // add x5, x5, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x85834047 // ldr z7, [x2, #24, MUL VL]
+ WORD $0x85834448 // ldr z8, [x2, #25, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x85834847 // ldr z7, [x2, #26, MUL VL]
+ WORD $0x85834c48 // ldr z8, [x2, #27, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x85835047 // ldr z7, [x2, #28, MUL VL]
+ WORD $0x85835448 // ldr z8, [x2, #29, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x85835847 // ldr z7, [x2, #30, MUL VL]
+ WORD $0x85835c48 // ldr z8, [x2, #31, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x85844047 // ldr z7, [x2, #32, MUL VL]
+ WORD $0x85844448 // ldr z8, [x2, #33, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x85844847 // ldr z7, [x2, #34, MUL VL]
+ WORD $0x85844c48 // ldr z8, [x2, #35, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $3, R16
+ BEQ mulSve_10x6_store
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ WORD $0x85804109 // ldr z9, [x8]
+ WORD $0x91008108 // add x8, x8, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x85845047 // ldr z7, [x2, #36, MUL VL]
+ WORD $0x85845448 // ldr z8, [x2, #37, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x85845847 // ldr z7, [x2, #38, MUL VL]
+ WORD $0x85845c48 // ldr z8, [x2, #39, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x85854047 // ldr z7, [x2, #40, MUL VL]
+ WORD $0x85854448 // ldr z8, [x2, #41, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x85854847 // ldr z7, [x2, #42, MUL VL]
+ WORD $0x85854c48 // ldr z8, [x2, #43, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x85855047 // ldr z7, [x2, #44, MUL VL]
+ WORD $0x85855448 // ldr z8, [x2, #45, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x85855847 // ldr z7, [x2, #46, MUL VL]
+ WORD $0x85855c48 // ldr z8, [x2, #47, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $4, R16
+ BEQ mulSve_10x6_store
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ WORD $0x85804129 // ldr z9, [x9]
+ WORD $0x91008129 // add x9, x9, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x85864047 // ldr z7, [x2, #48, MUL VL]
+ WORD $0x85864448 // ldr z8, [x2, #49, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x85864847 // ldr z7, [x2, #50, MUL VL]
+ WORD $0x85864c48 // ldr z8, [x2, #51, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x85865047 // ldr z7, [x2, #52, MUL VL]
+ WORD $0x85865448 // ldr z8, [x2, #53, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x85865847 // ldr z7, [x2, #54, MUL VL]
+ WORD $0x85865c48 // ldr z8, [x2, #55, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x85874047 // ldr z7, [x2, #56, MUL VL]
+ WORD $0x85874448 // ldr z8, [x2, #57, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x85874847 // ldr z7, [x2, #58, MUL VL]
+ WORD $0x85874c48 // ldr z8, [x2, #59, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $5, R16
+ BEQ mulSve_10x6_store
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ WORD $0x85804149 // ldr z9, [x10]
+ WORD $0x9100814a // add x10, x10, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x85875047 // ldr z7, [x2, #60, MUL VL]
+ WORD $0x85875448 // ldr z8, [x2, #61, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x85875847 // ldr z7, [x2, #62, MUL VL]
+ WORD $0x85875c48 // ldr z8, [x2, #63, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x85884047 // ldr z7, [x2, #64, MUL VL]
+ WORD $0x85884448 // ldr z8, [x2, #65, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x85884847 // ldr z7, [x2, #66, MUL VL]
+ WORD $0x85884c48 // ldr z8, [x2, #67, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x85885047 // ldr z7, [x2, #68, MUL VL]
+ WORD $0x85885448 // ldr z8, [x2, #69, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x85885847 // ldr z7, [x2, #70, MUL VL]
+ WORD $0x85885c48 // ldr z8, [x2, #71, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $6, R16
+ BEQ mulSve_10x6_store
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ WORD $0x85804169 // ldr z9, [x11]
+ WORD $0x9100816b // add x11, x11, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x85894047 // ldr z7, [x2, #72, MUL VL]
+ WORD $0x85894448 // ldr z8, [x2, #73, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x85894847 // ldr z7, [x2, #74, MUL VL]
+ WORD $0x85894c48 // ldr z8, [x2, #75, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x85895047 // ldr z7, [x2, #76, MUL VL]
+ WORD $0x85895448 // ldr z8, [x2, #77, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x85895847 // ldr z7, [x2, #78, MUL VL]
+ WORD $0x85895c48 // ldr z8, [x2, #79, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x858a4047 // ldr z7, [x2, #80, MUL VL]
+ WORD $0x858a4448 // ldr z8, [x2, #81, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x858a4847 // ldr z7, [x2, #82, MUL VL]
+ WORD $0x858a4c48 // ldr z8, [x2, #83, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $7, R16
+ BEQ mulSve_10x6_store
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ WORD $0x85804189 // ldr z9, [x12]
+ WORD $0x9100818c // add x12, x12, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x858a5047 // ldr z7, [x2, #84, MUL VL]
+ WORD $0x858a5448 // ldr z8, [x2, #85, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x858a5847 // ldr z7, [x2, #86, MUL VL]
+ WORD $0x858a5c48 // ldr z8, [x2, #87, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x858b4047 // ldr z7, [x2, #88, MUL VL]
+ WORD $0x858b4448 // ldr z8, [x2, #89, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x858b4847 // ldr z7, [x2, #90, MUL VL]
+ WORD $0x858b4c48 // ldr z8, [x2, #91, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x858b5047 // ldr z7, [x2, #92, MUL VL]
+ WORD $0x858b5448 // ldr z8, [x2, #93, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x858b5847 // ldr z7, [x2, #94, MUL VL]
+ WORD $0x858b5c48 // ldr z8, [x2, #95, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $8, R16
+ BEQ mulSve_10x6_store
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ WORD $0x858041a9 // ldr z9, [x13]
+ WORD $0x910081ad // add x13, x13, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x858c4047 // ldr z7, [x2, #96, MUL VL]
+ WORD $0x858c4448 // ldr z8, [x2, #97, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x858c4847 // ldr z7, [x2, #98, MUL VL]
+ WORD $0x858c4c48 // ldr z8, [x2, #99, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x858c5047 // ldr z7, [x2, #100, MUL VL]
+ WORD $0x858c5448 // ldr z8, [x2, #101, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x858c5847 // ldr z7, [x2, #102, MUL VL]
+ WORD $0x858c5c48 // ldr z8, [x2, #103, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x858d4047 // ldr z7, [x2, #104, MUL VL]
+ WORD $0x858d4448 // ldr z8, [x2, #105, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x858d4847 // ldr z7, [x2, #106, MUL VL]
+ WORD $0x858d4c48 // ldr z8, [x2, #107, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $9, R16
+ BEQ mulSve_10x6_store
+
+ // Load and process 32 bytes from input 9 to 6 outputs
+ WORD $0x85804069 // ldr z9, [x3]
+ WORD $0x91008063 // add x3, x3, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x858d5047 // ldr z7, [x2, #108, MUL VL]
+ WORD $0x858d5448 // ldr z8, [x2, #109, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x858d5847 // ldr z7, [x2, #110, MUL VL]
+ WORD $0x858d5c48 // ldr z8, [x2, #111, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x858e4047 // ldr z7, [x2, #112, MUL VL]
+ WORD $0x858e4448 // ldr z8, [x2, #113, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x858e4847 // ldr z7, [x2, #114, MUL VL]
+ WORD $0x858e4c48 // ldr z8, [x2, #115, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x858e5047 // ldr z7, [x2, #116, MUL VL]
+ WORD $0x858e5448 // ldr z8, [x2, #117, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x858e5847 // ldr z7, [x2, #118, MUL VL]
+ WORD $0x858e5c48 // ldr z8, [x2, #119, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+
+mulSve_10x6_store:
+ // Store 6 outputs
+ MOVD (R14), R6
+ WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3]
+ MOVD 24(R14), R6
+ WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3]
+ MOVD 48(R14), R6
+ WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3]
+ MOVD 72(R14), R6
+ WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3]
+ MOVD 96(R14), R6
+ WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3]
+ MOVD 120(R14), R6
+ WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3]
+
+ // Prepare for next loop
+ WORD $0x910011ef // add x15, x15, #4
+ WORD $0xf1000400 // subs x0, x0, #1
+ BNE mulSve_10x6_loop
+
+mulSve_10x6_end:
+ RET
+
+// func mulSve_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
+TEXT ·mulSve_10x6Xor(SB), NOSPLIT, $8-88
+ WORD $0x25d8e3e0 // ptrue p0.d
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 131 YMM used
+ MOVD n+80(FP), R0
+ MOVD matrix_base+0(FP), R2
+ WORD $0xd345fc00 // lsr x0, x0, #5
+ WORD $0xea00001f // tst x0, x0
+ BEQ mulSve_10x6Xor_end
+ MOVD in_base+24(FP), R3
+ MOVD (R3), R1
+ MOVD 24(R3), R4
+ MOVD 48(R3), R5
+ MOVD 72(R3), R8
+ MOVD 96(R3), R9
+ MOVD 120(R3), R10
+ MOVD 144(R3), R11
+ MOVD 168(R3), R12
+ MOVD 192(R3), R13
+ MOVD 216(R3), R3
+ MOVD out_base+48(FP), R14
+ MOVD start+72(FP), R15
+
+ // Add start offset to input
+ WORD $0x8b0f0021 // add x1, x1, x15
+ WORD $0x8b0f0084 // add x4, x4, x15
+ WORD $0x8b0f00a5 // add x5, x5, x15
+ WORD $0x8b0f0108 // add x8, x8, x15
+ WORD $0x8b0f0129 // add x9, x9, x15
+ WORD $0x8b0f014a // add x10, x10, x15
+ WORD $0x8b0f016b // add x11, x11, x15
+ WORD $0x8b0f018c // add x12, x12, x15
+ WORD $0x8b0f01ad // add x13, x13, x15
+ WORD $0x8b0f0063 // add x3, x3, x15
+ WORD $0xd343fdef // lsr x15, x15, #3
+ WORD $0xd28001e6 // mov x6, #15
+ WORD $0x05e038c6 // mov z6.d, x6
+ WORD $0x052120c6 // dup z6.b, z6.b[0]
+
+ // Load number of input shards
+ MOVD in_len+32(FP), R16
+
+mulSve_10x6Xor_loop:
+ // Load and process 32 bytes from input 0 to 6 outputs
+ WORD $0x85804029 // ldr z9, [x1]
+ WORD $0x91008021 // add x1, x1, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ MOVD (R14), R6
+ WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85804047 // ldr z7, [x2]
+ WORD $0x85804448 // ldr z8, [x2, #1, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ MOVD 24(R14), R6
+ WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85804847 // ldr z7, [x2, #2, MUL VL]
+ WORD $0x85804c48 // ldr z8, [x2, #3, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ MOVD 48(R14), R6
+ WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85805047 // ldr z7, [x2, #4, MUL VL]
+ WORD $0x85805448 // ldr z8, [x2, #5, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ MOVD 72(R14), R6
+ WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85805847 // ldr z7, [x2, #6, MUL VL]
+ WORD $0x85805c48 // ldr z8, [x2, #7, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ MOVD 96(R14), R6
+ WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85814047 // ldr z7, [x2, #8, MUL VL]
+ WORD $0x85814448 // ldr z8, [x2, #9, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ MOVD 120(R14), R6
+ WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85814847 // ldr z7, [x2, #10, MUL VL]
+ WORD $0x85814c48 // ldr z8, [x2, #11, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $1, R16
+ BEQ mulSve_10x6Xor_store
+
+ // Load and process 32 bytes from input 1 to 6 outputs
+ WORD $0x85804089 // ldr z9, [x4]
+ WORD $0x91008084 // add x4, x4, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x85815047 // ldr z7, [x2, #12, MUL VL]
+ WORD $0x85815448 // ldr z8, [x2, #13, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x85815847 // ldr z7, [x2, #14, MUL VL]
+ WORD $0x85815c48 // ldr z8, [x2, #15, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x85824047 // ldr z7, [x2, #16, MUL VL]
+ WORD $0x85824448 // ldr z8, [x2, #17, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x85824847 // ldr z7, [x2, #18, MUL VL]
+ WORD $0x85824c48 // ldr z8, [x2, #19, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x85825047 // ldr z7, [x2, #20, MUL VL]
+ WORD $0x85825448 // ldr z8, [x2, #21, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x85825847 // ldr z7, [x2, #22, MUL VL]
+ WORD $0x85825c48 // ldr z8, [x2, #23, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $2, R16
+ BEQ mulSve_10x6Xor_store
+
+ // Load and process 32 bytes from input 2 to 6 outputs
+ WORD $0x858040a9 // ldr z9, [x5]
+ WORD $0x910080a5 // add x5, x5, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x85834047 // ldr z7, [x2, #24, MUL VL]
+ WORD $0x85834448 // ldr z8, [x2, #25, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x85834847 // ldr z7, [x2, #26, MUL VL]
+ WORD $0x85834c48 // ldr z8, [x2, #27, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x85835047 // ldr z7, [x2, #28, MUL VL]
+ WORD $0x85835448 // ldr z8, [x2, #29, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x85835847 // ldr z7, [x2, #30, MUL VL]
+ WORD $0x85835c48 // ldr z8, [x2, #31, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x85844047 // ldr z7, [x2, #32, MUL VL]
+ WORD $0x85844448 // ldr z8, [x2, #33, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x85844847 // ldr z7, [x2, #34, MUL VL]
+ WORD $0x85844c48 // ldr z8, [x2, #35, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $3, R16
+ BEQ mulSve_10x6Xor_store
+
+ // Load and process 32 bytes from input 3 to 6 outputs
+ WORD $0x85804109 // ldr z9, [x8]
+ WORD $0x91008108 // add x8, x8, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x85845047 // ldr z7, [x2, #36, MUL VL]
+ WORD $0x85845448 // ldr z8, [x2, #37, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x85845847 // ldr z7, [x2, #38, MUL VL]
+ WORD $0x85845c48 // ldr z8, [x2, #39, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x85854047 // ldr z7, [x2, #40, MUL VL]
+ WORD $0x85854448 // ldr z8, [x2, #41, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x85854847 // ldr z7, [x2, #42, MUL VL]
+ WORD $0x85854c48 // ldr z8, [x2, #43, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x85855047 // ldr z7, [x2, #44, MUL VL]
+ WORD $0x85855448 // ldr z8, [x2, #45, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x85855847 // ldr z7, [x2, #46, MUL VL]
+ WORD $0x85855c48 // ldr z8, [x2, #47, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $4, R16
+ BEQ mulSve_10x6Xor_store
+
+ // Load and process 32 bytes from input 4 to 6 outputs
+ WORD $0x85804129 // ldr z9, [x9]
+ WORD $0x91008129 // add x9, x9, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x85864047 // ldr z7, [x2, #48, MUL VL]
+ WORD $0x85864448 // ldr z8, [x2, #49, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x85864847 // ldr z7, [x2, #50, MUL VL]
+ WORD $0x85864c48 // ldr z8, [x2, #51, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x85865047 // ldr z7, [x2, #52, MUL VL]
+ WORD $0x85865448 // ldr z8, [x2, #53, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x85865847 // ldr z7, [x2, #54, MUL VL]
+ WORD $0x85865c48 // ldr z8, [x2, #55, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x85874047 // ldr z7, [x2, #56, MUL VL]
+ WORD $0x85874448 // ldr z8, [x2, #57, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x85874847 // ldr z7, [x2, #58, MUL VL]
+ WORD $0x85874c48 // ldr z8, [x2, #59, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $5, R16
+ BEQ mulSve_10x6Xor_store
+
+ // Load and process 32 bytes from input 5 to 6 outputs
+ WORD $0x85804149 // ldr z9, [x10]
+ WORD $0x9100814a // add x10, x10, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x85875047 // ldr z7, [x2, #60, MUL VL]
+ WORD $0x85875448 // ldr z8, [x2, #61, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x85875847 // ldr z7, [x2, #62, MUL VL]
+ WORD $0x85875c48 // ldr z8, [x2, #63, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x85884047 // ldr z7, [x2, #64, MUL VL]
+ WORD $0x85884448 // ldr z8, [x2, #65, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x85884847 // ldr z7, [x2, #66, MUL VL]
+ WORD $0x85884c48 // ldr z8, [x2, #67, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x85885047 // ldr z7, [x2, #68, MUL VL]
+ WORD $0x85885448 // ldr z8, [x2, #69, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x85885847 // ldr z7, [x2, #70, MUL VL]
+ WORD $0x85885c48 // ldr z8, [x2, #71, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $6, R16
+ BEQ mulSve_10x6Xor_store
+
+ // Load and process 32 bytes from input 6 to 6 outputs
+ WORD $0x85804169 // ldr z9, [x11]
+ WORD $0x9100816b // add x11, x11, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x85894047 // ldr z7, [x2, #72, MUL VL]
+ WORD $0x85894448 // ldr z8, [x2, #73, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x85894847 // ldr z7, [x2, #74, MUL VL]
+ WORD $0x85894c48 // ldr z8, [x2, #75, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x85895047 // ldr z7, [x2, #76, MUL VL]
+ WORD $0x85895448 // ldr z8, [x2, #77, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x85895847 // ldr z7, [x2, #78, MUL VL]
+ WORD $0x85895c48 // ldr z8, [x2, #79, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x858a4047 // ldr z7, [x2, #80, MUL VL]
+ WORD $0x858a4448 // ldr z8, [x2, #81, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x858a4847 // ldr z7, [x2, #82, MUL VL]
+ WORD $0x858a4c48 // ldr z8, [x2, #83, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $7, R16
+ BEQ mulSve_10x6Xor_store
+
+ // Load and process 32 bytes from input 7 to 6 outputs
+ WORD $0x85804189 // ldr z9, [x12]
+ WORD $0x9100818c // add x12, x12, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x858a5047 // ldr z7, [x2, #84, MUL VL]
+ WORD $0x858a5448 // ldr z8, [x2, #85, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x858a5847 // ldr z7, [x2, #86, MUL VL]
+ WORD $0x858a5c48 // ldr z8, [x2, #87, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x858b4047 // ldr z7, [x2, #88, MUL VL]
+ WORD $0x858b4448 // ldr z8, [x2, #89, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x858b4847 // ldr z7, [x2, #90, MUL VL]
+ WORD $0x858b4c48 // ldr z8, [x2, #91, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x858b5047 // ldr z7, [x2, #92, MUL VL]
+ WORD $0x858b5448 // ldr z8, [x2, #93, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x858b5847 // ldr z7, [x2, #94, MUL VL]
+ WORD $0x858b5c48 // ldr z8, [x2, #95, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $8, R16
+ BEQ mulSve_10x6Xor_store
+
+ // Load and process 32 bytes from input 8 to 6 outputs
+ WORD $0x858041a9 // ldr z9, [x13]
+ WORD $0x910081ad // add x13, x13, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x858c4047 // ldr z7, [x2, #96, MUL VL]
+ WORD $0x858c4448 // ldr z8, [x2, #97, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x858c4847 // ldr z7, [x2, #98, MUL VL]
+ WORD $0x858c4c48 // ldr z8, [x2, #99, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x858c5047 // ldr z7, [x2, #100, MUL VL]
+ WORD $0x858c5448 // ldr z8, [x2, #101, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x858c5847 // ldr z7, [x2, #102, MUL VL]
+ WORD $0x858c5c48 // ldr z8, [x2, #103, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x858d4047 // ldr z7, [x2, #104, MUL VL]
+ WORD $0x858d4448 // ldr z8, [x2, #105, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x858d4847 // ldr z7, [x2, #106, MUL VL]
+ WORD $0x858d4c48 // ldr z8, [x2, #107, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+ // Check for early termination
+ CMP $9, R16
+ BEQ mulSve_10x6Xor_store
+
+ // Load and process 32 bytes from input 9 to 6 outputs
+ WORD $0x85804069 // ldr z9, [x3]
+ WORD $0x91008063 // add x3, x3, #32
+ WORD $0x04fc952a // lsr z10.d, z9.d, #4
+ WORD $0x04263129 // and z9.d, z9.d, z6.d
+ WORD $0x0426314a // and z10.d, z10.d, z6.d
+ WORD $0x858d5047 // ldr z7, [x2, #108, MUL VL]
+ WORD $0x858d5448 // ldr z8, [x2, #109, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x858d5847 // ldr z7, [x2, #110, MUL VL]
+ WORD $0x858d5c48 // ldr z8, [x2, #111, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73021 // eor z1.d, z1.d, z7.d
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x858e4047 // ldr z7, [x2, #112, MUL VL]
+ WORD $0x858e4448 // ldr z8, [x2, #113, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x858e4847 // ldr z7, [x2, #114, MUL VL]
+ WORD $0x858e4c48 // ldr z8, [x2, #115, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73063 // eor z3.d, z3.d, z7.d
+ WORD $0x04a83063 // eor z3.d, z3.d, z8.d
+ WORD $0x858e5047 // ldr z7, [x2, #116, MUL VL]
+ WORD $0x858e5448 // ldr z8, [x2, #117, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x858e5847 // ldr z7, [x2, #118, MUL VL]
+ WORD $0x858e5c48 // ldr z8, [x2, #119, MUL VL]
+ WORD $0x052930e7 // tbl z7.b, z7.b, z9.b
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x04a730a5 // eor z5.d, z5.d, z7.d
+ WORD $0x04a830a5 // eor z5.d, z5.d, z8.d
+
+mulSve_10x6Xor_store:
+ // Store 6 outputs
+ MOVD (R14), R6
+ WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3]
+ MOVD 24(R14), R6
+ WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3]
+ MOVD 48(R14), R6
+ WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3]
+ MOVD 72(R14), R6
+ WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3]
+ MOVD 96(R14), R6
+ WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3]
+ MOVD 120(R14), R6
+ WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3]
+
+ // Prepare for next loop
+ WORD $0x910011ef // add x15, x15, #4
+ WORD $0xf1000400 // subs x0, x0, #1
+ BNE mulSve_10x6Xor_loop
+
+mulSve_10x6Xor_end:
+ RET
+
+// func mulSve_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
+TEXT ·mulSve_10x7(SB), NOSPLIT, $8-88
+ WORD $0x25d8e3e0 // ptrue p0.d
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 152 YMM used
+ MOVD n+80(FP), R0
+ MOVD matrix_base+0(FP), R2
+ WORD $0xd345fc00 // lsr x0, x0, #5
+ WORD $0xea00001f // tst x0, x0
+ BEQ mulSve_10x7_end
+ MOVD in_base+24(FP), R3
+ MOVD (R3), R1
+ MOVD 24(R3), R4
+ MOVD 48(R3), R5
+ MOVD 72(R3), R8
+ MOVD 96(R3), R9
+ MOVD 120(R3), R10
+ MOVD 144(R3), R11
+ MOVD 168(R3), R12
+ MOVD 192(R3), R13
+ MOVD 216(R3), R3
+ MOVD out_base+48(FP), R14
+ MOVD start+72(FP), R15
+
+ // Add start offset to input
+ WORD $0x8b0f0021 // add x1, x1, x15
+ WORD $0x8b0f0084 // add x4, x4, x15
+ WORD $0x8b0f00a5 // add x5, x5, x15
+ WORD $0x8b0f0108 // add x8, x8, x15
+ WORD $0x8b0f0129 // add x9, x9, x15
+ WORD $0x8b0f014a // add x10, x10, x15
+ WORD $0x8b0f016b // add x11, x11, x15
+ WORD $0x8b0f018c // add x12, x12, x15
+ WORD $0x8b0f01ad // add x13, x13, x15
+ WORD $0x8b0f0063 // add x3, x3, x15
+ WORD $0xd343fdef // lsr x15, x15, #3
+ WORD $0xd28001e6 // mov x6, #15
+ WORD $0x05e038c7 // mov z7.d, x6
+ WORD $0x052120e7 // dup z7.b, z7.b[0]
+
+ // Load number of input shards
+ MOVD in_len+32(FP), R16
+
+mulSve_10x7_loop:
+ // Load and process 32 bytes from input 0 to 7 outputs
+ WORD $0x8580402a // ldr z10, [x1]
+ WORD $0x91008021 // add x1, x1, #32
+ WORD $0x04fc954b // lsr z11.d, z10.d, #4
+ WORD $0x0427314a // and z10.d, z10.d, z7.d
+ WORD $0x0427316b // and z11.d, z11.d, z7.d
+ WORD $0x85804048 // ldr z8, [x2]
+ WORD $0x85804449 // ldr z9, [x2, #1, MUL VL]
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x04a83120 // eor z0.d, z9.d, z8.d
+ WORD $0x85804848 // ldr z8, [x2, #2, MUL VL]
+ WORD $0x85804c49 // ldr z9, [x2, #3, MUL VL]
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x04a83121 // eor z1.d, z9.d, z8.d
+ WORD $0x85805048 // ldr z8, [x2, #4, MUL VL]
+ WORD $0x85805449 // ldr z9, [x2, #5, MUL VL]
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x04a83122 // eor z2.d, z9.d, z8.d
+ WORD $0x85805848 // ldr z8, [x2, #6, MUL VL]
+ WORD $0x85805c49 // ldr z9, [x2, #7, MUL VL]
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x04a83123 // eor z3.d, z9.d, z8.d
+ WORD $0x85814048 // ldr z8, [x2, #8, MUL VL]
+ WORD $0x85814449 // ldr z9, [x2, #9, MUL VL]
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x04a83124 // eor z4.d, z9.d, z8.d
+ WORD $0x85814848 // ldr z8, [x2, #10, MUL VL]
+ WORD $0x85814c49 // ldr z9, [x2, #11, MUL VL]
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x04a83125 // eor z5.d, z9.d, z8.d
+ WORD $0x85815048 // ldr z8, [x2, #12, MUL VL]
+ WORD $0x85815449 // ldr z9, [x2, #13, MUL VL]
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x04a83126 // eor z6.d, z9.d, z8.d
+ // Check for early termination
+ CMP $1, R16
+ BEQ mulSve_10x7_store
+
+ // Load and process 32 bytes from input 1 to 7 outputs
+ WORD $0x8580408a // ldr z10, [x4]
+ WORD $0x91008084 // add x4, x4, #32
+ WORD $0x04fc954b // lsr z11.d, z10.d, #4
+ WORD $0x0427314a // and z10.d, z10.d, z7.d
+ WORD $0x0427316b // and z11.d, z11.d, z7.d
+ WORD $0x85815848 // ldr z8, [x2, #14, MUL VL]
+ WORD $0x85815c49 // ldr z9, [x2, #15, MUL VL]
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x85824048 // ldr z8, [x2, #16, MUL VL]
+ WORD $0x85824449 // ldr z9, [x2, #17, MUL VL]
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x04a83021 // eor z1.d, z1.d, z8.d
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x85824848 // ldr z8, [x2, #18, MUL VL]
+ WORD $0x85824c49 // ldr z9, [x2, #19, MUL VL]
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x85825048 // ldr z8, [x2, #20, MUL VL]
+ WORD $0x85825449 // ldr z9, [x2, #21, MUL VL]
z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85825848 // ldr z8, [x2, #22, MUL VL] + WORD $0x85825c49 // ldr z9, [x2, #23, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85834048 // ldr z8, [x2, #24, MUL VL] + WORD $0x85834449 // ldr z9, [x2, #25, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85834848 // ldr z8, [x2, #26, MUL VL] + WORD $0x85834c49 // ldr z9, [x2, #27, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 2 to 7 outputs + WORD $0x858040aa // ldr z10, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85835048 // ldr z8, [x2, #28, MUL VL] + WORD $0x85835449 // ldr z9, [x2, #29, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85835848 // ldr z8, [x2, #30, MUL VL] + WORD $0x85835c49 // ldr z9, [x2, #31, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85844048 // ldr z8, [x2, #32, MUL VL] + WORD $0x85844449 // ldr z9, [x2, #33, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85844848 // ldr z8, [x2, #34, MUL VL] + WORD $0x85844c49 // ldr z9, [x2, #35, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85845048 // ldr z8, [x2, #36, MUL VL] + WORD $0x85845449 // ldr z9, [x2, #37, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85845848 // ldr z8, [x2, #38, MUL VL] + WORD $0x85845c49 // ldr z9, [x2, #39, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85854048 // ldr z8, [x2, #40, MUL VL] + WORD $0x85854449 // ldr z9, [x2, #41, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 3 to 7 outputs + WORD $0x8580410a // ldr z10, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85854848 // ldr z8, [x2, #42, MUL 
VL] + WORD $0x85854c49 // ldr z9, [x2, #43, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85855048 // ldr z8, [x2, #44, MUL VL] + WORD $0x85855449 // ldr z9, [x2, #45, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85855848 // ldr z8, [x2, #46, MUL VL] + WORD $0x85855c49 // ldr z9, [x2, #47, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85864048 // ldr z8, [x2, #48, MUL VL] + WORD $0x85864449 // ldr z9, [x2, #49, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85864848 // ldr z8, [x2, #50, MUL VL] + WORD $0x85864c49 // ldr z9, [x2, #51, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85865048 // ldr z8, [x2, #52, MUL VL] + WORD $0x85865449 // ldr z9, [x2, #53, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85865848 // ldr z8, [x2, #54, MUL VL] + WORD $0x85865c49 // ldr z9, [x2, #55, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 4 to 7 outputs + WORD $0x8580412a // ldr z10, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85874048 // ldr z8, [x2, #56, MUL VL] + WORD $0x85874449 // ldr z9, [x2, #57, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85874848 // ldr z8, [x2, #58, MUL VL] + WORD $0x85874c49 // ldr z9, [x2, #59, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85875048 // ldr z8, [x2, #60, MUL VL] + WORD $0x85875449 // ldr z9, [x2, #61, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85875848 // ldr z8, [x2, #62, MUL VL] + WORD $0x85875c49 // ldr z9, [x2, #63, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85884048 // ldr z8, [x2, #64, MUL VL] + WORD $0x85884449 // ldr z9, [x2, #65, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85884848 // 
ldr z8, [x2, #66, MUL VL] + WORD $0x85884c49 // ldr z9, [x2, #67, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85885048 // ldr z8, [x2, #68, MUL VL] + WORD $0x85885449 // ldr z9, [x2, #69, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 5 to 7 outputs + WORD $0x8580414a // ldr z10, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85885848 // ldr z8, [x2, #70, MUL VL] + WORD $0x85885c49 // ldr z9, [x2, #71, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85894048 // ldr z8, [x2, #72, MUL VL] + WORD $0x85894449 // ldr z9, [x2, #73, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85894848 // ldr z8, [x2, #74, MUL VL] + WORD $0x85894c49 // ldr z9, [x2, #75, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85895048 // ldr z8, [x2, #76, MUL VL] + WORD $0x85895449 // ldr z9, [x2, #77, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85895848 // ldr z8, [x2, #78, MUL VL] + WORD $0x85895c49 // ldr z9, [x2, #79, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858a4048 // ldr z8, [x2, #80, MUL VL] + WORD $0x858a4449 // ldr z9, [x2, #81, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858a4848 // ldr z8, [x2, #82, MUL VL] + WORD $0x858a4c49 // ldr z9, [x2, #83, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 6 to 7 outputs + WORD $0x8580416a // ldr z10, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858a5048 // ldr z8, [x2, #84, MUL VL] + WORD $0x858a5449 // ldr z9, [x2, #85, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858a5848 // ldr z8, [x2, #86, MUL VL] + WORD $0x858a5c49 // ldr z9, [x2, #87, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD 
$0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858b4048 // ldr z8, [x2, #88, MUL VL] + WORD $0x858b4449 // ldr z9, [x2, #89, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858b4848 // ldr z8, [x2, #90, MUL VL] + WORD $0x858b4c49 // ldr z9, [x2, #91, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858b5048 // ldr z8, [x2, #92, MUL VL] + WORD $0x858b5449 // ldr z9, [x2, #93, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858b5848 // ldr z8, [x2, #94, MUL VL] + WORD $0x858b5c49 // ldr z9, [x2, #95, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858c4048 // ldr z8, [x2, #96, MUL VL] + WORD $0x858c4449 // ldr z9, [x2, #97, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 7 to 7 outputs + WORD $0x8580418a // ldr z10, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858c4848 // ldr z8, [x2, #98, MUL VL] + WORD $0x858c4c49 // ldr z9, [x2, #99, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858c5048 // ldr z8, [x2, #100, MUL VL] + WORD $0x858c5449 // ldr z9, [x2, #101, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858c5848 // ldr z8, [x2, #102, MUL VL] + WORD $0x858c5c49 // ldr z9, [x2, #103, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858d4048 // ldr z8, [x2, #104, MUL VL] + WORD $0x858d4449 // ldr z9, [x2, #105, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858d4848 // ldr z8, [x2, #106, MUL VL] + WORD $0x858d4c49 // ldr z9, [x2, #107, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858d5048 // ldr z8, [x2, #108, MUL VL] + WORD $0x858d5449 // ldr z9, [x2, #109, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858d5848 // ldr z8, [x2, #110, MUL VL] + WORD $0x858d5c49 // ldr z9, [x2, #111, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD 
$0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 8 to 7 outputs + WORD $0x858041aa // ldr z10, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858e4048 // ldr z8, [x2, #112, MUL VL] + WORD $0x858e4449 // ldr z9, [x2, #113, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858e4848 // ldr z8, [x2, #114, MUL VL] + WORD $0x858e4c49 // ldr z9, [x2, #115, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858e5048 // ldr z8, [x2, #116, MUL VL] + WORD $0x858e5449 // ldr z9, [x2, #117, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858e5848 // ldr z8, [x2, #118, MUL VL] + WORD $0x858e5c49 // ldr z9, [x2, #119, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858f4048 // ldr z8, [x2, #120, MUL VL] + WORD $0x858f4449 // ldr z9, [x2, #121, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858f4848 // ldr z8, [x2, #122, MUL VL] + WORD $0x858f4c49 // ldr z9, [x2, #123, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858f5048 // ldr z8, [x2, #124, MUL VL] + WORD $0x858f5449 // ldr z9, [x2, #125, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 9 to 7 outputs + WORD $0x8580406a // ldr z10, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858f5848 // ldr z8, [x2, #126, MUL VL] + WORD $0x858f5c49 // ldr z9, [x2, #127, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85904048 // ldr z8, [x2, #128, MUL VL] + WORD $0x85904449 // ldr z9, [x2, #129, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85904848 // ldr z8, [x2, #130, MUL VL] + WORD $0x85904c49 // ldr z9, [x2, #131, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85905048 // ldr z8, [x2, 
#132, MUL VL] + WORD $0x85905449 // ldr z9, [x2, #133, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85905848 // ldr z8, [x2, #134, MUL VL] + WORD $0x85905c49 // ldr z9, [x2, #135, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85914048 // ldr z8, [x2, #136, MUL VL] + WORD $0x85914449 // ldr z9, [x2, #137, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85914848 // ldr z8, [x2, #138, MUL VL] + WORD $0x85914c49 // ldr z9, [x2, #139, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + +mulSve_10x7_store: + // Store 7 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x7_loop + +mulSve_10x7_end: + RET + +// func mulSve_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x7Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 152 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x7Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c7 // mov z7.d, x6 + WORD $0x052120e7 // dup z7.b, z7.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + WORD $0x8580402a // ldr z10, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804048 // ldr z8, [x2] + WORD $0x85804449 // ldr 
z9, [x2, #1, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804848 // ldr z8, [x2, #2, MUL VL] + WORD $0x85804c49 // ldr z9, [x2, #3, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805048 // ldr z8, [x2, #4, MUL VL] + WORD $0x85805449 // ldr z9, [x2, #5, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805848 // ldr z8, [x2, #6, MUL VL] + WORD $0x85805c49 // ldr z9, [x2, #7, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814048 // ldr z8, [x2, #8, MUL VL] + WORD $0x85814449 // ldr z9, [x2, #9, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + MOVD 120(R14), R6 + WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814848 // ldr z8, [x2, #10, MUL VL] + WORD $0x85814c49 // ldr z9, [x2, #11, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + MOVD 144(R14), R6 + WORD $0xa5ef40c6 // ld1d { z6.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85815048 // ldr z8, [x2, #12, MUL VL] + WORD $0x85815449 // ldr z9, [x2, #13, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 1 to 7 outputs + WORD $0x8580408a // ldr z10, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85815848 // ldr z8, [x2, #14, MUL VL] + WORD $0x85815c49 // ldr z9, [x2, #15, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85824048 // ldr z8, [x2, #16, MUL VL] + WORD $0x85824449 // ldr z9, [x2, #17, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85824848 // ldr z8, [x2, #18, MUL VL] + WORD $0x85824c49 // ldr z9, [x2, #19, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85825048 // ldr z8, [x2, #20, MUL VL] + WORD $0x85825449 // ldr z9, [x2, #21, MUL VL] + WORD $0x052a3108 
// tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85825848 // ldr z8, [x2, #22, MUL VL] + WORD $0x85825c49 // ldr z9, [x2, #23, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85834048 // ldr z8, [x2, #24, MUL VL] + WORD $0x85834449 // ldr z9, [x2, #25, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85834848 // ldr z8, [x2, #26, MUL VL] + WORD $0x85834c49 // ldr z9, [x2, #27, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 2 to 7 outputs + WORD $0x858040aa // ldr z10, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85835048 // ldr z8, [x2, #28, MUL VL] + WORD $0x85835449 // ldr z9, [x2, #29, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85835848 // ldr z8, [x2, #30, MUL VL] + WORD $0x85835c49 // ldr z9, [x2, #31, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85844048 // ldr z8, [x2, #32, MUL VL] + WORD $0x85844449 // ldr z9, [x2, #33, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85844848 // ldr z8, [x2, #34, MUL VL] + WORD $0x85844c49 // ldr z9, [x2, #35, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85845048 // ldr z8, [x2, #36, MUL VL] + WORD $0x85845449 // ldr z9, [x2, #37, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85845848 // ldr z8, [x2, #38, MUL VL] + WORD $0x85845c49 // ldr z9, [x2, #39, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85854048 // ldr z8, [x2, #40, MUL VL] + WORD $0x85854449 // ldr z9, [x2, #41, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 3 to 7 outputs + WORD $0x8580410a // ldr z10, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85854848 
// ldr z8, [x2, #42, MUL VL] + WORD $0x85854c49 // ldr z9, [x2, #43, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85855048 // ldr z8, [x2, #44, MUL VL] + WORD $0x85855449 // ldr z9, [x2, #45, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85855848 // ldr z8, [x2, #46, MUL VL] + WORD $0x85855c49 // ldr z9, [x2, #47, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85864048 // ldr z8, [x2, #48, MUL VL] + WORD $0x85864449 // ldr z9, [x2, #49, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85864848 // ldr z8, [x2, #50, MUL VL] + WORD $0x85864c49 // ldr z9, [x2, #51, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85865048 // ldr z8, [x2, #52, MUL VL] + WORD $0x85865449 // ldr z9, [x2, #53, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85865848 // ldr z8, [x2, #54, MUL VL] + WORD $0x85865c49 // ldr z9, [x2, #55, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 4 to 7 outputs + WORD $0x8580412a // ldr z10, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85874048 // ldr z8, [x2, #56, MUL VL] + WORD $0x85874449 // ldr z9, [x2, #57, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85874848 // ldr z8, [x2, #58, MUL VL] + WORD $0x85874c49 // ldr z9, [x2, #59, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85875048 // ldr z8, [x2, #60, MUL VL] + WORD $0x85875449 // ldr z9, [x2, #61, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85875848 // ldr z8, [x2, #62, MUL VL] + WORD $0x85875c49 // ldr z9, [x2, #63, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85884048 // ldr z8, [x2, #64, MUL VL] + WORD $0x85884449 // ldr z9, [x2, #65, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, 
z4.d, z9.d + WORD $0x85884848 // ldr z8, [x2, #66, MUL VL] + WORD $0x85884c49 // ldr z9, [x2, #67, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85885048 // ldr z8, [x2, #68, MUL VL] + WORD $0x85885449 // ldr z9, [x2, #69, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 5 to 7 outputs + WORD $0x8580414a // ldr z10, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85885848 // ldr z8, [x2, #70, MUL VL] + WORD $0x85885c49 // ldr z9, [x2, #71, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85894048 // ldr z8, [x2, #72, MUL VL] + WORD $0x85894449 // ldr z9, [x2, #73, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85894848 // ldr z8, [x2, #74, MUL VL] + WORD $0x85894c49 // ldr z9, [x2, #75, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85895048 // ldr z8, [x2, #76, MUL VL] + WORD $0x85895449 // ldr z9, [x2, #77, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85895848 // ldr z8, [x2, #78, MUL VL] + WORD $0x85895c49 // ldr z9, [x2, #79, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858a4048 // ldr z8, [x2, #80, MUL VL] + WORD $0x858a4449 // ldr z9, [x2, #81, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858a4848 // ldr z8, [x2, #82, MUL VL] + WORD $0x858a4c49 // ldr z9, [x2, #83, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 6 to 7 outputs + WORD $0x8580416a // ldr z10, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858a5048 // ldr z8, [x2, #84, MUL VL] + WORD $0x858a5449 // ldr z9, [x2, #85, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858a5848 // ldr z8, [x2, #86, MUL VL] + WORD $0x858a5c49 // ldr z9, [x2, #87, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD 
$0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858b4048 // ldr z8, [x2, #88, MUL VL] + WORD $0x858b4449 // ldr z9, [x2, #89, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858b4848 // ldr z8, [x2, #90, MUL VL] + WORD $0x858b4c49 // ldr z9, [x2, #91, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858b5048 // ldr z8, [x2, #92, MUL VL] + WORD $0x858b5449 // ldr z9, [x2, #93, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858b5848 // ldr z8, [x2, #94, MUL VL] + WORD $0x858b5c49 // ldr z9, [x2, #95, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858c4048 // ldr z8, [x2, #96, MUL VL] + WORD $0x858c4449 // ldr z9, [x2, #97, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 7 to 7 outputs + WORD $0x8580418a // ldr z10, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858c4848 // ldr z8, [x2, #98, MUL VL] + WORD $0x858c4c49 // ldr z9, [x2, #99, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858c5048 // ldr z8, [x2, #100, MUL VL] + WORD $0x858c5449 // ldr z9, [x2, #101, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858c5848 // ldr z8, [x2, #102, MUL VL] + WORD $0x858c5c49 // ldr z9, [x2, #103, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858d4048 // ldr z8, [x2, #104, MUL VL] + WORD $0x858d4449 // ldr z9, [x2, #105, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858d4848 // ldr z8, [x2, #106, MUL VL] + WORD $0x858d4c49 // ldr z9, [x2, #107, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858d5048 // ldr z8, [x2, #108, MUL VL] + WORD $0x858d5449 // ldr z9, [x2, #109, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858d5848 // ldr z8, [x2, #110, MUL VL] + WORD $0x858d5c49 // ldr z9, [x2, #111, MUL VL] + WORD 
$0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 8 to 7 outputs + WORD $0x858041aa // ldr z10, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858e4048 // ldr z8, [x2, #112, MUL VL] + WORD $0x858e4449 // ldr z9, [x2, #113, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858e4848 // ldr z8, [x2, #114, MUL VL] + WORD $0x858e4c49 // ldr z9, [x2, #115, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858e5048 // ldr z8, [x2, #116, MUL VL] + WORD $0x858e5449 // ldr z9, [x2, #117, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858e5848 // ldr z8, [x2, #118, MUL VL] + WORD $0x858e5c49 // ldr z9, [x2, #119, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858f4048 // ldr z8, [x2, #120, MUL VL] + WORD $0x858f4449 // ldr z9, [x2, #121, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858f4848 // ldr z8, [x2, #122, MUL VL] + WORD $0x858f4c49 // ldr z9, [x2, #123, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858f5048 // ldr z8, [x2, #124, MUL VL] + WORD $0x858f5449 // ldr z9, [x2, #125, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 9 to 7 outputs + WORD $0x8580406a // ldr z10, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858f5848 // ldr z8, [x2, #126, MUL VL] + WORD $0x858f5c49 // ldr z9, [x2, #127, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85904048 // ldr z8, [x2, #128, MUL VL] + WORD $0x85904449 // ldr z9, [x2, #129, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85904848 // ldr z8, [x2, #130, MUL VL] + WORD $0x85904c49 // ldr z9, [x2, #131, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor 
z2.d, z2.d, z9.d + WORD $0x85905048 // ldr z8, [x2, #132, MUL VL] + WORD $0x85905449 // ldr z9, [x2, #133, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85905848 // ldr z8, [x2, #134, MUL VL] + WORD $0x85905c49 // ldr z9, [x2, #135, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85914048 // ldr z8, [x2, #136, MUL VL] + WORD $0x85914449 // ldr z9, [x2, #137, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85914848 // ldr z8, [x2, #138, MUL VL] + WORD $0x85914c49 // ldr z9, [x2, #139, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + +mulSve_10x7Xor_store: + // Store 7 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x7Xor_loop + +mulSve_10x7Xor_end: + RET + +// func mulSve_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x8(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 173 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x8_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c8 // mov z8.d, x6 + WORD $0x05212108 // dup z8.b, z8.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + WORD $0x8580402b // ldr z11, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85804049 // ldr z9, [x2] + WORD $0x8580444a // ldr z10, [x2, #1, MUL VL] + WORD 
$0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93140 // eor z0.d, z10.d, z9.d + WORD $0x85804849 // ldr z9, [x2, #2, MUL VL] + WORD $0x85804c4a // ldr z10, [x2, #3, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93141 // eor z1.d, z10.d, z9.d + WORD $0x85805049 // ldr z9, [x2, #4, MUL VL] + WORD $0x8580544a // ldr z10, [x2, #5, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93142 // eor z2.d, z10.d, z9.d + WORD $0x85805849 // ldr z9, [x2, #6, MUL VL] + WORD $0x85805c4a // ldr z10, [x2, #7, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93143 // eor z3.d, z10.d, z9.d + WORD $0x85814049 // ldr z9, [x2, #8, MUL VL] + WORD $0x8581444a // ldr z10, [x2, #9, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93144 // eor z4.d, z10.d, z9.d + WORD $0x85814849 // ldr z9, [x2, #10, MUL VL] + WORD $0x85814c4a // ldr z10, [x2, #11, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93145 // eor z5.d, z10.d, z9.d + WORD $0x85815049 // ldr z9, [x2, #12, MUL VL] + WORD $0x8581544a // ldr z10, [x2, #13, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93146 // eor z6.d, z10.d, z9.d + WORD $0x85815849 // ldr z9, [x2, #14, MUL VL] + WORD $0x85815c4a // ldr z10, [x2, #15, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93147 // eor z7.d, z10.d, z9.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 1 to 8 outputs + WORD $0x8580408b // ldr z11, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85824049 // ldr z9, [x2, #16, MUL VL] + WORD $0x8582444a // ldr z10, [x2, #17, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85824849 // ldr z9, [x2, #18, MUL VL] + WORD $0x85824c4a // ldr z10, [x2, #19, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85825049 // ldr z9, [x2, #20, MUL VL] + WORD $0x8582544a // ldr z10, [x2, #21, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85825849 // ldr z9, [x2, #22, MUL VL] + WORD $0x85825c4a // ldr z10, [x2, #23, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85834049 // ldr z9, [x2, #24, MUL VL] + WORD $0x8583444a // ldr z10, [x2, #25, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85834849 // ldr z9, [x2, #26, MUL VL] + WORD $0x85834c4a // ldr z10, [x2, #27, MUL VL] + WORD 
$0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85835049 // ldr z9, [x2, #28, MUL VL] + WORD $0x8583544a // ldr z10, [x2, #29, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85835849 // ldr z9, [x2, #30, MUL VL] + WORD $0x85835c4a // ldr z10, [x2, #31, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 2 to 8 outputs + WORD $0x858040ab // ldr z11, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85844049 // ldr z9, [x2, #32, MUL VL] + WORD $0x8584444a // ldr z10, [x2, #33, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85844849 // ldr z9, [x2, #34, MUL VL] + WORD $0x85844c4a // ldr z10, [x2, #35, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85845049 // ldr z9, [x2, #36, MUL VL] + WORD $0x8584544a // ldr z10, [x2, #37, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85845849 // ldr z9, [x2, #38, MUL VL] + WORD $0x85845c4a // ldr z10, [x2, #39, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85854049 // ldr z9, [x2, #40, MUL VL] + WORD $0x8585444a // ldr z10, [x2, #41, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85854849 // ldr z9, [x2, #42, MUL VL] + WORD $0x85854c4a // ldr z10, [x2, #43, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85855049 // ldr z9, [x2, #44, MUL VL] + WORD $0x8585544a // ldr z10, [x2, #45, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85855849 // ldr z9, [x2, #46, MUL VL] + WORD $0x85855c4a // ldr z10, [x2, #47, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 3 to 8 outputs + WORD $0x8580410b // ldr z11, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD 
$0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85864049 // ldr z9, [x2, #48, MUL VL] + WORD $0x8586444a // ldr z10, [x2, #49, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85864849 // ldr z9, [x2, #50, MUL VL] + WORD $0x85864c4a // ldr z10, [x2, #51, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85865049 // ldr z9, [x2, #52, MUL VL] + WORD $0x8586544a // ldr z10, [x2, #53, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85865849 // ldr z9, [x2, #54, MUL VL] + WORD $0x85865c4a // ldr z10, [x2, #55, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85874049 // ldr z9, [x2, #56, MUL VL] + WORD $0x8587444a // ldr z10, [x2, #57, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85874849 // ldr z9, [x2, #58, MUL VL] + WORD $0x85874c4a // ldr z10, [x2, #59, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85875049 // ldr z9, [x2, #60, MUL VL] + WORD $0x8587544a // ldr z10, [x2, #61, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85875849 // ldr z9, [x2, #62, MUL VL] + WORD $0x85875c4a // ldr z10, [x2, #63, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 4 to 8 outputs + WORD $0x8580412b // ldr z11, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85884049 // ldr z9, [x2, #64, MUL VL] + WORD $0x8588444a // ldr z10, [x2, #65, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85884849 // ldr z9, [x2, #66, MUL VL] + WORD $0x85884c4a // ldr z10, [x2, #67, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85885049 // ldr z9, [x2, #68, MUL VL] + WORD $0x8588544a // ldr z10, [x2, #69, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85885849 // ldr z9, [x2, #70, MUL VL] + WORD $0x85885c4a // ldr z10, [x2, #71, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // 
tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x85894049 // ldr z9, [x2, #72, MUL VL]
+ WORD $0x8589444a // ldr z10, [x2, #73, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x85894849 // ldr z9, [x2, #74, MUL VL]
+ WORD $0x85894c4a // ldr z10, [x2, #75, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x85895049 // ldr z9, [x2, #76, MUL VL]
+ WORD $0x8589544a // ldr z10, [x2, #77, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x85895849 // ldr z9, [x2, #78, MUL VL]
+ WORD $0x85895c4a // ldr z10, [x2, #79, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $5, R16
+ BEQ mulSve_10x8_store
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ WORD $0x8580414b // ldr z11, [x10]
+ WORD $0x9100814a // add x10, x10, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x858a4049 // ldr z9, [x2, #80, MUL VL]
+ WORD $0x858a444a // ldr z10, [x2, #81, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x858a4849 // ldr z9, [x2, #82, MUL VL]
+ WORD $0x858a4c4a // ldr z10, [x2, #83, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x858a5049 // ldr z9, [x2, #84, MUL VL]
+ WORD $0x858a544a // ldr z10, [x2, #85, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x858a5849 // ldr z9, [x2, #86, MUL VL]
+ WORD $0x858a5c4a // ldr z10, [x2, #87, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x858b4049 // ldr z9, [x2, #88, MUL VL]
+ WORD $0x858b444a // ldr z10, [x2, #89, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x858b4849 // ldr z9, [x2, #90, MUL VL]
+ WORD $0x858b4c4a // ldr z10, [x2, #91, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x858b5049 // ldr z9, [x2, #92, MUL VL]
+ WORD $0x858b544a // ldr z10, [x2, #93, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x858b5849 // ldr z9, [x2, #94, MUL VL]
+ WORD $0x858b5c4a // ldr z10, [x2, #95, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $6, R16
+ BEQ mulSve_10x8_store
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ WORD $0x8580416b // ldr z11, [x11]
+ WORD $0x9100816b // add x11, x11, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x858c4049 // ldr z9, [x2, #96, MUL VL]
+ WORD $0x858c444a // ldr z10, [x2, #97, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x858c4849 // ldr z9, [x2, #98, MUL VL]
+ WORD $0x858c4c4a // ldr z10, [x2, #99, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x858c5049 // ldr z9, [x2, #100, MUL VL]
+ WORD $0x858c544a // ldr z10, [x2, #101, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x858c5849 // ldr z9, [x2, #102, MUL VL]
+ WORD $0x858c5c4a // ldr z10, [x2, #103, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x858d4049 // ldr z9, [x2, #104, MUL VL]
+ WORD $0x858d444a // ldr z10, [x2, #105, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x858d4849 // ldr z9, [x2, #106, MUL VL]
+ WORD $0x858d4c4a // ldr z10, [x2, #107, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x858d5049 // ldr z9, [x2, #108, MUL VL]
+ WORD $0x858d544a // ldr z10, [x2, #109, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x858d5849 // ldr z9, [x2, #110, MUL VL]
+ WORD $0x858d5c4a // ldr z10, [x2, #111, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $7, R16
+ BEQ mulSve_10x8_store
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ WORD $0x8580418b // ldr z11, [x12]
+ WORD $0x9100818c // add x12, x12, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x858e4049 // ldr z9, [x2, #112, MUL VL]
+ WORD $0x858e444a // ldr z10, [x2, #113, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x858e4849 // ldr z9, [x2, #114, MUL VL]
+ WORD $0x858e4c4a // ldr z10, [x2, #115, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x858e5049 // ldr z9, [x2, #116, MUL VL]
+ WORD $0x858e544a // ldr z10, [x2, #117, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x858e5849 // ldr z9, [x2, #118, MUL VL]
+ WORD $0x858e5c4a // ldr z10, [x2, #119, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x858f4049 // ldr z9, [x2, #120, MUL VL]
+ WORD $0x858f444a // ldr z10, [x2, #121, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x858f4849 // ldr z9, [x2, #122, MUL VL]
+ WORD $0x858f4c4a // ldr z10, [x2, #123, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x858f5049 // ldr z9, [x2, #124, MUL VL]
+ WORD $0x858f544a // ldr z10, [x2, #125, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x858f5849 // ldr z9, [x2, #126, MUL VL]
+ WORD $0x858f5c4a // ldr z10, [x2, #127, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $8, R16
+ BEQ mulSve_10x8_store
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ WORD $0x858041ab // ldr z11, [x13]
+ WORD $0x910081ad // add x13, x13, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x85904049 // ldr z9, [x2, #128, MUL VL]
+ WORD $0x8590444a // ldr z10, [x2, #129, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x85904849 // ldr z9, [x2, #130, MUL VL]
+ WORD $0x85904c4a // ldr z10, [x2, #131, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x85905049 // ldr z9, [x2, #132, MUL VL]
+ WORD $0x8590544a // ldr z10, [x2, #133, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x85905849 // ldr z9, [x2, #134, MUL VL]
+ WORD $0x85905c4a // ldr z10, [x2, #135, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x85914049 // ldr z9, [x2, #136, MUL VL]
+ WORD $0x8591444a // ldr z10, [x2, #137, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x85914849 // ldr z9, [x2, #138, MUL VL]
+ WORD $0x85914c4a // ldr z10, [x2, #139, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x85915049 // ldr z9, [x2, #140, MUL VL]
+ WORD $0x8591544a // ldr z10, [x2, #141, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x85915849 // ldr z9, [x2, #142, MUL VL]
+ WORD $0x85915c4a // ldr z10, [x2, #143, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $9, R16
+ BEQ mulSve_10x8_store
+
+ // Load and process 32 bytes from input 9 to 8 outputs
+ WORD $0x8580406b // ldr z11, [x3]
+ WORD $0x91008063 // add x3, x3, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x85924049 // ldr z9, [x2, #144, MUL VL]
+ WORD $0x8592444a // ldr z10, [x2, #145, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x85924849 // ldr z9, [x2, #146, MUL VL]
+ WORD $0x85924c4a // ldr z10, [x2, #147, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x85925049 // ldr z9, [x2, #148, MUL VL]
+ WORD $0x8592544a // ldr z10, [x2, #149, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x85925849 // ldr z9, [x2, #150, MUL VL]
+ WORD $0x85925c4a // ldr z10, [x2, #151, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x85934049 // ldr z9, [x2, #152, MUL VL]
+ WORD $0x8593444a // ldr z10, [x2, #153, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x85934849 // ldr z9, [x2, #154, MUL VL]
+ WORD $0x85934c4a // ldr z10, [x2, #155, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x85935049 // ldr z9, [x2, #156, MUL VL]
+ WORD $0x8593544a // ldr z10, [x2, #157, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x85935849 // ldr z9, [x2, #158, MUL VL]
+ WORD $0x85935c4a // ldr z10, [x2, #159, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+
+mulSve_10x8_store:
+ // Store 8 outputs
+ MOVD (R14), R6
+ WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3]
+ MOVD 24(R14), R6
+ WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3]
+ MOVD 48(R14), R6
+ WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3]
+ MOVD 72(R14), R6
+ WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3]
+ MOVD 96(R14), R6
+ WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3]
+ MOVD 120(R14), R6
+ WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3]
+ MOVD 144(R14), R6
+ WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3]
+ MOVD 168(R14), R6
+ WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3]
+
+ // Prepare for next loop
+ WORD $0x910011ef // add x15, x15, #4
+ WORD $0xf1000400 // subs x0, x0, #1
+ BNE mulSve_10x8_loop
+
+mulSve_10x8_end:
+ RET
+
+// func mulSve_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
+TEXT ·mulSve_10x8Xor(SB), NOSPLIT, $8-88
+ WORD $0x25d8e3e0 // ptrue p0.d
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 173 YMM used
+ MOVD n+80(FP), R0
+ MOVD matrix_base+0(FP), R2
+ WORD $0xd345fc00 // lsr x0, x0, #5
+ WORD $0xea00001f // tst x0, x0
+ BEQ mulSve_10x8Xor_end
+ MOVD in_base+24(FP), R3
+ MOVD (R3), R1
+ MOVD 24(R3), R4
+ MOVD 48(R3), R5
+ MOVD 72(R3), R8
+ MOVD 96(R3), R9
+ MOVD 120(R3), R10
+ MOVD 144(R3), R11
+ MOVD 168(R3), R12
+ MOVD 192(R3), R13
+ MOVD 216(R3), R3
+ MOVD out_base+48(FP), R14
+ MOVD start+72(FP), R15
+
+ // Add start offset to input
+ WORD $0x8b0f0021 // add x1, x1, x15
+ WORD $0x8b0f0084 // add x4, x4, x15
+ WORD $0x8b0f00a5 // add x5, x5, x15
+ WORD $0x8b0f0108 // add x8, x8, x15
+ WORD $0x8b0f0129 // add x9, x9, x15
+ WORD $0x8b0f014a // add x10, x10, x15
+ WORD $0x8b0f016b // add x11, x11, x15
+ WORD $0x8b0f018c // add x12, x12, x15
+ WORD $0x8b0f01ad // add x13, x13, x15
+ WORD $0x8b0f0063 // add x3, x3, x15
+ WORD $0xd343fdef // lsr x15, x15, #3
+ WORD $0xd28001e6 // mov x6, #15
+ WORD $0x05e038c8 // mov z8.d, x6
+ WORD $0x05212108 // dup z8.b, z8.b[0]
+
+ // Load number of input shards
+ MOVD in_len+32(FP), R16
+
+mulSve_10x8Xor_loop:
+ // Load and process 32 bytes from input 0 to 8 outputs
+ WORD $0x8580402b // ldr z11, [x1]
+ WORD $0x91008021 // add x1, x1, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ MOVD (R14), R6
+ WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85804049 // ldr z9, [x2]
+ WORD $0x8580444a // ldr z10, [x2, #1, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ MOVD 24(R14), R6
+ WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85804849 // ldr z9, [x2, #2, MUL VL]
+ WORD $0x85804c4a // ldr z10, [x2, #3, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ MOVD 48(R14), R6
+ WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85805049 // ldr z9, [x2, #4, MUL VL]
+ WORD $0x8580544a // ldr z10, [x2, #5, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ MOVD 72(R14), R6
+ WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85805849 // ldr z9, [x2, #6, MUL VL]
+ WORD $0x85805c4a // ldr z10, [x2, #7, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ MOVD 96(R14), R6
+ WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85814049 // ldr z9, [x2, #8, MUL VL]
+ WORD $0x8581444a // ldr z10, [x2, #9, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ MOVD 120(R14), R6
+ WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85814849 // ldr z9, [x2, #10, MUL VL]
+ WORD $0x85814c4a // ldr z10, [x2, #11, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ MOVD 144(R14), R6
+ WORD $0xa5ef40c6 // ld1d { z6.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85815049 // ldr z9, [x2, #12, MUL VL]
+ WORD $0x8581544a // ldr z10, [x2, #13, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ MOVD 168(R14), R6
+ WORD $0xa5ef40c7 // ld1d { z7.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85815849 // ldr z9, [x2, #14, MUL VL]
+ WORD $0x85815c4a // ldr z10, [x2, #15, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $1, R16
+ BEQ mulSve_10x8Xor_store
+
+ // Load and process 32 bytes from input 1 to 8 outputs
+ WORD $0x8580408b // ldr z11, [x4]
+ WORD $0x91008084 // add x4, x4, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x85824049 // ldr z9, [x2, #16, MUL VL]
+ WORD $0x8582444a // ldr z10, [x2, #17, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x85824849 // ldr z9, [x2, #18, MUL VL]
+ WORD $0x85824c4a // ldr z10, [x2, #19, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x85825049 // ldr z9, [x2, #20, MUL VL]
+ WORD $0x8582544a // ldr z10, [x2, #21, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x85825849 // ldr z9, [x2, #22, MUL VL]
+ WORD $0x85825c4a // ldr z10, [x2, #23, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x85834049 // ldr z9, [x2, #24, MUL VL]
+ WORD $0x8583444a // ldr z10, [x2, #25, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x85834849 // ldr z9, [x2, #26, MUL VL]
+ WORD $0x85834c4a // ldr z10, [x2, #27, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x85835049 // ldr z9, [x2, #28, MUL VL]
+ WORD $0x8583544a // ldr z10, [x2, #29, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x85835849 // ldr z9, [x2, #30, MUL VL]
+ WORD $0x85835c4a // ldr z10, [x2, #31, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $2, R16
+ BEQ mulSve_10x8Xor_store
+
+ // Load and process 32 bytes from input 2 to 8 outputs
+ WORD $0x858040ab // ldr z11, [x5]
+ WORD $0x910080a5 // add x5, x5, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x85844049 // ldr z9, [x2, #32, MUL VL]
+ WORD $0x8584444a // ldr z10, [x2, #33, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x85844849 // ldr z9, [x2, #34, MUL VL]
+ WORD $0x85844c4a // ldr z10, [x2, #35, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x85845049 // ldr z9, [x2, #36, MUL VL]
+ WORD $0x8584544a // ldr z10, [x2, #37, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x85845849 // ldr z9, [x2, #38, MUL VL]
+ WORD $0x85845c4a // ldr z10, [x2, #39, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x85854049 // ldr z9, [x2, #40, MUL VL]
+ WORD $0x8585444a // ldr z10, [x2, #41, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x85854849 // ldr z9, [x2, #42, MUL VL]
+ WORD $0x85854c4a // ldr z10, [x2, #43, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x85855049 // ldr z9, [x2, #44, MUL VL]
+ WORD $0x8585544a // ldr z10, [x2, #45, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x85855849 // ldr z9, [x2, #46, MUL VL]
+ WORD $0x85855c4a // ldr z10, [x2, #47, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $3, R16
+ BEQ mulSve_10x8Xor_store
+
+ // Load and process 32 bytes from input 3 to 8 outputs
+ WORD $0x8580410b // ldr z11, [x8]
+ WORD $0x91008108 // add x8, x8, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x85864049 // ldr z9, [x2, #48, MUL VL]
+ WORD $0x8586444a // ldr z10, [x2, #49, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x85864849 // ldr z9, [x2, #50, MUL VL]
+ WORD $0x85864c4a // ldr z10, [x2, #51, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x85865049 // ldr z9, [x2, #52, MUL VL]
+ WORD $0x8586544a // ldr z10, [x2, #53, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x85865849 // ldr z9, [x2, #54, MUL VL]
+ WORD $0x85865c4a // ldr z10, [x2, #55, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x85874049 // ldr z9, [x2, #56, MUL VL]
+ WORD $0x8587444a // ldr z10, [x2, #57, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x85874849 // ldr z9, [x2, #58, MUL VL]
+ WORD $0x85874c4a // ldr z10, [x2, #59, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x85875049 // ldr z9, [x2, #60, MUL VL]
+ WORD $0x8587544a // ldr z10, [x2, #61, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x85875849 // ldr z9, [x2, #62, MUL VL]
+ WORD $0x85875c4a // ldr z10, [x2, #63, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $4, R16
+ BEQ mulSve_10x8Xor_store
+
+ // Load and process 32 bytes from input 4 to 8 outputs
+ WORD $0x8580412b // ldr z11, [x9]
+ WORD $0x91008129 // add x9, x9, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x85884049 // ldr z9, [x2, #64, MUL VL]
+ WORD $0x8588444a // ldr z10, [x2, #65, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x85884849 // ldr z9, [x2, #66, MUL VL]
+ WORD $0x85884c4a // ldr z10, [x2, #67, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x85885049 // ldr z9, [x2, #68, MUL VL]
+ WORD $0x8588544a // ldr z10, [x2, #69, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x85885849 // ldr z9, [x2, #70, MUL VL]
+ WORD $0x85885c4a // ldr z10, [x2, #71, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x85894049 // ldr z9, [x2, #72, MUL VL]
+ WORD $0x8589444a // ldr z10, [x2, #73, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x85894849 // ldr z9, [x2, #74, MUL VL]
+ WORD $0x85894c4a // ldr z10, [x2, #75, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x85895049 // ldr z9, [x2, #76, MUL VL]
+ WORD $0x8589544a // ldr z10, [x2, #77, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x85895849 // ldr z9, [x2, #78, MUL VL]
+ WORD $0x85895c4a // ldr z10, [x2, #79, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $5, R16
+ BEQ mulSve_10x8Xor_store
+
+ // Load and process 32 bytes from input 5 to 8 outputs
+ WORD $0x8580414b // ldr z11, [x10]
+ WORD $0x9100814a // add x10, x10, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x858a4049 // ldr z9, [x2, #80, MUL VL]
+ WORD $0x858a444a // ldr z10, [x2, #81, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x858a4849 // ldr z9, [x2, #82, MUL VL]
+ WORD $0x858a4c4a // ldr z10, [x2, #83, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x858a5049 // ldr z9, [x2, #84, MUL VL]
+ WORD $0x858a544a // ldr z10, [x2, #85, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x858a5849 // ldr z9, [x2, #86, MUL VL]
+ WORD $0x858a5c4a // ldr z10, [x2, #87, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x858b4049 // ldr z9, [x2, #88, MUL VL]
+ WORD $0x858b444a // ldr z10, [x2, #89, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x858b4849 // ldr z9, [x2, #90, MUL VL]
+ WORD $0x858b4c4a // ldr z10, [x2, #91, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x858b5049 // ldr z9, [x2, #92, MUL VL]
+ WORD $0x858b544a // ldr z10, [x2, #93, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x858b5849 // ldr z9, [x2, #94, MUL VL]
+ WORD $0x858b5c4a // ldr z10, [x2, #95, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $6, R16
+ BEQ mulSve_10x8Xor_store
+
+ // Load and process 32 bytes from input 6 to 8 outputs
+ WORD $0x8580416b // ldr z11, [x11]
+ WORD $0x9100816b // add x11, x11, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x858c4049 // ldr z9, [x2, #96, MUL VL]
+ WORD $0x858c444a // ldr z10, [x2, #97, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x858c4849 // ldr z9, [x2, #98, MUL VL]
+ WORD $0x858c4c4a // ldr z10, [x2, #99, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x858c5049 // ldr z9, [x2, #100, MUL VL]
+ WORD $0x858c544a // ldr z10, [x2, #101, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x858c5849 // ldr z9, [x2, #102, MUL VL]
+ WORD $0x858c5c4a // ldr z10, [x2, #103, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x858d4049 // ldr z9, [x2, #104, MUL VL]
+ WORD $0x858d444a // ldr z10, [x2, #105, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x858d4849 // ldr z9, [x2, #106, MUL VL]
+ WORD $0x858d4c4a // ldr z10, [x2, #107, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x858d5049 // ldr z9, [x2, #108, MUL VL]
+ WORD $0x858d544a // ldr z10, [x2, #109, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x858d5849 // ldr z9, [x2, #110, MUL VL]
+ WORD $0x858d5c4a // ldr z10, [x2, #111, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $7, R16
+ BEQ mulSve_10x8Xor_store
+
+ // Load and process 32 bytes from input 7 to 8 outputs
+ WORD $0x8580418b // ldr z11, [x12]
+ WORD $0x9100818c // add x12, x12, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x858e4049 // ldr z9, [x2, #112, MUL VL]
+ WORD $0x858e444a // ldr z10, [x2, #113, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x858e4849 // ldr z9, [x2, #114, MUL VL]
+ WORD $0x858e4c4a // ldr z10, [x2, #115, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x858e5049 // ldr z9, [x2, #116, MUL VL]
+ WORD $0x858e544a // ldr z10, [x2, #117, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x858e5849 // ldr z9, [x2, #118, MUL VL]
+ WORD $0x858e5c4a // ldr z10, [x2, #119, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x858f4049 // ldr z9, [x2, #120, MUL VL]
+ WORD $0x858f444a // ldr z10, [x2, #121, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x858f4849 // ldr z9, [x2, #122, MUL VL]
+ WORD $0x858f4c4a // ldr z10, [x2, #123, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x858f5049 // ldr z9, [x2, #124, MUL VL]
+ WORD $0x858f544a // ldr z10, [x2, #125, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x858f5849 // ldr z9, [x2, #126, MUL VL]
+ WORD $0x858f5c4a // ldr z10, [x2, #127, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $8, R16
+ BEQ mulSve_10x8Xor_store
+
+ // Load and process 32 bytes from input 8 to 8 outputs
+ WORD $0x858041ab // ldr z11, [x13]
+ WORD $0x910081ad // add x13, x13, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x85904049 // ldr z9, [x2, #128, MUL VL]
+ WORD $0x8590444a // ldr z10, [x2, #129, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x85904849 // ldr z9, [x2, #130, MUL VL]
+ WORD $0x85904c4a // ldr z10, [x2, #131, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x85905049 // ldr z9, [x2, #132, MUL VL]
+ WORD $0x8590544a // ldr z10, [x2, #133, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x85905849 // ldr z9, [x2, #134, MUL VL]
+ WORD $0x85905c4a // ldr z10, [x2, #135, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x85914049 // ldr z9, [x2, #136, MUL VL]
+ WORD $0x8591444a // ldr z10, [x2, #137, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x85914849 // ldr z9, [x2, #138, MUL VL]
+ WORD $0x85914c4a // ldr z10, [x2, #139, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x85915049 // ldr z9, [x2, #140, MUL VL]
+ WORD $0x8591544a // ldr z10, [x2, #141, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x85915849 // ldr z9, [x2, #142, MUL VL]
+ WORD $0x85915c4a // ldr z10, [x2, #143, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ // Check for early termination
+ CMP $9, R16
+ BEQ mulSve_10x8Xor_store
+
+ // Load and process 32 bytes from input 9 to 8 outputs
+ WORD $0x8580406b // ldr z11, [x3]
+ WORD $0x91008063 // add x3, x3, #32
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x0428316b // and z11.d, z11.d, z8.d
+ WORD $0x0428318c // and z12.d, z12.d, z8.d
+ WORD $0x85924049 // ldr z9, [x2, #144, MUL VL]
+ WORD $0x8592444a // ldr z10, [x2, #145, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93000 // eor z0.d, z0.d, z9.d
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x85924849 // ldr z9, [x2, #146, MUL VL]
+ WORD $0x85924c4a // ldr z10, [x2, #147, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x85925049 // ldr z9, [x2, #148, MUL VL]
+ WORD $0x8592544a // ldr z10, [x2, #149, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93042 // eor z2.d, z2.d, z9.d
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x85925849 // ldr z9, [x2, #150, MUL VL]
+ WORD $0x85925c4a // ldr z10, [x2, #151, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x85934049 // ldr z9, [x2, #152, MUL VL]
+ WORD $0x8593444a // ldr z10, [x2, #153, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a93084 // eor z4.d, z4.d, z9.d
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x85934849 // ldr z9, [x2, #154, MUL VL]
+ WORD $0x85934c4a // ldr z10, [x2, #155, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x85935049 // ldr z9, [x2, #156, MUL VL]
+ WORD $0x8593544a // ldr z10, [x2, #157, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930c6 // eor z6.d, z6.d, z9.d
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x85935849 // ldr z9, [x2, #158, MUL VL]
+ WORD $0x85935c4a // ldr z10, [x2, #159, MUL VL]
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x04a930e7 // eor z7.d, z7.d, z9.d
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+
+mulSve_10x8Xor_store:
+ // Store 8 outputs
+ MOVD (R14), R6
+ WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3]
+ MOVD 24(R14), R6
+ WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3]
+ MOVD 48(R14), R6
+ WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3]
+ MOVD 72(R14), R6
+ WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3]
+ MOVD 96(R14), R6
+ WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3]
+ MOVD 120(R14), R6
+ WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3]
+ MOVD 144(R14), R6
+ WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3]
+ MOVD 168(R14), R6
+ WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3]
+
+ // Prepare for next loop
+ WORD $0x910011ef // add x15, x15, #4
+ WORD $0xf1000400 // subs x0, x0, #1
+ BNE mulSve_10x8Xor_loop
+
+mulSve_10x8Xor_end:
+ RET
+
+// func mulSve_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
+TEXT ·mulSve_10x9(SB), NOSPLIT, $8-88
+ WORD $0x25d8e3e0 // ptrue p0.d
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 194 YMM used
+ MOVD n+80(FP), R0
+ MOVD matrix_base+0(FP), R2
+ WORD $0xd345fc00 // lsr x0, x0, #5
+ WORD $0xea00001f // tst x0, x0
+ BEQ mulSve_10x9_end
+ MOVD in_base+24(FP), R3
+ MOVD (R3), R1
+ MOVD 24(R3), R4
+ MOVD 48(R3), R5
+ MOVD 72(R3), R8
+ MOVD 96(R3), R9
+ MOVD 120(R3), R10
+ MOVD 144(R3), R11
+ MOVD 168(R3), R12
+ MOVD 192(R3), R13
+ MOVD 216(R3), R3
+ MOVD out_base+48(FP), R14
+ MOVD start+72(FP), R15
+
+ // Add start offset to input
+ WORD $0x8b0f0021 // add x1, x1, x15
+ WORD $0x8b0f0084 // add x4, x4, x15
+ WORD $0x8b0f00a5 // add x5, x5, x15
+ WORD $0x8b0f0108 // add x8, x8, x15
+ WORD $0x8b0f0129 // add x9, x9, x15
+ WORD $0x8b0f014a // add x10, x10, x15
+ WORD $0x8b0f016b // add x11, x11, x15
+ WORD $0x8b0f018c // add x12, x12, x15
+ WORD $0x8b0f01ad // add x13, x13, x15
+ WORD $0x8b0f0063 // add x3, x3, x15
+ WORD $0xd343fdef // lsr x15, x15, #3
+ WORD $0xd28001e6 // mov x6, #15
+ WORD $0x05e038c9 // mov z9.d, x6
+ WORD $0x05212129 // dup z9.b, z9.b[0]
+
+ // Load number of input shards
+ MOVD in_len+32(FP), R16
+
+mulSve_10x9_loop:
+ // Load and process 32 bytes from input 0 to 9 outputs
+ WORD $0x8580402c // ldr z12, [x1]
+ WORD $0x91008021 // add x1, x1, #32
+ WORD $0x04fc958d // lsr z13.d, z12.d, #4
+ WORD $0x0429318c // and z12.d, z12.d, z9.d
+ WORD $0x042931ad // and z13.d, z13.d, z9.d
+ WORD $0x8580404a // ldr z10, [x2]
+ WORD $0x8580444b // ldr z11, [x2, #1, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3160 // eor z0.d, z11.d, z10.d
+ WORD $0x8580484a // ldr z10, [x2, #2, MUL VL]
+ WORD $0x85804c4b // ldr z11, [x2, #3, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3161 // eor z1.d, z11.d, z10.d
+ WORD $0x8580504a // ldr z10, [x2, #4, MUL VL]
+ WORD $0x8580544b // ldr z11, [x2, #5, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3162 // eor z2.d, z11.d, z10.d
+ WORD $0x8580584a // ldr z10, [x2, #6, MUL VL]
+ WORD $0x85805c4b // ldr z11, [x2, #7, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3163 // eor z3.d, z11.d, z10.d
+ WORD $0x8581404a // ldr z10, [x2, #8, MUL VL]
+ WORD $0x8581444b // ldr z11, [x2, #9, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3164 // eor z4.d, z11.d, z10.d
+ WORD $0x8581484a // ldr z10, [x2, #10, MUL VL]
+ WORD $0x85814c4b // ldr z11, [x2, #11, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3165 // eor z5.d, z11.d, z10.d
+ WORD $0x8581504a // ldr z10, [x2, #12, MUL VL]
+ WORD $0x8581544b // ldr z11, [x2, #13, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3166 // eor z6.d, z11.d, z10.d
+ WORD $0x8581584a // ldr z10, [x2, #14, MUL VL]
+ WORD $0x85815c4b // ldr z11, [x2, #15, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3167 // eor z7.d, z11.d, z10.d
+ WORD $0x8582404a // ldr z10, [x2, #16, MUL VL]
+ WORD $0x8582444b // ldr z11, [x2, #17, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3168 // eor z8.d, z11.d, z10.d
+ // Check for early termination
+ CMP $1, R16
+ BEQ mulSve_10x9_store
+
+ // Load and process 32 bytes from input 1 to 9 outputs
+ WORD $0x8580408c // ldr z12, [x4]
+ WORD $0x91008084 // add x4, x4, #32
+ WORD $0x04fc958d // lsr z13.d, z12.d, #4
+ WORD $0x0429318c // and z12.d, z12.d, z9.d
+ WORD $0x042931ad // and z13.d, z13.d, z9.d
+ WORD $0x8582484a // ldr z10, [x2, #18, MUL VL]
+ WORD $0x85824c4b // ldr z11, [x2, #19, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+ WORD $0x8582504a // ldr z10, [x2, #20, MUL VL]
+ WORD $0x8582544b // ldr z11, [x2, #21, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+ WORD $0x8582584a // ldr z10, [x2, #22, MUL VL]
+ WORD $0x85825c4b // ldr z11, [x2, #23, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+ WORD $0x8583404a // ldr z10, [x2, #24, MUL VL]
+ WORD $0x8583444b // ldr z11, [x2, #25, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+ WORD $0x8583484a // ldr z10, [x2, #26, MUL VL]
+ WORD $0x85834c4b // ldr z11, [x2, #27, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+ WORD $0x8583504a // ldr z10, [x2, #28, MUL VL]
+ WORD $0x8583544b // ldr z11, [x2, #29, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+ WORD $0x8583584a // ldr z10, [x2, #30, MUL VL]
+ WORD $0x85835c4b // ldr z11, [x2, #31, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+ WORD $0x8584404a // ldr z10, [x2, #32, MUL VL]
+ WORD $0x8584444b // ldr z11, [x2, #33, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+ WORD $0x8584484a // ldr z10, [x2, #34, MUL VL]
+ WORD $0x85844c4b // ldr z11, [x2, #35, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+ WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+ // Check for early termination
+ CMP $2, R16
+ BEQ mulSve_10x9_store
+
+ // Load and process 32 bytes from input 2 to 9 outputs
+ WORD $0x858040ac // ldr z12, [x5]
+ WORD $0x910080a5 // add x5, x5, #32
+ WORD $0x04fc958d // lsr z13.d, z12.d, #4
+ WORD $0x0429318c // and z12.d, z12.d, z9.d
+ WORD $0x042931ad // and z13.d, z13.d, z9.d
+ WORD $0x8584504a // ldr z10, [x2, #36, MUL VL]
+ WORD $0x8584544b // ldr z11, [x2, #37, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+ WORD $0x8584584a // ldr z10, [x2, #38, MUL VL]
+ WORD $0x85845c4b // ldr z11, [x2, #39, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+ WORD $0x8585404a // ldr z10, [x2, #40, MUL VL]
+ WORD $0x8585444b // ldr z11, [x2, #41, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+ WORD $0x8585484a // ldr z10, [x2, #42, MUL VL]
+ WORD $0x85854c4b // ldr z11, [x2, #43, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+ WORD $0x8585504a // ldr z10, [x2, #44, MUL VL]
+ WORD $0x8585544b // ldr z11, [x2, #45, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+ WORD $0x8585584a // ldr z10, [x2, #46, MUL VL]
+ WORD $0x85855c4b // ldr z11, [x2, #47, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+ WORD $0x8586404a // ldr z10, [x2, #48, MUL VL]
+ WORD $0x8586444b // ldr z11, [x2, #49, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+ WORD $0x8586484a // ldr z10, [x2, #50, MUL VL]
+ WORD $0x85864c4b // ldr z11, [x2, #51, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+ WORD $0x8586504a // ldr z10, [x2, #52, MUL VL]
+ WORD $0x8586544b // ldr z11, [x2, #53, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+ WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+ // Check for early termination
+ CMP $3, R16
+ BEQ mulSve_10x9_store
+
+ // Load and process 32 bytes from input 3 to 9 outputs
+ WORD $0x8580410c // ldr z12, [x8]
+ WORD $0x91008108 // add x8, x8, #32
+ WORD $0x04fc958d // lsr z13.d, z12.d, #4
+ WORD $0x0429318c // and z12.d, z12.d, z9.d
+ WORD $0x042931ad // and z13.d, z13.d, z9.d
+ WORD $0x8586584a // ldr z10, [x2, #54, MUL VL]
+ WORD $0x85865c4b // ldr z11, [x2, #55, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+ WORD $0x8587404a // ldr z10, [x2, #56, MUL VL]
+ WORD $0x8587444b // ldr z11, [x2, #57, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+ WORD $0x8587484a // ldr z10, [x2, #58, MUL VL]
+ WORD $0x85874c4b // ldr z11, [x2, #59, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+ WORD $0x8587504a // ldr z10, [x2, #60, MUL VL]
+ WORD $0x8587544b // ldr z11, [x2, #61, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+ WORD $0x8587584a // ldr z10, [x2, #62, MUL VL]
+ WORD $0x85875c4b // ldr z11, [x2, #63, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+ WORD $0x8588404a // ldr z10, [x2, #64, MUL VL]
+ WORD $0x8588444b // ldr z11, [x2, #65, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+ WORD $0x8588484a // ldr z10, [x2, #66, MUL VL]
+ WORD $0x85884c4b // ldr z11, [x2, #67, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+ WORD $0x8588504a // ldr z10, [x2, #68, MUL VL]
+ WORD $0x8588544b // ldr z11, [x2, #69, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+ WORD $0x8588584a // ldr z10, [x2, #70, MUL VL]
+ WORD $0x85885c4b // ldr z11, [x2, #71, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+ WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+ // Check for early termination
+ CMP $4, R16
+ BEQ mulSve_10x9_store
+
+ // Load and process 32 bytes from input 4 to 9 outputs
+ WORD $0x8580412c // ldr z12, [x9]
+ WORD $0x91008129 // add x9, x9, #32
+ WORD $0x04fc958d // lsr z13.d, z12.d, #4
+ WORD $0x0429318c // and z12.d, z12.d, z9.d
+ WORD $0x042931ad // and z13.d, z13.d, z9.d
+ WORD $0x8589404a // ldr z10, [x2, #72, MUL VL]
+ WORD $0x8589444b // ldr z11, [x2, #73, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+ WORD $0x8589484a // ldr z10, [x2, #74, MUL VL]
+ WORD $0x85894c4b // ldr z11, [x2, #75, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+ WORD $0x8589504a // ldr z10, [x2, #76, MUL VL]
+ WORD $0x8589544b // ldr z11, [x2, #77, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+ WORD $0x8589584a // ldr z10, [x2, #78, MUL VL]
+ WORD $0x85895c4b // ldr z11, [x2, #79, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+ WORD $0x858a404a // ldr z10, [x2, #80, MUL VL]
+ WORD $0x858a444b // ldr z11, [x2, #81, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+ WORD $0x858a484a // ldr z10, [x2, #82, MUL VL]
+ WORD $0x858a4c4b // ldr z11, [x2, #83, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+ WORD $0x858a504a // ldr z10, [x2, #84, MUL VL]
+ WORD $0x858a544b // ldr z11, [x2, #85, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+ WORD $0x858a584a // ldr z10, [x2, #86, MUL VL]
+ WORD $0x858a5c4b // ldr z11, [x2, #87, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+ WORD $0x858b404a // ldr z10, [x2, #88, MUL VL]
+ WORD $0x858b444b // ldr z11, [x2, #89, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+ WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+ // Check for early termination
+ CMP $5, R16
+ BEQ mulSve_10x9_store
+
+ // Load and process 32 bytes from input 5 to 9 outputs
+ WORD $0x8580414c // ldr z12, [x10]
+ WORD $0x9100814a // add x10, x10, #32
+ WORD $0x04fc958d // lsr z13.d, z12.d, #4
+ WORD $0x0429318c // and z12.d, z12.d, z9.d
+ WORD $0x042931ad // and z13.d, z13.d, z9.d
+ WORD $0x858b484a // ldr z10, [x2, #90, MUL VL]
+ WORD $0x858b4c4b // ldr z11, [x2, #91, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+ WORD $0x858b504a // ldr z10, [x2, #92, MUL VL]
+ WORD $0x858b544b // ldr z11, [x2, #93, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+ WORD $0x858b584a // ldr z10, [x2, #94, MUL VL]
+ WORD $0x858b5c4b // ldr z11, [x2, #95, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+ WORD $0x858c404a // ldr z10, [x2, #96, MUL VL]
+ WORD $0x858c444b // ldr z11, [x2, #97, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+ WORD $0x858c484a // ldr z10, [x2, #98, MUL VL]
+ WORD $0x858c4c4b // ldr z11, [x2, #99, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+ WORD $0x858c504a // ldr z10, [x2, #100, MUL VL]
+ WORD $0x858c544b // ldr z11, [x2, #101, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+ WORD $0x858c584a // ldr z10, [x2, #102, MUL VL]
+ WORD $0x858c5c4b // ldr z11, [x2, #103, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+ WORD $0x858d404a // ldr z10, [x2, #104, MUL VL]
+ WORD $0x858d444b // ldr z11, [x2, #105, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+ WORD $0x858d484a // ldr z10, [x2, #106, MUL VL]
+ WORD $0x858d4c4b // ldr z11, [x2, #107, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+ WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+ // Check for early termination
+ CMP $6, R16
+ BEQ mulSve_10x9_store
+
+ // Load and process 32 bytes from input 6 to 9 outputs
+ WORD $0x8580416c // ldr z12, [x11]
+ WORD $0x9100816b // add x11, x11, #32
+ WORD $0x04fc958d // lsr z13.d, z12.d, #4
+ WORD $0x0429318c // and z12.d, z12.d, z9.d
+ WORD $0x042931ad // and z13.d, z13.d, z9.d
+ WORD $0x858d504a // ldr z10, [x2, #108, MUL VL]
+ WORD $0x858d544b // ldr z11, [x2, #109, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+ WORD $0x858d584a // ldr z10, [x2, #110, MUL VL]
+ WORD $0x858d5c4b // ldr z11, [x2, #111, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+ WORD $0x858e404a // ldr z10, [x2, #112, MUL VL]
+ WORD $0x858e444b // ldr z11, [x2, #113, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+ WORD $0x858e484a // ldr z10, [x2, #114, MUL VL]
+ WORD $0x858e4c4b // ldr z11, [x2, #115, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+ WORD $0x858e504a // ldr z10, [x2, #116, MUL VL]
+ WORD $0x858e544b // ldr z11, [x2, #117, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+ WORD $0x858e584a // ldr z10, [x2, #118, MUL VL]
+ WORD $0x858e5c4b // ldr z11, [x2, #119, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+ WORD $0x858f404a // ldr z10, [x2, #120, MUL VL]
+ WORD $0x858f444b // ldr z11, [x2, #121, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+ WORD $0x858f484a // ldr z10, [x2, #122, MUL VL]
+ WORD $0x858f4c4b // ldr z11, [x2, #123, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+ WORD $0x858f504a // ldr z10, [x2, #124, MUL VL]
+ WORD $0x858f544b // ldr z11, [x2, #125, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+ WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+ // Check for early termination
+ CMP $7, R16
+ BEQ mulSve_10x9_store
+
+ // Load and process 32 bytes from input 7 to 9 outputs
+ WORD $0x8580418c // ldr z12, [x12]
+ WORD $0x9100818c // add x12, x12, #32
+ WORD $0x04fc958d // lsr z13.d, z12.d, #4
+ WORD $0x0429318c // and z12.d, z12.d, z9.d
+ WORD $0x042931ad // and z13.d, z13.d, z9.d
+ WORD $0x858f584a // ldr z10, [x2, #126, MUL VL]
+ WORD $0x858f5c4b // ldr z11, [x2, #127, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+ WORD $0x8590404a // ldr z10, [x2, #128, MUL VL]
+ WORD $0x8590444b // ldr z11, [x2, #129, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+ WORD $0x8590484a // ldr z10, [x2, #130, MUL VL]
+ WORD $0x85904c4b // ldr z11, [x2, #131, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+ WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+ WORD $0x8590504a // ldr z10, [x2, #132, MUL VL]
+ WORD $0x8590544b // ldr z11, [x2, #133, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+ WORD $0x8590584a // ldr z10, [x2, #134, MUL VL]
+ WORD $0x85905c4b // ldr z11, [x2, #135, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+ WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+ WORD $0x8591404a // ldr z10, [x2, #136, MUL VL]
+ WORD $0x8591444b // ldr z11, [x2, #137, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+ WORD $0x8591484a // ldr z10, [x2, #138, MUL VL]
+ WORD $0x85914c4b // ldr z11, [x2, #139, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+ WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+ WORD $0x8591504a // ldr z10, [x2, #140, MUL VL]
+ WORD $0x8591544b // ldr z11, [x2, #141, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+ WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+ WORD $0x8591584a // ldr z10, [x2, #142, MUL VL]
+ WORD $0x85915c4b // ldr z11, [x2, #143, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+ WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+ // Check for early termination
+ CMP $8, R16
+ BEQ mulSve_10x9_store
+
+ // Load and process 32 bytes from input 8 to 9 outputs
+ WORD $0x858041ac // ldr z12, [x13]
+ WORD $0x910081ad // add x13, x13, #32
+ WORD $0x04fc958d // lsr z13.d, z12.d, #4
+ WORD $0x0429318c // and z12.d, z12.d, z9.d
+ WORD $0x042931ad // and z13.d, z13.d, z9.d
+ WORD $0x8592404a // ldr z10, [x2, #144, MUL VL]
+ WORD $0x8592444b // ldr z11, [x2, #145, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+ WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+ WORD $0x8592484a // ldr z10, [x2, #146, MUL VL]
+ WORD $0x85924c4b // ldr z11, [x2, #147, MUL VL]
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+ WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+ WORD $0x8592504a // ldr z10, [x2, #148, MUL VL]
+ WORD $0x8592544b // ldr z11, [x2, #149, MUL
VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8592584a // ldr z10, [x2, #150, MUL VL] + WORD $0x85925c4b // ldr z11, [x2, #151, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8593404a // ldr z10, [x2, #152, MUL VL] + WORD $0x8593444b // ldr z11, [x2, #153, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8593484a // ldr z10, [x2, #154, MUL VL] + WORD $0x85934c4b // ldr z11, [x2, #155, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8593504a // ldr z10, [x2, #156, MUL VL] + WORD $0x8593544b // ldr z11, [x2, #157, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8593584a // ldr z10, [x2, #158, MUL VL] + WORD $0x85935c4b // ldr z11, [x2, #159, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8594404a // ldr z10, [x2, #160, MUL VL] + WORD $0x8594444b // ldr z11, [x2, #161, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 9 to 9 outputs + WORD $0x8580406c // ldr z12, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8594484a // ldr z10, [x2, #162, MUL VL] + WORD $0x85944c4b // ldr z11, [x2, #163, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8594504a // ldr z10, [x2, #164, MUL VL] + WORD $0x8594544b // ldr z11, [x2, #165, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8594584a // ldr z10, [x2, #166, MUL VL] + WORD $0x85945c4b // ldr z11, [x2, #167, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8595404a // ldr z10, [x2, #168, MUL VL] + WORD $0x8595444b // ldr z11, [x2, #169, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8595484a // ldr z10, [x2, #170, MUL VL] + WORD $0x85954c4b // ldr z11, [x2, #171, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, 
z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8595504a // ldr z10, [x2, #172, MUL VL] + WORD $0x8595544b // ldr z11, [x2, #173, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8595584a // ldr z10, [x2, #174, MUL VL] + WORD $0x85955c4b // ldr z11, [x2, #175, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8596404a // ldr z10, [x2, #176, MUL VL] + WORD $0x8596444b // ldr z11, [x2, #177, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8596484a // ldr z10, [x2, #178, MUL VL] + WORD $0x85964c4b // ldr z11, [x2, #179, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + +mulSve_10x9_store: + // Store 9 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + MOVD 192(R14), R6 + WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x9_loop + +mulSve_10x9_end: + RET + +// func mulSve_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x9Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 194 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x9Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c9 // mov z9.d, x6 + WORD $0x05212129 // dup z9.b, z9.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + WORD $0x8580402c // ldr z12, [x1] + WORD $0x91008021 // add x1, x1, 
+	WORD $0x04fc958d // lsr z13.d, z12.d, #4
+	WORD $0x0429318c // and z12.d, z12.d, z9.d
+	WORD $0x042931ad // and z13.d, z13.d, z9.d
+	MOVD (R14), R6
+	WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3]
+	WORD $0x8580404a // ldr z10, [x2]
+	WORD $0x8580444b // ldr z11, [x2, #1, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	MOVD 24(R14), R6
+	WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3]
+	WORD $0x8580484a // ldr z10, [x2, #2, MUL VL]
+	WORD $0x85804c4b // ldr z11, [x2, #3, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	MOVD 48(R14), R6
+	WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3]
+	WORD $0x8580504a // ldr z10, [x2, #4, MUL VL]
+	WORD $0x8580544b // ldr z11, [x2, #5, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	MOVD 72(R14), R6
+	WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3]
+	WORD $0x8580584a // ldr z10, [x2, #6, MUL VL]
+	WORD $0x85805c4b // ldr z11, [x2, #7, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	MOVD 96(R14), R6
+	WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3]
+	WORD $0x8581404a // ldr z10, [x2, #8, MUL VL]
+	WORD $0x8581444b // ldr z11, [x2, #9, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	MOVD 120(R14), R6
+	WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3]
+	WORD $0x8581484a // ldr z10, [x2, #10, MUL VL]
+	WORD $0x85814c4b // ldr z11, [x2, #11, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	MOVD 144(R14), R6
+	WORD $0xa5ef40c6 // ld1d { z6.d }, p0/z, [x6, x15, lsl #3]
+	WORD $0x8581504a // ldr z10, [x2, #12, MUL VL]
+	WORD $0x8581544b // ldr z11, [x2, #13, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	MOVD 168(R14), R6
+	WORD $0xa5ef40c7 // ld1d { z7.d }, p0/z, [x6, x15, lsl #3]
+	WORD $0x8581584a // ldr z10, [x2, #14, MUL VL]
+	WORD $0x85815c4b // ldr z11, [x2, #15, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	MOVD 192(R14), R6
+	WORD $0xa5ef40c8 // ld1d { z8.d }, p0/z, [x6, x15, lsl #3]
+	WORD $0x8582404a // ldr z10, [x2, #16, MUL VL]
+	WORD $0x8582444b // ldr z11, [x2, #17, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulSve_10x9Xor_store
+
+	// Load and process 32 bytes from input 1 to 9 outputs
+	WORD $0x8580408c // ldr z12, [x4]
+	WORD $0x91008084 // add x4, x4, #32
+	WORD $0x04fc958d // lsr z13.d, z12.d, #4
+	WORD $0x0429318c // and z12.d, z12.d, z9.d
+	WORD $0x042931ad // and z13.d, z13.d, z9.d
+	WORD $0x8582484a // ldr z10, [x2, #18, MUL VL]
+	WORD $0x85824c4b // ldr z11, [x2, #19, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x8582504a // ldr z10, [x2, #20, MUL VL]
+	WORD $0x8582544b // ldr z11, [x2, #21, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x8582584a // ldr z10, [x2, #22, MUL VL]
+	WORD $0x85825c4b // ldr z11, [x2, #23, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x8583404a // ldr z10, [x2, #24, MUL VL]
+	WORD $0x8583444b // ldr z11, [x2, #25, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x8583484a // ldr z10, [x2, #26, MUL VL]
+	WORD $0x85834c4b // ldr z11, [x2, #27, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x8583504a // ldr z10, [x2, #28, MUL VL]
+	WORD $0x8583544b // ldr z11, [x2, #29, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x8583584a // ldr z10, [x2, #30, MUL VL]
+	WORD $0x85835c4b // ldr z11, [x2, #31, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x8584404a // ldr z10, [x2, #32, MUL VL]
+	WORD $0x8584444b // ldr z11, [x2, #33, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x8584484a // ldr z10, [x2, #34, MUL VL]
+	WORD $0x85844c4b // ldr z11, [x2, #35, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulSve_10x9Xor_store
+
+	// Load and process 32 bytes from input 2 to 9 outputs
+	WORD $0x858040ac // ldr z12, [x5]
+	WORD $0x910080a5 // add x5, x5, #32
+	WORD $0x04fc958d // lsr z13.d, z12.d, #4
+	WORD $0x0429318c // and z12.d, z12.d, z9.d
+	WORD $0x042931ad // and z13.d, z13.d, z9.d
+	WORD $0x8584504a // ldr z10, [x2, #36, MUL VL]
+	WORD $0x8584544b // ldr z11, [x2, #37, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x8584584a // ldr z10, [x2, #38, MUL VL]
+	WORD $0x85845c4b // ldr z11, [x2, #39, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x8585404a // ldr z10, [x2, #40, MUL VL]
+	WORD $0x8585444b // ldr z11, [x2, #41, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x8585484a // ldr z10, [x2, #42, MUL VL]
+	WORD $0x85854c4b // ldr z11, [x2, #43, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x8585504a // ldr z10, [x2, #44, MUL VL]
+	WORD $0x8585544b // ldr z11, [x2, #45, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x8585584a // ldr z10, [x2, #46, MUL VL]
+	WORD $0x85855c4b // ldr z11, [x2, #47, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x8586404a // ldr z10, [x2, #48, MUL VL]
+	WORD $0x8586444b // ldr z11, [x2, #49, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x8586484a // ldr z10, [x2, #50, MUL VL]
+	WORD $0x85864c4b // ldr z11, [x2, #51, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x8586504a // ldr z10, [x2, #52, MUL VL]
+	WORD $0x8586544b // ldr z11, [x2, #53, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulSve_10x9Xor_store
+
+	// Load and process 32 bytes from input 3 to 9 outputs
+	WORD $0x8580410c // ldr z12, [x8]
+	WORD $0x91008108 // add x8, x8, #32
+	WORD $0x04fc958d // lsr z13.d, z12.d, #4
+	WORD $0x0429318c // and z12.d, z12.d, z9.d
+	WORD $0x042931ad // and z13.d, z13.d, z9.d
+	WORD $0x8586584a // ldr z10, [x2, #54, MUL VL]
+	WORD $0x85865c4b // ldr z11, [x2, #55, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x8587404a // ldr z10, [x2, #56, MUL VL]
+	WORD $0x8587444b // ldr z11, [x2, #57, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x8587484a // ldr z10, [x2, #58, MUL VL]
+	WORD $0x85874c4b // ldr z11, [x2, #59, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x8587504a // ldr z10, [x2, #60, MUL VL]
+	WORD $0x8587544b // ldr z11, [x2, #61, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x8587584a // ldr z10, [x2, #62, MUL VL]
+	WORD $0x85875c4b // ldr z11, [x2, #63, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x8588404a // ldr z10, [x2, #64, MUL VL]
+	WORD $0x8588444b // ldr z11, [x2, #65, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x8588484a // ldr z10, [x2, #66, MUL VL]
+	WORD $0x85884c4b // ldr z11, [x2, #67, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x8588504a // ldr z10, [x2, #68, MUL VL]
+	WORD $0x8588544b // ldr z11, [x2, #69, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x8588584a // ldr z10, [x2, #70, MUL VL]
+	WORD $0x85885c4b // ldr z11, [x2, #71, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	// Check for early termination
+	CMP $4, R16
+	BEQ mulSve_10x9Xor_store
+
+	// Load and process 32 bytes from input 4 to 9 outputs
+	WORD $0x8580412c // ldr z12, [x9]
+	WORD $0x91008129 // add x9, x9, #32
+	WORD $0x04fc958d // lsr z13.d, z12.d, #4
+	WORD $0x0429318c // and z12.d, z12.d, z9.d
+	WORD $0x042931ad // and z13.d, z13.d, z9.d
+	WORD $0x8589404a // ldr z10, [x2, #72, MUL VL]
+	WORD $0x8589444b // ldr z11, [x2, #73, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x8589484a // ldr z10, [x2, #74, MUL VL]
+	WORD $0x85894c4b // ldr z11, [x2, #75, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x8589504a // ldr z10, [x2, #76, MUL VL]
+	WORD $0x8589544b // ldr z11, [x2, #77, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x8589584a // ldr z10, [x2, #78, MUL VL]
+	WORD $0x85895c4b // ldr z11, [x2, #79, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x858a404a // ldr z10, [x2, #80, MUL VL]
+	WORD $0x858a444b // ldr z11, [x2, #81, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x858a484a // ldr z10, [x2, #82, MUL VL]
+	WORD $0x858a4c4b // ldr z11, [x2, #83, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x858a504a // ldr z10, [x2, #84, MUL VL]
+	WORD $0x858a544b // ldr z11, [x2, #85, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x858a584a // ldr z10, [x2, #86, MUL VL]
+	WORD $0x858a5c4b // ldr z11, [x2, #87, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x858b404a // ldr z10, [x2, #88, MUL VL]
+	WORD $0x858b444b // ldr z11, [x2, #89, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	// Check for early termination
+	CMP $5, R16
+	BEQ mulSve_10x9Xor_store
+
+	// Load and process 32 bytes from input 5 to 9 outputs
+	WORD $0x8580414c // ldr z12, [x10]
+	WORD $0x9100814a // add x10, x10, #32
+	WORD $0x04fc958d // lsr z13.d, z12.d, #4
+	WORD $0x0429318c // and z12.d, z12.d, z9.d
+	WORD $0x042931ad // and z13.d, z13.d, z9.d
+	WORD $0x858b484a // ldr z10, [x2, #90, MUL VL]
+	WORD $0x858b4c4b // ldr z11, [x2, #91, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x858b504a // ldr z10, [x2, #92, MUL VL]
+	WORD $0x858b544b // ldr z11, [x2, #93, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x858b584a // ldr z10, [x2, #94, MUL VL]
+	WORD $0x858b5c4b // ldr z11, [x2, #95, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x858c404a // ldr z10, [x2, #96, MUL VL]
+	WORD $0x858c444b // ldr z11, [x2, #97, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x858c484a // ldr z10, [x2, #98, MUL VL]
+	WORD $0x858c4c4b // ldr z11, [x2, #99, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x858c504a // ldr z10, [x2, #100, MUL VL]
+	WORD $0x858c544b // ldr z11, [x2, #101, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x858c584a // ldr z10, [x2, #102, MUL VL]
+	WORD $0x858c5c4b // ldr z11, [x2, #103, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x858d404a // ldr z10, [x2, #104, MUL VL]
+	WORD $0x858d444b // ldr z11, [x2, #105, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x858d484a // ldr z10, [x2, #106, MUL VL]
+	WORD $0x858d4c4b // ldr z11, [x2, #107, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	// Check for early termination
+	CMP $6, R16
+	BEQ mulSve_10x9Xor_store
+
+	// Load and process 32 bytes from input 6 to 9 outputs
+	WORD $0x8580416c // ldr z12, [x11]
+	WORD $0x9100816b // add x11, x11, #32
+	WORD $0x04fc958d // lsr z13.d, z12.d, #4
+	WORD $0x0429318c // and z12.d, z12.d, z9.d
+	WORD $0x042931ad // and z13.d, z13.d, z9.d
+	WORD $0x858d504a // ldr z10, [x2, #108, MUL VL]
+	WORD $0x858d544b // ldr z11, [x2, #109, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x858d584a // ldr z10, [x2, #110, MUL VL]
+	WORD $0x858d5c4b // ldr z11, [x2, #111, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x858e404a // ldr z10, [x2, #112, MUL VL]
+	WORD $0x858e444b // ldr z11, [x2, #113, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x858e484a // ldr z10, [x2, #114, MUL VL]
+	WORD $0x858e4c4b // ldr z11, [x2, #115, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x858e504a // ldr z10, [x2, #116, MUL VL]
+	WORD $0x858e544b // ldr z11, [x2, #117, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x858e584a // ldr z10, [x2, #118, MUL VL]
+	WORD $0x858e5c4b // ldr z11, [x2, #119, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x858f404a // ldr z10, [x2, #120, MUL VL]
+	WORD $0x858f444b // ldr z11, [x2, #121, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x858f484a // ldr z10, [x2, #122, MUL VL]
+	WORD $0x858f4c4b // ldr z11, [x2, #123, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x858f504a // ldr z10, [x2, #124, MUL VL]
+	WORD $0x858f544b // ldr z11, [x2, #125, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	// Check for early termination
+	CMP $7, R16
+	BEQ mulSve_10x9Xor_store
+
+	// Load and process 32 bytes from input 7 to 9 outputs
+	WORD $0x8580418c // ldr z12, [x12]
+	WORD $0x9100818c // add x12, x12, #32
+	WORD $0x04fc958d // lsr z13.d, z12.d, #4
+	WORD $0x0429318c // and z12.d, z12.d, z9.d
+	WORD $0x042931ad // and z13.d, z13.d, z9.d
+	WORD $0x858f584a // ldr z10, [x2, #126, MUL VL]
+	WORD $0x858f5c4b // ldr z11, [x2, #127, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x8590404a // ldr z10, [x2, #128, MUL VL]
+	WORD $0x8590444b // ldr z11, [x2, #129, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x8590484a // ldr z10, [x2, #130, MUL VL]
+	WORD $0x85904c4b // ldr z11, [x2, #131, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x8590504a // ldr z10, [x2, #132, MUL VL]
+	WORD $0x8590544b // ldr z11, [x2, #133, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x8590584a // ldr z10, [x2, #134, MUL VL]
+	WORD $0x85905c4b // ldr z11, [x2, #135, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x8591404a // ldr z10, [x2, #136, MUL VL]
+	WORD $0x8591444b // ldr z11, [x2, #137, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x8591484a // ldr z10, [x2, #138, MUL VL]
+	WORD $0x85914c4b // ldr z11, [x2, #139, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x8591504a // ldr z10, [x2, #140, MUL VL]
+	WORD $0x8591544b // ldr z11, [x2, #141, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x8591584a // ldr z10, [x2, #142, MUL VL]
+	WORD $0x85915c4b // ldr z11, [x2, #143, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	// Check for early termination
+	CMP $8, R16
+	BEQ mulSve_10x9Xor_store
+
+	// Load and process 32 bytes from input 8 to 9 outputs
+	WORD $0x858041ac // ldr z12, [x13]
+	WORD $0x910081ad // add x13, x13, #32
+	WORD $0x04fc958d // lsr z13.d, z12.d, #4
+	WORD $0x0429318c // and z12.d, z12.d, z9.d
+	WORD $0x042931ad // and z13.d, z13.d, z9.d
+	WORD $0x8592404a // ldr z10, [x2, #144, MUL VL]
+	WORD $0x8592444b // ldr z11, [x2, #145, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x8592484a // ldr z10, [x2, #146, MUL VL]
+	WORD $0x85924c4b // ldr z11, [x2, #147, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x8592504a // ldr z10, [x2, #148, MUL VL]
+	WORD $0x8592544b // ldr z11, [x2, #149, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x8592584a // ldr z10, [x2, #150, MUL VL]
+	WORD $0x85925c4b // ldr z11, [x2, #151, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x8593404a // ldr z10, [x2, #152, MUL VL]
+	WORD $0x8593444b // ldr z11, [x2, #153, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x8593484a // ldr z10, [x2, #154, MUL VL]
+	WORD $0x85934c4b // ldr z11, [x2, #155, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x8593504a // ldr z10, [x2, #156, MUL VL]
+	WORD $0x8593544b // ldr z11, [x2, #157, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x8593584a // ldr z10, [x2, #158, MUL VL]
+	WORD $0x85935c4b // ldr z11, [x2, #159, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x8594404a // ldr z10, [x2, #160, MUL VL]
+	WORD $0x8594444b // ldr z11, [x2, #161, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulSve_10x9Xor_store
+
+	// Load and process 32 bytes from input 9 to 9 outputs
+	WORD $0x8580406c // ldr z12, [x3]
+	WORD $0x91008063 // add x3, x3, #32
+	WORD $0x04fc958d // lsr z13.d, z12.d, #4
+	WORD $0x0429318c // and z12.d, z12.d, z9.d
+	WORD $0x042931ad // and z13.d, z13.d, z9.d
+	WORD $0x8594484a // ldr z10, [x2, #162, MUL VL]
+	WORD $0x85944c4b // ldr z11, [x2, #163, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3000 // eor z0.d, z0.d, z10.d
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x8594504a // ldr z10, [x2, #164, MUL VL]
+	WORD $0x8594544b // ldr z11, [x2, #165, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x8594584a // ldr z10, [x2, #166, MUL VL]
+	WORD $0x85945c4b // ldr z11, [x2, #167, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3042 // eor z2.d, z2.d, z10.d
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x8595404a // ldr z10, [x2, #168, MUL VL]
+	WORD $0x8595444b // ldr z11, [x2, #169, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x8595484a // ldr z10, [x2, #170, MUL VL]
+	WORD $0x85954c4b // ldr z11, [x2, #171, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3084 // eor z4.d, z4.d, z10.d
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x8595504a // ldr z10, [x2, #172, MUL VL]
+	WORD $0x8595544b // ldr z11, [x2, #173, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x8595584a // ldr z10, [x2, #174, MUL VL]
+	WORD $0x85955c4b // ldr z11, [x2, #175, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x8596404a // ldr z10, [x2, #176, MUL VL]
+	WORD $0x8596444b // ldr z11, [x2, #177, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x8596484a // ldr z10, [x2, #178, MUL VL]
+	WORD $0x85964c4b // ldr z11, [x2, #179, MUL VL]
+	WORD $0x052c314a // tbl z10.b, z10.b, z12.b
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x04aa3108 // eor z8.d, z8.d, z10.d
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+
+mulSve_10x9Xor_store:
+	// Store 9 outputs
+	MOVD (R14), R6
+	WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3]
+	MOVD 24(R14), R6
+	WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3]
+	MOVD 48(R14), R6
+	WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3]
+	MOVD 72(R14), R6
+	WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3]
+	MOVD 96(R14), R6
+	WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3]
+	MOVD 120(R14), R6
+	WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3]
+	MOVD 144(R14), R6
+	WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3]
+	MOVD 168(R14), R6
+	WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3]
+	MOVD 192(R14), R6
+	WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3]
+
+	// Prepare for next loop
+	WORD $0x910011ef // add x15, x15, #4
+	WORD $0xf1000400 // subs x0, x0, #1
+	BNE mulSve_10x9Xor_loop
+
+mulSve_10x9Xor_end:
+	RET
+
+// func mulSve_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
+TEXT ·mulSve_10x10(SB), NOSPLIT, $8-88
+	WORD $0x25d8e3e0 // ptrue p0.d
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 215 YMM used
+	MOVD n+80(FP), R0
+	MOVD matrix_base+0(FP), R2
+	WORD $0xd345fc00 // lsr x0, x0, #5
+	WORD $0xea00001f // tst x0, x0
+	BEQ  mulSve_10x10_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	WORD $0x8b0f0021 // add x1, x1, x15
+	WORD $0x8b0f0084 // add x4, x4, x15
+	WORD $0x8b0f00a5 // add x5, x5, x15
+	WORD $0x8b0f0108 // add x8, x8, x15
+	WORD $0x8b0f0129 // add x9, x9, x15
+	WORD $0x8b0f014a // add x10, x10, x15
+	WORD $0x8b0f016b // add x11, x11, x15
+	WORD $0x8b0f018c // add x12, x12, x15
+	WORD $0x8b0f01ad // add x13, x13, x15
+	WORD $0x8b0f0063 // add x3, x3, x15
+	WORD $0xd343fdef // lsr x15, x15, #3
+	WORD $0xd28001e6 // mov x6, #15
+	WORD $0x05e038ca // mov z10.d, x6
+	WORD $0x0521214a // dup z10.b, z10.b[0]
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulSve_10x10_loop:
+	// Load and process 32 bytes from input 0 to 10 outputs
+	WORD $0x8580402d // ldr z13, [x1]
+	WORD $0x91008021 // add x1, x1, #32
+	WORD $0x04fc95ae // lsr z14.d, z13.d, #4
+	WORD $0x042a31ad // and z13.d, z13.d, z10.d
+	WORD $0x042a31ce // and z14.d, z14.d, z10.d
+	WORD $0x8580404b // ldr z11, [x2]
+	WORD $0x8580444c // ldr z12, [x2, #1, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3180 // eor z0.d, z12.d, z11.d
+	WORD $0x8580484b // ldr z11, [x2, #2, MUL VL]
+	WORD $0x85804c4c // ldr z12, [x2, #3, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3181 // eor z1.d, z12.d, z11.d
+	WORD $0x8580504b // ldr z11, [x2, #4, MUL VL]
+	WORD $0x8580544c // ldr z12, [x2, #5, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3182 // eor z2.d, z12.d, z11.d
+	WORD $0x8580584b // ldr z11, [x2, #6, MUL VL]
+	WORD $0x85805c4c // ldr z12, [x2, #7, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3183 // eor z3.d, z12.d, z11.d
+	WORD $0x8581404b // ldr z11, [x2, #8, MUL VL]
+	WORD $0x8581444c // ldr z12, [x2, #9, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3184 // eor z4.d, z12.d, z11.d
+	WORD $0x8581484b // ldr z11, [x2, #10, MUL VL]
+	WORD $0x85814c4c // ldr z12, [x2, #11, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3185 // eor z5.d, z12.d, z11.d
+	WORD $0x8581504b // ldr z11, [x2, #12, MUL VL]
+	WORD $0x8581544c // ldr z12, [x2, #13, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3186 // eor z6.d, z12.d, z11.d
+	WORD $0x8581584b // ldr z11, [x2, #14, MUL VL]
+	WORD $0x85815c4c // ldr z12, [x2, #15, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3187 // eor z7.d, z12.d, z11.d
+	WORD $0x8582404b // ldr z11, [x2, #16, MUL VL]
+	WORD $0x8582444c // ldr z12, [x2, #17, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3188 // eor z8.d, z12.d, z11.d
+	WORD $0x8582484b // ldr z11, [x2, #18, MUL VL]
+	WORD $0x85824c4c // ldr z12, [x2, #19, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3189 // eor z9.d, z12.d, z11.d
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulSve_10x10_store
+
+	// Load and process 32 bytes from input 1 to 10 outputs
+	WORD $0x8580408d // ldr z13, [x4]
+	WORD $0x91008084 // add x4, x4, #32
+	WORD $0x04fc95ae // lsr z14.d, z13.d, #4
+	WORD $0x042a31ad // and z13.d, z13.d, z10.d
+	WORD $0x042a31ce // and z14.d, z14.d, z10.d
+	WORD $0x8582504b // ldr z11, [x2, #20, MUL VL]
+	WORD $0x8582544c // ldr z12, [x2, #21, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x04ac3000 // eor z0.d, z0.d, z12.d
+	WORD $0x8582584b // ldr z11, [x2, #22, MUL VL]
+	WORD $0x85825c4c // ldr z12, [x2, #23, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x04ac3021 // eor z1.d, z1.d, z12.d
+	WORD $0x8583404b // ldr z11, [x2, #24, MUL VL]
+	WORD $0x8583444c // ldr z12, [x2, #25, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x04ac3042 // eor z2.d, z2.d, z12.d
+	WORD $0x8583484b // ldr z11, [x2, #26, MUL VL]
+	WORD $0x85834c4c // ldr z12, [x2, #27, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x04ac3063 // eor z3.d, z3.d, z12.d
+	WORD $0x8583504b // ldr z11, [x2, #28, MUL VL]
+	WORD $0x8583544c // ldr z12, [x2, #29, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x04ac3084 // eor z4.d, z4.d, z12.d
+	WORD $0x8583584b // ldr z11, [x2, #30, MUL VL]
+	WORD $0x85835c4c // ldr z12, [x2, #31, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d
+	WORD $0x8584404b // ldr z11, [x2, #32, MUL VL]
+	WORD $0x8584444c // ldr z12, [x2, #33, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d
+	WORD $0x8584484b // ldr z11, [x2, #34, MUL VL]
+	WORD $0x85844c4c // ldr z12, [x2, #35, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d
+	WORD $0x8584504b // ldr z11, [x2, #36, MUL VL]
+	WORD $0x8584544c // ldr z12, [x2, #37, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	WORD $0x04ac3108 // eor z8.d, z8.d, z12.d
+	WORD $0x8584584b // ldr z11, [x2, #38, MUL VL]
+	WORD $0x85845c4c // ldr z12, [x2, #39, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3129 // eor z9.d, z9.d, z11.d
+	WORD $0x04ac3129 // eor z9.d, z9.d, z12.d
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulSve_10x10_store
+
+	// Load and process 32 bytes from input 2 to 10 outputs
+	WORD $0x858040ad // ldr z13, [x5]
+	WORD $0x910080a5 // add x5, x5, #32
+	WORD $0x04fc95ae // lsr z14.d, z13.d, #4
+	WORD $0x042a31ad // and z13.d, z13.d, z10.d
+	WORD $0x042a31ce // and z14.d, z14.d, z10.d
+	WORD $0x8585404b // ldr z11, [x2, #40, MUL VL]
+	WORD $0x8585444c // ldr z12, [x2, #41, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x04ac3000 // eor z0.d, z0.d, z12.d
+	WORD $0x8585484b // ldr z11, [x2, #42, MUL VL]
+	WORD $0x85854c4c // ldr z12, [x2, #43, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x04ac3021 // eor z1.d, z1.d, z12.d
+	WORD $0x8585504b // ldr z11, [x2, #44, MUL VL]
+	WORD $0x8585544c // ldr z12, [x2, #45, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x04ac3042 // eor z2.d, z2.d, z12.d
+	WORD $0x8585584b // ldr z11, [x2, #46, MUL VL]
+	WORD $0x85855c4c // ldr z12, [x2, #47, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x04ac3063 // eor z3.d, z3.d, z12.d
+	WORD $0x8586404b // ldr z11, [x2, #48, MUL VL]
+	WORD $0x8586444c // ldr z12, [x2, #49, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x04ac3084 // eor z4.d, z4.d, z12.d
+	WORD $0x8586484b // ldr z11, [x2, #50, MUL VL]
+	WORD $0x85864c4c // ldr z12, [x2, #51, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d
+	WORD $0x8586504b // ldr z11, [x2, #52, MUL VL]
+	WORD $0x8586544c // ldr z12, [x2, #53, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d
+	WORD $0x8586584b // ldr z11, [x2, #54, MUL VL]
+	WORD $0x85865c4c // ldr z12, [x2, #55, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d
+	WORD $0x8587404b // ldr z11, [x2, #56, MUL VL]
+	WORD $0x8587444c // ldr z12, [x2, #57, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	WORD $0x04ac3108 // eor z8.d, z8.d, z12.d
+	WORD $0x8587484b // ldr z11, [x2, #58, MUL VL]
+	WORD $0x85874c4c // ldr z12, [x2, #59, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3129 // eor z9.d, z9.d, z11.d
+	WORD $0x04ac3129 // eor z9.d, z9.d, z12.d
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulSve_10x10_store
+
+	// Load and process 32 bytes from input 3 to 10 outputs
+	WORD $0x8580410d // ldr z13, [x8]
+	WORD $0x91008108 // add x8, x8, #32
+	WORD $0x04fc95ae // lsr z14.d, z13.d, #4
+	WORD $0x042a31ad // and z13.d, z13.d, z10.d
+	WORD $0x042a31ce // and z14.d, z14.d, z10.d
+	WORD $0x8587504b // ldr z11, [x2, #60, MUL VL]
+	WORD $0x8587544c // ldr z12, [x2, #61, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x04ac3000 // eor z0.d, z0.d, z12.d
+	WORD $0x8587584b // ldr z11, [x2, #62, MUL VL]
+	WORD $0x85875c4c // ldr z12, [x2, #63, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x04ac3021 // eor z1.d, z1.d, z12.d
+	WORD $0x8588404b // ldr z11, [x2, #64, MUL VL]
+	WORD $0x8588444c // ldr z12, [x2, #65, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x04ac3042 // eor z2.d, z2.d, z12.d
+	WORD $0x8588484b // ldr z11, [x2, #66, MUL VL]
+	WORD $0x85884c4c // ldr z12, [x2, #67, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x04ac3063 // eor z3.d, z3.d, z12.d
+	WORD $0x8588504b // ldr z11, [x2, #68, MUL VL]
+	WORD $0x8588544c // ldr z12, [x2, #69, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x04ac3084 // eor z4.d, z4.d, z12.d
+	WORD $0x8588584b // ldr z11, [x2, #70, MUL VL]
+	WORD $0x85885c4c // ldr z12, [x2, #71, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d
+	WORD $0x8589404b // ldr z11, [x2, #72, MUL VL]
+	WORD $0x8589444c // ldr z12, [x2, #73, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d
+	WORD $0x8589484b // ldr z11, [x2, #74, MUL VL]
+	WORD $0x85894c4c // ldr z12, [x2, #75, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d
+	WORD $0x8589504b // ldr z11, [x2, #76, MUL VL]
+	WORD $0x8589544c // ldr z12, [x2, #77, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	WORD $0x04ac3108 // eor z8.d, z8.d, z12.d
+	WORD $0x8589584b // ldr z11, [x2, #78, MUL VL]
+	WORD $0x85895c4c // ldr z12, [x2, #79, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3129 // eor z9.d, z9.d, z11.d
+	WORD $0x04ac3129 // eor z9.d, z9.d, z12.d
+	// Check for early termination
+	CMP $4, R16
+	BEQ mulSve_10x10_store
+
+	// Load and process 32 bytes from input 4 to 10 outputs
+	WORD $0x8580412d // ldr z13, [x9]
+	WORD $0x91008129 // add x9, x9, #32
+	WORD $0x04fc95ae // lsr z14.d, z13.d, #4
+	WORD $0x042a31ad // and z13.d, z13.d, z10.d
+	WORD $0x042a31ce // and z14.d, z14.d, z10.d
+	WORD $0x858a404b // ldr z11, [x2, #80, MUL VL]
+	WORD $0x858a444c // ldr z12, [x2, #81, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x04ac3000 // eor z0.d, z0.d, z12.d
+	WORD $0x858a484b // ldr z11, [x2, #82, MUL VL]
+	WORD $0x858a4c4c // ldr z12, [x2, #83, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x04ac3021 // eor z1.d, z1.d, z12.d
+	WORD $0x858a504b // ldr z11, [x2, #84, MUL VL]
+	WORD $0x858a544c // ldr z12, [x2, #85, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x04ac3042 // eor z2.d, z2.d, z12.d
+	WORD $0x858a584b // ldr z11, [x2, #86, MUL VL]
+	WORD $0x858a5c4c // ldr z12, [x2, #87, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x04ac3063 // eor z3.d, z3.d, z12.d
+	WORD $0x858b404b // ldr z11, [x2, #88, MUL VL]
+	WORD $0x858b444c // ldr z12, [x2, #89, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x04ac3084 // eor z4.d, z4.d, z12.d
+	WORD $0x858b484b // ldr z11, [x2, #90, MUL VL]
+	WORD $0x858b4c4c // ldr z12, [x2, #91, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d
+	WORD $0x858b504b // ldr z11, [x2, #92, MUL VL]
+	WORD $0x858b544c // ldr z12, [x2, #93, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d
+	WORD $0x858b584b // ldr z11, [x2, #94, MUL VL]
+	WORD $0x858b5c4c // ldr z12, [x2, #95, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d
+	WORD $0x858c404b // ldr z11, [x2, #96, MUL VL]
+	WORD $0x858c444c // ldr z12, [x2, #97, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	WORD $0x04ac3108 // eor z8.d, z8.d, z12.d
+	WORD $0x858c484b // ldr z11, [x2, #98, MUL VL]
+	WORD $0x858c4c4c // ldr z12, [x2, #99, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3129 // eor z9.d, z9.d, z11.d
+	WORD $0x04ac3129 // eor z9.d, z9.d, z12.d
+	// Check for early termination
+	CMP $5, R16
+	BEQ mulSve_10x10_store
+
+	// Load and process 32 bytes from input 5 to 10 outputs
+	WORD $0x8580414d // ldr z13, [x10]
+	WORD $0x9100814a // add x10, x10, #32
+	WORD $0x04fc95ae // lsr z14.d, z13.d, #4
+	WORD $0x042a31ad // and z13.d, z13.d, z10.d
+	WORD $0x042a31ce // and z14.d, z14.d, z10.d
+	WORD $0x858c504b // ldr z11, [x2, #100, MUL VL]
+	WORD $0x858c544c // ldr z12, [x2, #101, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x04ac3000 // eor z0.d, z0.d, z12.d
+	WORD $0x858c584b // ldr z11, [x2, #102, MUL VL]
+	WORD $0x858c5c4c // ldr z12, [x2, #103, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x04ac3021 // eor z1.d, z1.d, z12.d
+	WORD $0x858d404b // ldr z11, [x2, #104, MUL VL]
+	WORD $0x858d444c // ldr z12, [x2, #105, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x04ac3042 // eor z2.d, z2.d, z12.d
+	WORD $0x858d484b // ldr z11, [x2, #106, MUL VL]
+	WORD $0x858d4c4c // ldr z12, [x2, #107, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x04ac3063 // eor z3.d, z3.d, z12.d
+	WORD $0x858d504b // ldr z11, [x2, #108, MUL VL]
+	WORD $0x858d544c // ldr z12, [x2, #109, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x04ac3084 // eor z4.d, z4.d, z12.d
+	WORD $0x858d584b // ldr z11, [x2, #110, MUL VL]
+	WORD $0x858d5c4c // ldr z12, [x2, #111, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d
+	WORD $0x858e404b // ldr z11, [x2, #112, MUL VL]
+	WORD $0x858e444c // ldr z12, [x2, #113, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d
+	WORD $0x858e484b // ldr z11, [x2, #114, MUL VL]
+	WORD $0x858e4c4c // ldr z12, [x2, #115, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d
+	WORD $0x858e504b // ldr z11, [x2, #116, MUL VL]
+	WORD $0x858e544c // ldr z12, [x2, #117, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	WORD $0x04ac3108 // eor z8.d, z8.d, z12.d
+	WORD $0x858e584b // ldr z11, [x2, #118, MUL VL]
+	WORD $0x858e5c4c // ldr z12, [x2, #119, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3129 // eor z9.d, z9.d, z11.d
+	WORD $0x04ac3129 // eor z9.d, z9.d, z12.d
+	// Check for early termination
+	CMP $6, R16
+	BEQ mulSve_10x10_store
+
+	// Load and process 32 bytes from input 6 to 10 outputs
+	WORD $0x8580416d // ldr z13, [x11]
+	WORD $0x9100816b // add x11, x11, #32
+	WORD $0x04fc95ae // lsr z14.d, z13.d, #4
+	WORD $0x042a31ad // and z13.d, z13.d, z10.d
+	WORD $0x042a31ce // and z14.d, z14.d, z10.d
+	WORD $0x858f404b // ldr z11, [x2, #120, MUL VL]
+	WORD $0x858f444c // ldr z12, [x2, #121, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x04ac3000 // eor z0.d, z0.d, z12.d
+	WORD $0x858f484b // ldr z11, [x2, #122, MUL VL]
+	WORD $0x858f4c4c // ldr z12, [x2, #123, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3021 // eor z1.d, z1.d, z11.d
+	WORD $0x04ac3021 // eor z1.d, z1.d, z12.d
+	WORD $0x858f504b // ldr z11, [x2, #124, MUL VL]
+	WORD $0x858f544c // ldr z12, [x2, #125, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3042 // eor z2.d, z2.d, z11.d
+	WORD $0x04ac3042 // eor z2.d, z2.d, z12.d
+	WORD $0x858f584b // ldr z11, [x2, #126, MUL VL]
+	WORD $0x858f5c4c // ldr z12, [x2, #127, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3063 // eor z3.d, z3.d, z11.d
+	WORD $0x04ac3063 // eor z3.d, z3.d, z12.d
+	WORD $0x8590404b // ldr z11, [x2, #128, MUL VL]
+	WORD $0x8590444c // ldr z12, [x2, #129, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3084 // eor z4.d, z4.d, z11.d
+	WORD $0x04ac3084 // eor z4.d, z4.d, z12.d
+	WORD $0x8590484b // ldr z11, [x2, #130, MUL VL]
+	WORD $0x85904c4c // ldr z12, [x2, #131, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d
+	WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d
+	WORD $0x8590504b // ldr z11, [x2, #132, MUL VL]
+	WORD $0x8590544c // ldr z12, [x2, #133, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d
+	WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d
+	WORD $0x8590584b // ldr z11, [x2, #134, MUL VL]
+	WORD $0x85905c4c // ldr z12, [x2, #135, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d
+	WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d
+	WORD $0x8591404b // ldr z11, [x2, #136, MUL VL]
+	WORD $0x8591444c // ldr z12, [x2, #137, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3108 // eor z8.d, z8.d, z11.d
+	WORD $0x04ac3108 // eor z8.d, z8.d, z12.d
+	WORD $0x8591484b // ldr z11, [x2, #138, MUL VL]
+	WORD $0x85914c4c // ldr z12, [x2, #139, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3129 // eor z9.d, z9.d, z11.d
+	WORD $0x04ac3129 // eor z9.d, z9.d, z12.d
+	// Check for early termination
+	CMP $7, R16
+	BEQ mulSve_10x10_store
+
+	// Load and process 32 bytes from input 7 to 10 outputs
+	WORD $0x8580418d // ldr z13, [x12]
+	WORD $0x9100818c // add x12, x12, #32
+	WORD $0x04fc95ae // lsr z14.d, z13.d, #4
+	WORD $0x042a31ad // and z13.d, z13.d, z10.d
+	WORD $0x042a31ce // and z14.d, z14.d, z10.d
+	WORD $0x8591504b // ldr z11, [x2, #140, MUL VL]
+	WORD $0x8591544c // ldr z12, [x2, #141, MUL VL]
+	WORD $0x052d316b // tbl z11.b, z11.b, z13.b
+	WORD $0x052e318c // tbl z12.b, z12.b, z14.b
+	WORD $0x04ab3000 // eor z0.d, z0.d, z11.d
+	WORD $0x04ac3000 // eor z0.d, z0.d, z12.d
+	WORD $0x8591584b //
ldr z11, [x2, #142, MUL VL] + WORD $0x85915c4c // ldr z12, [x2, #143, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8592404b // ldr z11, [x2, #144, MUL VL] + WORD $0x8592444c // ldr z12, [x2, #145, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8592484b // ldr z11, [x2, #146, MUL VL] + WORD $0x85924c4c // ldr z12, [x2, #147, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8592504b // ldr z11, [x2, #148, MUL VL] + WORD $0x8592544c // ldr z12, [x2, #149, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8592584b // ldr z11, [x2, #150, MUL VL] + WORD $0x85925c4c // ldr z12, [x2, #151, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8593404b // ldr z11, [x2, #152, MUL VL] + WORD $0x8593444c // ldr z12, [x2, #153, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8593484b // ldr z11, [x2, #154, MUL VL] + WORD $0x85934c4c // ldr z12, [x2, #155, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8593504b // ldr z11, [x2, #156, MUL VL] + WORD $0x8593544c // ldr z12, [x2, #157, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8593584b // ldr z11, [x2, #158, MUL VL] + WORD $0x85935c4c // ldr z12, [x2, #159, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 8 to 10 outputs + WORD $0x858041ad // ldr z13, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8594404b // ldr z11, [x2, #160, MUL VL] + WORD $0x8594444c // ldr z12, [x2, #161, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8594484b // ldr z11, [x2, #162, MUL VL] + WORD $0x85944c4c // ldr z12, [x2, #163, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8594504b // ldr z11, [x2, #164, MUL VL] + WORD $0x8594544c // ldr z12, [x2, #165, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD 
$0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8594584b // ldr z11, [x2, #166, MUL VL] + WORD $0x85945c4c // ldr z12, [x2, #167, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8595404b // ldr z11, [x2, #168, MUL VL] + WORD $0x8595444c // ldr z12, [x2, #169, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8595484b // ldr z11, [x2, #170, MUL VL] + WORD $0x85954c4c // ldr z12, [x2, #171, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8595504b // ldr z11, [x2, #172, MUL VL] + WORD $0x8595544c // ldr z12, [x2, #173, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8595584b // ldr z11, [x2, #174, MUL VL] + WORD $0x85955c4c // ldr z12, [x2, #175, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8596404b // ldr z11, [x2, #176, MUL VL] + WORD $0x8596444c // ldr z12, [x2, #177, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8596484b // ldr z11, [x2, #178, MUL VL] + WORD $0x85964c4c // ldr z12, [x2, #179, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 9 to 10 outputs + WORD $0x8580406d // ldr z13, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8596504b // ldr z11, [x2, #180, MUL VL] + WORD $0x8596544c // ldr z12, [x2, #181, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8596584b // ldr z11, [x2, #182, MUL VL] + WORD $0x85965c4c // ldr z12, [x2, #183, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8597404b // ldr z11, [x2, #184, MUL VL] + WORD $0x8597444c // ldr z12, [x2, #185, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8597484b // ldr z11, [x2, #186, MUL VL] + WORD $0x85974c4c // ldr z12, [x2, #187, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD 
$0x8597504b // ldr z11, [x2, #188, MUL VL] + WORD $0x8597544c // ldr z12, [x2, #189, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8597584b // ldr z11, [x2, #190, MUL VL] + WORD $0x85975c4c // ldr z12, [x2, #191, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8598404b // ldr z11, [x2, #192, MUL VL] + WORD $0x8598444c // ldr z12, [x2, #193, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8598484b // ldr z11, [x2, #194, MUL VL] + WORD $0x85984c4c // ldr z12, [x2, #195, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8598504b // ldr z11, [x2, #196, MUL VL] + WORD $0x8598544c // ldr z12, [x2, #197, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8598584b // ldr z11, [x2, #198, MUL VL] + WORD $0x85985c4c // ldr z12, [x2, #199, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + +mulSve_10x10_store: + // Store 10 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + MOVD 192(R14), R6 + WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] + MOVD 216(R14), R6 + WORD $0xe5ef40c9 // st1d { z9.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x10_loop + +mulSve_10x10_end: + RET + +// func mulSve_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x10Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 215 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x10Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD 
$0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038ca // mov z10.d, x6 + WORD $0x0521214a // dup z10.b, z10.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + WORD $0x8580402d // ldr z13, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580404b // ldr z11, [x2] + WORD $0x8580444c // ldr z12, [x2, #1, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580484b // ldr z11, [x2, #2, MUL VL] + WORD $0x85804c4c // ldr z12, [x2, #3, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580504b // ldr z11, [x2, #4, MUL VL] + WORD $0x8580544c // ldr z12, [x2, #5, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580584b // ldr z11, [x2, #6, MUL VL] + WORD $0x85805c4c // ldr z12, [x2, #7, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581404b // ldr z11, [x2, #8, MUL VL] + WORD $0x8581444c // ldr z12, [x2, #9, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + MOVD 120(R14), R6 + WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581484b // ldr z11, [x2, #10, MUL VL] + WORD $0x85814c4c // ldr z12, [x2, #11, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + MOVD 144(R14), R6 + WORD $0xa5ef40c6 // ld1d { z6.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581504b // ldr z11, [x2, #12, MUL VL] + WORD $0x8581544c // ldr z12, [x2, #13, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + MOVD 168(R14), R6 + WORD $0xa5ef40c7 // ld1d { z7.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581584b // ldr z11, [x2, #14, MUL VL] + WORD $0x85815c4c // ldr z12, [x2, #15, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // 
eor z7.d, z7.d, z12.d + MOVD 192(R14), R6 + WORD $0xa5ef40c8 // ld1d { z8.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8582404b // ldr z11, [x2, #16, MUL VL] + WORD $0x8582444c // ldr z12, [x2, #17, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + MOVD 216(R14), R6 + WORD $0xa5ef40c9 // ld1d { z9.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8582484b // ldr z11, [x2, #18, MUL VL] + WORD $0x85824c4c // ldr z12, [x2, #19, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 1 to 10 outputs + WORD $0x8580408d // ldr z13, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8582504b // ldr z11, [x2, #20, MUL VL] + WORD $0x8582544c // ldr z12, [x2, #21, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8582584b // ldr z11, [x2, #22, MUL VL] + WORD $0x85825c4c // ldr z12, [x2, #23, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8583404b // ldr z11, [x2, #24, MUL VL] + WORD $0x8583444c // ldr z12, [x2, #25, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8583484b // ldr z11, [x2, #26, MUL VL] + WORD $0x85834c4c // ldr z12, [x2, #27, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8583504b // ldr z11, [x2, #28, MUL VL] + WORD $0x8583544c // ldr z12, [x2, #29, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8583584b // ldr z11, [x2, #30, MUL VL] + WORD $0x85835c4c // ldr z12, [x2, #31, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8584404b // ldr z11, [x2, #32, MUL VL] + WORD $0x8584444c // ldr z12, [x2, #33, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8584484b // ldr z11, [x2, #34, MUL VL] + WORD $0x85844c4c // ldr z12, [x2, #35, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8584504b // ldr z11, [x2, #36, MUL VL] + WORD $0x8584544c // ldr z12, [x2, #37, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD 
$0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8584584b // ldr z11, [x2, #38, MUL VL] + WORD $0x85845c4c // ldr z12, [x2, #39, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 2 to 10 outputs + WORD $0x858040ad // ldr z13, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8585404b // ldr z11, [x2, #40, MUL VL] + WORD $0x8585444c // ldr z12, [x2, #41, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8585484b // ldr z11, [x2, #42, MUL VL] + WORD $0x85854c4c // ldr z12, [x2, #43, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8585504b // ldr z11, [x2, #44, MUL VL] + WORD $0x8585544c // ldr z12, [x2, #45, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8585584b // ldr z11, [x2, #46, MUL VL] + WORD $0x85855c4c // ldr z12, [x2, #47, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8586404b // ldr z11, [x2, #48, MUL VL] + WORD $0x8586444c // ldr z12, [x2, #49, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8586484b // ldr z11, [x2, #50, MUL VL] + WORD $0x85864c4c // ldr z12, [x2, #51, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8586504b // ldr z11, [x2, #52, MUL VL] + WORD $0x8586544c // ldr z12, [x2, #53, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8586584b // ldr z11, [x2, #54, MUL VL] + WORD $0x85865c4c // ldr z12, [x2, #55, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8587404b // ldr z11, [x2, #56, MUL VL] + WORD $0x8587444c // ldr z12, [x2, #57, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8587484b // ldr z11, [x2, #58, MUL VL] + WORD $0x85874c4c // ldr z12, [x2, #59, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from 
input 3 to 10 outputs + WORD $0x8580410d // ldr z13, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8587504b // ldr z11, [x2, #60, MUL VL] + WORD $0x8587544c // ldr z12, [x2, #61, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8587584b // ldr z11, [x2, #62, MUL VL] + WORD $0x85875c4c // ldr z12, [x2, #63, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8588404b // ldr z11, [x2, #64, MUL VL] + WORD $0x8588444c // ldr z12, [x2, #65, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8588484b // ldr z11, [x2, #66, MUL VL] + WORD $0x85884c4c // ldr z12, [x2, #67, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8588504b // ldr z11, [x2, #68, MUL VL] + WORD $0x8588544c // ldr z12, [x2, #69, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8588584b // ldr z11, [x2, #70, MUL VL] + WORD $0x85885c4c // ldr z12, [x2, #71, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8589404b // ldr z11, [x2, #72, MUL VL] + WORD $0x8589444c // ldr z12, [x2, #73, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8589484b // ldr z11, [x2, #74, MUL VL] + WORD $0x85894c4c // ldr z12, [x2, #75, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8589504b // ldr z11, [x2, #76, MUL VL] + WORD $0x8589544c // ldr z12, [x2, #77, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8589584b // ldr z11, [x2, #78, MUL VL] + WORD $0x85895c4c // ldr z12, [x2, #79, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 4 to 10 outputs + WORD $0x8580412d // ldr z13, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858a404b // ldr z11, [x2, #80, MUL VL] + WORD $0x858a444c // ldr z12, [x2, #81, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD 
$0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858a484b // ldr z11, [x2, #82, MUL VL] + WORD $0x858a4c4c // ldr z12, [x2, #83, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858a504b // ldr z11, [x2, #84, MUL VL] + WORD $0x858a544c // ldr z12, [x2, #85, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858a584b // ldr z11, [x2, #86, MUL VL] + WORD $0x858a5c4c // ldr z12, [x2, #87, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x858b404b // ldr z11, [x2, #88, MUL VL] + WORD $0x858b444c // ldr z12, [x2, #89, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x858b484b // ldr z11, [x2, #90, MUL VL] + WORD $0x858b4c4c // ldr z12, [x2, #91, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x858b504b // ldr z11, [x2, #92, MUL VL] + WORD $0x858b544c // ldr z12, [x2, #93, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x858b584b // ldr z11, [x2, #94, MUL VL] + WORD $0x858b5c4c // ldr z12, [x2, #95, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x858c404b // ldr z11, [x2, #96, MUL VL] + WORD $0x858c444c // ldr z12, [x2, #97, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x858c484b // ldr z11, [x2, #98, MUL VL] + WORD $0x858c4c4c // ldr z12, [x2, #99, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 5 to 10 outputs + WORD $0x8580414d // ldr z13, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858c504b // ldr z11, [x2, #100, MUL VL] + WORD $0x858c544c // ldr z12, [x2, #101, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858c584b // ldr z11, [x2, #102, MUL VL] + WORD $0x858c5c4c // ldr z12, [x2, #103, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858d404b // ldr z11, [x2, #104, MUL VL] + WORD 
$0x858d444c // ldr z12, [x2, #105, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858d484b // ldr z11, [x2, #106, MUL VL] + WORD $0x858d4c4c // ldr z12, [x2, #107, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x858d504b // ldr z11, [x2, #108, MUL VL] + WORD $0x858d544c // ldr z12, [x2, #109, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x858d584b // ldr z11, [x2, #110, MUL VL] + WORD $0x858d5c4c // ldr z12, [x2, #111, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x858e404b // ldr z11, [x2, #112, MUL VL] + WORD $0x858e444c // ldr z12, [x2, #113, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x858e484b // ldr z11, [x2, #114, MUL VL] + WORD $0x858e4c4c // ldr z12, [x2, #115, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x858e504b // ldr z11, [x2, #116, MUL VL] + WORD $0x858e544c // ldr z12, [x2, #117, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x858e584b // ldr z11, [x2, #118, MUL VL] + WORD $0x858e5c4c // ldr z12, [x2, #119, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 6 to 10 outputs + WORD $0x8580416d // ldr z13, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858f404b // ldr z11, [x2, #120, MUL VL] + WORD $0x858f444c // ldr z12, [x2, #121, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858f484b // ldr z11, [x2, #122, MUL VL] + WORD $0x858f4c4c // ldr z12, [x2, #123, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858f504b // ldr z11, [x2, #124, MUL VL] + WORD $0x858f544c // ldr z12, [x2, #125, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858f584b // ldr z11, [x2, #126, MUL VL] + WORD $0x858f5c4c // ldr z12, [x2, #127, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, 
z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8590404b // ldr z11, [x2, #128, MUL VL] + WORD $0x8590444c // ldr z12, [x2, #129, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8590484b // ldr z11, [x2, #130, MUL VL] + WORD $0x85904c4c // ldr z12, [x2, #131, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8590504b // ldr z11, [x2, #132, MUL VL] + WORD $0x8590544c // ldr z12, [x2, #133, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8590584b // ldr z11, [x2, #134, MUL VL] + WORD $0x85905c4c // ldr z12, [x2, #135, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8591404b // ldr z11, [x2, #136, MUL VL] + WORD $0x8591444c // ldr z12, [x2, #137, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8591484b // ldr z11, [x2, #138, MUL VL] + WORD $0x85914c4c // ldr z12, [x2, #139, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 7 to 10 outputs + WORD $0x8580418d // ldr z13, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8591504b // ldr z11, [x2, #140, MUL VL] + WORD $0x8591544c // ldr z12, [x2, #141, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8591584b // ldr z11, [x2, #142, MUL VL] + WORD $0x85915c4c // ldr z12, [x2, #143, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8592404b // ldr z11, [x2, #144, MUL VL] + WORD $0x8592444c // ldr z12, [x2, #145, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8592484b // ldr z11, [x2, #146, MUL VL] + WORD $0x85924c4c // ldr z12, [x2, #147, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8592504b // ldr z11, [x2, #148, MUL VL] + WORD $0x8592544c // ldr z12, [x2, #149, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8592584b // ldr 
z11, [x2, #150, MUL VL] + WORD $0x85925c4c // ldr z12, [x2, #151, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8593404b // ldr z11, [x2, #152, MUL VL] + WORD $0x8593444c // ldr z12, [x2, #153, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8593484b // ldr z11, [x2, #154, MUL VL] + WORD $0x85934c4c // ldr z12, [x2, #155, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8593504b // ldr z11, [x2, #156, MUL VL] + WORD $0x8593544c // ldr z12, [x2, #157, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8593584b // ldr z11, [x2, #158, MUL VL] + WORD $0x85935c4c // ldr z12, [x2, #159, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 8 to 10 outputs + WORD $0x858041ad // ldr z13, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8594404b // ldr z11, [x2, #160, MUL VL] + WORD $0x8594444c // ldr z12, [x2, #161, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8594484b // ldr z11, [x2, #162, MUL VL] + WORD $0x85944c4c // ldr z12, [x2, #163, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8594504b // ldr z11, [x2, #164, MUL VL] + WORD $0x8594544c // ldr z12, [x2, #165, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8594584b // ldr z11, [x2, #166, MUL VL] + WORD $0x85945c4c // ldr z12, [x2, #167, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8595404b // ldr z11, [x2, #168, MUL VL] + WORD $0x8595444c // ldr z12, [x2, #169, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8595484b // ldr z11, [x2, #170, MUL VL] + WORD $0x85954c4c // ldr z12, [x2, #171, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8595504b // ldr z11, [x2, #172, MUL VL] + WORD $0x8595544c // ldr z12, [x2, #173, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD 
$0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8595584b // ldr z11, [x2, #174, MUL VL] + WORD $0x85955c4c // ldr z12, [x2, #175, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8596404b // ldr z11, [x2, #176, MUL VL] + WORD $0x8596444c // ldr z12, [x2, #177, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8596484b // ldr z11, [x2, #178, MUL VL] + WORD $0x85964c4c // ldr z12, [x2, #179, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 9 to 10 outputs + WORD $0x8580406d // ldr z13, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8596504b // ldr z11, [x2, #180, MUL VL] + WORD $0x8596544c // ldr z12, [x2, #181, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8596584b // ldr z11, [x2, #182, MUL VL] + WORD $0x85965c4c // ldr z12, [x2, #183, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8597404b // ldr z11, [x2, #184, MUL VL] + WORD $0x8597444c // ldr z12, [x2, #185, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8597484b // ldr z11, [x2, #186, MUL VL] + WORD $0x85974c4c // ldr z12, [x2, #187, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8597504b // ldr z11, [x2, #188, MUL VL] + WORD $0x8597544c // ldr z12, [x2, #189, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8597584b // ldr z11, [x2, #190, MUL VL] + WORD $0x85975c4c // ldr z12, [x2, #191, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8598404b // ldr z11, [x2, #192, MUL VL] + WORD $0x8598444c // ldr z12, [x2, #193, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8598484b // ldr z11, [x2, #194, MUL VL] + WORD $0x85984c4c // ldr z12, [x2, #195, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + 
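
Annotation: the unrolled SVE bodies above are a GF(2^8) multiply-accumulate done entirely with byte-table lookups. Each matrix coefficient is expanded into two 16-entry tables (one per 4-bit nibble), each "ldr z11 / ldr z12" pair loads those tables, "tbl" performs a whole vector of lookups at once, and "eor" folds the two halves into the output accumulator. Below is a minimal scalar Go sketch of that scheme, assuming the conventional 0x11d field polynomial; the function and table names are illustrative, not taken from the generated code.

    package main

    import "fmt"

    // gfMul multiplies in GF(2^8) reduced by the polynomial 0x11d
    // (x^8+x^4+x^3+x^2+1), the field conventionally used for Reed-Solomon.
    func gfMul(a, b byte) byte {
        var p byte
        for b != 0 {
            if b&1 != 0 {
                p ^= a
            }
            carry := a&0x80 != 0
            a <<= 1
            if carry {
                a ^= 0x1d // reduce x^8 modulo the field polynomial
            }
            b >>= 1
        }
        return p
    }

    // nibbleTables precomputes the two 16-entry tables that each
    // "ldr z11 / ldr z12" pair pulls from the matrix buffer: products of
    // c with every low nibble and with every high nibble.
    func nibbleTables(c byte) (lo, hi [16]byte) {
        for n := 0; n < 16; n++ {
            lo[n] = gfMul(c, byte(n))
            hi[n] = gfMul(c, byte(n)<<4)
        }
        return
    }

    // mulAdd mirrors one tbl/tbl/eor/eor group above: out[i] ^= c*in[i],
    // with the field multiply done as two lookups joined by XOR. This
    // works because gfMul is linear in its second argument over GF(2).
    func mulAdd(c byte, in, out []byte) {
        lo, hi := nibbleTables(c)
        for i, b := range in {
            out[i] ^= lo[b&0x0f] ^ hi[b>>4]
        }
    }

    func main() {
        in := []byte{0x00, 0x01, 0x53, 0xca}
        out := make([]byte, len(in))
        mulAdd(0x1b, in, out)
        fmt.Printf("%% x\n", out) // matches gfMul(0x1b, b) byte by byte
    }
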
WORD $0x8598504b // ldr z11, [x2, #196, MUL VL] + WORD $0x8598544c // ldr z12, [x2, #197, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8598584b // ldr z11, [x2, #198, MUL VL] + WORD $0x85985c4c // ldr z12, [x2, #199, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + +mulSve_10x10Xor_store: + // Store 10 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + MOVD 192(R14), R6 + WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] + MOVD 216(R14), R6 + WORD $0xe5ef40c9 // st1d { z9.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x10Xor_loop + +mulSve_10x10Xor_end: + RET + +// func mulNeon_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVD n+80(FP), R0 + LSR $6, R0 + TST R0, R0 + BEQ mulNeon_10x1_64_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R14 + MOVD start+72(FP), R15 + + // Add start offset to output + ADD R15, R14 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + MOVD $15, R15 + VMOV R15, V4.B[0] + VDUP V4.B[0], V4.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x1_64_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 64 bytes from input 0 to 1 outputs + VLD1.P 32(R1), [V12.B16, V13.B16] + VLD1.P 32(R1), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V8.B16, V0.B16 + VEOR V7.B16, V9.B16, V1.B16 + VEOR V10.B16, V12.B16, V2.B16 + VEOR V11.B16, V13.B16, V3.B16 + // Check for early 
termination + CMP $1, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 1 to 1 outputs + VLD1.P 32(R4), [V12.B16, V13.B16] + VLD1.P 32(R4), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 2 to 1 outputs + VLD1.P 32(R5), [V12.B16, V13.B16] + VLD1.P 32(R5), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 3 to 1 outputs + VLD1.P 32(R8), [V12.B16, V13.B16] + VLD1.P 32(R8), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 
bytes from input 4 to 1 outputs + VLD1.P 32(R9), [V12.B16, V13.B16] + VLD1.P 32(R9), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 5 to 1 outputs + VLD1.P 32(R10), [V12.B16, V13.B16] + VLD1.P 32(R10), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 6 to 1 outputs + VLD1.P 32(R11), [V12.B16, V13.B16] + VLD1.P 32(R11), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 7 to 1 outputs + VLD1.P 32(R12), [V12.B16, V13.B16] + 
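
Annotation: the repeated "CMP $k, R16 / BEQ ..._store" pairs are how a single fully unrolled 10-input body serves any shard count: R16 holds in_len, and execution falls through to the store block as soon as the last real input has been folded in. The Xor variants differ only in how the accumulators are seeded (loaded from the existing output rather than initialized from input 0). A control-flow sketch of that scheduling in plain Go, under those assumptions; galMulSlice here is an illustrative stand-in, not the package's API.

    package main

    import "fmt"

    // gfMul as in the previous sketch (0x11d field polynomial).
    func gfMul(a, b byte) byte {
        var p byte
        for b != 0 {
            if b&1 != 0 {
                p ^= a
            }
            carry := a&0x80 != 0
            a <<= 1
            if carry {
                a ^= 0x1d
            }
            b >>= 1
        }
        return p
    }

    // galMulSlice is the per-input step: out = c*in when xor is false
    // (fresh accumulators, as the non-Xor kernels do for input 0), or
    // out ^= c*in when xor is true (as the Xor kernels do throughout).
    func galMulSlice(c byte, in, out []byte, xor bool) {
        for i, b := range in {
            p := gfMul(c, b)
            if xor {
                out[i] ^= p
            } else {
                out[i] = p
            }
        }
    }

    func main() {
        in := [][]byte{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}
        matrix := []byte{2, 3, 4} // one coefficient per input shard
        out := make([]byte, 4)
        // The unrolled body handles up to 10 inputs; the CMP/BEQ ladder
        // simply stops the walk once len(in) inputs are consumed.
        for k := range in {
            galMulSlice(matrix[k], in[k], out, k != 0)
        }
        fmt.Printf("%% x\n", out)
    }
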
VLD1.P 32(R12), [V10.B16, V11.B16]
+	VUSHR $4, V12.B16, V14.B16
+	VUSHR $4, V13.B16, V15.B16
+	VUSHR $4, V10.B16, V16.B16
+	VUSHR $4, V11.B16, V17.B16
+	VAND V4.B16, V12.B16, V12.B16
+	VAND V4.B16, V13.B16, V13.B16
+	VAND V4.B16, V10.B16, V10.B16
+	VAND V4.B16, V11.B16, V11.B16
+	VAND V4.B16, V14.B16, V14.B16
+	VAND V4.B16, V15.B16, V15.B16
+	VAND V4.B16, V16.B16, V16.B16
+	VAND V4.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V8.B16, V9.B16]
+	VTBL V10.B16, [V6.B16], V10.B16
+	VTBL V11.B16, [V7.B16], V11.B16
+	VTBL V12.B16, [V6.B16], V6.B16
+	VTBL V13.B16, [V7.B16], V7.B16
+	VTBL V16.B16, [V8.B16], V12.B16
+	VTBL V17.B16, [V9.B16], V13.B16
+	VTBL V14.B16, [V8.B16], V8.B16
+	VTBL V15.B16, [V9.B16], V9.B16
+	VEOR V6.B16, V0.B16, V0.B16
+	VEOR V7.B16, V1.B16, V1.B16
+	VEOR V8.B16, V0.B16, V0.B16
+	VEOR V9.B16, V1.B16, V1.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	// Check for early termination
+	CMP $8, R16
+	BEQ mulNeon_10x1_64_store
+
+	// Load and process 64 bytes from input 8 to 1 outputs
+	VLD1.P 32(R13), [V12.B16, V13.B16]
+	VLD1.P 32(R13), [V10.B16, V11.B16]
+	VUSHR $4, V12.B16, V14.B16
+	VUSHR $4, V13.B16, V15.B16
+	VUSHR $4, V10.B16, V16.B16
+	VUSHR $4, V11.B16, V17.B16
+	VAND V4.B16, V12.B16, V12.B16
+	VAND V4.B16, V13.B16, V13.B16
+	VAND V4.B16, V10.B16, V10.B16
+	VAND V4.B16, V11.B16, V11.B16
+	VAND V4.B16, V14.B16, V14.B16
+	VAND V4.B16, V15.B16, V15.B16
+	VAND V4.B16, V16.B16, V16.B16
+	VAND V4.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V8.B16, V9.B16]
+	VTBL V10.B16, [V6.B16], V10.B16
+	VTBL V11.B16, [V7.B16], V11.B16
+	VTBL V12.B16, [V6.B16], V6.B16
+	VTBL V13.B16, [V7.B16], V7.B16
+	VTBL V16.B16, [V8.B16], V12.B16
+	VTBL V17.B16, [V9.B16], V13.B16
+	VTBL V14.B16, [V8.B16], V8.B16
+	VTBL V15.B16, [V9.B16], V9.B16
+	VEOR V6.B16, V0.B16, V0.B16
+	VEOR V7.B16, V1.B16, V1.B16
+	VEOR V8.B16, V0.B16, V0.B16
+	VEOR V9.B16, V1.B16, V1.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulNeon_10x1_64_store
+
+	// Load and process 64 bytes from input 9 to 1 outputs
+	VLD1.P 32(R3), [V12.B16, V13.B16]
+	VLD1.P 32(R3), [V10.B16, V11.B16]
+	VUSHR $4, V12.B16, V14.B16
+	VUSHR $4, V13.B16, V15.B16
+	VUSHR $4, V10.B16, V16.B16
+	VUSHR $4, V11.B16, V17.B16
+	VAND V4.B16, V12.B16, V12.B16
+	VAND V4.B16, V13.B16, V13.B16
+	VAND V4.B16, V10.B16, V10.B16
+	VAND V4.B16, V11.B16, V11.B16
+	VAND V4.B16, V14.B16, V14.B16
+	VAND V4.B16, V15.B16, V15.B16
+	VAND V4.B16, V16.B16, V16.B16
+	VAND V4.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V8.B16, V9.B16]
+	VTBL V10.B16, [V6.B16], V10.B16
+	VTBL V11.B16, [V7.B16], V11.B16
+	VTBL V12.B16, [V6.B16], V6.B16
+	VTBL V13.B16, [V7.B16], V7.B16
+	VTBL V16.B16, [V8.B16], V12.B16
+	VTBL V17.B16, [V9.B16], V13.B16
+	VTBL V14.B16, [V8.B16], V8.B16
+	VTBL V15.B16, [V9.B16], V9.B16
+	VEOR V6.B16, V0.B16, V0.B16
+	VEOR V7.B16, V1.B16, V1.B16
+	VEOR V8.B16, V0.B16, V0.B16
+	VEOR V9.B16, V1.B16, V1.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+
+mulNeon_10x1_64_store:
+	// Store 1 outputs
+	VST1.P [V0.D2, V1.D2], 32(R14)
+	VST1.P [V2.D2, V3.D2], 32(R14)
+
+	// Prepare for next loop
+	SUBS $1, R0
+	BNE mulNeon_10x1_64_loop
+
+mulNeon_10x1_64_end:
+	RET
+
+// func mulNeon_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x1_64Xor(SB), $0-88
+	// Loading no tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 46 YMM used
+	MOVD n+80(FP), R0
+	LSR $6, R0
+	TST R0, R0
+	BEQ mulNeon_10x1_64Xor_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD (R14), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to output
+	ADD R15, R14
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	MOVD $15, R15
+	VMOV R15, V4.B[0]
+	VDUP V4.B[0], V4.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x1_64Xor_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load 1 outputs
+	VLD1.P 32(R14), [V0.B16, V1.B16]
+	VLD1.P 32(R14), [V2.B16, V3.B16]
+
+	// Load and process 64 bytes from input 0 to 1 outputs
+	VLD1.P 32(R1), [V12.B16, V13.B16]
+	VLD1.P 32(R1), [V10.B16, V11.B16]
+	VUSHR $4, V12.B16, V14.B16
+	VUSHR $4, V13.B16, V15.B16
+	VUSHR $4, V10.B16, V16.B16
+	VUSHR $4, V11.B16, V17.B16
+	VAND V4.B16, V12.B16, V12.B16
+	VAND V4.B16, V13.B16, V13.B16
+	VAND V4.B16, V10.B16, V10.B16
+	VAND V4.B16, V11.B16, V11.B16
+	VAND V4.B16, V14.B16, V14.B16
+	VAND V4.B16, V15.B16, V15.B16
+	VAND V4.B16, V16.B16, V16.B16
+	VAND V4.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V8.B16, V9.B16]
+	VTBL V10.B16, [V6.B16], V10.B16
+	VTBL V11.B16, [V7.B16], V11.B16
+	VTBL V12.B16, [V6.B16], V6.B16
+	VTBL V13.B16, [V7.B16], V7.B16
+	VTBL V16.B16, [V8.B16], V12.B16
+	VTBL V17.B16, [V9.B16], V13.B16
+	VTBL V14.B16, [V8.B16], V8.B16
+	VTBL V15.B16, [V9.B16], V9.B16
+	VEOR V6.B16, V0.B16, V0.B16
+	VEOR V7.B16, V1.B16, V1.B16
+	VEOR V8.B16, V0.B16, V0.B16
+	VEOR V9.B16, V1.B16, V1.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulNeon_10x1_64Xor_store
+
+	// Load and process 64 bytes from input 1 to 1 outputs
+	VLD1.P 32(R4), [V12.B16, V13.B16]
+	VLD1.P 32(R4), [V10.B16, V11.B16]
+	VUSHR $4, V12.B16, V14.B16
+	VUSHR $4, V13.B16, V15.B16
+	VUSHR $4, V10.B16, V16.B16
+	VUSHR $4, V11.B16, V17.B16
+	VAND V4.B16, V12.B16, V12.B16
+	VAND V4.B16, V13.B16, V13.B16
+	VAND V4.B16, V10.B16, V10.B16
+	VAND V4.B16, V11.B16, V11.B16
+	VAND V4.B16, V14.B16, V14.B16
+	VAND V4.B16, V15.B16, V15.B16
+	VAND V4.B16, V16.B16, V16.B16
+	VAND V4.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V8.B16, V9.B16]
+	VTBL V10.B16, [V6.B16], V10.B16
+	VTBL V11.B16, [V7.B16], V11.B16
+	VTBL V12.B16, [V6.B16], V6.B16
+	VTBL V13.B16, [V7.B16], V7.B16
+	VTBL V16.B16, [V8.B16], V12.B16
+	VTBL V17.B16, [V9.B16], V13.B16
+	VTBL V14.B16, [V8.B16], V8.B16
+	VTBL V15.B16, [V9.B16], V9.B16
+	VEOR V6.B16, V0.B16, V0.B16
+	VEOR V7.B16, V1.B16, V1.B16
+	VEOR V8.B16, V0.B16, V0.B16
+	VEOR V9.B16, V1.B16, V1.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulNeon_10x1_64Xor_store
+
+	// Load and process 64 bytes from input 2 to 1 outputs
+	VLD1.P 32(R5), [V12.B16, V13.B16]
+	VLD1.P 32(R5), [V10.B16, V11.B16]
+	VUSHR $4, V12.B16, V14.B16
+	VUSHR $4, V13.B16, V15.B16
+	VUSHR $4, V10.B16, V16.B16
+	VUSHR $4, V11.B16, V17.B16
+	VAND V4.B16, V12.B16, V12.B16
+	VAND V4.B16, V13.B16, V13.B16
+	VAND V4.B16, V10.B16, V10.B16
+	VAND V4.B16, V11.B16, V11.B16
+	VAND V4.B16, V14.B16, V14.B16
+	VAND V4.B16, V15.B16, V15.B16
+	VAND V4.B16, V16.B16, V16.B16
+	VAND V4.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V8.B16, V9.B16]
+	VTBL V10.B16, [V6.B16], V10.B16
+	VTBL V11.B16, [V7.B16], V11.B16
+	VTBL V12.B16, [V6.B16], V6.B16
+	VTBL V13.B16, [V7.B16], V7.B16
+	VTBL V16.B16, [V8.B16], V12.B16
+	VTBL V17.B16, [V9.B16], V13.B16
+	VTBL V14.B16, [V8.B16], V8.B16
+	VTBL V15.B16, [V9.B16], V9.B16
+	VEOR V6.B16, V0.B16, V0.B16
+	VEOR V7.B16, V1.B16, V1.B16
+	VEOR V8.B16, V0.B16, V0.B16
+	VEOR V9.B16, V1.B16, V1.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulNeon_10x1_64Xor_store
+
+	// Load and process 64 bytes from input 3 to 1 outputs
+	VLD1.P 32(R8), [V12.B16, V13.B16]
+	VLD1.P 32(R8), [V10.B16, V11.B16]
+	VUSHR $4, V12.B16, V14.B16
+	VUSHR $4, V13.B16, V15.B16
+	VUSHR $4, V10.B16, V16.B16
+	VUSHR $4, V11.B16, V17.B16
+	VAND V4.B16, V12.B16, V12.B16
+	VAND V4.B16, V13.B16, V13.B16
+	VAND V4.B16, V10.B16, V10.B16
+	VAND V4.B16, V11.B16, V11.B16
+	VAND V4.B16, V14.B16, V14.B16
+	VAND V4.B16, V15.B16, V15.B16
+	VAND V4.B16, V16.B16, V16.B16
+	VAND V4.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V8.B16, V9.B16]
+	VTBL V10.B16, [V6.B16], V10.B16
+	VTBL V11.B16, [V7.B16], V11.B16
+	VTBL V12.B16, [V6.B16], V6.B16
+	VTBL V13.B16, [V7.B16], V7.B16
+	VTBL V16.B16, [V8.B16], V12.B16
+	VTBL V17.B16, [V9.B16], V13.B16
+	VTBL V14.B16, [V8.B16], V8.B16
+	VTBL V15.B16, [V9.B16], V9.B16
+	VEOR V6.B16, V0.B16, V0.B16
+	VEOR V7.B16, V1.B16, V1.B16
+	VEOR V8.B16, V0.B16, V0.B16
+	VEOR V9.B16, V1.B16, V1.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	// Check for early termination
+	CMP $4, R16
+	BEQ mulNeon_10x1_64Xor_store
+
+	// Load and process 64 bytes from input 4 to 1 outputs
+	VLD1.P 32(R9), [V12.B16, V13.B16]
+	VLD1.P 32(R9), [V10.B16, V11.B16]
+	VUSHR $4, V12.B16, V14.B16
+	VUSHR $4, V13.B16, V15.B16
+	VUSHR $4, V10.B16, V16.B16
+	VUSHR $4, V11.B16, V17.B16
+	VAND V4.B16, V12.B16, V12.B16
+	VAND V4.B16, V13.B16, V13.B16
+	VAND V4.B16, V10.B16, V10.B16
+	VAND V4.B16, V11.B16, V11.B16
+	VAND V4.B16, V14.B16, V14.B16
+	VAND V4.B16, V15.B16, V15.B16
+	VAND V4.B16, V16.B16, V16.B16
+	VAND V4.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V8.B16, V9.B16]
+	VTBL V10.B16, [V6.B16], V10.B16
+	VTBL V11.B16, [V7.B16], V11.B16
+	VTBL V12.B16, [V6.B16], V6.B16
+	VTBL V13.B16, [V7.B16], V7.B16
+	VTBL V16.B16, [V8.B16], V12.B16
+	VTBL V17.B16, [V9.B16], V13.B16
+	VTBL V14.B16, [V8.B16], V8.B16
+	VTBL V15.B16, [V9.B16], V9.B16
+	VEOR V6.B16, V0.B16, V0.B16
+	VEOR V7.B16, V1.B16, V1.B16
+	VEOR V8.B16, V0.B16, V0.B16
+	VEOR V9.B16, V1.B16, V1.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	// Check for early termination
+	CMP $5, R16
+	BEQ mulNeon_10x1_64Xor_store
+
+	// Load and process 64 bytes from input 5 to 1 outputs
+	VLD1.P 32(R10), [V12.B16, V13.B16]
+	VLD1.P 32(R10), [V10.B16, V11.B16]
+	VUSHR $4, V12.B16, V14.B16
+	VUSHR $4, V13.B16, V15.B16
+	VUSHR $4, V10.B16, V16.B16
+	VUSHR $4, V11.B16, V17.B16
+	VAND V4.B16, V12.B16, V12.B16
+	VAND V4.B16, V13.B16, V13.B16
+	VAND V4.B16, V10.B16, V10.B16
+	VAND V4.B16, V11.B16, V11.B16
+	VAND V4.B16, V14.B16, V14.B16
+	VAND V4.B16, V15.B16, V15.B16
+	VAND V4.B16, V16.B16, V16.B16
+	VAND V4.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V8.B16, V9.B16]
+	VTBL V10.B16, [V6.B16], V10.B16
+	VTBL V11.B16, [V7.B16], V11.B16
+	VTBL V12.B16, [V6.B16], V6.B16
+	VTBL V13.B16, [V7.B16], V7.B16
+	VTBL V16.B16, [V8.B16], V12.B16
+	VTBL V17.B16, [V9.B16], V13.B16
+	VTBL V14.B16, [V8.B16], V8.B16
+	VTBL V15.B16, [V9.B16], V9.B16
+	VEOR V6.B16, V0.B16, V0.B16
+	VEOR V7.B16, V1.B16, V1.B16
+	VEOR V8.B16, V0.B16, V0.B16
+	VEOR V9.B16, V1.B16, V1.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	// Check for early termination
+	CMP $6, R16
+	BEQ mulNeon_10x1_64Xor_store
+
+	// Load and process 64 bytes from input 6 to 1 outputs
+	VLD1.P 32(R11), [V12.B16, V13.B16]
+	VLD1.P 32(R11), [V10.B16, V11.B16]
+	VUSHR $4, V12.B16, V14.B16
+	VUSHR $4, V13.B16, V15.B16
+	VUSHR $4, V10.B16, V16.B16
+	VUSHR $4, V11.B16, V17.B16
+	VAND V4.B16, V12.B16, V12.B16
+	VAND V4.B16, V13.B16, V13.B16
+	VAND V4.B16, V10.B16, V10.B16
+	VAND V4.B16, V11.B16, V11.B16
+	VAND V4.B16, V14.B16, V14.B16
+	VAND V4.B16, V15.B16, V15.B16
+	VAND V4.B16, V16.B16, V16.B16
+	VAND V4.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V8.B16, V9.B16]
+	VTBL V10.B16, [V6.B16], V10.B16
+	VTBL V11.B16, [V7.B16], V11.B16
+	VTBL V12.B16, [V6.B16], V6.B16
+	VTBL V13.B16, [V7.B16], V7.B16
+	VTBL V16.B16, [V8.B16], V12.B16
+	VTBL V17.B16, [V9.B16], V13.B16
+	VTBL V14.B16, [V8.B16], V8.B16
+	VTBL V15.B16, [V9.B16], V9.B16
+	VEOR V6.B16, V0.B16, V0.B16
+	VEOR V7.B16, V1.B16, V1.B16
+	VEOR V8.B16, V0.B16, V0.B16
+	VEOR V9.B16, V1.B16, V1.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	// Check for early termination
+	CMP $7, R16
+	BEQ mulNeon_10x1_64Xor_store
+
+	// Load and process 64 bytes from input 7 to 1 outputs
+	VLD1.P 32(R12), [V12.B16, V13.B16]
+	VLD1.P 32(R12), [V10.B16, V11.B16]
+	VUSHR $4, V12.B16, V14.B16
+	VUSHR $4, V13.B16, V15.B16
+	VUSHR $4, V10.B16, V16.B16
+	VUSHR $4, V11.B16, V17.B16
+	VAND V4.B16, V12.B16, V12.B16
+	VAND V4.B16, V13.B16, V13.B16
+	VAND V4.B16, V10.B16, V10.B16
+	VAND V4.B16, V11.B16, V11.B16
+	VAND V4.B16, V14.B16, V14.B16
+	VAND V4.B16, V15.B16, V15.B16
+	VAND V4.B16, V16.B16, V16.B16
+	VAND V4.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V8.B16, V9.B16]
+	VTBL V10.B16, [V6.B16], V10.B16
+	VTBL V11.B16, [V7.B16], V11.B16
+	VTBL V12.B16, [V6.B16], V6.B16
+	VTBL V13.B16, [V7.B16], V7.B16
+	VTBL V16.B16, [V8.B16], V12.B16
+	VTBL V17.B16, [V9.B16], V13.B16
+	VTBL V14.B16, [V8.B16], V8.B16
+	VTBL V15.B16, [V9.B16], V9.B16
+	VEOR V6.B16, V0.B16, V0.B16
+	VEOR V7.B16, V1.B16, V1.B16
+	VEOR V8.B16, V0.B16, V0.B16
+	VEOR V9.B16, V1.B16, V1.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	// Check for early termination
+	CMP $8, R16
+	BEQ mulNeon_10x1_64Xor_store
+
+	// Load and process 64 bytes from input 8 to 1 outputs
+	VLD1.P 32(R13), [V12.B16, V13.B16]
+	VLD1.P 32(R13), [V10.B16, V11.B16]
+	VUSHR $4, V12.B16, V14.B16
+	VUSHR $4, V13.B16, V15.B16
+	VUSHR $4, V10.B16, V16.B16
+	VUSHR $4, V11.B16, V17.B16
+	VAND V4.B16, V12.B16, V12.B16
+	VAND V4.B16, V13.B16, V13.B16
+	VAND V4.B16, V10.B16, V10.B16
+	VAND V4.B16, V11.B16, V11.B16
+	VAND V4.B16, V14.B16, V14.B16
+	VAND V4.B16, V15.B16, V15.B16
+	VAND V4.B16, V16.B16, V16.B16
+	VAND V4.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V8.B16, V9.B16]
+	VTBL V10.B16, [V6.B16], V10.B16
+	VTBL V11.B16, [V7.B16], V11.B16
+	VTBL V12.B16, [V6.B16], V6.B16
+	VTBL V13.B16, [V7.B16], V7.B16
+	VTBL V16.B16, [V8.B16], V12.B16
+	VTBL V17.B16, [V9.B16], V13.B16
+	VTBL V14.B16, [V8.B16], V8.B16
+	VTBL V15.B16, [V9.B16], V9.B16
+	VEOR V6.B16, V0.B16, V0.B16
+	VEOR V7.B16, V1.B16, V1.B16
+	VEOR V8.B16, V0.B16, V0.B16
+	VEOR V9.B16, V1.B16, V1.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulNeon_10x1_64Xor_store
+
+	// Load and process 64 bytes from input 9 to 1 outputs
+	VLD1.P 32(R3), [V12.B16, V13.B16]
+	VLD1.P 32(R3), [V10.B16, V11.B16]
+	VUSHR $4, V12.B16, V14.B16
+	VUSHR $4, V13.B16, V15.B16
+	VUSHR $4, V10.B16, V16.B16
+	VUSHR $4, V11.B16, V17.B16
+	VAND V4.B16, V12.B16, V12.B16
+	VAND V4.B16, V13.B16, V13.B16
+	VAND V4.B16, V10.B16, V10.B16
+	VAND V4.B16, V11.B16, V11.B16
+	VAND V4.B16, V14.B16, V14.B16
+	VAND V4.B16, V15.B16, V15.B16
+	VAND V4.B16, V16.B16, V16.B16
+	VAND V4.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V8.B16, V9.B16]
+	VTBL V10.B16, [V6.B16], V10.B16
+	VTBL V11.B16, [V7.B16], V11.B16
+	VTBL V12.B16, [V6.B16], V6.B16
+	VTBL V13.B16, [V7.B16], V7.B16
+	VTBL V16.B16, [V8.B16], V12.B16
+	VTBL V17.B16, [V9.B16], V13.B16
+	VTBL V14.B16, [V8.B16], V8.B16
+	VTBL V15.B16, [V9.B16], V9.B16
+	VEOR V6.B16, V0.B16, V0.B16
+	VEOR V7.B16, V1.B16, V1.B16
+	VEOR V8.B16, V0.B16, V0.B16
+	VEOR V9.B16, V1.B16, V1.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+
+mulNeon_10x1_64Xor_store:
+	// Store 1 outputs
+	SUB $64, R14
+	VST1.P [V0.D2, V1.D2], 32(R14)
+	VST1.P [V2.D2, V3.D2], 32(R14)
+
+	// Prepare for next loop
+	SUBS $1, R0
+	BNE mulNeon_10x1_64Xor_loop
+
+mulNeon_10x1_64Xor_end:
+	RET
+
+// func mulNeon_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x2_64(SB), $8-88
+	// Loading no tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 89 YMM used
+	MOVD n+80(FP), R0
+	LSR $6, R0
+	TST R0, R0
+	BEQ mulNeon_10x2_64_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD (R14), R15
+	MOVD 24(R14), R14
+	MOVD start+72(FP), R6
+
+	// Add start offset to output
+	ADD R6, R15
+	ADD R6, R14
+
+	// Add start offset to input
+	ADD R6, R1
+	ADD R6, R4
+	ADD R6, R5
+	ADD R6, R8
+	ADD R6, R9
+	ADD R6, R10
+	ADD R6, R11
+	ADD R6, R12
+	ADD R6, R13
+	ADD R6, R3
+	MOVD $15, R6
+	VMOV R6, V8.B[0]
+	VDUP V8.B[0], V8.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x2_64_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 64 bytes from input 0 to 2 outputs
+	VLD1.P 32(R1), [V18.B16, V19.B16]
+	VLD1.P 32(R1), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V12.B16, V0.B16
+	VEOR V11.B16, V13.B16, V1.B16
+	VEOR V14.B16, V16.B16, V2.B16
+	VEOR V15.B16, V17.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V12.B16, V4.B16
+	VEOR V11.B16, V13.B16, V5.B16
+	VEOR V14.B16, V16.B16, V6.B16
+	VEOR V15.B16, V17.B16, V7.B16
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulNeon_10x2_64_store
+
+	// Load and process 64 bytes from input 1 to 2 outputs
+	VLD1.P 32(R4), [V18.B16, V19.B16]
+	VLD1.P 32(R4), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulNeon_10x2_64_store
+
+	// Load and process 64 bytes from input 2 to 2 outputs
+	VLD1.P 32(R5), [V18.B16, V19.B16]
+	VLD1.P 32(R5), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulNeon_10x2_64_store
+
+	// Load and process 64 bytes from input 3 to 2 outputs
+	VLD1.P 32(R8), [V18.B16, V19.B16]
+	VLD1.P 32(R8), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $4, R16
+	BEQ mulNeon_10x2_64_store
+
+	// Load and process 64 bytes from input 4 to 2 outputs
+	VLD1.P 32(R9), [V18.B16, V19.B16]
+	VLD1.P 32(R9), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $5, R16
+	BEQ mulNeon_10x2_64_store
+
+	// Load and process 64 bytes from input 5 to 2 outputs
+	VLD1.P 32(R10), [V18.B16, V19.B16]
+	VLD1.P 32(R10), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $6, R16
+	BEQ mulNeon_10x2_64_store
+
+	// Load and process 64 bytes from input 6 to 2 outputs
+	VLD1.P 32(R11), [V18.B16, V19.B16]
+	VLD1.P 32(R11), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $7, R16
+	BEQ mulNeon_10x2_64_store
+
+	// Load and process 64 bytes from input 7 to 2 outputs
+	VLD1.P 32(R12), [V18.B16, V19.B16]
+	VLD1.P 32(R12), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $8, R16
+	BEQ mulNeon_10x2_64_store
+
+	// Load and process 64 bytes from input 8 to 2 outputs
+	VLD1.P 32(R13), [V18.B16, V19.B16]
+	VLD1.P 32(R13), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulNeon_10x2_64_store
+
+	// Load and process 64 bytes from input 9 to 2 outputs
+	VLD1.P 32(R3), [V18.B16, V19.B16]
+	VLD1.P 32(R3), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+
+mulNeon_10x2_64_store:
+	// Store 2 outputs
+	VST1.P [V0.D2, V1.D2], 32(R15)
+	VST1.P [V2.D2, V3.D2], 32(R15)
+	VST1.P [V4.D2, V5.D2], 32(R14)
+	VST1.P [V6.D2, V7.D2], 32(R14)
+
+	// Prepare for next loop
+	SUBS $1, R0
+	BNE mulNeon_10x2_64_loop
+
+mulNeon_10x2_64_end:
+	RET
+
+// func mulNeon_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x2_64Xor(SB), $8-88
+	// Loading no tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 89 YMM used
+	MOVD n+80(FP), R0
+	LSR $6, R0
+	TST R0, R0
+	BEQ mulNeon_10x2_64Xor_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD (R14), R15
+	MOVD 24(R14), R14
+	MOVD start+72(FP), R6
+
+	// Add start offset to output
+	ADD R6, R15
+	ADD R6, R14
+
+	// Add start offset to input
+	ADD R6, R1
+	ADD R6, R4
+	ADD R6, R5
+	ADD R6, R8
+	ADD R6, R9
+	ADD R6, R10
+	ADD R6, R11
+	ADD R6, R12
+	ADD R6, R13
+	ADD R6, R3
+	MOVD $15, R6
+	VMOV R6, V8.B[0]
+	VDUP V8.B[0], V8.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x2_64Xor_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load 2 outputs
+	VLD1.P 32(R15), [V0.B16, V1.B16]
+	VLD1.P 32(R15), [V2.B16, V3.B16]
+	VLD1.P 32(R14), [V4.B16, V5.B16]
+	VLD1.P 32(R14), [V6.B16, V7.B16]
+
+	// Load and process 64 bytes from input 0 to 2 outputs
+	VLD1.P 32(R1), [V18.B16, V19.B16]
+	VLD1.P 32(R1), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulNeon_10x2_64Xor_store
+
+	// Load and process 64 bytes from input 1 to 2 outputs
+	VLD1.P 32(R4), [V18.B16, V19.B16]
+	VLD1.P 32(R4), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulNeon_10x2_64Xor_store
+
+	// Load and process 64 bytes from input 2 to 2 outputs
+	VLD1.P 32(R5), [V18.B16, V19.B16]
+	VLD1.P 32(R5), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulNeon_10x2_64Xor_store
+
+	// Load and process 64 bytes from input 3 to 2 outputs
+	VLD1.P 32(R8), [V18.B16, V19.B16]
+	VLD1.P 32(R8), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $4, R16
+	BEQ mulNeon_10x2_64Xor_store
+
+	// Load and process 64 bytes from input 4 to 2 outputs
+	VLD1.P 32(R9), [V18.B16, V19.B16]
+	VLD1.P 32(R9), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $5, R16
+	BEQ mulNeon_10x2_64Xor_store
+
+	// Load and process 64 bytes from input 5 to 2 outputs
+	VLD1.P 32(R10), [V18.B16, V19.B16]
+	VLD1.P 32(R10), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $6, R16
+	BEQ mulNeon_10x2_64Xor_store
+
+	// Load and process 64 bytes from input 6 to 2 outputs
+	VLD1.P 32(R11), [V18.B16, V19.B16]
+	VLD1.P 32(R11), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $7, R16
+	BEQ mulNeon_10x2_64Xor_store
+
+	// Load and process 64 bytes from input 7 to 2 outputs
+	VLD1.P 32(R12), [V18.B16, V19.B16]
+	VLD1.P 32(R12), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $8, R16
+	BEQ mulNeon_10x2_64Xor_store
+
+	// Load and process 64 bytes from input 8 to 2 outputs
+	VLD1.P 32(R13), [V18.B16, V19.B16]
+	VLD1.P 32(R13), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulNeon_10x2_64Xor_store
+
+	// Load and process 64 bytes from input 9 to 2 outputs
+	VLD1.P 32(R3), [V18.B16, V19.B16]
+	VLD1.P 32(R3), [V22.B16, V23.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V8.B16, V18.B16, V18.B16
+	VAND V8.B16, V19.B16, V19.B16
+	VAND V8.B16, V22.B16, V22.B16
+	VAND V8.B16, V23.B16, V23.B16
+	VAND V8.B16, V20.B16, V20.B16
+	VAND V8.B16, V21.B16, V21.B16
+	VAND V8.B16, V24.B16, V24.B16
+	VAND V8.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V22.B16, [V10.B16], V14.B16
+	VTBL V23.B16, [V11.B16], V15.B16
+	VTBL V18.B16, [V10.B16], V10.B16
+	VTBL V19.B16, [V11.B16], V11.B16
+	VTBL V24.B16, [V12.B16], V16.B16
+	VTBL V25.B16, [V13.B16], V17.B16
+	VTBL V20.B16, [V12.B16], V12.B16
+	VTBL V21.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+
+mulNeon_10x2_64Xor_store:
+	// Store 2 outputs
+	SUB $64, R15
+	VST1.P [V0.D2, V1.D2], 32(R15)
+	VST1.P [V2.D2, V3.D2], 32(R15)
+	SUB $64, R14
+	VST1.P [V4.D2, V5.D2], 32(R14)
+	VST1.P [V6.D2, V7.D2], 32(R14)
+
+	// Prepare for next loop
+	SUBS $1, R0
+	BNE mulNeon_10x2_64Xor_loop
+
+mulNeon_10x2_64Xor_end:
+	RET
+
+// func mulNeon_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x3_64(SB), $8-88
+	// Loading no tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 130 YMM used
+	MOVD n+80(FP), R0
+	LSR $6, R0
+	TST R0, R0
+	BEQ mulNeon_10x3_64_end
+	MOVD in_base+24(FP), R0
+	MOVD (R0), R3
+	MOVD 24(R0), R1
+	MOVD 48(R0), R4
+	MOVD 72(R0), R5
+	MOVD 96(R0), R8
+	MOVD 120(R0), R9
+	MOVD 144(R0), R10
+	MOVD 168(R0), R11
+	MOVD 192(R0), R12
+	MOVD 216(R0), R0
+	MOVD out_base+48(FP), R13
+	MOVD (R13), R14
+	MOVD 24(R13), R15
+	MOVD 48(R13), R13
+	MOVD start+72(FP), R6
+
+	// Add start offset to output
+	ADD R6, R14
+	ADD R6, R15
+	ADD R6, R13
+
+	// Add start offset to input
+	ADD R6, R3
+	ADD R6, R1
+	ADD R6, R4
+	ADD R6, R5
+	ADD R6, R8
+	ADD R6, R9
+	ADD R6, R10
+	ADD R6, R11
+	ADD R6, R12
+	ADD R6, R0
+	MOVD $15, R6
+	VMOV R6, V12.B[0]
+	VDUP V12.B[0], V12.B16
+
+	// Reload length to save a register
+	MOVD n+80(FP), R6
+	LSR $6, R6
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x3_64_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 64 bytes from input 0 to 3 outputs
+	VLD1.P 32(R3), [V22.B16, V23.B16]
+	VLD1.P 32(R3), [V26.B16, V27.B16]
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VUSHR $4, V26.B16, V28.B16
+	VUSHR $4, V27.B16, V29.B16
+	VAND V12.B16, V22.B16, V22.B16
+	VAND V12.B16, V23.B16, V23.B16
+	VAND V12.B16, V26.B16, V26.B16
+	VAND V12.B16, V27.B16, V27.B16
+	VAND V12.B16, V24.B16, V24.B16
+	VAND V12.B16, V25.B16, V25.B16
+	VAND V12.B16, V28.B16, V28.B16
+	VAND V12.B16, V29.B16, V29.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V16.B16, V0.B16
+	VEOR V15.B16, V17.B16, V1.B16
+	VEOR V18.B16, V20.B16, V2.B16
+	VEOR V19.B16, V21.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V16.B16, V4.B16
+	VEOR V15.B16, V17.B16, V5.B16
+	VEOR V18.B16, V20.B16, V6.B16
+	VEOR V19.B16, V21.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V16.B16, V8.B16
+	VEOR V15.B16, V17.B16, V9.B16
+	VEOR V18.B16, V20.B16, V10.B16
+	VEOR V19.B16, V21.B16, V11.B16
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulNeon_10x3_64_store
+
+	// Load and process 64 bytes from input 1 to 3 outputs
+	VLD1.P 32(R1), [V22.B16, V23.B16]
+	VLD1.P 32(R1), [V26.B16, V27.B16]
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VUSHR $4, V26.B16, V28.B16
+	VUSHR $4, V27.B16, V29.B16
+	VAND V12.B16, V22.B16, V22.B16
+	VAND V12.B16, V23.B16, V23.B16
+	VAND V12.B16, V26.B16, V26.B16
+	VAND V12.B16, V27.B16, V27.B16
+	VAND V12.B16, V24.B16, V24.B16
+	VAND V12.B16, V25.B16, V25.B16
+	VAND V12.B16, V28.B16, V28.B16
+	VAND V12.B16, V29.B16, V29.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulNeon_10x3_64_store
+
+	// Load and process 64 bytes from input 2 to 3 outputs
+	VLD1.P 32(R4), [V22.B16, V23.B16]
+	VLD1.P 32(R4), [V26.B16, V27.B16]
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VUSHR $4, V26.B16, V28.B16
+	VUSHR $4, V27.B16, V29.B16
+	VAND V12.B16, V22.B16, V22.B16
+	VAND V12.B16, V23.B16, V23.B16
+	VAND V12.B16, V26.B16, V26.B16
+	VAND V12.B16, V27.B16, V27.B16
+	VAND V12.B16, V24.B16, V24.B16
+	VAND V12.B16, V25.B16, V25.B16
+	VAND V12.B16, V28.B16, V28.B16
+	VAND V12.B16, V29.B16, V29.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulNeon_10x3_64_store
+
+	// Load and process 64 bytes from input 3 to 3 outputs
+	VLD1.P 32(R5), [V22.B16, V23.B16]
+	VLD1.P 32(R5), [V26.B16, V27.B16]
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VUSHR $4, V26.B16, V28.B16
+	VUSHR $4, V27.B16, V29.B16
+	VAND V12.B16, V22.B16, V22.B16
+	VAND V12.B16, V23.B16, V23.B16
+	VAND V12.B16, V26.B16, V26.B16
+	VAND V12.B16, V27.B16, V27.B16
+	VAND V12.B16, V24.B16, V24.B16
+	VAND V12.B16, V25.B16, V25.B16
+	VAND V12.B16, V28.B16, V28.B16
+	VAND V12.B16, V29.B16, V29.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $4, R16
+	BEQ mulNeon_10x3_64_store
+
+	// Load and process 64 bytes from input 4 to 3 outputs
+	VLD1.P 32(R8), [V22.B16, V23.B16]
+	VLD1.P 32(R8), [V26.B16, V27.B16]
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VUSHR $4, V26.B16, V28.B16
+	VUSHR $4, V27.B16, V29.B16
+	VAND V12.B16, V22.B16, V22.B16
+	VAND V12.B16, V23.B16, V23.B16
+	VAND V12.B16, V26.B16, V26.B16
+	VAND V12.B16, V27.B16, V27.B16
+	VAND V12.B16, V24.B16, V24.B16
+	VAND V12.B16, V25.B16, V25.B16
+	VAND V12.B16, V28.B16, V28.B16
+	VAND V12.B16, V29.B16, V29.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V26.B16, [V14.B16], V18.B16
+	VTBL V27.B16, [V15.B16], V19.B16
+	VTBL V22.B16, [V14.B16], V14.B16
+	VTBL V23.B16, [V15.B16], V15.B16
+	VTBL V28.B16, [V16.B16], V20.B16
+	VTBL V29.B16, [V17.B16], V21.B16
+	VTBL V24.B16, [V16.B16], V16.B16
+	VTBL V25.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $5, R16
+	BEQ mulNeon_10x3_64_store
+
+	// Load and process 64 bytes from input 5 to 3 outputs
+	VLD1.P 32(R9), [V22.B16, V23.B16]
+	VLD1.P 32(R9), [V26.B16, V27.B16]
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VUSHR $4, V26.B16, V28.B16
+	VUSHR $4, V27.B16, V29.B16
+	VAND V12.B16, V22.B16, V22.B16
VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 6 to 3 outputs + VLD1.P 32(R10), [V22.B16, V23.B16] + VLD1.P 32(R10), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL 
V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 7 to 3 outputs + VLD1.P 32(R11), [V22.B16, V23.B16] + VLD1.P 32(R11), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 8 to 3 outputs + VLD1.P 32(R12), [V22.B16, V23.B16] + VLD1.P 32(R12), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR 
$4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 9 to 3 outputs + VLD1.P 32(R0), [V22.B16, V23.B16] + VLD1.P 32(R0), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 
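The kernels in this hunk all evaluate the same GF(2^8) inner product: each matrix coefficient is expanded into a 16-entry low-nibble table and a 16-entry high-nibble table, the VUSHR/VAND pairs split every input byte into its two nibbles, each VTBL performs sixteen table lookups at once, VEOR folds the products into the output registers, and the CMP $n, R16 / BEQ pairs skip inputs beyond the shard count. A minimal scalar Go sketch of what mulNeon_10x3_64 computes; the names mulGF10x3 and tables are illustrative, not part of this diff:

func mulGF10x3(tables *[10][3][2][16]byte, in, out [][]byte, inputs int) {
	for i := range out[0] {
		var acc [3]byte
		for c := 0; c < inputs; c++ { // CMP $n, R16 / BEQ early exit
			b := in[c][i]
			for r := 0; r < 3; r++ {
				t := &tables[c][r]
				acc[r] ^= t[0][b&0x0f] ^ t[1][b>>4] // VTBL lookups + VEOR
			}
		}
		for r := 0; r < 3; r++ {
			out[r][i] = acc[r]
		}
	}
}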
+
+// func mulNeon_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x3_64Xor(SB), $8-88
+	// Loading no tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 130 YMM used
+	MOVD n+80(FP), R0
+	LSR $6, R0
+	TST R0, R0
+	BEQ mulNeon_10x3_64Xor_end
+	MOVD in_base+24(FP), R0
+	MOVD (R0), R3
+	MOVD 24(R0), R1
+	MOVD 48(R0), R4
+	MOVD 72(R0), R5
+	MOVD 96(R0), R8
+	MOVD 120(R0), R9
+	MOVD 144(R0), R10
+	MOVD 168(R0), R11
+	MOVD 192(R0), R12
+	MOVD 216(R0), R0
+	MOVD out_base+48(FP), R13
+	MOVD (R13), R14
+	MOVD 24(R13), R15
+	MOVD 48(R13), R13
+	MOVD start+72(FP), R6
+
+	// Add start offset to output
+	ADD R6, R14
+	ADD R6, R15
+	ADD R6, R13
+
+	// Add start offset to input
+	ADD R6, R3
+	ADD R6, R1
+	ADD R6, R4
+	ADD R6, R5
+	ADD R6, R8
+	ADD R6, R9
+	ADD R6, R10
+	ADD R6, R11
+	ADD R6, R12
+	ADD R6, R0
+	MOVD $15, R6
+	VMOV R6, V12.B[0]
+	VDUP V12.B[0], V12.B16
+
+	// Reload length to save a register
+	MOVD n+80(FP), R6
+	LSR $6, R6
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x3_64Xor_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load 3 outputs
+	VLD1.P 32(R14), [V0.B16, V1.B16]
+	VLD1.P 32(R14), [V2.B16, V3.B16]
+	VLD1.P 32(R15), [V4.B16, V5.B16]
+	VLD1.P 32(R15), [V6.B16, V7.B16]
+	VLD1.P 32(R13), [V8.B16, V9.B16]
+	VLD1.P 32(R13), [V10.B16, V11.B16]

	[… inputs 0-9, read from R3, R1, R4, R5, R8, R9, R10, R11, R12 and R0, use the same 64-byte VUSHR/VAND/VTBL blocks as mulNeon_10x3_64, with every VEOR accumulating into the preloaded V0-V11, each block ending with its CMP $n, R16 / BEQ mulNeon_10x3_64Xor_store check …]

+
+mulNeon_10x3_64Xor_store:
+	// Store 3 outputs
+	SUB $64, R14
+	VST1.P [V0.D2, V1.D2], 32(R14)
+	VST1.P [V2.D2, V3.D2], 32(R14)
+	SUB $64, R15
+	VST1.P [V4.D2, V5.D2], 32(R15)
+	VST1.P [V6.D2, V7.D2], 32(R15)
+	SUB $64, R13
+	VST1.P [V8.D2, V9.D2], 32(R13)
+	VST1.P [V10.D2, V11.D2], 32(R13)
+
+	// Prepare for next loop
+	SUBS $1, R6
+	BNE mulNeon_10x3_64Xor_loop
+
+mulNeon_10x3_64Xor_end:
+	RET
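mulNeon_10x3_64Xor above differs from the plain variant only at the edges of the loop: it preloads the current output bytes (the "Load 3 outputs" VLD1.P block), VEORs every product into them, and rewinds the output pointers with SUB $64 before storing, so callers can accumulate into existing shards instead of overwriting them. In terms of the scalar sketch earlier, again with illustrative names only, the change is a single operator:

func mulGF10x3Xor(tables *[10][3][2][16]byte, in, out [][]byte, inputs int) {
	for i := range out[0] {
		for c := 0; c < inputs; c++ {
			b := in[c][i]
			for r := 0; r < 3; r++ {
				t := &tables[c][r]
				out[r][i] ^= t[0][b&0x0f] ^ t[1][b>>4] // ^= rather than =
			}
		}
	}
}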
+
+// func mulNeon_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x4(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 89 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x4_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V8.B[0]
+	VDUP V8.B[0], V8.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x4_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 32 bytes from input 0 to 4 outputs
+	VLD1.P 32(R1), [V14.B16, V15.B16]
+	VUSHR $4, V14.B16, V16.B16
+	VUSHR $4, V15.B16, V17.B16
+	VAND V8.B16, V14.B16, V14.B16
+	VAND V8.B16, V15.B16, V15.B16
+	VAND V8.B16, V16.B16, V16.B16
+	VAND V8.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V14.B16, [V10.B16], V10.B16
+	VTBL V15.B16, [V11.B16], V11.B16
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V12.B16, V0.B16
+	VEOR V11.B16, V13.B16, V1.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V14.B16, [V10.B16], V10.B16
+	VTBL V15.B16, [V11.B16], V11.B16
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V12.B16, V2.B16
+	VEOR V11.B16, V13.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V14.B16, [V10.B16], V10.B16
+	VTBL V15.B16, [V11.B16], V11.B16
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V12.B16, V4.B16
+	VEOR V11.B16, V13.B16, V5.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V14.B16, [V10.B16], V10.B16
+	VTBL V15.B16, [V11.B16], V11.B16
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V12.B16, V6.B16
+	VEOR V11.B16, V13.B16, V7.B16
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulNeon_10x4_store

	[… inputs 1-9, read from R4, R5, R8, R9, R10, R11, R12, R13 and R3, repeat the same 32-byte block, accumulating into V0-V7, each ending with its CMP $n, R16 / BEQ mulNeon_10x4_store check …]

+
+mulNeon_10x4_store:
+	// Store 4 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x4_loop
+
+mulNeon_10x4_end:
+	RET
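The matrix []byte argument that R2 walks with VLD1.P is the serialized form of these per-cell nibble tables, consumed in the order the lookups issue. As a hedged reference for how one coefficient expands into the (low, high) table pair the VTBL instructions use, assuming the conventional 0x11d reduction polynomial rather than anything this diff defines:

// gfMul is a plain carry-less GF(2^8) multiply, reduced mod 0x11d.
func gfMul(a, b byte) byte {
	var p byte
	for b > 0 {
		if b&1 != 0 {
			p ^= a
		}
		carry := a & 0x80
		a <<= 1
		if carry != 0 {
			a ^= 0x1d
		}
		b >>= 1
	}
	return p
}

// nibbleTables expands one matrix coefficient c into two 16-entry tables
// such that gfMul(c, b) == lo[b&0x0f] ^ hi[b>>4] for every byte b.
func nibbleTables(c byte) (lo, hi [16]byte) {
	for n := 0; n < 16; n++ {
		lo[n] = gfMul(c, byte(n))
		hi[n] = gfMul(c, byte(n)<<4)
	}
	return
}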
+// func mulNeon_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x4Xor(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 89 YMM used
+ MOVD n+80(FP), R0
+ LSR $5, R0
+ TST R0, R0
+ BEQ mulNeon_10x4Xor_end
+ MOVD in_base+24(FP), R3
+ MOVD (R3), R1
+ MOVD 24(R3), R4
+ MOVD 48(R3), R5
+ MOVD 72(R3), R8
+ MOVD 96(R3), R9
+ MOVD 120(R3), R10
+ MOVD 144(R3), R11
+ MOVD 168(R3), R12
+ MOVD 192(R3), R13
+ MOVD 216(R3), R3
+ MOVD out_base+48(FP), R14
+ MOVD start+72(FP), R15
+
+ // Add start offset to input
+ ADD R15, R1
+ ADD R15, R4
+ ADD R15, R5
+ ADD R15, R8
+ ADD R15, R9
+ ADD R15, R10
+ ADD R15, R11
+ ADD R15, R12
+ ADD R15, R13
+ ADD R15, R3
+ LSR $3, R15
+ MOVD $15, R6
+ VMOV R6, V8.B[0]
+ VDUP V8.B[0], V8.B16
+
+ // Load number of input shards
+ MOVD in_len+32(FP), R16
+
+mulNeon_10x4Xor_loop:
+ MOVD matrix_base+0(FP), R2
+ // Load and process 32 bytes from input 0 to 4 outputs
+ VLD1.P 32(R1), [V14.B16, V15.B16]
+ VUSHR $4, V14.B16, V16.B16
+ VUSHR $4, V15.B16, V17.B16
+ VAND V8.B16, V14.B16, V14.B16
+ VAND V8.B16, V15.B16, V15.B16
+ VAND V8.B16, V16.B16, V16.B16
+ VAND V8.B16, V17.B16, V17.B16
+ MOVD (R14), R6
+ ADD R15<<3, R6
+ VLD1 (R6), [V0.B16, V1.B16]
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V0.B16, V0.B16
+ VEOR V11.B16, V1.B16, V1.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ MOVD 24(R14), R6
+ ADD R15<<3, R6
+ VLD1 (R6), [V2.B16, V3.B16]
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V2.B16, V2.B16
+ VEOR V11.B16, V3.B16, V3.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ MOVD 48(R14), R6
+ ADD R15<<3, R6
+ VLD1 (R6), [V4.B16, V5.B16]
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V4.B16, V4.B16
+ VEOR V11.B16, V5.B16, V5.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ MOVD 72(R14), R6
+ ADD R15<<3, R6
+ VLD1 (R6), [V6.B16, V7.B16]
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V6.B16, V6.B16
+ VEOR V11.B16, V7.B16, V7.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ // Check for early termination
+ CMP $1, R16
+ BEQ mulNeon_10x4Xor_store
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ VLD1.P 32(R4), [V14.B16, V15.B16]
+ VUSHR $4, V14.B16, V16.B16
+ VUSHR $4, V15.B16, V17.B16
+ VAND V8.B16, V14.B16, V14.B16
+ VAND V8.B16, V15.B16, V15.B16
+ VAND V8.B16, V16.B16, V16.B16
+ VAND V8.B16, V17.B16, V17.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V0.B16, V0.B16
+ VEOR V11.B16, V1.B16, V1.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V2.B16, V2.B16
+ VEOR V11.B16, V3.B16, V3.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V4.B16, V4.B16
+ VEOR V11.B16, V5.B16, V5.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V6.B16, V6.B16
+ VEOR V11.B16, V7.B16, V7.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ // Check for early termination
+ CMP $2, R16
+ BEQ mulNeon_10x4Xor_store
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ VLD1.P 32(R5), [V14.B16, V15.B16]
+ VUSHR $4, V14.B16, V16.B16
+ VUSHR $4, V15.B16, V17.B16
+ VAND V8.B16, V14.B16, V14.B16
+ VAND V8.B16, V15.B16, V15.B16
+ VAND V8.B16, V16.B16, V16.B16
+ VAND V8.B16, V17.B16, V17.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V0.B16, V0.B16
+ VEOR V11.B16, V1.B16, V1.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V2.B16, V2.B16
+ VEOR V11.B16, V3.B16, V3.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V4.B16, V4.B16
+ VEOR V11.B16, V5.B16, V5.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V6.B16, V6.B16
+ VEOR V11.B16, V7.B16, V7.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ // Check for early termination
+ CMP $3, R16
+ BEQ mulNeon_10x4Xor_store
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ VLD1.P 32(R8), [V14.B16, V15.B16]
+ VUSHR $4, V14.B16, V16.B16
+ VUSHR $4, V15.B16, V17.B16
+ VAND V8.B16, V14.B16, V14.B16
+ VAND V8.B16, V15.B16, V15.B16
+ VAND V8.B16, V16.B16, V16.B16
+ VAND V8.B16, V17.B16, V17.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V0.B16, V0.B16
+ VEOR V11.B16, V1.B16, V1.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V2.B16, V2.B16
+ VEOR V11.B16, V3.B16, V3.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V4.B16, V4.B16
+ VEOR V11.B16, V5.B16, V5.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V6.B16, V6.B16
+ VEOR V11.B16, V7.B16, V7.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ // Check for early termination
+ CMP $4, R16
+ BEQ mulNeon_10x4Xor_store
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ VLD1.P 32(R9), [V14.B16, V15.B16]
+ VUSHR $4, V14.B16, V16.B16
+ VUSHR $4, V15.B16, V17.B16
+ VAND V8.B16, V14.B16, V14.B16
+ VAND V8.B16, V15.B16, V15.B16
+ VAND V8.B16, V16.B16, V16.B16
+ VAND V8.B16, V17.B16, V17.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V0.B16, V0.B16
+ VEOR V11.B16, V1.B16, V1.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V2.B16, V2.B16
+ VEOR V11.B16, V3.B16, V3.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V4.B16, V4.B16
+ VEOR V11.B16, V5.B16, V5.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V6.B16, V6.B16
+ VEOR V11.B16, V7.B16, V7.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ // Check for early termination
+ CMP $5, R16
+ BEQ mulNeon_10x4Xor_store
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ VLD1.P 32(R10), [V14.B16, V15.B16]
+ VUSHR $4, V14.B16, V16.B16
+ VUSHR $4, V15.B16, V17.B16
+ VAND V8.B16, V14.B16, V14.B16
+ VAND V8.B16, V15.B16, V15.B16
+ VAND V8.B16, V16.B16, V16.B16
+ VAND V8.B16, V17.B16, V17.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V0.B16, V0.B16
+ VEOR V11.B16, V1.B16, V1.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V2.B16, V2.B16
+ VEOR V11.B16, V3.B16, V3.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V4.B16, V4.B16
+ VEOR V11.B16, V5.B16, V5.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V6.B16, V6.B16
+ VEOR V11.B16, V7.B16, V7.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ // Check for early termination
+ CMP $6, R16
+ BEQ mulNeon_10x4Xor_store
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ VLD1.P 32(R11), [V14.B16, V15.B16]
+ VUSHR $4, V14.B16, V16.B16
+ VUSHR $4, V15.B16, V17.B16
+ VAND V8.B16, V14.B16, V14.B16
+ VAND V8.B16, V15.B16, V15.B16
+ VAND V8.B16, V16.B16, V16.B16
+ VAND V8.B16, V17.B16, V17.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V0.B16, V0.B16
+ VEOR V11.B16, V1.B16, V1.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V2.B16, V2.B16
+ VEOR V11.B16, V3.B16, V3.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V4.B16, V4.B16
+ VEOR V11.B16, V5.B16, V5.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V6.B16, V6.B16
+ VEOR V11.B16, V7.B16, V7.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ // Check for early termination
+ CMP $7, R16
+ BEQ mulNeon_10x4Xor_store
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ VLD1.P 32(R12), [V14.B16, V15.B16]
+ VUSHR $4, V14.B16, V16.B16
+ VUSHR $4, V15.B16, V17.B16
+ VAND V8.B16, V14.B16, V14.B16
+ VAND V8.B16, V15.B16, V15.B16
+ VAND V8.B16, V16.B16, V16.B16
+ VAND V8.B16, V17.B16, V17.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V0.B16, V0.B16
+ VEOR V11.B16, V1.B16, V1.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V2.B16, V2.B16
+ VEOR V11.B16, V3.B16, V3.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V4.B16, V4.B16
+ VEOR V11.B16, V5.B16, V5.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V6.B16, V6.B16
+ VEOR V11.B16, V7.B16, V7.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ // Check for early termination
+ CMP $8, R16
+ BEQ mulNeon_10x4Xor_store
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ VLD1.P 32(R13), [V14.B16, V15.B16]
+ VUSHR $4, V14.B16, V16.B16
+ VUSHR $4, V15.B16, V17.B16
+ VAND V8.B16, V14.B16, V14.B16
+ VAND V8.B16, V15.B16, V15.B16
+ VAND V8.B16, V16.B16, V16.B16
+ VAND V8.B16, V17.B16, V17.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V0.B16, V0.B16
+ VEOR V11.B16, V1.B16, V1.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V2.B16, V2.B16
+ VEOR V11.B16, V3.B16, V3.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V4.B16, V4.B16
+ VEOR V11.B16, V5.B16, V5.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V6.B16, V6.B16
+ VEOR V11.B16, V7.B16, V7.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ // Check for early termination
+ CMP $9, R16
+ BEQ mulNeon_10x4Xor_store
+
+ // Load and process 32 bytes from input 9 to 4 outputs
+ VLD1.P 32(R3), [V14.B16, V15.B16]
+ VUSHR $4, V14.B16, V16.B16
+ VUSHR $4, V15.B16, V17.B16
+ VAND V8.B16, V14.B16, V14.B16
+ VAND V8.B16, V15.B16, V15.B16
+ VAND V8.B16, V16.B16, V16.B16
+ VAND V8.B16, V17.B16, V17.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V0.B16, V0.B16
+ VEOR V11.B16, V1.B16, V1.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V2.B16, V2.B16
+ VEOR V11.B16, V3.B16, V3.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V4.B16, V4.B16
+ VEOR V11.B16, V5.B16, V5.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V10.B16, V11.B16]
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VTBL V14.B16, [V10.B16], V10.B16
+ VTBL V15.B16, [V11.B16], V11.B16
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VEOR V10.B16, V6.B16, V6.B16
+ VEOR V11.B16, V7.B16, V7.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+
+mulNeon_10x4Xor_store:
+ // Store 4 outputs
+ MOVD (R14), R6
+ ADD R15<<3, R6
+ VST1 [V0.D2, V1.D2], (R6)
+ MOVD 24(R14), R6
+ ADD R15<<3, R6
+ VST1 [V2.D2, V3.D2], (R6)
+ MOVD 48(R14), R6
+ ADD R15<<3, R6
+ VST1 [V4.D2, V5.D2], (R6)
+ MOVD 72(R14), R6
+ ADD R15<<3, R6
+ VST1 [V6.D2, V7.D2], (R6)
+
+ // Prepare for next loop
+ ADD $4, R15
+ SUBS $1, R0
+ BNE mulNeon_10x4Xor_loop
+
+mulNeon_10x4Xor_end:
+ RET
+
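+// The Xor variant above differs from mulNeon_10x4 only in its first step:
+// instead of initializing the accumulators from the first input's table
+// lookups, it loads the current output shards (the MOVD/ADD/VLD1 triples
+// indexed off out_base in R14) and XORs the new products on top, i.e.
+// conceptually out[i] ^= p rather than out[i] = p, so partial results can
+// be accumulated across calls.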
+// func mulNeon_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x5(SB), NOSPLIT, $8-88
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 110 YMM used
+ MOVD n+80(FP), R0
+ LSR $5, R0
+ TST R0, R0
+ BEQ mulNeon_10x5_end
+ MOVD in_base+24(FP), R3
+ MOVD (R3), R1
+ MOVD 24(R3), R4
+ MOVD 48(R3), R5
+ MOVD 72(R3), R8
+ MOVD 96(R3), R9
+ MOVD 120(R3), R10
+ MOVD 144(R3), R11
+ MOVD 168(R3), R12
+ MOVD 192(R3), R13
+ MOVD 216(R3), R3
+ MOVD out_base+48(FP), R14
+ MOVD start+72(FP), R15
+
+ // Add start offset to input
+ ADD R15, R1
+ ADD R15, R4
+ ADD R15, R5
+ ADD R15, R8
+ ADD R15, R9
+ ADD R15, R10
+ ADD R15, R11
+ ADD R15, R12
+ ADD R15, R13
+ ADD R15, R3
+ LSR $3, R15
+ MOVD $15, R6
+ VMOV R6, V10.B[0]
+ VDUP V10.B[0], V10.B16
+
+ // Load number of input shards
+ MOVD in_len+32(FP), R16
+
+mulNeon_10x5_loop:
+ MOVD matrix_base+0(FP), R2
+ // Load and process 32 bytes from input 0 to 5 outputs
+ VLD1.P 32(R1), [V16.B16, V17.B16]
+ VUSHR $4, V16.B16, V18.B16
+ VUSHR $4, V17.B16, V19.B16
+ VAND V10.B16, V16.B16, V16.B16
+ VAND V10.B16, V17.B16, V17.B16
+ VAND V10.B16, V18.B16, V18.B16
+ VAND V10.B16, V19.B16, V19.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V14.B16, V0.B16
+ VEOR V13.B16, V15.B16, V1.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V14.B16, V2.B16
+ VEOR V13.B16, V15.B16, V3.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V14.B16, V4.B16
+ VEOR V13.B16, V15.B16, V5.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V14.B16, V6.B16
+ VEOR V13.B16, V15.B16, V7.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V14.B16, V8.B16
+ VEOR V13.B16, V15.B16, V9.B16
+ // Check for early termination
+ CMP $1, R16
+ BEQ mulNeon_10x5_store
+
+ // Load and process 32 bytes from input 1 to 5 outputs
+ VLD1.P 32(R4), [V16.B16, V17.B16]
+ VUSHR $4, V16.B16, V18.B16
+ VUSHR $4, V17.B16, V19.B16
+ VAND V10.B16, V16.B16, V16.B16
+ VAND V10.B16, V17.B16, V17.B16
+ VAND V10.B16, V18.B16, V18.B16
+ VAND V10.B16, V19.B16, V19.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VEOR V14.B16, V0.B16, V0.B16
+ VEOR V15.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VEOR V14.B16, V2.B16, V2.B16
+ VEOR V15.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VEOR V14.B16, V4.B16, V4.B16
+ VEOR V15.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ VEOR V14.B16, V6.B16, V6.B16
+ VEOR V15.B16, V7.B16, V7.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V8.B16, V8.B16
+ VEOR V13.B16, V9.B16, V9.B16
+ VEOR V14.B16, V8.B16, V8.B16
+ VEOR V15.B16, V9.B16, V9.B16
+ // Check for early termination
+ CMP $2, R16
+ BEQ mulNeon_10x5_store
+
+ // Load and process 32 bytes from input 2 to 5 outputs
+ VLD1.P 32(R5), [V16.B16, V17.B16]
+ VUSHR $4, V16.B16, V18.B16
+ VUSHR $4, V17.B16, V19.B16
+ VAND V10.B16, V16.B16, V16.B16
+ VAND V10.B16, V17.B16, V17.B16
+ VAND V10.B16, V18.B16, V18.B16
+ VAND V10.B16, V19.B16, V19.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VEOR V14.B16, V0.B16, V0.B16
+ VEOR V15.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VEOR V14.B16, V2.B16, V2.B16
+ VEOR V15.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VEOR V14.B16, V4.B16, V4.B16
+ VEOR V15.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ VEOR V14.B16, V6.B16, V6.B16
+ VEOR V15.B16, V7.B16, V7.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V8.B16, V8.B16
+ VEOR V13.B16, V9.B16, V9.B16
+ VEOR V14.B16, V8.B16, V8.B16
+ VEOR V15.B16, V9.B16, V9.B16
+ // Check for early termination
+ CMP $3, R16
+ BEQ mulNeon_10x5_store
+
+ // Load and process 32 bytes from input 3 to 5 outputs
+ VLD1.P 32(R8), [V16.B16, V17.B16]
+ VUSHR $4, V16.B16, V18.B16
+ VUSHR $4, V17.B16, V19.B16
+ VAND V10.B16, V16.B16, V16.B16
+ VAND V10.B16, V17.B16, V17.B16
+ VAND V10.B16, V18.B16, V18.B16
+ VAND V10.B16, V19.B16, V19.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VEOR V14.B16, V0.B16, V0.B16
+ VEOR V15.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VEOR V14.B16, V2.B16, V2.B16
+ VEOR V15.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VEOR V14.B16, V4.B16, V4.B16
+ VEOR V15.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ VEOR V14.B16, V6.B16, V6.B16
+ VEOR V15.B16, V7.B16, V7.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V8.B16, V8.B16
+ VEOR V13.B16, V9.B16, V9.B16
+ VEOR V14.B16, V8.B16, V8.B16
+ VEOR V15.B16, V9.B16, V9.B16
+ // Check for early termination
+ CMP $4, R16
+ BEQ mulNeon_10x5_store
+
+ // Load and process 32 bytes from input 4 to 5 outputs
+ VLD1.P 32(R9), [V16.B16, V17.B16]
+ VUSHR $4, V16.B16, V18.B16
+ VUSHR $4, V17.B16, V19.B16
+ VAND V10.B16, V16.B16, V16.B16
+ VAND V10.B16, V17.B16, V17.B16
+ VAND V10.B16, V18.B16, V18.B16
+ VAND V10.B16, V19.B16, V19.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VEOR V14.B16, V0.B16, V0.B16
+ VEOR V15.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VEOR V14.B16, V2.B16, V2.B16
+ VEOR V15.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VEOR V14.B16, V4.B16, V4.B16
+ VEOR V15.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ VEOR V14.B16, V6.B16, V6.B16
+ VEOR V15.B16, V7.B16, V7.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V8.B16, V8.B16
+ VEOR V13.B16, V9.B16, V9.B16
+ VEOR V14.B16, V8.B16, V8.B16
+ VEOR V15.B16, V9.B16, V9.B16
+ // Check for early termination
+ CMP $5, R16
+ BEQ mulNeon_10x5_store
+
+ // Load and process 32 bytes from input 5 to 5 outputs
+ VLD1.P 32(R10), [V16.B16, V17.B16]
+ VUSHR $4, V16.B16, V18.B16
+ VUSHR $4, V17.B16, V19.B16
+ VAND V10.B16, V16.B16, V16.B16
+ VAND V10.B16, V17.B16, V17.B16
+ VAND V10.B16, V18.B16, V18.B16
+ VAND V10.B16, V19.B16, V19.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VEOR V14.B16, V0.B16, V0.B16
+ VEOR V15.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VEOR V14.B16, V2.B16, V2.B16
+ VEOR V15.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VEOR V14.B16, V4.B16, V4.B16
+ VEOR V15.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ VEOR V14.B16, V6.B16, V6.B16
+ VEOR V15.B16, V7.B16, V7.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V8.B16, V8.B16
+ VEOR V13.B16, V9.B16, V9.B16
+ VEOR V14.B16, V8.B16, V8.B16
+ VEOR V15.B16, V9.B16, V9.B16
+ // Check for early termination
+ CMP $6, R16
+ BEQ mulNeon_10x5_store
+
+ // Load and process 32 bytes from input 6 to 5 outputs
+ VLD1.P 32(R11), [V16.B16, V17.B16]
+ VUSHR $4, V16.B16, V18.B16
+ VUSHR $4, V17.B16, V19.B16
+ VAND V10.B16, V16.B16, V16.B16
+ VAND V10.B16, V17.B16, V17.B16
+ VAND V10.B16, V18.B16, V18.B16
+ VAND V10.B16, V19.B16, V19.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VEOR V14.B16, V0.B16, V0.B16
+ VEOR V15.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VEOR V14.B16, V2.B16, V2.B16
+ VEOR V15.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VEOR V14.B16, V4.B16, V4.B16
+ VEOR V15.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ VEOR V14.B16, V6.B16, V6.B16
+ VEOR V15.B16, V7.B16, V7.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V8.B16, V8.B16
+ VEOR V13.B16, V9.B16, V9.B16
+ VEOR V14.B16, V8.B16, V8.B16
+ VEOR V15.B16, V9.B16, V9.B16
+ // Check for early termination
+ CMP $7, R16
+ BEQ mulNeon_10x5_store
+
+ // Load and process 32 bytes from input 7 to 5 outputs
+ VLD1.P 32(R12), [V16.B16, V17.B16]
+ VUSHR $4, V16.B16, V18.B16
+ VUSHR $4, V17.B16, V19.B16
+ VAND V10.B16, V16.B16, V16.B16
+ VAND V10.B16, V17.B16, V17.B16
+ VAND V10.B16, V18.B16, V18.B16
+ VAND V10.B16, V19.B16, V19.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VEOR V14.B16, V0.B16, V0.B16
+ VEOR V15.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VEOR V14.B16, V2.B16, V2.B16
+ VEOR V15.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VEOR V14.B16, V4.B16, V4.B16
+ VEOR V15.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ VEOR V14.B16, V6.B16, V6.B16
+ VEOR V15.B16, V7.B16, V7.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V8.B16, V8.B16
+ VEOR V13.B16, V9.B16, V9.B16
+ VEOR V14.B16, V8.B16, V8.B16
+ VEOR V15.B16, V9.B16, V9.B16
+ // Check for early termination
+ CMP $8, R16
+ BEQ mulNeon_10x5_store
+
+ // Load and process 32 bytes from input 8 to 5 outputs
+ VLD1.P 32(R13), [V16.B16, V17.B16]
+ VUSHR $4, V16.B16, V18.B16
+ VUSHR $4, V17.B16, V19.B16
+ VAND V10.B16, V16.B16, V16.B16
+ VAND V10.B16, V17.B16, V17.B16
+ VAND V10.B16, V18.B16, V18.B16
+ VAND V10.B16, V19.B16, V19.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VEOR V14.B16, V0.B16, V0.B16
+ VEOR V15.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VEOR V14.B16, V2.B16, V2.B16
+ VEOR V15.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VEOR V14.B16, V4.B16, V4.B16
+ VEOR V15.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ VEOR V14.B16, V6.B16, V6.B16
+ VEOR V15.B16, V7.B16, V7.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V8.B16, V8.B16
+ VEOR V13.B16, V9.B16, V9.B16
+ VEOR V14.B16, V8.B16, V8.B16
+ VEOR V15.B16, V9.B16, V9.B16
+ // Check for early termination
+ CMP $9, R16
+ BEQ mulNeon_10x5_store
+
+ // Load and process 32 bytes from input 9 to 5 outputs
+ VLD1.P 32(R3), [V16.B16, V17.B16]
+ VUSHR $4, V16.B16, V18.B16
+ VUSHR $4, V17.B16, V19.B16
+ VAND V10.B16, V16.B16, V16.B16
+ VAND V10.B16, V17.B16, V17.B16
+ VAND V10.B16, V18.B16, V18.B16
+ VAND V10.B16, V19.B16, V19.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V0.B16, V0.B16
+ VEOR V13.B16, V1.B16, V1.B16
+ VEOR V14.B16, V0.B16, V0.B16
+ VEOR V15.B16, V1.B16, V1.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V2.B16, V2.B16
+ VEOR V13.B16, V3.B16, V3.B16
+ VEOR V14.B16, V2.B16, V2.B16
+ VEOR V15.B16, V3.B16, V3.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V4.B16, V4.B16
+ VEOR V13.B16, V5.B16, V5.B16
+ VEOR V14.B16, V4.B16, V4.B16
+ VEOR V15.B16, V5.B16, V5.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V6.B16, V6.B16
+ VEOR V13.B16, V7.B16, V7.B16
+ VEOR V14.B16, V6.B16, V6.B16
+ VEOR V15.B16, V7.B16, V7.B16
+ VLD1.P 32(R2), [V12.B16, V13.B16]
+ VLD1.P 32(R2), [V14.B16, V15.B16]
+ VTBL V16.B16, [V12.B16], V12.B16
+ VTBL V17.B16, [V13.B16], V13.B16
+ VTBL V18.B16, [V14.B16], V14.B16
+ VTBL V19.B16, [V15.B16], V15.B16
+ VEOR V12.B16, V8.B16, V8.B16
+ VEOR V13.B16, V9.B16, V9.B16
+ VEOR V14.B16, V8.B16, V8.B16
+ VEOR V15.B16, V9.B16, V9.B16
+
+mulNeon_10x5_store:
+ // Store 5 outputs
+ MOVD (R14), R6
+ ADD R15<<3, R6
+ VST1 [V0.D2, V1.D2], (R6)
+ MOVD 24(R14), R6
+ ADD R15<<3, R6
+ VST1 [V2.D2, V3.D2], (R6)
+ MOVD 48(R14), R6
+ ADD R15<<3, R6
+ VST1 [V4.D2, V5.D2], (R6)
+ MOVD 72(R14), R6
+ ADD R15<<3, R6
+ VST1 [V6.D2, V7.D2], (R6)
+ MOVD 96(R14), R6
+ ADD R15<<3, R6
+ VST1 [V8.D2, V9.D2], (R6)
+
+ // Prepare for next loop
+ ADD $4, R15
+ SUBS $1, R0
+ BNE mulNeon_10x5_loop
+
+mulNeon_10x5_end:
+ RET
+
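+// Loop bookkeeping shared by these kernels: n is pre-divided into 32-byte
+// iterations (LSR $5, R0), start is converted to 8-byte units (LSR $3, R15)
+// so store addresses are formed as base+R15<<3, and ADD $4, R15 advances
+// 4*8 = 32 bytes per iteration, matching the 32-byte post-increment loads
+// from each input shard.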
used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x5Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V10.B[0] + VDUP V10.B[0], V10.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x5Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 5 outputs + VLD1.P 32(R1), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + MOVD (R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V0.B16, V1.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + MOVD 24(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V2.B16, V3.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 1 to 5 outputs + VLD1.P 32(R4), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, 
[V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 2 to 5 outputs + VLD1.P 32(R5), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early 
termination + CMP $3, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 3 to 5 outputs + VLD1.P 32(R8), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 4 to 5 outputs + VLD1.P 32(R9), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, 
[V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 5 to 5 outputs + VLD1.P 32(R10), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 6 to 5 outputs + VLD1.P 32(R11), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, 
V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V6.B16, V6.B16
+	VEOR V13.B16, V7.B16, V7.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V8.B16, V8.B16
+	VEOR V13.B16, V9.B16, V9.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	// Check for early termination
+	CMP $7, R16
+	BEQ mulNeon_10x5Xor_store
+
+	// Load and process 32 bytes from input 7 to 5 outputs
+	VLD1.P 32(R12), [V16.B16, V17.B16]
+	VUSHR $4, V16.B16, V18.B16
+	VUSHR $4, V17.B16, V19.B16
+	VAND V10.B16, V16.B16, V16.B16
+	VAND V10.B16, V17.B16, V17.B16
+	VAND V10.B16, V18.B16, V18.B16
+	VAND V10.B16, V19.B16, V19.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V6.B16, V6.B16
+	VEOR V13.B16, V7.B16, V7.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V8.B16, V8.B16
+	VEOR V13.B16, V9.B16, V9.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	// Check for early termination
+	CMP $8, R16
+	BEQ mulNeon_10x5Xor_store
+
+	// Load and process 32 bytes from input 8 to 5 outputs
+	VLD1.P 32(R13), [V16.B16, V17.B16]
+	VUSHR $4, V16.B16, V18.B16
+	VUSHR $4, V17.B16, V19.B16
+	VAND V10.B16, V16.B16, V16.B16
+	VAND V10.B16, V17.B16, V17.B16
+	VAND V10.B16, V18.B16, V18.B16
+	VAND V10.B16, V19.B16, V19.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V6.B16, V6.B16
+	VEOR V13.B16, V7.B16, V7.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V8.B16, V8.B16
+	VEOR V13.B16, V9.B16, V9.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulNeon_10x5Xor_store
+
+	// Load and process 32 bytes from input 9 to 5 outputs
+	VLD1.P 32(R3), [V16.B16, V17.B16]
+	VUSHR $4, V16.B16, V18.B16
+	VUSHR $4, V17.B16, V19.B16
+	VAND V10.B16, V16.B16, V16.B16
+	VAND V10.B16, V17.B16, V17.B16
+	VAND V10.B16, V18.B16, V18.B16
+	VAND V10.B16, V19.B16, V19.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V6.B16, V6.B16
+	VEOR V13.B16, V7.B16, V7.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V8.B16, V8.B16
+	VEOR V13.B16, V9.B16, V9.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+
+mulNeon_10x5Xor_store:
+	// Store 5 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x5Xor_loop
+
+mulNeon_10x5Xor_end:
+	RET
+
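Editor's note: the mulNeon_RxC/mulNeon_RxCXor routines in this generated file all follow one scheme. Each iteration loads 32 bytes of a shard, splits every byte into low and high nibbles (VUSHR $4 plus VAND against the 0x0F mask register), uses each nibble as a VTBL index into a 16-entry lookup table streamed from matrix_base, and VEORs the two lookups, because a GF(2^8) multiply by a constant c satisfies gfMul(c, x) == tblLo[c][x&15] ^ tblHi[c][x>>4]. A minimal scalar sketch in Go of what one output byte computes; the names here (gfref, mulColumnRef, tbls) are illustrative, not the library's identifiers:

	package gfref

	// mulColumnRef computes one output byte of a 10-input kernel.
	// tbls[i] holds the low- and high-nibble tables for input i's
	// matrix coefficient, matching the 16-byte table pairs the
	// assembly streams in from matrix_base with VLD1.P.
	func mulColumnRef(tbls [10][2][16]byte, in [10]byte) byte {
		var acc byte
		for i, x := range in {
			acc ^= tbls[i][0][x&15] ^ tbls[i][1][x>>4]
		}
		return acc
	}

The NEON code is this loop unrolled across inputs and outputs, 32 bytes at a time, with the CMP/BEQ "early termination" chain branching to the store block once len(in) inputs have been folded in.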
+// func mulNeon_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x6(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 131 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x6_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V12.B[0]
+	VDUP V12.B[0], V12.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x6_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 32 bytes from input 0 to 6 outputs
+	VLD1.P 32(R1), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V16.B16, V0.B16
+	VEOR V15.B16, V17.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V16.B16, V2.B16
+	VEOR V15.B16, V17.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V16.B16, V4.B16
+	VEOR V15.B16, V17.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V16.B16, V6.B16
+	VEOR V15.B16, V17.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V16.B16, V8.B16
+	VEOR V15.B16, V17.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V16.B16, V10.B16
+	VEOR V15.B16, V17.B16, V11.B16
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulNeon_10x6_store
+
+	// Load and process 32 bytes from input 1 to 6 outputs
+	VLD1.P 32(R4), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulNeon_10x6_store
+
+	// Load and process 32 bytes from input 2 to 6 outputs
+	VLD1.P 32(R5), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulNeon_10x6_store
+
+	// Load and process 32 bytes from input 3 to 6 outputs
+	VLD1.P 32(R8), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $4, R16
+	BEQ mulNeon_10x6_store
+
+	// Load and process 32 bytes from input 4 to 6 outputs
+	VLD1.P 32(R9), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $5, R16
+	BEQ mulNeon_10x6_store
+
+	// Load and process 32 bytes from input 5 to 6 outputs
+	VLD1.P 32(R10), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $6, R16
+	BEQ mulNeon_10x6_store
+
+	// Load and process 32 bytes from input 6 to 6 outputs
+	VLD1.P 32(R11), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $7, R16
+	BEQ mulNeon_10x6_store
+
+	// Load and process 32 bytes from input 7 to 6 outputs
+	VLD1.P 32(R12), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $8, R16
+	BEQ mulNeon_10x6_store
+
+	// Load and process 32 bytes from input 8 to 6 outputs
+	VLD1.P 32(R13), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulNeon_10x6_store
+
+	// Load and process 32 bytes from input 9 to 6 outputs
+	VLD1.P 32(R3), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+
+mulNeon_10x6_store:
+	// Store 6 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VST1 [V10.D2, V11.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x6_loop
+
+mulNeon_10x6_end:
+	RET
+
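Editor's note: the Xor variant that follows differs from mulNeon_10x6 above only in how each output pair starts a block. mulNeon_10x6 initializes V0..V11 directly from the first input's table lookups (VEOR V14.B16, V16.B16, V0.B16), while mulNeon_10x6Xor first loads the existing output bytes through the out slice headers (the MOVD/ADD/VLD1 triples against R14) and accumulates into them. A hedged Go sketch of the two contracts, with product standing in for the VTBL/VEOR pipeline's result for one block (hypothetical names, not the library's API):

	package gfref

	// mulStyle overwrites dst, as mulNeon_10x6 does.
	func mulStyle(dst, product []byte) { copy(dst, product) }

	// xorStyle folds the products into whatever dst already holds,
	// as mulNeon_10x6Xor does; this is what makes incremental parity
	// updates possible (parity ^= coef * (oldData ^ newData)).
	func xorStyle(dst, product []byte) {
		for i := range dst {
			dst[i] ^= product[i]
		}
	}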
+// func mulNeon_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x6Xor(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 131 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x6Xor_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V12.B[0]
+	VDUP V12.B[0], V12.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x6Xor_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 32 bytes from input 0 to 6 outputs
+	VLD1.P 32(R1), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V0.B16, V1.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V2.B16, V3.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V4.B16, V5.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V8.B16, V9.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulNeon_10x6Xor_store
+
+	// Load and process 32 bytes from input 1 to 6 outputs
+	VLD1.P 32(R4), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulNeon_10x6Xor_store
+
+	// Load and process 32 bytes from input 2 to 6 outputs
+	VLD1.P 32(R5), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulNeon_10x6Xor_store
+
+	// Load and process 32 bytes from input 3 to 6 outputs
+	VLD1.P 32(R8), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $4, R16
+	BEQ mulNeon_10x6Xor_store
+
+	// Load and process 32 bytes from input 4 to 6 outputs
+	VLD1.P 32(R9), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $5, R16
+	BEQ mulNeon_10x6Xor_store
+
+	// Load and process 32 bytes from input 5 to 6 outputs
+	VLD1.P 32(R10), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $6, R16
+	BEQ mulNeon_10x6Xor_store
+
+	// Load and process 32 bytes from input 6 to 6 outputs
+	VLD1.P 32(R11), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $7, R16
+	BEQ mulNeon_10x6Xor_store
+
+	// Load and process 32 bytes from input 7 to 6 outputs
+	VLD1.P 32(R12), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $8, R16
+	BEQ mulNeon_10x6Xor_store
+
+	// Load and process 32 bytes from input 8 to 6 outputs
+	VLD1.P 32(R13), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulNeon_10x6Xor_store
+
+	// Load and process 32 bytes from input 9 to 6 outputs
+	VLD1.P 32(R3), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+
+mulNeon_10x6Xor_store:
+	// Store 6 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VST1 [V10.D2, V11.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x6Xor_loop
+
+mulNeon_10x6Xor_end:
+	RET
+
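Editor's note: every prologue in this file has the same shape: n>>5 (LSR $5) gives the number of 32-byte iterations, start offsets each input slice, start>>3 pre-scales the output index so the store blocks can use R15<<3 addressing, and the per-loop ADD $4, R15 advances that scaled index by 4*8 = 32 bytes. The in_len check drives the CMP/BEQ early-termination chain so the same 10-input kernel also serves shorter input sets. A hypothetical Go-side caller, inferred from those prologues rather than taken from the library's real dispatch code (the switch and tail handling here are assumptions):

	package gfref

	// Hypothetical dispatch sketch; the real library selects among many
	// generated mulNeon_RxC variants and has its own fallback path.
	func galMulNEON(matrix []byte, in, out [][]byte, start, n int) {
		n &^= 31 // kernels only process whole 32-byte blocks (LSR $5)
		if n == 0 {
			return
		}
		switch len(out) {
		case 6:
			mulNeon_10x6(matrix, in, out, start, n) // or mulNeon_10x6Xor to accumulate
		case 7:
			mulNeon_10x7(matrix, in, out, start, n)
		}
		// any n%32 tail bytes would need a scalar fallback here
	}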
+// func mulNeon_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x7(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 152 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x7_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V14.B[0]
+	VDUP V14.B[0], V14.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x7_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 32 bytes from input 0 to 7 outputs
+	VLD1.P 32(R1), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V0.B16
+	VEOR V17.B16, V19.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V2.B16
+	VEOR V17.B16, V19.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V4.B16
+	VEOR V17.B16, V19.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V6.B16
+	VEOR V17.B16, V19.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V8.B16
+	VEOR V17.B16, V19.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V10.B16
+	VEOR V17.B16, V19.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V12.B16
+	VEOR V17.B16, V19.B16, V13.B16
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 1 to 7 outputs
+	VLD1.P 32(R4), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 2 to 7 outputs
+	VLD1.P 32(R5), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 3 to 7 outputs
+	VLD1.P 32(R8), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $4, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 4 to 7 outputs
+	VLD1.P 32(R9), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $5, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 5 to 7 outputs
+	VLD1.P 32(R10), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+
VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x7_store + + // Load and process 32 bytes from input 6 to 7 outputs + VLD1.P 32(R11), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x7_store + + // Load and process 32 bytes from input 7 to 7 outputs + VLD1.P 32(R12), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, 
[V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x7_store + + // Load and process 32 bytes from input 8 to 7 outputs + VLD1.P 32(R13), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + 
VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x7_store + + // Load and process 32 bytes from input 9 to 7 outputs + VLD1.P 32(R3), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, 
V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + +mulNeon_10x7_store: + // Store 7 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x7_loop + +mulNeon_10x7_end: + RET + +// func mulNeon_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x7Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 152 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x7Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V14.B[0] + VDUP V14.B[0], V14.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x7Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 7 outputs + VLD1.P 32(R1), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + MOVD (R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V0.B16, V1.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + MOVD 24(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V2.B16, V3.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 
+ VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + MOVD 120(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V10.B16, V11.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + MOVD 144(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V12.B16, V13.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 1 to 7 outputs + VLD1.P 32(R4), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, 
[V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 2 to 7 outputs + VLD1.P 32(R5), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, 
V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 3 to 7 outputs + VLD1.P 32(R8), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 4 to 7 outputs + VLD1.P 32(R9), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 
32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 5 to 7 outputs + VLD1.P 32(R10), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, 
[V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 6 to 7 outputs + VLD1.P 32(R11), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 
+ VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 7 to 7 outputs + VLD1.P 32(R12), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 8 to 7 outputs + VLD1.P 32(R13), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, 
[V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x7Xor_store + + // Load and process 32 bytes from input 9 to 7 outputs + VLD1.P 32(R3), [V20.B16, V21.B16] + VUSHR $4, V20.B16, V22.B16 + VUSHR $4, V21.B16, V23.B16 + VAND V14.B16, V20.B16, V20.B16 + VAND V14.B16, V21.B16, V21.B16 + VAND V14.B16, V22.B16, V22.B16 + VAND V14.B16, V23.B16, V23.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V4.B16, V4.B16 
+ VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V16.B16, V17.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VEOR V16.B16, V12.B16, V12.B16 + VEOR V17.B16, V13.B16, V13.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + +mulNeon_10x7Xor_store: + // Store 7 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x7Xor_loop + +mulNeon_10x7Xor_end: + RET + +// func mulNeon_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x8(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 173 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x8_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V16.B[0] + VDUP V16.B[0], V16.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x8_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 8 outputs + VLD1.P 32(R1), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V0.B16 + VEOR 
V19.B16, V21.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V2.B16 + VEOR V19.B16, V21.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V4.B16 + VEOR V19.B16, V21.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V6.B16 + VEOR V19.B16, V21.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V8.B16 + VEOR V19.B16, V21.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V10.B16 + VEOR V19.B16, V21.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V12.B16 + VEOR V19.B16, V21.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V14.B16 + VEOR V19.B16, V21.B16, V15.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 1 to 8 outputs + VLD1.P 32(R4), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR 
V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 2 to 8 outputs + VLD1.P 32(R5), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, 
V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 3 to 8 outputs + VLD1.P 32(R8), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], 
V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 4 to 8 outputs + VLD1.P 32(R9), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 
+ VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 5 to 8 outputs + VLD1.P 32(R10), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 6 to 8 outputs + VLD1.P 32(R11), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, 
[V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 7 to 8 outputs + VLD1.P 32(R12), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, 
V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 8 to 8 outputs + VLD1.P 32(R13), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 
32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 9 to 8 outputs + VLD1.P 32(R3), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, 
[V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + +mulNeon_10x8_store: + // Store 8 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + MOVD 168(R14), R6 + ADD R15<<3, R6 + VST1 [V14.D2, V15.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x8_loop + +mulNeon_10x8_end: + RET + +// func mulNeon_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x8Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 173 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x8Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V16.B[0] + VDUP V16.B[0], V16.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x8Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 8 outputs + VLD1.P 32(R1), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + MOVD (R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V0.B16, V1.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + MOVD 24(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V2.B16, 
V3.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + MOVD 120(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V10.B16, V11.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + MOVD 144(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V12.B16, V13.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + MOVD 168(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V14.B16, V15.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 1 to 8 outputs + VLD1.P 32(R4), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, 
V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 2 to 8 outputs + VLD1.P 32(R5), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], 
V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 3 to 8 outputs + VLD1.P 32(R8), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 
32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 4 to 8 outputs + VLD1.P 32(R9), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL 
V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 5 to 8 outputs + VLD1.P 32(R10), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR 
V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 6 to 8 outputs + VLD1.P 32(R11), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x8Xor_store + + 
// Load and process 32 bytes from input 7 to 8 outputs + VLD1.P 32(R12), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 8 to 8 outputs + VLD1.P 32(R13), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, 
[V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 9 to 8 outputs + VLD1.P 32(R3), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + 
VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + +mulNeon_10x8Xor_store: + // Store 8 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + MOVD 168(R14), R6 + ADD R15<<3, R6 + VST1 [V14.D2, V15.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x8Xor_loop + +mulNeon_10x8Xor_end: + RET + +// func mulNeon_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x9(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 194 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x9_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V18.B[0] + VDUP V18.B[0], 
V18.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x9_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 9 outputs + VLD1.P 32(R1), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V0.B16 + VEOR V21.B16, V23.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V2.B16 + VEOR V21.B16, V23.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V4.B16 + VEOR V21.B16, V23.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V6.B16 + VEOR V21.B16, V23.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V8.B16 + VEOR V21.B16, V23.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V10.B16 + VEOR V21.B16, V23.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V12.B16 + VEOR V21.B16, V23.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V14.B16 + VEOR V21.B16, V23.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V22.B16, V16.B16 + VEOR V21.B16, V23.B16, V17.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 1 to 9 outputs + VLD1.P 32(R4), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, 
V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 2 to 9 outputs + VLD1.P 32(R5), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], 
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulNeon_10x9_store
+
+	// Load and process 32 bytes from input 3 to 9 outputs
+	VLD1.P 32(R8), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $4, R16
+	BEQ mulNeon_10x9_store
+
+	// Load and process 32 bytes from input 4 to 9 outputs
+	VLD1.P 32(R9), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $5, R16
+	BEQ mulNeon_10x9_store
+
+	// Load and process 32 bytes from input 5 to 9 outputs
+	VLD1.P 32(R10), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $6, R16
+	BEQ mulNeon_10x9_store
+
+	// Load and process 32 bytes from input 6 to 9 outputs
+	VLD1.P 32(R11), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $7, R16
+	BEQ mulNeon_10x9_store
+
+	// Load and process 32 bytes from input 7 to 9 outputs
+	VLD1.P 32(R12), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $8, R16
+	BEQ mulNeon_10x9_store
+
+	// Load and process 32 bytes from input 8 to 9 outputs
+	VLD1.P 32(R13), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulNeon_10x9_store
+
+	// Load and process 32 bytes from input 9 to 9 outputs
+	VLD1.P 32(R3), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+
+mulNeon_10x9_store:
+	// Store 9 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VST1 [V10.D2, V11.D2], (R6)
+	MOVD 144(R14), R6
+	ADD R15<<3, R6
+	VST1 [V12.D2, V13.D2], (R6)
+	MOVD 168(R14), R6
+	ADD R15<<3, R6
+	VST1 [V14.D2, V15.D2], (R6)
+	MOVD 192(R14), R6
+	ADD R15<<3, R6
+	VST1 [V16.D2, V17.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x9_loop
+
+mulNeon_10x9_end:
+	RET
+
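For reference, here is a minimal pure-Go sketch of the byte-level kernel that these generated NEON routines vectorize. It is an illustration only, not code from the vendored package; galMul, buildTables and mulSlice are hypothetical names. The key idea matches the VUSHR/VAND/VTBL/VEOR sequence above: each GF(2^8) multiply is split into two 16-entry table lookups, one keyed by the low nibble (the VAND against the 0x0f mask in V18) and one by the high nibble (the VUSHR $4), and the two partial products XOR together.

package main

import "fmt"

// galMul multiplies two elements of GF(2^8) using the reducing
// polynomial 0x11D (x^8 + x^4 + x^3 + x^2 + 1), the field used by
// Reed-Solomon coding over bytes.
func galMul(a, b byte) byte {
	var p byte
	for b > 0 {
		if b&1 != 0 {
			p ^= a
		}
		carry := a & 0x80
		a <<= 1
		if carry != 0 {
			a ^= 0x1d // low 8 bits of 0x11D
		}
		b >>= 1
	}
	return p
}

// buildTables splits multiplication by a coefficient c into two
// 16-entry tables, one indexed by the low nibble and one by the high
// nibble of each input byte. These are the 16-byte vectors that the
// paired VTBL lookups index in the assembly.
func buildTables(c byte) (low, high [16]byte) {
	for i := 0; i < 16; i++ {
		low[i] = galMul(c, byte(i))
		high[i] = galMul(c, byte(i)<<4)
	}
	return
}

// mulSlice computes out[i] = c * in[i] over GF(2^8) for every byte:
// the scalar form of one low-lookup/high-lookup/XOR triple. Because
// x = (x & 0xf) ^ (x >> 4 << 4) and multiplication distributes over
// XOR, the two lookups reassemble the full product.
func mulSlice(c byte, in, out []byte) {
	low, high := buildTables(c)
	for i, x := range in {
		out[i] = low[x&0xf] ^ high[x>>4]
	}
}

func main() {
	in := []byte{0x00, 0x01, 0x53, 0xca, 0xff}
	out := make([]byte, len(in))
	mulSlice(0x1d, in, out)
	for i := range in {
		// Cross-check the table path against direct multiplication.
		fmt.Printf("0x%02x * 0x1d = 0x%02x (direct 0x%02x)\n",
			in[i], out[i], galMul(0x1d, in[i]))
	}
}

mulNeon_10x9 runs this per-coefficient kernel for a 10-input, 9-output matrix, 32 bytes of each input shard per loop iteration, accumulating the nine products with VEOR before storing.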
+// func mulNeon_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x9Xor(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 194 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x9Xor_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V18.B[0]
+	VDUP V18.B[0], V18.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x9Xor_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 32 bytes from input 0 to 9 outputs
+	VLD1.P 32(R1), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V0.B16, V1.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V2.B16, V3.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V4.B16, V5.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V8.B16, V9.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	MOVD 144(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	MOVD 168(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	MOVD 192(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulNeon_10x9Xor_store
+
+	// Load and process 32 bytes from input 1 to 9 outputs
+	VLD1.P 32(R4), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulNeon_10x9Xor_store
+
+	// Load and process 32 bytes from input 2 to 9 outputs
+	VLD1.P 32(R5), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulNeon_10x9Xor_store
+
+	// Load and process 32 bytes from input 3 to 9 outputs
+	VLD1.P 32(R8), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $4, R16
+	BEQ mulNeon_10x9Xor_store
+
+	// Load and process 32 bytes from input 4 to 9 outputs
+	VLD1.P 32(R9), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $5, R16
+	BEQ mulNeon_10x9Xor_store
+
+	// Load and process 32 bytes from input 5 to 9 outputs
+	VLD1.P 32(R10), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $6, R16
+	BEQ mulNeon_10x9Xor_store
+
+	// Load and process 32 bytes from input 6 to 9 outputs
+	VLD1.P 32(R11), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $7, R16
+	BEQ mulNeon_10x9Xor_store
+
+	// Load and process 32 bytes from input 7 to 9 outputs
+	VLD1.P 32(R12), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $8, R16
+	BEQ mulNeon_10x9Xor_store
+
+	// Load and process 32 bytes from input 8 to 9 outputs
+	VLD1.P 32(R13), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulNeon_10x9Xor_store
+
+	// Load and process 32 bytes from input 9 to 9 outputs
+	VLD1.P 32(R3), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
+	VEOR V21.B16, V5.B16, V5.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+
+mulNeon_10x9Xor_store:
+	// Store 9 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VST1 [V10.D2, V11.D2], (R6)
+	MOVD 144(R14), R6
+	ADD R15<<3, R6
+	VST1 [V12.D2, V13.D2], (R6)
+	MOVD 168(R14), R6
+	ADD R15<<3, R6
+	VST1 [V14.D2, V15.D2], (R6)
+	MOVD 192(R14), R6
+	ADD R15<<3, R6
+	VST1 [V16.D2, V17.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x9Xor_loop
+
+mulNeon_10x9Xor_end:
+	RET
+
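The Xor variant differs from mulNeon_10x9 only in initialization: the plain routine writes the first product into fresh registers (VEOR V20, V22, V0 and friends), while mulNeon_10x9Xor first loads each existing output shard (the MOVD/VLD1 pairs against out_base in the input-0 block) and folds the new products into it, for use when a destination already holds a partial sum. A scalar sketch of that semantics, reusing buildTables from the illustrative Go above (names again hypothetical, not the vendored package's API):

// mulSliceXor accumulates c * in[i] into out[i] instead of
// overwriting it, mirroring the load-then-VEOR pattern of the
// Xor-suffixed routines.
func mulSliceXor(c byte, in, out []byte) {
	low, high := buildTables(c)
	for i, x := range in {
		out[i] ^= low[x&0xf] ^ high[x>>4]
	}
}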
V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + +mulNeon_10x9Xor_store: + // Store 9 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + MOVD 168(R14), R6 + ADD R15<<3, R6 + VST1 [V14.D2, V15.D2], (R6) + MOVD 192(R14), R6 + ADD R15<<3, R6 + VST1 [V16.D2, V17.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x9Xor_loop + +mulNeon_10x9Xor_end: + RET + +// func mulNeon_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x10(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 215 YMM used + MOVD n+80(FP), 
R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x10_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V20.B[0] + VDUP V20.B[0], V20.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x10_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 10 outputs + VLD1.P 32(R1), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V0.B16 + VEOR V23.B16, V25.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V2.B16 + VEOR V23.B16, V25.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V4.B16 + VEOR V23.B16, V25.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V6.B16 + VEOR V23.B16, V25.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V8.B16 + VEOR V23.B16, V25.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V10.B16 + VEOR V23.B16, V25.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V12.B16 + VEOR V23.B16, V25.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V14.B16 + VEOR V23.B16, V25.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V16.B16 + VEOR V23.B16, V25.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, 
[V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V18.B16 + VEOR V23.B16, V25.B16, V19.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 1 to 10 outputs + VLD1.P 32(R4), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, 
V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 2 to 10 outputs + VLD1.P 32(R5), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, 
[V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 3 to 10 outputs + VLD1.P 32(R8), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, 
V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 4 to 10 outputs + VLD1.P 32(R9), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), 
[V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 5 to 10 outputs + VLD1.P 32(R10), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL 
V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 6 to 10 outputs + VLD1.P 32(R11), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR 
V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 7 to 10 outputs + VLD1.P 32(R12), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL 
V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 8 to 10 outputs + VLD1.P 32(R13), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], 
V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 9 to 10 outputs + VLD1.P 32(R3), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 
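Every block in these NEON kernels is the same GF(2^8) multiply-accumulate: the input bytes are split into low and high nibbles (VUSHR plus VAND against the 0x0f mask broadcast in V20), each nibble indexes a 16-byte lookup table streamed in from the matrix buffer (VTBL), and the two partial products are XORed into the output accumulators (VEOR). A minimal scalar sketch of that technique, assuming hypothetical low/high nibble tables for a single matrix coefficient (the names here are illustrative, not the library's API):

	// mulAdd is a scalar model of one VTBL/VEOR block above. For one
	// matrix coefficient c, low[i] holds the GF(2^8) product mul(c, i)
	// and high[i] holds mul(c, i<<4), so a full byte product costs two
	// table lookups and two XORs, exactly what TBL+EOR vectorize.
	func mulAdd(dst, src []byte, low, high *[16]byte) {
		for i, s := range src {
			dst[i] ^= low[s&0x0f] ^ high[s>>4]
		}
	}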
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VTBL V28.B16, [V24.B16], V24.B16
+	VTBL V29.B16, [V25.B16], V25.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VEOR V24.B16, V8.B16, V8.B16
+	VEOR V25.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VTBL V28.B16, [V24.B16], V24.B16
+	VTBL V29.B16, [V25.B16], V25.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VEOR V24.B16, V10.B16, V10.B16
+	VEOR V25.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VTBL V28.B16, [V24.B16], V24.B16
+	VTBL V29.B16, [V25.B16], V25.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VEOR V24.B16, V12.B16, V12.B16
+	VEOR V25.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VTBL V28.B16, [V24.B16], V24.B16
+	VTBL V29.B16, [V25.B16], V25.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VEOR V24.B16, V14.B16, V14.B16
+	VEOR V25.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VTBL V28.B16, [V24.B16], V24.B16
+	VTBL V29.B16, [V25.B16], V25.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+	VEOR V24.B16, V16.B16, V16.B16
+	VEOR V25.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VTBL V28.B16, [V24.B16], V24.B16
+	VTBL V29.B16, [V25.B16], V25.B16
+	VEOR V22.B16, V18.B16, V18.B16
+	VEOR V23.B16, V19.B16, V19.B16
+	VEOR V24.B16, V18.B16, V18.B16
+	VEOR V25.B16, V19.B16, V19.B16
+
+mulNeon_10x10_store:
+	// Store 10 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VST1 [V10.D2, V11.D2], (R6)
+	MOVD 144(R14), R6
+	ADD R15<<3, R6
+	VST1 [V12.D2, V13.D2], (R6)
+	MOVD 168(R14), R6
+	ADD R15<<3, R6
+	VST1 [V14.D2, V15.D2], (R6)
+	MOVD 192(R14), R6
+	ADD R15<<3, R6
+	VST1 [V16.D2, V17.D2], (R6)
+	MOVD 216(R14), R6
+	ADD R15<<3, R6
+	VST1 [V18.D2, V19.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x10_loop
+
+mulNeon_10x10_end:
+	RET
+
+// func mulNeon_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x10Xor(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 215 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x10Xor_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V20.B[0]
+	VDUP V20.B[0], V20.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x10Xor_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 32 bytes from input 0 to 10 outputs
+	VLD1.P 32(R1), [V26.B16, V27.B16]
+	VUSHR $4, V26.B16, V28.B16
+	VUSHR $4, V27.B16, V29.B16
+	VAND V20.B16, V26.B16, V26.B16
+	VAND V20.B16, V27.B16, V27.B16
+	VAND V20.B16, V28.B16, V28.B16
+	VAND V20.B16, V29.B16, V29.B16
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V0.B16, V1.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VTBL V28.B16, [V24.B16], V24.B16
+	VTBL V29.B16, [V25.B16], V25.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VEOR V24.B16, V0.B16, V0.B16
+	VEOR V25.B16, V1.B16, V1.B16
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V2.B16, V3.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VTBL V28.B16, [V24.B16], V24.B16
+	VTBL V29.B16, [V25.B16], V25.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VEOR V24.B16, V2.B16, V2.B16
+	VEOR V25.B16, V3.B16, V3.B16
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V4.B16, V5.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VTBL V28.B16, [V24.B16], V24.B16
+	VTBL V29.B16, [V25.B16], V25.B16
+	VEOR V22.B16, V4.B16, V4.B16
+	VEOR V23.B16, V5.B16, V5.B16
+	VEOR V24.B16, V4.B16, V4.B16
+	VEOR V25.B16, V5.B16, V5.B16
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VTBL V28.B16, [V24.B16], V24.B16
+	VTBL V29.B16, [V25.B16], V25.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VEOR V24.B16, V6.B16, V6.B16
+	VEOR V25.B16, V7.B16, V7.B16
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V8.B16, V9.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VTBL V28.B16, [V24.B16], V24.B16
+	VTBL V29.B16, [V25.B16], V25.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VEOR V24.B16, V8.B16, V8.B16
+	VEOR V25.B16, V9.B16, V9.B16
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VTBL V28.B16, [V24.B16], V24.B16
+	VTBL V29.B16, [V25.B16], V25.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VEOR V24.B16, V10.B16, V10.B16
+	VEOR V25.B16, V11.B16, V11.B16
+	MOVD 144(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VTBL V28.B16, [V24.B16], V24.B16
+	VTBL V29.B16, [V25.B16], V25.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VEOR V24.B16, V12.B16, V12.B16
+	VEOR V25.B16, V13.B16, V13.B16
+	MOVD 168(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VLD1.P 32(R2), [V24.B16, V25.B16]
+	VTBL V26.B16,
[V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + MOVD 192(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V16.B16, V17.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + MOVD 216(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V18.B16, V19.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 1 to 10 outputs + VLD1.P 32(R4), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL 
V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 2 to 10 outputs + VLD1.P 32(R5), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + 
VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 3 to 10 outputs + VLD1.P 32(R8), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + 
VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 4 to 10 outputs + VLD1.P 32(R9), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, 
[V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 5 to 10 outputs + VLD1.P 32(R10), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR 
V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 6 to 10 outputs + VLD1.P 32(R11), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + 
VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 7 to 10 outputs + VLD1.P 32(R12), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, 
[V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 8 to 10 outputs + VLD1.P 32(R13), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR 
V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 9 to 10 outputs + VLD1.P 32(R3), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + 
VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + +mulNeon_10x10Xor_store: + // Store 10 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + MOVD 168(R14), R6 + ADD R15<<3, R6 + VST1 [V14.D2, V15.D2], 
(R6) + MOVD 192(R14), R6 + ADD R15<<3, R6 + VST1 [V16.D2, V17.D2], (R6) + MOVD 216(R14), R6 + ADD R15<<3, R6 + VST1 [V18.D2, V19.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x10Xor_loop + +mulNeon_10x10Xor_end: + RET + diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go index 1bb268a3b..3e258986f 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go @@ -1,33 +1,19 @@ -//go:build !amd64 || noasm || appengine || gccgo || nogen +//go:build !(amd64 || arm64) || noasm || appengine || gccgo || nogen package reedsolomon -const maxAvx2Inputs = 1 -const maxAvx2Outputs = 1 -const minAvx2Size = 1 -const avxSizeMask = 0 -const avx2CodeGen = false +const ( + codeGen = false + codeGenMaxGoroutines = 8 + codeGenMaxInputs = 1 + codeGenMaxOutputs = 1 + minCodeGenSize = 1 +) -func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { - panic("codegen not available") +func (r *reedSolomon) hasCodeGen(int, int, int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false } -func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") +func (r *reedSolomon) canGFNI(int, int, int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false } diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go index 429e2c20d..d4f46ea2d 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go @@ -10,12 +10,39 @@ import ( ) const ( - avx2CodeGen = true - maxAvx2Inputs = 10 - maxAvx2Outputs = 10 - minAvx2Size = 64 + codeGen = true + codeGenMaxGoroutines = 8 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 ) +var ( + fAvx2 = galMulSlicesAvx2 + fAvx2Xor = galMulSlicesAvx2Xor + fGFNI = galMulSlicesGFNI + fGFNIXor = galMulSlicesGFNIXor + fAvxGFNI = galMulSlicesAvxGFNI + fAvxGFNIXor = galMulSlicesAvxGFNIXor +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + return &fAvx2, &fAvx2Xor, codeGen && pshufb && r.o.useAVX2 && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + if r.o.useAvx512GFNI { + return &fGFNI, &fGFNIXor, codeGen && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs + } + return &fAvxGFNI, &fAvxGFNIXor, codeGen && r.o.useAvxGNFI && + byteCount >= codeGenMinSize 
&& inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { n := stop - start diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go new file mode 100644 index 000000000..ff2541b8e --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go @@ -0,0 +1,195 @@ +//go:build !appengine && !noasm && gc && !nogen && !nopshufb +// +build !appengine,!noasm,gc,!nogen,!nopshufb + +package reedsolomon + +import ( + "fmt" +) + +const ( + codeGen = true + codeGenMaxGoroutines = 16 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 +) + +var ( + fSve = galMulSlicesSve + fSveXor = galMulSlicesSveXor + fNeon = galMulSlicesNeon + fNeonXor = galMulSlicesNeonXor +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + if r.o.useSVE { + return &fSve, &fSveXor, codeGen && pshufb && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs + } + return &fNeon, &fNeonXor, codeGen && pshufb && r.o.useNEON && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false +} + +// galMulSlicesSve +func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) int { + n := stop - start + + // fmt.Println(len(in), len(out)) + switch len(out) { + case 1: + mulSve_10x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulSve_10x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulSve_10x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulSve_10x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulSve_10x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulSve_10x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulSve_10x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulSve_10x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulSve_10x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulSve_10x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + panic(fmt.Sprintf("ARM SVE: unhandled size: %dx%d", len(in), len(out))) +} + +// galMulSlicesSveXor +func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) int { + n := (stop - start) + + switch len(out) { + case 1: + mulSve_10x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulSve_10x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulSve_10x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulSve_10x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulSve_10x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulSve_10x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulSve_10x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulSve_10x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulSve_10x9Xor(matrix, in, out, start, 
n) + return n & (maxInt - 31) + case 10: + mulSve_10x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + panic(fmt.Sprintf("ARM SVE: unhandled size: %dx%d", len(in), len(out))) +} + +// galMulSlicesNeon +func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) int { + n := stop - start + + switch len(out) { + case 1: + mulNeon_10x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulNeon_10x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulNeon_10x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulNeon_10x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulNeon_10x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulNeon_10x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulNeon_10x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulNeon_10x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulNeon_10x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulNeon_10x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out))) +} + +// galMulSlicesNeonXor +func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) int { + n := (stop - start) + + switch len(out) { + case 1: + mulNeon_10x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulNeon_10x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulNeon_10x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulNeon_10x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulNeon_10x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulNeon_10x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulNeon_10x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulNeon_10x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulNeon_10x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulNeon_10x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out))) +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go index 1ba08b5e1..66bab8a0b 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go @@ -10,12 +10,35 @@ import ( ) const ( - avx2CodeGen = true - maxAvx2Inputs = 10 - maxAvx2Outputs = 10 - minAvx2Size = 64 + codeGen = true + codeGenMaxGoroutines = 8 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 ) +var ( + fGFNI = galMulSlicesGFNI + fGFNIXor = galMulSlicesGFNIXor + fAvxGFNI = galMulSlicesAvxGFNI + fAvxGFNIXor = galMulSlicesAvxGFNIXor +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false // no code generation for generic case (only GFNI cases) +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + if r.o.useAvx512GFNI { + return &fGFNI, &fGFNIXor, codeGen && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + 
inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs + } + return &fAvxGFNI, &fAvxGFNIXor, codeGen && r.o.useAvxGNFI && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_arm64.go new file mode 100644 index 000000000..db2aaa613 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_arm64.go @@ -0,0 +1,22 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !nogen && nopshufb +// +build !appengine,!noasm,gc,!nogen,nopshufb + +package reedsolomon + +const ( + codeGen = false + codeGenMaxGoroutines = 16 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_notamd64.go b/vendor/github.com/klauspost/reedsolomon/galois_notamd64.go deleted file mode 100644 index f98bfed11..000000000 --- a/vendor/github.com/klauspost/reedsolomon/galois_notamd64.go +++ /dev/null @@ -1,13 +0,0 @@ -//go:build !amd64 || noasm || appengine || gccgo || pshufb - -// Copyright 2020, Klaus Post, see LICENSE for details. 
- -package reedsolomon - -func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, byteCount int) { - panic("codeSomeShardsAvx512 should not be called if built without asm") -} - -func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, byteCount int) { - panic("codeSomeShardsAvx512P should not be called if built without asm") -} diff --git a/vendor/github.com/klauspost/reedsolomon/leopard.go b/vendor/github.com/klauspost/reedsolomon/leopard.go index 6b4c80184..adf72c8f1 100644 --- a/vendor/github.com/klauspost/reedsolomon/leopard.go +++ b/vendor/github.com/klauspost/reedsolomon/leopard.go @@ -451,13 +451,13 @@ func (r *leopardFF16) reconstruct(shards [][]byte, recoverAll bool) error { } // Evaluate error locator polynomial - fwht(&errLocs, order, m+r.dataShards) + fwht(&errLocs, m+r.dataShards) for i := 0; i < order; i++ { errLocs[i] = ffe((uint(errLocs[i]) * uint(logWalsh[i])) % modulus) } - fwht(&errLocs, order, order) + fwht(&errLocs, order) var work [][]byte if w, ok := r.workPool.Get().([][]byte); ok { @@ -863,11 +863,11 @@ func ceilPow2(n int) int { // Decimation in time (DIT) Fast Walsh-Hadamard Transform // Unrolls pairs of layers to perform cross-layer operations in registers // mtrunc: Number of elements that are non-zero at the front of data -func fwht(data *[order]ffe, m, mtrunc int) { +func fwht(data *[order]ffe, mtrunc int) { // Decimation in time: Unroll 2 layers at a time dist := 1 dist4 := 4 - for dist4 <= m { + for dist4 <= order { // For each set of dist*4 elements: for r := 0; r < mtrunc; r += dist4 { // For each set of dist elements: @@ -898,14 +898,6 @@ func fwht(data *[order]ffe, m, mtrunc int) { dist = dist4 dist4 <<= 2 } - - // If there is one layer left: - if dist < m { - dist := uint16(dist) - for i := uint16(0); i < dist; i++ { - fwht2(&data[i], &data[i+dist]) - } - } } func fwht4(data []ffe, s int) { @@ -1036,7 +1028,7 @@ func initFFTSkew() { } logWalsh[0] = 0 - fwht(logWalsh, order, order) + fwht(logWalsh, order) } func initMul16LUT() { diff --git a/vendor/github.com/klauspost/reedsolomon/leopard8.go b/vendor/github.com/klauspost/reedsolomon/leopard8.go index cd863a136..cd0a23eed 100644 --- a/vendor/github.com/klauspost/reedsolomon/leopard8.go +++ b/vendor/github.com/klauspost/reedsolomon/leopard8.go @@ -509,13 +509,13 @@ func (r *leopardFF8) reconstruct(shards [][]byte, recoverAll bool) error { } // Evaluate error locator polynomial8 - fwht8(&errLocs, order8, m+r.dataShards) + fwht8(&errLocs, m+r.dataShards) for i := 0; i < order8; i++ { errLocs[i] = ffe8((uint(errLocs[i]) * uint(logWalsh8[i])) % modulus8) } - fwht8(&errLocs, order8, order8) + fwht8(&errLocs, order8) if r.inversion != nil { c := leopardGF8cache{ @@ -943,11 +943,11 @@ func subMod8(a, b ffe8) ffe8 { // Decimation in time (DIT) Fast Walsh-Hadamard Transform // Unrolls pairs of layers to perform cross-layer operations in registers // mtrunc: Number of elements that are non-zero at the front of data -func fwht8(data *[order8]ffe8, m, mtrunc int) { +func fwht8(data *[order8]ffe8, mtrunc int) { // Decimation in time: Unroll 2 layers at a time dist := 1 dist4 := 4 - for dist4 <= m { + for dist4 <= order8 { // For each set of dist*4 elements: for r := 0; r < mtrunc; r += dist4 { // For each set of dist elements: @@ -978,14 +978,6 @@ func fwht8(data *[order8]ffe8, m, mtrunc int) { dist = dist4 dist4 <<= 2 } - - // If there is one layer left: - if dist < m { - dist := uint16(dist) - for i := uint16(0); i < dist; i++ { - fwht28(&data[i], &data[i+dist]) - 
} - } } func fwht48(data []ffe8, s int) { @@ -1113,7 +1105,7 @@ func initFFTSkew8() { } logWalsh8[0] = 0 - fwht8(logWalsh8, order8, order8) + fwht8(logWalsh8, order8) } func initMul8LUT() { diff --git a/vendor/github.com/klauspost/reedsolomon/options.go b/vendor/github.com/klauspost/reedsolomon/options.go index 73cc7d6d2..377137ef5 100644 --- a/vendor/github.com/klauspost/reedsolomon/options.go +++ b/vendor/github.com/klauspost/reedsolomon/options.go @@ -21,7 +21,9 @@ type options struct { useAVX512, useAVX2, useSSSE3, - useSSE2 bool + useSSE2, + useNEON, + useSVE bool useJerasureMatrix bool usePAR1Matrix bool @@ -51,6 +53,8 @@ var defaultOptions = options{ useAVX512: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL), useAvx512GFNI: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ), useAvxGNFI: cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI), + useNEON: cpuid.CPU.Supports(cpuid.ASIMD), + useSVE: cpuid.CPU.Supports(cpuid.SVE), } // leopardMode controls the use of leopard GF in encoding and decoding. @@ -316,6 +320,11 @@ func (o *options) cpuOptions() string { if o.useAvxGNFI { res = append(res, "AVX+GFNI") } + if o.useSVE { + res = append(res, "ARM+SVE") + } else if o.useNEON { + res = append(res, "ARM+NEON") + } if len(res) == 0 { return "pure Go" } diff --git a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go index bebba0445..3b6f5b785 100644 --- a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go +++ b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go @@ -153,9 +153,8 @@ type Extensions interface { } const ( - avx2CodeGenMinSize = 64 - avx2CodeGenMinShards = 3 - avx2CodeGenMaxGoroutines = 8 + codeGenMinSize = 64 + codeGenMinShards = 3 gfniCodeGenMaxGoroutines = 4 intSize = 32 << (^uint(0) >> 63) // 32 or 64 @@ -482,21 +481,23 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { r.o.perRound = 128 << 10 } + _, _, useCodeGen := r.hasCodeGen(codeGenMinSize, codeGenMaxInputs, codeGenMaxOutputs) + divide := parityShards + 1 - if avx2CodeGen && r.o.useAVX2 && (dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs) { + if codeGen && useCodeGen && (dataShards > codeGenMaxInputs || parityShards > codeGenMaxOutputs) { // Base on L1 cache if we have many inputs. r.o.perRound = cpuid.CPU.Cache.L1D if r.o.perRound < 32<<10 { r.o.perRound = 32 << 10 } divide = 0 - if dataShards > maxAvx2Inputs { - divide += maxAvx2Inputs + if dataShards > codeGenMaxInputs { + divide += codeGenMaxInputs } else { divide += dataShards } - if parityShards > maxAvx2Inputs { - divide += maxAvx2Outputs + if parityShards > codeGenMaxInputs { + divide += codeGenMaxOutputs } else { divide += parityShards } @@ -555,11 +556,11 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { // Generated AVX2 does not need data to stay in L1 cache between runs. // We will be purely limited by RAM speed. 
- if r.canAVX2C(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines { - r.o.maxGoroutines = avx2CodeGenMaxGoroutines + if useCodeGen && r.o.maxGoroutines > codeGenMaxGoroutines { + r.o.maxGoroutines = codeGenMaxGoroutines } - if r.canGFNI(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > gfniCodeGenMaxGoroutines { + if _, _, useGFNI := r.canGFNI(codeGenMinSize, codeGenMaxInputs, codeGenMaxOutputs); useGFNI && r.o.maxGoroutines > gfniCodeGenMaxGoroutines { r.o.maxGoroutines = gfniCodeGenMaxGoroutines } @@ -577,7 +578,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { r.parity[i] = r.m[dataShards+i] } - if avx2CodeGen && r.o.useAVX2 { + if codeGen /* && r.o.useAVX2 */ { sz := r.dataShards * r.parityShards * 2 * 32 r.mPool.New = func() interface{} { return AllocAligned(1, sz)[0] @@ -653,15 +654,15 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro return ErrShardSize } - if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && ((pshufb && r.o.useAVX2) || r.o.useAvx512GFNI || r.o.useAvxGNFI) { + if codeGen && len(dataShard) >= r.o.perRound && len(parity) >= codeGenMinShards && (pshufb || r.o.useAvx512GFNI || r.o.useAvxGNFI) { m := make([][]byte, r.parityShards) for iRow := range m { m[iRow] = r.parity[iRow][idx : idx+1] } if r.o.useAvx512GFNI || r.o.useAvxGNFI { - r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false) + r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false, nil, nil) } else { - r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false) + r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false, nil, nil) } return nil } @@ -803,18 +804,6 @@ func (r *reedSolomon) Verify(shards [][]byte) (bool, error) { return r.checkSomeShards(r.parity, shards[:r.dataShards], toCheck[:r.parityShards], len(shards[0])), nil } -func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool { - return avx2CodeGen && pshufb && r.o.useAVX2 && - byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards && - inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs -} - -func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) bool { - return avx2CodeGen && (r.o.useAvx512GFNI || r.o.useAvxGNFI) && - byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards && - inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs -} - // Multiplies a subset of rows from a coding matrix by a full set of // input totalShards to produce some output totalShards. // 'matrixRows' is The rows from the matrix to use. 
@@ -838,22 +827,18 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC if end > len(inputs[0]) { end = len(inputs[0]) } - if r.canGFNI(byteCount, len(inputs), len(outputs)) { - var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64 + if galMulGFNI, galMulGFNIXor, useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs)); useGFNI { + var gfni [codeGenMaxInputs * codeGenMaxOutputs]uint64 m := genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), gfni[:]) - if r.o.useAvx512GFNI { - start += galMulSlicesGFNI(m, inputs, outputs, 0, byteCount) - } else { - start += galMulSlicesAvxGFNI(m, inputs, outputs, 0, byteCount) - } + start += (*galMulGFNI)(m, inputs, outputs, 0, byteCount) end = len(inputs[0]) - } else if r.canAVX2C(byteCount, len(inputs), len(outputs)) { - m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) - start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount) + } else if galMulGen, _, ok := r.hasCodeGen(byteCount, len(inputs), len(outputs)); ok { + m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) + start += (*galMulGen)(m, inputs, outputs, 0, byteCount) r.putTmpSlice(m) end = len(inputs[0]) - } else if len(inputs)+len(outputs) > avx2CodeGenMinShards && r.canAVX2C(byteCount, maxAvx2Inputs, maxAvx2Outputs) { - var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64 + } else if galMulGen, galMulGenXor, ok := r.hasCodeGen(byteCount, codeGenMaxInputs, codeGenMaxOutputs); len(inputs)+len(outputs) > codeGenMinShards && ok { + var gfni [codeGenMaxInputs * codeGenMaxOutputs]uint64 end = len(inputs[0]) inIdx := 0 m := r.getTmpSlice() @@ -861,36 +846,29 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC ins := inputs for len(ins) > 0 { inPer := ins - if len(inPer) > maxAvx2Inputs { - inPer = inPer[:maxAvx2Inputs] + if len(inPer) > codeGenMaxInputs { + inPer = inPer[:codeGenMaxInputs] } outs := outputs outIdx := 0 for len(outs) > 0 { outPer := outs - if len(outPer) > maxAvx2Outputs { - outPer = outPer[:maxAvx2Outputs] + if len(outPer) > codeGenMaxOutputs { + outPer = outPer[:codeGenMaxOutputs] } - if r.o.useAvx512GFNI { - m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:]) - if inIdx == 0 { - start = galMulSlicesGFNI(m, inPer, outPer, 0, byteCount) - } else { - start = galMulSlicesGFNIXor(m, inPer, outPer, 0, byteCount) - } - } else if r.o.useAvxGNFI { + if useGFNI { m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:]) if inIdx == 0 { - start = galMulSlicesAvxGFNI(m, inPer, outPer, 0, byteCount) + start = (*galMulGFNI)(m, inPer, outPer, 0, byteCount) } else { - start = galMulSlicesAvxGFNIXor(m, inPer, outPer, 0, byteCount) + start = (*galMulGFNIXor)(m, inPer, outPer, 0, byteCount) } } else { - m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m) + m = genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m) if inIdx == 0 { - start = galMulSlicesAvx2(m, inPer, outPer, 0, byteCount) + start = (*galMulGen)(m, inPer, outPer, 0, byteCount) } else { - start = galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount) + start = (*galMulGenXor)(m, inPer, outPer, 0, byteCount) } } outIdx += len(outPer) @@ -928,27 +906,27 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte var wg sync.WaitGroup gor := r.o.maxGoroutines - var avx2Matrix []byte + var genMatrix []byte var gfniMatrix []uint64 - useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs)) - useGFNI := 
r.canGFNI(byteCount, len(inputs), len(outputs)) + galMulGen, _, useCodeGen := r.hasCodeGen(byteCount, len(inputs), len(outputs)) + galMulGFNI, _, useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs)) if useGFNI { - var tmp [maxAvx2Inputs * maxAvx2Outputs]uint64 + var tmp [codeGenMaxInputs * codeGenMaxOutputs]uint64 gfniMatrix = genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), tmp[:]) - } else if useAvx2 { - avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) - defer r.putTmpSlice(avx2Matrix) - } else if (r.o.useAvx512GFNI || r.o.useAvxGNFI) && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards && - r.canGFNI(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) { + } else if useCodeGen { + genMatrix = genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) + defer r.putTmpSlice(genMatrix) + } else if galMulGFNI, galMulGFNIXor, useGFNI := r.canGFNI(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); useGFNI && + byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards { // It appears there is a switchover point at around 10MB where // Regular processing is faster... - r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true) + r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true, galMulGFNI, galMulGFNIXor) return - } else if r.o.useAVX2 && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards && - r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) { + } else if galMulGen, galMulGenXor, ok := r.hasCodeGen(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); ok && + byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards { // It appears there is a switchover point at around 10MB where // Regular processing is faster... - r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount, true) + r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount, true, galMulGen, galMulGenXor) return } @@ -960,13 +938,9 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte exec := func(start, stop int) { if stop-start >= 64 { if useGFNI { - if r.o.useAvx512GFNI { - start += galMulSlicesGFNI(gfniMatrix, inputs, outputs, start, stop) - } else { - start += galMulSlicesAvxGFNI(gfniMatrix, inputs, outputs, start, stop) - } - } else if useAvx2 { - start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop) + start += (*galMulGFNI)(gfniMatrix, inputs, outputs, start, stop) + } else if useCodeGen { + start += (*galMulGen)(genMatrix, inputs, outputs, start, stop) } } @@ -1017,7 +991,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte // Perform the same as codeSomeShards, but split the workload into // several goroutines. // If clear is set, the first write will overwrite the output. -func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) { +func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool, galMulGen, galMulGenXor *func(matrix []byte, in [][]byte, out [][]byte, start int, stop int) int) { var wg sync.WaitGroup gor := r.o.maxGoroutines @@ -1028,7 +1002,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b first bool } // Make a plan... 
- plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs)) + plan := make([]state, 0, ((len(inputs)+codeGenMaxInputs-1)/codeGenMaxInputs)*((len(outputs)+codeGenMaxOutputs-1)/codeGenMaxOutputs)) tmp := r.getTmpSlice() defer r.putTmpSlice(tmp) @@ -1040,18 +1014,18 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b ins := inputs for len(ins) > 0 { inPer := ins - if len(inPer) > maxAvx2Inputs { - inPer = inPer[:maxAvx2Inputs] + if len(inPer) > codeGenMaxInputs { + inPer = inPer[:codeGenMaxInputs] } outs := outputs outIdx := 0 for len(outs) > 0 { outPer := outs - if len(outPer) > maxAvx2Outputs { - outPer = outPer[:maxAvx2Outputs] + if len(outPer) > codeGenMaxOutputs { + outPer = outPer[:codeGenMaxOutputs] } // Generate local matrix - m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) tmp = tmp[len(m):] plan = append(plan, state{ input: inPer, @@ -1070,19 +1044,19 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b outIdx := 0 for len(outs) > 0 { outPer := outs - if len(outPer) > maxAvx2Outputs { - outPer = outPer[:maxAvx2Outputs] + if len(outPer) > codeGenMaxOutputs { + outPer = outPer[:codeGenMaxOutputs] } inIdx := 0 ins := inputs for len(ins) > 0 { inPer := ins - if len(inPer) > maxAvx2Inputs { - inPer = inPer[:maxAvx2Inputs] + if len(inPer) > codeGenMaxInputs { + inPer = inPer[:codeGenMaxInputs] } // Generate local matrix - m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) tmp = tmp[len(m):] //fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound) plan = append(plan, state{ @@ -1111,14 +1085,14 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b lstop = stop } for lstart < stop { - if lstop-lstart >= minAvx2Size { + if galMulGen != nil && galMulGenXor != nil && lstop-lstart >= minCodeGenSize { // Execute plan... var n int for _, p := range plan { if p.first { - n = galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop) + n = (*galMulGen)(p.m, p.input, p.output, lstart, lstop) } else { - n = galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop) + n = (*galMulGenXor)(p.m, p.input, p.output, lstart, lstop) } } lstart += n @@ -1172,7 +1146,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b // Perform the same as codeSomeShards, but split the workload into // several goroutines. // If clear is set, the first write will overwrite the output. -func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) { +func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool, galMulGFNI, galMulGFNIXor *func(matrix []uint64, in, out [][]byte, start, stop int) int) { var wg sync.WaitGroup gor := r.o.maxGoroutines @@ -1183,7 +1157,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b first bool } // Make a plan... - plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs)) + plan := make([]state, 0, ((len(inputs)+codeGenMaxInputs-1)/codeGenMaxInputs)*((len(outputs)+codeGenMaxOutputs-1)/codeGenMaxOutputs)) // Flips between input first to output first. 
// We put the smallest data load in the inner loop. @@ -1192,15 +1166,15 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b ins := inputs for len(ins) > 0 { inPer := ins - if len(inPer) > maxAvx2Inputs { - inPer = inPer[:maxAvx2Inputs] + if len(inPer) > codeGenMaxInputs { + inPer = inPer[:codeGenMaxInputs] } outs := outputs outIdx := 0 for len(outs) > 0 { outPer := outs - if len(outPer) > maxAvx2Outputs { - outPer = outPer[:maxAvx2Outputs] + if len(outPer) > codeGenMaxOutputs { + outPer = outPer[:codeGenMaxOutputs] } // Generate local matrix m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer))) @@ -1221,16 +1195,16 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b outIdx := 0 for len(outs) > 0 { outPer := outs - if len(outPer) > maxAvx2Outputs { - outPer = outPer[:maxAvx2Outputs] + if len(outPer) > codeGenMaxOutputs { + outPer = outPer[:codeGenMaxOutputs] } inIdx := 0 ins := inputs for len(ins) > 0 { inPer := ins - if len(inPer) > maxAvx2Inputs { - inPer = inPer[:maxAvx2Inputs] + if len(inPer) > codeGenMaxInputs { + inPer = inPer[:codeGenMaxInputs] } // Generate local matrix m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer))) @@ -1261,24 +1235,14 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b lstop = stop } for lstart < stop { - if lstop-lstart >= minAvx2Size { + if galMulGFNI != nil && galMulGFNIXor != nil && lstop-lstart >= minCodeGenSize { // Execute plan... var n int - if r.o.useAvx512GFNI { - for _, p := range plan { - if p.first { - n = galMulSlicesGFNI(p.m, p.input, p.output, lstart, lstop) - } else { - n = galMulSlicesGFNIXor(p.m, p.input, p.output, lstart, lstop) - } - } - } else { - for _, p := range plan { - if p.first { - n = galMulSlicesAvxGFNI(p.m, p.input, p.output, lstart, lstop) - } else { - n = galMulSlicesAvxGFNIXor(p.m, p.input, p.output, lstart, lstop) - } + for _, p := range plan { + if p.first { + n = (*galMulGFNI)(p.m, p.input, p.output, lstart, lstop) + } else { + n = (*galMulGFNIXor)(p.m, p.input, p.output, lstart, lstop) } } lstart += n diff --git a/vendor/github.com/xtaci/qpp/README.md b/vendor/github.com/xtaci/qpp/README.md index 5f546085d..2df7cacaa 100644 --- a/vendor/github.com/xtaci/qpp/README.md +++ b/vendor/github.com/xtaci/qpp/README.md @@ -82,6 +82,10 @@ func main() { } ``` +## Performance +![QQ_1720105700910](https://github.com/xtaci/qpp/assets/2346725/2c724b40-7210-4ebc-bb6d-08c4c6f5663b) + + ## Conclusion The Quantum Permutation Pad is a promising approach in the field of quantum cryptography, utilizing quantum mechanical properties to achieve secure communication. By applying quantum permutations to encrypt and decrypt data, QPP ensures high security and leverages the unique capabilities of quantum technology. As research and technology in quantum computing and quantum communication advance, protocols like QPP will play a crucial role in the next generation of secure communication systems. 
diff --git a/vendor/github.com/xtaci/qpp/qpp.go b/vendor/github.com/xtaci/qpp/qpp.go index 83d9fa222..12aca06f0 100644 --- a/vendor/github.com/xtaci/qpp/qpp.go +++ b/vendor/github.com/xtaci/qpp/qpp.go @@ -8,7 +8,8 @@ import ( "encoding/binary" "fmt" "math/big" - "math/rand/v2" + "math/rand" + "unsafe" "golang.org/x/crypto/pbkdf2" ) @@ -25,17 +26,14 @@ const ( CHUNK_DERIVE_LOOPS = 1024 ) -type Source uint64 - -func (s Source) Uint64() uint64 { - return uint64(s) -} - // QuantumPermutationPad represents the encryption/decryption structure using quantum permutation pads // QPP is a cryptographic technique that leverages quantum-inspired permutation matrices to provide secure encryption. type QuantumPermutationPad struct { - pads [][]byte // Encryption pads, each pad is a permutation matrix for encryption - rpads [][]byte // Decryption pads, each pad is a reverse permutation matrix for decryption + pads []byte // Encryption pads, each pad is a permutation matrix for encryption + rpads []byte // Decryption pads, each pad is a reverse permutation matrix for decryption + padsPtr uintptr // raw pointer to encryption pads + rpadsPtr uintptr // raw pointer to encryption pads + numPads uint16 // Number of pads (permutation matrices) qubits uint8 // Number of quantum bits, determines the size of each pad encRand *rand.Rand // Default random source for encryption pad selection @@ -50,21 +48,24 @@ func NewQPP(seed []byte, numPads uint16, qubits uint8) *QuantumPermutationPad { qubits: qubits, } - qpp.pads = make([][]byte, numPads) - qpp.rpads = make([][]byte, numPads) + matrixBytes := 1 << qubits + qpp.pads = make([]byte, int(numPads)*matrixBytes) + qpp.rpads = make([]byte, int(numPads)*matrixBytes) + qpp.padsPtr = uintptr(unsafe.Pointer(unsafe.SliceData(qpp.pads))) + qpp.rpadsPtr = uintptr(unsafe.Pointer(unsafe.SliceData(qpp.rpads))) chunks := seedToChunks(seed, qubits) // Initialize and shuffle pads to create permutation matrices for i := 0; i < int(numPads); i++ { - qpp.pads[i] = make([]byte, 1< 0; i-- { block.Encrypt(sum, sum) - j := binary.LittleEndian.Uint64(sum) % uint64(i+1) + bigrand := new(big.Int).SetBytes(sum) + j := bigrand.Mod(bigrand, big.NewInt(int64(i+1))).Uint64() pad[i], pad[j] = pad[j], pad[i] } } diff --git a/vendor/golang.org/x/crypto/blowfish/cipher.go b/vendor/golang.org/x/crypto/blowfish/cipher.go index 213bf204a..089895680 100644 --- a/vendor/golang.org/x/crypto/blowfish/cipher.go +++ b/vendor/golang.org/x/crypto/blowfish/cipher.go @@ -11,7 +11,7 @@ // Deprecated: any new system should use AES (from crypto/aes, if necessary in // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). -package blowfish // import "golang.org/x/crypto/blowfish" +package blowfish // The code is a port of Bruce Schneier's C implementation. // See https://www.schneier.com/blowfish.html. diff --git a/vendor/golang.org/x/crypto/cast5/cast5.go b/vendor/golang.org/x/crypto/cast5/cast5.go index 425e8eecb..016e90215 100644 --- a/vendor/golang.org/x/crypto/cast5/cast5.go +++ b/vendor/golang.org/x/crypto/cast5/cast5.go @@ -11,7 +11,7 @@ // Deprecated: any new system should use AES (from crypto/aes, if necessary in // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from // golang.org/x/crypto/chacha20poly1305). 
diff --git a/vendor/golang.org/x/crypto/blowfish/cipher.go b/vendor/golang.org/x/crypto/blowfish/cipher.go
index 213bf204a..089895680 100644
--- a/vendor/golang.org/x/crypto/blowfish/cipher.go
+++ b/vendor/golang.org/x/crypto/blowfish/cipher.go
@@ -11,7 +11,7 @@
 // Deprecated: any new system should use AES (from crypto/aes, if necessary in
 // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from
 // golang.org/x/crypto/chacha20poly1305).
-package blowfish // import "golang.org/x/crypto/blowfish"
+package blowfish
 
 // The code is a port of Bruce Schneier's C implementation.
 // See https://www.schneier.com/blowfish.html.
diff --git a/vendor/golang.org/x/crypto/cast5/cast5.go b/vendor/golang.org/x/crypto/cast5/cast5.go
index 425e8eecb..016e90215 100644
--- a/vendor/golang.org/x/crypto/cast5/cast5.go
+++ b/vendor/golang.org/x/crypto/cast5/cast5.go
@@ -11,7 +11,7 @@
 // Deprecated: any new system should use AES (from crypto/aes, if necessary in
 // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from
 // golang.org/x/crypto/chacha20poly1305).
-package cast5 // import "golang.org/x/crypto/cast5"
+package cast5
 
 import (
 	"errors"
diff --git a/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go b/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go
index 904b57e01..28cd99c7f 100644
--- a/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go
+++ b/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go
@@ -16,7 +16,7 @@
 Hash Functions SHA-1, SHA-224, SHA-256, SHA-384 and SHA-512 for HMAC. To choose,
 you can pass the `New` functions from the different SHA packages to pbkdf2.Key.
 */
-package pbkdf2 // import "golang.org/x/crypto/pbkdf2"
+package pbkdf2
 
 import (
 	"crypto/hmac"
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go b/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go
index 3fd05b275..3685b3445 100644
--- a/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go
+++ b/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // Package salsa provides low-level access to functions in the Salsa family.
-package salsa // import "golang.org/x/crypto/salsa20/salsa"
+package salsa
 
 import "math/bits"
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa20.go b/vendor/golang.org/x/crypto/salsa20/salsa20.go
index 8f4f896c7..e75c9342a 100644
--- a/vendor/golang.org/x/crypto/salsa20/salsa20.go
+++ b/vendor/golang.org/x/crypto/salsa20/salsa20.go
@@ -19,7 +19,7 @@
 This package also implements XSalsa20: a version of Salsa20 with a 24-byte
 nonce as specified in https://cr.yp.to/snuffle/xsalsa-20081128.pdf. Simply
 passing a 24-byte slice as the nonce triggers XSalsa20.
 */
-package salsa20 // import "golang.org/x/crypto/salsa20"
+package salsa20
 
 // TODO(agl): implement XORKeyStream12 and XORKeyStream8 - the reduced round variants of Salsa20.
diff --git a/vendor/golang.org/x/crypto/twofish/twofish.go b/vendor/golang.org/x/crypto/twofish/twofish.go
index e4eeae17f..6d0a3028d 100644
--- a/vendor/golang.org/x/crypto/twofish/twofish.go
+++ b/vendor/golang.org/x/crypto/twofish/twofish.go
@@ -9,7 +9,7 @@
 // implementation. Instead, use AES (from crypto/aes, if necessary in an AEAD
 // mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from
 // golang.org/x/crypto/chacha20poly1305).
-package twofish // import "golang.org/x/crypto/twofish"
+package twofish
 
 // Twofish is defined in https://www.schneier.com/paper-twofish-paper.pdf [TWOFISH]
diff --git a/vendor/golang.org/x/crypto/xtea/cipher.go b/vendor/golang.org/x/crypto/xtea/cipher.go
index a4c2fd02b..7b4f8aaa6 100644
--- a/vendor/golang.org/x/crypto/xtea/cipher.go
+++ b/vendor/golang.org/x/crypto/xtea/cipher.go
@@ -12,7 +12,7 @@
 // Deprecated: any new system should use AES (from crypto/aes, if necessary in
 // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from
 // golang.org/x/crypto/chacha20poly1305).
-package xtea // import "golang.org/x/crypto/xtea"
+package xtea
 
 // For details, see http://www.cix.co.uk/~klockstone/xtea.pdf
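Each x/crypto file above only loses its `// import` path comment (redundant since Go modules), but their deprecation notices all point to the same replacement. For reference, a sketch of that recommended AES-GCM construction, with key management deliberately omitted:

```go
package main

import (
	"crypto/aes"
	"crypto/cipher"
	"crypto/rand"
	"fmt"
)

func main() {
	key := make([]byte, 32) // AES-256; derive real keys with a KDF
	if _, err := rand.Read(key); err != nil {
		panic(err)
	}
	block, err := aes.NewCipher(key)
	if err != nil {
		panic(err)
	}
	aead, err := cipher.NewGCM(block)
	if err != nil {
		panic(err)
	}
	nonce := make([]byte, aead.NonceSize()) // must be unique per key
	if _, err := rand.Read(nonce); err != nil {
		panic(err)
	}
	ct := aead.Seal(nil, nonce, []byte("hello"), nil)
	pt, err := aead.Open(nil, nonce, ct, nil) // fails if ct was tampered with
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", pt)
}
```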
diff --git a/vendor/golang.org/x/sys/unix/mremap.go b/vendor/golang.org/x/sys/unix/mremap.go
index fd45fe529..3a5e776f8 100644
--- a/vendor/golang.org/x/sys/unix/mremap.go
+++ b/vendor/golang.org/x/sys/unix/mremap.go
@@ -50,3 +50,8 @@ func (m *mremapMmapper) Mremap(oldData []byte, newLength int, flags int) (data [
 func Mremap(oldData []byte, newLength int, flags int) (data []byte, err error) {
 	return mapper.Mremap(oldData, newLength, flags)
 }
+
+func MremapPtr(oldAddr unsafe.Pointer, oldSize uintptr, newAddr unsafe.Pointer, newSize uintptr, flags int) (ret unsafe.Pointer, err error) {
+	xaddr, err := mapper.mremap(uintptr(oldAddr), oldSize, newSize, flags, uintptr(newAddr))
+	return unsafe.Pointer(xaddr), err
+}
diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin.go b/vendor/golang.org/x/sys/unix/syscall_darwin.go
index 59542a897..4cc7b0059 100644
--- a/vendor/golang.org/x/sys/unix/syscall_darwin.go
+++ b/vendor/golang.org/x/sys/unix/syscall_darwin.go
@@ -542,6 +542,18 @@ func SysctlKinfoProcSlice(name string, args ...int) ([]KinfoProc, error) {
 	}
 }
 
+//sys	pthread_chdir_np(path string) (err error)
+
+func PthreadChdir(path string) (err error) {
+	return pthread_chdir_np(path)
+}
+
+//sys	pthread_fchdir_np(fd int) (err error)
+
+func PthreadFchdir(fd int) (err error) {
+	return pthread_fchdir_np(fd)
+}
+
 //sys	sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error)
 
 //sys	shmat(id int, addr uintptr, flag int) (ret uintptr, err error)
diff --git a/vendor/golang.org/x/sys/unix/syscall_unix.go b/vendor/golang.org/x/sys/unix/syscall_unix.go
index 77081de8c..4e92e5aa4 100644
--- a/vendor/golang.org/x/sys/unix/syscall_unix.go
+++ b/vendor/golang.org/x/sys/unix/syscall_unix.go
@@ -154,6 +154,15 @@ func Munmap(b []byte) (err error) {
 	return mapper.Munmap(b)
 }
 
+func MmapPtr(fd int, offset int64, addr unsafe.Pointer, length uintptr, prot int, flags int) (ret unsafe.Pointer, err error) {
+	xaddr, err := mapper.mmap(uintptr(addr), length, prot, flags, fd, offset)
+	return unsafe.Pointer(xaddr), err
+}
+
+func MunmapPtr(addr unsafe.Pointer, length uintptr) (err error) {
+	return mapper.munmap(uintptr(addr), length)
+}
+
 func Read(fd int, p []byte) (n int, err error) {
 	n, err = read(fd, p)
 	if raceenabled {
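MmapPtr/MunmapPtr (and MremapPtr above) are pointer-based variants of the existing slice-based wrappers, useful when a mapping is not byte-slice shaped. A usage sketch against the signatures added above, assuming Linux/macOS-style MAP_ANON flags:

```go
//go:build unix

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	const size = 4096
	// addr == nil lets the kernel choose the address; fd == -1 together
	// with MAP_ANON requests memory not backed by any file.
	p, err := unix.MmapPtr(-1, 0, nil, size,
		unix.PROT_READ|unix.PROT_WRITE, unix.MAP_ANON|unix.MAP_PRIVATE)
	if err != nil {
		panic(err)
	}
	*(*byte)(p) = 42         // write through the raw pointer
	fmt.Println(*(*byte)(p)) // 42
	if err := unix.MunmapPtr(p, size); err != nil {
		panic(err)
	}
}
```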
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
index ccb02f240..07642c308 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
@@ -760,6 +760,39 @@ var libc_sysctl_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func pthread_chdir_np(path string) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(path)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall(libc_pthread_chdir_np_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_pthread_chdir_np_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pthread_chdir_np pthread_chdir_np "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func pthread_fchdir_np(fd int) (err error) {
+	_, _, e1 := syscall_syscall(libc_pthread_fchdir_np_trampoline_addr, uintptr(fd), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_pthread_fchdir_np_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pthread_fchdir_np pthread_fchdir_np "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) {
 	_, _, e1 := syscall_syscall6(libc_sendfile_trampoline_addr, uintptr(infd), uintptr(outfd), uintptr(offset), uintptr(unsafe.Pointer(len)), uintptr(hdtr), uintptr(flags))
 	if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
index 8b8bb2840..923e08cb7 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
@@ -228,6 +228,16 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
 GLOBL	·libc_sysctl_trampoline_addr(SB), RODATA, $8
 DATA	·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
 
+TEXT libc_pthread_chdir_np_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_pthread_chdir_np(SB)
+GLOBL	·libc_pthread_chdir_np_trampoline_addr(SB), RODATA, $8
+DATA	·libc_pthread_chdir_np_trampoline_addr(SB)/8, $libc_pthread_chdir_np_trampoline<>(SB)
+
+TEXT libc_pthread_fchdir_np_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_pthread_fchdir_np(SB)
+GLOBL	·libc_pthread_fchdir_np_trampoline_addr(SB), RODATA, $8
+DATA	·libc_pthread_fchdir_np_trampoline_addr(SB)/8, $libc_pthread_fchdir_np_trampoline<>(SB)
+
 TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_sendfile(SB)
 GLOBL	·libc_sendfile_trampoline_addr(SB), RODATA, $8
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
index 1b40b997b..7d73dda64 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
@@ -760,6 +760,39 @@ var libc_sysctl_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func pthread_chdir_np(path string) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(path)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall(libc_pthread_chdir_np_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_pthread_chdir_np_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pthread_chdir_np pthread_chdir_np "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func pthread_fchdir_np(fd int) (err error) {
+	_, _, e1 := syscall_syscall(libc_pthread_fchdir_np_trampoline_addr, uintptr(fd), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_pthread_fchdir_np_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pthread_fchdir_np pthread_fchdir_np "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) {
 	_, _, e1 := syscall_syscall6(libc_sendfile_trampoline_addr, uintptr(infd), uintptr(outfd), uintptr(offset), uintptr(unsafe.Pointer(len)), uintptr(hdtr), uintptr(flags))
 	if e1 != 0 {
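The PthreadChdir/PthreadFchdir wrappers added above expose the darwin-only pthread_chdir_np/pthread_fchdir_np calls, which change the working directory of the calling thread rather than the whole process. A sketch of how they would be used; since the state is per OS thread, it only makes sense from a goroutine pinned with runtime.LockOSThread:

```go
//go:build darwin

package main

import (
	"fmt"
	"os"
	"runtime"

	"golang.org/x/sys/unix"
)

func main() {
	runtime.LockOSThread() // pin: the directory change is per-thread state
	defer runtime.UnlockOSThread()

	if err := unix.PthreadChdir("/tmp"); err != nil {
		panic(err)
	}
	// Relative paths opened from this thread should now resolve under /tmp,
	// while other threads keep the process-wide working directory.
	wd, _ := os.Getwd()
	fmt.Println("thread cwd:", wd)
}
```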
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
index 08362c1ab..057700111 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
@@ -228,6 +228,16 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
 GLOBL	·libc_sysctl_trampoline_addr(SB), RODATA, $8
 DATA	·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
 
+TEXT libc_pthread_chdir_np_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_pthread_chdir_np(SB)
+GLOBL	·libc_pthread_chdir_np_trampoline_addr(SB), RODATA, $8
+DATA	·libc_pthread_chdir_np_trampoline_addr(SB)/8, $libc_pthread_chdir_np_trampoline<>(SB)
+
+TEXT libc_pthread_fchdir_np_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_pthread_fchdir_np(SB)
+GLOBL	·libc_pthread_fchdir_np_trampoline_addr(SB), RODATA, $8
+DATA	·libc_pthread_fchdir_np_trampoline_addr(SB)/8, $libc_pthread_fchdir_np_trampoline<>(SB)
+
 TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_sendfile(SB)
 GLOBL	·libc_sendfile_trampoline_addr(SB), RODATA, $8
diff --git a/vendor/golang.org/x/sys/windows/security_windows.go b/vendor/golang.org/x/sys/windows/security_windows.go
index 6f7d2ac70..97651b5bd 100644
--- a/vendor/golang.org/x/sys/windows/security_windows.go
+++ b/vendor/golang.org/x/sys/windows/security_windows.go
@@ -894,7 +894,7 @@ type ACL struct {
 	aclRevision byte
 	sbz1        byte
 	aclSize     uint16
-	aceCount    uint16
+	AceCount    uint16
 	sbz2        uint16
 }
 
@@ -1087,6 +1087,27 @@ type EXPLICIT_ACCESS struct {
 	Trustee          TRUSTEE
 }
 
+// https://learn.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-ace_header
+type ACE_HEADER struct {
+	AceType  uint8
+	AceFlags uint8
+	AceSize  uint16
+}
+
+// https://learn.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-access_allowed_ace
+type ACCESS_ALLOWED_ACE struct {
+	Header   ACE_HEADER
+	Mask     ACCESS_MASK
+	SidStart uint32
+}
+
+const (
+	// Constants for AceType
+	// https://learn.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-ace_header
+	ACCESS_ALLOWED_ACE_TYPE = 0
+	ACCESS_DENIED_ACE_TYPE  = 1
+)
+
 // This type is the union inside of TRUSTEE and must be created using one of the TrusteeValueFrom* functions.
 type TrusteeValue uintptr
 
@@ -1158,6 +1179,7 @@ type OBJECTS_AND_NAME struct {
 //sys	makeSelfRelativeSD(absoluteSD *SECURITY_DESCRIPTOR, selfRelativeSD *SECURITY_DESCRIPTOR, selfRelativeSDSize *uint32) (err error) = advapi32.MakeSelfRelativeSD
 //sys	setEntriesInAcl(countExplicitEntries uint32, explicitEntries *EXPLICIT_ACCESS, oldACL *ACL, newACL **ACL) (ret error) = advapi32.SetEntriesInAclW
+//sys	GetAce(acl *ACL, aceIndex uint32, pAce **ACCESS_ALLOWED_ACE) (ret error) = advapi32.GetAce
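With AceCount now exported and the GetAce wrapper declared above, a DACL can be walked from Go. A sketch of that enumeration; GetAce is declared against ACCESS_ALLOWED_ACE, so Header.AceType should be checked before trusting Mask/SidStart (denied ACEs happen to share the same layout):

```go
//go:build windows

package main

import (
	"fmt"

	"golang.org/x/sys/windows"
)

// dumpACEs prints the access mask of every allow/deny ACE in acl.
func dumpACEs(acl *windows.ACL) error {
	for i := uint32(0); i < uint32(acl.AceCount); i++ {
		var ace *windows.ACCESS_ALLOWED_ACE
		if err := windows.GetAce(acl, i, &ace); err != nil {
			return err
		}
		switch ace.Header.AceType {
		case windows.ACCESS_ALLOWED_ACE_TYPE:
			fmt.Printf("allow mask=%#x\n", ace.Mask)
		case windows.ACCESS_DENIED_ACE_TYPE:
			fmt.Printf("deny  mask=%#x\n", ace.Mask)
		}
	}
	return nil
}

func main() {
	// Obtaining an *ACL (e.g. from GetNamedSecurityInfo) is omitted here.
}
```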
 
 // Control returns the security descriptor control bits.
 func (sd *SECURITY_DESCRIPTOR) Control() (control SECURITY_DESCRIPTOR_CONTROL, revision uint32, err error) {
diff --git a/vendor/golang.org/x/sys/windows/zsyscall_windows.go b/vendor/golang.org/x/sys/windows/zsyscall_windows.go
index 9f73df75b..eba761018 100644
--- a/vendor/golang.org/x/sys/windows/zsyscall_windows.go
+++ b/vendor/golang.org/x/sys/windows/zsyscall_windows.go
@@ -91,6 +91,7 @@ var (
 	procEnumServicesStatusExW        = modadvapi32.NewProc("EnumServicesStatusExW")
 	procEqualSid                     = modadvapi32.NewProc("EqualSid")
 	procFreeSid                      = modadvapi32.NewProc("FreeSid")
+	procGetAce                       = modadvapi32.NewProc("GetAce")
 	procGetLengthSid                 = modadvapi32.NewProc("GetLengthSid")
 	procGetNamedSecurityInfoW        = modadvapi32.NewProc("GetNamedSecurityInfoW")
 	procGetSecurityDescriptorControl = modadvapi32.NewProc("GetSecurityDescriptorControl")
@@ -1224,6 +1225,14 @@ func setEntriesInAcl(countExplicitEntries uint32, explicitEntries *EXPLICIT_ACCE
 	return
 }
 
+func GetAce(acl *ACL, aceIndex uint32, pAce **ACCESS_ALLOWED_ACE) (ret error) {
+	r0, _, _ := syscall.Syscall(procGetAce.Addr(), 3, uintptr(unsafe.Pointer(acl)), uintptr(aceIndex), uintptr(unsafe.Pointer(pAce)))
+	if r0 == 0 {
+		ret = GetLastError()
+	}
+	return
+}
+
 func SetKernelObjectSecurity(handle Handle, securityInformation SECURITY_INFORMATION, securityDescriptor *SECURITY_DESCRIPTOR) (err error) {
 	r1, _, e1 := syscall.Syscall(procSetKernelObjectSecurity.Addr(), 3, uintptr(handle), uintptr(securityInformation), uintptr(unsafe.Pointer(securityDescriptor)))
 	if r1 == 0 {
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 3e2b4087b..344115ec3 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -14,7 +14,7 @@ github.com/google/gopacket/layers
 # github.com/klauspost/cpuid/v2 v2.2.8
 ## explicit; go 1.15
 github.com/klauspost/cpuid/v2
-# github.com/klauspost/reedsolomon v1.12.1
+# github.com/klauspost/reedsolomon v1.12.2
 ## explicit; go 1.18
 github.com/klauspost/reedsolomon
 # github.com/pkg/errors v0.9.1
@@ -38,7 +38,7 @@ github.com/urfave/cli
 # github.com/xtaci/kcp-go/v5 v5.6.8
 ## explicit; go 1.21
 github.com/xtaci/kcp-go/v5
-# github.com/xtaci/qpp v1.1.5
+# github.com/xtaci/qpp v1.1.6
 ## explicit; go 1.22.3
 github.com/xtaci/qpp
 # github.com/xtaci/smux v1.5.24
@@ -47,8 +47,8 @@ github.com/xtaci/smux
 # github.com/xtaci/tcpraw v1.2.25
 ## explicit
 github.com/xtaci/tcpraw
-# golang.org/x/crypto v0.24.0
-## explicit; go 1.18
+# golang.org/x/crypto v0.25.0
+## explicit; go 1.20
 golang.org/x/crypto/blowfish
 golang.org/x/crypto/cast5
 golang.org/x/crypto/internal/alias
@@ -58,14 +58,14 @@ golang.org/x/crypto/salsa20/salsa
 golang.org/x/crypto/tea
 golang.org/x/crypto/twofish
 golang.org/x/crypto/xtea
-# golang.org/x/net v0.26.0
+# golang.org/x/net v0.27.0
 ## explicit; go 1.18
 golang.org/x/net/bpf
 golang.org/x/net/internal/iana
 golang.org/x/net/internal/socket
 golang.org/x/net/ipv4
 golang.org/x/net/ipv6
-# golang.org/x/sys v0.21.0
+# golang.org/x/sys v0.22.0
 ## explicit; go 1.18
 golang.org/x/sys/unix
 golang.org/x/sys/windows